maxframe 0.1.0b4__cp310-cp310-macosx_10_9_universal2.whl → 1.0.0__cp310-cp310-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cpython-310-darwin.so +0 -0
- maxframe/codegen.py +56 -5
- maxframe/config/config.py +78 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +58 -14
- maxframe/core/__init__.py +2 -16
- maxframe/core/entity/__init__.py +1 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +46 -45
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cpython-310-darwin.so +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +2 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +7 -33
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
- maxframe/dataframe/core.py +58 -12
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +120 -24
- maxframe/dataframe/datasource/read_odps_table.py +9 -4
- maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +28 -0
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +317 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
- maxframe/dataframe/groupby/transform.py +5 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +5 -28
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +237 -3
- maxframe/dataframe/merge/tests/test_merge.py +126 -1
- maxframe/dataframe/misc/__init__.py +4 -0
- maxframe/dataframe/misc/apply.py +6 -11
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +8 -8
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/tests/test_misc.py +93 -1
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/plotting/core.py +2 -2
- maxframe/dataframe/reduction/core.py +4 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +13 -19
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/utils.py +33 -11
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/io/__init__.py +13 -0
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
- maxframe/{odpsio → io/odpsio}/schema.py +38 -16
- maxframe/io/odpsio/tableio.py +719 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +110 -0
- maxframe/learn/contrib/xgboost/core.py +241 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
- maxframe/learn/contrib/xgboost/predict.py +121 -0
- maxframe/learn/contrib/xgboost/regressor.py +71 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +132 -0
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/mmh3.cpython-310-darwin.so +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/opcodes.py +11 -0
- maxframe/protocol.py +154 -27
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cpython-310-darwin.so +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +64 -0
- maxframe/serialization/core.pyx +67 -26
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +52 -17
- maxframe/serialization/serializables/core.py +180 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +54 -5
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +37 -2
- maxframe/tensor/__init__.py +81 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +7 -2
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/scalar.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +4 -0
- maxframe/tensor/misc/atleast_1d.py +72 -0
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/misc/unique.py +205 -0
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +2 -1
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +17 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +62 -3
- maxframe/utils.py +112 -86
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +123 -54
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +223 -40
- maxframe_client/session/task.py +108 -80
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +136 -8
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -300
- maxframe/odpsio/volumeio.py +0 -95
- maxframe_client/clients/spe.py +0 -104
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
from pandas.core.dtypes.cast import find_common_type
|
|
17
|
+
|
|
18
|
+
from ... import opcodes
|
|
19
|
+
from ...core import TILEABLE_TYPE
|
|
20
|
+
from ...serialization.serializables import FieldTypes, ListField
|
|
21
|
+
from ..core import SERIES_TYPE
|
|
22
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
23
|
+
from ..utils import apply_if_callable
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DataFrameCaseWhen(DataFrameOperator, DataFrameOperatorMixin):
    """Operator backing ``Series.case_when``.

    Stores the per-case boolean conditions and their replacement values.
    Entries that are tileables are also registered as operator inputs so the
    graph machinery can track them; scalar / array-like entries are kept
    inline in the fields.
    """

    _op_type_ = opcodes.CASE_WHEN

    conditions = ListField("conditions", FieldTypes.reference, default=None)
    replacements = ListField("replacements", FieldTypes.reference, default=None)

    def __init__(self, output_types=None, **kw):
        super().__init__(_output_types=output_types, **kw)

    def _set_inputs(self, inputs):
        # Inputs arrive as: source series first, then every tileable from
        # ``conditions`` followed by every tileable from ``replacements``.
        # Re-bind the tileable placeholders in order, leaving non-tileable
        # entries untouched.
        super()._set_inputs(inputs)
        input_iter = iter(inputs)
        next(input_iter)  # skip the source series

        def _rebind(entries):
            # One-line helper: swap each tileable entry for the next input.
            return [
                next(input_iter) if isinstance(entry, TILEABLE_TYPE) else entry
                for entry in entries
            ]

        self.conditions = _rebind(self.conditions)
        self.replacements = _rebind(self.replacements)

    def __call__(self, series):
        # The output dtype is the common type of the source series dtype and
        # every replacement dtype; scalar replacements are probed through a
        # temporary ndarray to obtain a numpy dtype.
        candidate_dtypes = [series.dtype]
        for rep in self.replacements:
            if isinstance(rep, SERIES_TYPE):
                candidate_dtypes.append(rep.dtype)
            else:
                candidate_dtypes.append(np.array(rep).dtype)
        out_dtype = find_common_type(candidate_dtypes)

        # Operator inputs must follow the order ``_set_inputs`` consumes them:
        # series first, then condition tileables, then replacement tileables.
        inputs = [series]
        inputs.extend(c for c in self.conditions if isinstance(c, TILEABLE_TYPE))
        inputs.extend(r for r in self.replacements if isinstance(r, TILEABLE_TYPE))

        params = series.params
        params["dtype"] = out_dtype
        return self.new_series(inputs, **params)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def case_when(series, caselist):
    """
    Replace values where the conditions are True.

    Parameters
    ----------
    caselist : A list of tuples of conditions and expected replacements
        Takes the form: ``(condition0, replacement0)``,
        ``(condition1, replacement1)``, ... .
        ``condition`` should be a 1-D boolean array-like object
        or a callable. If ``condition`` is a callable,
        it is computed on the Series
        and should return a boolean Series or array.
        The callable must not change the input Series.
        ``replacement`` should be a 1-D array-like object,
        a scalar or a callable. If ``replacement`` is a callable,
        it is computed on the Series and should return a scalar
        or Series. The callable must not change the input Series.

    Returns
    -------
    Series

    See Also
    --------
    Series.mask : Replace values where the condition is True.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> c = md.Series([6, 7, 8, 9], name='c')
    >>> a = md.Series([0, 0, 1, 2])
    >>> b = md.Series([0, 3, 4, 5])

    >>> c.case_when(caselist=[(a.gt(0), a), # condition, replacement
    ...                       (b.gt(0), b)]).execute()
    0    6
    1    3
    2    1
    3    2
    Name: c, dtype: int64
    """
    # Validate the overall container first: it must be a non-empty list.
    if not isinstance(caselist, list):
        raise TypeError(
            f"The caselist argument should be a list; instead got {type(caselist)}"
        )
    if not caselist:
        raise ValueError(
            "provide at least one boolean condition, "
            "with a corresponding replacement."
        )

    # Then validate each entry: a 2-tuple of (condition, replacement).
    for num, entry in enumerate(caselist):
        if not isinstance(entry, tuple):
            raise TypeError(
                f"Argument {num} must be a tuple; instead got {type(entry)}."
            )
        if len(entry) != 2:
            raise ValueError(
                f"Argument {num} must have length 2; "
                "a condition and replacement; "
                f"instead got length {len(entry)}."
            )

    # Resolve callables against the source series before building the op.
    evaluated = []
    for condition, replacement in caselist:
        evaluated.append(
            (
                apply_if_callable(condition, series),
                apply_if_callable(replacement, series),
            )
        )
    conditions = [pair[0] for pair in evaluated]
    replacements = [pair[1] for pair in evaluated]

    op = DataFrameCaseWhen(conditions=conditions, replacements=replacements)
    return op(series)
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
import pandas as pd
|
|
17
17
|
|
|
18
|
-
from ... import opcodes
|
|
18
|
+
from ... import opcodes
|
|
19
19
|
from ...serialization.serializables import AnyField, FieldTypes, KeyField, ListField
|
|
20
20
|
from ..core import SERIES_TYPE
|
|
21
21
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
@@ -23,7 +23,7 @@ from ..utils import build_empty_df, parse_index
|
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
class DataFrameDescribe(DataFrameOperator, DataFrameOperatorMixin):
|
|
26
|
-
_op_type_ =
|
|
26
|
+
_op_type_ = opcodes.DESCRIBE
|
|
27
27
|
|
|
28
28
|
input = KeyField("input", default=None)
|
|
29
29
|
percentiles = ListField("percentiles", FieldTypes.float64, default=None)
|
|
@@ -37,16 +37,19 @@ class DataFrameDropDuplicates(DuplicateOperand):
|
|
|
37
37
|
shape += (3,)
|
|
38
38
|
return shape
|
|
39
39
|
|
|
40
|
-
|
|
41
|
-
def _gen_tileable_params(cls, op: "DataFrameDropDuplicates", input_params):
|
|
40
|
+
def _gen_tileable_params(self, op: "DataFrameDropDuplicates", input_params):
|
|
42
41
|
params = input_params.copy()
|
|
43
|
-
if op.ignore_index:
|
|
42
|
+
if op.ignore_index and self._output_types[0] != OutputType.index:
|
|
44
43
|
params["index_value"] = parse_index(pd.RangeIndex(-1))
|
|
45
44
|
else:
|
|
46
45
|
params["index_value"] = gen_unknown_index_value(
|
|
47
|
-
input_params["index_value"],
|
|
46
|
+
input_params["index_value"],
|
|
47
|
+
op.keep,
|
|
48
|
+
op.subset,
|
|
49
|
+
type(op).__name__,
|
|
50
|
+
normalize_range_index=True,
|
|
48
51
|
)
|
|
49
|
-
params["shape"] =
|
|
52
|
+
params["shape"] = self._get_shape(input_params["shape"], op)
|
|
50
53
|
return params
|
|
51
54
|
|
|
52
55
|
def __call__(self, inp, inplace=False):
|
|
@@ -151,17 +154,14 @@ def series_drop_duplicates(
|
|
|
151
154
|
With the 'keep' parameter, the selection behaviour of duplicated values
|
|
152
155
|
can be changed. The value 'first' keeps the first occurrence for each
|
|
153
156
|
set of duplicated entries. The default value of keep is 'first'.
|
|
154
|
-
|
|
155
157
|
>>> s.drop_duplicates().execute()
|
|
156
158
|
0 lame
|
|
157
159
|
1 cow
|
|
158
160
|
3 beetle
|
|
159
161
|
5 hippo
|
|
160
162
|
Name: animal, dtype: object
|
|
161
|
-
|
|
162
163
|
The value 'last' for parameter 'keep' keeps the last occurrence for
|
|
163
164
|
each set of duplicated entries.
|
|
164
|
-
|
|
165
165
|
>>> s.drop_duplicates(keep='last').execute()
|
|
166
166
|
1 cow
|
|
167
167
|
3 beetle
|
maxframe/dataframe/misc/eval.py
CHANGED
|
@@ -120,6 +120,10 @@ class CollectionVisitor(ast.NodeVisitor):
|
|
|
120
120
|
if obj_name in self.env:
|
|
121
121
|
self.referenced_vars.add(obj_name)
|
|
122
122
|
return self.env[obj_name]
|
|
123
|
+
try:
|
|
124
|
+
return self.target[obj_name]
|
|
125
|
+
except KeyError:
|
|
126
|
+
pass
|
|
123
127
|
raise KeyError(f"name {obj_name} is not defined")
|
|
124
128
|
|
|
125
129
|
def visit(self, node):
|
|
@@ -58,7 +58,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
58
58
|
"""
|
|
59
59
|
if df_or_series.ndim == 1:
|
|
60
60
|
# the input data is a series, a Scalar will be returned
|
|
61
|
-
return self.new_scalar([df_or_series], dtype=np.dtype(
|
|
61
|
+
return self.new_scalar([df_or_series], dtype=np.dtype(int))
|
|
62
62
|
else:
|
|
63
63
|
# the input data is a DataFrame, a Scalar will be returned
|
|
64
64
|
# calculate shape of returning series given ``op.index``
|
|
@@ -71,7 +71,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
71
71
|
[df_or_series],
|
|
72
72
|
index_value=self._adapt_index(df_or_series.columns_value),
|
|
73
73
|
shape=new_shape,
|
|
74
|
-
dtype=np.dtype(
|
|
74
|
+
dtype=np.dtype(int),
|
|
75
75
|
)
|
|
76
76
|
|
|
77
77
|
|
|
@@ -18,6 +18,7 @@ from ..utils import validate_axis
|
|
|
18
18
|
def pct_change(
|
|
19
19
|
df_or_series, periods=1, fill_method="pad", limit=None, freq=None, **kwargs
|
|
20
20
|
):
|
|
21
|
+
# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/32
|
|
21
22
|
"""
|
|
22
23
|
Percentage change between the current and a prior element.
|
|
23
24
|
|
|
@@ -50,89 +51,6 @@ def pct_change(
|
|
|
50
51
|
DataFrame.diff : Compute the difference of two elements in a DataFrame.
|
|
51
52
|
Series.shift : Shift the index by some number of periods.
|
|
52
53
|
DataFrame.shift : Shift the index by some number of periods.
|
|
53
|
-
|
|
54
|
-
Examples
|
|
55
|
-
--------
|
|
56
|
-
**Series**
|
|
57
|
-
|
|
58
|
-
>>> import maxframe.dataframe as md
|
|
59
|
-
|
|
60
|
-
>>> s = md.Series([90, 91, 85])
|
|
61
|
-
>>> s.execute()
|
|
62
|
-
0 90
|
|
63
|
-
1 91
|
|
64
|
-
2 85
|
|
65
|
-
dtype: int64
|
|
66
|
-
|
|
67
|
-
>>> s.pct_change().execute()
|
|
68
|
-
0 NaN
|
|
69
|
-
1 0.011111
|
|
70
|
-
2 -0.065934
|
|
71
|
-
dtype: float64
|
|
72
|
-
|
|
73
|
-
>>> s.pct_change(periods=2).execute()
|
|
74
|
-
0 NaN
|
|
75
|
-
1 NaN
|
|
76
|
-
2 -0.055556
|
|
77
|
-
dtype: float64
|
|
78
|
-
|
|
79
|
-
See the percentage change in a Series where filling NAs with last
|
|
80
|
-
valid observation forward to next valid.
|
|
81
|
-
|
|
82
|
-
>>> s = md.Series([90, 91, None, 85])
|
|
83
|
-
>>> s.execute()
|
|
84
|
-
0 90.0
|
|
85
|
-
1 91.0
|
|
86
|
-
2 NaN
|
|
87
|
-
3 85.0
|
|
88
|
-
dtype: float64
|
|
89
|
-
|
|
90
|
-
>>> s.pct_change(fill_method='ffill').execute()
|
|
91
|
-
0 NaN
|
|
92
|
-
1 0.011111
|
|
93
|
-
2 0.000000
|
|
94
|
-
3 -0.065934
|
|
95
|
-
dtype: float64
|
|
96
|
-
|
|
97
|
-
**DataFrame**
|
|
98
|
-
|
|
99
|
-
Percentage change in French franc, Deutsche Mark, and Italian lira from
|
|
100
|
-
1980-01-01 to 1980-03-01.
|
|
101
|
-
|
|
102
|
-
>>> df = md.DataFrame({
|
|
103
|
-
... 'FR': [4.0405, 4.0963, 4.3149],
|
|
104
|
-
... 'GR': [1.7246, 1.7482, 1.8519],
|
|
105
|
-
... 'IT': [804.74, 810.01, 860.13]},
|
|
106
|
-
... index=['1980-01-01', '1980-02-01', '1980-03-01'])
|
|
107
|
-
>>> df.execute()
|
|
108
|
-
FR GR IT
|
|
109
|
-
1980-01-01 4.0405 1.7246 804.74
|
|
110
|
-
1980-02-01 4.0963 1.7482 810.01
|
|
111
|
-
1980-03-01 4.3149 1.8519 860.13
|
|
112
|
-
|
|
113
|
-
>>> df.pct_change().execute()
|
|
114
|
-
FR GR IT
|
|
115
|
-
1980-01-01 NaN NaN NaN
|
|
116
|
-
1980-02-01 0.013810 0.013684 0.006549
|
|
117
|
-
1980-03-01 0.053365 0.059318 0.061876
|
|
118
|
-
|
|
119
|
-
Percentage of change in GOOG and APPL stock volume. Shows computing
|
|
120
|
-
the percentage change between columns.
|
|
121
|
-
|
|
122
|
-
>>> df = md.DataFrame({
|
|
123
|
-
... '2016': [1769950, 30586265],
|
|
124
|
-
... '2015': [1500923, 40912316],
|
|
125
|
-
... '2014': [1371819, 41403351]},
|
|
126
|
-
... index=['GOOG', 'APPL'])
|
|
127
|
-
>>> df.execute()
|
|
128
|
-
2016 2015 2014
|
|
129
|
-
GOOG 1769950 1500923 1371819
|
|
130
|
-
APPL 30586265 40912316 41403351
|
|
131
|
-
|
|
132
|
-
>>> df.pct_change(axis='columns').execute()
|
|
133
|
-
2016 2015 2014
|
|
134
|
-
GOOG NaN -0.151997 -0.086016
|
|
135
|
-
APPL NaN 0.337604 0.012002
|
|
136
54
|
"""
|
|
137
55
|
|
|
138
56
|
axis = validate_axis(kwargs.pop("axis", 0))
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import pandas as pd
|
|
17
|
+
from pandas.api.types import is_list_like
|
|
18
|
+
|
|
19
|
+
from ... import opcodes
|
|
20
|
+
from ...core import OutputType
|
|
21
|
+
from ...serialization.serializables import AnyField, BoolField, StringField
|
|
22
|
+
from ...utils import no_default
|
|
23
|
+
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
24
|
+
from ..utils import build_df, parse_index
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DataFramePivotTable(DataFrameOperator, DataFrameOperatorMixin):
    """Operator backing ``pivot_table``.

    Carries all pivot arguments as serializable fields. Output metadata
    (dtypes / columns) is only resolvable up front when no column pivoting
    is requested; otherwise it stays unknown until execution.
    """

    _op_type_ = opcodes.PIVOT_TABLE

    values = AnyField("values", default=None)
    index = AnyField("index", default=None)
    columns = AnyField("columns", default=None)
    aggfunc = AnyField("aggfunc", default="mean")
    fill_value = AnyField("fill_value", default=None)
    margins = BoolField("margins", default=False)
    dropna = BoolField("dropna", default=True)
    margins_name = StringField("margins_name", default=None)
    sort = BoolField("sort", default=False)

    def __init__(self, **kw):
        super().__init__(**kw)
        self.output_types = [OutputType.dataframe]

    def __call__(self, df):
        index_value = columns_value = dtypes = None

        if self.index is not None:
            # index is now a required field
            if len(self.index) == 1:
                sole_key = self.index[0]
                empty_index = pd.Index([], dtype=df.dtypes[sole_key], name=sole_key)
            else:
                # Multiple index keys yield a MultiIndex built from an empty
                # frame carrying the key columns' dtypes.
                empty_index = pd.MultiIndex.from_frame(build_df(df[self.index]))
            index_value = parse_index(empty_index)

        if self.columns is None:  # output columns can be determined
            # Without pivoted columns the output metadata matches a plain
            # groupby-aggregate over the index keys.
            grouped = df.groupby(self.index)
            if self.values:
                grouped = grouped[self.values]
            agg_result = grouped.agg(self.aggfunc)
            index_value = agg_result.index_value
            columns_value = agg_result.columns_value
            dtypes = agg_result.dtypes

        return self.new_dataframe(
            [df],
            shape=(np.nan, np.nan),
            dtypes=dtypes,
            columns_value=columns_value,
            index_value=index_value,
        )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def pivot_table(
    data,
    values=None,
    index=None,
    columns=None,
    aggfunc="mean",
    fill_value=None,
    margins=False,
    dropna=True,
    margins_name="All",
    sort=True,
):
    """
    Create a spreadsheet-style pivot table as a DataFrame.

    The levels in the pivot table will be stored in MultiIndex objects
    (hierarchical indexes) on the index and columns of the result DataFrame.

    Parameters
    ----------
    values : column to aggregate, optional
    index : column, Grouper, array, or list of the previous
        Keys to group by on the pivot table index. If an array is passed,
        it must be the same length as the data and is used in the same
        manner as column values. The list can contain any of the other
        types (except list).
    columns : column, Grouper, array, or list of the previous
        Keys to group by on the pivot table column. If an array is passed,
        it must be the same length as the data and is used in the same
        manner as column values. The list can contain any of the other
        types (except list).
    aggfunc : function, list of functions, dict, default numpy.mean
        If a list of functions is passed, the resulting pivot table will
        have hierarchical columns whose top level are the function names
        (inferred from the function objects themselves). If a dict is
        passed, the key is the column to aggregate and the value is the
        function or list of functions.
    fill_value : scalar, default None
        Value to replace missing values with (in the resulting pivot table,
        after aggregation).
    margins : bool, default False
        Add all row / columns (e.g. for subtotal / grand totals).
    dropna : bool, default True
        Do not include columns whose entries are all NaN.
    margins_name : str, default 'All'
        Name of the row / column that will contain the totals
        when margins is True.
    sort : bool, default True
        Specifies if the result should be sorted.

    Returns
    -------
    DataFrame
        An Excel style pivot table.

    See Also
    --------
    DataFrame.pivot : Pivot without aggregation that can handle
        non-numeric data.
    DataFrame.melt: Unpivot a DataFrame from wide to long format,
        optionally leaving identifiers set.
    wide_to_long : Wide panel to long format. Less flexible but more
        user-friendly than melt.

    Examples
    --------
    >>> import numpy as np
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
    ...                          "bar", "bar", "bar", "bar"],
    ...                    "B": ["one", "one", "one", "two", "two",
    ...                          "one", "one", "two", "two"],
    ...                    "C": ["small", "large", "large", "small",
    ...                          "small", "large", "small", "small",
    ...                          "large"],
    ...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
    ...                    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})

    This first example aggregates values by taking the sum.

    >>> table = md.pivot_table(df, values='D', index=['A', 'B'],
    ...                        columns=['C'], aggfunc=np.sum)
    >>> table.execute()
    C        large  small
    A   B
    bar one    4.0    5.0
        two    7.0    6.0
    foo one    4.0    1.0
        two    NaN    6.0

    We can also fill missing values using the `fill_value` parameter.

    >>> table = md.pivot_table(df, values='D', index=['A', 'B'],
    ...                        columns=['C'], aggfunc=np.sum, fill_value=0)
    >>> table.execute()
    C        large  small
    A   B
    bar one      4      5
        two      7      6
    foo one      4      1
        two      0      6

    The next example aggregates by taking the mean across multiple columns.

    >>> table = md.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
    ...                        aggfunc={'D': np.mean, 'E': np.mean})
    >>> table.execute()
                    D         E
    A   C
    bar large  5.500000  7.500000
        small  5.500000  8.500000
    foo large  2.000000  4.500000
        small  2.333333  4.333333
    """
    if index is None and columns is None:
        raise ValueError(
            "No group keys passed, need to specify at least one of index or columns"
        )

    def _as_column_list(spec):
        # A single existing column label is wrapped into a one-element list;
        # unhashable specs (lists, arrays) raise TypeError on the membership
        # test and are passed through unchanged.
        try:
            if spec in data.dtypes.index:
                return [spec]
        except TypeError:
            return spec
        return spec

    values_list = _as_column_list(values)
    index_list = _as_column_list(index)
    columns_list = _as_column_list(columns)

    # Validate each provided argument: must be list-like and reference only
    # existing columns of ``data``.
    for arg_name, arg_val in (
        ("values", values_list),
        ("index", index_list),
        ("columns", columns_list),
    ):
        if arg_val is None:
            continue
        if not is_list_like(arg_val):
            raise ValueError(f"Need to specify {arg_name} as a list-like object.")
        missing = next((c for c in arg_val if c not in data.dtypes.index), no_default)
        if missing is not no_default:
            raise ValueError(
                f"Column {missing} specified in {arg_name} is not a valid column."
            )

    # Degenerate case: no column pivoting and no margins reduces to a plain
    # groupby-aggregate, optionally restricted to the requested value columns.
    if columns is None and not margins:
        if values_list:
            data = data[index_list + values_list]
        return data.groupby(index, sort=sort).agg(aggfunc)

    op = DataFramePivotTable(
        values=values,
        index=index,
        columns=columns,
        aggfunc=aggfunc,
        fill_value=fill_value,
        margins=margins,
        dropna=dropna,
        margins_name=margins_name,
        sort=sort,
    )
    return op(data)
|