PyPI - maxframe - Versions diffs - 1.0.0rc4__cp38-cp38-win_amd64.whl → 1.1.1__cp38-cp38-win_amd64.whl - Mend

maxframe 1.0.0rc4__cp38-cp38-win_amd64.whl → 1.1.1__cp38-cp38-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (88)

maxframe/_utils.cp38-win_amd64.pyd +0 -0
maxframe/config/__init__.py +1 -1
maxframe/config/config.py +26 -0
maxframe/config/tests/test_config.py +20 -1
maxframe/conftest.py +17 -4
maxframe/core/graph/core.cp38-win_amd64.pyd +0 -0
maxframe/core/operator/base.py +2 -0
maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
maxframe/dataframe/core.py +24 -2
maxframe/dataframe/datasource/read_odps_query.py +65 -35
maxframe/dataframe/datasource/read_odps_table.py +4 -2
maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
maxframe/dataframe/extensions/__init__.py +5 -0
maxframe/dataframe/extensions/apply_chunk.py +649 -0
maxframe/dataframe/extensions/flatjson.py +131 -0
maxframe/dataframe/extensions/flatmap.py +28 -40
maxframe/dataframe/extensions/reshuffle.py +1 -1
maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
maxframe/dataframe/groupby/__init__.py +1 -0
maxframe/dataframe/groupby/aggregation.py +1 -0
maxframe/dataframe/groupby/apply.py +9 -1
maxframe/dataframe/groupby/core.py +1 -1
maxframe/dataframe/groupby/fill.py +4 -1
maxframe/dataframe/groupby/getitem.py +6 -0
maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
maxframe/dataframe/groupby/transform.py +8 -2
maxframe/dataframe/indexing/loc.py +6 -4
maxframe/dataframe/merge/__init__.py +9 -1
maxframe/dataframe/merge/concat.py +41 -31
maxframe/dataframe/merge/merge.py +1 -1
maxframe/dataframe/merge/tests/test_merge.py +3 -1
maxframe/dataframe/misc/apply.py +3 -0
maxframe/dataframe/misc/drop_duplicates.py +5 -1
maxframe/dataframe/misc/map.py +3 -1
maxframe/dataframe/misc/tests/test_misc.py +24 -2
maxframe/dataframe/misc/transform.py +22 -13
maxframe/dataframe/reduction/__init__.py +3 -0
maxframe/dataframe/reduction/aggregation.py +1 -0
maxframe/dataframe/reduction/median.py +56 -0
maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
maxframe/dataframe/statistics/quantile.py +8 -2
maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
maxframe/dataframe/tests/test_utils.py +60 -0
maxframe/dataframe/utils.py +110 -7
maxframe/dataframe/window/expanding.py +5 -3
maxframe/dataframe/window/tests/test_expanding.py +2 -2
maxframe/io/objects/tests/test_object_io.py +39 -12
maxframe/io/odpsio/__init__.py +1 -1
maxframe/io/odpsio/arrow.py +51 -2
maxframe/io/odpsio/schema.py +23 -5
maxframe/io/odpsio/tableio.py +80 -124
maxframe/io/odpsio/tests/test_schema.py +40 -0
maxframe/io/odpsio/tests/test_tableio.py +5 -5
maxframe/io/odpsio/tests/test_volumeio.py +35 -11
maxframe/io/odpsio/volumeio.py +27 -3
maxframe/learn/contrib/__init__.py +3 -2
maxframe/learn/contrib/llm/__init__.py +16 -0
maxframe/learn/contrib/llm/core.py +54 -0
maxframe/learn/contrib/llm/models/__init__.py +14 -0
maxframe/learn/contrib/llm/models/dashscope.py +73 -0
maxframe/learn/contrib/llm/multi_modal.py +42 -0
maxframe/learn/contrib/llm/text.py +42 -0
maxframe/lib/mmh3.cp38-win_amd64.pyd +0 -0
maxframe/lib/sparse/tests/test_sparse.py +15 -15
maxframe/opcodes.py +7 -1
maxframe/serialization/core.cp38-win_amd64.pyd +0 -0
maxframe/serialization/core.pyx +13 -1
maxframe/serialization/pandas.py +50 -20
maxframe/serialization/serializables/core.py +70 -15
maxframe/serialization/serializables/field_type.py +4 -1
maxframe/serialization/serializables/tests/test_serializable.py +12 -2
maxframe/serialization/tests/test_serial.py +2 -1
maxframe/tensor/__init__.py +19 -7
maxframe/tensor/merge/vstack.py +1 -1
maxframe/tests/utils.py +16 -0
maxframe/udf.py +27 -0
maxframe/utils.py +42 -8
{maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +2 -2
{maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
{maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
maxframe_client/clients/framedriver.py +4 -1
maxframe_client/fetcher.py +23 -8
maxframe_client/session/odps.py +40 -11
maxframe_client/session/task.py +6 -25
maxframe_client/session/tests/test_task.py +35 -6
maxframe_client/tests/test_session.py +30 -10
{maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0

maxframe/dataframe/indexing/loc.py CHANGED Viewed

@@ -25,13 +25,14 @@ from ...core import ENTITY_TYPE, OutputType
 from ...serialization.serializables import AnyField, KeyField, ListField
 from ...tensor.datasource import asarray
 from ...tensor.utils import calc_sliced_size, filter_inputs
-from ...utils import is_full_slice, lazy_import
+from ...utils import is_full_slice, lazy_import, pd_release_version
 from ..core import DATAFRAME_TYPE, IndexValue
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
 from .iloc import DataFrameIlocSetItem
 cudf = lazy_import("cudf")
+with_slice_locs_kind = pd_release_version < (1, 4, 0)
 def process_loc_indexes(inp, indexes, fetch_index: bool = True):
@@ -210,9 +211,10 @@ class DataFrameLocGetItem(DataFrameOperator, DataFrameOperatorMixin):
             if axis == 1:
                 param["dtypes"] = inp.dtypes
         elif input_index_value.has_value():
-            start, end = pd_index.slice_locs(
-                index.start, index.stop, index.step, kind="loc"
-            )
+            kw = {}
+            if with_slice_locs_kind:
+                kw["kind"] = "loc"
+            start, end = pd_index.slice_locs(index.start, index.stop, index.step, **kw)
             slc = slice(start, end, index.step)
             size = calc_sliced_size(inp.shape[axis], slc)
             param["shape"] = size

maxframe/dataframe/merge/__init__.py CHANGED Viewed

@@ -14,7 +14,15 @@
 from .append import DataFrameAppend, append
 from .concat import DataFrameConcat, concat
-from .merge import DataFrameMerge, DataFrameMergeAlign, join, merge
+from .merge import (
+    DataFrameMerge,
+    DataFrameMergeAlign,
+    DistributedMapJoinHint,
+    MapJoinHint,
+    SkewJoinHint,
+    join,
+    merge,
+)
 def _install():

maxframe/dataframe/merge/concat.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import List, Union
 import pandas as pd
@@ -24,6 +25,7 @@ from ...serialization.serializables import (
     StringField,
 )
 from ...utils import lazy_import
+from ..core import DataFrame, Series
 from ..operators import SERIES_TYPE, DataFrameOperator, DataFrameOperatorMixin
 from ..utils import build_empty_df, build_empty_series, parse_index, validate_axis
@@ -55,41 +57,53 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
         return self.names
     @classmethod
-    def _concat_index(cls, prev_index: pd.Index, cur_index: pd.Index):
-        if isinstance(prev_index, pd.RangeIndex) and isinstance(
-            cur_index, pd.RangeIndex
-        ):
-            # handle RangeIndex that append may generate huge amount of data
-            # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
-            # will generate a Int64Index full of data
-            # for details see GH#1647
-            prev_stop = prev_index.start + prev_index.size * prev_index.step
-            cur_start = cur_index.start
-            if prev_stop == cur_start and prev_index.step == cur_index.step:
-                # continuous RangeIndex, still return RangeIndex
-                return prev_index.append(cur_index)
-            else:
-                # otherwise, return an empty index
-                return pd.Index([], dtype=prev_index.dtype)
-        elif isinstance(prev_index, pd.RangeIndex):
-            return pd.Index([], prev_index.dtype).append(cur_index)
-        elif isinstance(cur_index, pd.RangeIndex):
-            return prev_index.append(pd.Index([], cur_index.dtype))
-        return prev_index.append(cur_index)
+    def _concat_index(cls, df_or_series_list: Union[List[DataFrame], List[Series]]):
+        concat_index = None
+        all_indexes_have_value = all(
+            input.index_value.has_value() for input in df_or_series_list
+        )
+        def _concat(prev_index: pd.Index, cur_index: pd.Index):
+            if prev_index is None:
+                return cur_index
+            if (
+                all_indexes_have_value
+                and isinstance(prev_index, pd.RangeIndex)
+                and isinstance(cur_index, pd.RangeIndex)
+            ):
+                # handle RangeIndex that append may generate huge amount of data
+                # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
+                # will generate a Int64Index full of data
+                # for details see GH#1647
+                prev_stop = prev_index.start + prev_index.size * prev_index.step
+                cur_start = cur_index.start
+                if prev_stop == cur_start and prev_index.step == cur_index.step:
+                    # continuous RangeIndex, still return RangeIndex
+                    return prev_index.append(cur_index)
+                else:
+                    # otherwise, return an empty index
+                    return pd.Index([], dtype=prev_index.dtype)
+            elif isinstance(prev_index, pd.RangeIndex):
+                return pd.Index([], prev_index.dtype).append(cur_index)
+            elif isinstance(cur_index, pd.RangeIndex):
+                return prev_index.append(pd.Index([], cur_index.dtype))
+            return prev_index.append(cur_index)
+        for input in df_or_series_list:
+            concat_index = _concat(concat_index, input.index_value.to_pandas())
+        return concat_index
     def _call_series(self, objs):
         if self.axis == 0:
             row_length = 0
-            index = None
             for series in objs:
-                if index is None:
-                    index = series.index_value.to_pandas()
-                else:
-                    index = self._concat_index(index, series.index_value.to_pandas())
                 row_length += series.shape[0]
             if self.ignore_index:  # pragma: no cover
                 index_value = parse_index(pd.RangeIndex(row_length))
             else:
+                index = self._concat_index(objs)
                 index_value = parse_index(index, objs)
             obj_names = {obj.name for obj in objs}
             return self.new_series(
@@ -130,13 +144,8 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
     def _call_dataframes(self, objs):
         if self.axis == 0:
             row_length = 0
-            index = None
             empty_dfs = []
             for df in objs:
-                if index is None:
-                    index = df.index_value.to_pandas()
-                else:
-                    index = self._concat_index(index, df.index_value.to_pandas())
                 row_length += df.shape[0]
                 if df.ndim == 2:
                     empty_dfs.append(build_empty_df(df.dtypes))
@@ -153,6 +162,7 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
             if self.ignore_index:  # pragma: no cover
                 index_value = parse_index(pd.RangeIndex(row_length))
             else:
+                index = self._concat_index(objs)
                 index_value = parse_index(index, objs)
             new_objs = []

maxframe/dataframe/merge/merge.py CHANGED Viewed

@@ -353,7 +353,7 @@ def merge(
     df: Union[DataFrame, Series],
     right: Union[DataFrame, Series],
     how: str = "inner",
-    on: str = None,
+    on: Union[str, List[str]] = None,
     left_on: str = None,
     right_on: str = None,
     left_index: bool = False,

maxframe/dataframe/merge/tests/test_merge.py CHANGED Viewed

@@ -16,6 +16,7 @@ import numpy as np
 import pandas as pd
 import pytest
+from ....tests.utils import assert_mf_index_dtype
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
 from .. import DataFrameMerge, concat
@@ -161,7 +162,7 @@ def test_append():
     adf = mdf1.append(mdf2)
     assert adf.shape == (20, 4)
-    assert isinstance(adf.index_value.value, IndexValue.Int64Index)
+    assert_mf_index_dtype(adf.index_value.value, np.int64)
     mdf1 = from_pandas(df1, chunk_size=3)
     mdf2 = from_pandas(df2, chunk_size=3)
@@ -181,6 +182,7 @@ def test_concat():
     r = concat([mdf1, mdf2], axis="index")
     assert r.shape == (20, 4)
+    assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
     pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
     df3 = pd.DataFrame(

maxframe/dataframe/misc/apply.py CHANGED Viewed

@@ -35,6 +35,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import (
     build_df,
     build_series,
+    copy_func_scheduling_hints,
     make_dtype,
     make_dtypes,
     pack_func_args,
@@ -79,6 +80,8 @@ class ApplyOperator(
         if output_type:
             kw["_output_types"] = [output_type]
         super().__init__(**kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
     def _update_key(self):
         values = [v for v in self._values_ if v is not self.func] + [

maxframe/dataframe/misc/drop_duplicates.py CHANGED Viewed

@@ -43,7 +43,11 @@ class DataFrameDropDuplicates(DuplicateOperand):
             params["index_value"] = parse_index(pd.RangeIndex(-1))
         else:
             params["index_value"] = gen_unknown_index_value(
-                input_params["index_value"], op.keep, op.subset, type(op).__name__
+                input_params["index_value"],
+                op.keep,
+                op.subset,
+                type(op).__name__,
+                normalize_range_index=True,
             )
         params["shape"] = self._get_shape(input_params["shape"], op)
         return params

maxframe/dataframe/misc/map.py CHANGED Viewed

@@ -24,7 +24,7 @@ from ...serialization.serializables import AnyField, KeyField, StringField
 from ...utils import quiet_stdio
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import build_series
+from ..utils import build_series, copy_func_scheduling_hints
 class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
@@ -38,6 +38,8 @@ class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
         super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
         if not self.output_types:
             self.output_types = [OutputType.series]
+        if hasattr(self, "arg"):
+            copy_func_scheduling_hints(self.arg, self)
     def _set_inputs(self, inputs):
         super()._set_inputs(inputs)

maxframe/dataframe/misc/tests/test_misc.py CHANGED Viewed

@@ -20,6 +20,7 @@ from .... import opcodes
 from ....core import OutputType
 from ....dataframe import DataFrame
 from ....tensor.core import TENSOR_TYPE
+from ....udf import with_running_options
 from ... import eval as maxframe_eval
 from ... import get_dummies, to_numeric
 from ...arithmetic import DataFrameGreater, DataFrameLess
@@ -65,6 +66,17 @@ def test_transform():
     assert r.op._op_type_ == opcodes.TRANSFORM
     assert r.op.output_types[0] == OutputType.dataframe
+    def transform_df_with_param(row, param, k):
+        assert param == 5
+        assert k == "6"
+        return row
+    r = df.transform(transform_df_with_param, 1, 5, k="6")
+    assert all(v == np.dtype("int64") for v in r.dtypes) is True
+    assert r.shape == df.shape
+    assert r.op._op_type_ == opcodes.TRANSFORM
+    assert r.op.output_types[0] == OutputType.dataframe
     r = df.transform(lambda x: list(range(len(x))), axis=1)
     assert all(v == np.dtype("int64") for v in r.dtypes) is True
     assert r.shape == df.shape
@@ -349,7 +361,9 @@ def test_drop():
 def test_drop_duplicates():
     rs = np.random.RandomState(0)
     raw = pd.DataFrame(
-        rs.randint(1000, size=(20, 7)), columns=["c" + str(i + 1) for i in range(7)]
+        rs.randint(1000, size=(20, 7)),
+        columns=["c" + str(i + 1) for i in range(7)],
+        index=pd.Index(range(20), name="idx"),
     )
     raw["c7"] = [f"s{j}" for j in range(20)]
@@ -361,6 +375,12 @@ def test_drop_duplicates():
     with pytest.raises(KeyError):
         df.drop_duplicates(subset="c8")
+    # check index
+    distinct_df = df.drop_duplicates()
+    assert distinct_df.index_value.name == df.index_value.name
+    assert isinstance(df.index_value.to_pandas(), pd.RangeIndex)
+    assert not isinstance(distinct_df.index_value.to_pandas(), pd.RangeIndex)
     s = df["c7"]
     with pytest.raises(ValueError):
         s.drop_duplicates(method="unknown")
@@ -436,6 +456,7 @@ def test_apply():
     keys = [1, 2]
+    @with_running_options(engine="spe")
     def f(x, keys):
         if x["a"] in keys:
             return [1, 0]
@@ -451,6 +472,7 @@ def test_apply():
         keys=keys,
     )
     assert apply_df.shape == (3, 2)
+    assert apply_df.op.expect_engine == "SPE"
 def test_pivot_table():
@@ -474,7 +496,7 @@ def test_pivot_table():
     with pytest.raises(ValueError):
         df.pivot_table(values=["D", "E"], aggfunc="sum")
-    t = df.pivot_table(index="A")
+    t = df.pivot_table(index=["A", "B", "C"])
     assert isinstance(t.op, DataFrameGroupByAgg)
     t = df.pivot_table(index="A", values=["D", "E"], aggfunc="sum")
     assert isinstance(t.op, DataFrameGroupByAgg)

maxframe/dataframe/misc/transform.py CHANGED Viewed

@@ -27,6 +27,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import (
     build_df,
     build_series,
+    copy_func_scheduling_hints,
     make_dtypes,
     pack_func_args,
     parse_index,
@@ -49,10 +50,12 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
     def __init__(self, output_types=None, memory_scale=None, **kw):
         super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
     def _infer_df_func_returns(self, df, dtypes):
-        packed_funcs = self.get_packed_funcs(df)
-        test_df = self._build_stub_pandas_obj(df)
+        packed_funcs = self.func
+        test_df = _build_stub_pandas_obj(df, self.output_types[0])
         if self.output_types[0] == OutputType.dataframe:
             try:
                 with np.errstate(all="ignore"), quiet_stdio():
@@ -147,16 +150,18 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
                 index_value=new_index_value,
             )
-    def get_packed_funcs(self, df=None) -> Any:
-        stub_df = self._build_stub_pandas_obj(df or self.inputs[0])
-        return pack_func_args(stub_df, self.func, *self.args, **self.kwds)
-    def _build_stub_pandas_obj(self, df) -> Union[DataFrame, Series]:
-        # TODO: Simulate a dataframe with the corresponding indexes if self.func is
-        # a dict and axis=1
-        if self.output_types[0] == OutputType.dataframe:
-            return build_df(df, fill_value=1, size=1)
-        return build_series(df, size=1, name=df.name)
+def get_packed_funcs(df, output_type, func, *args, **kwds) -> Any:
+    stub_df = _build_stub_pandas_obj(df, output_type)
+    return pack_func_args(stub_df, func, *args, **kwds)
+def _build_stub_pandas_obj(df, output_type) -> Union[DataFrame, Series]:
+    # TODO: Simulate a dataframe with the corresponding indexes if self.func is
+    # a dict and axis=1
+    if output_type == OutputType.dataframe:
+        return build_df(df, fill_value=1, size=1)
+    return build_series(df, size=1, name=df.name)
 def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwargs):
@@ -229,13 +234,15 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
     1  2  3
     2  3  4
     """
+    call_agg = kwargs.pop("_call_agg", False)
+    func = get_packed_funcs(df, OutputType.dataframe, func, *args, **kwargs)
     op = TransformOperator(
         func=func,
         axis=axis,
         args=args,
         kwds=kwargs,
         output_types=[OutputType.dataframe],
-        call_agg=kwargs.pop("_call_agg", False),
+        call_agg=call_agg,
     )
     return op(df, dtypes=dtypes, skip_infer=skip_infer)
@@ -319,6 +326,8 @@ def series_transform(
     1  2  3
     2  3  4
     """
+    call_agg = kwargs.pop("_call_agg", False)
+    func = get_packed_funcs(series, OutputType.series, func, *args, **kwargs)
     op = TransformOperator(
         func=func,
         axis=axis,
@@ -326,7 +335,7 @@ def series_transform(
         args=args,
         kwds=kwargs,
         output_types=[OutputType.series],
-        call_agg=kwargs.pop("_call_agg", False),
+        call_agg=call_agg,
     )
     dtypes = (series.name, dtype) if dtype is not None else None
     return op(series, dtypes=dtypes, skip_infer=skip_infer)

maxframe/dataframe/reduction/__init__.py CHANGED Viewed

@@ -25,6 +25,7 @@ from .custom_reduction import DataFrameCustomReduction
 from .kurtosis import DataFrameKurtosis
 from .max import DataFrameMax
 from .mean import DataFrameMean
+from .median import DataFrameMedian
 from .min import DataFrameMin
 from .nunique import DataFrameNunique
 from .prod import DataFrameProd
@@ -50,6 +51,7 @@ def _install():
     from .kurtosis import kurt_dataframe, kurt_series
     from .max import max_dataframe, max_index, max_series
     from .mean import mean_dataframe, mean_series
+    from .median import median_dataframe, median_series
     from .min import min_dataframe, min_index, min_series
     from .nunique import nunique_dataframe, nunique_series
     from .prod import prod_dataframe, prod_series
@@ -68,6 +70,7 @@ def _install():
         ("min", min_series, min_dataframe),
         ("count", count_series, count_dataframe),
         ("mean", mean_series, mean_dataframe),
+        ("median", median_series, median_dataframe),
         ("var", var_series, var_dataframe),
         ("std", std_series, std_dataframe),
         ("all", all_series, all_dataframe),

maxframe/dataframe/reduction/aggregation.py CHANGED Viewed

@@ -71,6 +71,7 @@ _agg_functions = {
     "kurt": lambda x, skipna=True, bias=False: x.kurt(skipna=skipna, bias=bias),
     "kurtosis": lambda x, skipna=True, bias=False: x.kurtosis(skipna=skipna, bias=bias),
     "nunique": lambda x: x.nunique(),
+    "median": lambda x, skipna=True: x.median(skipna=skipna),
 }

maxframe/dataframe/reduction/median.py ADDED Viewed

@@ -0,0 +1,56 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ... import opcodes
+from ...core import OutputType
+from .core import DataFrameReductionMixin, DataFrameReductionOperator
+class DataFrameMedian(DataFrameReductionOperator, DataFrameReductionMixin):
+    _op_type_ = opcodes.MEDIAN
+    _func_name = "median"
+    @property
+    def is_atomic(self):
+        return True
+def median_series(df, axis=None, skipna=True, level=None, method=None):
+    op = DataFrameMedian(
+        axis=axis,
+        skipna=skipna,
+        level=level,
+        output_types=[OutputType.scalar if level is not None else OutputType.scalar],
+        method=method,
+    )
+    return op(df)
+def median_dataframe(
+    df,
+    axis=0,
+    skipna=True,
+    level=None,
+    numeric_only=None,
+    method=None,
+):
+    op = DataFrameMedian(
+        axis=axis,
+        skipna=skipna,
+        level=level,
+        numeric_only=numeric_only,
+        output_types=[OutputType.dataframe if level is not None else OutputType.series],
+        method=method,
+    )
+    return op(df)

maxframe/dataframe/reduction/tests/test_reduction.py CHANGED Viewed

@@ -23,6 +23,7 @@ import pytest
 from .... import dataframe as md
 from ....tensor import Tensor
+from ....tests.utils import assert_mf_index_dtype
 from ...core import DataFrame, IndexValue, OutputType, Series
 from ...datasource.dataframe import from_pandas as from_pandas_df
 from ...datasource.series import from_pandas as from_pandas_series
@@ -38,6 +39,7 @@ from .. import (
     DataFrameKurtosis,
     DataFrameMax,
     DataFrameMean,
+    DataFrameMedian,
     DataFrameMin,
     DataFrameNunique,
     DataFrameProd,
@@ -71,6 +73,7 @@ reduction_functions = [
     ("sem", DataFrameSem, FunctionOptions()),
     ("all", DataFrameAll, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
     ("any", DataFrameAny, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
+    ("median", DataFrameMedian, FunctionOptions()),
 ]
@@ -111,10 +114,7 @@ def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
     reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()
     assert isinstance(reduction_df, Series)
-    assert isinstance(
-        reduction_df.index_value._index_value,
-        (IndexValue.RangeIndex, IndexValue.Int64Index),
-    )
+    assert_mf_index_dtype(reduction_df.index_value._index_value, np.int64)
     assert reduction_df.shape == (10,)
     data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
@@ -210,6 +210,7 @@ def test_dataframe_aggregate():
         "skew",
         "kurt",
         "sem",
+        "median",
     ]
     df = from_pandas_df(data)
@@ -253,7 +254,7 @@ def test_dataframe_aggregate():
     assert result.op.output_types[0] == OutputType.dataframe
     assert result.op.func == agg_funcs
-    dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std"]}
+    dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std", "median"]}
     all_cols = set(
         reduce(
             operator.add, [[v] if isinstance(v, str) else v for v in dict_fun.values()]
@@ -268,9 +269,9 @@ def test_dataframe_aggregate():
     assert result.op.func[2] == dict_fun[2]
     with pytest.raises(TypeError):
-        df.agg(sum_0="sum", mean_0="mean")
+        df.agg(sum_0="sum", mean_0="mean", median_0="median")
     with pytest.raises(NotImplementedError):
-        df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}, axis=1)
+        df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std", "median"]}, axis=1)
 def test_series_aggregate():
@@ -287,6 +288,7 @@ def test_series_aggregate():
         "skew",
         "kurt",
         "sem",
+        "median",
     ]
     series = from_pandas_series(data)
@@ -303,6 +305,14 @@ def test_series_aggregate():
     assert result.shape == ()
     assert result.op.output_types[0] == OutputType.scalar
+    result = series.agg("median")
+    assert result.shape == ()
+    assert result.op.output_types[0] == OutputType.scalar
+    result = series.median(level=0)
+    assert result.shape == (np.nan,)
+    assert result.op.output_types[0] == OutputType.series
     result = series.agg(agg_funcs)
     assert result.shape == (len(agg_funcs),)
     assert list(result.index_value.to_pandas()) == agg_funcs

maxframe/dataframe/statistics/quantile.py CHANGED Viewed

@@ -81,7 +81,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
             store_index_value = False
         else:
             q_val = np.asanyarray(self.q)
-            pd_index = pd.Index(q_val)
+            if q_val.ndim == 0:
+                pd_index = pd.Index(q_val.reshape(1))
+            else:
+                pd_index = pd.Index(q_val)
             name = self.q if q_val.size == 1 else None
             store_index_value = True
         tokenize_objects = (a, q_val, self.interpolation, type(self).__name__)
@@ -164,7 +167,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
             store_index_value = False
         else:
             q_val = np.asanyarray(self.q)
-            index_val = pd.Index(q_val)
+            if q_val.ndim == 0:
+                index_val = pd.Index(q_val.reshape(1))
+            else:
+                index_val = pd.Index(q_val)
             store_index_value = True
         # get dtype by tensor

maxframe/dataframe/statistics/tests/test_statistics.py CHANGED Viewed

@@ -49,7 +49,7 @@ def test_dataframe_quantile():
     # q = 0.3, axis = 0
     r = s.quantile(0.3)
-    e = raw.quantile(0.3)
+    e = raw.quantile(0.3, numeric_only=True)
     assert isinstance(r, Series)
     assert r.shape == (2,)
     assert r.dtype == e.dtype
@@ -57,7 +57,7 @@ def test_dataframe_quantile():
     # q = 0.3, axis = 1
     r = s.quantile(0.3, axis=1)
-    e = raw.quantile(0.3, axis=1)
+    e = raw.quantile(0.3, numeric_only=True, axis=1)
     assert isinstance(r, Series)
     assert r.shape == e.shape
     assert r.dtype == e.dtype
@@ -65,7 +65,7 @@ def test_dataframe_quantile():
     # q = [0.3, 0.7], axis = 0
     r = s.quantile([0.3, 0.7])
-    e = raw.quantile([0.3, 0.7])
+    e = raw.quantile([0.3, 0.7], numeric_only=True)
     assert isinstance(r, DataFrame)
     assert r.shape == e.shape
     pd.testing.assert_series_equal(r.dtypes, e.dtypes)
@@ -74,7 +74,7 @@ def test_dataframe_quantile():
     # q = [0.3, 0.7], axis = 1
     r = s.quantile([0.3, 0.7], axis=1)
-    e = raw.quantile([0.3, 0.7], axis=1)
+    e = raw.quantile([0.3, 0.7], numeric_only=True, axis=1)
     assert isinstance(r, DataFrame)
     assert r.shape == e.shape
     pd.testing.assert_series_equal(r.dtypes, e.dtypes)