PyPI - maxframe - Versions diffs - 1.0.0rc3__cp39-cp39-win_amd64.whl → 1.1.0__cp39-cp39-win_amd64.whl - Mend

maxframe 1.0.0rc3__cp39-cp39-win_amd64.whl → 1.1.0__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (112)

maxframe/_utils.cp39-win_amd64.pyd +0 -0
maxframe/codegen.py +1 -0
maxframe/config/config.py +16 -1
maxframe/conftest.py +52 -14
maxframe/core/entity/executable.py +1 -1
maxframe/core/graph/core.cp39-win_amd64.pyd +0 -0
maxframe/core/operator/base.py +2 -0
maxframe/dataframe/arithmetic/docstring.py +26 -2
maxframe/dataframe/arithmetic/equal.py +4 -2
maxframe/dataframe/arithmetic/greater.py +4 -2
maxframe/dataframe/arithmetic/greater_equal.py +4 -2
maxframe/dataframe/arithmetic/less.py +2 -2
maxframe/dataframe/arithmetic/less_equal.py +4 -2
maxframe/dataframe/arithmetic/not_equal.py +4 -2
maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
maxframe/dataframe/core.py +26 -2
maxframe/dataframe/datasource/read_odps_query.py +116 -28
maxframe/dataframe/datasource/read_odps_table.py +3 -1
maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
maxframe/dataframe/datastore/to_odps.py +7 -0
maxframe/dataframe/extensions/__init__.py +8 -0
maxframe/dataframe/extensions/apply_chunk.py +649 -0
maxframe/dataframe/extensions/flatjson.py +131 -0
maxframe/dataframe/extensions/flatmap.py +314 -0
maxframe/dataframe/extensions/reshuffle.py +1 -1
maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
maxframe/dataframe/groupby/__init__.py +1 -0
maxframe/dataframe/groupby/aggregation.py +1 -0
maxframe/dataframe/groupby/apply.py +9 -1
maxframe/dataframe/groupby/core.py +1 -1
maxframe/dataframe/groupby/fill.py +4 -1
maxframe/dataframe/groupby/getitem.py +6 -0
maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
maxframe/dataframe/groupby/transform.py +8 -2
maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
maxframe/dataframe/indexing/loc.py +6 -4
maxframe/dataframe/indexing/rename.py +11 -0
maxframe/dataframe/initializer.py +11 -1
maxframe/dataframe/merge/__init__.py +9 -1
maxframe/dataframe/merge/concat.py +41 -31
maxframe/dataframe/merge/merge.py +1 -1
maxframe/dataframe/merge/tests/test_merge.py +3 -1
maxframe/dataframe/misc/apply.py +3 -0
maxframe/dataframe/misc/drop_duplicates.py +23 -2
maxframe/dataframe/misc/map.py +3 -1
maxframe/dataframe/misc/tests/test_misc.py +24 -2
maxframe/dataframe/misc/transform.py +22 -13
maxframe/dataframe/reduction/__init__.py +3 -0
maxframe/dataframe/reduction/aggregation.py +1 -0
maxframe/dataframe/reduction/median.py +56 -0
maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
maxframe/dataframe/statistics/quantile.py +8 -2
maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
maxframe/dataframe/tests/test_initializer.py +33 -2
maxframe/dataframe/tests/test_utils.py +60 -0
maxframe/dataframe/utils.py +110 -7
maxframe/dataframe/window/expanding.py +5 -3
maxframe/dataframe/window/tests/test_expanding.py +2 -2
maxframe/io/objects/tests/test_object_io.py +39 -12
maxframe/io/odpsio/arrow.py +30 -2
maxframe/io/odpsio/schema.py +28 -8
maxframe/io/odpsio/tableio.py +55 -133
maxframe/io/odpsio/tests/test_schema.py +40 -4
maxframe/io/odpsio/tests/test_tableio.py +5 -5
maxframe/io/odpsio/tests/test_volumeio.py +35 -11
maxframe/io/odpsio/volumeio.py +36 -6
maxframe/learn/contrib/__init__.py +3 -1
maxframe/learn/contrib/graph/__init__.py +15 -0
maxframe/learn/contrib/graph/connected_components.py +215 -0
maxframe/learn/contrib/graph/tests/__init__.py +13 -0
maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
maxframe/learn/contrib/llm/__init__.py +16 -0
maxframe/learn/contrib/llm/core.py +54 -0
maxframe/learn/contrib/llm/models/__init__.py +14 -0
maxframe/learn/contrib/llm/models/dashscope.py +73 -0
maxframe/learn/contrib/llm/multi_modal.py +42 -0
maxframe/learn/contrib/llm/text.py +42 -0
maxframe/learn/contrib/xgboost/classifier.py +3 -3
maxframe/learn/contrib/xgboost/predict.py +8 -39
maxframe/learn/contrib/xgboost/train.py +4 -3
maxframe/lib/mmh3.cp39-win_amd64.pyd +0 -0
maxframe/lib/sparse/tests/test_sparse.py +15 -15
maxframe/opcodes.py +10 -1
maxframe/protocol.py +6 -1
maxframe/serialization/core.cp39-win_amd64.pyd +0 -0
maxframe/serialization/core.pyx +13 -1
maxframe/serialization/pandas.py +50 -20
maxframe/serialization/serializables/core.py +24 -5
maxframe/serialization/serializables/field_type.py +4 -1
maxframe/serialization/serializables/tests/test_serializable.py +8 -1
maxframe/serialization/tests/test_serial.py +2 -1
maxframe/session.py +9 -2
maxframe/tensor/__init__.py +19 -7
maxframe/tensor/indexing/getitem.py +2 -0
maxframe/tensor/merge/concatenate.py +23 -20
maxframe/tensor/merge/vstack.py +5 -1
maxframe/tensor/misc/transpose.py +1 -1
maxframe/tests/utils.py +16 -0
maxframe/udf.py +27 -0
maxframe/utils.py +64 -14
{maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
{maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
{maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
maxframe_client/clients/framedriver.py +4 -1
maxframe_client/fetcher.py +28 -10
maxframe_client/session/consts.py +3 -0
maxframe_client/session/odps.py +104 -20
maxframe_client/session/task.py +42 -26
maxframe_client/session/tests/test_task.py +0 -4
maxframe_client/tests/test_session.py +44 -12
{maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0

maxframe/dataframe/extensions/tests/test_extensions.py CHANGED Viewed

@@ -11,12 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import numpy as np
 import pandas as pd
 import pytest
 from .... import dataframe as md
-from ...core import IndexValue
+from ....tests.utils import assert_mf_index_dtype
+from ... import DataFrame
+from ...core import DATAFRAME_TYPE, SERIES_TYPE, IndexValue
 from ..reshuffle import DataFrameReshuffle
@@ -31,8 +33,111 @@ def test_reshuffle():
     r = mdf.mf.reshuffle()
     assert isinstance(r.op, DataFrameReshuffle)
-    assert isinstance(r.index_value.value, IndexValue.Int64Index)
+    assert_mf_index_dtype(r.index_value.value, np.int64)
     r = mdf.mf.reshuffle(ignore_index=True)
     assert isinstance(r.op, DataFrameReshuffle)
     assert isinstance(r.index_value.value, IndexValue.RangeIndex)
+@pytest.fixture
+def df1():
+    return DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
+@pytest.fixture
+def df2():
+    return DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["a", "b", "c"])
+@pytest.fixture
+def df3():
+    return DataFrame(
+        [[1, 2, 3], [1, 2, 3], [1, 2, 3]],
+        columns=["a", "b", "c"],
+        index=pd.MultiIndex.from_arrays([[1, 2, 3], [1, 2, 3]], names=["A", "B"]),
+    )
+def test_flatmap(df1, df2, df3):
+    def f(x, keys):
+        if x["a"] in keys:
+            yield [1, 0]
+            yield [0, 1]
+    apply_df = df1[["a"]].mf.flatmap(
+        f,
+        dtypes={"a": "int64", "b": "int64"},
+    )
+    assert apply_df.shape == (np.nan, 2)
+    assert df1.index_value.key != apply_df.index_value.key
+    assert isinstance(df1.index_value.to_pandas(), pd.RangeIndex)
+    assert not isinstance(apply_df.index_value.to_pandas(), pd.RangeIndex)
+    apply_df = df2[["a"]].mf.flatmap(
+        f,
+        dtypes=pd.Series(["int64", "int64"]),
+    )
+    assert apply_df.shape == (np.nan, 2)
+    assert df2.index_value.key != apply_df.index_value.key
+    with pytest.raises(TypeError):
+        apply_s = df3["a"].mf.flatmap(
+            f,
+        )
+    apply_s = df3["a"].mf.flatmap(
+        f,
+        dtype="int64",
+    )
+    assert apply_s.shape == (np.nan,)
+    assert df3.index_value.key != apply_s.index_value.key
+    assert df3.key != apply_s.index_value.key
+    apply_s = df3["a"].mf.flatmap(
+        f,
+        output_type="dataframe",
+        dtypes=["int64", "int64"],
+    )
+    assert apply_s.shape == (np.nan, 2)
+    assert df3.index_value.key != apply_s.index_value.key
+    assert df3.key != apply_s.index_value.key
+def test_flatjson():
+    s1 = md.Series(["{{'a': 1, 'b': false}}"], index=[1])
+    df1 = s1.mf.flatjson(
+        ["$.a", "$.b"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"])
+    )
+    assert df1.shape == (1, 2)
+    assert df1.index_value.key == s1.index_value.key
+    assert isinstance(df1, DATAFRAME_TYPE)
+    assert list(df1.dtypes) == [np.dtype("int32"), np.dtype("bool")]
+    assert list(df1.dtypes.index) == ["a", "b"]
+    df2 = s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32"], index=["a"]))
+    assert df2.shape == (1, 1)
+    assert df2.index_value.key == s1.index_value.key
+    assert isinstance(df2, DATAFRAME_TYPE)
+    assert list(df2.dtypes) == [np.dtype("int32")]
+    assert list(df2.dtypes.index) == ["a"]
+    s2 = s1.mf.flatjson("$.a", dtype="int32", name="a")
+    assert s2.shape == (1,)
+    assert s2.index_value.key == s1.index_value.key
+    assert isinstance(s2, SERIES_TYPE)
+    assert s2.dtype == np.dtype("int32")
+    assert s2.name == "a"
+    with pytest.raises(ValueError):
+        s1.mf.flatjson([], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
+    with pytest.raises(ValueError):
+        s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
+    with pytest.raises(ValueError):
+        s1.mf.flatjson(["$.a"], dtypes=pd.Series(["int32", "bool"], index=["a", "b"]))
+    with pytest.raises(ValueError):
+        s1.mf.flatjson(["$.a", "$.b"], dtypes=pd.Series(["bool"], index=["b"]))
+    with pytest.raises(ValueError):
+        s1.mf.flatjson(
+            ["$.a"],
+            dtype="int32",
+            dtypes=pd.Series(["int32"], index=["a"]),
+        )
+    with pytest.raises(ValueError):
+        s1.mf.flatjson(["$.a"])

maxframe/dataframe/groupby/__init__.py CHANGED Viewed

@@ -55,6 +55,7 @@ def _install():
         setattr(cls, "kurtosis", lambda groupby, **kw: agg(groupby, "kurtosis", **kw))
         setattr(cls, "sem", lambda groupby, **kw: agg(groupby, "sem", **kw))
         setattr(cls, "nunique", lambda groupby, **kw: agg(groupby, "nunique", **kw))
+        setattr(cls, "median", lambda groupby, **kw: agg(groupby, "median", **kw))
         setattr(cls, "apply", groupby_apply)
         setattr(cls, "transform", groupby_transform)

maxframe/dataframe/groupby/aggregation.py CHANGED Viewed

@@ -79,6 +79,7 @@ _agg_functions = {
     "kurt": lambda x, bias=False: x.kurt(bias=bias),
     "kurtosis": lambda x, bias=False: x.kurtosis(bias=bias),
     "nunique": lambda x: x.nunique(),
+    "median": lambda x: x.median(),
 }
 _series_col_name = "col_name"

maxframe/dataframe/groupby/apply.py CHANGED Viewed

@@ -28,7 +28,13 @@ from ...serialization.serializables import (
 )
 from ...utils import get_func_token, quiet_stdio, tokenize
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import make_dtype, make_dtypes, parse_index, validate_output_types
+from ..utils import (
+    copy_func_scheduling_hints,
+    make_dtype,
+    make_dtypes,
+    parse_index,
+    validate_output_types,
+)
 class GroupByApplyLogicKeyGeneratorMixin(OperatorLogicKeyGeneratorMixin):
@@ -56,6 +62,8 @@ class GroupByApply(
     def __init__(self, output_types=None, **kw):
         super().__init__(_output_types=output_types, **kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
     def _update_key(self):
         values = [v for v in self._values_ if v is not self.func] + [

maxframe/dataframe/groupby/core.py CHANGED Viewed

@@ -28,7 +28,7 @@ from ..utils import build_df, build_series, parse_index
 cudf = lazy_import("cudf")
-_GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0)
+_GROUP_KEYS_NO_DEFAULT = pd_release_version[:2] == (1, 5)
 _default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True

maxframe/dataframe/groupby/fill.py CHANGED Viewed

@@ -35,12 +35,15 @@ class GroupByFillOperator(DataFrameOperator, DataFrameOperatorMixin):
         func_name = getattr(self, "_func_name")
         if func_name == "fillna":
+            kw = {}
+            if self.axis is not None:
+                kw["axis"] = self.axis
             result_df = mock_groupby.fillna(
                 value=self.value,
                 method=self.method,
-                axis=self.axis,
                 limit=self.limit,
                 downcast=self.downcast,
+                **kw,
             )
         else:
             result_df = getattr(mock_groupby, func_name)(limit=self.limit)

maxframe/dataframe/groupby/getitem.py CHANGED Viewed

@@ -88,5 +88,11 @@ def df_groupby_getitem(df_groupby, item):
     if df_groupby.selection:
         raise IndexError(f"Column(s) {df_groupby.selection!r} already selected")
+    if (
+        isinstance(item, tuple)
+        and item not in df_groupby.dtypes
+        and item not in df_groupby.index.names
+    ):
+        item = list(item)
     op = GroupByIndex(selection=item, output_types=output_types)
     return op(df_groupby)

maxframe/dataframe/groupby/tests/test_groupby.py CHANGED Viewed

@@ -230,7 +230,7 @@ def test_groupby_transform():
     assert r.op._op_type_ == opcodes.TRANSFORM
     assert r.op.output_types[0] == OutputType.dataframe
-    r = mdf.groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
+    r = mdf[list("abde")].groupby("b").transform(["cummax", "cumcount"], _call_agg=True)
     assert r.shape == (np.nan, 6)
     assert r.op._op_type_ == opcodes.TRANSFORM
     assert r.op.output_types[0] == OutputType.dataframe

maxframe/dataframe/groupby/transform.py CHANGED Viewed

@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 import numpy as np
 import pandas as pd
@@ -20,7 +22,9 @@ from ...core import OutputType
 from ...serialization.serializables import AnyField, BoolField, DictField, TupleField
 from ...utils import quiet_stdio
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import parse_index
+from ..utils import copy_func_scheduling_hints, parse_index
+logger = logging.getLogger(__name__)
 class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
@@ -35,6 +39,8 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
     def __init__(self, output_types=None, **kw):
         super().__init__(_output_types=output_types, **kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
     def _infer_df_func_returns(self, in_groupby, dtypes, index):
         index_value, output_types, new_dtypes = None, None, None
@@ -65,7 +71,7 @@ class GroupByTransform(DataFrameOperator, DataFrameOperatorMixin):
                 output_types = [OutputType.series]
                 new_dtypes = new_dtypes or (infer_df.name, infer_df.dtype)
         except:  # noqa: E722  # nosec
-            pass
+            logger.info("Exception raised while inferring df_func", exc_info=True)
         self.output_types = output_types if not self.output_types else self.output_types
         dtypes = new_dtypes if dtypes is None else dtypes

maxframe/dataframe/indexing/add_prefix_suffix.py CHANGED Viewed

@@ -51,7 +51,7 @@ def _get_prefix_suffix_docs(is_prefix: bool):
     Examples
     --------
     >>> import maxframe.dataframe as md
-        >>> s = md.Series([1, 2, 3, 4])
+    >>> s = md.Series([1, 2, 3, 4])
     >>> s.execute()
     0    1
     1    2

maxframe/dataframe/indexing/loc.py CHANGED Viewed

@@ -25,13 +25,14 @@ from ...core import ENTITY_TYPE, OutputType
 from ...serialization.serializables import AnyField, KeyField, ListField
 from ...tensor.datasource import asarray
 from ...tensor.utils import calc_sliced_size, filter_inputs
-from ...utils import is_full_slice, lazy_import
+from ...utils import is_full_slice, lazy_import, pd_release_version
 from ..core import DATAFRAME_TYPE, IndexValue
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
 from .iloc import DataFrameIlocSetItem
 cudf = lazy_import("cudf")
+with_slice_locs_kind = pd_release_version < (1, 4, 0)
 def process_loc_indexes(inp, indexes, fetch_index: bool = True):
@@ -210,9 +211,10 @@ class DataFrameLocGetItem(DataFrameOperator, DataFrameOperatorMixin):
             if axis == 1:
                 param["dtypes"] = inp.dtypes
         elif input_index_value.has_value():
-            start, end = pd_index.slice_locs(
-                index.start, index.stop, index.step, kind="loc"
-            )
+            kw = {}
+            if with_slice_locs_kind:
+                kw["kind"] = "loc"
+            start, end = pd_index.slice_locs(index.start, index.stop, index.step, **kw)
             slc = slice(start, end, index.step)
             size = calc_sliced_size(inp.shape[axis], slc)
             param["shape"] = size

maxframe/dataframe/indexing/rename.py CHANGED Viewed

@@ -248,6 +248,7 @@ def df_rename(
     )
+# fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/58
 def series_rename(
     series,
     index=None,
@@ -382,6 +383,7 @@ def index_rename(index, name, inplace=False):
         return ret
+# fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/59
 def index_set_names(index, names, level=None, inplace=False):
     """
     Set Index or MultiIndex name.
@@ -407,6 +409,15 @@ def index_set_names(index, names, level=None, inplace=False):
     See Also
     --------
     Index.rename : Able to set new names without level.
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> idx = md.Index([1, 2, 3, 4])
+    >>> idx.execute()
+    Int64Index([1, 2, 3, 4], dtype='int64')
+    >>> idx.set_names('quarter').execute()
+    Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
     """
     op = DataFrameRename(
         index_mapper=names, level=level, output_types=get_output_types(index)

maxframe/dataframe/initializer.py CHANGED Viewed

@@ -15,6 +15,7 @@
 from typing import Union
 import pandas as pd
+from pandas.api.types import is_list_like
 from pandas.core.dtypes.common import pandas_dtype
 from ..core import ENTITY_TYPE
@@ -61,6 +62,8 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
         num_partitions=None,
     ):
         need_repart = False
+        if columns is not None and not is_list_like(columns):
+            raise ValueError("columns must be a list-like object")
         if isinstance(data, TENSOR_TYPE):
             if chunk_size is not None:
                 data = data.rechunk(chunk_size)
@@ -69,7 +72,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
             )
             need_repart = num_partitions is not None
         elif isinstance(data, SERIES_TYPE):
-            df = data.to_frame()
+            if columns is not None and len(columns) != 1:
+                raise ValueError("columns' length must be 1 when data is Series")
+            col_name = columns[0] if columns else None
+            df = data.to_frame(name=col_name)
             need_repart = num_partitions is not None
         elif isinstance(data, DATAFRAME_TYPE):
             if not hasattr(data, "data"):
@@ -77,6 +83,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
                 df = _Frame(data)
             else:
                 df = data
+            if columns is not None:
+                if len(df.columns) != len(columns):
+                    raise ValueError("columns' length must be equal to the data's")
+                df.columns = columns
             need_repart = num_partitions is not None
         elif isinstance(data, dict) and self._can_process_by_1d_tileables(data):
             # data is a dict and some value is tensor

maxframe/dataframe/merge/__init__.py CHANGED Viewed

@@ -14,7 +14,15 @@
 from .append import DataFrameAppend, append
 from .concat import DataFrameConcat, concat
-from .merge import DataFrameMerge, DataFrameMergeAlign, join, merge
+from .merge import (
+    DataFrameMerge,
+    DataFrameMergeAlign,
+    DistributedMapJoinHint,
+    MapJoinHint,
+    SkewJoinHint,
+    join,
+    merge,
+)
 def _install():

maxframe/dataframe/merge/concat.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import List, Union
 import pandas as pd
@@ -24,6 +25,7 @@ from ...serialization.serializables import (
     StringField,
 )
 from ...utils import lazy_import
+from ..core import DataFrame, Series
 from ..operators import SERIES_TYPE, DataFrameOperator, DataFrameOperatorMixin
 from ..utils import build_empty_df, build_empty_series, parse_index, validate_axis
@@ -55,41 +57,53 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
         return self.names
     @classmethod
-    def _concat_index(cls, prev_index: pd.Index, cur_index: pd.Index):
-        if isinstance(prev_index, pd.RangeIndex) and isinstance(
-            cur_index, pd.RangeIndex
-        ):
-            # handle RangeIndex that append may generate huge amount of data
-            # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
-            # will generate a Int64Index full of data
-            # for details see GH#1647
-            prev_stop = prev_index.start + prev_index.size * prev_index.step
-            cur_start = cur_index.start
-            if prev_stop == cur_start and prev_index.step == cur_index.step:
-                # continuous RangeIndex, still return RangeIndex
-                return prev_index.append(cur_index)
-            else:
-                # otherwise, return an empty index
-                return pd.Index([], dtype=prev_index.dtype)
-        elif isinstance(prev_index, pd.RangeIndex):
-            return pd.Index([], prev_index.dtype).append(cur_index)
-        elif isinstance(cur_index, pd.RangeIndex):
-            return prev_index.append(pd.Index([], cur_index.dtype))
-        return prev_index.append(cur_index)
+    def _concat_index(cls, df_or_series_list: Union[List[DataFrame], List[Series]]):
+        concat_index = None
+        all_indexes_have_value = all(
+            input.index_value.has_value() for input in df_or_series_list
+        )
+        def _concat(prev_index: pd.Index, cur_index: pd.Index):
+            if prev_index is None:
+                return cur_index
+            if (
+                all_indexes_have_value
+                and isinstance(prev_index, pd.RangeIndex)
+                and isinstance(cur_index, pd.RangeIndex)
+            ):
+                # handle RangeIndex that append may generate huge amount of data
+                # e.g. pd.RangeIndex(10_000) and pd.RangeIndex(10_000)
+                # will generate a Int64Index full of data
+                # for details see GH#1647
+                prev_stop = prev_index.start + prev_index.size * prev_index.step
+                cur_start = cur_index.start
+                if prev_stop == cur_start and prev_index.step == cur_index.step:
+                    # continuous RangeIndex, still return RangeIndex
+                    return prev_index.append(cur_index)
+                else:
+                    # otherwise, return an empty index
+                    return pd.Index([], dtype=prev_index.dtype)
+            elif isinstance(prev_index, pd.RangeIndex):
+                return pd.Index([], prev_index.dtype).append(cur_index)
+            elif isinstance(cur_index, pd.RangeIndex):
+                return prev_index.append(pd.Index([], cur_index.dtype))
+            return prev_index.append(cur_index)
+        for input in df_or_series_list:
+            concat_index = _concat(concat_index, input.index_value.to_pandas())
+        return concat_index
     def _call_series(self, objs):
         if self.axis == 0:
             row_length = 0
-            index = None
             for series in objs:
-                if index is None:
-                    index = series.index_value.to_pandas()
-                else:
-                    index = self._concat_index(index, series.index_value.to_pandas())
                 row_length += series.shape[0]
             if self.ignore_index:  # pragma: no cover
                 index_value = parse_index(pd.RangeIndex(row_length))
             else:
+                index = self._concat_index(objs)
                 index_value = parse_index(index, objs)
             obj_names = {obj.name for obj in objs}
             return self.new_series(
@@ -130,13 +144,8 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
     def _call_dataframes(self, objs):
         if self.axis == 0:
             row_length = 0
-            index = None
             empty_dfs = []
             for df in objs:
-                if index is None:
-                    index = df.index_value.to_pandas()
-                else:
-                    index = self._concat_index(index, df.index_value.to_pandas())
                 row_length += df.shape[0]
                 if df.ndim == 2:
                     empty_dfs.append(build_empty_df(df.dtypes))
@@ -153,6 +162,7 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
             if self.ignore_index:  # pragma: no cover
                 index_value = parse_index(pd.RangeIndex(row_length))
             else:
+                index = self._concat_index(objs)
                 index_value = parse_index(index, objs)
             new_objs = []

maxframe/dataframe/merge/merge.py CHANGED Viewed

@@ -353,7 +353,7 @@ def merge(
     df: Union[DataFrame, Series],
     right: Union[DataFrame, Series],
     how: str = "inner",
-    on: str = None,
+    on: Union[str, List[str]] = None,
     left_on: str = None,
     right_on: str = None,
     left_index: bool = False,

maxframe/dataframe/merge/tests/test_merge.py CHANGED Viewed

@@ -16,6 +16,7 @@ import numpy as np
 import pandas as pd
 import pytest
+from ....tests.utils import assert_mf_index_dtype
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
 from .. import DataFrameMerge, concat
@@ -161,7 +162,7 @@ def test_append():
     adf = mdf1.append(mdf2)
     assert adf.shape == (20, 4)
-    assert isinstance(adf.index_value.value, IndexValue.Int64Index)
+    assert_mf_index_dtype(adf.index_value.value, np.int64)
     mdf1 = from_pandas(df1, chunk_size=3)
     mdf2 = from_pandas(df2, chunk_size=3)
@@ -181,6 +182,7 @@ def test_concat():
     r = concat([mdf1, mdf2], axis="index")
     assert r.shape == (20, 4)
+    assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
     pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
     df3 = pd.DataFrame(

maxframe/dataframe/misc/apply.py CHANGED Viewed

@@ -35,6 +35,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import (
     build_df,
     build_series,
+    copy_func_scheduling_hints,
     make_dtype,
     make_dtypes,
     pack_func_args,
@@ -79,6 +80,8 @@ class ApplyOperator(
         if output_type:
             kw["_output_types"] = [output_type]
         super().__init__(**kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
     def _update_key(self):
         values = [v for v in self._values_ if v is not self.func] + [

maxframe/dataframe/misc/drop_duplicates.py CHANGED Viewed

@@ -43,7 +43,11 @@ class DataFrameDropDuplicates(DuplicateOperand):
             params["index_value"] = parse_index(pd.RangeIndex(-1))
         else:
             params["index_value"] = gen_unknown_index_value(
-                input_params["index_value"], op.keep, op.subset, type(op).__name__
+                input_params["index_value"],
+                op.keep,
+                op.subset,
+                type(op).__name__,
+                normalize_range_index=True,
             )
         params["shape"] = self._get_shape(input_params["shape"], op)
         return params
@@ -104,7 +108,6 @@ def df_drop_duplicates(
 def series_drop_duplicates(
     series, keep="first", inplace=False, ignore_index=False, method="auto"
 ):
-    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/12
     """
     Return Series with duplicate values removed.
@@ -148,6 +151,24 @@ def series_drop_duplicates(
     5     hippo
     Name: animal, dtype: object
+    With the 'keep' parameter, the selection behaviour of duplicated values
+    can be changed. The value 'first' keeps the first occurrence for each
+    set of duplicated entries. The default value of keep is 'first'.
+    >>> s.drop_duplicates().execute()
+    0      lame
+    1       cow
+    3    beetle
+    5     hippo
+    Name: animal, dtype: object
+    The value 'last' for parameter 'keep' keeps the last occurrence for
+    each set of duplicated entries.
+    >>> s.drop_duplicates(keep='last').execute()
+    1       cow
+    3    beetle
+    4      lame
+    5     hippo
+    Name: animal, dtype: object
     The value ``False`` for parameter 'keep' discards all sets of
     duplicated entries. Setting the value of 'inplace' to ``True`` performs
     the operation inplace and returns ``None``.

maxframe/dataframe/misc/map.py CHANGED Viewed

@@ -24,7 +24,7 @@ from ...serialization.serializables import AnyField, KeyField, StringField
 from ...utils import quiet_stdio
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import build_series
+from ..utils import build_series, copy_func_scheduling_hints
 class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
@@ -38,6 +38,8 @@ class DataFrameMap(DataFrameOperator, DataFrameOperatorMixin):
         super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
         if not self.output_types:
             self.output_types = [OutputType.series]
+        if hasattr(self, "arg"):
+            copy_func_scheduling_hints(self.arg, self)
     def _set_inputs(self, inputs):
         super()._set_inputs(inputs)