PyPI - maxframe - Versions diffs - 1.0.0rc3__cp311-cp311-win_amd64.whl → 1.1.0__cp311-cp311-win_amd64.whl - Mend

maxframe 1.0.0rc3__cp311-cp311-win_amd64.whl → 1.1.0__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (112) hide show

maxframe/_utils.cp311-win_amd64.pyd +0 -0
maxframe/codegen.py +1 -0
maxframe/config/config.py +16 -1
maxframe/conftest.py +52 -14
maxframe/core/entity/executable.py +1 -1
maxframe/core/graph/core.cp311-win_amd64.pyd +0 -0
maxframe/core/operator/base.py +2 -0
maxframe/dataframe/arithmetic/docstring.py +26 -2
maxframe/dataframe/arithmetic/equal.py +4 -2
maxframe/dataframe/arithmetic/greater.py +4 -2
maxframe/dataframe/arithmetic/greater_equal.py +4 -2
maxframe/dataframe/arithmetic/less.py +2 -2
maxframe/dataframe/arithmetic/less_equal.py +4 -2
maxframe/dataframe/arithmetic/not_equal.py +4 -2
maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
maxframe/dataframe/core.py +26 -2
maxframe/dataframe/datasource/read_odps_query.py +116 -28
maxframe/dataframe/datasource/read_odps_table.py +3 -1
maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
maxframe/dataframe/datastore/to_odps.py +7 -0
maxframe/dataframe/extensions/__init__.py +8 -0
maxframe/dataframe/extensions/apply_chunk.py +649 -0
maxframe/dataframe/extensions/flatjson.py +131 -0
maxframe/dataframe/extensions/flatmap.py +314 -0
maxframe/dataframe/extensions/reshuffle.py +1 -1
maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
maxframe/dataframe/groupby/__init__.py +1 -0
maxframe/dataframe/groupby/aggregation.py +1 -0
maxframe/dataframe/groupby/apply.py +9 -1
maxframe/dataframe/groupby/core.py +1 -1
maxframe/dataframe/groupby/fill.py +4 -1
maxframe/dataframe/groupby/getitem.py +6 -0
maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
maxframe/dataframe/groupby/transform.py +8 -2
maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
maxframe/dataframe/indexing/loc.py +6 -4
maxframe/dataframe/indexing/rename.py +11 -0
maxframe/dataframe/initializer.py +11 -1
maxframe/dataframe/merge/__init__.py +9 -1
maxframe/dataframe/merge/concat.py +41 -31
maxframe/dataframe/merge/merge.py +1 -1
maxframe/dataframe/merge/tests/test_merge.py +3 -1
maxframe/dataframe/misc/apply.py +3 -0
maxframe/dataframe/misc/drop_duplicates.py +23 -2
maxframe/dataframe/misc/map.py +3 -1
maxframe/dataframe/misc/tests/test_misc.py +24 -2
maxframe/dataframe/misc/transform.py +22 -13
maxframe/dataframe/reduction/__init__.py +3 -0
maxframe/dataframe/reduction/aggregation.py +1 -0
maxframe/dataframe/reduction/median.py +56 -0
maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
maxframe/dataframe/statistics/quantile.py +8 -2
maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
maxframe/dataframe/tests/test_initializer.py +33 -2
maxframe/dataframe/tests/test_utils.py +60 -0
maxframe/dataframe/utils.py +110 -7
maxframe/dataframe/window/expanding.py +5 -3
maxframe/dataframe/window/tests/test_expanding.py +2 -2
maxframe/io/objects/tests/test_object_io.py +39 -12
maxframe/io/odpsio/arrow.py +30 -2
maxframe/io/odpsio/schema.py +28 -8
maxframe/io/odpsio/tableio.py +55 -133
maxframe/io/odpsio/tests/test_schema.py +40 -4
maxframe/io/odpsio/tests/test_tableio.py +5 -5
maxframe/io/odpsio/tests/test_volumeio.py +35 -11
maxframe/io/odpsio/volumeio.py +36 -6
maxframe/learn/contrib/__init__.py +3 -1
maxframe/learn/contrib/graph/__init__.py +15 -0
maxframe/learn/contrib/graph/connected_components.py +215 -0
maxframe/learn/contrib/graph/tests/__init__.py +13 -0
maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
maxframe/learn/contrib/llm/__init__.py +16 -0
maxframe/learn/contrib/llm/core.py +54 -0
maxframe/learn/contrib/llm/models/__init__.py +14 -0
maxframe/learn/contrib/llm/models/dashscope.py +73 -0
maxframe/learn/contrib/llm/multi_modal.py +42 -0
maxframe/learn/contrib/llm/text.py +42 -0
maxframe/learn/contrib/xgboost/classifier.py +3 -3
maxframe/learn/contrib/xgboost/predict.py +8 -39
maxframe/learn/contrib/xgboost/train.py +4 -3
maxframe/lib/mmh3.cp311-win_amd64.pyd +0 -0
maxframe/lib/sparse/tests/test_sparse.py +15 -15
maxframe/opcodes.py +10 -1
maxframe/protocol.py +6 -1
maxframe/serialization/core.cp311-win_amd64.pyd +0 -0
maxframe/serialization/core.pyx +13 -1
maxframe/serialization/pandas.py +50 -20
maxframe/serialization/serializables/core.py +24 -5
maxframe/serialization/serializables/field_type.py +4 -1
maxframe/serialization/serializables/tests/test_serializable.py +8 -1
maxframe/serialization/tests/test_serial.py +2 -1
maxframe/session.py +9 -2
maxframe/tensor/__init__.py +19 -7
maxframe/tensor/indexing/getitem.py +2 -0
maxframe/tensor/merge/concatenate.py +23 -20
maxframe/tensor/merge/vstack.py +5 -1
maxframe/tensor/misc/transpose.py +1 -1
maxframe/tests/utils.py +16 -0
maxframe/udf.py +27 -0
maxframe/utils.py +64 -14
{maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
{maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
{maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
maxframe_client/clients/framedriver.py +4 -1
maxframe_client/fetcher.py +28 -10
maxframe_client/session/consts.py +3 -0
maxframe_client/session/odps.py +104 -20
maxframe_client/session/task.py +42 -26
maxframe_client/session/tests/test_task.py +0 -4
maxframe_client/tests/test_session.py +44 -12
{maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0

maxframe/dataframe/misc/tests/test_misc.py CHANGED Viewed

@@ -20,6 +20,7 @@ from .... import opcodes
 from ....core import OutputType
 from ....dataframe import DataFrame
 from ....tensor.core import TENSOR_TYPE
+from ....udf import with_running_options
 from ... import eval as maxframe_eval
 from ... import get_dummies, to_numeric
 from ...arithmetic import DataFrameGreater, DataFrameLess
@@ -65,6 +66,17 @@ def test_transform():
     assert r.op._op_type_ == opcodes.TRANSFORM
     assert r.op.output_types[0] == OutputType.dataframe
+    def transform_df_with_param(row, param, k):
+        assert param == 5
+        assert k == "6"
+        return row
+    r = df.transform(transform_df_with_param, 1, 5, k="6")
+    assert all(v == np.dtype("int64") for v in r.dtypes) is True
+    assert r.shape == df.shape
+    assert r.op._op_type_ == opcodes.TRANSFORM
+    assert r.op.output_types[0] == OutputType.dataframe
     r = df.transform(lambda x: list(range(len(x))), axis=1)
     assert all(v == np.dtype("int64") for v in r.dtypes) is True
     assert r.shape == df.shape
@@ -349,7 +361,9 @@ def test_drop():
 def test_drop_duplicates():
     rs = np.random.RandomState(0)
     raw = pd.DataFrame(
-        rs.randint(1000, size=(20, 7)), columns=["c" + str(i + 1) for i in range(7)]
+        rs.randint(1000, size=(20, 7)),
+        columns=["c" + str(i + 1) for i in range(7)],
+        index=pd.Index(range(20), name="idx"),
     )
     raw["c7"] = [f"s{j}" for j in range(20)]
@@ -361,6 +375,12 @@ def test_drop_duplicates():
     with pytest.raises(KeyError):
         df.drop_duplicates(subset="c8")
+    # check index
+    distinct_df = df.drop_duplicates()
+    assert distinct_df.index_value.name == df.index_value.name
+    assert isinstance(df.index_value.to_pandas(), pd.RangeIndex)
+    assert not isinstance(distinct_df.index_value.to_pandas(), pd.RangeIndex)
     s = df["c7"]
     with pytest.raises(ValueError):
         s.drop_duplicates(method="unknown")
@@ -436,6 +456,7 @@ def test_apply():
     keys = [1, 2]
+    @with_running_options(engine="spe")
     def f(x, keys):
         if x["a"] in keys:
             return [1, 0]
@@ -451,6 +472,7 @@ def test_apply():
         keys=keys,
     )
     assert apply_df.shape == (3, 2)
+    assert apply_df.op.expect_engine == "SPE"
 def test_pivot_table():
@@ -474,7 +496,7 @@ def test_pivot_table():
     with pytest.raises(ValueError):
         df.pivot_table(values=["D", "E"], aggfunc="sum")
-    t = df.pivot_table(index="A")
+    t = df.pivot_table(index=["A", "B", "C"])
     assert isinstance(t.op, DataFrameGroupByAgg)
     t = df.pivot_table(index="A", values=["D", "E"], aggfunc="sum")
     assert isinstance(t.op, DataFrameGroupByAgg)

maxframe/dataframe/misc/transform.py CHANGED Viewed

@@ -27,6 +27,7 @@ from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import (
     build_df,
     build_series,
+    copy_func_scheduling_hints,
     make_dtypes,
     pack_func_args,
     parse_index,
@@ -49,10 +50,12 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
     def __init__(self, output_types=None, memory_scale=None, **kw):
         super().__init__(_output_types=output_types, _memory_scale=memory_scale, **kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
     def _infer_df_func_returns(self, df, dtypes):
-        packed_funcs = self.get_packed_funcs(df)
-        test_df = self._build_stub_pandas_obj(df)
+        packed_funcs = self.func
+        test_df = _build_stub_pandas_obj(df, self.output_types[0])
         if self.output_types[0] == OutputType.dataframe:
             try:
                 with np.errstate(all="ignore"), quiet_stdio():
@@ -147,16 +150,18 @@ class TransformOperator(DataFrameOperator, DataFrameOperatorMixin):
                 index_value=new_index_value,
             )
-    def get_packed_funcs(self, df=None) -> Any:
-        stub_df = self._build_stub_pandas_obj(df or self.inputs[0])
-        return pack_func_args(stub_df, self.func, *self.args, **self.kwds)
-    def _build_stub_pandas_obj(self, df) -> Union[DataFrame, Series]:
-        # TODO: Simulate a dataframe with the corresponding indexes if self.func is
-        # a dict and axis=1
-        if self.output_types[0] == OutputType.dataframe:
-            return build_df(df, fill_value=1, size=1)
-        return build_series(df, size=1, name=df.name)
+def get_packed_funcs(df, output_type, func, *args, **kwds) -> Any:
+    """
+    Bind ``args`` / ``kwds`` to ``func`` using a stub pandas object built from ``df``.
+    The stub is created by ``_build_stub_pandas_obj(df, output_type)`` and passed,
+    together with ``func`` and the extra arguments, to ``pack_func_args``; whatever
+    ``pack_func_args`` returns is returned unchanged.
+    """
+    stub_df = _build_stub_pandas_obj(df, output_type)
+    return pack_func_args(stub_df, func, *args, **kwds)
+def _build_stub_pandas_obj(df, output_type) -> Union[DataFrame, Series]:
+    """
+    Create a ``size=1`` stub object derived from ``df``.
+    Uses ``build_df(df, fill_value=1, size=1)`` when ``output_type`` is
+    ``OutputType.dataframe`` and ``build_series(df, size=1, name=df.name)``
+    for every other output type.
+    """
+    # TODO: Simulate a dataframe with the corresponding indexes if the function
+    # being packed is a dict and axis=1
+    if output_type == OutputType.dataframe:
+        return build_df(df, fill_value=1, size=1)
+    return build_series(df, size=1, name=df.name)
 def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwargs):
@@ -229,13 +234,15 @@ def df_transform(df, func, axis=0, *args, dtypes=None, skip_infer=False, **kwarg
     1  2  3
     2  3  4
     """
+    call_agg = kwargs.pop("_call_agg", False)
+    func = get_packed_funcs(df, OutputType.dataframe, func, *args, **kwargs)
     op = TransformOperator(
         func=func,
         axis=axis,
         args=args,
         kwds=kwargs,
         output_types=[OutputType.dataframe],
-        call_agg=kwargs.pop("_call_agg", False),
+        call_agg=call_agg,
     )
     return op(df, dtypes=dtypes, skip_infer=skip_infer)
@@ -319,6 +326,8 @@ def series_transform(
     1  2  3
     2  3  4
     """
+    call_agg = kwargs.pop("_call_agg", False)
+    func = get_packed_funcs(series, OutputType.series, func, *args, **kwargs)
     op = TransformOperator(
         func=func,
         axis=axis,
@@ -326,7 +335,7 @@ def series_transform(
         args=args,
         kwds=kwargs,
         output_types=[OutputType.series],
-        call_agg=kwargs.pop("_call_agg", False),
+        call_agg=call_agg,
     )
     dtypes = (series.name, dtype) if dtype is not None else None
     return op(series, dtypes=dtypes, skip_infer=skip_infer)

maxframe/dataframe/reduction/__init__.py CHANGED Viewed

@@ -25,6 +25,7 @@ from .custom_reduction import DataFrameCustomReduction
 from .kurtosis import DataFrameKurtosis
 from .max import DataFrameMax
 from .mean import DataFrameMean
+from .median import DataFrameMedian
 from .min import DataFrameMin
 from .nunique import DataFrameNunique
 from .prod import DataFrameProd
@@ -50,6 +51,7 @@ def _install():
     from .kurtosis import kurt_dataframe, kurt_series
     from .max import max_dataframe, max_index, max_series
     from .mean import mean_dataframe, mean_series
+    from .median import median_dataframe, median_series
     from .min import min_dataframe, min_index, min_series
     from .nunique import nunique_dataframe, nunique_series
     from .prod import prod_dataframe, prod_series
@@ -68,6 +70,7 @@ def _install():
         ("min", min_series, min_dataframe),
         ("count", count_series, count_dataframe),
         ("mean", mean_series, mean_dataframe),
+        ("median", median_series, median_dataframe),
         ("var", var_series, var_dataframe),
         ("std", std_series, std_dataframe),
         ("all", all_series, all_dataframe),

maxframe/dataframe/reduction/aggregation.py CHANGED Viewed

@@ -71,6 +71,7 @@ _agg_functions = {
     "kurt": lambda x, skipna=True, bias=False: x.kurt(skipna=skipna, bias=bias),
     "kurtosis": lambda x, skipna=True, bias=False: x.kurtosis(skipna=skipna, bias=bias),
     "nunique": lambda x: x.nunique(),
+    "median": lambda x, skipna=True: x.median(skipna=skipna),
 }

maxframe/dataframe/reduction/median.py ADDED Viewed

@@ -0,0 +1,56 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ... import opcodes
+from ...core import OutputType
+from .core import DataFrameReductionMixin, DataFrameReductionOperator
+class DataFrameMedian(DataFrameReductionOperator, DataFrameReductionMixin):
+    """Reduction operator with op type ``opcodes.MEDIAN`` and function name ``median``."""
+    _op_type_ = opcodes.MEDIAN
+    _func_name = "median"
+    @property
+    def is_atomic(self):
+        # Always True for the median operator.
+        # NOTE(review): presumably marks the reduction as not decomposable into
+        # partial/combine stages -- confirm against the base class in ``.core``.
+        return True
+def median_series(df, axis=None, skipna=True, level=None, method=None):
+    """
+    Build a ``DataFrameMedian`` operator for a series and apply it to ``df``.
+    Parameters
+    ----------
+    df
+        Input object the operator is called on.
+    axis, skipna, level, method
+        Forwarded unchanged to ``DataFrameMedian``.
+    Returns
+    -------
+        Result of calling the operator on ``df``. The declared output type is
+        ``OutputType.series`` when ``level`` is given and ``OutputType.scalar``
+        otherwise, mirroring how ``median_dataframe`` switches on ``level``.
+    """
+    # Both branches used to be ``OutputType.scalar``, which made the conditional
+    # a no-op; a level-wise median is declared as a series.
+    output_type = OutputType.series if level is not None else OutputType.scalar
+    op = DataFrameMedian(
+        axis=axis,
+        skipna=skipna,
+        level=level,
+        output_types=[output_type],
+        method=method,
+    )
+    return op(df)
+def median_dataframe(
+    df,
+    axis=0,
+    skipna=True,
+    level=None,
+    numeric_only=None,
+    method=None,
+):
+    """
+    Build a ``DataFrameMedian`` operator for a DataFrame and apply it to ``df``.
+    All keyword arguments are forwarded unchanged to ``DataFrameMedian``. The
+    declared output type is ``OutputType.dataframe`` when ``level`` is given and
+    ``OutputType.series`` otherwise.
+    """
+    op = DataFrameMedian(
+        axis=axis,
+        skipna=skipna,
+        level=level,
+        numeric_only=numeric_only,
+        output_types=[OutputType.dataframe if level is not None else OutputType.series],
+        method=method,
+    )
+    return op(df)

maxframe/dataframe/reduction/tests/test_reduction.py CHANGED Viewed

@@ -23,6 +23,7 @@ import pytest
 from .... import dataframe as md
 from ....tensor import Tensor
+from ....tests.utils import assert_mf_index_dtype
 from ...core import DataFrame, IndexValue, OutputType, Series
 from ...datasource.dataframe import from_pandas as from_pandas_df
 from ...datasource.series import from_pandas as from_pandas_series
@@ -38,6 +39,7 @@ from .. import (
     DataFrameKurtosis,
     DataFrameMax,
     DataFrameMean,
+    DataFrameMedian,
     DataFrameMin,
     DataFrameNunique,
     DataFrameProd,
@@ -71,6 +73,7 @@ reduction_functions = [
     ("sem", DataFrameSem, FunctionOptions()),
     ("all", DataFrameAll, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
     ("any", DataFrameAny, FunctionOptions(has_numeric_only=False, has_bool_only=True)),
+    ("median", DataFrameMedian, FunctionOptions()),
 ]
@@ -111,10 +114,7 @@ def test_dataframe_reduction(func_name, op, func_opts: FunctionOptions):
     reduction_df = getattr(from_pandas_df(data, chunk_size=3), func_name)()
     assert isinstance(reduction_df, Series)
-    assert isinstance(
-        reduction_df.index_value._index_value,
-        (IndexValue.RangeIndex, IndexValue.Int64Index),
-    )
+    assert_mf_index_dtype(reduction_df.index_value._index_value, np.int64)
     assert reduction_df.shape == (10,)
     data = pd.DataFrame(np.random.rand(20, 20), index=[str(i) for i in range(20)])
@@ -210,6 +210,7 @@ def test_dataframe_aggregate():
         "skew",
         "kurt",
         "sem",
+        "median",
     ]
     df = from_pandas_df(data)
@@ -253,7 +254,7 @@ def test_dataframe_aggregate():
     assert result.op.output_types[0] == OutputType.dataframe
     assert result.op.func == agg_funcs
-    dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std"]}
+    dict_fun = {0: "sum", 2: ["var", "max"], 9: ["mean", "var", "std", "median"]}
     all_cols = set(
         reduce(
             operator.add, [[v] if isinstance(v, str) else v for v in dict_fun.values()]
@@ -268,9 +269,9 @@ def test_dataframe_aggregate():
     assert result.op.func[2] == dict_fun[2]
     with pytest.raises(TypeError):
-        df.agg(sum_0="sum", mean_0="mean")
+        df.agg(sum_0="sum", mean_0="mean", median_0="median")
     with pytest.raises(NotImplementedError):
-        df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std"]}, axis=1)
+        df.agg({0: ["sum", "min", "var"], 9: ["mean", "var", "std", "median"]}, axis=1)
 def test_series_aggregate():
@@ -287,6 +288,7 @@ def test_series_aggregate():
         "skew",
         "kurt",
         "sem",
+        "median",
     ]
     series = from_pandas_series(data)
@@ -303,6 +305,14 @@ def test_series_aggregate():
     assert result.shape == ()
     assert result.op.output_types[0] == OutputType.scalar
+    result = series.agg("median")
+    assert result.shape == ()
+    assert result.op.output_types[0] == OutputType.scalar
+    result = series.median(level=0)
+    assert result.shape == (np.nan,)
+    assert result.op.output_types[0] == OutputType.series
     result = series.agg(agg_funcs)
     assert result.shape == (len(agg_funcs),)
     assert list(result.index_value.to_pandas()) == agg_funcs

maxframe/dataframe/statistics/quantile.py CHANGED Viewed

@@ -81,7 +81,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
             store_index_value = False
         else:
             q_val = np.asanyarray(self.q)
-            pd_index = pd.Index(q_val)
+            if q_val.ndim == 0:
+                pd_index = pd.Index(q_val.reshape(1))
+            else:
+                pd_index = pd.Index(q_val)
             name = self.q if q_val.size == 1 else None
             store_index_value = True
         tokenize_objects = (a, q_val, self.interpolation, type(self).__name__)
@@ -164,7 +167,10 @@ class DataFrameQuantile(DataFrameOperator, DataFrameOperatorMixin):
             store_index_value = False
         else:
             q_val = np.asanyarray(self.q)
-            index_val = pd.Index(q_val)
+            if q_val.ndim == 0:
+                index_val = pd.Index(q_val.reshape(1))
+            else:
+                index_val = pd.Index(q_val)
             store_index_value = True
         # get dtype by tensor

maxframe/dataframe/statistics/tests/test_statistics.py CHANGED Viewed

@@ -49,7 +49,7 @@ def test_dataframe_quantile():
     # q = 0.3, axis = 0
     r = s.quantile(0.3)
-    e = raw.quantile(0.3)
+    e = raw.quantile(0.3, numeric_only=True)
     assert isinstance(r, Series)
     assert r.shape == (2,)
     assert r.dtype == e.dtype
@@ -57,7 +57,7 @@ def test_dataframe_quantile():
     # q = 0.3, axis = 1
     r = s.quantile(0.3, axis=1)
-    e = raw.quantile(0.3, axis=1)
+    e = raw.quantile(0.3, numeric_only=True, axis=1)
     assert isinstance(r, Series)
     assert r.shape == e.shape
     assert r.dtype == e.dtype
@@ -65,7 +65,7 @@ def test_dataframe_quantile():
     # q = [0.3, 0.7], axis = 0
     r = s.quantile([0.3, 0.7])
-    e = raw.quantile([0.3, 0.7])
+    e = raw.quantile([0.3, 0.7], numeric_only=True)
     assert isinstance(r, DataFrame)
     assert r.shape == e.shape
     pd.testing.assert_series_equal(r.dtypes, e.dtypes)
@@ -74,7 +74,7 @@ def test_dataframe_quantile():
     # q = [0.3, 0.7], axis = 1
     r = s.quantile([0.3, 0.7], axis=1)
-    e = raw.quantile([0.3, 0.7], axis=1)
+    e = raw.quantile([0.3, 0.7], numeric_only=True, axis=1)
     assert isinstance(r, DataFrame)
     assert r.shape == e.shape
     pd.testing.assert_series_equal(r.dtypes, e.dtypes)

maxframe/dataframe/tests/test_initializer.py CHANGED Viewed

@@ -13,12 +13,13 @@
 # limitations under the License.
 import pandas as pd
+import pytest
 from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
-from ..initializer import read_pandas
+from ..initializer import DataFrame, Series, read_pandas
-def test_from_pandas():
+def test_read_pandas():
     df_data = pd.DataFrame([["a", 1], ["b", 2]], columns=["a", "b"])
     assert isinstance(read_pandas(df_data), DATAFRAME_TYPE)
@@ -27,3 +28,33 @@ def test_from_pandas():
     idx_data = pd.Index(["a", "b"])
     assert isinstance(read_pandas(idx_data), INDEX_TYPE)
+def test_init_dataframe_from_maxframe_series():
+    """
+    Building a DataFrame from a Series accepts a one-element column list and
+    raises ``ValueError`` for an empty list or a bare string.
+    """
+    s = Series([1, 2, 3, 4], index=[1, 2, 3, 4])
+    df = DataFrame(s, index=s.index, columns=["col1"])
+    assert isinstance(df, DATAFRAME_TYPE)
+    # Compare as a plain list: ``Index == list`` is element-wise and yields an
+    # array, whose truth value is only well defined for a single element.
+    assert list(df.dtypes.index) == ["col1"]
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns=[])
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns="col1")
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns="col2")
+def test_init_dataframe_from_maxframe_dataframe():
+    """
+    Building a DataFrame from another DataFrame applies the given column labels
+    and raises ``ValueError`` when three labels are given for two columns.
+    """
+    df1 = DataFrame({"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, index=[1, 2, 3, 4])
+    df2 = DataFrame(df1, index=df1.index, columns=["col1", "col2"])
+    assert isinstance(df2, DATAFRAME_TYPE)
+    assert list(df2.dtypes.index) == ["col1", "col2"]
+    with pytest.raises(ValueError):
+        DataFrame(df1, index=df1.index, columns=["col1", "col2", "col3"])

maxframe/dataframe/tests/test_utils.py ADDED Viewed

@@ -0,0 +1,60 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pandas as pd
+import pytest
+from ...udf import MarkedFunction, with_python_requirements, with_resources
+from ..utils import pack_func_args
+@pytest.fixture
+def df1():
+    """Return a 2x2 pandas DataFrame with columns ``a`` and ``b``."""
+    return pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
+def test_pack_function(df1):
+    """
+    ``pack_func_args`` binds extra positional arguments after the first one and
+    keeps the ``MarkedFunction`` metadata (resources, python requirements).
+    """
+    # pack normal function
+    @with_resources("a.zip")
+    def keep(df):
+        return df
+    f = pack_func_args(df1, keep)
+    assert f(df1).equals(df1)
+    assert isinstance(f, MarkedFunction)
+    assert f.resources == ["a.zip"]
+    # pack with args
+    @with_python_requirements("numpy")
+    def add(a, b):
+        return a + b
+    f = pack_func_args(df1, add, 1)
+    assert f(df1).equals(df1 + 1)
+    assert isinstance(f, MarkedFunction)
+    assert f.pythonpacks[0].requirements == ("numpy",)
+    # pack a plain (unmarked) callable
+    f = pack_func_args(df1, np.sum)
+    assert f(df1).equals(np.sum(df1))
+    # pack a function carrying both kinds of marks, with two bound arguments
+    @with_resources("a.txt")
+    @with_python_requirements("pandas")
+    def times_add(df, param, times):
+        return df * times + param
+    f = pack_func_args(df1, times_add, 5, 6)
+    assert f(df1).equals(df1 * 6 + 5)
+    assert isinstance(f, MarkedFunction)
+    assert f.resources == ["a.txt"]
+    assert f.pythonpacks[0].requirements == ("pandas",)

maxframe/dataframe/utils.py CHANGED Viewed

@@ -20,7 +20,7 @@ import operator
 import sys
 from contextlib import contextmanager
 from numbers import Integral
-from typing import Any, Callable, List
+from typing import TYPE_CHECKING, Any, Callable, List
 import numpy as np
 import pandas as pd
@@ -30,6 +30,7 @@ from pandas.core.dtypes.inference import is_dict_like, is_list_like
 from ..core import Entity, ExecutableTuple
 from ..lib.mmh3 import hash as mmh_hash
+from ..udf import MarkedFunction
 from ..utils import (
     ModulePlaceholder,
     is_full_slice,
@@ -44,6 +45,9 @@ try:
 except ImportError:  # pragma: no cover
     pa = ModulePlaceholder("pyarrow")
+if TYPE_CHECKING:
+    from .operators import DataFrameOperator
 cudf = lazy_import("cudf", rename="cudf")
 vineyard = lazy_import("vineyard")
 try:
@@ -263,12 +267,30 @@ def parse_index(index_value, *args, store_data=False, key=None):
         return IndexValue(_index_value=_serialize_index(index_value))
-def gen_unknown_index_value(index_value, *args):
+def gen_unknown_index_value(index_value, *args, normalize_range_index=False):
+    """
+    Generate a new index value of the same kind as the given index_value, but without any data.
+    Parameters
+    ----------
+    index_value
+        Given index value.
+    args
+        Arguments for parse_index.
+    normalize_range_index
+        If True, a RangeIndex is converted into an ordinary (non-range) index.
+    Returns
+    -------
+        Newly created index value that carries no data.
+    """
     pd_index = index_value.to_pandas()
-    if isinstance(pd_index, pd.RangeIndex):
-        return parse_index(pd.RangeIndex(-1), *args)
+    if not normalize_range_index and isinstance(pd_index, pd.RangeIndex):
+        return parse_index(pd.RangeIndex(-1, name=pd_index.name), *args)
     elif not isinstance(pd_index, pd.MultiIndex):
-        return parse_index(pd.Index([], dtype=pd_index.dtype), *args)
+        return parse_index(
+            pd.Index([], dtype=pd_index.dtype, name=pd_index.name), *args
+        )
     else:
         i = pd.MultiIndex.from_arrays(
             [c[:0] for c in pd_index.levels], names=pd_index.names
@@ -1160,7 +1182,65 @@ def patch_sa_engine_execute():
     Engine.execute = execute
-def pack_func_args(df, funcs, *args, **kwargs) -> Any:
+def bind_func_args_from_pos(func, args_bind_position, *bound_args, **bound_kwargs):
+    """
+    Create a new function with arguments bound from specified position.
+    Parameters
+    ----------
+    func : callable
+        Target function to be wrapped.
+    args_bind_position : int
+        Position (0-based) in the call-time positional arguments at which
+        ``bound_args`` are inserted: 0 places them before every call-time
+        argument, 1 places them right after the first call-time argument.
+    *bound_args : tuple
+        Positional arguments inserted at ``args_bind_position``.
+    **bound_kwargs : dict
+        Keyword arguments to be bound. Keyword arguments passed at call time
+        override bound ones of the same name.
+    Returns
+    -------
+    callable
+        Wrapped function with bound arguments.
+    Examples
+    --------
+    >>> def func(x, y, z=0):
+    ...    return x * y + z
+    >>> f = bind_func_args_from_pos(func, 1, 10)  # bind from second position
+    >>> f(5)  # equals func(5, 10)
+    50
+    Raises
+    ------
+    Exception
+        An exception raised by ``func`` is re-raised as the same type with a
+        message that names ``func`` (the original is chained as the cause).
+        If that type cannot be constructed from a single message string, the
+        original exception propagates unchanged.
+    """
+    # Not every callable has ``__name__`` (e.g. ``functools.partial`` objects);
+    # fall back to repr so the error path cannot fail with AttributeError.
+    func_name = getattr(func, "__name__", None) or repr(func)
+    @functools.wraps(func)
+    def wrapper(*runtime_args, **runtime_kwargs):
+        # Combine arguments
+        all_args = (
+            runtime_args[:args_bind_position]
+            + bound_args
+            + runtime_args[args_bind_position:]
+        )
+        all_kwargs = {**bound_kwargs, **runtime_kwargs}
+        try:
+            return func(*all_args, **all_kwargs)
+        except Exception as e:
+            # Enhance error message with context
+            message = f"Error calling {func_name} with bound arguments: {str(e)}"
+            try:
+                enhanced = type(e)(message)
+            except Exception:
+                # Some exception types need more than one constructor argument
+                # (e.g. UnicodeDecodeError); keep the original error in that case.
+                enhanced = None
+            if enhanced is None:
+                raise
+            raise enhanced from e
+    return wrapper
+def pack_func_args(df, funcs, *args, args_bind_position=1, **kwargs) -> Any:
     """
     Pack the funcs with args and kwargs to avoid the ambiguity between other
     positional and keyword arguments. It will process the funcs by the following rule:
@@ -1189,6 +1269,9 @@ def pack_func_args(df, funcs, *args, **kwargs) -> Any:
         The DataFrame or Series object to test the function.
     funcs : function, str, list-like or dict-like
         Function to pack. It should have the same type with Dataframe.transform().
+    args_bind_position: int
+        Position to start binding arguments (0-based).
+            e.g., n=0 binds from first arg, n=1 binds from second arg.
     *args :
         The positional arguments to func. If funcs contains many functions, each one
         should be able to accept *args.
@@ -1219,8 +1302,19 @@ def pack_func_args(df, funcs, *args, **kwargs) -> Any:
     f = get_callable_by_name(df, funcs) if isinstance(funcs, str) else funcs
+    from ..udf import MarkedFunction
+    if isinstance(f, MarkedFunction):
+        # for marked function, pack the inner function, and reset as mark function
+        packed_func = f.copy()
+        packed_func.func = bind_func_args_from_pos(
+            f.func, args_bind_position, *args, **kwargs
+        )
+    else:
+        packed_func = bind_func_args_from_pos(f, args_bind_position, *args, **kwargs)
     # Callable
-    return functools.partial(f, *args, **kwargs)
+    return packed_func
 def get_callable_by_name(df: Any, func_name: str) -> Callable:
@@ -1262,3 +1356,12 @@ def get_callable_by_name(df: Any, func_name: str) -> Callable:
     raise AttributeError(
         f"'{func_name}' is not a valid function for '{type(df).__name__}' object"
     )
+def copy_func_scheduling_hints(func, op: "DataFrameOperator") -> None:
+    """
+    Copy ``expect_engine`` / ``expect_resources`` from a marked function onto ``op``.
+    Does nothing unless ``func`` is a ``MarkedFunction``. Each attribute is copied
+    only when its value on ``func`` is truthy, so existing values on ``op`` are
+    left untouched otherwise.
+    """
+    if not isinstance(func, MarkedFunction):
+        return
+    if func.expect_engine:
+        op.expect_engine = func.expect_engine
+    if func.expect_resources:
+        op.expect_resources = func.expect_resources