PyPI - maxframe - Versions diffs - 1.2.1__cp311-cp311-macosx_10_9_universal2.whl → 1.3.1__cp311-cp311-macosx_10_9_universal2.whl - Mend

maxframe 1.2.1__cp311-cp311-macosx_10_9_universal2.whl → 1.3.1__cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. See the package's page on Mend for more details.

Files changed (73)

maxframe/_utils.cpython-311-darwin.so +0 -0
maxframe/codegen.py +70 -21
maxframe/config/config.py +6 -0
maxframe/core/accessor.py +1 -0
maxframe/core/graph/core.cpython-311-darwin.so +0 -0
maxframe/dataframe/accessors/__init__.py +1 -1
maxframe/dataframe/accessors/dict_/accessor.py +1 -0
maxframe/dataframe/accessors/dict_/length.py +1 -0
maxframe/dataframe/accessors/dict_/setitem.py +1 -0
maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
maxframe/dataframe/accessors/list_/__init__.py +37 -0
maxframe/dataframe/accessors/list_/accessor.py +39 -0
maxframe/dataframe/accessors/list_/getitem.py +135 -0
maxframe/dataframe/accessors/list_/length.py +73 -0
maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
maxframe/dataframe/accessors/plotting/__init__.py +2 -0
maxframe/dataframe/accessors/string_/__init__.py +1 -0
maxframe/dataframe/datastore/to_odps.py +6 -0
maxframe/dataframe/extensions/accessor.py +1 -0
maxframe/dataframe/extensions/apply_chunk.py +34 -21
maxframe/dataframe/extensions/flatmap.py +8 -1
maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
maxframe/dataframe/groupby/aggregation.py +53 -1
maxframe/dataframe/merge/concat.py +7 -4
maxframe/dataframe/merge/merge.py +1 -0
maxframe/dataframe/merge/tests/test_merge.py +97 -47
maxframe/dataframe/missing/tests/test_missing.py +1 -0
maxframe/dataframe/reduction/aggregation.py +63 -0
maxframe/dataframe/reduction/core.py +17 -5
maxframe/dataframe/tests/test_utils.py +7 -0
maxframe/dataframe/ufunc/ufunc.py +1 -0
maxframe/dataframe/utils.py +3 -0
maxframe/io/odpsio/schema.py +1 -0
maxframe/learn/contrib/__init__.py +2 -4
maxframe/learn/contrib/llm/__init__.py +1 -0
maxframe/learn/contrib/llm/core.py +31 -10
maxframe/learn/contrib/llm/models/__init__.py +1 -0
maxframe/learn/contrib/llm/models/dashscope.py +38 -3
maxframe/learn/contrib/llm/models/managed.py +54 -0
maxframe/learn/contrib/llm/multi_modal.py +93 -0
maxframe/learn/contrib/llm/text.py +268 -8
maxframe/learn/contrib/models.py +77 -0
maxframe/learn/contrib/utils.py +1 -0
maxframe/learn/contrib/xgboost/__init__.py +8 -1
maxframe/learn/contrib/xgboost/classifier.py +15 -4
maxframe/learn/contrib/xgboost/core.py +108 -1
maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
maxframe/learn/contrib/xgboost/predict.py +6 -3
maxframe/learn/contrib/xgboost/regressor.py +15 -1
maxframe/learn/contrib/xgboost/train.py +5 -4
maxframe/lib/dtypes_extension/__init__.py +2 -1
maxframe/lib/dtypes_extension/dtypes.py +21 -0
maxframe/lib/dtypes_extension/tests/test_dtypes.py +13 -3
maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
maxframe/opcodes.py +19 -0
maxframe/serialization/__init__.py +1 -0
maxframe/serialization/core.cpython-311-darwin.so +0 -0
maxframe/serialization/core.pyx +12 -1
maxframe/serialization/numpy.py +12 -4
maxframe/serialization/serializables/tests/test_serializable.py +13 -2
maxframe/serialization/tests/test_serial.py +2 -0
maxframe/tensor/merge/concatenate.py +1 -0
maxframe/tensor/misc/unique.py +11 -10
maxframe/tensor/reshape/reshape.py +4 -1
maxframe/utils.py +4 -0
{maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/METADATA +3 -2
{maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/RECORD +73 -65
{maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/WHEEL +1 -1
maxframe_client/session/odps.py +3 -0
maxframe_client/session/tests/test_task.py +1 -0
{maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/top_level.txt +0 -0

maxframe/dataframe/accessors/list_/tests/test_list_accessor.py ADDED Viewed

@@ -0,0 +1,79 @@
+# Copyright 1999-2025 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pytest
+from ..... import dataframe as md
+from .....lib.dtypes_extension import list_
+from .....utils import ARROW_DTYPE_NOT_SUPPORTED
+from ..getitem import SeriesListGetItemOperator
+from ..length import SeriesListLengthOperator
+pytestmark = pytest.mark.skipif(
+    ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported"
+)
+@pytest.fixture
+def df():
+    return md.DataFrame(
+        {
+            "A": pd.Series([[5, 3, 2]], dtype=list_(pa.int32())),
+            "B": pd.Series([["ab", "cd"]], dtype=list_(pa.string())),
+            "C": pd.Series([1], dtype=np.dtype("int64")),
+        },
+        index=[1],
+    )
+def test_invalid_dtype(df):
+    with pytest.raises(AttributeError):
+        df["C"].list.len()
+def test_getitem(df):
+    s1 = df["A"].list[1]
+    assert isinstance(s1, md.Series)
+    assert s1.dtype == pd.ArrowDtype(pa.int32())
+    assert s1.shape == (1,)
+    assert s1.index_value == df.index_value
+    op = s1.op
+    assert isinstance(op, SeriesListGetItemOperator)
+    assert op.query_index == 1
+    assert op.ignore_index_error is False
+def test_getitem_ignore_index_err(df):
+    s1 = df["B"].list.get(1)
+    assert isinstance(s1, md.Series)
+    assert s1.dtype == pd.ArrowDtype(pa.string())
+    assert s1.shape == (1,)
+    assert s1.index_value == df.index_value
+    op = s1.op
+    assert isinstance(op, SeriesListGetItemOperator)
+    assert op.query_index == 1
+    assert op.ignore_index_error is True
+def test_length(df):
+    s1 = df["A"].list.len()
+    assert isinstance(s1, md.Series)
+    assert s1.dtype == pd.ArrowDtype(pa.int64())
+    assert s1.shape == (1,)
+    assert s1.index_value == df.index_value
+    op = s1.op
+    assert isinstance(op, SeriesListLengthOperator)

maxframe/dataframe/accessors/plotting/__init__.py CHANGED Viewed

@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 def _install():
     import pandas as pd

maxframe/dataframe/accessors/string_/__init__.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .core import SeriesStringMethod

maxframe/dataframe/datastore/to_odps.py CHANGED Viewed

@@ -27,6 +27,7 @@ from ...core import OutputType
 from ...io.odpsio import build_dataframe_table_meta
 from ...serialization.serializables import (
     BoolField,
+    DictField,
     FieldTypes,
     Int64Field,
     ListField,
@@ -55,6 +56,7 @@ class DataFrameToODPSTable(DataFrameDataStore):
     index = BoolField("index", default=True)
     index_label = ListField("index_label", FieldTypes.string, default=None)
     lifecycle = Int64Field("lifecycle", default=None)
+    table_properties = DictField("table_properties", default=None)
     def __init__(self, **kw):
         super().__init__(_output_types=[OutputType.dataframe], **kw)
@@ -84,6 +86,7 @@ def to_odps_table(
     index: bool = True,
     index_label: Union[None, str, List[str]] = None,
     lifecycle: Optional[int] = None,
+    table_properties: Optional[dict] = None,
 ):
     """
     Write DataFrame object into a MaxCompute (ODPS) table.
@@ -122,6 +125,8 @@ def to_odps_table(
         names will be used.
     lifecycle: Optional[int]
         Specify lifecycle of the output table.
+    table_properties: Optional[dict]
+        Specify properties of the output table.
     Returns
     -------
@@ -186,5 +191,6 @@ def to_odps_table(
         index=index,
         index_label=index_label,
         lifecycle=lifecycle or options.session.table_lifecycle,
+        table_properties=table_properties,
     )
     return op(df)

maxframe/dataframe/extensions/accessor.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import TYPE_CHECKING
 from ...core import BaseMaxFrameAccessor

maxframe/dataframe/extensions/apply_chunk.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import functools
 from typing import Any, Callable, Dict, List, Tuple, Union
@@ -19,7 +20,12 @@ import pandas as pd
 from ... import opcodes
 from ...core import OutputType
-from ...serialization.serializables import FunctionField, Int32Field
+from ...serialization.serializables import (
+    DictField,
+    FunctionField,
+    Int32Field,
+    TupleField,
+)
 from ...utils import quiet_stdio
 from ..core import DATAFRAME_TYPE, DataFrame, IndexValue, Series
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -38,7 +44,9 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.APPLY_CHUNK
     func = FunctionField("func")
-    batch_rows = Int32Field("batch_rows")
+    batch_rows = Int32Field("batch_rows", default=None)
+    args = TupleField("args", default=None)
+    kwargs = DictField("kwargs", default=None)
     def __init__(self, output_type=None, **kw):
         if output_type:
@@ -104,12 +112,11 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
         dtypes: Union[Tuple[str, Any], Dict[str, Any]] = None,
         output_type=None,
         index=None,
-        args=(),
-        **kwargs,
     ):
+        args = self.args or ()
+        kwargs = self.kwargs or {}
         # if not dtypes and not skip_infer:
-        origin_func = self.func
-        self.func = get_packed_func(df_or_series, origin_func, *args, **kwargs)
+        packed_func = get_packed_func(df_or_series, self.func, *args, **kwargs)
         # if skip_infer, directly build a frame
         if self.output_types and self.output_types[0] == OutputType.df_or_series:
@@ -118,8 +125,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
         # infer return index and dtypes
         dtypes, index_value, elementwise = self._infer_batch_func_returns(
             df_or_series,
-            origin_func=origin_func,
-            packed_func=self.func,
+            origin_func=self.func,
+            packed_func=packed_func,
             given_output_type=output_type,
             given_dtypes=dtypes,
             given_index=index,
@@ -166,6 +173,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
         given_dtypes: Union[Tuple[str, Any], pd.Series, List[Any], Dict[str, Any]],
         given_index: Union[pd.Index, IndexValue],
         given_elementwise: bool = False,
+        *args,
+        **kwargs,
     ):
         inferred_output_type = inferred_dtypes = inferred_index_value = None
         inferred_is_elementwise = False
@@ -190,7 +199,7 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
         try:
             # execute
             with np.errstate(all="ignore"), quiet_stdio():
-                infer_result = packed_func(empty_data)
+                infer_result = packed_func(empty_data, *args, **kwargs)
             #  if executed successfully, get index and dtypes from returned object
             if inferred_index_value is None:
@@ -258,7 +267,7 @@ def get_packed_func(df, func, *args, **kwargs) -> Any:
 def df_apply_chunk(
     dataframe,
     func: Union[str, Callable],
-    batch_rows,
+    batch_rows=None,
     dtypes=None,
     dtype=None,
     name=None,
@@ -462,11 +471,11 @@ def df_apply_chunk(
     if not isinstance(func, Callable):
         raise TypeError("function must be a callable object")
-    if not isinstance(batch_rows, int):
-        raise TypeError("batch_rows must be an integer")
-    if batch_rows <= 0:
-        raise ValueError("batch_rows must be greater than 0")
+    if batch_rows is not None:
+        if not isinstance(batch_rows, int):
+            raise TypeError("batch_rows must be an integer")
+        elif batch_rows <= 0:
+            raise ValueError("batch_rows must be greater than 0")
     dtypes = (name, dtype) if dtype is not None else dtypes
@@ -481,15 +490,17 @@ def df_apply_chunk(
     # bind args and kwargs
     op = DataFrameApplyChunkOperator(
-        func=func, batch_rows=batch_rows, output_type=output_type
+        func=func,
+        batch_rows=batch_rows,
+        output_type=output_type,
+        args=args,
+        kwargs=kwargs,
     )
     return op(
         dataframe,
         dtypes=dtypes,
         index=index,
-        args=args,
-        **kwargs,
     )
@@ -720,7 +731,11 @@ def series_apply_chunk(
         output_type = OutputType.df_or_series
     op = DataFrameApplyChunkOperator(
-        func=func, batch_rows=batch_rows, output_type=output_type
+        func=func,
+        batch_rows=batch_rows,
+        output_type=output_type,
+        args=args,
+        kwargs=kwargs,
     )
     dtypes = (name, dtype) if dtype is not None else dtypes
@@ -729,6 +744,4 @@ def series_apply_chunk(
         dtypes=dtypes,
         output_type=output_type,
         index=index,
-        args=args,
-        **kwargs,
     )

maxframe/dataframe/extensions/flatmap.py CHANGED Viewed

@@ -27,7 +27,12 @@ from ...serialization.serializables import (
 )
 from ..core import DataFrame
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
-from ..utils import gen_unknown_index_value, make_dtypes, parse_index
+from ..utils import (
+    copy_func_scheduling_hints,
+    gen_unknown_index_value,
+    make_dtypes,
+    parse_index,
+)
 class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
@@ -40,6 +45,8 @@ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
     def __init__(self, output_types=None, **kw):
         super().__init__(_output_types=output_types, **kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
     def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
         dtypes = make_dtypes(dtypes)

maxframe/dataframe/extensions/tests/test_apply_chunk.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
 import pandas as pd
 import pytest
@@ -102,7 +103,7 @@ def test_apply_chunk_infer_dtypes_and_index(df1, df2, df3):
     assert result.index_value is df1.index_value
     assert result.dtypes.equals(df1.dtypes)
     assert isinstance(result.op.func, MarkedFunction)
-    assert result.op.func is not process
+    assert result.op.func is process
     assert result.op.func.resources is process.resources
     assert result.op.func.pythonpacks is process.pythonpacks

maxframe/dataframe/extensions/tests/test_extensions.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
 import pandas as pd
 import pytest

maxframe/dataframe/groupby/aggregation.py CHANGED Viewed

@@ -303,11 +303,63 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
         if aggregated result is very large, 'auto' will use 'shuffle' method
         in distributed mode and use 'tree' in local mode.
     Returns
     -------
     Series or DataFrame
         Aggregated result.
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> df = md.DataFrame(
+    ...     {
+    ...         "A": [1, 1, 2, 2],
+    ...         "B": [1, 2, 3, 4],
+    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860],
+    ...     }
+    ... ).execute()
+       A  B         C
+    0  1  1  0.362838
+    1  1  2  0.227877
+    2  2  3  1.267767
+    3  2  4 -0.562860
+    The aggregation is for each column.
+    >>> df.groupby('A').agg('min').execute()
+       B         C
+    A
+    1  1  0.227877
+    2  3 -0.562860
+    Multiple aggregations.
+    >>> df.groupby('A').agg(['min', 'max']).execute()
+        B             C
+      min max       min       max
+    A
+    1   1   2  0.227877  0.362838
+    2   3   4 -0.562860  1.267767
+    Different aggregations per column
+    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}).execute()
+        B             C
+      min max       sum
+    A
+    1   1   2  0.590715
+    2   3   4  0.704907
+    To control the output names with different aggregations per column, pandas supports “named aggregation”
+    >>> from maxframe.dataframe.groupby import NamedAgg
+    >>> df.groupby("A").agg(
+    ...  b_min=NamedAgg(column="B", aggfunc="min"),
+    ...  c_sum=NamedAgg(column="C", aggfunc="sum")).execute()
+       b_min     c_sum
+    A
+    1      1  0.590715
+    2      3  0.704907
     """
     # When perform a computation on the grouped data, we won't shuffle

maxframe/dataframe/merge/concat.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List, Union
 import pandas as pd
@@ -100,8 +101,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
             row_length = 0
             for series in objs:
                 row_length += series.shape[0]
-            if self.ignore_index:  # pragma: no cover
-                index_value = parse_index(pd.RangeIndex(row_length))
+            if self.ignore_index:
+                idx_length = 0 if pd.isna(row_length) else row_length
+                index_value = parse_index(pd.RangeIndex(idx_length))
             else:
                 index = self._concat_index(objs)
                 index_value = parse_index(index, objs)
@@ -159,8 +161,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
             if self.join == "inner":
                 objs = [o[list(emtpy_result.columns)] for o in objs]
-            if self.ignore_index:  # pragma: no cover
-                index_value = parse_index(pd.RangeIndex(row_length))
+            if self.ignore_index:
+                idx_length = 0 if pd.isna(row_length) else row_length
+                index_value = parse_index(pd.RangeIndex(idx_length))
             else:
                 index = self._concat_index(objs)
                 index_value = parse_index(index, objs)

maxframe/dataframe/merge/merge.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 from abc import abstractmethod
 from collections import namedtuple