PyPI - maxframe - Versions diffs - 2.2.0__cp39-cp39-win_amd64.whl → 2.3.0rc1__cp39-cp39-win_amd64.whl - Mend

maxframe 2.2.0__cp39-cp39-win_amd64.whl → 2.3.0rc1__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (114)

maxframe/_utils.cp39-win_amd64.pyd +0 -0
maxframe/codegen/core.py +3 -2
maxframe/codegen/spe/dataframe/merge.py +4 -0
maxframe/codegen/spe/dataframe/misc.py +2 -0
maxframe/codegen/spe/dataframe/reduction.py +18 -0
maxframe/codegen/spe/dataframe/sort.py +9 -1
maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
maxframe/codegen/spe/dataframe/tseries.py +9 -0
maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
maxframe/codegen/spe/tensor/datasource.py +1 -0
maxframe/config/config.py +3 -0
maxframe/conftest.py +10 -0
maxframe/core/base.py +2 -1
maxframe/core/entity/tileables.py +2 -0
maxframe/core/graph/core.cp39-win_amd64.pyd +0 -0
maxframe/core/graph/entity.py +7 -1
maxframe/core/mode.py +6 -1
maxframe/dataframe/__init__.py +2 -2
maxframe/dataframe/arithmetic/__init__.py +4 -0
maxframe/dataframe/arithmetic/maximum.py +33 -0
maxframe/dataframe/arithmetic/minimum.py +33 -0
maxframe/dataframe/core.py +98 -106
maxframe/dataframe/datasource/core.py +6 -0
maxframe/dataframe/datasource/direct.py +57 -0
maxframe/dataframe/datasource/read_csv.py +19 -11
maxframe/dataframe/datasource/read_odps_query.py +29 -6
maxframe/dataframe/datasource/read_odps_table.py +32 -10
maxframe/dataframe/datasource/read_parquet.py +38 -39
maxframe/dataframe/datastore/__init__.py +6 -0
maxframe/dataframe/datastore/direct.py +268 -0
maxframe/dataframe/datastore/to_odps.py +6 -0
maxframe/dataframe/extensions/flatjson.py +2 -1
maxframe/dataframe/groupby/__init__.py +5 -1
maxframe/dataframe/groupby/aggregation.py +10 -6
maxframe/dataframe/groupby/apply_chunk.py +1 -3
maxframe/dataframe/groupby/core.py +20 -4
maxframe/dataframe/indexing/__init__.py +2 -1
maxframe/dataframe/indexing/insert.py +45 -17
maxframe/dataframe/merge/__init__.py +3 -0
maxframe/dataframe/merge/combine.py +244 -0
maxframe/dataframe/misc/__init__.py +14 -3
maxframe/dataframe/misc/check_unique.py +41 -10
maxframe/dataframe/misc/drop.py +31 -0
maxframe/dataframe/misc/infer_dtypes.py +251 -0
maxframe/dataframe/misc/map.py +31 -18
maxframe/dataframe/misc/repeat.py +159 -0
maxframe/dataframe/misc/tests/test_misc.py +35 -1
maxframe/dataframe/missing/checkna.py +3 -2
maxframe/dataframe/reduction/__init__.py +10 -5
maxframe/dataframe/reduction/aggregation.py +6 -6
maxframe/dataframe/reduction/argmax.py +7 -4
maxframe/dataframe/reduction/argmin.py +7 -4
maxframe/dataframe/reduction/core.py +18 -9
maxframe/dataframe/reduction/mode.py +144 -0
maxframe/dataframe/reduction/nunique.py +10 -3
maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
maxframe/dataframe/sort/__init__.py +9 -2
maxframe/dataframe/sort/argsort.py +7 -1
maxframe/dataframe/sort/core.py +1 -1
maxframe/dataframe/sort/rank.py +147 -0
maxframe/dataframe/tseries/__init__.py +19 -0
maxframe/dataframe/tseries/at_time.py +61 -0
maxframe/dataframe/tseries/between_time.py +122 -0
maxframe/dataframe/utils.py +30 -26
maxframe/learn/contrib/llm/core.py +16 -7
maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
maxframe/learn/contrib/llm/deploy/config.py +221 -0
maxframe/learn/contrib/llm/deploy/core.py +247 -0
maxframe/learn/contrib/llm/deploy/framework.py +35 -0
maxframe/learn/contrib/llm/deploy/loader.py +360 -0
maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
maxframe/learn/contrib/llm/models/__init__.py +1 -0
maxframe/learn/contrib/llm/models/dashscope.py +12 -6
maxframe/learn/contrib/llm/models/managed.py +76 -11
maxframe/learn/contrib/llm/models/openai.py +72 -0
maxframe/learn/contrib/llm/tests/__init__.py +13 -0
maxframe/learn/contrib/llm/tests/test_core.py +34 -0
maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
maxframe/learn/contrib/llm/text.py +348 -42
maxframe/learn/contrib/models.py +4 -1
maxframe/learn/contrib/xgboost/classifier.py +2 -0
maxframe/learn/contrib/xgboost/core.py +31 -7
maxframe/learn/contrib/xgboost/predict.py +4 -2
maxframe/learn/contrib/xgboost/regressor.py +5 -0
maxframe/learn/contrib/xgboost/train.py +2 -0
maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
maxframe/learn/utils/__init__.py +1 -0
maxframe/learn/utils/extmath.py +42 -9
maxframe/learn/utils/odpsio.py +80 -11
maxframe/lib/filesystem/_oss_lib/common.py +2 -0
maxframe/lib/mmh3.cp39-win_amd64.pyd +0 -0
maxframe/opcodes.py +9 -1
maxframe/remote/core.py +4 -0
maxframe/serialization/core.cp39-win_amd64.pyd +0 -0
maxframe/serialization/tests/test_serial.py +2 -2
maxframe/tensor/arithmetic/__init__.py +1 -1
maxframe/tensor/arithmetic/core.py +2 -2
maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
maxframe/tensor/core.py +3 -0
maxframe/tensor/misc/copyto.py +1 -1
maxframe/tests/test_udf.py +61 -0
maxframe/tests/test_utils.py +8 -5
maxframe/udf.py +103 -7
maxframe/utils.py +61 -8
{maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
{maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
maxframe_client/session/task.py +8 -1
maxframe_client/tests/test_session.py +24 -0
maxframe/dataframe/arrays.py +0 -864
{maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
{maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0

maxframe/tests/test_udf.py ADDED Viewed

@@ -0,0 +1,61 @@
+# Copyright 1999-2025 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import textwrap
+from odps import ODPS
+from odps.errors import NoSuchObject
+from maxframe.tests.utils import tn
+from maxframe.udf import ODPSFunction
+def test_odps_function():
+    func_body = """from odps.udf import annotate
+    @annotate("bigint->bigint")
+    class MyMul(object):
+        def evaluate(self, arg0):
+            return arg0 * 2 if arg0 is not None else None"""
+    odps_entry = ODPS.from_environments()
+    res_name = tn("test_res")
+    func_name = tn("test_odps_func")
+    def _cleanup():
+        try:
+            odps_entry.delete_resource(res_name + ".py")
+        except NoSuchObject:
+            pass
+        try:
+            odps_entry.delete_function(func_name)
+        except NoSuchObject:
+            pass
+    _cleanup()
+    try:
+        test_res = odps_entry.create_resource(
+            res_name + ".py", "py", fileobj=textwrap.dedent(func_body)
+        )
+        test_odps_func_obj = odps_entry.create_function(
+            func_name, class_type=f"{res_name}.MyMul", resources=[test_res]
+        )
+        func = ODPSFunction.wrap(test_odps_func_obj)
+        assert isinstance(func, ODPSFunction)
+        assert func.__name__ == func_name
+        assert func.full_function_name in (
+            f"{odps_entry.project}:{func_name}",
+            f"{odps_entry.project}:default:{func_name}",
+        )
+    finally:
+        _cleanup()

maxframe/tests/test_utils.py CHANGED Viewed

@@ -31,6 +31,7 @@ import pyarrow as pa
 import pytest
 from .. import utils
+from ..lib.dtypes_extension import ArrowDtype
 from ..serialization import PickleContainer
 from ..utils import parse_size_to_megabytes, validate_and_adjust_resource_ratio
@@ -298,11 +299,11 @@ def test_estimate_pandas_size():
     s1 = pd.Series(np.random.rand(1000))
     assert utils.estimate_pandas_size(s1) == sys.getsizeof(s1)
-    from ..dataframe.arrays import ArrowStringArray
-    array = ArrowStringArray(np.random.choice(["abcd", "def", "gh"], size=(1000,)))
-    s2 = pd.Series(array)
-    assert utils.estimate_pandas_size(s2) == sys.getsizeof(s2)
+    if hasattr(pd, "ArrowDtype"):
+        arrow_array = pa.array(np.random.choice(["abcd", "def", "gh"], size=(1000,)))
+        array = pd.array(arrow_array, dtype=ArrowDtype(arrow_array.type))
+        s2 = pd.Series(array)
+        assert utils.estimate_pandas_size(s2) == sys.getsizeof(s2)
     s3 = pd.Series(np.random.choice(["abcd", "def", "gh"], size=(1000,)))
     assert (
@@ -366,6 +367,8 @@ def test_arrow_type_from_string():
     _assert_arrow_type_convert(pa.decimal128(10, 2))
     _assert_arrow_type_convert(pa.list_(pa.int64()))
     _assert_arrow_type_convert(pa.map_(pa.string(), pa.int64()))
+    _assert_arrow_type_convert(pa.date32())
+    _assert_arrow_type_convert(pa.date64())
     _assert_arrow_type_convert(
         pa.struct([("key", pa.string()), ("value", pa.list_(pa.int64()))])
     )

maxframe/udf.py CHANGED Viewed

@@ -13,13 +13,18 @@
 # limitations under the License.
 import shlex
+import sys
 from typing import Callable, List, Optional, Union
-from odps.models import Resource
+import numpy as np
+from odps.models import Function as ODPSFunctionObj
+from odps.models import Resource as ODPSResourceObj
 from .config.validators import is_positive_integer
+from .core.mode import is_mock_mode
 from .serialization import load_member
 from .serialization.serializables import (
+    AnyField,
     BoolField,
     DictField,
     FieldTypes,
@@ -28,7 +33,8 @@ from .serialization.serializables import (
     Serializable,
     StringField,
 )
-from .utils import extract_class_name, tokenize
+from .typing_ import PandasDType
+from .utils import extract_class_name, make_dtype, tokenize
 class PythonPackOptions(Serializable):
@@ -122,8 +128,100 @@ class MarkedFunction(Serializable):
         return f"<MarkedFunction {self.func!r}>"
-def with_resources(*resources: Union[str, Resource], use_wrapper_class: bool = True):
-    def res_to_str(res: Union[str, Resource]) -> str:
+class ODPSFunction(Serializable):
+    __slots__ = ("_caller_type",)
+    full_function_name = StringField("full_function_name")
+    expect_engine = StringField("expect_engine", default=None)
+    expect_resources = DictField(
+        "expect_resources", FieldTypes.string, default_factory=dict
+    )
+    result_dtype = AnyField("result_dtype", default=None)
+    def __init__(
+        self,
+        func,
+        expect_engine: str = None,
+        expect_resources: dict = None,
+        dtype: PandasDType = None,
+        **kw,
+    ):
+        full_function_name = None
+        if isinstance(func, str):
+            full_function_name = func
+        elif isinstance(func, ODPSFunctionObj):
+            func_parts = [func.project.name]
+            if func.schema:
+                func_parts.append(func.schema.name)
+            func_parts.append(func.name)
+            full_function_name = ":".join(func_parts)
+        if full_function_name:
+            kw["full_function_name"] = full_function_name
+        if dtype is not None:
+            kw["result_dtype"] = make_dtype(dtype)
+        super().__init__(
+            expect_engine=expect_engine, expect_resources=expect_resources, **kw
+        )
+    @property
+    def __name__(self):
+        return self.full_function_name.rsplit(":", 1)[-1]
+    def _detect_caller_type(self) -> Optional[str]:
+        if hasattr(self, "_caller_type"):
+            return self._caller_type
+        frame = sys._getframe(1)
+        is_set = False
+        while frame.f_back:
+            f_mod = frame.f_globals.get("__name__")
+            if f_mod and f_mod.startswith("maxframe.dataframe."):
+                if f_mod.endswith(".map"):
+                    self._caller_type, is_set = "map", True
+                elif f_mod.endswith(".aggregation") or ".reduction." in f_mod:
+                    self._caller_type, is_set = "agg", True
+                if is_set:
+                    return self._caller_type
+            frame = frame.f_back
+        return None
+    def __call__(self, obj, *args, **kwargs):
+        caller_type = self._detect_caller_type()
+        if caller_type == "agg":
+            return self._call_aggregate(obj, *args, **kwargs)
+        raise NotImplementedError("Need to be referenced inside apply or map functions")
+    def _call_aggregate(self, obj, *args, **kwargs):
+        from .dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
+        from .dataframe.reduction.custom_reduction import build_custom_reduction_result
+        if isinstance(obj, (DATAFRAME_TYPE, SERIES_TYPE)):
+            return build_custom_reduction_result(obj, self)
+        if is_mock_mode():
+            ret = obj.iloc[0]
+            if self.result_dtype:
+                if hasattr(ret, "astype"):
+                    ret = ret.astype(self.result_dtype)
+                else:  # pragma: no cover
+                    ret = np.array(ret).astype(self.result_dtype).item()
+            return ret
+        raise NotImplementedError("Need to be referenced inside apply or map functions")
+    def __repr__(self):
+        return f"<ODPSStoredFunction {self.full_function_name}>"
+    @classmethod
+    def wrap(cls, func):
+        if isinstance(func, ODPSFunctionObj):
+            return ODPSFunction(func)
+        return func
+def with_resources(
+    *resources: Union[str, ODPSResourceObj], use_wrapper_class: bool = True
+):
+    def res_to_str(res: Union[str, ODPSResourceObj]) -> str:
         if isinstance(res, str):
             return res
         res_parts = [res.project.name]
@@ -250,9 +348,7 @@ def with_running_options(
 with_resource_libraries = with_resources
-def get_udf_resources(
-    func: Callable,
-) -> List[Union[Resource, str]]:
+def get_udf_resources(func: Callable) -> List[Union[ODPSResourceObj, str]]:
     return getattr(func, "resources", None) or []

maxframe/utils.py CHANGED Viewed

@@ -14,6 +14,7 @@
 import asyncio.events
 import concurrent.futures
+import contextlib
 import contextvars
 import copy
 import dataclasses
@@ -80,6 +81,7 @@ from ._utils import (  # noqa: F401 # pylint: disable=unused-import
     tokenize,
     tokenize_int,
 )
+from .lib.dtypes_extension import ArrowDtype
 from .lib.version import parse as parse_version
 from .typing_ import TileableType, TimeoutType
@@ -204,13 +206,28 @@ def on_serialize_nsplits(value: Tuple[Tuple[int]]):
     return tuple(new_nsplits)
-def has_unknown_shape(*tiled_tileables: TileableType) -> bool:
+def has_unknown_shape(
+    *tiled_tileables: TileableType, axis: Union[None, int, List[int]] = None
+) -> bool:
+    if isinstance(axis, int):
+        axis = [axis]
     for tileable in tiled_tileables:
         if getattr(tileable, "shape", None) is None:
             continue
-        if any(pd.isnull(s) for s in tileable.shape):
+        shape_iter = (
+            tileable.shape if axis is None else (tileable.shape[idx] for idx in axis)
+        )
+        if any(pd.isnull(s) for s in shape_iter):
             return True
-        if any(pd.isnull(s) for s in itertools.chain(*tileable.nsplits)):
+        nsplits_iter = (
+            tileable.nsplits
+            if axis is None
+            else (tileable.nsplits[idx] for idx in axis)
+        )
+        if any(pd.isnull(s) for s in itertools.chain(*nsplits_iter)):
             return True
     return False
@@ -281,7 +298,10 @@ def make_dtype(dtype: Union[np.dtype, pd.api.extensions.ExtensionDtype]):
     elif dtype is pd.Timedelta or dtype is datetime.timedelta:
         return np.dtype("timedelta64[ns]")
     else:
-        return np.dtype(dtype)
+        try:
+            return pd.api.types.pandas_dtype(dtype)
+        except TypeError:
+            return np.dtype("O")
 def make_dtypes(
@@ -448,7 +468,10 @@ def create_sync_primitive(
         return cls(loop=loop)
     # From Python3.10 the loop parameter has been removed. We should work around here.
-    old_loop = asyncio.get_event_loop()
+    try:
+        old_loop = asyncio.get_event_loop()
+    except RuntimeError:
+        old_loop = None
     try:
         asyncio.set_event_loop(loop)
         primitive = cls()
@@ -599,8 +622,6 @@ def estimate_pandas_size(
         # MultiIndex's sample size can't be used to estimate
         return sys.getsizeof(pd_obj)
-    from .dataframe.arrays import ArrowDtype
     def _is_fast_dtype(dtype):
         if isinstance(dtype, np.dtype):
             return np.issubdtype(dtype, np.number)
@@ -1182,13 +1203,16 @@ if pa:
         "float": pa.float32,
         "double": pa.float64,
         "decimal": pa.decimal128,
+        # repr() of date32 and date64 has `day` or `ms`
+        #  which is not needed in constructors
+        "date32": lambda *_: pa.date32(),
+        "date64": lambda *_: pa.date64(),
     }
     _plain_arrow_types = """
     null
     int8 int16 int32 int64
     uint8 uint16 uint32 uint64
     float16 float32 float64
-    date32 date64
     decimal128 decimal256
     string utf8 binary
     time32 time64 duration timestamp
@@ -1719,3 +1743,32 @@ def validate_and_adjust_resource_ratio(
             )
     return expect_resources, False
+def get_pd_option(option_name, default=no_default):
+    """Get pandas option. If not exist return `default`."""
+    try:
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", category=FutureWarning)
+            return pd.get_option(option_name)
+    except (KeyError, AttributeError):
+        if default is no_default:
+            raise
+        return default
+@contextlib.contextmanager
+def pd_option_context(*args):
+    arg_kv = dict(zip(args[0::2], args[1::2]))
+    new_args = []
+    for k, v in arg_kv.items():
+        try:
+            get_pd_option(k)
+        except (KeyError, AttributeError):  # pragma: no cover
+            continue
+        new_args.extend([k, v])
+    if not new_args:  # pragma: no cover
+        yield
+    else:
+        with pd.option_context(*new_args):
+            yield

{maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: maxframe
-Version: 2.2.0
+Version: 2.3.0rc1
 Summary: MaxFrame operator-based data analyze framework
 Requires-Dist: numpy<2.0.0,>=1.19.0
 Requires-Dist: pandas>=1.0.0
@@ -107,4 +107,3 @@ License
 Licensed under the `Apache License
 2.0 <https://www.apache.org/licenses/LICENSE-2.0.html>`__.