PyPI - maxframe - Versions diffs - 0.1.0b4__cp310-cp310-win_amd64.whl → 1.0.0rc1__cp310-cp310-win_amd64.whl - Mend

maxframe 0.1.0b4__cp310-cp310-win_amd64.whl → 1.0.0rc1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (81)

maxframe/__init__.py +1 -0
maxframe/_utils.cp310-win_amd64.pyd +0 -0
maxframe/codegen.py +56 -3
maxframe/config/config.py +15 -1
maxframe/core/__init__.py +0 -3
maxframe/core/entity/__init__.py +1 -8
maxframe/core/entity/objects.py +3 -45
maxframe/core/graph/core.cp310-win_amd64.pyd +0 -0
maxframe/core/graph/core.pyx +4 -4
maxframe/dataframe/__init__.py +1 -0
maxframe/dataframe/core.py +30 -8
maxframe/dataframe/datasource/read_odps_query.py +3 -1
maxframe/dataframe/datasource/read_odps_table.py +3 -1
maxframe/dataframe/datastore/tests/__init__.py +13 -0
maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
maxframe/dataframe/datastore/to_odps.py +21 -0
maxframe/dataframe/indexing/align.py +1 -1
maxframe/dataframe/misc/__init__.py +4 -0
maxframe/dataframe/misc/apply.py +3 -1
maxframe/dataframe/misc/case_when.py +141 -0
maxframe/dataframe/misc/memory_usage.py +2 -2
maxframe/dataframe/misc/pivot_table.py +262 -0
maxframe/dataframe/misc/tests/test_misc.py +84 -0
maxframe/dataframe/plotting/core.py +2 -2
maxframe/dataframe/reduction/core.py +2 -1
maxframe/dataframe/statistics/corr.py +3 -3
maxframe/dataframe/utils.py +7 -0
maxframe/errors.py +13 -0
maxframe/extension.py +12 -0
maxframe/learn/contrib/utils.py +52 -0
maxframe/learn/contrib/xgboost/__init__.py +26 -0
maxframe/learn/contrib/xgboost/classifier.py +86 -0
maxframe/learn/contrib/xgboost/core.py +156 -0
maxframe/learn/contrib/xgboost/dmatrix.py +150 -0
maxframe/learn/contrib/xgboost/predict.py +138 -0
maxframe/learn/contrib/xgboost/regressor.py +78 -0
maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
maxframe/learn/contrib/xgboost/train.py +121 -0
maxframe/learn/utils/__init__.py +15 -0
maxframe/learn/utils/core.py +29 -0
maxframe/lib/mmh3.cp310-win_amd64.pyd +0 -0
maxframe/lib/mmh3.pyi +43 -0
maxframe/lib/wrapped_pickle.py +2 -1
maxframe/odpsio/arrow.py +2 -3
maxframe/odpsio/tableio.py +22 -0
maxframe/odpsio/tests/test_schema.py +16 -11
maxframe/opcodes.py +3 -0
maxframe/protocol.py +108 -10
maxframe/serialization/core.cp310-win_amd64.pyd +0 -0
maxframe/serialization/core.pxd +3 -0
maxframe/serialization/core.pyi +64 -0
maxframe/serialization/core.pyx +54 -25
maxframe/serialization/exception.py +1 -1
maxframe/serialization/pandas.py +7 -2
maxframe/serialization/serializables/core.py +119 -12
maxframe/serialization/serializables/tests/test_serializable.py +46 -4
maxframe/session.py +28 -0
maxframe/tensor/__init__.py +1 -1
maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
maxframe/tensor/base/__init__.py +2 -0
maxframe/tensor/base/atleast_1d.py +74 -0
maxframe/tensor/base/unique.py +205 -0
maxframe/tensor/datasource/array.py +4 -2
maxframe/tensor/datasource/scalar.py +1 -1
maxframe/tensor/reduction/count_nonzero.py +1 -1
maxframe/tests/test_protocol.py +34 -0
maxframe/tests/test_utils.py +0 -12
maxframe/tests/utils.py +2 -2
maxframe/udf.py +63 -3
maxframe/utils.py +22 -13
{maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/METADATA +3 -3
{maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/RECORD +80 -61
maxframe_client/__init__.py +0 -1
maxframe_client/fetcher.py +65 -3
maxframe_client/session/odps.py +74 -5
maxframe_client/session/task.py +65 -71
maxframe_client/tests/test_session.py +64 -1
maxframe_client/clients/spe.py +0 -104
{maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/WHEEL +0 -0
{maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/top_level.txt +0 -0

maxframe/tensor/base/unique.py ADDED Viewed

@@ -0,0 +1,205 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+from ... import opcodes as OperandDef
+from ...serialization.serializables import BoolField, Int32Field
+from ..core import TensorOrder
+from ..operators import TensorHasInput, TensorOperatorMixin
+from ..utils import validate_axis
+class TensorUnique(TensorHasInput, TensorOperatorMixin):
+    """Operator describing a ``unique`` computation on a tensor.
+
+    The boolean fields select which optional outputs (first-occurrence
+    indices, inverse indices, counts) are described in addition to the
+    unique values. ``axis`` is the axis the operation runs along; it is
+    normalized to a concrete integer inside ``__call__``.
+    """
+
+    _op_type_ = OperandDef.UNIQUE
+    return_index = BoolField("return_index", default=False)
+    return_inverse = BoolField("return_inverse", default=False)
+    return_counts = BoolField("return_counts", default=False)
+    axis = Int32Field("axis", default=None)
+
+    @property
+    def output_limit(self):
+        # NOTE(review): this is always 1 although __call__ may describe up to
+        # four outputs -- confirm that new_tensors does not truncate to it.
+        return 1
+
+    def _gen_kws(self, input_obj, chunk=False, chunk_index=None):
+        """Build one keyword dict per output tensor.
+
+        Parameters
+        ----------
+        input_obj
+            The input tensor; its ``shape``, ``dtype`` and ``op.gpu`` are read.
+        chunk : bool
+            When true, an ``"index"`` entry is added to every dict.
+        chunk_index : int or None
+            Position used in the ``"index"`` entries; ``None`` is treated as 0.
+
+        Returns
+        -------
+        list of dict
+            The unique-values entry first, followed by the entries for the
+            indices, inverse and counts outputs that are enabled.
+        """
+        kws = []
+        # unique tensor
+        shape = list(input_obj.shape)
+        # the extent along the unique axis is recorded as NaN
+        shape[self.axis] = np.nan
+        kw = {"shape": tuple(shape), "dtype": input_obj.dtype, "gpu": input_obj.op.gpu}
+        if chunk:
+            idx = [0] * len(shape)
+            idx[self.axis] = chunk_index or 0
+            kw["index"] = tuple(idx)
+        kws.append(kw)
+        # unique indices tensor
+        if self.return_index:
+            kw = {
+                "shape": (np.nan,),
+                "dtype": np.dtype(np.intp),
+                "gpu": input_obj.op.gpu,
+                "type": "indices",
+            }
+            if chunk:
+                kw["index"] = (chunk_index or 0,)
+            kws.append(kw)
+        # unique inverse tensor
+        if self.return_inverse:
+            kw = {
+                # one inverse entry per element along the unique axis
+                "shape": (input_obj.shape[self.axis],),
+                "dtype": np.dtype(np.intp),
+                "gpu": input_obj.op.gpu,
+                "type": "inverse",
+            }
+            if chunk:
+                kw["index"] = (chunk_index or 0,)
+            kws.append(kw)
+        # unique counts tensor
+        if self.return_counts:
+            kw = {
+                "shape": (np.nan,),
+                "dtype": np.dtype(int),
+                "gpu": input_obj.op.gpu,
+                "type": "counts",
+            }
+            if chunk:
+                kw["index"] = (chunk_index or 0,)
+            kws.append(kw)
+        return kws
+
+    def __call__(self, ar):
+        """Apply the operator to ``ar``.
+
+        The input is promoted to at least 1-D. Without an axis, inputs with
+        more than one dimension are flattened and axis 0 is used; otherwise
+        the axis is validated against the input's ``ndim``.
+
+        Returns a single tensor when only one output is described, else the
+        list of tensors returned by ``new_tensors``.
+        """
+        from .atleast_1d import atleast_1d
+
+        ar = atleast_1d(ar)
+        if self.axis is None:
+            if ar.ndim > 1:
+                ar = ar.flatten()
+            # Assign the declared ``axis`` field: _gen_kws indexes with
+            # ``self.axis``, so writing to ``_axis`` would leave it as None.
+            self.axis = 0
+        else:
+            self.axis = validate_axis(ar.ndim, self.axis)
+        # _gen_kws is an instance method; passing ``self`` again would bind
+        # the operator to ``input_obj`` and the tensor to ``chunk``.
+        kws = self._gen_kws(ar)
+        tensors = self.new_tensors([ar], kws=kws, order=TensorOrder.C_ORDER)
+        if len(tensors) == 1:
+            return tensors[0]
+        return tensors
+def unique(
+    ar,
+    return_index=False,
+    return_inverse=False,
+    return_counts=False,
+    axis=None,
+):
+    """
+    Find the unique elements of a tensor.
+
+    Returns the sorted unique elements of a tensor. Up to three optional
+    outputs can be requested in addition to the unique elements:
+
+    * the indices of the input tensor that give the unique values
+    * the indices of the unique tensor that reconstruct the input tensor
+    * the number of times each unique value comes up in the input tensor
+
+    Parameters
+    ----------
+    ar : array_like
+        Input tensor. Unless `axis` is specified, this will be flattened if it
+        is not already 1-D.
+    return_index : bool, optional
+        If True, also return the indices of `ar` (along the specified axis,
+        if provided, or in the flattened tensor) that result in the unique
+        tensor.
+    return_inverse : bool, optional
+        If True, also return the indices of the unique tensor (for the
+        specified axis, if provided) that can be used to reconstruct `ar`.
+    return_counts : bool, optional
+        If True, also return the number of times each unique item appears
+        in `ar`.
+    axis : int or None, optional
+        The axis to operate on. If None, `ar` will be flattened. If an
+        integer, the subarrays indexed by the given axis will be flattened
+        and treated as the elements of a 1-D tensor with the dimension of
+        the given axis. Object tensors or structured tensors that contain
+        objects are not supported if the `axis` kwarg is used. The default
+        is None.
+
+    Returns
+    -------
+    unique : Tensor
+        The sorted unique values.
+    unique_indices : Tensor, optional
+        The indices of the first occurrences of the unique values in the
+        original tensor. Only provided if `return_index` is True.
+    unique_inverse : Tensor, optional
+        The indices to reconstruct the original tensor from the
+        unique tensor. Only provided if `return_inverse` is True.
+    unique_counts : Tensor, optional
+        The number of times each of the unique values comes up in the
+        original tensor. Only provided if `return_counts` is True.
+
+    Examples
+    --------
+    >>> import maxframe.tensor as mt
+
+    >>> mt.unique([1, 1, 2, 2, 3, 3]).execute()
+    array([1, 2, 3])
+    >>> a = mt.array([[1, 1], [2, 3]])
+    >>> mt.unique(a).execute()
+    array([1, 2, 3])
+
+    Return the unique rows of a 2D tensor
+
+    >>> a = mt.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]])
+    >>> mt.unique(a, axis=0).execute()
+    array([[1, 0, 0], [2, 3, 4]])
+
+    Return the indices of the original tensor that give the unique values:
+
+    >>> a = mt.array(['a', 'b', 'b', 'c', 'a'])
+    >>> u, indices = mt.unique(a, return_index=True)
+    >>> u.execute()
+    array(['a', 'b', 'c'],
+           dtype='|S1')
+    >>> indices.execute()
+    array([0, 1, 3])
+    >>> a[indices].execute()
+    array(['a', 'b', 'c'],
+           dtype='|S1')
+
+    Reconstruct the input array from the unique values:
+
+    >>> a = mt.array([1, 2, 6, 4, 2, 3, 2])
+    >>> u, indices = mt.unique(a, return_inverse=True)
+    >>> u.execute()
+    array([1, 2, 3, 4, 6])
+    >>> indices.execute()
+    array([0, 1, 4, 3, 1, 2, 1])
+    >>> u[indices].execute()
+    array([1, 2, 6, 4, 2, 3, 2])
+    """
+    # Build the operator from the flags and apply it to the input directly.
+    return TensorUnique(
+        axis=axis,
+        return_counts=return_counts,
+        return_inverse=return_inverse,
+        return_index=return_index,
+    )(ar)

maxframe/tensor/datasource/array.py CHANGED Viewed

@@ -20,6 +20,7 @@ from ...serialization.serializables import (
     AnyField,
     FieldTypes,
     NDArrayField,
+    StringField,
     TupleField,
 )
 from ...utils import on_deserialize_shape, on_serialize_shape
@@ -37,8 +38,9 @@ class ArrayDataSource(TensorNoInput):
     _op_type_ = opcodes.TENSOR_DATA_SOURCE
-    data = NDArrayField("data")
-    chunk_size = AnyField("chunk_size")
+    data = NDArrayField("data", default=None)
+    chunk_size = AnyField("chunk_size", default=None)
+    order = StringField("order", default=None)
     def __init__(self, data=None, dtype=None, gpu=None, **kw):
         if dtype is not None:

maxframe/tensor/datasource/scalar.py CHANGED Viewed

@@ -33,7 +33,7 @@ class Scalar(TensorNoInput):
 def scalar(data, dtype=None, gpu=None):
     try:
         arr = np.array(data, dtype=dtype)
-        op = Scalar(arr, dtype=arr.dtype, gpu=gpu)
+        op = Scalar(data=arr, dtype=arr.dtype, gpu=gpu)
         shape = ()
         return op(shape)
     except ValueError:

maxframe/tensor/reduction/count_nonzero.py CHANGED Viewed

@@ -77,5 +77,5 @@ def count_nonzero(a, axis=None):
     array([2, 3])
     """
-    op = TensorCountNonzero(axis=axis, dtype=np.dtype(np.int_), keepdims=None)
+    op = TensorCountNonzero(axis=axis, dtype=np.dtype(int), keepdims=None)
     return op(a)

maxframe/tests/test_protocol.py CHANGED Viewed

@@ -85,6 +85,40 @@ def test_error_info_json_serialize():
         deserial_err_info.reraise()
+class CannotPickleException(Exception):
+    """Test exception whose ``__reduce__`` always raises ``ValueError``."""
+
+    def __reduce__(self):
+        # Raising from __reduce__ makes any attempt to pickle an instance fail.
+        raise ValueError()
+class CannotUnpickleException(Exception):
+    """Test exception that reduces to a loader which always raises ``ValueError``."""
+
+    def __reduce__(self):
+        # Reduce to the classmethod loader below, called with a dummy argument.
+        loader = type(self).load_from_pk
+        return loader, (0,)
+
+    @classmethod
+    def load_from_pk(cls, _):
+        # The argument is ignored; the loader fails unconditionally.
+        raise ValueError()
+def test_error_info_fallback_json_serialize():
+    """Round-trip ErrorInfo through JSON for the two exception fixtures.
+
+    After the round trip the raw error source and data must be ``None`` and
+    ``reraise()`` must raise ``RemoteException``.
+    """
+    err_infos = []
+    for exc_type in (CannotPickleException, CannotUnpickleException):
+        try:
+            raise exc_type
+        except exc_type as ex:
+            err_infos.append(ErrorInfo.from_exception(ex))
+
+    for err_info in err_infos:
+        restored = ErrorInfo.from_json(err_info.to_json())
+        assert restored.raw_error_source is None
+        assert restored.raw_error_data is None
+        with pytest.raises(RemoteException):
+            restored.reraise()
 def test_dag_info_json_serialize():
     try:
         raise ValueError("ERR_DATA")

maxframe/tests/test_utils.py CHANGED Viewed

@@ -288,15 +288,6 @@ def test_estimate_pandas_size():
     df2 = pd.DataFrame(np.random.rand(1000, 10))
     assert utils.estimate_pandas_size(df2) == sys.getsizeof(df2)
-    df3 = pd.DataFrame(
-        {
-            "A": np.random.choice(["abcd", "def", "gh"], size=(1000,)),
-            "B": np.random.rand(1000),
-            "C": np.random.rand(1000),
-        }
-    )
-    assert utils.estimate_pandas_size(df3) != sys.getsizeof(df3)
     s1 = pd.Series(np.random.rand(1000))
     assert utils.estimate_pandas_size(s1) == sys.getsizeof(s1)
@@ -307,7 +298,6 @@ def test_estimate_pandas_size():
     assert utils.estimate_pandas_size(s2) == sys.getsizeof(s2)
     s3 = pd.Series(np.random.choice(["abcd", "def", "gh"], size=(1000,)))
-    assert utils.estimate_pandas_size(s3) != sys.getsizeof(s3)
     assert (
         pytest.approx(utils.estimate_pandas_size(s3) / sys.getsizeof(s3), abs=0.5) == 1
     )
@@ -318,7 +308,6 @@ def test_estimate_pandas_size():
     assert utils.estimate_pandas_size(idx1) == sys.getsizeof(idx1)
     string_idx = pd.Index(np.random.choice(["a", "bb", "cc"], size=(1000,)))
-    assert utils.estimate_pandas_size(string_idx) != sys.getsizeof(string_idx)
     assert (
         pytest.approx(
             utils.estimate_pandas_size(string_idx) / sys.getsizeof(string_idx), abs=0.5
@@ -338,7 +327,6 @@ def test_estimate_pandas_size():
         },
         index=idx2,
     )
-    assert utils.estimate_pandas_size(df4) != sys.getsizeof(df4)
     assert (
         pytest.approx(utils.estimate_pandas_size(df4) / sys.getsizeof(df4), abs=0.5)
         == 1

maxframe/tests/utils.py CHANGED Viewed

@@ -25,7 +25,7 @@ import pytest
 from tornado import netutil
 from ..core import Tileable, TileableGraph
-from ..utils import lazy_import
+from ..utils import create_event, lazy_import
 try:
     from flaky import flaky
@@ -102,7 +102,7 @@ def run_app_in_thread(app_func):
     def fixture_func(*args, **kwargs):
         app_loop = asyncio.new_event_loop()
         q = queue.Queue()
-        exit_event = asyncio.Event(loop=app_loop)
+        exit_event = create_event(app_loop)
         app_thread = Thread(
             name="TestAppThread",
             target=app_thread_func,

maxframe/udf.py CHANGED Viewed

@@ -12,21 +12,51 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import shlex
 from typing import Callable, List, Optional, Union
 from odps.models import Resource
 from .serialization.serializables import (
+    BoolField,
     FieldTypes,
     FunctionField,
     ListField,
     Serializable,
+    StringField,
 )
+from .utils import tokenize
+class PythonPackOptions(Serializable):
+    """Serializable options for one set of Python requirements.
+
+    When no ``key`` is supplied, one is derived by tokenizing the set of
+    requirements together with the three boolean flags.
+    """
+
+    key = StringField("key")
+    requirements = ListField("requirements", FieldTypes.string, default_factory=list)
+    force_rebuild = BoolField("force_rebuild", default=False)
+    prefer_binary = BoolField("prefer_binary", default=False)
+    pre_release = BoolField("pre_release", default=False)
+    pack_instance_id = StringField("pack_instance_id", default=None)
+
+    def __init__(self, key: str = None, **kw):
+        super().__init__(key=key, **kw)
+        if self.key is not None:
+            return
+        # No explicit key given: compute one from the requirements and flags.
+        flags = {
+            "force_rebuild": self.force_rebuild,
+            "prefer_binary": self.prefer_binary,
+            "pre_release": self.pre_release,
+        }
+        self.key = tokenize(set(self.requirements), flags)
+
+    def __repr__(self):
+        head = f"<PythonPackOptions {self.requirements} force_rebuild={self.force_rebuild} "
+        tail = f"prefer_binary={self.prefer_binary} pre_release={self.pre_release}>"
+        return head + tail
 class MarkedFunction(Serializable):
     func = FunctionField("func")
     resources = ListField("resources", FieldTypes.string, default_factory=list)
+    pythonpacks = ListField("pythonpacks", FieldTypes.reference, default_factory=list)
     def __init__(self, func: Optional[Callable] = None, **kw):
         super().__init__(func=func, **kw)
@@ -54,13 +84,39 @@ def with_resources(*resources: Union[str, Resource], use_wrapper_class: bool = T
     def func_wrapper(func):
         str_resources = [res_to_str(r) for r in resources]
         if not use_wrapper_class:
-            func.resources = str_resources
+            existing = getattr(func, "resources") or []
+            func.resources = existing + str_resources
+            return func
+        if isinstance(func, MarkedFunction):
+            func.resources = func.resources + str_resources
             return func
+        return MarkedFunction(func, resources=str_resources)
+    return func_wrapper
+def with_python_requirements(
+    *requirements: str,
+    force_rebuild: bool = False,
+    prefer_binary: bool = False,
+    pre_release: bool = False,
+):
+    """Return a decorator that attaches Python requirements to a function.
+
+    Each entry of ``requirements`` is split with ``shlex.split``, so one
+    string may carry several whitespace-separated items. The split list and
+    the three flags are stored in a ``PythonPackOptions`` that is appended to
+    the ``pythonpacks`` of an existing ``MarkedFunction``, or used to wrap a
+    plain callable in a new ``MarkedFunction``.
+    """
+    result_req = []
+    for req in requirements:
+        result_req.extend(shlex.split(req))
+
+    def func_wrapper(func):
+        pack_item = PythonPackOptions(
+            # Pass the split requirements (not the raw argument tuple); copy so
+            # that functions decorated by the same wrapper do not share a list.
+            requirements=list(result_req),
+            force_rebuild=force_rebuild,
+            prefer_binary=prefer_binary,
+            pre_release=pre_release,
+        )
         if isinstance(func, MarkedFunction):
-            func.resources = str_resources
+            func.pythonpacks.append(pack_item)
             return func
-        return MarkedFunction(func, resources=list(str_resources))
+        return MarkedFunction(func, pythonpacks=[pack_item])
     return func_wrapper
@@ -72,3 +128,7 @@ def get_udf_resources(
     func: Callable,
 ) -> List[Union[Resource, str]]:
     return getattr(func, "resources", None) or []
+def get_udf_pythonpacks(func: Callable) -> List[PythonPackOptions]:
+    """Return the ``pythonpacks`` attribute of ``func``, or an empty list.
+
+    A missing attribute and a falsy value (``None`` or an empty list) both
+    yield a new empty list.
+    """
+    packs = getattr(func, "pythonpacks", None)
+    return packs if packs else []

maxframe/utils.py CHANGED Viewed

@@ -33,7 +33,6 @@ import sys
 import threading
 import time
 import tokenize as pytokenize
-import traceback
 import types
 import weakref
 import zlib
@@ -396,18 +395,6 @@ def build_tileable_dir_name(tileable_key: str) -> str:
     return m.hexdigest()
-def extract_messages_and_stacks(exc: Exception) -> Tuple[List[str], List[str]]:
-    cur_exc = exc
-    messages, stacks = [], []
-    while True:
-        messages.append(str(cur_exc))
-        stacks.append("".join(traceback.format_tb(cur_exc.__traceback__)))
-        if exc.__cause__ is None:
-            break
-        cur_exc = exc.__cause__
-    return messages, stacks
 async def wait_http_response(
     url: str, *, request_timeout: TimeoutType = None, **kwargs
 ) -> httpclient.HTTPResponse:
@@ -449,6 +436,21 @@ async def to_thread_pool(func, *args, pool=None, **kwargs):
     return await loop.run_in_executor(pool, func_call)
+def create_event(loop: asyncio.AbstractEventLoop) -> asyncio.Event:
+    """
+    Create an asyncio.Event for use with a certain event loop.
+
+    Before Python 3.10 the event is bound to ``loop`` at construction time
+    (``None`` selects the current event loop). From Python 3.10 on,
+    ``asyncio.Event`` no longer accepts a ``loop`` argument and binds to the
+    running loop the first time it has to block in ``wait()``, so a plain
+    ``asyncio.Event()`` is returned. This also works when no loop is running
+    in the calling thread.
+    """
+    if sys.version_info < (3, 10):
+        return asyncio.Event(loop=loop)
+    # Passing ``loop`` (even ``None``) raises TypeError on Python >= 3.10, and
+    # asyncio.get_running_loop() would raise RuntimeError for synchronous
+    # callers, so neither is used here.
+    return asyncio.Event()
 class ToThreadCancelledError(asyncio.CancelledError):
     def __init__(self, *args, result=None):
         super().__init__(*args)
@@ -519,6 +521,7 @@ def config_odps_default_options():
         "metaservice.client.cache.enable": "false",
         "odps.sql.session.result.cache.enable": "false",
         "odps.sql.submit.mode": "script",
+        "odps.sql.job.max.time.hours": 72,
     }
@@ -1106,3 +1109,9 @@ def get_python_tag():
     # todo add implementation suffix for non-GIL tags when PEP703 is ready
     version_info = sys.version_info
     return f"cp{version_info[0]}{version_info[1]}"
+def get_item_if_scalar(val: Any) -> Any:
+    """Unwrap a zero-dimensional ``numpy.ndarray`` into its Python scalar.
+
+    Any other value, including arrays with one or more dimensions, is
+    returned unchanged.
+    """
+    is_zero_dim_array = isinstance(val, np.ndarray) and val.shape == ()
+    return val.item() if is_zero_dim_array else val

{maxframe-0.1.0b4.dist-info → maxframe-1.0.0rc1.dist-info}/METADATA RENAMED Viewed

@@ -1,10 +1,10 @@
 Metadata-Version: 2.1
 Name: maxframe
-Version: 0.1.0b4
+Version: 1.0.0rc1
 Summary: MaxFrame operator-based data analyze framework
-Requires-Dist: numpy >=1.19.0
+Requires-Dist: numpy <2.0.0,>=1.19.0
 Requires-Dist: pandas >=1.0.0
-Requires-Dist: pyodps >=0.11.5
+Requires-Dist: pyodps >=0.11.6.1
 Requires-Dist: scipy >=1.0
 Requires-Dist: pyarrow >=1.0.0
 Requires-Dist: msgpack >=1.0.0