maxframe 2.2.0__cp39-cp39-win_amd64.whl → 2.3.0rc1__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (114)
  1. maxframe/_utils.cp39-win_amd64.pyd +0 -0
  2. maxframe/codegen/core.py +3 -2
  3. maxframe/codegen/spe/dataframe/merge.py +4 -0
  4. maxframe/codegen/spe/dataframe/misc.py +2 -0
  5. maxframe/codegen/spe/dataframe/reduction.py +18 -0
  6. maxframe/codegen/spe/dataframe/sort.py +9 -1
  7. maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
  8. maxframe/codegen/spe/dataframe/tseries.py +9 -0
  9. maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
  10. maxframe/codegen/spe/tensor/datasource.py +1 -0
  11. maxframe/config/config.py +3 -0
  12. maxframe/conftest.py +10 -0
  13. maxframe/core/base.py +2 -1
  14. maxframe/core/entity/tileables.py +2 -0
  15. maxframe/core/graph/core.cp39-win_amd64.pyd +0 -0
  16. maxframe/core/graph/entity.py +7 -1
  17. maxframe/core/mode.py +6 -1
  18. maxframe/dataframe/__init__.py +2 -2
  19. maxframe/dataframe/arithmetic/__init__.py +4 -0
  20. maxframe/dataframe/arithmetic/maximum.py +33 -0
  21. maxframe/dataframe/arithmetic/minimum.py +33 -0
  22. maxframe/dataframe/core.py +98 -106
  23. maxframe/dataframe/datasource/core.py +6 -0
  24. maxframe/dataframe/datasource/direct.py +57 -0
  25. maxframe/dataframe/datasource/read_csv.py +19 -11
  26. maxframe/dataframe/datasource/read_odps_query.py +29 -6
  27. maxframe/dataframe/datasource/read_odps_table.py +32 -10
  28. maxframe/dataframe/datasource/read_parquet.py +38 -39
  29. maxframe/dataframe/datastore/__init__.py +6 -0
  30. maxframe/dataframe/datastore/direct.py +268 -0
  31. maxframe/dataframe/datastore/to_odps.py +6 -0
  32. maxframe/dataframe/extensions/flatjson.py +2 -1
  33. maxframe/dataframe/groupby/__init__.py +5 -1
  34. maxframe/dataframe/groupby/aggregation.py +10 -6
  35. maxframe/dataframe/groupby/apply_chunk.py +1 -3
  36. maxframe/dataframe/groupby/core.py +20 -4
  37. maxframe/dataframe/indexing/__init__.py +2 -1
  38. maxframe/dataframe/indexing/insert.py +45 -17
  39. maxframe/dataframe/merge/__init__.py +3 -0
  40. maxframe/dataframe/merge/combine.py +244 -0
  41. maxframe/dataframe/misc/__init__.py +14 -3
  42. maxframe/dataframe/misc/check_unique.py +41 -10
  43. maxframe/dataframe/misc/drop.py +31 -0
  44. maxframe/dataframe/misc/infer_dtypes.py +251 -0
  45. maxframe/dataframe/misc/map.py +31 -18
  46. maxframe/dataframe/misc/repeat.py +159 -0
  47. maxframe/dataframe/misc/tests/test_misc.py +35 -1
  48. maxframe/dataframe/missing/checkna.py +3 -2
  49. maxframe/dataframe/reduction/__init__.py +10 -5
  50. maxframe/dataframe/reduction/aggregation.py +6 -6
  51. maxframe/dataframe/reduction/argmax.py +7 -4
  52. maxframe/dataframe/reduction/argmin.py +7 -4
  53. maxframe/dataframe/reduction/core.py +18 -9
  54. maxframe/dataframe/reduction/mode.py +144 -0
  55. maxframe/dataframe/reduction/nunique.py +10 -3
  56. maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
  57. maxframe/dataframe/sort/__init__.py +9 -2
  58. maxframe/dataframe/sort/argsort.py +7 -1
  59. maxframe/dataframe/sort/core.py +1 -1
  60. maxframe/dataframe/sort/rank.py +147 -0
  61. maxframe/dataframe/tseries/__init__.py +19 -0
  62. maxframe/dataframe/tseries/at_time.py +61 -0
  63. maxframe/dataframe/tseries/between_time.py +122 -0
  64. maxframe/dataframe/utils.py +30 -26
  65. maxframe/learn/contrib/llm/core.py +16 -7
  66. maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
  67. maxframe/learn/contrib/llm/deploy/config.py +221 -0
  68. maxframe/learn/contrib/llm/deploy/core.py +247 -0
  69. maxframe/learn/contrib/llm/deploy/framework.py +35 -0
  70. maxframe/learn/contrib/llm/deploy/loader.py +360 -0
  71. maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
  73. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  74. maxframe/learn/contrib/llm/models/dashscope.py +12 -6
  75. maxframe/learn/contrib/llm/models/managed.py +76 -11
  76. maxframe/learn/contrib/llm/models/openai.py +72 -0
  77. maxframe/learn/contrib/llm/tests/__init__.py +13 -0
  78. maxframe/learn/contrib/llm/tests/test_core.py +34 -0
  79. maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
  80. maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
  81. maxframe/learn/contrib/llm/text.py +348 -42
  82. maxframe/learn/contrib/models.py +4 -1
  83. maxframe/learn/contrib/xgboost/classifier.py +2 -0
  84. maxframe/learn/contrib/xgboost/core.py +31 -7
  85. maxframe/learn/contrib/xgboost/predict.py +4 -2
  86. maxframe/learn/contrib/xgboost/regressor.py +5 -0
  87. maxframe/learn/contrib/xgboost/train.py +2 -0
  88. maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
  89. maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
  90. maxframe/learn/utils/__init__.py +1 -0
  91. maxframe/learn/utils/extmath.py +42 -9
  92. maxframe/learn/utils/odpsio.py +80 -11
  93. maxframe/lib/filesystem/_oss_lib/common.py +2 -0
  94. maxframe/lib/mmh3.cp39-win_amd64.pyd +0 -0
  95. maxframe/opcodes.py +9 -1
  96. maxframe/remote/core.py +4 -0
  97. maxframe/serialization/core.cp39-win_amd64.pyd +0 -0
  98. maxframe/serialization/tests/test_serial.py +2 -2
  99. maxframe/tensor/arithmetic/__init__.py +1 -1
  100. maxframe/tensor/arithmetic/core.py +2 -2
  101. maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
  102. maxframe/tensor/core.py +3 -0
  103. maxframe/tensor/misc/copyto.py +1 -1
  104. maxframe/tests/test_udf.py +61 -0
  105. maxframe/tests/test_utils.py +8 -5
  106. maxframe/udf.py +103 -7
  107. maxframe/utils.py +61 -8
  108. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
  109. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
  110. maxframe_client/session/task.py +8 -1
  111. maxframe_client/tests/test_session.py +24 -0
  112. maxframe/dataframe/arrays.py +0 -864
  113. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
  114. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,61 @@
1
+ # Copyright 1999-2025 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import textwrap
16
+
17
+ from odps import ODPS
18
+ from odps.errors import NoSuchObject
19
+
20
+ from maxframe.tests.utils import tn
21
+ from maxframe.udf import ODPSFunction
22
+
23
+
24
+ def test_odps_function():
25
+ func_body = """from odps.udf import annotate
26
+ @annotate("bigint->bigint")
27
+ class MyMul(object):
28
+ def evaluate(self, arg0):
29
+ return arg0 * 2 if arg0 is not None else None"""
30
+ odps_entry = ODPS.from_environments()
31
+ res_name = tn("test_res")
32
+ func_name = tn("test_odps_func")
33
+
34
+ def _cleanup():
35
+ try:
36
+ odps_entry.delete_resource(res_name + ".py")
37
+ except NoSuchObject:
38
+ pass
39
+ try:
40
+ odps_entry.delete_function(func_name)
41
+ except NoSuchObject:
42
+ pass
43
+
44
+ _cleanup()
45
+
46
+ try:
47
+ test_res = odps_entry.create_resource(
48
+ res_name + ".py", "py", fileobj=textwrap.dedent(func_body)
49
+ )
50
+ test_odps_func_obj = odps_entry.create_function(
51
+ func_name, class_type=f"{res_name}.MyMul", resources=[test_res]
52
+ )
53
+ func = ODPSFunction.wrap(test_odps_func_obj)
54
+ assert isinstance(func, ODPSFunction)
55
+ assert func.__name__ == func_name
56
+ assert func.full_function_name in (
57
+ f"{odps_entry.project}:{func_name}",
58
+ f"{odps_entry.project}:default:{func_name}",
59
+ )
60
+ finally:
61
+ _cleanup()
@@ -31,6 +31,7 @@ import pyarrow as pa
31
31
  import pytest
32
32
 
33
33
  from .. import utils
34
+ from ..lib.dtypes_extension import ArrowDtype
34
35
  from ..serialization import PickleContainer
35
36
  from ..utils import parse_size_to_megabytes, validate_and_adjust_resource_ratio
36
37
 
@@ -298,11 +299,11 @@ def test_estimate_pandas_size():
298
299
  s1 = pd.Series(np.random.rand(1000))
299
300
  assert utils.estimate_pandas_size(s1) == sys.getsizeof(s1)
300
301
 
301
- from ..dataframe.arrays import ArrowStringArray
302
-
303
- array = ArrowStringArray(np.random.choice(["abcd", "def", "gh"], size=(1000,)))
304
- s2 = pd.Series(array)
305
- assert utils.estimate_pandas_size(s2) == sys.getsizeof(s2)
302
+ if hasattr(pd, "ArrowDtype"):
303
+ arrow_array = pa.array(np.random.choice(["abcd", "def", "gh"], size=(1000,)))
304
+ array = pd.array(arrow_array, dtype=ArrowDtype(arrow_array.type))
305
+ s2 = pd.Series(array)
306
+ assert utils.estimate_pandas_size(s2) == sys.getsizeof(s2)
306
307
 
307
308
  s3 = pd.Series(np.random.choice(["abcd", "def", "gh"], size=(1000,)))
308
309
  assert (
@@ -366,6 +367,8 @@ def test_arrow_type_from_string():
366
367
  _assert_arrow_type_convert(pa.decimal128(10, 2))
367
368
  _assert_arrow_type_convert(pa.list_(pa.int64()))
368
369
  _assert_arrow_type_convert(pa.map_(pa.string(), pa.int64()))
370
+ _assert_arrow_type_convert(pa.date32())
371
+ _assert_arrow_type_convert(pa.date64())
369
372
  _assert_arrow_type_convert(
370
373
  pa.struct([("key", pa.string()), ("value", pa.list_(pa.int64()))])
371
374
  )
maxframe/udf.py CHANGED
@@ -13,13 +13,18 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import shlex
16
+ import sys
16
17
  from typing import Callable, List, Optional, Union
17
18
 
18
- from odps.models import Resource
19
+ import numpy as np
20
+ from odps.models import Function as ODPSFunctionObj
21
+ from odps.models import Resource as ODPSResourceObj
19
22
 
20
23
  from .config.validators import is_positive_integer
24
+ from .core.mode import is_mock_mode
21
25
  from .serialization import load_member
22
26
  from .serialization.serializables import (
27
+ AnyField,
23
28
  BoolField,
24
29
  DictField,
25
30
  FieldTypes,
@@ -28,7 +33,8 @@ from .serialization.serializables import (
28
33
  Serializable,
29
34
  StringField,
30
35
  )
31
- from .utils import extract_class_name, tokenize
36
+ from .typing_ import PandasDType
37
+ from .utils import extract_class_name, make_dtype, tokenize
32
38
 
33
39
 
34
40
  class PythonPackOptions(Serializable):
@@ -122,8 +128,100 @@ class MarkedFunction(Serializable):
122
128
  return f"<MarkedFunction {self.func!r}>"
123
129
 
124
130
 
125
- def with_resources(*resources: Union[str, Resource], use_wrapper_class: bool = True):
126
- def res_to_str(res: Union[str, Resource]) -> str:
131
+ class ODPSFunction(Serializable):
132
+ __slots__ = ("_caller_type",)
133
+
134
+ full_function_name = StringField("full_function_name")
135
+ expect_engine = StringField("expect_engine", default=None)
136
+ expect_resources = DictField(
137
+ "expect_resources", FieldTypes.string, default_factory=dict
138
+ )
139
+ result_dtype = AnyField("result_dtype", default=None)
140
+
141
+ def __init__(
142
+ self,
143
+ func,
144
+ expect_engine: str = None,
145
+ expect_resources: dict = None,
146
+ dtype: PandasDType = None,
147
+ **kw,
148
+ ):
149
+ full_function_name = None
150
+ if isinstance(func, str):
151
+ full_function_name = func
152
+ elif isinstance(func, ODPSFunctionObj):
153
+ func_parts = [func.project.name]
154
+ if func.schema:
155
+ func_parts.append(func.schema.name)
156
+ func_parts.append(func.name)
157
+ full_function_name = ":".join(func_parts)
158
+ if full_function_name:
159
+ kw["full_function_name"] = full_function_name
160
+
161
+ if dtype is not None:
162
+ kw["result_dtype"] = make_dtype(dtype)
163
+ super().__init__(
164
+ expect_engine=expect_engine, expect_resources=expect_resources, **kw
165
+ )
166
+
167
+ @property
168
+ def __name__(self):
169
+ return self.full_function_name.rsplit(":", 1)[-1]
170
+
171
+ def _detect_caller_type(self) -> Optional[str]:
172
+ if hasattr(self, "_caller_type"):
173
+ return self._caller_type
174
+
175
+ frame = sys._getframe(1)
176
+ is_set = False
177
+ while frame.f_back:
178
+ f_mod = frame.f_globals.get("__name__")
179
+ if f_mod and f_mod.startswith("maxframe.dataframe."):
180
+ if f_mod.endswith(".map"):
181
+ self._caller_type, is_set = "map", True
182
+ elif f_mod.endswith(".aggregation") or ".reduction." in f_mod:
183
+ self._caller_type, is_set = "agg", True
184
+ if is_set:
185
+ return self._caller_type
186
+ frame = frame.f_back
187
+ return None
188
+
189
+ def __call__(self, obj, *args, **kwargs):
190
+ caller_type = self._detect_caller_type()
191
+ if caller_type == "agg":
192
+ return self._call_aggregate(obj, *args, **kwargs)
193
+ raise NotImplementedError("Need to be referenced inside apply or map functions")
194
+
195
+ def _call_aggregate(self, obj, *args, **kwargs):
196
+ from .dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
197
+ from .dataframe.reduction.custom_reduction import build_custom_reduction_result
198
+
199
+ if isinstance(obj, (DATAFRAME_TYPE, SERIES_TYPE)):
200
+ return build_custom_reduction_result(obj, self)
201
+ if is_mock_mode():
202
+ ret = obj.iloc[0]
203
+ if self.result_dtype:
204
+ if hasattr(ret, "astype"):
205
+ ret = ret.astype(self.result_dtype)
206
+ else: # pragma: no cover
207
+ ret = np.array(ret).astype(self.result_dtype).item()
208
+ return ret
209
+ raise NotImplementedError("Need to be referenced inside apply or map functions")
210
+
211
+ def __repr__(self):
212
+ return f"<ODPSStoredFunction {self.full_function_name}>"
213
+
214
+ @classmethod
215
+ def wrap(cls, func):
216
+ if isinstance(func, ODPSFunctionObj):
217
+ return ODPSFunction(func)
218
+ return func
219
+
220
+
221
+ def with_resources(
222
+ *resources: Union[str, ODPSResourceObj], use_wrapper_class: bool = True
223
+ ):
224
+ def res_to_str(res: Union[str, ODPSResourceObj]) -> str:
127
225
  if isinstance(res, str):
128
226
  return res
129
227
  res_parts = [res.project.name]
@@ -250,9 +348,7 @@ def with_running_options(
250
348
  with_resource_libraries = with_resources
251
349
 
252
350
 
253
- def get_udf_resources(
254
- func: Callable,
255
- ) -> List[Union[Resource, str]]:
351
+ def get_udf_resources(func: Callable) -> List[Union[ODPSResourceObj, str]]:
256
352
  return getattr(func, "resources", None) or []
257
353
 
258
354
 
maxframe/utils.py CHANGED
@@ -14,6 +14,7 @@
14
14
 
15
15
  import asyncio.events
16
16
  import concurrent.futures
17
+ import contextlib
17
18
  import contextvars
18
19
  import copy
19
20
  import dataclasses
@@ -80,6 +81,7 @@ from ._utils import ( # noqa: F401 # pylint: disable=unused-import
80
81
  tokenize,
81
82
  tokenize_int,
82
83
  )
84
+ from .lib.dtypes_extension import ArrowDtype
83
85
  from .lib.version import parse as parse_version
84
86
  from .typing_ import TileableType, TimeoutType
85
87
 
@@ -204,13 +206,28 @@ def on_serialize_nsplits(value: Tuple[Tuple[int]]):
204
206
  return tuple(new_nsplits)
205
207
 
206
208
 
207
- def has_unknown_shape(*tiled_tileables: TileableType) -> bool:
209
+ def has_unknown_shape(
210
+ *tiled_tileables: TileableType, axis: Union[None, int, List[int]] = None
211
+ ) -> bool:
212
+ if isinstance(axis, int):
213
+ axis = [axis]
214
+
208
215
  for tileable in tiled_tileables:
209
216
  if getattr(tileable, "shape", None) is None:
210
217
  continue
211
- if any(pd.isnull(s) for s in tileable.shape):
218
+
219
+ shape_iter = (
220
+ tileable.shape if axis is None else (tileable.shape[idx] for idx in axis)
221
+ )
222
+ if any(pd.isnull(s) for s in shape_iter):
212
223
  return True
213
- if any(pd.isnull(s) for s in itertools.chain(*tileable.nsplits)):
224
+
225
+ nsplits_iter = (
226
+ tileable.nsplits
227
+ if axis is None
228
+ else (tileable.nsplits[idx] for idx in axis)
229
+ )
230
+ if any(pd.isnull(s) for s in itertools.chain(*nsplits_iter)):
214
231
  return True
215
232
  return False
216
233
 
@@ -281,7 +298,10 @@ def make_dtype(dtype: Union[np.dtype, pd.api.extensions.ExtensionDtype]):
281
298
  elif dtype is pd.Timedelta or dtype is datetime.timedelta:
282
299
  return np.dtype("timedelta64[ns]")
283
300
  else:
284
- return np.dtype(dtype)
301
+ try:
302
+ return pd.api.types.pandas_dtype(dtype)
303
+ except TypeError:
304
+ return np.dtype("O")
285
305
 
286
306
 
287
307
  def make_dtypes(
@@ -448,7 +468,10 @@ def create_sync_primitive(
448
468
  return cls(loop=loop)
449
469
 
450
470
  # From Python3.10 the loop parameter has been removed. We should work around here.
451
- old_loop = asyncio.get_event_loop()
471
+ try:
472
+ old_loop = asyncio.get_event_loop()
473
+ except RuntimeError:
474
+ old_loop = None
452
475
  try:
453
476
  asyncio.set_event_loop(loop)
454
477
  primitive = cls()
@@ -599,8 +622,6 @@ def estimate_pandas_size(
599
622
  # MultiIndex's sample size can't be used to estimate
600
623
  return sys.getsizeof(pd_obj)
601
624
 
602
- from .dataframe.arrays import ArrowDtype
603
-
604
625
  def _is_fast_dtype(dtype):
605
626
  if isinstance(dtype, np.dtype):
606
627
  return np.issubdtype(dtype, np.number)
@@ -1182,13 +1203,16 @@ if pa:
1182
1203
  "float": pa.float32,
1183
1204
  "double": pa.float64,
1184
1205
  "decimal": pa.decimal128,
1206
+ # repr() of date32 and date64 has `day` or `ms`
1207
+ # which is not needed in constructors
1208
+ "date32": lambda *_: pa.date32(),
1209
+ "date64": lambda *_: pa.date64(),
1185
1210
  }
1186
1211
  _plain_arrow_types = """
1187
1212
  null
1188
1213
  int8 int16 int32 int64
1189
1214
  uint8 uint16 uint32 uint64
1190
1215
  float16 float32 float64
1191
- date32 date64
1192
1216
  decimal128 decimal256
1193
1217
  string utf8 binary
1194
1218
  time32 time64 duration timestamp
@@ -1719,3 +1743,32 @@ def validate_and_adjust_resource_ratio(
1719
1743
  )
1720
1744
 
1721
1745
  return expect_resources, False
1746
+
1747
+
1748
+ def get_pd_option(option_name, default=no_default):
1749
+ """Get pandas option. If not exist return `default`."""
1750
+ try:
1751
+ with warnings.catch_warnings():
1752
+ warnings.filterwarnings("ignore", category=FutureWarning)
1753
+ return pd.get_option(option_name)
1754
+ except (KeyError, AttributeError):
1755
+ if default is no_default:
1756
+ raise
1757
+ return default
1758
+
1759
+
1760
+ @contextlib.contextmanager
1761
+ def pd_option_context(*args):
1762
+ arg_kv = dict(zip(args[0::2], args[1::2]))
1763
+ new_args = []
1764
+ for k, v in arg_kv.items():
1765
+ try:
1766
+ get_pd_option(k)
1767
+ except (KeyError, AttributeError): # pragma: no cover
1768
+ continue
1769
+ new_args.extend([k, v])
1770
+ if not new_args: # pragma: no cover
1771
+ yield
1772
+ else:
1773
+ with pd.option_context(*new_args):
1774
+ yield
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: maxframe
3
- Version: 2.2.0
3
+ Version: 2.3.0rc1
4
4
  Summary: MaxFrame operator-based data analyze framework
5
5
  Requires-Dist: numpy<2.0.0,>=1.19.0
6
6
  Requires-Dist: pandas>=1.0.0
@@ -107,4 +107,3 @@ License
107
107
 
108
108
  Licensed under the `Apache License
109
109
  2.0 <https://www.apache.org/licenses/LICENSE-2.0.html>`__.
110
-