maxframe 1.2.1__cp311-cp311-macosx_10_9_universal2.whl → 1.3.1__cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (73)
  1. maxframe/_utils.cpython-311-darwin.so +0 -0
  2. maxframe/codegen.py +70 -21
  3. maxframe/config/config.py +6 -0
  4. maxframe/core/accessor.py +1 -0
  5. maxframe/core/graph/core.cpython-311-darwin.so +0 -0
  6. maxframe/dataframe/accessors/__init__.py +1 -1
  7. maxframe/dataframe/accessors/dict_/accessor.py +1 -0
  8. maxframe/dataframe/accessors/dict_/length.py +1 -0
  9. maxframe/dataframe/accessors/dict_/setitem.py +1 -0
  10. maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
  11. maxframe/dataframe/accessors/list_/__init__.py +37 -0
  12. maxframe/dataframe/accessors/list_/accessor.py +39 -0
  13. maxframe/dataframe/accessors/list_/getitem.py +135 -0
  14. maxframe/dataframe/accessors/list_/length.py +73 -0
  15. maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
  16. maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
  17. maxframe/dataframe/accessors/plotting/__init__.py +2 -0
  18. maxframe/dataframe/accessors/string_/__init__.py +1 -0
  19. maxframe/dataframe/datastore/to_odps.py +6 -0
  20. maxframe/dataframe/extensions/accessor.py +1 -0
  21. maxframe/dataframe/extensions/apply_chunk.py +34 -21
  22. maxframe/dataframe/extensions/flatmap.py +8 -1
  23. maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
  24. maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
  25. maxframe/dataframe/groupby/aggregation.py +53 -1
  26. maxframe/dataframe/merge/concat.py +7 -4
  27. maxframe/dataframe/merge/merge.py +1 -0
  28. maxframe/dataframe/merge/tests/test_merge.py +97 -47
  29. maxframe/dataframe/missing/tests/test_missing.py +1 -0
  30. maxframe/dataframe/reduction/aggregation.py +63 -0
  31. maxframe/dataframe/reduction/core.py +17 -5
  32. maxframe/dataframe/tests/test_utils.py +7 -0
  33. maxframe/dataframe/ufunc/ufunc.py +1 -0
  34. maxframe/dataframe/utils.py +3 -0
  35. maxframe/io/odpsio/schema.py +1 -0
  36. maxframe/learn/contrib/__init__.py +2 -4
  37. maxframe/learn/contrib/llm/__init__.py +1 -0
  38. maxframe/learn/contrib/llm/core.py +31 -10
  39. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  40. maxframe/learn/contrib/llm/models/dashscope.py +38 -3
  41. maxframe/learn/contrib/llm/models/managed.py +54 -0
  42. maxframe/learn/contrib/llm/multi_modal.py +93 -0
  43. maxframe/learn/contrib/llm/text.py +268 -8
  44. maxframe/learn/contrib/models.py +77 -0
  45. maxframe/learn/contrib/utils.py +1 -0
  46. maxframe/learn/contrib/xgboost/__init__.py +8 -1
  47. maxframe/learn/contrib/xgboost/classifier.py +15 -4
  48. maxframe/learn/contrib/xgboost/core.py +108 -1
  49. maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
  50. maxframe/learn/contrib/xgboost/predict.py +6 -3
  51. maxframe/learn/contrib/xgboost/regressor.py +15 -1
  52. maxframe/learn/contrib/xgboost/train.py +5 -4
  53. maxframe/lib/dtypes_extension/__init__.py +2 -1
  54. maxframe/lib/dtypes_extension/dtypes.py +21 -0
  55. maxframe/lib/dtypes_extension/tests/test_dtypes.py +13 -3
  56. maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
  57. maxframe/opcodes.py +19 -0
  58. maxframe/serialization/__init__.py +1 -0
  59. maxframe/serialization/core.cpython-311-darwin.so +0 -0
  60. maxframe/serialization/core.pyx +12 -1
  61. maxframe/serialization/numpy.py +12 -4
  62. maxframe/serialization/serializables/tests/test_serializable.py +13 -2
  63. maxframe/serialization/tests/test_serial.py +2 -0
  64. maxframe/tensor/merge/concatenate.py +1 -0
  65. maxframe/tensor/misc/unique.py +11 -10
  66. maxframe/tensor/reshape/reshape.py +4 -1
  67. maxframe/utils.py +4 -0
  68. {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/METADATA +3 -2
  69. {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/RECORD +73 -65
  70. {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/WHEEL +1 -1
  71. maxframe_client/session/odps.py +3 -0
  72. maxframe_client/session/tests/test_task.py +1 -0
  73. {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,79 @@
1
+ # Copyright 1999-2025 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import pandas as pd
17
+ import pyarrow as pa
18
+ import pytest
19
+
20
+ from ..... import dataframe as md
21
+ from .....lib.dtypes_extension import list_
22
+ from .....utils import ARROW_DTYPE_NOT_SUPPORTED
23
+ from ..getitem import SeriesListGetItemOperator
24
+ from ..length import SeriesListLengthOperator
25
+
26
+ pytestmark = pytest.mark.skipif(
27
+ ARROW_DTYPE_NOT_SUPPORTED, reason="Arrow Dtype is not supported"
28
+ )
29
+
30
+
31
+ @pytest.fixture
32
+ def df():
33
+ return md.DataFrame(
34
+ {
35
+ "A": pd.Series([[5, 3, 2]], dtype=list_(pa.int32())),
36
+ "B": pd.Series([["ab", "cd"]], dtype=list_(pa.string())),
37
+ "C": pd.Series([1], dtype=np.dtype("int64")),
38
+ },
39
+ index=[1],
40
+ )
41
+
42
+
43
+ def test_invalid_dtype(df):
44
+ with pytest.raises(AttributeError):
45
+ df["C"].list.len()
46
+
47
+
48
+ def test_getitem(df):
49
+ s1 = df["A"].list[1]
50
+ assert isinstance(s1, md.Series)
51
+ assert s1.dtype == pd.ArrowDtype(pa.int32())
52
+ assert s1.shape == (1,)
53
+ assert s1.index_value == df.index_value
54
+ op = s1.op
55
+ assert isinstance(op, SeriesListGetItemOperator)
56
+ assert op.query_index == 1
57
+ assert op.ignore_index_error is False
58
+
59
+
60
+ def test_getitem_ignore_index_err(df):
61
+ s1 = df["B"].list.get(1)
62
+ assert isinstance(s1, md.Series)
63
+ assert s1.dtype == pd.ArrowDtype(pa.string())
64
+ assert s1.shape == (1,)
65
+ assert s1.index_value == df.index_value
66
+ op = s1.op
67
+ assert isinstance(op, SeriesListGetItemOperator)
68
+ assert op.query_index == 1
69
+ assert op.ignore_index_error is True
70
+
71
+
72
+ def test_length(df):
73
+ s1 = df["A"].list.len()
74
+ assert isinstance(s1, md.Series)
75
+ assert s1.dtype == pd.ArrowDtype(pa.int64())
76
+ assert s1.shape == (1,)
77
+ assert s1.index_value == df.index_value
78
+ op = s1.op
79
+ assert isinstance(op, SeriesListLengthOperator)
@@ -11,6 +11,8 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
15
+
14
16
  def _install():
15
17
  import pandas as pd
16
18
 
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  from .core import SeriesStringMethod
15
16
 
16
17
 
@@ -27,6 +27,7 @@ from ...core import OutputType
27
27
  from ...io.odpsio import build_dataframe_table_meta
28
28
  from ...serialization.serializables import (
29
29
  BoolField,
30
+ DictField,
30
31
  FieldTypes,
31
32
  Int64Field,
32
33
  ListField,
@@ -55,6 +56,7 @@ class DataFrameToODPSTable(DataFrameDataStore):
55
56
  index = BoolField("index", default=True)
56
57
  index_label = ListField("index_label", FieldTypes.string, default=None)
57
58
  lifecycle = Int64Field("lifecycle", default=None)
59
+ table_properties = DictField("table_properties", default=None)
58
60
 
59
61
  def __init__(self, **kw):
60
62
  super().__init__(_output_types=[OutputType.dataframe], **kw)
@@ -84,6 +86,7 @@ def to_odps_table(
84
86
  index: bool = True,
85
87
  index_label: Union[None, str, List[str]] = None,
86
88
  lifecycle: Optional[int] = None,
89
+ table_properties: Optional[dict] = None,
87
90
  ):
88
91
  """
89
92
  Write DataFrame object into a MaxCompute (ODPS) table.
@@ -122,6 +125,8 @@ def to_odps_table(
122
125
  names will be used.
123
126
  lifecycle: Optional[int]
124
127
  Specify lifecycle of the output table.
128
+ table_properties: Optional[dict]
129
+ Specify properties of the output table.
125
130
 
126
131
  Returns
127
132
  -------
@@ -186,5 +191,6 @@ def to_odps_table(
186
191
  index=index,
187
192
  index_label=index_label,
188
193
  lifecycle=lifecycle or options.session.table_lifecycle,
194
+ table_properties=table_properties,
189
195
  )
190
196
  return op(df)
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  from typing import TYPE_CHECKING
15
16
 
16
17
  from ...core import BaseMaxFrameAccessor
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  import functools
15
16
  from typing import Any, Callable, Dict, List, Tuple, Union
16
17
 
@@ -19,7 +20,12 @@ import pandas as pd
19
20
 
20
21
  from ... import opcodes
21
22
  from ...core import OutputType
22
- from ...serialization.serializables import FunctionField, Int32Field
23
+ from ...serialization.serializables import (
24
+ DictField,
25
+ FunctionField,
26
+ Int32Field,
27
+ TupleField,
28
+ )
23
29
  from ...utils import quiet_stdio
24
30
  from ..core import DATAFRAME_TYPE, DataFrame, IndexValue, Series
25
31
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -38,7 +44,9 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
38
44
  _op_type_ = opcodes.APPLY_CHUNK
39
45
 
40
46
  func = FunctionField("func")
41
- batch_rows = Int32Field("batch_rows")
47
+ batch_rows = Int32Field("batch_rows", default=None)
48
+ args = TupleField("args", default=None)
49
+ kwargs = DictField("kwargs", default=None)
42
50
 
43
51
  def __init__(self, output_type=None, **kw):
44
52
  if output_type:
@@ -104,12 +112,11 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
104
112
  dtypes: Union[Tuple[str, Any], Dict[str, Any]] = None,
105
113
  output_type=None,
106
114
  index=None,
107
- args=(),
108
- **kwargs,
109
115
  ):
116
+ args = self.args or ()
117
+ kwargs = self.kwargs or {}
110
118
  # if not dtypes and not skip_infer:
111
- origin_func = self.func
112
- self.func = get_packed_func(df_or_series, origin_func, *args, **kwargs)
119
+ packed_func = get_packed_func(df_or_series, self.func, *args, **kwargs)
113
120
 
114
121
  # if skip_infer, directly build a frame
115
122
  if self.output_types and self.output_types[0] == OutputType.df_or_series:
@@ -118,8 +125,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
118
125
  # infer return index and dtypes
119
126
  dtypes, index_value, elementwise = self._infer_batch_func_returns(
120
127
  df_or_series,
121
- origin_func=origin_func,
122
- packed_func=self.func,
128
+ origin_func=self.func,
129
+ packed_func=packed_func,
123
130
  given_output_type=output_type,
124
131
  given_dtypes=dtypes,
125
132
  given_index=index,
@@ -166,6 +173,8 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
166
173
  given_dtypes: Union[Tuple[str, Any], pd.Series, List[Any], Dict[str, Any]],
167
174
  given_index: Union[pd.Index, IndexValue],
168
175
  given_elementwise: bool = False,
176
+ *args,
177
+ **kwargs,
169
178
  ):
170
179
  inferred_output_type = inferred_dtypes = inferred_index_value = None
171
180
  inferred_is_elementwise = False
@@ -190,7 +199,7 @@ class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
190
199
  try:
191
200
  # execute
192
201
  with np.errstate(all="ignore"), quiet_stdio():
193
- infer_result = packed_func(empty_data)
202
+ infer_result = packed_func(empty_data, *args, **kwargs)
194
203
 
195
204
  # if executed successfully, get index and dtypes from returned object
196
205
  if inferred_index_value is None:
@@ -258,7 +267,7 @@ def get_packed_func(df, func, *args, **kwargs) -> Any:
258
267
  def df_apply_chunk(
259
268
  dataframe,
260
269
  func: Union[str, Callable],
261
- batch_rows,
270
+ batch_rows=None,
262
271
  dtypes=None,
263
272
  dtype=None,
264
273
  name=None,
@@ -462,11 +471,11 @@ def df_apply_chunk(
462
471
  if not isinstance(func, Callable):
463
472
  raise TypeError("function must be a callable object")
464
473
 
465
- if not isinstance(batch_rows, int):
466
- raise TypeError("batch_rows must be an integer")
467
-
468
- if batch_rows <= 0:
469
- raise ValueError("batch_rows must be greater than 0")
474
+ if batch_rows is not None:
475
+ if not isinstance(batch_rows, int):
476
+ raise TypeError("batch_rows must be an integer")
477
+ elif batch_rows <= 0:
478
+ raise ValueError("batch_rows must be greater than 0")
470
479
 
471
480
  dtypes = (name, dtype) if dtype is not None else dtypes
472
481
 
@@ -481,15 +490,17 @@ def df_apply_chunk(
481
490
 
482
491
  # bind args and kwargs
483
492
  op = DataFrameApplyChunkOperator(
484
- func=func, batch_rows=batch_rows, output_type=output_type
493
+ func=func,
494
+ batch_rows=batch_rows,
495
+ output_type=output_type,
496
+ args=args,
497
+ kwargs=kwargs,
485
498
  )
486
499
 
487
500
  return op(
488
501
  dataframe,
489
502
  dtypes=dtypes,
490
503
  index=index,
491
- args=args,
492
- **kwargs,
493
504
  )
494
505
 
495
506
 
@@ -720,7 +731,11 @@ def series_apply_chunk(
720
731
  output_type = OutputType.df_or_series
721
732
 
722
733
  op = DataFrameApplyChunkOperator(
723
- func=func, batch_rows=batch_rows, output_type=output_type
734
+ func=func,
735
+ batch_rows=batch_rows,
736
+ output_type=output_type,
737
+ args=args,
738
+ kwargs=kwargs,
724
739
  )
725
740
 
726
741
  dtypes = (name, dtype) if dtype is not None else dtypes
@@ -729,6 +744,4 @@ def series_apply_chunk(
729
744
  dtypes=dtypes,
730
745
  output_type=output_type,
731
746
  index=index,
732
- args=args,
733
- **kwargs,
734
747
  )
@@ -27,7 +27,12 @@ from ...serialization.serializables import (
27
27
  )
28
28
  from ..core import DataFrame
29
29
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
30
- from ..utils import gen_unknown_index_value, make_dtypes, parse_index
30
+ from ..utils import (
31
+ copy_func_scheduling_hints,
32
+ gen_unknown_index_value,
33
+ make_dtypes,
34
+ parse_index,
35
+ )
31
36
 
32
37
 
33
38
  class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
@@ -40,6 +45,8 @@ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
40
45
 
41
46
  def __init__(self, output_types=None, **kw):
42
47
  super().__init__(_output_types=output_types, **kw)
48
+ if hasattr(self, "func"):
49
+ copy_func_scheduling_hints(self.func, self)
43
50
 
44
51
  def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
45
52
  dtypes = make_dtypes(dtypes)
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  import numpy as np
15
16
  import pandas as pd
16
17
  import pytest
@@ -102,7 +103,7 @@ def test_apply_chunk_infer_dtypes_and_index(df1, df2, df3):
102
103
  assert result.index_value is df1.index_value
103
104
  assert result.dtypes.equals(df1.dtypes)
104
105
  assert isinstance(result.op.func, MarkedFunction)
105
- assert result.op.func is not process
106
+ assert result.op.func is process
106
107
  assert result.op.func.resources is process.resources
107
108
  assert result.op.func.pythonpacks is process.pythonpacks
108
109
 
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  import numpy as np
15
16
  import pandas as pd
16
17
  import pytest
@@ -303,11 +303,63 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
303
303
  if aggregated result is very large, 'auto' will use 'shuffle' method
304
304
  in distributed mode and use 'tree' in local mode.
305
305
 
306
-
307
306
  Returns
308
307
  -------
309
308
  Series or DataFrame
310
309
  Aggregated result.
310
+
311
+ Examples
312
+ --------
313
+ >>> import maxframe.dataframe as md
314
+ >>> df = md.DataFrame(
315
+ ... {
316
+ ... "A": [1, 1, 2, 2],
317
+ ... "B": [1, 2, 3, 4],
318
+ ... "C": [0.362838, 0.227877, 1.267767, -0.562860],
319
+ ... }
320
+ ... ).execute()
321
+ A B C
322
+ 0 1 1 0.362838
323
+ 1 1 2 0.227877
324
+ 2 2 3 1.267767
325
+ 3 2 4 -0.562860
326
+
327
+ The aggregation is for each column.
328
+
329
+ >>> df.groupby('A').agg('min').execute()
330
+ B C
331
+ A
332
+ 1 1 0.227877
333
+ 2 3 -0.562860
334
+
335
+ Multiple aggregations.
336
+
337
+ >>> df.groupby('A').agg(['min', 'max']).execute()
338
+ B C
339
+ min max min max
340
+ A
341
+ 1 1 2 0.227877 0.362838
342
+ 2 3 4 -0.562860 1.267767
343
+
344
+ Different aggregations per column
345
+
346
+ >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'}).execute()
347
+ B C
348
+ min max sum
349
+ A
350
+ 1 1 2 0.590715
351
+ 2 3 4 0.704907
352
+
353
+ To control the output names with different aggregations per column, pandas supports “named aggregation”
354
+
355
+ >>> from maxframe.dataframe.groupby import NamedAgg
356
+ >>> df.groupby("A").agg(
357
+ ... b_min=NamedAgg(column="B", aggfunc="min"),
358
+ ... c_sum=NamedAgg(column="C", aggfunc="sum")).execute()
359
+ b_min c_sum
360
+ A
361
+ 1 1 0.590715
362
+ 2 3 0.704907
311
363
  """
312
364
 
313
365
  # When perform a computation on the grouped data, we won't shuffle
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  from typing import List, Union
15
16
 
16
17
  import pandas as pd
@@ -100,8 +101,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
100
101
  row_length = 0
101
102
  for series in objs:
102
103
  row_length += series.shape[0]
103
- if self.ignore_index: # pragma: no cover
104
- index_value = parse_index(pd.RangeIndex(row_length))
104
+ if self.ignore_index:
105
+ idx_length = 0 if pd.isna(row_length) else row_length
106
+ index_value = parse_index(pd.RangeIndex(idx_length))
105
107
  else:
106
108
  index = self._concat_index(objs)
107
109
  index_value = parse_index(index, objs)
@@ -159,8 +161,9 @@ class DataFrameConcat(DataFrameOperator, DataFrameOperatorMixin):
159
161
  if self.join == "inner":
160
162
  objs = [o[list(emtpy_result.columns)] for o in objs]
161
163
 
162
- if self.ignore_index: # pragma: no cover
163
- index_value = parse_index(pd.RangeIndex(row_length))
164
+ if self.ignore_index:
165
+ idx_length = 0 if pd.isna(row_length) else row_length
166
+ index_value = parse_index(pd.RangeIndex(idx_length))
164
167
  else:
165
168
  index = self._concat_index(objs)
166
169
  index_value = parse_index(index, objs)
@@ -11,6 +11,7 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+
14
15
  import logging
15
16
  from abc import abstractmethod
16
17
  from collections import namedtuple