maxframe 1.0.0rc3__cp311-cp311-macosx_10_9_universal2.whl → 1.1.0__cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (112) hide show
  1. maxframe/_utils.cpython-311-darwin.so +0 -0
  2. maxframe/codegen.py +1 -0
  3. maxframe/config/config.py +16 -1
  4. maxframe/conftest.py +52 -14
  5. maxframe/core/entity/executable.py +1 -1
  6. maxframe/core/graph/core.cpython-311-darwin.so +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/docstring.py +26 -2
  9. maxframe/dataframe/arithmetic/equal.py +4 -2
  10. maxframe/dataframe/arithmetic/greater.py +4 -2
  11. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  12. maxframe/dataframe/arithmetic/less.py +2 -2
  13. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  14. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  15. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  16. maxframe/dataframe/core.py +26 -2
  17. maxframe/dataframe/datasource/read_odps_query.py +116 -28
  18. maxframe/dataframe/datasource/read_odps_table.py +3 -1
  19. maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
  20. maxframe/dataframe/datastore/to_odps.py +7 -0
  21. maxframe/dataframe/extensions/__init__.py +8 -0
  22. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  23. maxframe/dataframe/extensions/flatjson.py +131 -0
  24. maxframe/dataframe/extensions/flatmap.py +314 -0
  25. maxframe/dataframe/extensions/reshuffle.py +1 -1
  26. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  27. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  28. maxframe/dataframe/groupby/__init__.py +1 -0
  29. maxframe/dataframe/groupby/aggregation.py +1 -0
  30. maxframe/dataframe/groupby/apply.py +9 -1
  31. maxframe/dataframe/groupby/core.py +1 -1
  32. maxframe/dataframe/groupby/fill.py +4 -1
  33. maxframe/dataframe/groupby/getitem.py +6 -0
  34. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  35. maxframe/dataframe/groupby/transform.py +8 -2
  36. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  37. maxframe/dataframe/indexing/loc.py +6 -4
  38. maxframe/dataframe/indexing/rename.py +11 -0
  39. maxframe/dataframe/initializer.py +11 -1
  40. maxframe/dataframe/merge/__init__.py +9 -1
  41. maxframe/dataframe/merge/concat.py +41 -31
  42. maxframe/dataframe/merge/merge.py +1 -1
  43. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  44. maxframe/dataframe/misc/apply.py +3 -0
  45. maxframe/dataframe/misc/drop_duplicates.py +23 -2
  46. maxframe/dataframe/misc/map.py +3 -1
  47. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  48. maxframe/dataframe/misc/transform.py +22 -13
  49. maxframe/dataframe/reduction/__init__.py +3 -0
  50. maxframe/dataframe/reduction/aggregation.py +1 -0
  51. maxframe/dataframe/reduction/median.py +56 -0
  52. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  53. maxframe/dataframe/statistics/quantile.py +8 -2
  54. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  55. maxframe/dataframe/tests/test_initializer.py +33 -2
  56. maxframe/dataframe/tests/test_utils.py +60 -0
  57. maxframe/dataframe/utils.py +110 -7
  58. maxframe/dataframe/window/expanding.py +5 -3
  59. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  60. maxframe/io/objects/tests/test_object_io.py +39 -12
  61. maxframe/io/odpsio/arrow.py +30 -2
  62. maxframe/io/odpsio/schema.py +28 -8
  63. maxframe/io/odpsio/tableio.py +55 -133
  64. maxframe/io/odpsio/tests/test_schema.py +40 -4
  65. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  66. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  67. maxframe/io/odpsio/volumeio.py +36 -6
  68. maxframe/learn/contrib/__init__.py +3 -1
  69. maxframe/learn/contrib/graph/__init__.py +15 -0
  70. maxframe/learn/contrib/graph/connected_components.py +215 -0
  71. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  73. maxframe/learn/contrib/llm/__init__.py +16 -0
  74. maxframe/learn/contrib/llm/core.py +54 -0
  75. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  76. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  77. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  78. maxframe/learn/contrib/llm/text.py +42 -0
  79. maxframe/learn/contrib/xgboost/classifier.py +3 -3
  80. maxframe/learn/contrib/xgboost/predict.py +8 -39
  81. maxframe/learn/contrib/xgboost/train.py +4 -3
  82. maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
  83. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  84. maxframe/opcodes.py +10 -1
  85. maxframe/protocol.py +6 -1
  86. maxframe/serialization/core.cpython-311-darwin.so +0 -0
  87. maxframe/serialization/core.pyx +13 -1
  88. maxframe/serialization/pandas.py +50 -20
  89. maxframe/serialization/serializables/core.py +24 -5
  90. maxframe/serialization/serializables/field_type.py +4 -1
  91. maxframe/serialization/serializables/tests/test_serializable.py +8 -1
  92. maxframe/serialization/tests/test_serial.py +2 -1
  93. maxframe/session.py +9 -2
  94. maxframe/tensor/__init__.py +19 -7
  95. maxframe/tensor/indexing/getitem.py +2 -0
  96. maxframe/tensor/merge/concatenate.py +23 -20
  97. maxframe/tensor/merge/vstack.py +5 -1
  98. maxframe/tensor/misc/transpose.py +1 -1
  99. maxframe/tests/utils.py +16 -0
  100. maxframe/udf.py +27 -0
  101. maxframe/utils.py +64 -14
  102. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
  103. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
  104. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
  105. maxframe_client/clients/framedriver.py +4 -1
  106. maxframe_client/fetcher.py +28 -10
  107. maxframe_client/session/consts.py +3 -0
  108. maxframe_client/session/odps.py +104 -20
  109. maxframe_client/session/task.py +42 -26
  110. maxframe_client/session/tests/test_task.py +0 -4
  111. maxframe_client/tests/test_session.py +44 -12
  112. {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,131 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List
16
+
17
+ from ... import opcodes
18
+ from ...core import OutputType
19
+ from ...serialization.serializables import ListField
20
+ from ...serialization.serializables.field_type import FieldTypes
21
+ from ..core import DataFrame
22
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
23
+ from ..utils import make_dtypes, parse_index
24
+
25
+
26
class SeriesFlatJSONOperator(DataFrameOperator, DataFrameOperatorMixin):
    """Operator that expands a series of JSON strings into columns selected
    by JSON-path queries (one path per output column)."""

    _op_type_ = opcodes.FLATJSON

    # JSON query paths, one per generated output column.
    query_paths = ListField("query_paths", field_type=FieldTypes.string, default=None)

    def __call__(self, series, dtypes):
        if self._output_types[0] == OutputType.series:
            # Series output: ``dtypes`` is a (name, dtype) pair.
            out_name, out_dtype = dtypes
            return self.new_series(
                [series],
                shape=series.shape,
                index_value=series.index_value,
                name=out_name,
                dtype=out_dtype,
            )
        # DataFrame output: ``dtypes`` maps column label -> dtype.
        return self.new_dataframe(
            [series],
            shape=(series.shape[0], len(dtypes)),
            index_value=series.index_value,
            columns_value=parse_index(dtypes.index, store_data=True),
            dtypes=make_dtypes(dtypes),
        )
48
+
49
+
50
def series_flatjson(
    series,
    query_paths: List[str],
    dtypes=None,
    dtype=None,
    name: str = None,
) -> DataFrame:
    """
    Flatten JSON objects held in a series into a DataFrame (or Series)
    according to JSON query paths.

    Parameters
    ----------
    series : Series
        The series of JSON strings.

    query_paths: List[str] or str
        The JSON query paths for each generated column. The path format should
        follow [RFC9535](https://datatracker.ietf.org/doc/rfc9535/).

    dtypes : Series, default None
        Specify dtypes of returned DataFrame. Can't work with dtype.

    dtype : numpy.dtype, default None
        Specify dtype of returned Series. Can't work with dtypes.

    name : str, default None
        Specify name of the returned Series.

    Returns
    -------
    DataFrame or Series
        Result of DataFrame when dtypes specified, else Series.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> import pandas as pd
    >>> s = md.Series(
    ...     [
    ...         '{"age": 24, "gender": "male", "graduated": false}',
    ...         '{"age": 25, "gender": "female", "graduated": true}',
    ...     ]
    ... )
    >>> s.execute()
    0    {"age": 24, "gender": "male", "graduated": false}
    1    {"age": 25, "gender": "female", "graduated": true}
    dtype: object

    >>> df = s.mf.flatjson(
    ...     ["$.age", "$.gender", "$.graduated"],
    ...     dtypes=pd.Series(["int32", "object", "bool"], index=["age", "gender", "graduated"]),
    ... )
    >>> df.execute()
       age  gender  graduated
    0   24    male       True
    1   25  female       True

    >>> s2 = s.mf.flatjson("$.age", name="age", dtype="int32")
    >>> s2.execute()
    0    24
    1    25
    Name: age, dtype: int32
    """
    # A single path may be given as a bare string.
    if isinstance(query_paths, str):
        query_paths = [query_paths]

    has_dtype = dtype is not None
    has_dtypes = dtypes is not None
    if has_dtype and has_dtypes:
        raise ValueError("Both dtypes and dtype cannot be specified at the same time.")

    if has_dtype:
        # Scalar dtype implies a single-column (series) result.
        if len(query_paths) != 1:
            raise ValueError("query_paths should have only one path if dtype is set")
        output_type = OutputType.series
        spec = (name, dtype)
    elif has_dtypes:
        if len(dtypes) != len(query_paths):
            raise ValueError("query_paths and dtypes should have same length")
        output_type = OutputType.dataframe
        spec = dtypes
    else:
        raise ValueError("dtypes or dtype should be specified")

    op = SeriesFlatJSONOperator(query_paths=query_paths, _output_types=[output_type])
    return op(series, spec)
@@ -0,0 +1,314 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Callable
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+
20
+ from ... import opcodes
21
+ from ...core import OutputType
22
+ from ...serialization.serializables import (
23
+ BoolField,
24
+ DictField,
25
+ FunctionField,
26
+ TupleField,
27
+ )
28
+ from ..core import DataFrame
29
+ from ..operators import DataFrameOperator, DataFrameOperatorMixin
30
+ from ..utils import gen_unknown_index_value, make_dtypes, parse_index
31
+
32
+
33
class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
    """Operator that applies a per-row/per-element function whose (possibly
    multi-row) results are flattened into the output."""

    _op_type_ = opcodes.FLATMAP

    # User function applied to every row / element.
    func = FunctionField("func")
    # When True, rows are passed to ``func`` as numpy arrays instead of Series.
    raw = BoolField("raw", default=False)
    # Extra positional / keyword arguments forwarded to ``func``.
    args = TupleField("args", default=())
    kwargs = DictField("kwargs", default={})

    def __init__(self, output_types=None, **kw):
        super().__init__(_output_types=output_types, **kw)

    def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
        out_dtypes = make_dtypes(dtypes)
        # The flattened row count is unknown, so derive a fresh index value.
        new_index_value = gen_unknown_index_value(
            df.index_value,
            (df.key, df.index_value.key, self.func),
            normalize_range_index=True,
        )
        return self.new_dataframe(
            [df],
            shape=(np.nan, len(out_dtypes)),
            index_value=new_index_value,
            columns_value=parse_index(out_dtypes.index, store_data=True),
            dtypes=out_dtypes,
        )

    def _call_series_or_index(self, series, dtypes=None):
        new_index_value = gen_unknown_index_value(
            series.index_value,
            (series.key, series.index_value.key, self.func),
            normalize_range_index=True,
        )

        if self.output_types[0] == OutputType.series:
            # Series output: ``dtypes`` is a (name, dtype) pair.
            out_name, out_dtype = dtypes
            return self.new_series(
                [series],
                dtype=out_dtype,
                shape=(np.nan,),
                index_value=new_index_value,
                name=out_name,
            )

        out_dtypes = make_dtypes(dtypes)
        return self.new_dataframe(
            [series],
            shape=(np.nan, len(out_dtypes)),
            index_value=new_index_value,
            columns_value=parse_index(out_dtypes.index, store_data=True),
            dtypes=out_dtypes,
        )

    def __call__(
        self,
        df_or_series,
        dtypes=None,
        output_type=None,
    ):
        # Dispatch on the input entity kind recorded by its producing operator.
        if df_or_series.op.output_types[0] == OutputType.dataframe:
            return self._call_dataframe(df_or_series, dtypes=dtypes)
        return self._call_series_or_index(df_or_series, dtypes=dtypes)
96
+
97
+
98
def df_flatmap(dataframe, func: Callable, dtypes=None, raw=False, args=(), **kwargs):
    """
    Apply the given function to each row and then flatten results. Use this method if your transformation returns
    multiple rows for each input row.

    This function applies a transformation to each row of the DataFrame, where the transformation can return zero
    or multiple values, effectively flattening Python generators, list-like collections, and DataFrames.

    Parameters
    ----------
    func : Callable
        Function to apply to each row of the DataFrame. It should accept a Series (or an array if `raw=True`)
        representing a row and return a list or iterable of values.

    dtypes : Series, dict or list
        Specify dtypes of returned DataFrame.

    raw : bool, default False
        Determines if the row is passed as a Series or as a numpy array:

        * ``False`` : passes each row as a Series to the function.
        * ``True`` : the passed function will receive numpy array objects instead.

    args : tuple
        Positional arguments to pass to `func`.

    **kwargs
        Additional keyword arguments to pass as keyword arguments to `func`.

    Returns
    -------
    DataFrame
        Return DataFrame with specified `dtypes`.

    Notes
    -----
    The ``func`` must return an iterable of values for each input row. The index of the resulting DataFrame will be
    repeated based on the number of output rows generated by `func`.

    Examples
    --------
    >>> import numpy as np
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    >>> df.execute()
       A  B
    0  1  4
    1  2  5
    2  3  6

    Define a function that takes a number and returns a list of two numbers:

    >>> def generate_values_array(row):
    ...     return [row['A'] * 2, row['B'] * 3]

    Define a function that takes a row and return two rows and two columns:

    >>> def generate_values_in_generator(row):
    ...     yield [row[0] * 2, row[1] * 4]
    ...     yield [row[0] * 3, row[1] * 5]

    Which equals to the following function return a dataframe:

    >>> def generate_values_in_dataframe(row):
    ...     return pd.DataFrame([[row[0] * 2, row[1] * 4], [row[0] * 3, row[1] * 5]])

    Specify `dtypes` with a function which returns a DataFrame:

    >>> df.mf.flatmap(generate_values_array, dtypes=pd.Series({'A': 'int'})).execute()
       A
    0  2
    0  12
    1  4
    1  15
    2  6
    2  18

    Specify raw=True to pass input row as array:

    >>> df.mf.flatmap(generate_values_in_generator, dtypes={"A": "int", "B": "int"}, raw=True).execute()
       A  B
    0  2  16
    0  3  20
    1  4  20
    1  6  25
    2  6  24
    2  9  30
    """
    # Output dtypes cannot be inferred by running the function over sample
    # data, so they are mandatory.
    if dtypes is None or len(dtypes) == 0:
        raise TypeError(
            # fixed: the original message contained a literal, unformatted
            # "{dtypes}" placeholder from a missing f-string prefix
            "Cannot determine dtypes by calculating with enumerate data, "
            "please specify it as arguments"
        )

    # callable() is the idiomatic runtime check; isinstance with
    # typing.Callable is meant for static typing.
    if not callable(func):
        raise TypeError("function must be a callable object")

    output_types = [OutputType.dataframe]
    op = DataFrameFlatMapOperator(
        func=func, raw=raw, output_types=output_types, args=args, kwargs=kwargs
    )
    return op(
        dataframe,
        dtypes=dtypes,
    )
203
+
204
+
205
def series_flatmap(
    series, func: Callable, dtypes=None, dtype=None, name=None, args=(), **kwargs
):
    """
    Apply the given function to each row and then flatten results. Use this method if your transformation returns
    multiple rows for each input row.

    This function applies a transformation to each element of the Series, where the transformation can return zero
    or multiple values, effectively flattening Python generators, list-like collections and DataFrames.

    Parameters
    ----------
    func : Callable
        Function to apply to each element of the Series. It should accept a scalar value
        (or an array if ``raw=True``) and return a list or iterable of values.

    dtypes : Series, default None
        Specify dtypes of returned DataFrame. Can't work with dtype.

    dtype : numpy.dtype, default None
        Specify dtype of returned Series. Can't work with dtypes.

    name : str, default None
        Specify name of the returned Series.

    args : tuple
        Positional arguments to pass to ``func``.

    **kwargs
        Additional keyword arguments to pass as keyword arguments to ``func``.

    Returns
    -------
    DataFrame or Series
        Result of DataFrame when dtypes specified, else Series.

    Notes
    -----
    The ``func`` must return an iterable of values for each input element. If ``dtypes`` is specified,
    `flatmap` will return a DataFrame, if ``dtype`` and ``name`` is specified, a Series will be returned.

    The index of the resulting DataFrame/Series will be repeated based on the number of output rows generated
    by ``func``.

    Examples
    --------
    >>> import numpy as np
    >>> import maxframe.dataframe as md
    >>> df = md.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    >>> df.execute()
       A  B
    0  1  4
    1  2  5
    2  3  6

    Define a function that takes a number and returns a list of two numbers:

    >>> def generate_values_array(x):
    ...     return [x * 2, x * 3]

    Specify ``dtype`` with a function which returns list to return more elements as a Series:

    >>> df['A'].mf.flatmap(generate_values_array, dtype="int", name="C").execute()
    0    2
    0    3
    1    4
    1    6
    2    6
    2    9
    Name: C, dtype: int64

    Specify ``dtypes`` to return multi columns as a DataFrame:

    >>> def generate_values_in_generator(x):
    ...     yield pd.Series([x * 2, x * 4])
    ...     yield pd.Series([x * 3, x * 5])

    >>> df['A'].mf.flatmap(generate_values_in_generator, dtypes={"A": "int", "B": "int"}).execute()
       A  B
    0  2  4
    0  3  5
    1  4  8
    1  6  10
    2  6  12
    2  9  15
    """
    if dtypes is not None and dtype is not None:
        raise ValueError("Both dtypes and dtype cannot be specified at the same time.")

    # A scalar (name, dtype) pair stands in for ``dtypes`` for series output.
    dtypes = (name, dtype) if dtype is not None else dtypes
    if dtypes is None:
        raise TypeError(
            # fixed: the original message contained literal "{dtypes}" and
            # "{dtype}" placeholders from a missing f-string prefix
            "Cannot determine dtypes or dtype by calculating with enumerate data, "
            "please specify it as arguments"
        )

    # callable() is the idiomatic runtime check; isinstance with
    # typing.Callable is meant for static typing.
    if not callable(func):
        raise TypeError("function must be a callable object")

    # ``dtype`` selects series output; ``dtypes`` selects dataframe output.
    output_type = OutputType.series if dtype is not None else OutputType.dataframe

    op = DataFrameFlatMapOperator(
        func=func, raw=False, output_types=[output_type], args=args, kwargs=kwargs
    )
    return op(
        series,
        dtypes=dtypes,
    )
@@ -38,7 +38,7 @@ class DataFrameReshuffle(DataFrameOperator, DataFrameOperatorMixin):
38
38
  else:
39
39
  idx_value = df.index_value
40
40
  if isinstance(idx_value.value, IndexValue.RangeIndex):
41
- idx_value = parse_index(pd.Int64Index([0]))
41
+ idx_value = parse_index(pd.RangeIndex(1))
42
42
  params = df.params
43
43
  params["index_value"] = idx_value
44
44
  self._output_types = get_output_types(df)
@@ -0,0 +1,186 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import numpy as np
15
+ import pandas as pd
16
+ import pytest
17
+
18
+ from ....udf import MarkedFunction
19
+ from ... import DataFrame
20
+ from ...core import DATAFRAME_TYPE, SERIES_TYPE
21
+
22
+
23
@pytest.fixture
def df1():
    # Simple integer frame with a default RangeIndex and columns a/b/c.
    data = {col: [1, 2, 3] for col in "abc"}
    return DataFrame(data)
26
+
27
+
28
@pytest.fixture
def df2():
    # Same values as df1 but constructed from row data with explicit columns.
    rows = [[1, 2, 3] for _ in range(3)]
    return DataFrame(rows, columns=list("abc"))
31
+
32
+
33
@pytest.fixture
def df3():
    # Frame carrying a two-level (A, B) MultiIndex.
    levels = [[1, 2, 3], [1, 2, 3]]
    multi_index = pd.MultiIndex.from_arrays(levels, names=["A", "B"])
    rows = [[1, 2, 3] for _ in range(3)]
    return DataFrame(rows, columns=list("abc"), index=multi_index)
40
+
41
+
42
def test_apply_chunk_infer_dtypes_and_index(df1, df2, df3):
    """Check dtypes/index inference of ``mf.apply_chunk`` for frames and series."""
    # dataframe -> dataframe: a filtering function invalidates the input index
    res = df3.mf.apply_chunk(
        lambda data: data.query("A > 1"), batch_rows=2, output_type="dataframe"
    )
    assert isinstance(res, DATAFRAME_TYPE)
    assert res.index_value.key != df3.index_value.key
    assert res.index_value.to_pandas().names == df3.index_value.to_pandas().names

    # dataframe -> dataframe: identity keeps the very same index value object
    res = df1.mf.apply_chunk(lambda data: data, batch_rows=2, output_type="dataframe")
    assert isinstance(res, DATAFRAME_TYPE)
    assert res.index_value is df1.index_value

    # dataframe -> dataframe: ufunc with an extra positional argument
    res = df1.mf.apply_chunk(np.add, batch_rows=2, args=(2,), output_type="dataframe")
    assert isinstance(res, DATAFRAME_TYPE)
    assert res.index_value is df1.index_value
    assert res.dtypes.equals(df1.dtypes)
    assert res.shape == df1.shape

    # dataframe -> series: a reducing ufunc yields a fresh index
    res = df1.mf.apply_chunk(np.sum, batch_rows=2)
    assert isinstance(res, SERIES_TYPE)
    assert res.index_value is not df1.index_value

    # series -> series: identity keeps the index value object
    res = df3.a.mf.apply_chunk(lambda data: data, batch_rows=2, output_type="series")
    assert isinstance(res, SERIES_TYPE)
    assert res.index_value is df3.a.index_value

    res = df3.a.mf.apply_chunk(
        np.sum, batch_rows=2, output_type="series", dtype=np.int64, name="sum"
    )
    assert isinstance(res, SERIES_TYPE)
    assert isinstance(res.index_value.to_pandas(), pd.RangeIndex)

    # plain python function taking positional and keyword arguments
    def process(data, param, k):
        return data * param * k

    res = df2.mf.apply_chunk(
        process, batch_rows=3, output_type="dataframe", args=(4,), k=1
    )
    assert res.index_value is df2.index_value
    assert res.dtypes.equals(df2.dtypes)

    # functions wrapped by resource / requirement decorators
    from ....udf import with_python_requirements, with_resources

    @with_resources("empty.txt")
    @with_python_requirements("numpy")
    def process(data, k):
        return data

    res = df1.mf.apply_chunk(process, batch_rows=3, output_type="dataframe", k=1)
    assert res.index_value is df1.index_value
    assert res.dtypes.equals(df1.dtypes)
    # the operator stores a MarkedFunction copy carrying the same mark data
    assert isinstance(res.op.func, MarkedFunction)
    assert res.op.func is not process
    assert res.op.func.resources is process.resources
    assert res.op.func.pythonpacks is process.pythonpacks

    def func_series_ret_series(data):
        return pd.DataFrame([data, data])

    res = df3.a.mf.apply_chunk(
        func_series_ret_series, batch_rows=2, output_type="dataframe"
    )
    assert isinstance(res, DATAFRAME_TYPE)
    # plain functions are stored as-is, without wrapping
    assert res.op.func is func_series_ret_series
117
+
118
+
119
def test_apply_test(df1):
    """apply_chunk on a single column with positional args yields a series."""

    def process(x, param):
        return x * param

    out = df1.a.mf.apply_chunk(process, batch_rows=2, output_type="series", args=(5,))
    assert isinstance(out, SERIES_TYPE)
127
+
128
+
129
def test_apply_chunk(df1):
    """Shape and index propagation for several apply_chunk configurations."""
    keys = [1, 2]

    def f(x, keys):
        if x["a"] in keys:
            return [1, 0]
        return [0, 1]

    # dataframe in, dataframe out with explicit dtypes and a keyword argument
    out = df1[["a"]].mf.apply_chunk(
        f,
        output_type="dataframe",
        dtypes=pd.Series(["int64", "int64"]),
        batch_rows=5,
        keys=keys,
    )
    assert out.shape == (np.nan, 2)
    assert out.index_value.key != df1.index_value.key

    # dataframe in, series out
    out = df1.mf.apply_chunk(
        lambda x: x.a,
        output_type="series",
        dtype="int64",
        batch_rows=5,
    )
    assert out.shape == (np.nan,)
    assert out.index_value.key == df1.index_value.key
    assert out.index_value.key == df1.a.index_value.key

    # series in, dataframe out with given dtypes
    out = df1.a.mf.apply_chunk(
        lambda x: pd.concat([x, x], axis=1),
        output_type="dataframe",
        dtypes=pd.Series(["int64", "int64"]),
        batch_rows=5,
    )
    assert out.shape == (np.nan, 2)
    assert out.index_value.key != df1.a.index_value.key

    # function returns a series, but output is requested as a dataframe
    out = df1.a.mf.apply_chunk(
        lambda x: pd.concat([x, x], axis=0),
        output_type="dataframe",
        dtypes={"c": np.int_},
        batch_rows=5,
    )
    assert out.shape == (np.nan, 1)
177
+
178
+
179
def test_apply_chunk_exception(df1):
    """Invalid ``batch_rows`` values are rejected with the proper error type."""
    # negative batch size -> ValueError
    with pytest.raises(ValueError):
        df1.mf.apply_chunk(lambda data: data, batch_rows=-1, output_type="dataframe")

    # non-integer batch size -> TypeError
    with pytest.raises(TypeError):
        df1.mf.apply_chunk(
            lambda data: data, batch_rows=object(), output_type="dataframe"
        )