maxframe-1.0.0rc4-cp311-cp311-win32.whl → maxframe-1.1.1-cp311-cp311-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (88)
  1. maxframe/_utils.cp311-win32.pyd +0 -0
  2. maxframe/config/__init__.py +1 -1
  3. maxframe/config/config.py +26 -0
  4. maxframe/config/tests/test_config.py +20 -1
  5. maxframe/conftest.py +17 -4
  6. maxframe/core/graph/core.cp311-win32.pyd +0 -0
  7. maxframe/core/operator/base.py +2 -0
  8. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
  9. maxframe/dataframe/core.py +24 -2
  10. maxframe/dataframe/datasource/read_odps_query.py +65 -35
  11. maxframe/dataframe/datasource/read_odps_table.py +4 -2
  12. maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
  13. maxframe/dataframe/extensions/__init__.py +5 -0
  14. maxframe/dataframe/extensions/apply_chunk.py +649 -0
  15. maxframe/dataframe/extensions/flatjson.py +131 -0
  16. maxframe/dataframe/extensions/flatmap.py +28 -40
  17. maxframe/dataframe/extensions/reshuffle.py +1 -1
  18. maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
  19. maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
  20. maxframe/dataframe/groupby/__init__.py +1 -0
  21. maxframe/dataframe/groupby/aggregation.py +1 -0
  22. maxframe/dataframe/groupby/apply.py +9 -1
  23. maxframe/dataframe/groupby/core.py +1 -1
  24. maxframe/dataframe/groupby/fill.py +4 -1
  25. maxframe/dataframe/groupby/getitem.py +6 -0
  26. maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
  27. maxframe/dataframe/groupby/transform.py +8 -2
  28. maxframe/dataframe/indexing/loc.py +6 -4
  29. maxframe/dataframe/merge/__init__.py +9 -1
  30. maxframe/dataframe/merge/concat.py +41 -31
  31. maxframe/dataframe/merge/merge.py +1 -1
  32. maxframe/dataframe/merge/tests/test_merge.py +3 -1
  33. maxframe/dataframe/misc/apply.py +3 -0
  34. maxframe/dataframe/misc/drop_duplicates.py +5 -1
  35. maxframe/dataframe/misc/map.py +3 -1
  36. maxframe/dataframe/misc/tests/test_misc.py +24 -2
  37. maxframe/dataframe/misc/transform.py +22 -13
  38. maxframe/dataframe/reduction/__init__.py +3 -0
  39. maxframe/dataframe/reduction/aggregation.py +1 -0
  40. maxframe/dataframe/reduction/median.py +56 -0
  41. maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
  42. maxframe/dataframe/statistics/quantile.py +8 -2
  43. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  44. maxframe/dataframe/tests/test_utils.py +60 -0
  45. maxframe/dataframe/utils.py +110 -7
  46. maxframe/dataframe/window/expanding.py +5 -3
  47. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  48. maxframe/io/objects/tests/test_object_io.py +39 -12
  49. maxframe/io/odpsio/__init__.py +1 -1
  50. maxframe/io/odpsio/arrow.py +51 -2
  51. maxframe/io/odpsio/schema.py +23 -5
  52. maxframe/io/odpsio/tableio.py +80 -124
  53. maxframe/io/odpsio/tests/test_schema.py +40 -0
  54. maxframe/io/odpsio/tests/test_tableio.py +5 -5
  55. maxframe/io/odpsio/tests/test_volumeio.py +35 -11
  56. maxframe/io/odpsio/volumeio.py +27 -3
  57. maxframe/learn/contrib/__init__.py +3 -2
  58. maxframe/learn/contrib/llm/__init__.py +16 -0
  59. maxframe/learn/contrib/llm/core.py +54 -0
  60. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  61. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  62. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  63. maxframe/learn/contrib/llm/text.py +42 -0
  64. maxframe/lib/mmh3.cp311-win32.pyd +0 -0
  65. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  66. maxframe/opcodes.py +7 -1
  67. maxframe/serialization/core.cp311-win32.pyd +0 -0
  68. maxframe/serialization/core.pyx +13 -1
  69. maxframe/serialization/pandas.py +50 -20
  70. maxframe/serialization/serializables/core.py +70 -15
  71. maxframe/serialization/serializables/field_type.py +4 -1
  72. maxframe/serialization/serializables/tests/test_serializable.py +12 -2
  73. maxframe/serialization/tests/test_serial.py +2 -1
  74. maxframe/tensor/__init__.py +19 -7
  75. maxframe/tensor/merge/vstack.py +1 -1
  76. maxframe/tests/utils.py +16 -0
  77. maxframe/udf.py +27 -0
  78. maxframe/utils.py +42 -8
  79. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/METADATA +4 -4
  80. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/RECORD +88 -77
  81. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/WHEEL +1 -1
  82. maxframe_client/clients/framedriver.py +4 -1
  83. maxframe_client/fetcher.py +23 -8
  84. maxframe_client/session/odps.py +40 -11
  85. maxframe_client/session/task.py +6 -25
  86. maxframe_client/session/tests/test_task.py +35 -6
  87. maxframe_client/tests/test_session.py +30 -10
  88. {maxframe-1.0.0rc4.dist-info → maxframe-1.1.1.dist-info}/top_level.txt +0 -0
maxframe/dataframe/extensions/__init__.py
@@ -18,6 +18,8 @@ from .accessor import (
     IndexMaxFrameAccessor,
     SeriesMaxFrameAccessor,
 )
+from .apply_chunk import df_apply_chunk, series_apply_chunk
+from .flatjson import series_flatjson
 from .flatmap import df_flatmap, series_flatmap
 from .reshuffle import DataFrameReshuffle, df_reshuffle

@@ -27,7 +29,10 @@ def _install():

     DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
     DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
+    DataFrameMaxFrameAccessor._register("apply_chunk", df_apply_chunk)
     SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
+    SeriesMaxFrameAccessor._register("flatjson", series_flatjson)
+    SeriesMaxFrameAccessor._register("apply_chunk", series_apply_chunk)

     if DataFrameMaxFrameAccessor._api_count:
         for t in DATAFRAME_TYPE:
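For context, the registrations above expose the new functions through the existing ``.mf`` accessor on DataFrame and Series objects. A minimal usage sketch, adapted from the docstring examples in the new apply_chunk.py shown below (it assumes a configured MaxFrame session; the flatjson accessor is omitted because flatjson.py itself is not shown in this section):

>>> import numpy as np
>>> import maxframe.dataframe as md
>>> df = md.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
>>> df.mf.apply_chunk(np.sum, batch_rows=3).execute()   # DataFrameMaxFrameAccessor route
A    12
B    27
dtype: int64
>>> s = md.Series([20, 21, 12])
>>> s.mf.apply_chunk(lambda x: x ** 2, batch_rows=2).execute()   # SeriesMaxFrameAccessor route
0    400
1    441
2    144
dtype: int64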
maxframe/dataframe/extensions/apply_chunk.py (new file)
@@ -0,0 +1,649 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+from typing import Any, Callable, Dict, List, Tuple, Union
+
+import numpy as np
+import pandas as pd
+
+from ... import opcodes
+from ...core import OutputType
+from ...serialization.serializables import FunctionField, Int32Field
+from ...utils import quiet_stdio
+from ..core import DATAFRAME_TYPE, DataFrame, IndexValue, Series
+from ..operators import DataFrameOperator, DataFrameOperatorMixin
+from ..utils import (
+    build_df,
+    build_series,
+    copy_func_scheduling_hints,
+    make_dtypes,
+    pack_func_args,
+    parse_index,
+    validate_output_types,
+)
+
+
+class DataFrameApplyChunkOperator(DataFrameOperator, DataFrameOperatorMixin):
+    _op_type_ = opcodes.APPLY_CHUNK
+
+    func = FunctionField("func")
+    batch_rows = Int32Field("batch_rows")
+
+    def __init__(self, output_type=None, **kw):
+        if output_type:
+            kw["_output_types"] = [output_type]
+        super().__init__(**kw)
+        if hasattr(self, "func"):
+            copy_func_scheduling_hints(self.func, self)
+
+    def _call_dataframe(self, df, dtypes, index_value, element_wise):
+        # return dataframe
+        if self.output_types[0] == OutputType.dataframe:
+            dtypes = make_dtypes(dtypes)
+            # apply_chunk will generate a new range index for results
+            return self.new_dataframe(
+                [df],
+                shape=df.shape if element_wise else (np.nan, len(dtypes)),
+                index_value=index_value,
+                columns_value=parse_index(dtypes.index, store_data=True),
+                dtypes=dtypes,
+            )
+
+        # return series
+        if not isinstance(dtypes, tuple):
+            raise TypeError(
+                "Cannot determine dtype, please specify `dtype` as argument"
+            )
+
+        name, dtype = dtypes
+        return self.new_series(
+            [df], shape=(np.nan,), name=name, dtype=dtype, index_value=index_value
+        )
+
+    def _call_series(self, series, dtypes, index_value, element_wise):
+        if self.output_types[0] == OutputType.series:
+            if not isinstance(dtypes, tuple):
+                raise TypeError(
+                    "Cannot determine dtype, please specify `dtype` as argument"
+                )
+
+            name, dtype = dtypes
+            shape = series.shape if element_wise else (np.nan,)
+
+            return self.new_series(
+                [series],
+                dtype=dtype,
+                shape=shape,
+                index_value=index_value,
+                name=name,
+            )
+
+        dtypes = make_dtypes(dtypes)
+        return self.new_dataframe(
+            [series],
+            shape=(np.nan, len(dtypes)),
+            index_value=index_value,
+            columns_value=parse_index(dtypes.index, store_data=True),
+            dtypes=dtypes,
+        )
+
+    def __call__(
+        self,
+        df_or_series: Union[DataFrame, Series],
+        dtypes: Union[Tuple[str, Any], Dict[str, Any]] = None,
+        output_type=None,
+        index=None,
+        args=(),
+        **kwargs,
+    ):
+        # pack positional and keyword arguments into the function up front
+        origin_func = self.func
+        self.func = get_packed_func(df_or_series, origin_func, *args, **kwargs)
+
+        # if skip_infer, directly build a frame
+        if self.output_types and self.output_types[0] == OutputType.df_or_series:
+            return self.new_df_or_series([df_or_series])
+
+        # infer return index and dtypes
+        dtypes, index_value, elementwise = self._infer_batch_func_returns(
+            df_or_series,
+            origin_func=origin_func,
+            packed_func=self.func,
+            given_output_type=output_type,
+            given_dtypes=dtypes,
+            given_index=index,
+        )
+
+        if index_value is None:
+            index_value = parse_index(
+                None, (df_or_series.key, df_or_series.index_value.key, self.func)
+            )
+        for arg, desc in zip((self.output_types, dtypes), ("output_types", "dtypes")):
+            if arg is None:
+                raise TypeError(
+                    f"Cannot determine {desc} by calculating with enumerate data, "
+                    "please specify it as arguments"
+                )
+
+        if dtypes is None or len(dtypes) == 0:
+            raise TypeError(
+                "Cannot determine `dtypes` or `dtype` by calculating with enumerate data, "
+                "please specify it as arguments"
+            )
+
+        if isinstance(df_or_series, DATAFRAME_TYPE):
+            return self._call_dataframe(
+                df_or_series,
+                dtypes=dtypes,
+                index_value=index_value,
+                element_wise=elementwise,
+            )
+
+        return self._call_series(
+            df_or_series,
+            dtypes=dtypes,
+            index_value=index_value,
+            element_wise=elementwise,
+        )
+
+    def _infer_batch_func_returns(
+        self,
+        input_df_or_series: Union[DataFrame, Series],
+        origin_func: Union[str, Callable, np.ufunc],
+        packed_func: Union[Callable, functools.partial],
+        given_output_type: OutputType,
+        given_dtypes: Union[Tuple[str, Any], pd.Series, List[Any], Dict[str, Any]],
+        given_index: Union[pd.Index, IndexValue],
+        given_elementwise: bool = False,
+    ):
+        inferred_output_type = inferred_dtypes = inferred_index_value = None
+        inferred_is_elementwise = False
+
+        # handle numpy ufunc case
+        if isinstance(origin_func, np.ufunc):
+            inferred_output_type = OutputType.dataframe
+            inferred_dtypes = None
+            inferred_index_value = input_df_or_series.index_value
+            inferred_is_elementwise = True
+        elif self.output_types is not None and given_dtypes is not None:
+            inferred_dtypes = given_dtypes
+
+        # build a frame with the same schema to execute
+        if isinstance(input_df_or_series, DATAFRAME_TYPE):
+            empty_data = build_df(input_df_or_series, fill_value=1, size=1)
+        else:
+            empty_data = build_series(
+                input_df_or_series, size=1, name=input_df_or_series.name
+            )
+
+        try:
+            # execute
+            with np.errstate(all="ignore"), quiet_stdio():
+                infer_result = packed_func(empty_data)
+
+            # if executed successfully, get index and dtypes from returned object
+            if inferred_index_value is None:
+                if (
+                    infer_result is None
+                    or not hasattr(infer_result, "index")
+                    or infer_result.index is None
+                ):
+                    inferred_index_value = parse_index(pd.RangeIndex(-1))
+                elif infer_result.index is empty_data.index:
+                    inferred_index_value = input_df_or_series.index_value
+                else:
+                    inferred_index_value = parse_index(infer_result.index, packed_func)
+
+            if isinstance(infer_result, pd.DataFrame):
+                if (
+                    given_output_type is not None
+                    and given_output_type != OutputType.dataframe
+                ):
+                    raise TypeError(
+                        'Cannot infer output_type as "series", '
+                        'please specify `output_type` as "dataframe"'
+                    )
+                inferred_output_type = given_output_type or OutputType.dataframe
+                inferred_dtypes = (
+                    given_dtypes if given_dtypes is not None else infer_result.dtypes
+                )
+            else:
+                if (
+                    given_output_type is not None
+                    and given_output_type == OutputType.dataframe
+                ):
+                    raise TypeError(
+                        'Cannot infer output_type as "dataframe", '
+                        'please specify `output_type` as "series"'
+                    )
+                inferred_output_type = given_output_type or OutputType.series
+                inferred_dtypes = (infer_result.name, infer_result.dtype)
+        except:  # noqa: E722
+            pass
+
+        # merge specified and inferred index, dtypes, output_type;
+        # elementwise is used to decide shape
+        self.output_types = (
+            [inferred_output_type]
+            if not self.output_types and inferred_output_type
+            else self.output_types
+        )
+        inferred_dtypes = given_dtypes if given_dtypes is not None else inferred_dtypes
+        if given_index is not None:
+            inferred_index_value = (
+                parse_index(given_index)
+                if given_index is not input_df_or_series.index_value
+                else input_df_or_series.index_value
+            )
+        inferred_is_elementwise = given_elementwise or inferred_is_elementwise
+        return inferred_dtypes, inferred_index_value, inferred_is_elementwise
+
+
+def get_packed_func(df, func, *args, **kwargs) -> Any:
+    stub_df = build_df(df, fill_value=1, size=1)
+    return pack_func_args(stub_df, func, *args, **kwargs)
+
+
+def df_apply_chunk(
+    dataframe,
+    func: Union[str, Callable],
+    batch_rows,
+    dtypes=None,
+    dtype=None,
+    name=None,
+    output_type=None,
+    index=None,
+    skip_infer=False,
+    args=(),
+    **kwargs,
+):
+    """
+    Apply a function that takes a pandas DataFrame and outputs a pandas DataFrame/Series.
+    The pandas DataFrame given to the function is a chunk of the input dataframe, treated as a batch of rows.
+
+    The objects passed into this function are slices of the original DataFrame, containing at most ``batch_rows``
+    rows and all columns. It is equivalent to merging the inputs of multiple ``df.apply`` calls with ``axis=1``
+    and passing them into the function at once, thereby improving performance in specific scenarios. The function
+    output can be either a DataFrame or a Series. ``apply_chunk`` will ultimately merge the results into a new
+    DataFrame or Series.
+
+    Don't expect to receive all rows of the DataFrame in the function, as it depends on the implementation
+    of MaxFrame and the internal running state of MaxCompute.
+
+    Parameters
+    ----------
+    func : str or Callable
+        Function to apply to the dataframe chunk.
+
+    batch_rows : int
+        Expected number of rows in a batch, which is also the length of the dataframe passed to the function.
+        When the remaining data is insufficient, it may be less than this number.
+
+    output_type : {'dataframe', 'series'}, default None
+        Specify type of returned object. See `Notes` for more details.
+
+    dtypes : Series, default None
+        Specify dtypes of returned DataFrames. See `Notes` for more details.
+
+    dtype : numpy.dtype, default None
+        Specify dtype of returned Series. See `Notes` for more details.
+
+    name : str, default None
+        Specify name of returned Series. See `Notes` for more details.
+
+    index : Index, default None
+        Specify index of returned object. See `Notes` for more details.
+
+    skip_infer : bool, default False
+        Whether to skip dtype inference when dtypes or output_type is not specified.
+
+    args : tuple
+        Positional arguments to pass to ``func`` in addition to the
+        array/series.
+
+    **kwds
+        Additional keyword arguments to pass as keywords arguments to
+        ``func``.
+
+    Returns
+    -------
+    Series or DataFrame
+        Result of applying ``func`` along the given chunk of the
+        DataFrame.
+
+    See Also
+    --------
+    DataFrame.apply: For non-batching operations.
+    Series.mf.apply_chunk: Apply function to Series chunk.
+
+    Notes
+    -----
+    When deciding output dtypes and shape of the return value, MaxFrame will
+    try applying ``func`` onto a mock DataFrame, and the apply call may
+    fail. When this happens, you need to specify the type of apply call
+    (DataFrame or Series) in output_type.
+
+    * For DataFrame output, you need to specify a list or a pandas Series
+      as ``dtypes`` of output DataFrame. ``index`` of output can also be
+      specified.
+    * For Series output, you need to specify ``dtype`` and ``name`` of
+      output Series.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import maxframe.tensor as mt
+    >>> import maxframe.dataframe as md
+    >>> df = md.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
+    >>> df.execute()
+       A  B
+    0  4  9
+    1  4  9
+    2  4  9
+
+    Using a different ``batch_rows`` collects a different dataframe chunk into the function.
+    For example, with ``batch_rows=3`` the function will not be called until 3 rows are collected.
+
+    >>> df.mf.apply_chunk(np.sum, batch_rows=3).execute()
+    A    12
+    B    27
+    dtype: int64
+
+    With ``batch_rows=2``, the data will be divided into at least two segments. Additionally, if your function
+    alters the shape of the dataframe, it may result in different outputs.
+
+    >>> df.mf.apply_chunk(np.sum, batch_rows=2).execute()
+    A     8
+    B    18
+    A     4
+    B     9
+    dtype: int64
+
+    If the function requires extra parameters, you can specify them using ``args`` or ``kwargs``.
+
+    >>> def calc(df, x, y):
+    ...     return df * x + y
+    >>> df.mf.apply_chunk(calc, args=(10,), y=20).execute()
+        A    B
+    0  60  110
+    1  60  110
+    2  60  110
+
+    Batching rows benefits operations that consume a whole dataframe, such as sklearn predict.
+    You can easily use sklearn in MaxFrame to perform offline inference, and apply_chunk makes this process more
+    efficient. The ``@with_python_requirements`` decorator provides the capability to automatically package and load
+    dependencies.
+
+    Once you rely on third-party dependencies, MaxFrame may not be able to correctly infer the return type.
+    Therefore, using ``output_type`` with ``dtype`` or ``dtypes`` is necessary.
+
+    >>> from maxframe.udf import with_python_requirements
+    >>> data = {
+    ...     'A': np.random.rand(10),
+    ...     'B': np.random.rand(10)
+    ... }
+    >>> pd_df = pd.DataFrame(data)
+    >>> X = pd_df[['A']]
+    >>> y = pd_df['B']
+
+    >>> from sklearn.model_selection import train_test_split
+    >>> from sklearn.linear_model import LinearRegression
+    >>> model = LinearRegression()
+    >>> model.fit(X, y)
+
+    >>> @with_python_requirements("scikit-learn")
+    ... def predict(df):
+    ...     predict_B = model.predict(df[["A"]])
+    ...     return pd.Series(predict_B, index=df.A.index)
+
+    >>> df.mf.apply_chunk(predict, batch_rows=3, output_type="series", dtype="float", name="predict_B").execute()
+    0   -0.765025
+    1   -0.765025
+    2   -0.765025
+    Name: predict_B, dtype: float64
+    """
+    if not isinstance(func, Callable):
+        raise TypeError("function must be a callable object")
+
+    if not isinstance(batch_rows, int):
+        raise TypeError("batch_rows must be an integer")
+
+    if batch_rows <= 0:
+        raise ValueError("batch_rows must be greater than 0")
+
+    dtypes = (name, dtype) if dtype is not None else dtypes
+
+    output_types = kwargs.pop("output_types", None)
+    object_type = kwargs.pop("object_type", None)
+    output_types = validate_output_types(
+        output_type=output_type, output_types=output_types, object_type=object_type
+    )
+    output_type = output_types[0] if output_types else None
+    if skip_infer and output_type is None:
+        output_type = OutputType.df_or_series
+
+    # bind args and kwargs
+    op = DataFrameApplyChunkOperator(
+        func=func, batch_rows=batch_rows, output_type=output_type
+    )
+
+    return op(
+        dataframe,
+        dtypes=dtypes,
+        index=index,
+        args=args,
+        **kwargs,
+    )
+
+
+def series_apply_chunk(
+    dataframe_or_series,
+    func: Union[str, Callable],
+    batch_rows,
+    dtypes=None,
+    dtype=None,
+    name=None,
+    output_type=None,
+    index=None,
+    skip_infer=False,
+    args=(),
+    **kwargs,
+):
+    """
+    Apply a function that takes a pandas Series and outputs a pandas DataFrame/Series.
+    The pandas Series given to the function is a chunk of the input series.
+
+    The objects passed into this function are slices of the original series, containing at most ``batch_rows``
+    elements. The function output can be either a DataFrame or a Series.
+    ``apply_chunk`` will ultimately merge the results into a new DataFrame or Series.
+
+    Don't expect to receive all elements of the series in the function, as it depends on the implementation
+    of MaxFrame and the internal running state of MaxCompute.
+
+    ``func`` can be a ufunc (a NumPy function that applies to the entire Series)
+    or a Python function that only works on a Series.
+
+    Parameters
+    ----------
+    func : function
+        Python function or NumPy ufunc to apply.
+
+    batch_rows : int
+        Expected number of elements in a batch, which is also the length of the series passed to the function.
+        When the remaining data is insufficient, it may be less than this number.
+
+    output_type : {'dataframe', 'series'}, default None
+        Specify type of returned object. See `Notes` for more details.
+
+    dtypes : Series, default None
+        Specify dtypes of returned DataFrames. See `Notes` for more details.
+
+    dtype : numpy.dtype, default None
+        Specify dtype of returned Series. See `Notes` for more details.
+
+    name : str, default None
+        Specify name of returned Series. See `Notes` for more details.
+
+    index : Index, default None
+        Specify index of returned object. See `Notes` for more details.
+
+    args : tuple
+        Positional arguments passed to func after the series value.
+
+    skip_infer : bool, default False
+        Whether to skip dtype inference when dtypes or output_type is not specified.
+
+    **kwds
+        Additional keyword arguments passed to func.
+
+    Returns
+    -------
+    Series or DataFrame
+        If func returns a Series object the result will be a Series, else the result will be a DataFrame.
+
+    See Also
+    --------
+    DataFrame.apply_chunk: Apply function to DataFrame chunk.
+    Series.apply: For non-batching operations.
+
+    Notes
+    -----
+    When deciding output dtypes and shape of the return value, MaxFrame will
+    try applying ``func`` onto a mock Series, and the apply call may fail.
+    When this happens, you need to specify the type of apply call
+    (DataFrame or Series) in output_type.
+
+    * For DataFrame output, you need to specify a list or a pandas Series
+      as ``dtypes`` of output DataFrame. ``index`` of output can also be
+      specified.
+    * For Series output, you need to specify ``dtype`` and ``name`` of
+      output Series.
+
+    Examples
+    --------
+    Create a series with typical summer temperatures for each city.
+
+    >>> import maxframe.tensor as mt
+    >>> import maxframe.dataframe as md
+    >>> s = md.Series([20, 21, 12],
+    ...               index=['London', 'New York', 'Helsinki'])
+    >>> s.execute()
+    London      20
+    New York    21
+    Helsinki    12
+    dtype: int64
+
+    Square the values by defining a function and passing it as an
+    argument to ``apply_chunk()``.
+
+    >>> def square(x):
+    ...     return x ** 2
+    >>> s.mf.apply_chunk(square, batch_rows=2).execute()
+    London      400
+    New York    441
+    Helsinki    144
+    dtype: int64
+
+    Square the values by passing an anonymous function as an
+    argument to ``apply_chunk()``.
+
+    >>> s.mf.apply_chunk(lambda x: x**2, batch_rows=2).execute()
+    London      400
+    New York    441
+    Helsinki    144
+    dtype: int64
+
+    Define a custom function that needs additional positional
+    arguments and pass these additional arguments using the
+    ``args`` keyword.
+
+    >>> def subtract_custom_value(x, custom_value):
+    ...     return x - custom_value
+
+    >>> s.mf.apply_chunk(subtract_custom_value, args=(5,), batch_rows=3).execute()
+    London      15
+    New York    16
+    Helsinki     7
+    dtype: int64
+
+    Define a custom function that takes keyword arguments
+    and pass these arguments to ``apply_chunk``.
+
+    >>> def add_custom_values(x, **kwargs):
+    ...     for month in kwargs:
+    ...         x += kwargs[month]
+    ...     return x
+
+    >>> s.mf.apply_chunk(add_custom_values, batch_rows=2, june=30, july=20, august=25).execute()
+    London      95
+    New York    96
+    Helsinki    87
+    dtype: int64
+
+    If func returns a dataframe, apply_chunk will return a dataframe as well.
+
+    >>> def get_dataframe(x):
+    ...     return pd.concat([x, x], axis=1)
+
+    >>> s.mf.apply_chunk(get_dataframe, batch_rows=2).execute()
+               0   1
+    London    20  20
+    New York  21  21
+    Helsinki  12  12
+
+    Provide ``dtypes``, or ``dtype`` together with ``name``, to name the output schema.
+
+    >>> s.mf.apply_chunk(
+    ...     get_dataframe,
+    ...     batch_rows=2,
+    ...     dtypes={"A": np.int_, "B": np.int_},
+    ...     output_type="dataframe"
+    ... ).execute()
+               A   B
+    London    20  20
+    New York  21  21
+    Helsinki  12  12
+    """
+    if not isinstance(func, Callable):
+        raise TypeError("function must be a callable object")
+
+    if not isinstance(batch_rows, int):
+        raise TypeError("batch_rows must be an integer")
+
+    if batch_rows <= 0:
+        raise ValueError("batch_rows must be greater than 0")
+
+    # bind args and kwargs
+    output_types = kwargs.pop("output_types", None)
+    object_type = kwargs.pop("object_type", None)
+    output_types = validate_output_types(
+        output_type=output_type, output_types=output_types, object_type=object_type
+    )
+    output_type = output_types[0] if output_types else None
+    if skip_infer and output_type is None:
+        output_type = OutputType.df_or_series
+
+    op = DataFrameApplyChunkOperator(
+        func=func, batch_rows=batch_rows, output_type=output_type
+    )
+
+    dtypes = (name, dtype) if dtype is not None else dtypes
+    return op(
+        dataframe_or_series,
+        dtypes=dtypes,
+        output_type=output_type,
+        index=index,
+        args=args,
+        **kwargs,
+    )
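A note on the ``skip_infer`` flag accepted by both entry points above: when it is set and no ``output_type`` is given, the operator takes the ``OutputType.df_or_series`` path and skips schema inference entirely. A hedged sketch of that pattern (the ``enrich`` function and its columns are hypothetical, and a configured MaxFrame session is assumed):

>>> import maxframe.dataframe as md
>>> df = md.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
>>> def enrich(chunk):
...     # returns a schema that MaxFrame may not be able to infer from a mock input
...     return chunk.assign(score=chunk['A'] * 0.5)
>>> result = df.mf.apply_chunk(enrich, batch_rows=2, skip_infer=True)
>>> result.execute()   # concrete dtypes are resolved when the job actually runs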