maxframe 1.2.1-cp38-cp38-win32.whl → 1.3.1-cp38-cp38-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry flags this version of maxframe as possibly problematic.

Files changed (73)
  1. maxframe/_utils.cp38-win32.pyd +0 -0
  2. maxframe/codegen.py +70 -21
  3. maxframe/config/config.py +6 -0
  4. maxframe/core/accessor.py +1 -0
  5. maxframe/core/graph/core.cp38-win32.pyd +0 -0
  6. maxframe/dataframe/accessors/__init__.py +1 -1
  7. maxframe/dataframe/accessors/dict_/accessor.py +1 -0
  8. maxframe/dataframe/accessors/dict_/length.py +1 -0
  9. maxframe/dataframe/accessors/dict_/setitem.py +1 -0
  10. maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
  11. maxframe/dataframe/accessors/list_/__init__.py +37 -0
  12. maxframe/dataframe/accessors/list_/accessor.py +39 -0
  13. maxframe/dataframe/accessors/list_/getitem.py +135 -0
  14. maxframe/dataframe/accessors/list_/length.py +73 -0
  15. maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
  16. maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
  17. maxframe/dataframe/accessors/plotting/__init__.py +2 -0
  18. maxframe/dataframe/accessors/string_/__init__.py +1 -0
  19. maxframe/dataframe/datastore/to_odps.py +6 -0
  20. maxframe/dataframe/extensions/accessor.py +1 -0
  21. maxframe/dataframe/extensions/apply_chunk.py +34 -21
  22. maxframe/dataframe/extensions/flatmap.py +8 -1
  23. maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
  24. maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
  25. maxframe/dataframe/groupby/aggregation.py +53 -1
  26. maxframe/dataframe/merge/concat.py +7 -4
  27. maxframe/dataframe/merge/merge.py +1 -0
  28. maxframe/dataframe/merge/tests/test_merge.py +97 -47
  29. maxframe/dataframe/missing/tests/test_missing.py +1 -0
  30. maxframe/dataframe/reduction/aggregation.py +63 -0
  31. maxframe/dataframe/reduction/core.py +17 -5
  32. maxframe/dataframe/tests/test_utils.py +7 -0
  33. maxframe/dataframe/ufunc/ufunc.py +1 -0
  34. maxframe/dataframe/utils.py +3 -0
  35. maxframe/io/odpsio/schema.py +1 -0
  36. maxframe/learn/contrib/__init__.py +2 -4
  37. maxframe/learn/contrib/llm/__init__.py +1 -0
  38. maxframe/learn/contrib/llm/core.py +31 -10
  39. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  40. maxframe/learn/contrib/llm/models/dashscope.py +38 -3
  41. maxframe/learn/contrib/llm/models/managed.py +54 -0
  42. maxframe/learn/contrib/llm/multi_modal.py +93 -0
  43. maxframe/learn/contrib/llm/text.py +268 -8
  44. maxframe/learn/contrib/models.py +77 -0
  45. maxframe/learn/contrib/utils.py +1 -0
  46. maxframe/learn/contrib/xgboost/__init__.py +8 -1
  47. maxframe/learn/contrib/xgboost/classifier.py +15 -4
  48. maxframe/learn/contrib/xgboost/core.py +108 -1
  49. maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
  50. maxframe/learn/contrib/xgboost/predict.py +6 -3
  51. maxframe/learn/contrib/xgboost/regressor.py +15 -1
  52. maxframe/learn/contrib/xgboost/train.py +5 -4
  53. maxframe/lib/dtypes_extension/__init__.py +2 -1
  54. maxframe/lib/dtypes_extension/dtypes.py +21 -0
  55. maxframe/lib/dtypes_extension/tests/test_dtypes.py +13 -3
  56. maxframe/lib/mmh3.cp38-win32.pyd +0 -0
  57. maxframe/opcodes.py +19 -0
  58. maxframe/serialization/__init__.py +1 -0
  59. maxframe/serialization/core.cp38-win32.pyd +0 -0
  60. maxframe/serialization/core.pyx +12 -1
  61. maxframe/serialization/numpy.py +12 -4
  62. maxframe/serialization/serializables/tests/test_serializable.py +13 -2
  63. maxframe/serialization/tests/test_serial.py +2 -0
  64. maxframe/tensor/merge/concatenate.py +1 -0
  65. maxframe/tensor/misc/unique.py +11 -10
  66. maxframe/tensor/reshape/reshape.py +4 -1
  67. maxframe/utils.py +4 -0
  68. {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/METADATA +2 -1
  69. {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/RECORD +73 -65
  70. {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/WHEEL +1 -1
  71. maxframe_client/session/odps.py +3 -0
  72. maxframe_client/session/tests/test_task.py +1 -0
  73. {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/top_level.txt +0 -0
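
Items 11-16 add a new list_ accessor package alongside the existing dict_ one. The authoritative API lives in the new test file (test_list_accessor.py); the sketch below is only inferred from the new file names (length.py, getitem.py) and the dict accessor's pattern, so the method names are assumptions:

import pandas as pd
import pyarrow as pa

import maxframe.dataframe as md

# Hypothetical usage of the new Series.list accessor; verify against
# maxframe/dataframe/accessors/list_/tests/test_list_accessor.py.
s = md.Series(
    [[1, 2], [3, 4, 5]],
    dtype=pd.ArrowDtype(pa.list_(pa.int64())),  # assumes pandas>=1.5 ArrowDtype
)
s.list.len()  # element-wise list length (length.py)
s.list[0]     # element-wise item lookup (getitem.py)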
@@ -16,10 +16,10 @@ import numpy as np
 import pandas as pd
 import pytest
 
+from .... import dataframe as md
 from ....tests.utils import assert_mf_index_dtype
 from ...core import IndexValue
-from ...datasource.dataframe import from_pandas
-from .. import DataFrameMerge, concat
+from .. import DataFrameMerge
 from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint
 
 
@@ -29,8 +29,8 @@ def test_merge():
     )
     df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
 
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=3)
+    mdf1 = md.DataFrame(df1, chunk_size=2)
+    mdf2 = md.DataFrame(df2, chunk_size=3)
 
     mapjoin = MapJoinHint()
     dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
@@ -83,8 +83,8 @@ def test_merge_invalid_parameters():
     )
     pdf2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
 
-    df1 = from_pandas(pdf1, chunk_size=2)
-    df2 = from_pandas(pdf2, chunk_size=3)
+    df1 = md.DataFrame(pdf1, chunk_size=2)
+    df2 = md.DataFrame(pdf2, chunk_size=3)
 
     with pytest.raises(ValueError):
         df1.merge(df2, bloom_filter="wrong")
@@ -104,8 +104,8 @@ def test_join():
     df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], index=["a1", "b2", "b3"]) + 1
     df2 = pd.concat([df2, df2 + 1])
 
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=2)
+    mdf1 = md.DataFrame(df1, chunk_size=2)
+    mdf2 = md.DataFrame(df2, chunk_size=2)
 
     parameters = [
         {"lsuffix": "l_", "rsuffix": "r_"},
@@ -132,8 +132,8 @@ def test_join_on():
     )
     df2 = pd.concat([df2, df2 + 1])
 
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=2)
+    mdf1 = md.DataFrame(df1, chunk_size=2)
+    mdf2 = md.DataFrame(df2, chunk_size=2)
 
     parameters = [
         {"lsuffix": "l_", "rsuffix": "r_"},
@@ -157,15 +157,15 @@ def test_append():
     df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
     df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
 
-    mdf1 = from_pandas(df1, chunk_size=3)
-    mdf2 = from_pandas(df2, chunk_size=3)
+    mdf1 = md.DataFrame(df1, chunk_size=3)
+    mdf2 = md.DataFrame(df2, chunk_size=3)
     adf = mdf1.append(mdf2)
 
     assert adf.shape == (20, 4)
     assert_mf_index_dtype(adf.index_value.value, np.int64)
 
-    mdf1 = from_pandas(df1, chunk_size=3)
-    mdf2 = from_pandas(df2, chunk_size=3)
+    mdf1 = md.DataFrame(df1, chunk_size=3)
+    mdf2 = md.DataFrame(df2, chunk_size=3)
     adf = mdf1.append(mdf2, ignore_index=True)
 
     assert adf.shape == (20, 4)
@@ -173,84 +173,135 @@ def test_append():
     pd.testing.assert_index_equal(adf.index_value.to_pandas(), pd.RangeIndex(20))
 
 
-def test_concat():
+def test_concat_dataframe():
+    # test index concatenate
     df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
     df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
 
-    mdf1 = from_pandas(df1, chunk_size=4)
-    mdf2 = from_pandas(df2, chunk_size=4)
-    r = concat([mdf1, mdf2], axis="index")
+    mdf1 = md.DataFrame(df1, chunk_size=4)
+    mdf2 = md.DataFrame(df2, chunk_size=4)
+    r = md.concat([mdf1, mdf2], axis="index")
 
     assert r.shape == (20, 4)
     assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
-    pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
 
-    df3 = pd.DataFrame(
-        np.random.rand(10, 4), columns=list("ABCD"), index=pd.RangeIndex(10, 20)
+    # test index concatenate with range index
+    mdf3 = md.DataFrame(
+        np.random.rand(10, 4),
+        columns=list("ABCD"),
+        index=pd.RangeIndex(10, 20),
+        chunk_size=4,
     )
-
-    mdf3 = from_pandas(df3, chunk_size=4)
-    r = concat([mdf1, mdf3], axis="index")
+    r = md.concat([mdf1, mdf3], axis="index")
 
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))
 
+    # test index concatenate with perm index
     df4 = pd.DataFrame(
         np.random.rand(10, 4),
         columns=list("ABCD"),
         index=np.random.permutation(np.arange(10)),
     )
 
-    mdf4 = from_pandas(df4, chunk_size=4)
-    r = concat([mdf1, mdf4], axis="index")
+    # test concat with same index with different sources
+    mdf4 = md.DataFrame(df4, chunk_size=4)
+    r = md.concat([mdf1, mdf4], axis="index")
 
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(
         r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
 
-    r = concat([mdf4, mdf1], axis="index")
+    r = md.concat([mdf4, mdf1], axis="index")
 
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(
         r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
 
-    r = concat([mdf4, mdf4], axis="index")
+    # test concat with same index with same source
+    r = md.concat([mdf4, mdf4], axis="index")
 
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(
         r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
 
-    mdf1 = from_pandas(df1, chunk_size=3)
-    mdf2 = from_pandas(df2, chunk_size=4)
-    r = concat([mdf1, mdf2], axis="columns")
+    # test concat with column outer join
+    mdf1 = md.DataFrame(df1, chunk_size=3)
+    mdf2 = md.DataFrame(df2, chunk_size=4)
+    r = md.concat([mdf1, mdf2], axis="columns")
 
     assert r.shape == (10, 8)
     expected_dtypes = pd.concat([df1, df2], axis="columns").dtypes
     pd.testing.assert_series_equal(r.dtypes, expected_dtypes)
 
-    df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
-    df2 = pd.DataFrame(np.random.rand(10, 3), columns=list("ABC"))
-    mdf1 = from_pandas(df1, chunk_size=3)
-    mdf2 = from_pandas(df2, chunk_size=3)
-    r = concat([mdf1, mdf2], join="inner")
+    # test concat with column inner join
+    mdf1 = md.DataFrame(np.random.rand(10, 4), columns=list("ABCD"), chunk_size=3)
+    mdf2 = md.DataFrame(np.random.rand(10, 3), columns=list("ABC"), chunk_size=3)
+    r = md.concat([mdf1, mdf2], join="inner")
     assert r.shape == (20, 3)
 
+    # test concat with ignore index
+    r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
+    assert r.shape == (20, 3)
+    pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))
+
+    # test concat with unknown shapes
+    mdf1._shape = (np.nan, 4)
+    r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
+    np.testing.assert_array_equal(np.array(r.shape), np.array((np.nan, 3)))
+    r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
+    np.testing.assert_array_equal(np.array(r.shape), np.array((np.nan, 3)))
+
+    # test concat with empty frames
+    r = md.concat([md.DataFrame([]), mdf2], ignore_index=True)
+    assert r.shape == (10, 3)
+
+
+def test_concat_series():
+    # test row concat
+    ms1 = md.Series(np.random.rand(10))
+    ms2 = md.Series(np.random.rand(10))
+    r = md.concat([ms1, ms2])
+    assert r.shape == (20,)
+
+    # test row concat with unknown shape
+    ms1._shape = (np.nan,)
+    r = md.concat([ms1, ms2])
+    assert np.isnan(r.shape[0])
+    r = md.concat([ms1, ms2], ignore_index=True)
+    assert np.isnan(r.shape[0])
+
+    # test col concat
+    ms1 = md.Series(np.random.rand(10))
+    ms2 = md.Series(np.random.rand(10))
+    r = md.concat([ms1, ms2], axis=1)
+    assert r.shape == (10, 2)
+
+    # test col concat with names
+    ms1.name = "col1"
+    ms2.name = "col2"
+    r = md.concat([ms1, ms2], axis=1)
+    assert r.shape == (10, 2)
+    assert r.dtypes.index.tolist() == ["col1", "col2"]
+
 
 def test_invalid_join_hint():
-    df1 = pd.DataFrame(
-        np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
+    mdf1 = md.DataFrame(
+        np.arange(20).reshape((4, 5)) + 1,
+        columns=["a", "b", "c", "d", "e"],
+        chunk_size=2,
+    )
+    mdf2 = md.DataFrame(
+        np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"], chunk_size=3
     )
-    df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
-
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=3)
 
     # type error
     parameters = [
@@ -282,7 +333,6 @@ def test_invalid_join_hint():
     ]
 
     for kw in parameters:
-        print(kw)
         with pytest.raises(TypeError):
             mdf1.merge(mdf2, **kw)
 
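The dominant change in the test_merge.py hunks above is mechanical: every test now builds MaxFrame objects through the public initializers instead of the internal from_pandas helper, and concat is reached as md.concat. As a minimal sketch of the equivalence (grounded in the import hunk at the top of the file):

import numpy as np
import pandas as pd

import maxframe.dataframe as md

pdf = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))

# 1.2.x test style (internal helper, removed in 1.3.1):
#   from maxframe.dataframe.datasource.dataframe import from_pandas
#   mdf = from_pandas(pdf, chunk_size=4)

# 1.3.x test style (public initializer, as the updated tests use):
mdf = md.DataFrame(pdf, chunk_size=4)
r = md.concat([mdf, mdf], axis="index", ignore_index=True)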
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import random
 
 import numpy as np
@@ -315,6 +315,69 @@ def compile_reduction_funcs(op: DataFrameAggregate, input: TileableType):
 
 
 def aggregate(df, func=None, axis=0, **kw):
+    """
+    Aggregate using one or more operations over the specified axis.
+
+    Parameters
+    ----------
+    df : DataFrame, Series
+        Object to aggregate.
+    func : list or dict
+        Function to use for aggregating the data.
+    axis : {0 or 'index', 1 or 'columns'}, default 0
+        If 0 or 'index': apply function to each column. If 1 or 'columns': apply function to each row.
+    kw
+        Keyword arguments to pass to func.
+
+    Returns
+    -------
+    scalar, Series or DataFrame
+        The return can be:
+
+        * scalar : when Series.agg is called with single function
+        * Series : when DataFrame.agg is called with a single function
+        * DataFrame : when DataFrame.agg is called with several functions
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> df = md.DataFrame([[1, 2, 3],
+    ...                    [4, 5, 6],
+    ...                    [7, 8, 9],
+    ...                    [np.nan, np.nan, np.nan]],
+    ...                   columns=['A', 'B', 'C']).execute()
+
+    Aggregate these functions over the rows.
+
+    >>> df.agg(['sum', 'min']).execute()
+            A     B     C
+    min   1.0   2.0   3.0
+    sum  12.0  15.0  18.0
+
+    Different aggregations per column.
+
+    >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}).execute()
+            A    B
+    max   NaN  8.0
+    min   1.0  2.0
+    sum  12.0  NaN
+
+    Aggregate different functions over the columns and rename the index of the resulting DataFrame.
+
+    >>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean')).execute()
+         A    B    C
+    x  7.0  NaN  NaN
+    y  NaN  2.0  NaN
+    z  NaN  NaN  6.0
+
+    >>> s = md.Series([1, 2, 3, 4])
+    >>> s.agg('min').execute()
+    1
+
+    >>> s.agg(['min', 'max']).execute()
+    max    4
+    min    1
+    """
     axis = validate_axis(axis, df)
     if (
         df.ndim == 2
@@ -404,6 +404,7 @@ class ReductionPostStep(NamedTuple):
     func_name: str
     columns: Optional[List[str]]
    func_idl: bytes
+    post_func_aliases: Optional[List[str]] = None
 
 
 class ReductionSteps(NamedTuple):
@@ -462,6 +463,7 @@ class ReductionCompiler:
         self._output_key_to_agg_steps = dict()
         self._output_key_to_post_steps = dict()
         self._output_key_to_post_cols = dict()
+        self._output_key_to_col_func_mapping = dict()
 
     @classmethod
     def _check_function_valid(cls, func):
@@ -531,6 +533,14 @@ class ReductionCompiler:
         self._output_key_to_post_steps[step.output_key] = step
         self._update_col_dict(self._output_key_to_post_cols, step.output_key, cols)
 
+        if cols is not None:
+            col_name_map = (
+                self._output_key_to_col_func_mapping.get(step.output_key) or {}
+            )
+            for col in cols:
+                col_name_map[col] = func_name
+            self._output_key_to_col_func_mapping[step.output_key] = col_name_map
+
     @staticmethod
     def _build_mock_return_object(func, input_dtype, ndim):
         from ..initializer import DataFrame as MaxDataFrame
@@ -812,11 +822,12 @@ class ReductionCompiler:
             agg_funcs.append(step)
 
         for key, step in self._output_key_to_post_steps.items():
-            cols = self._output_key_to_post_cols[key]
-            if cols and set(cols) == set(referred_cols):
-                post_cols = None
-            else:
-                post_cols = cols
+            post_cols = self._output_key_to_post_cols[key]
+            func_renames = None
+            if post_cols:
+                col_map = self._output_key_to_col_func_mapping.get(key)
+                if col_map:
+                    func_renames = [col_map[c] for c in post_cols]
 
             func_name = step.func_name
             if self._lambda_counter == 1 and step.func_name == "<lambda_0>":
@@ -831,6 +842,7 @@ class ReductionCompiler:
                     func_name,
                    post_cols,
                    step.func_idl,
+                    func_renames,
                 )
             )
 
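Taken together, the reduction/core.py hunks thread per-column function names through compilation: the registration hunk records which aggregation function produced each post-step column, and the compile hunks replay that map, in column order, into the new post_func_aliases field of ReductionPostStep. A standalone sketch of just that bookkeeping, with the surrounding compiler machinery omitted (names mirror the diff):

from typing import Dict, List, Optional

# Map each output key to a {column -> producing function name} dict,
# as _output_key_to_col_func_mapping does in the diff above.
col_func_mapping: Dict[str, Dict[str, str]] = {}

def record_step(output_key: str, cols: Optional[List[str]], func_name: str) -> None:
    if cols is not None:
        col_map = col_func_mapping.get(output_key) or {}
        for col in cols:
            col_map[col] = func_name
        col_func_mapping[output_key] = col_map

def func_renames(output_key: str, post_cols: Optional[List[str]]) -> Optional[List[str]]:
    # Emit the function names in column order, as compile() now does
    # when building ReductionPostStep.post_func_aliases.
    if post_cols:
        col_map = col_func_mapping.get(output_key)
        if col_map:
            return [col_map[c] for c in post_cols]
    return None

record_step("k1", ["a", "b"], "sum")
assert func_renames("k1", ["a", "b"]) == ["sum", "sum"]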
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -71,6 +72,12 @@ def test_pack_function(df1):
 @pytest.mark.parametrize(
     "dtype, fill_value, expected",
     [
+        (
+            ArrowDtype(pa.list_(pa.string())) if ArrowDtype else None,
+            1,
+            ["1"],
+        ),
+        (pa.list_(pa.string()), 1, ["1"]),
         (
             ArrowDtype(pa.map_(pa.int32(), pa.string())) if ArrowDtype else None,
             1,
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from numbers import Number
 
 from ...tensor import tensor as astensor
@@ -463,6 +463,9 @@ def _generate_value(dtype, fill_value):
     if ArrowDtype and isinstance(dtype, pd.ArrowDtype):
         return _generate_value(dtype.pyarrow_dtype, fill_value)
 
+    if isinstance(dtype, pa.ListType):
+        return [_generate_value(dtype.value_type, fill_value)]
+
     if isinstance(dtype, pa.MapType):
         return [
             (
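This _generate_value branch is what the two new test_dtypes.py cases above exercise: for an Arrow list type, the fill value is generated for the element type and wrapped in a one-item list. A self-contained re-statement of the recursion (simplified; the real helper handles more dtypes):

import pyarrow as pa

def generate_value(dtype, fill_value):
    # Simplified sketch of maxframe's _generate_value recursion.
    if isinstance(dtype, pa.ListType):
        # New in 1.3.1: recurse into the element type.
        return [generate_value(dtype.value_type, fill_value)]
    if isinstance(dtype, pa.MapType):
        return [(generate_value(dtype.key_type, fill_value),
                 generate_value(dtype.item_type, fill_value))]
    if pa.types.is_string(dtype):
        return str(fill_value)
    return fill_value

assert generate_value(pa.list_(pa.string()), 1) == ["1"]  # matches the new test case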
@@ -28,6 +28,7 @@ from ...protocol import DataFrameTableMeta
 from ...tensor.core import TENSOR_TYPE
 
 _TEMP_TABLE_PREFIX = "tmp_mf_"
+DEFAULT_SINGLE_INDEX_NAME = "_idx_0"
 
 _arrow_to_odps_types = {
     pa.string(): odps_types.string,
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from . import graph, llm, pytorch
+from . import graph, llm, models, pytorch
 
-del graph
-del llm
-del pytorch
+del graph, llm, models, pytorch
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from . import models, multi_modal, text
 
 del models
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from typing import Any, Dict
 
 import numpy as np
@@ -19,6 +20,8 @@ import pandas as pd
 from ....core.entity.output_types import OutputType
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
+from ....dataframe.core import SERIES_TYPE
+from ....dataframe.operators import DataFrameOperatorMixin
 from ....dataframe.utils import parse_index
 from ....serialization.serializables.core import Serializable
 from ....serialization.serializables.field import AnyField, DictField, StringField
@@ -31,24 +34,42 @@ class LLM(Serializable):
     pass
 
 
-class LLMOperator(Operator, TileableOperatorMixin):
+class LLMTaskOperator(Operator, DataFrameOperatorMixin):
+    task = AnyField("task", default=None)
     model = AnyField("model", default=None)
-    prompt_template = AnyField("prompt_template", default=None)
     params = DictField("params", default=None)
+    running_options: Dict[str, Any] = DictField("running_options", default=None)
 
     def __init__(self, output_types=None, **kw):
         if output_types is None:
             output_types = [OutputType.dataframe]
         super().__init__(_output_types=output_types, **kw)
 
-    def __call__(self, data):
-        col_names = ["response", "success"]
-        columns = parse_index(pd.Index(col_names), store_data=True)
-        out_dtypes = pd.Series([np.dtype("O"), np.dtype("bool")], index=col_names)
-        return self.new_tileable(
+    def get_output_dtypes(self) -> Dict[str, np.dtype]:
+        raise NotImplementedError
+
+    def __call__(self, data, index=None):
+        outputs = self.get_output_dtypes()
+        col_name = list(outputs.keys())
+        columns = parse_index(pd.Index(col_name), store_data=True)
+        out_dtypes = pd.Series(list(outputs.values()), index=col_name)
+        index_value = index or (
+            parse_index(pd.RangeIndex(-1), data)
+            if isinstance(data, SERIES_TYPE)
+            else data.index_value
+        )
+
+        return self.new_dataframe(
             inputs=[data],
-            dtypes=out_dtypes,
-            shape=(data.shape[0], len(col_names)),
-            index_value=data.index_value,
+            shape=(np.nan, len(col_name)),
+            index_value=index_value,
             columns_value=columns,
+            dtypes=out_dtypes,
         )
+
+
+class LLMTextGenOperator(LLMTaskOperator, TileableOperatorMixin):
+    prompt_template = AnyField("prompt_template", default=None)
+
+    def get_output_dtypes(self) -> Dict[str, np.dtype]:
+        return {"response": np.dtype("O"), "success": np.dtype("bool")}
@@ -11,4 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from .dashscope import DashScopeMultiModalLLM, DashScopeTextLLM
@@ -11,12 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from typing import Any, Dict
 
 from ..... import opcodes
 from .....serialization.serializables.core import Serializable
 from .....serialization.serializables.field import StringField
-from ..core import LLMOperator
+from ..core import LLMTextGenOperator
 from ..multi_modal import MultiModalLLM
 from ..text import TextLLM
 
@@ -33,8 +34,25 @@ class DashScopeLLMMixin(Serializable):
 
 
 class DashScopeTextLLM(TextLLM, DashScopeLLMMixin):
+    """
+    DashScope text LLM.
+    """
+
     api_key_resource = StringField("api_key_resource", default=None)
 
+    def __init__(self, name: str, api_key_resource: str):
+        """
+        Initialize a DashScope text LLM.
+
+        Parameters
+        ----------
+        name : str
+            The LLM name to use, check DashScope for `available models <https://help.aliyun.com/zh/model-studio/getting-started/models>`_.
+        api_key_resource : str
+            The MaxCompute resource file name containing the DashScope API key.
+        """
+        super().__init__(name=name, api_key_resource=api_key_resource)
+
     def generate(
         self,
         data,
@@ -49,8 +67,25 @@ class DashScopeTextLLM(TextLLM, DashScopeLLMMixin):
 
 
 class DashScopeMultiModalLLM(MultiModalLLM, DashScopeLLMMixin):
+    """
+    DashScope multi-modal LLM.
+    """
+
     api_key_resource = StringField("api_key_resource", default=None)
 
+    def __init__(self, name: str, api_key_resource: str):
+        """
+        Initialize a DashScope multi-modal LLM.
+
+        Parameters
+        ----------
+        name : str
+            The LLM name to use, check DashScope for `available models <https://help.aliyun.com/zh/model-studio/getting-started/models>`_.
+        api_key_resource : str
+            The MaxCompute resource file name containing the DashScope API key.
+        """
+        super().__init__(name=name, api_key_resource=api_key_resource)
+
     def generate(
         self,
         data,
@@ -65,9 +100,9 @@ class DashScopeMultiModalLLM(MultiModalLLM, DashScopeLLMMixin):
         )(data)
 
 
-class DashScopeTextGenerationOperator(LLMOperator):
+class DashScopeTextGenerationOperator(LLMTextGenOperator):
     _op_type_ = opcodes.DASHSCOPE_TEXT_GENERATION
 
 
-class DashScopeMultiModalGenerationOperator(LLMOperator):
+class DashScopeMultiModalGenerationOperator(LLMTextGenOperator):
     _op_type_ = opcodes.DASHSCOPE_MULTI_MODAL_GENERATION
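With the new explicit constructors, wiring up a DashScope-backed model is self-documenting. A hedged usage sketch (the model and resource names are placeholders, and since generate's full parameter list is truncated in this diff, the call shape is borrowed from the managed-LLM signature below):

import maxframe.dataframe as md
from maxframe.learn.contrib.llm.models import DashScopeTextLLM

llm = DashScopeTextLLM(
    name="qwen-plus",                  # placeholder model name
    api_key_resource="dashscope_key",  # placeholder MaxCompute resource file
)

df = md.DataFrame({"query": ["hello", "what is MaxFrame?"]})
# Assumed message-template format; verify against maxframe's LLM docs.
result = llm.generate(df, prompt_template=[{"role": "user", "content": "{query}"}])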
@@ -0,0 +1,54 @@
+# Copyright 1999-2025 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List
+
+from ..... import opcodes
+from .....serialization.serializables import StringField
+from ..core import LLMTextGenOperator
+from ..text import TextLLM
+
+
+class ManagedLLMTextGenOperator(LLMTextGenOperator):
+    _op_type_ = opcodes.MANAGED_TEXT_MODAL_GENERATION
+
+    inference_framework: str = StringField("inference_framework", default=None)
+
+
+class ManagedTextLLM(TextLLM):
+    """
+    Managed text LLM by MaxFrame.
+    """
+
+    def __init__(self, name: str):
+        """
+        Initialize a managed text LLM.
+
+        Parameters
+        ----------
+        name : str
+            The managed text LLM name to use.
+        """
+        super().__init__(name=name)
+
+    def generate(
+        self,
+        data,
+        prompt_template: List[Dict[str, Any]],
+        params: Dict[str, Any] = None,
+        **kw
+    ):
+        return ManagedLLMTextGenOperator(
+            model=self, prompt_template=prompt_template, params=params, **kw
+        )(data)
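
ManagedTextLLM rounds out the new model family: generation runs against a MaxFrame-managed model, routed through the new MANAGED_TEXT_MODAL_GENERATION opcode, with the generate(data, prompt_template, params, **kw) contract shown above. A brief usage sketch (the model name and template content are placeholders):

import maxframe.dataframe as md
from maxframe.learn.contrib.llm.models.managed import ManagedTextLLM  # path per this diff

llm = ManagedTextLLM(name="some-managed-model")  # placeholder name

df = md.DataFrame({"question": ["What is MaxCompute?"]})
result = llm.generate(
    df,
    prompt_template=[{"role": "user", "content": "{question}"}],  # assumed format
    params={"temperature": 0.7},  # assumed generation parameters
)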