maxframe 1.2.1__cp310-cp310-win32.whl → 1.3.1__cp310-cp310-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maxframe/_utils.cp310-win32.pyd +0 -0
- maxframe/codegen.py +70 -21
- maxframe/config/config.py +6 -0
- maxframe/core/accessor.py +1 -0
- maxframe/core/graph/core.cp310-win32.pyd +0 -0
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/dict_/accessor.py +1 -0
- maxframe/dataframe/accessors/dict_/length.py +1 -0
- maxframe/dataframe/accessors/dict_/setitem.py +1 -0
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
- maxframe/dataframe/accessors/list_/__init__.py +37 -0
- maxframe/dataframe/accessors/list_/accessor.py +39 -0
- maxframe/dataframe/accessors/list_/getitem.py +135 -0
- maxframe/dataframe/accessors/list_/length.py +73 -0
- maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
- maxframe/dataframe/accessors/plotting/__init__.py +2 -0
- maxframe/dataframe/accessors/string_/__init__.py +1 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/accessor.py +1 -0
- maxframe/dataframe/extensions/apply_chunk.py +34 -21
- maxframe/dataframe/extensions/flatmap.py +8 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +53 -1
- maxframe/dataframe/merge/concat.py +7 -4
- maxframe/dataframe/merge/merge.py +1 -0
- maxframe/dataframe/merge/tests/test_merge.py +97 -47
- maxframe/dataframe/missing/tests/test_missing.py +1 -0
- maxframe/dataframe/reduction/aggregation.py +63 -0
- maxframe/dataframe/reduction/core.py +17 -5
- maxframe/dataframe/tests/test_utils.py +7 -0
- maxframe/dataframe/ufunc/ufunc.py +1 -0
- maxframe/dataframe/utils.py +3 -0
- maxframe/io/odpsio/schema.py +1 -0
- maxframe/learn/contrib/__init__.py +2 -4
- maxframe/learn/contrib/llm/__init__.py +1 -0
- maxframe/learn/contrib/llm/core.py +31 -10
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +38 -3
- maxframe/learn/contrib/llm/models/managed.py +54 -0
- maxframe/learn/contrib/llm/multi_modal.py +93 -0
- maxframe/learn/contrib/llm/text.py +268 -8
- maxframe/learn/contrib/models.py +77 -0
- maxframe/learn/contrib/utils.py +1 -0
- maxframe/learn/contrib/xgboost/__init__.py +8 -1
- maxframe/learn/contrib/xgboost/classifier.py +15 -4
- maxframe/learn/contrib/xgboost/core.py +108 -1
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
- maxframe/learn/contrib/xgboost/predict.py +6 -3
- maxframe/learn/contrib/xgboost/regressor.py +15 -1
- maxframe/learn/contrib/xgboost/train.py +5 -4
- maxframe/lib/dtypes_extension/__init__.py +2 -1
- maxframe/lib/dtypes_extension/dtypes.py +21 -0
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +13 -3
- maxframe/lib/mmh3.cp310-win32.pyd +0 -0
- maxframe/opcodes.py +19 -0
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp310-win32.pyd +0 -0
- maxframe/serialization/core.pyx +12 -1
- maxframe/serialization/numpy.py +12 -4
- maxframe/serialization/serializables/tests/test_serializable.py +13 -2
- maxframe/serialization/tests/test_serial.py +2 -0
- maxframe/tensor/merge/concatenate.py +1 -0
- maxframe/tensor/misc/unique.py +11 -10
- maxframe/tensor/reshape/reshape.py +4 -1
- maxframe/utils.py +4 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/METADATA +3 -2
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/RECORD +73 -65
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/WHEEL +1 -1
- maxframe_client/session/odps.py +3 -0
- maxframe_client/session/tests/test_task.py +1 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/top_level.txt +0 -0
maxframe/dataframe/merge/tests/test_merge.py
CHANGED
@@ -16,10 +16,10 @@ import numpy as np
 import pandas as pd
 import pytest
 
+from .... import dataframe as md
 from ....tests.utils import assert_mf_index_dtype
 from ...core import IndexValue
-from
-from .. import DataFrameMerge, concat
+from .. import DataFrameMerge
 from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint
 
 
@@ -29,8 +29,8 @@ def test_merge():
     )
     df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
 
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=3)
+    mdf1 = md.DataFrame(df1, chunk_size=2)
+    mdf2 = md.DataFrame(df2, chunk_size=3)
 
     mapjoin = MapJoinHint()
     dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
@@ -83,8 +83,8 @@ def test_merge_invalid_parameters():
     )
     pdf2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
 
-    df1 = from_pandas(pdf1, chunk_size=2)
-    df2 = from_pandas(pdf2, chunk_size=3)
+    df1 = md.DataFrame(pdf1, chunk_size=2)
+    df2 = md.DataFrame(pdf2, chunk_size=3)
 
     with pytest.raises(ValueError):
         df1.merge(df2, bloom_filter="wrong")
@@ -104,8 +104,8 @@ def test_join():
     df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], index=["a1", "b2", "b3"]) + 1
     df2 = pd.concat([df2, df2 + 1])
 
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=2)
+    mdf1 = md.DataFrame(df1, chunk_size=2)
+    mdf2 = md.DataFrame(df2, chunk_size=2)
 
     parameters = [
         {"lsuffix": "l_", "rsuffix": "r_"},
@@ -132,8 +132,8 @@ def test_join_on():
     )
     df2 = pd.concat([df2, df2 + 1])
 
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=2)
+    mdf1 = md.DataFrame(df1, chunk_size=2)
+    mdf2 = md.DataFrame(df2, chunk_size=2)
 
     parameters = [
         {"lsuffix": "l_", "rsuffix": "r_"},
@@ -157,15 +157,15 @@ def test_append():
     df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
     df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
 
-    mdf1 = from_pandas(df1, chunk_size=3)
-    mdf2 = from_pandas(df2, chunk_size=3)
+    mdf1 = md.DataFrame(df1, chunk_size=3)
+    mdf2 = md.DataFrame(df2, chunk_size=3)
     adf = mdf1.append(mdf2)
 
     assert adf.shape == (20, 4)
     assert_mf_index_dtype(adf.index_value.value, np.int64)
 
-    mdf1 = from_pandas(df1, chunk_size=3)
-    mdf2 = from_pandas(df2, chunk_size=3)
+    mdf1 = md.DataFrame(df1, chunk_size=3)
+    mdf2 = md.DataFrame(df2, chunk_size=3)
     adf = mdf1.append(mdf2, ignore_index=True)
 
     assert adf.shape == (20, 4)
@@ -173,84 +173,135 @@ def test_append():
     pd.testing.assert_index_equal(adf.index_value.to_pandas(), pd.RangeIndex(20))
 
 
-def test_concat():
+def test_concat_dataframe():
+    # test index concatenate
     df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
     df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
 
-    mdf1 = from_pandas(df1, chunk_size=4)
-    mdf2 = from_pandas(df2, chunk_size=4)
-    r = concat([mdf1, mdf2], axis="index")
+    mdf1 = md.DataFrame(df1, chunk_size=4)
+    mdf2 = md.DataFrame(df2, chunk_size=4)
+    r = md.concat([mdf1, mdf2], axis="index")
 
     assert r.shape == (20, 4)
     assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
-    pd.testing.assert_series_equal(r.dtypes,
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
 
-
-
+    # test index concatenate with range index
+    mdf3 = md.DataFrame(
+        np.random.rand(10, 4),
+        columns=list("ABCD"),
+        index=pd.RangeIndex(10, 20),
+        chunk_size=4,
     )
-
-    mdf3 = from_pandas(df3, chunk_size=4)
-    r = concat([mdf1, mdf3], axis="index")
+    r = md.concat([mdf1, mdf3], axis="index")
 
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes,
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))
 
+    # test index concatenate with perm index
     df4 = pd.DataFrame(
         np.random.rand(10, 4),
         columns=list("ABCD"),
         index=np.random.permutation(np.arange(10)),
     )
 
-
-
+    # test concat with same index with different sources
+    mdf4 = md.DataFrame(df4, chunk_size=4)
+    r = md.concat([mdf1, mdf4], axis="index")
 
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes,
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(
         r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
 
-    r = concat([mdf4, mdf1], axis="index")
+    r = md.concat([mdf4, mdf1], axis="index")
 
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes,
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
    pd.testing.assert_index_equal(
         r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
 
-
+    # test concat with same index with same source
+    r = md.concat([mdf4, mdf4], axis="index")
 
     assert r.shape == (20, 4)
-    pd.testing.assert_series_equal(r.dtypes,
+    pd.testing.assert_series_equal(r.dtypes, mdf1.dtypes)
     pd.testing.assert_index_equal(
         r.index_value.to_pandas(), pd.Index([], dtype=np.int64)
     )
 
-
-
-
+    # test concat with column outer join
+    mdf1 = md.DataFrame(df1, chunk_size=3)
+    mdf2 = md.DataFrame(df2, chunk_size=4)
+    r = md.concat([mdf1, mdf2], axis="columns")
 
     assert r.shape == (10, 8)
     expected_dtypes = pd.concat([df1, df2], axis="columns").dtypes
     pd.testing.assert_series_equal(r.dtypes, expected_dtypes)
 
-
-
-
-
-    r = concat([mdf1, mdf2], join="inner")
+    # test concat with column inner join
+    mdf1 = md.DataFrame(np.random.rand(10, 4), columns=list("ABCD"), chunk_size=3)
+    mdf2 = md.DataFrame(np.random.rand(10, 3), columns=list("ABC"), chunk_size=3)
+    r = md.concat([mdf1, mdf2], join="inner")
     assert r.shape == (20, 3)
 
+    # test concat with ignore index
+    r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
+    assert r.shape == (20, 3)
+    pd.testing.assert_index_equal(r.index_value.to_pandas(), pd.RangeIndex(20))
+
+    # test concat with unknown shapes
+    mdf1._shape = (np.nan, 4)
+    r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
+    np.testing.assert_array_equal(np.array(r.shape), np.array((np.nan, 3)))
+    r = md.concat([mdf1, mdf2], join="inner", ignore_index=True)
+    np.testing.assert_array_equal(np.array(r.shape), np.array((np.nan, 3)))
+
+    # test concat with empty frames
+    r = md.concat([md.DataFrame([]), mdf2], ignore_index=True)
+    assert r.shape == (10, 3)
+
+
+def test_concat_series():
+    # test row concat
+    ms1 = md.Series(np.random.rand(10))
+    ms2 = md.Series(np.random.rand(10))
+    r = md.concat([ms1, ms2])
+    assert r.shape == (20,)
+
+    # test row concat with unknown shape
+    ms1._shape = (np.nan,)
+    r = md.concat([ms1, ms2])
+    assert np.isnan(r.shape[0])
+    r = md.concat([ms1, ms2], ignore_index=True)
+    assert np.isnan(r.shape[0])
+
+    # test col concat
+    ms1 = md.Series(np.random.rand(10))
+    ms2 = md.Series(np.random.rand(10))
+    r = md.concat([ms1, ms2], axis=1)
+    assert r.shape == (10, 2)
+
+    # test col concat with names
+    ms1.name = "col1"
+    ms2.name = "col2"
+    r = md.concat([ms1, ms2], axis=1)
+    assert r.shape == (10, 2)
+    assert r.dtypes.index.tolist() == ["col1", "col2"]
+
 
 def test_invalid_join_hint():
-
-        np.arange(20).reshape((4, 5)) + 1,
+    mdf1 = md.DataFrame(
+        np.arange(20).reshape((4, 5)) + 1,
+        columns=["a", "b", "c", "d", "e"],
+        chunk_size=2,
+    )
+    mdf2 = md.DataFrame(
+        np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"], chunk_size=3
     )
-    df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
-
-    mdf1 = from_pandas(df1, chunk_size=2)
-    mdf2 = from_pandas(df2, chunk_size=3)
 
     # type error
     parameters = [
@@ -282,7 +333,6 @@ def test_invalid_join_hint():
     ]
 
     for kw in parameters:
-        print(kw)
         with pytest.raises(TypeError):
             mdf1.merge(mdf2, **kw)
 
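
The test changes above all track the same API shift in 1.3.1: test code now builds MaxFrame objects through the public md.DataFrame initializer and the top-level md.concat instead of the internal from_pandas helper and the concat imported from the merge package. A minimal sketch of the new-style usage, assuming a configured MaxFrame environment (session setup is not shown in this diff):

    import numpy as np
    import pandas as pd
    import maxframe.dataframe as md

    # Build MaxFrame DataFrames directly from pandas objects; chunk_size
    # controls partitioning, mirroring the tests above.
    df1 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
    df2 = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))
    mdf1 = md.DataFrame(df1, chunk_size=4)
    mdf2 = md.DataFrame(df2, chunk_size=4)

    # Row-wise concatenation through the public top-level concat.
    r = md.concat([mdf1, mdf2], axis="index")
    assert r.shape == (20, 4)  # shape is already known before execution here
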
maxframe/dataframe/reduction/aggregation.py
CHANGED
@@ -315,6 +315,69 @@ def compile_reduction_funcs(op: DataFrameAggregate, input: TileableType):
 
 
 def aggregate(df, func=None, axis=0, **kw):
+    """
+    Aggregate using one or more operations over the specified axis.
+
+    Parameters
+    ----------
+    df : DataFrame, Series
+        Object to aggregate.
+    func : list or dict
+        Function to use for aggregating the data.
+    axis : {0 or 'index', 1 or 'columns'}, default 0
+        If 0 or 'index': apply function to each column. If 1 or 'columns': apply function to each row.
+    kw
+        Keyword arguments to pass to func.
+
+    Returns
+    -------
+    scalar, Series or DataFrame
+        The return can be:
+
+        * scalar : when Series.agg is called with single function
+        * Series : when DataFrame.agg is called with a single function
+        * DataFrame : when DataFrame.agg is called with several functions
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> df = md.DataFrame([[1, 2, 3],
+    ...                    [4, 5, 6],
+    ...                    [7, 8, 9],
+    ...                    [np.nan, np.nan, np.nan]],
+    ...                   columns=['A', 'B', 'C']).execute()
+
+    Aggregate these functions over the rows.
+
+    >>> df.agg(['sum', 'min']).execute()
+            A     B     C
+    min   1.0   2.0   3.0
+    sum  12.0  15.0  18.0
+
+    Different aggregations per column.
+
+    >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}).execute()
+            A    B
+    max   NaN  8.0
+    min   1.0  2.0
+    sum  12.0  NaN
+
+    Aggregate different functions over the columns and rename the index of the resulting DataFrame.
+
+    >>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean')).execute()
+         A    B    C
+    x  7.0  NaN  NaN
+    y  NaN  2.0  NaN
+    z  NaN  NaN  6.0
+
+    >>> s = md.Series([1, 2, 3, 4])
+    >>> s.agg('min').execute()
+    1
+
+    >>> s.agg(['min', 'max']).execute()
+    max    4
+    min    1
+    """
     axis = validate_axis(axis, df)
     if (
         df.ndim == 2
maxframe/dataframe/reduction/core.py
CHANGED
@@ -404,6 +404,7 @@ class ReductionPostStep(NamedTuple):
     func_name: str
     columns: Optional[List[str]]
     func_idl: bytes
+    post_func_aliases: Optional[List[str]] = None
 
 
 class ReductionSteps(NamedTuple):
@@ -462,6 +463,7 @@ class ReductionCompiler:
         self._output_key_to_agg_steps = dict()
         self._output_key_to_post_steps = dict()
         self._output_key_to_post_cols = dict()
+        self._output_key_to_col_func_mapping = dict()
 
     @classmethod
     def _check_function_valid(cls, func):
@@ -531,6 +533,14 @@ class ReductionCompiler:
         self._output_key_to_post_steps[step.output_key] = step
         self._update_col_dict(self._output_key_to_post_cols, step.output_key, cols)
 
+        if cols is not None:
+            col_name_map = (
+                self._output_key_to_col_func_mapping.get(step.output_key) or {}
+            )
+            for col in cols:
+                col_name_map[col] = func_name
+            self._output_key_to_col_func_mapping[step.output_key] = col_name_map
+
     @staticmethod
     def _build_mock_return_object(func, input_dtype, ndim):
         from ..initializer import DataFrame as MaxDataFrame
@@ -812,11 +822,12 @@ class ReductionCompiler:
         agg_funcs.append(step)
 
         for key, step in self._output_key_to_post_steps.items():
-
-
-
-
-
+            post_cols = self._output_key_to_post_cols[key]
+            func_renames = None
+            if post_cols:
+                col_map = self._output_key_to_col_func_mapping.get(key)
+                if col_map:
+                    func_renames = [col_map[c] for c in post_cols]
 
             func_name = step.func_name
             if self._lambda_counter == 1 and step.func_name == "<lambda_0>":
@@ -831,6 +842,7 @@ class ReductionCompiler:
                     func_name,
                     post_cols,
                     step.func_idl,
+                    func_renames,
                 )
             )
 
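
The core.py changes add per-column bookkeeping so the compiler can report which aggregation function produced each output column (the new post_func_aliases field and the func_renames list). A standalone sketch of that bookkeeping, using simplified, hypothetical names rather than maxframe's actual compiler classes:

    # output key -> {column -> function name}
    col_func_mapping = {}

    def record_post_step(output_key, func_name, cols):
        # Mirrors the added block: remember which function produced each column.
        if cols is not None:
            col_map = col_func_mapping.get(output_key) or {}
            for col in cols:
                col_map[col] = func_name
            col_func_mapping[output_key] = col_map

    record_post_step("k1", "sum", ["a", "b"])
    record_post_step("k1", "mean", ["c"])

    # Mirrors the func_renames computation: per-column function aliases,
    # in output-column order.
    post_cols = ["a", "b", "c"]
    col_map = col_func_mapping.get("k1")
    func_renames = [col_map[c] for c in post_cols] if col_map else None
    assert func_renames == ["sum", "sum", "mean"]
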
maxframe/dataframe/tests/test_utils.py
CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -71,6 +72,12 @@ def test_pack_function(df1):
 @pytest.mark.parametrize(
     "dtype, fill_value, expected",
     [
+        (
+            ArrowDtype(pa.list_(pa.string())) if ArrowDtype else None,
+            1,
+            ["1"],
+        ),
+        (pa.list_(pa.string()), 1, ["1"]),
         (
             ArrowDtype(pa.map_(pa.int32(), pa.string())) if ArrowDtype else None,
             1,
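
The new parametrized cases cover Arrow list types both as a bare pyarrow type and wrapped in pandas' ArrowDtype (the ArrowDtype entry degrades to None when the installed pandas lacks it). A small, self-contained illustration of the dtype involved, assuming a pandas version with ArrowDtype support:

    import pandas as pd
    import pyarrow as pa

    # A Series backed by an Arrow list-of-string type, the dtype the new
    # test cases feed into _generate_value.
    dtype = pd.ArrowDtype(pa.list_(pa.string()))
    s = pd.Series([["1"], ["a", "b"]], dtype=dtype)
    print(s.dtype)  # list<item: string>[pyarrow]
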
maxframe/dataframe/ufunc/ufunc.py
CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from numbers import Number
 
 from ...tensor import tensor as astensor
maxframe/dataframe/utils.py
CHANGED
@@ -463,6 +463,9 @@ def _generate_value(dtype, fill_value):
     if ArrowDtype and isinstance(dtype, pd.ArrowDtype):
         return _generate_value(dtype.pyarrow_dtype, fill_value)
 
+    if isinstance(dtype, pa.ListType):
+        return [_generate_value(dtype.value_type, fill_value)]
+
     if isinstance(dtype, pa.MapType):
         return [
             (
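
The new pa.ListType branch makes fill-value generation recurse into the list's element type, so a list-of-string dtype with fill value 1 yields ["1"], matching the test expectations above. A trimmed, standalone sketch of the recursion (the hypothetical generate_value stands in for the private _generate_value, and only the types involved here are handled):

    import pyarrow as pa

    def generate_value(dtype, fill_value):
        # A list type produces a one-element list of a value generated
        # for its element type.
        if isinstance(dtype, pa.ListType):
            return [generate_value(dtype.value_type, fill_value)]
        if dtype == pa.string():
            return str(fill_value)
        return fill_value

    assert generate_value(pa.list_(pa.string()), 1) == ["1"]
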
maxframe/learn/contrib/__init__.py
CHANGED
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from . import graph, llm, pytorch
+from . import graph, llm, models, pytorch
 
-del graph
-del llm
-del pytorch
+del graph, llm, models, pytorch
maxframe/learn/contrib/llm/core.py
CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from typing import Any, Dict
 
 import numpy as np
@@ -19,6 +20,8 @@ import pandas as pd
 from ....core.entity.output_types import OutputType
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
+from ....dataframe.core import SERIES_TYPE
+from ....dataframe.operators import DataFrameOperatorMixin
 from ....dataframe.utils import parse_index
 from ....serialization.serializables.core import Serializable
 from ....serialization.serializables.field import AnyField, DictField, StringField
@@ -31,24 +34,42 @@ class LLM(Serializable):
     pass
 
 
-class
+class LLMTaskOperator(Operator, DataFrameOperatorMixin):
+    task = AnyField("task", default=None)
     model = AnyField("model", default=None)
-    prompt_template = AnyField("prompt_template", default=None)
     params = DictField("params", default=None)
+    running_options: Dict[str, Any] = DictField("running_options", default=None)
 
     def __init__(self, output_types=None, **kw):
         if output_types is None:
             output_types = [OutputType.dataframe]
         super().__init__(_output_types=output_types, **kw)
 
-    def
-
-
-
-
+    def get_output_dtypes(self) -> Dict[str, np.dtype]:
+        raise NotImplementedError
+
+    def __call__(self, data, index=None):
+        outputs = self.get_output_dtypes()
+        col_name = list(outputs.keys())
+        columns = parse_index(pd.Index(col_name), store_data=True)
+        out_dtypes = pd.Series(list(outputs.values()), index=col_name)
+        index_value = index or (
+            parse_index(pd.RangeIndex(-1), data)
+            if isinstance(data, SERIES_TYPE)
+            else data.index_value
+        )
+
+        return self.new_dataframe(
             inputs=[data],
-
-
-            index_value=data.index_value,
+            shape=(np.nan, len(col_name)),
+            index_value=index_value,
             columns_value=columns,
+            dtypes=out_dtypes,
         )
+
+
+class LLMTextGenOperator(LLMTaskOperator, TileableOperatorMixin):
+    prompt_template = AnyField("prompt_template", default=None)
+
+    def get_output_dtypes(self) -> Dict[str, np.dtype]:
+        return {"response": np.dtype("O"), "success": np.dtype("bool")}
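
The reworked operator base derives the output DataFrame's schema from the subclass's get_output_dtypes(): LLMTextGenOperator declares an object-typed "response" column and a boolean "success" column, while the row count stays unknown (np.nan) until execution. A plain-pandas sketch of the schema this produces, assuming only the two-column contract shown above:

    import numpy as np
    import pandas as pd

    # The dtypes LLMTextGenOperator declares for its output DataFrame.
    outputs = {"response": np.dtype("O"), "success": np.dtype("bool")}
    out_dtypes = pd.Series(list(outputs.values()), index=list(outputs.keys()))
    print(out_dtypes)
    # response    object
    # success       bool
    # dtype: object
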
maxframe/learn/contrib/llm/models/dashscope.py
CHANGED
@@ -11,12 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from typing import Any, Dict
 
 from ..... import opcodes
 from .....serialization.serializables.core import Serializable
 from .....serialization.serializables.field import StringField
-from ..core import
+from ..core import LLMTextGenOperator
 from ..multi_modal import MultiModalLLM
 from ..text import TextLLM
 
@@ -33,8 +34,25 @@ class DashScopeLLMMixin(Serializable):
 
 
 class DashScopeTextLLM(TextLLM, DashScopeLLMMixin):
+    """
+    DashScope text LLM.
+    """
+
     api_key_resource = StringField("api_key_resource", default=None)
 
+    def __init__(self, name: str, api_key_resource: str):
+        """
+        Initialize a DashScope text LLM.
+
+        Parameters
+        ----------
+        name : str
+            The LLM name to use, check DashScope for `available models <https://help.aliyun.com/zh/model-studio/getting-started/models>`_.
+        api_key_resource : str
+            The MaxCompute resource file name containing the DashScope API key.
+        """
+        super().__init__(name=name, api_key_resource=api_key_resource)
+
     def generate(
         self,
         data,
@@ -49,8 +67,25 @@ class DashScopeTextLLM(TextLLM, DashScopeLLMMixin):
 
 
 class DashScopeMultiModalLLM(MultiModalLLM, DashScopeLLMMixin):
+    """
+    DashScope multi-modal LLM.
+    """
+
     api_key_resource = StringField("api_key_resource", default=None)
 
+    def __init__(self, name: str, api_key_resource: str):
+        """
+        Initialize a DashScope multi-modal LLM.
+
+        Parameters
+        ----------
+        name : str
+            The LLM name to use, check DashScope for `available models <https://help.aliyun.com/zh/model-studio/getting-started/models>`_.
+        api_key_resource : str
+            The MaxCompute resource file name containing the DashScope API key.
+        """
+        super().__init__(name=name, api_key_resource=api_key_resource)
+
     def generate(
         self,
         data,
@@ -65,9 +100,9 @@ class DashScopeMultiModalLLM(MultiModalLLM, DashScopeLLMMixin):
         )(data)
 
 
-class DashScopeTextGenerationOperator(
+class DashScopeTextGenerationOperator(LLMTextGenOperator):
     _op_type_ = opcodes.DASHSCOPE_TEXT_GENERATION
 
 
-class DashScopeMultiModalGenerationOperator(
+class DashScopeMultiModalGenerationOperator(LLMTextGenOperator):
     _op_type_ = opcodes.DASHSCOPE_MULTI_MODAL_GENERATION
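
Based on the constructor and generate signatures in this diff, DashScope-backed generation presumably looks like the sketch below. The model name, the input column, and the "{question}" placeholder syntax in the prompt template are illustrative assumptions, not confirmed by the diff; only the name/api_key_resource parameters and the message-list shape of prompt_template are shown above.

    import maxframe.dataframe as md
    from maxframe.learn.contrib.llm.models.dashscope import DashScopeTextLLM

    llm = DashScopeTextLLM(
        name="qwen-plus",                      # assumed model name; see DashScope docs
        api_key_resource="dashscope_api_key",  # MaxCompute resource holding the API key
    )

    df = md.DataFrame({"question": ["What is MaxCompute?"]})
    result = llm.generate(
        df,
        prompt_template=[{"role": "user", "content": "{question}"}],
    )
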
maxframe/learn/contrib/llm/models/managed.py
ADDED
@@ -0,0 +1,54 @@
+# Copyright 1999-2025 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List
+
+from ..... import opcodes
+from .....serialization.serializables import StringField
+from ..core import LLMTextGenOperator
+from ..text import TextLLM
+
+
+class ManagedLLMTextGenOperator(LLMTextGenOperator):
+    _op_type_ = opcodes.MANAGED_TEXT_MODAL_GENERATION
+
+    inference_framework: str = StringField("inference_framework", default=None)
+
+
+class ManagedTextLLM(TextLLM):
+    """
+    Managed text LLM by MaxFrame.
+    """
+
+    def __init__(self, name: str):
+        """
+        Initialize a managed text LLM.
+
+        Parameters
+        ----------
+        name : str
+            The managed text LLM name to use.
+        """
+        super().__init__(name=name)
+
+    def generate(
+        self,
+        data,
+        prompt_template: List[Dict[str, Any]],
+        params: Dict[str, Any] = None,
+        **kw
+    ):
+        return ManagedLLMTextGenOperator(
+            model=self, prompt_template=prompt_template, params=params, **kw
+        )(data)
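
The new managed.py wires ManagedTextLLM.generate to ManagedLLMTextGenOperator, which inherits the "response"/"success" output schema from LLMTextGenOperator. A hedged usage sketch based only on the signatures above; the model name and the params contents are placeholders:

    import maxframe.dataframe as md
    from maxframe.learn.contrib.llm.models.managed import ManagedTextLLM

    llm = ManagedTextLLM(name="my-managed-model")  # placeholder name

    df = md.DataFrame({"text": ["Summarize MaxFrame in one sentence."]})
    result = llm.generate(
        df,
        prompt_template=[{"role": "user", "content": "{text}"}],
        params={"temperature": 0.7},  # assumed pass-through generation params
    )
    # result is a MaxFrame DataFrame with "response" and "success" columns,
    # per LLMTextGenOperator.get_output_dtypes.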