maxframe 2.2.0__cp39-cp39-win_amd64.whl → 2.3.0rc1__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp39-win_amd64.pyd +0 -0
- maxframe/codegen/core.py +3 -2
- maxframe/codegen/spe/dataframe/merge.py +4 -0
- maxframe/codegen/spe/dataframe/misc.py +2 -0
- maxframe/codegen/spe/dataframe/reduction.py +18 -0
- maxframe/codegen/spe/dataframe/sort.py +9 -1
- maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
- maxframe/codegen/spe/dataframe/tseries.py +9 -0
- maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
- maxframe/codegen/spe/tensor/datasource.py +1 -0
- maxframe/config/config.py +3 -0
- maxframe/conftest.py +10 -0
- maxframe/core/base.py +2 -1
- maxframe/core/entity/tileables.py +2 -0
- maxframe/core/graph/core.cp39-win_amd64.pyd +0 -0
- maxframe/core/graph/entity.py +7 -1
- maxframe/core/mode.py +6 -1
- maxframe/dataframe/__init__.py +2 -2
- maxframe/dataframe/arithmetic/__init__.py +4 -0
- maxframe/dataframe/arithmetic/maximum.py +33 -0
- maxframe/dataframe/arithmetic/minimum.py +33 -0
- maxframe/dataframe/core.py +98 -106
- maxframe/dataframe/datasource/core.py +6 -0
- maxframe/dataframe/datasource/direct.py +57 -0
- maxframe/dataframe/datasource/read_csv.py +19 -11
- maxframe/dataframe/datasource/read_odps_query.py +29 -6
- maxframe/dataframe/datasource/read_odps_table.py +32 -10
- maxframe/dataframe/datasource/read_parquet.py +38 -39
- maxframe/dataframe/datastore/__init__.py +6 -0
- maxframe/dataframe/datastore/direct.py +268 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/flatjson.py +2 -1
- maxframe/dataframe/groupby/__init__.py +5 -1
- maxframe/dataframe/groupby/aggregation.py +10 -6
- maxframe/dataframe/groupby/apply_chunk.py +1 -3
- maxframe/dataframe/groupby/core.py +20 -4
- maxframe/dataframe/indexing/__init__.py +2 -1
- maxframe/dataframe/indexing/insert.py +45 -17
- maxframe/dataframe/merge/__init__.py +3 -0
- maxframe/dataframe/merge/combine.py +244 -0
- maxframe/dataframe/misc/__init__.py +14 -3
- maxframe/dataframe/misc/check_unique.py +41 -10
- maxframe/dataframe/misc/drop.py +31 -0
- maxframe/dataframe/misc/infer_dtypes.py +251 -0
- maxframe/dataframe/misc/map.py +31 -18
- maxframe/dataframe/misc/repeat.py +159 -0
- maxframe/dataframe/misc/tests/test_misc.py +35 -1
- maxframe/dataframe/missing/checkna.py +3 -2
- maxframe/dataframe/reduction/__init__.py +10 -5
- maxframe/dataframe/reduction/aggregation.py +6 -6
- maxframe/dataframe/reduction/argmax.py +7 -4
- maxframe/dataframe/reduction/argmin.py +7 -4
- maxframe/dataframe/reduction/core.py +18 -9
- maxframe/dataframe/reduction/mode.py +144 -0
- maxframe/dataframe/reduction/nunique.py +10 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
- maxframe/dataframe/sort/__init__.py +9 -2
- maxframe/dataframe/sort/argsort.py +7 -1
- maxframe/dataframe/sort/core.py +1 -1
- maxframe/dataframe/sort/rank.py +147 -0
- maxframe/dataframe/tseries/__init__.py +19 -0
- maxframe/dataframe/tseries/at_time.py +61 -0
- maxframe/dataframe/tseries/between_time.py +122 -0
- maxframe/dataframe/utils.py +30 -26
- maxframe/learn/contrib/llm/core.py +16 -7
- maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/config.py +221 -0
- maxframe/learn/contrib/llm/deploy/core.py +247 -0
- maxframe/learn/contrib/llm/deploy/framework.py +35 -0
- maxframe/learn/contrib/llm/deploy/loader.py +360 -0
- maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +12 -6
- maxframe/learn/contrib/llm/models/managed.py +76 -11
- maxframe/learn/contrib/llm/models/openai.py +72 -0
- maxframe/learn/contrib/llm/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/tests/test_core.py +34 -0
- maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
- maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
- maxframe/learn/contrib/llm/text.py +348 -42
- maxframe/learn/contrib/models.py +4 -1
- maxframe/learn/contrib/xgboost/classifier.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +31 -7
- maxframe/learn/contrib/xgboost/predict.py +4 -2
- maxframe/learn/contrib/xgboost/regressor.py +5 -0
- maxframe/learn/contrib/xgboost/train.py +2 -0
- maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
- maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
- maxframe/learn/utils/__init__.py +1 -0
- maxframe/learn/utils/extmath.py +42 -9
- maxframe/learn/utils/odpsio.py +80 -11
- maxframe/lib/filesystem/_oss_lib/common.py +2 -0
- maxframe/lib/mmh3.cp39-win_amd64.pyd +0 -0
- maxframe/opcodes.py +9 -1
- maxframe/remote/core.py +4 -0
- maxframe/serialization/core.cp39-win_amd64.pyd +0 -0
- maxframe/serialization/tests/test_serial.py +2 -2
- maxframe/tensor/arithmetic/__init__.py +1 -1
- maxframe/tensor/arithmetic/core.py +2 -2
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
- maxframe/tensor/core.py +3 -0
- maxframe/tensor/misc/copyto.py +1 -1
- maxframe/tests/test_udf.py +61 -0
- maxframe/tests/test_utils.py +8 -5
- maxframe/udf.py +103 -7
- maxframe/utils.py +61 -8
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
- maxframe_client/session/task.py +8 -1
- maxframe_client/tests/test_session.py +24 -0
- maxframe/dataframe/arrays.py +0 -864
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
|
@@ -12,18 +12,24 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from typing import Any, Dict, List
|
|
15
|
+
from typing import Any, Dict, List, Tuple
|
|
16
16
|
|
|
17
17
|
import numpy as np
|
|
18
18
|
|
|
19
19
|
from .... import opcodes
|
|
20
20
|
from ....dataframe.core import DataFrame, Series
|
|
21
|
-
from ....serialization.serializables import
|
|
21
|
+
from ....serialization.serializables import (
|
|
22
|
+
DictField,
|
|
23
|
+
FieldTypes,
|
|
24
|
+
ListField,
|
|
25
|
+
StringField,
|
|
26
|
+
)
|
|
22
27
|
from .core import LLM, LLMTaskOperator
|
|
23
28
|
|
|
24
29
|
|
|
25
|
-
class
|
|
30
|
+
class TextLLMSummarizeOp(LLMTaskOperator):
|
|
26
31
|
_op_type_ = opcodes.LLM_TEXT_SUMMARIZE_TASK
|
|
32
|
+
_legacy_name = "TextLLMSummarizeOperator" # since v2.3.0
|
|
27
33
|
|
|
28
34
|
def get_output_dtypes(self) -> Dict[str, np.dtype]:
|
|
29
35
|
return {
|
|
@@ -32,21 +38,25 @@ class TextLLMSummarizeOperator(LLMTaskOperator):
|
|
|
32
38
|
}
|
|
33
39
|
|
|
34
40
|
|
|
35
|
-
class
|
|
41
|
+
class TextLLMTranslateOp(LLMTaskOperator):
    """Operator describing a text translation task for an LLM."""

    _op_type_ = opcodes.LLM_TEXT_TRANSLATE_TASK
    # Name this operator was known by before the rename.
    _legacy_name = "TextLLMTranslateOperator"  # since v2.3.0

    # Language of the input text and language to translate into.
    source_language = StringField("source_language")
    target_language = StringField("target_language")
    # Optional task description and examples; both default to None.
    description = StringField("description", default=None)
    examples = ListField("examples", FieldTypes.dict, default=None)

    def get_output_dtypes(self) -> Dict[str, np.dtype]:
        """Return the dtype of every output column, keyed by column name."""
        object_dtype = np.dtype("O")
        bool_dtype = np.dtype("bool")
        return {"output": object_dtype, "success": bool_dtype}
|
|
46
55
|
|
|
47
56
|
|
|
48
|
-
class
|
|
57
|
+
class TextLLMClassifyOp(LLMTaskOperator):
|
|
49
58
|
_op_type_ = opcodes.LLM_TEXT_CLASSIFY_TASK
|
|
59
|
+
_legacy_name = "TextLLMClassifyOperator" # since v2.3.0
|
|
50
60
|
|
|
51
61
|
labels = ListField("labels")
|
|
52
62
|
description = StringField("description", default=None)
|
|
@@ -60,7 +70,24 @@ class TextLLMClassifyOperator(LLMTaskOperator):
|
|
|
60
70
|
}
|
|
61
71
|
|
|
62
72
|
|
|
63
|
-
class
|
|
73
|
+
class TextLLMExtractOp(LLMTaskOperator):
    """Operator describing a structured-information extraction task for an LLM."""

    _op_type_ = opcodes.LLM_TEXT_EXTRACT_TASK
    # Name this operator was known by before the rename.
    _legacy_name = "TextLLMExtractOperator"  # since v2.3.0

    # Extraction schema as a string-keyed mapping; optional, defaults to None.
    schema = DictField("schema", FieldTypes.string, FieldTypes.any, default=None)
    # Optional free-text description of the extraction task.
    description = StringField("description", default=None)
    # ``default=None`` (rather than ``default_factory=None``, which supplies
    # no default at all) keeps this field consistent with the other optional
    # fields of this operator.
    # NOTE(review): the module-level extract() documents examples as
    # (input, output) tuples while the element type declared here is dict --
    # confirm which one the backend expects.
    examples = ListField("examples", FieldTypes.dict, default=None)

    def get_output_dtypes(self) -> Dict[str, np.dtype]:
        """Return the dtype of every output column, keyed by column name."""
        return {
            "output": np.dtype("O"),
            "success": np.dtype("bool"),
        }
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class TextGenLLM(LLM):
|
|
89
|
+
_legacy_name = "TextLLM" # since v2.3.0
|
|
90
|
+
|
|
64
91
|
def generate(
|
|
65
92
|
self,
|
|
66
93
|
data,
|
|
@@ -70,23 +97,25 @@ class TextLLM(LLM):
|
|
|
70
97
|
raise NotImplementedError
|
|
71
98
|
|
|
72
99
|
def summarize(self, series, index=None, **kw):
    """Build a summarize operator bound to this model and apply it.

    Extra keyword arguments are forwarded to ``TextLLMSummarizeOp``; the
    operator is then called with ``series`` and ``index``.
    """
    summarize_op = TextLLMSummarizeOp(model=self, task="summarize", **kw)
    return summarize_op(series, index)
|
|
76
101
|
|
|
77
102
|
def translate(
    self,
    series,
    target_language: str,
    source_language: str = None,
    description: str = None,
    examples: List[Dict[str, str]] = None,
    index=None,
    **kw
):
    """Build a translate operator bound to this model and apply it.

    The language pair, the optional description and examples, and any extra
    keyword arguments are passed to ``TextLLMTranslateOp``; the operator is
    then called with ``series`` and ``index``.
    """
    translate_op = TextLLMTranslateOp(
        model=self,
        task="translate",
        source_language=source_language,
        target_language=target_language,
        description=description,
        examples=examples,
        **kw
    )
    return translate_op(series, index)
|
|
92
121
|
|
|
@@ -99,7 +128,7 @@ class TextLLM(LLM):
|
|
|
99
128
|
index=None,
|
|
100
129
|
**kw
|
|
101
130
|
):
|
|
102
|
-
return
|
|
131
|
+
return TextLLMClassifyOp(
|
|
103
132
|
model=self,
|
|
104
133
|
labels=labels,
|
|
105
134
|
task="classify",
|
|
@@ -108,10 +137,51 @@ class TextLLM(LLM):
|
|
|
108
137
|
**kw
|
|
109
138
|
)(series, index)
|
|
110
139
|
|
|
140
|
+
def extract(
    self,
    series,
    schema: Any,
    description: str = None,
    examples: List[Tuple[str, str]] = None,
    index=None,
    **kw
):
    """Build an extract operator bound to this model and apply it.

    Parameters
    ----------
    series
        Input passed to the operator call.
    schema : Any
        Extraction schema. A pydantic ``BaseModel`` subclass is converted to
        its JSON schema; any other value is passed through unchanged.
    description : str, optional
        Description of the extraction task.
    examples : List[Tuple[str, str]], optional
        Examples of the extraction task.
    index : optional
        Passed to the operator call together with ``series``.
    **kw
        Extra keyword arguments forwarded to ``TextLLMExtractOp``.
    """
    import inspect

    if inspect.isclass(schema):
        # pydantic is only needed when a model class is supplied, so a plain
        # dict schema must not fail just because pydantic is not installed.
        try:
            from pydantic import BaseModel
        except ImportError:
            BaseModel = None

        if BaseModel is not None and issubclass(schema, BaseModel):
            if hasattr(schema, "model_json_schema"):
                schema = schema.model_json_schema()
            else:
                # pydantic v1 exposes the JSON schema through ``schema()``.
                schema = schema.schema()

    return TextLLMExtractOp(
        model=self,
        schema=schema,
        task="extract",
        description=description,
        examples=examples,
        **kw
    )(series, index)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
TextLLM = TextGenLLM # for old client compatibility
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
class TextEmbeddingModel(LLM):
    """Base class for text embedding models.

    ``embed`` is not implemented here and raises ``NotImplementedError``.
    """

    def embed(
        self,
        data: Series,
        dimensions: int,
        encoding_format: str,
        simple_output: bool,
        params: Dict[str, Any],
        **kw
    ):
        """Embed ``data``; this base implementation always raises."""
        raise NotImplementedError
|
|
180
|
+
|
|
111
181
|
|
|
112
182
|
def generate(
|
|
113
183
|
data,
|
|
114
|
-
model:
|
|
184
|
+
model: TextGenLLM,
|
|
115
185
|
prompt_template: List[Dict[str, Any]],
|
|
116
186
|
params: Dict[str, Any] = None,
|
|
117
187
|
):
|
|
@@ -141,11 +211,11 @@ def generate(
|
|
|
141
211
|
|
|
142
212
|
Examples
|
|
143
213
|
--------
|
|
144
|
-
>>> from maxframe.learn.contrib.llm.models.managed import
|
|
214
|
+
>>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
|
|
145
215
|
>>> import maxframe.dataframe as md
|
|
146
216
|
>>>
|
|
147
217
|
>>> # Initialize the model
|
|
148
|
-
>>> llm =
|
|
218
|
+
>>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
|
|
149
219
|
>>>
|
|
150
220
|
>>> # Prepare prompt template
|
|
151
221
|
>>> messages = [
|
|
@@ -164,14 +234,14 @@ def generate(
|
|
|
164
234
|
"""
|
|
165
235
|
if not isinstance(data, DataFrame) and not isinstance(data, Series):
|
|
166
236
|
raise ValueError("data must be a maxframe dataframe or series object")
|
|
167
|
-
if not isinstance(model,
|
|
237
|
+
if not isinstance(model, TextGenLLM):
|
|
168
238
|
raise TypeError("model must be a TextLLM object")
|
|
169
239
|
params = params if params is not None else dict()
|
|
170
240
|
model.validate_params(params)
|
|
171
241
|
return model.generate(data, prompt_template=prompt_template, params=params)
|
|
172
242
|
|
|
173
243
|
|
|
174
|
-
def summary(series, model:
|
|
244
|
+
def summary(series, model: TextGenLLM, index=None):
|
|
175
245
|
"""
|
|
176
246
|
Generate summaries for text content in a series using a language model.
|
|
177
247
|
|
|
@@ -180,15 +250,35 @@ def summary(series, model: TextLLM, index=None):
|
|
|
180
250
|
series : Series
|
|
181
251
|
A maxframe Series containing text data to be summarized.
|
|
182
252
|
Each element should be a text string.
|
|
183
|
-
model :
|
|
253
|
+
model : TextGenLLM
|
|
184
254
|
Language model instance used for text summarization.
|
|
185
255
|
index : array-like, optional
|
|
186
256
|
Index for the output series, by default None, will generate new index.
|
|
187
257
|
|
|
188
258
|
Returns
|
|
189
259
|
-------
|
|
190
|
-
|
|
191
|
-
A
|
|
260
|
+
DataFrame
|
|
261
|
+
A DataFrame containing the generated summaries and success status.
|
|
262
|
+
Columns include 'summary' (generated summary text) and 'success' (boolean status).
|
|
263
|
+
If 'success' is False, the 'summary' column will contain error information instead of the expected output.
|
|
264
|
+
|
|
265
|
+
Examples
|
|
266
|
+
--------
|
|
267
|
+
>>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
|
|
268
|
+
>>> import maxframe.dataframe as md
|
|
269
|
+
>>>
|
|
270
|
+
>>> # Initialize the model
|
|
271
|
+
>>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
|
|
272
|
+
>>>
|
|
273
|
+
>>> # Create sample data
|
|
274
|
+
>>> texts = md.Series([
|
|
275
|
+
... "Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed.",
|
|
276
|
+
... "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data."
|
|
277
|
+
... ])
|
|
278
|
+
>>>
|
|
279
|
+
>>> # Generate summaries
|
|
280
|
+
>>> result = summary(texts, llm)
|
|
281
|
+
>>> result.execute()
|
|
192
282
|
|
|
193
283
|
Notes
|
|
194
284
|
-----
|
|
@@ -205,35 +295,54 @@ def summary(series, model: TextLLM, index=None):
|
|
|
205
295
|
|
|
206
296
|
|
|
207
297
|
def translate(
|
|
208
|
-
series, model:
|
|
298
|
+
series, model: TextGenLLM, source_language: str, target_language: str, index=None
|
|
209
299
|
):
|
|
210
300
|
"""
|
|
211
301
|
Translate text content in a series using a language model from source language to target language.
|
|
212
302
|
|
|
213
303
|
Parameters
|
|
214
304
|
----------
|
|
215
|
-
series :
|
|
305
|
+
series : Series
|
|
216
306
|
A maxframe Series containing text data to translate.
|
|
217
307
|
Each element should be a text string.
|
|
218
|
-
model :
|
|
219
|
-
Language model instance used for text
|
|
308
|
+
model : TextGenLLM
|
|
309
|
+
Language model instance used for text translation.
|
|
220
310
|
source_language : str
|
|
221
|
-
Source language of the text.
|
|
311
|
+
Source language of the text (e.g., 'en', 'zh', 'ja').
|
|
222
312
|
target_language : str
|
|
223
|
-
Target language
|
|
313
|
+
Target language for translation (e.g., 'en', 'zh', 'ja').
|
|
224
314
|
index : array-like, optional
|
|
225
315
|
Index for the output series, by default None, will generate new index.
|
|
226
316
|
|
|
227
317
|
Returns
|
|
228
318
|
-------
|
|
229
|
-
|
|
230
|
-
A
|
|
319
|
+
DataFrame
|
|
320
|
+
A DataFrame containing the generated translations and success status.
|
|
321
|
+
Columns include 'output' (translated text) and 'success' (boolean status).
|
|
322
|
+
If 'success' is False, the 'output' column will contain error information instead of the expected output.
|
|
323
|
+
|
|
324
|
+
Examples
|
|
325
|
+
--------
|
|
326
|
+
>>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
|
|
327
|
+
>>> import maxframe.dataframe as md
|
|
328
|
+
>>>
|
|
329
|
+
>>> # Initialize the model
|
|
330
|
+
>>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
|
|
331
|
+
>>>
|
|
332
|
+
>>> # Create sample data
|
|
333
|
+
>>> texts = md.Series([
|
|
334
|
+
... "Hello, how are you?",
|
|
335
|
+
... "Machine learning is fascinating."
|
|
336
|
+
... ])
|
|
337
|
+
>>>
|
|
338
|
+
>>> # Translate from English to Chinese
|
|
339
|
+
>>> result = translate(texts, llm, source_language="en", target_language="zh")
|
|
340
|
+
>>> result.execute()
|
|
231
341
|
|
|
232
342
|
Notes
|
|
233
343
|
-----
|
|
234
344
|
**Preview:** This API is in preview state and may be unstable.
|
|
235
345
|
The interface may change in future releases.
|
|
236
|
-
|
|
237
346
|
"""
|
|
238
347
|
if not isinstance(series, Series):
|
|
239
348
|
raise ValueError("series must be a maxframe series object")
|
|
@@ -249,36 +358,63 @@ def translate(
|
|
|
249
358
|
|
|
250
359
|
def classify(
|
|
251
360
|
series,
|
|
252
|
-
model:
|
|
361
|
+
model: TextGenLLM,
|
|
253
362
|
labels: List[str],
|
|
254
363
|
description: str = None,
|
|
255
364
|
examples: List[Dict[str, str]] = None,
|
|
256
365
|
index=None,
|
|
257
366
|
):
|
|
258
367
|
"""
|
|
259
|
-
Classify text content in a series with given labels.
|
|
368
|
+
Classify text content in a series with given labels using a language model.
|
|
260
369
|
|
|
261
370
|
Parameters
|
|
262
371
|
----------
|
|
263
|
-
series :
|
|
372
|
+
series : Series
|
|
264
373
|
A maxframe Series containing text data to be classified.
|
|
265
374
|
Each element should be a text string.
|
|
266
|
-
model :
|
|
267
|
-
Language model instance used for text
|
|
375
|
+
model : TextGenLLM
|
|
376
|
+
Language model instance used for text classification.
|
|
268
377
|
labels : List[str]
|
|
269
|
-
List of labels to classify the text.
|
|
270
|
-
description : str
|
|
271
|
-
Description of the classification task.
|
|
272
|
-
examples : List[Dict[str,
|
|
273
|
-
Examples of the classification task, like [{
|
|
274
|
-
LLM
|
|
378
|
+
List of labels to classify the text into.
|
|
379
|
+
description : str, optional
|
|
380
|
+
Description of the classification task to help the model understand the context.
|
|
381
|
+
examples : List[Dict[str, str]], optional
|
|
382
|
+
Examples of the classification task, like [{"text": "text...", "label": "A", "reason": "reason..."}],
|
|
383
|
+
to help LLM better understand your classification rules.
|
|
275
384
|
index : array-like, optional
|
|
276
385
|
Index for the output series, by default None, will generate new index.
|
|
277
386
|
|
|
278
387
|
Returns
|
|
279
388
|
-------
|
|
280
|
-
|
|
281
|
-
A
|
|
389
|
+
DataFrame
|
|
390
|
+
A DataFrame containing the generated classification results and success status.
|
|
391
|
+
Columns include 'label' (predicted label), 'reason' (reasoning), and 'success' (boolean status).
|
|
392
|
+
If 'success' is False, the 'label' and 'reason' columns will contain error information instead of the expected output.
|
|
393
|
+
|
|
394
|
+
Examples
|
|
395
|
+
--------
|
|
396
|
+
>>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
|
|
397
|
+
>>> import maxframe.dataframe as md
|
|
398
|
+
>>>
|
|
399
|
+
>>> # Initialize the model
|
|
400
|
+
>>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
|
|
401
|
+
>>>
|
|
402
|
+
>>> # Create sample data
|
|
403
|
+
>>> texts = md.Series([
|
|
404
|
+
... "I love this product! It's amazing!",
|
|
405
|
+
... "This is terrible, worst purchase ever.",
|
|
406
|
+
... "It's okay, nothing special."
|
|
407
|
+
... ])
|
|
408
|
+
>>>
|
|
409
|
+
>>> # Classify sentiment
|
|
410
|
+
>>> labels = ["positive", "negative", "neutral"]
|
|
411
|
+
>>> description = "Classify the sentiment of customer reviews"
|
|
412
|
+
>>> examples = [
|
|
413
|
+
... {"text": "Great product!", "label": "positive", "reason": "Expresses satisfaction"},
|
|
414
|
+
... {"text": "Poor quality", "label": "negative", "reason": "Expresses dissatisfaction"}
|
|
415
|
+
... ]
|
|
416
|
+
>>> result = classify(texts, llm, labels=labels, description=description, examples=examples)
|
|
417
|
+
>>> result.execute()
|
|
282
418
|
|
|
283
419
|
Notes
|
|
284
420
|
-----
|
|
@@ -300,3 +436,173 @@ def classify(
|
|
|
300
436
|
return model.classify(
|
|
301
437
|
series, labels=labels, description=description, examples=examples, index=index
|
|
302
438
|
)
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def extract(
    series,
    model: TextGenLLM,
    schema: Any,
    description: str = None,
    examples: List[Tuple[str, str]] = None,
    index=None,
):
    """
    Extract structured information from text content in a series using a language model.

    Parameters
    ----------
    series : Series
        A maxframe Series containing text data to extract information from.
        Each element should be a text string.
    model : TextGenLLM
        Language model instance used for information extraction.
    schema : Any
        Schema definition for the extraction. Can be a dictionary defining the structure
        or a Pydantic BaseModel class that will be converted to JSON schema.
    description : str, optional
        Description of the extraction task to help the model understand what to extract.
    examples : List[Tuple[str, str]], optional
        Examples of the extraction task in format [(input_text, expected_output), ...],
        to help LLM better understand the extraction requirements.
    index : array-like, optional
        Index for the output series, by default None, will generate new index.

    Returns
    -------
    DataFrame
        A DataFrame containing the extracted information and success status.
        Columns include 'output' (extracted structured data) and 'success' (boolean status).
        If 'success' is False, the 'output' column will contain error information instead of the expected output.

    Raises
    ------
    ValueError
        If ``series`` is not a maxframe Series, is not a string series,
        ``schema`` is empty, or ``examples`` is given but is not a list of tuples.

    Examples
    --------
    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
    >>> import maxframe.dataframe as md
    >>>
    >>> # Initialize the model
    >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
    >>>
    >>> # Create sample data
    >>> texts = md.Series([
    ...     "John Smith, age 30, works as a Software Engineer at Google.",
    ...     "Alice Johnson, 25 years old, is a Data Scientist at Microsoft."
    ... ])
    >>>
    >>> # Define extraction schema
    >>> schema = {
    ...     "name": "string",
    ...     "age": "integer",
    ...     "job_title": "string",
    ...     "company": "string"
    ... }
    >>>
    >>> # Extract structured information
    >>> description = "Extract person information from text"
    >>> examples = [
    ...     ("Bob Brown, 35, Manager at Apple", '{"name": "Bob Brown", "age": 35, "job_title": "Manager", "company": "Apple"}')
    ... ]
    >>> result = extract(texts, llm, schema=schema, description=description, examples=examples)
    >>> result.execute()

    Notes
    -----
    **Preview:** This API is in preview state and may be unstable.
    The interface may change in future releases.
    """
    if not isinstance(series, Series):
        raise ValueError("series must be a maxframe series object")
    if series.dtype != np.str_:
        raise ValueError("extract input must be a string series")
    if not schema:
        raise ValueError("schema must not be empty")
    # Validate only when examples are actually supplied: the default ``None``
    # must not be iterated, and every element (not just one) has to be a tuple.
    if examples and (
        not isinstance(examples, list)
        or not all(isinstance(x, tuple) for x in examples)
    ):
        raise ValueError("examples must be a list of tuples, format is (input, output)")
    return model.extract(
        series, schema=schema, description=description, examples=examples, index=index
    )
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def embed(
    series,
    model: TextEmbeddingModel,
    dimensions: int = None,
    encoding_format: str = None,
    simple_output: bool = False,
    params: Dict[str, Any] = None,
    index=None,
):
    """
    Compute embeddings for the text in a series with a text embedding model.

    Parameters
    ----------
    series : Series
        A maxframe Series holding the text to embed, one string per element.
    model : TextEmbeddingModel
        Text embedding model instance that produces the embeddings.
    dimensions : int, optional
        Size of the embedding vectors. The model default is used when omitted.
    encoding_format : str, optional
        Encoding format of the embedding (e.g., 'float', 'base64').
        The model default is used when omitted.
    simple_output : bool, optional
        Return the embedding data directly, without additional metadata.
        Defaults to False.
    params : Dict[str, Any], optional
        Extra embedding configuration, possibly model-specific. Defaults to None.
    index : array-like, optional
        Index for the output series, by default None, will generate new index.

    Returns
    -------
    DataFrame
        A DataFrame with the generated embeddings and a success flag.
        Columns include 'response' (embedding vectors) and 'success' (boolean status).
        When 'success' is False, 'response' holds error information instead of the expected output.

    Examples
    --------
    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextEmbeddingModel
    >>> import maxframe.dataframe as md
    >>>
    >>> # Initialize the embedding model
    >>> embedding_model = ManagedTextEmbeddingModel(name="text-embedding-ada-002")
    >>>
    >>> # Create sample data
    >>> texts = md.Series([
    ...     "Machine learning is a powerful technology.",
    ...     "Natural language processing enables computers to understand text.",
    ...     "Deep learning uses neural networks for pattern recognition."
    ... ])
    >>>
    >>> # Generate embeddings
    >>> result = embed(texts, embedding_model, simple_output=True)
    >>> result.execute()

    Notes
    -----
    **Preview:** This API is in preview state and may be unstable.
    The interface may change in future releases.
    """
    # Input validation: a maxframe Series whose dtype is string.
    if not isinstance(series, Series):
        raise ValueError("series must be a maxframe series object")
    if series.dtype != np.str_:
        raise ValueError("embed input must be a string series")

    embed_options = dict(
        dimensions=dimensions,
        encoding_format=encoding_format,
        simple_output=simple_output,
        params=params,
        index=index,
    )
    return model.embed(series, **embed_options)
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
TextLLMExtractOperator = TextLLMExtractOp
|
|
606
|
+
TextLLMSummarizeOperator = TextLLMSummarizeOp
|
|
607
|
+
TextLLMTranslateOperator = TextLLMTranslateOp
|
|
608
|
+
TextLLMClassifyOperator = TextLLMClassifyOp
|
maxframe/learn/contrib/models.py
CHANGED
|
@@ -40,7 +40,10 @@ class ModelWithEvalData(ModelData):
|
|
|
40
40
|
def execute(self, session=None, **kw):
    """Run the parent ``execute`` and, when applicable, load evaluation results.

    When the operator reports ``has_evals_result`` and this object is the
    operator's first output, the second output is fetched and merged into
    ``self._evals_result``. The parent's result is returned either way.
    """
    # The evals_result should be fetched when BoosterData.execute() is called.
    result = super().execute(session=session, **kw)
    if getattr(self.op, "has_evals_result", None):
        if self.key == self.op.outputs[0].key:
            evals_store = self._evals_result
            evals_store.update(self.op.outputs[1].fetch(session=session))
    return result
|
|
46
49
|
|
|
@@ -18,6 +18,7 @@ import numpy as np
|
|
|
18
18
|
|
|
19
19
|
from .... import tensor as mt
|
|
20
20
|
from ....tensor.merge.vstack import _vstack
|
|
21
|
+
from ...utils.odpsio import register_odps_model
|
|
21
22
|
from ..utils import make_import_error_func
|
|
22
23
|
from .core import XGBScikitLearnBase, xgboost
|
|
23
24
|
|
|
@@ -28,6 +29,7 @@ else:
|
|
|
28
29
|
|
|
29
30
|
from .predict import predict
|
|
30
31
|
|
|
32
|
+
@register_odps_model
|
|
31
33
|
class XGBClassifier(XGBScikitLearnBase, XGBClassifierBase):
|
|
32
34
|
"""
|
|
33
35
|
Implementation of the scikit-learn API for XGBoost classification.
|
|
@@ -24,11 +24,11 @@ from ....udf import builtin_function
|
|
|
24
24
|
|
|
25
25
|
try:
|
|
26
26
|
import xgboost
|
|
27
|
-
except ImportError:
|
|
27
|
+
except ImportError: # pragma: no cover
|
|
28
28
|
xgboost = None
|
|
29
29
|
|
|
30
|
-
from ....core import OutputType
|
|
31
|
-
from ...utils.odpsio import
|
|
30
|
+
from ....core import OutputType, enter_mode, is_kernel_mode
|
|
31
|
+
from ...utils.odpsio import ODPSModelMixin, ReadODPSModel
|
|
32
32
|
from ..models import ModelApplyChunk, ModelWithEval, ModelWithEvalData, to_remote_model
|
|
33
33
|
from .dmatrix import DMatrix
|
|
34
34
|
|
|
@@ -40,6 +40,14 @@ _xgb_type_to_np_type = {
|
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
class BoosterData(ModelWithEvalData):
|
|
43
|
+
def save_config(self) -> str:
    """Return the configuration reported by the fetched object's ``save_config()``.

    If fetching or saving fails while ``is_kernel_mode()`` is true, an empty
    JSON object string (``"{}"``) is returned instead; outside kernel mode
    the error is re-raised.
    """
    try:
        return self.fetch().save_config()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit are never swallowed by the kernel-mode fallback.
        # NOTE(review): presumably the fallback exists because the data is
        # not available while in kernel mode -- confirm.
        if is_kernel_mode():
            return "{}"
        raise
|
|
50
|
+
|
|
43
51
|
@staticmethod
|
|
44
52
|
def _get_booster_score(bst, fmap=None, importance_type="weight"):
|
|
45
53
|
if not fmap:
|
|
@@ -157,7 +165,7 @@ if not xgboost:
|
|
|
157
165
|
XGBScikitLearnBase = None
|
|
158
166
|
else:
|
|
159
167
|
|
|
160
|
-
class XGBScikitLearnBase(xgboost.XGBModel,
|
|
168
|
+
class XGBScikitLearnBase(xgboost.XGBModel, ODPSModelMixin):
|
|
161
169
|
"""
|
|
162
170
|
Base class for implementing scikit-learn interface
|
|
163
171
|
"""
|
|
@@ -218,7 +226,8 @@ else:
|
|
|
218
226
|
sample_weight_eval_set,
|
|
219
227
|
base_margin_eval_set,
|
|
220
228
|
)
|
|
221
|
-
|
|
229
|
+
with enter_mode(kernel=True):
|
|
230
|
+
params = self.get_xgb_params()
|
|
222
231
|
if not params.get("objective"):
|
|
223
232
|
params["objective"] = "reg:squarederror"
|
|
224
233
|
self.evals_result_ = dict()
|
|
@@ -351,16 +360,31 @@ else:
|
|
|
351
360
|
evals_result=self.evals_result_t_, local_info=local_info
|
|
352
361
|
)
|
|
353
362
|
|
|
354
|
-
def _get_odps_model_info(self) ->
|
|
363
|
+
def _get_odps_model_info(self) -> ODPSModelMixin.ODPSModelInfo:
    """Describe this estimator as an ODPS model.

    The format is the classifier variant when the estimator has a
    ``predict_proba`` attribute and the regressor variant otherwise; the
    model parameters are the estimator's ``_Booster``.
    """
    if hasattr(self, "predict_proba"):
        model_format = "BOOSTED_TREE_CLASSIFIER"
    else:
        model_format = "BOOSTED_TREE_REGRESSOR"
    return ODPSModelMixin.ODPSModelInfo(
        model_format=model_format, model_params=self._Booster
    )
|
|
363
372
|
|
|
373
|
+
@classmethod
def _build_odps_source_model(cls, op: ReadODPSModel) -> Any:
    """Build an estimator of this class backed by the model read by ``op``.

    Returns ``None`` when the format of ``op`` does not match this class:
    classes with ``predict_proba`` accept only "BOOSTED_TREE_CLASSIFIER",
    all other classes accept only "BOOSTED_TREE_REGRESSOR". Otherwise the
    operator is switched to object output, a ``Booster`` tileable is created
    from it and attached to a fresh estimator as ``_Booster``.
    """
    if hasattr(cls, "predict_proba"):
        accepted_format = "BOOSTED_TREE_CLASSIFIER"
    else:
        accepted_format = "BOOSTED_TREE_REGRESSOR"
    if op.format != accepted_format:
        return None

    op._output_types = [OutputType.object]
    estimator = cls()
    estimator._Booster = op.new_tileable(None, object_class=Booster)
    return estimator
|
|
387
|
+
|
|
364
388
|
def wrap_evaluation_matrices(
|
|
365
389
|
missing: float,
|
|
366
390
|
X: Any,
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
from typing import List
|
|
16
16
|
|
|
17
17
|
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
18
19
|
|
|
19
20
|
from .... import opcodes
|
|
20
21
|
from ....core import EntityData
|
|
@@ -62,9 +63,10 @@ class XGBPredict(Operator, TileableOperatorMixin):
|
|
|
62
63
|
|
|
63
64
|
def __call__(self):
|
|
64
65
|
num_class = getattr(self.model.op, "num_class", None)
|
|
65
|
-
|
|
66
|
+
output_ndim = getattr(self.model.op, "output_ndim", None)
|
|
67
|
+
if num_class is not None and not pd.isna(num_class):
|
|
66
68
|
num_class = int(num_class)
|
|
67
|
-
if num_class is not None and num_class > 2:
|
|
69
|
+
if num_class is not None and (num_class > 2 or output_ndim == 2):
|
|
68
70
|
shape = (self.data.shape[0], num_class)
|
|
69
71
|
else:
|
|
70
72
|
shape = (self.data.shape[0],)
|