maxframe 2.2.0__cp38-cp38-macosx_10_9_universal2.whl → 2.3.0rc1__cp38-cp38-macosx_10_9_universal2.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (114)
  1. maxframe/_utils.cpython-38-darwin.so +0 -0
  2. maxframe/codegen/core.py +3 -2
  3. maxframe/codegen/spe/dataframe/merge.py +4 -0
  4. maxframe/codegen/spe/dataframe/misc.py +2 -0
  5. maxframe/codegen/spe/dataframe/reduction.py +18 -0
  6. maxframe/codegen/spe/dataframe/sort.py +9 -1
  7. maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
  8. maxframe/codegen/spe/dataframe/tseries.py +9 -0
  9. maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
  10. maxframe/codegen/spe/tensor/datasource.py +1 -0
  11. maxframe/config/config.py +3 -0
  12. maxframe/conftest.py +10 -0
  13. maxframe/core/base.py +2 -1
  14. maxframe/core/entity/tileables.py +2 -0
  15. maxframe/core/graph/core.cpython-38-darwin.so +0 -0
  16. maxframe/core/graph/entity.py +7 -1
  17. maxframe/core/mode.py +6 -1
  18. maxframe/dataframe/__init__.py +2 -2
  19. maxframe/dataframe/arithmetic/__init__.py +4 -0
  20. maxframe/dataframe/arithmetic/maximum.py +33 -0
  21. maxframe/dataframe/arithmetic/minimum.py +33 -0
  22. maxframe/dataframe/core.py +98 -106
  23. maxframe/dataframe/datasource/core.py +6 -0
  24. maxframe/dataframe/datasource/direct.py +57 -0
  25. maxframe/dataframe/datasource/read_csv.py +19 -11
  26. maxframe/dataframe/datasource/read_odps_query.py +29 -6
  27. maxframe/dataframe/datasource/read_odps_table.py +32 -10
  28. maxframe/dataframe/datasource/read_parquet.py +38 -39
  29. maxframe/dataframe/datastore/__init__.py +6 -0
  30. maxframe/dataframe/datastore/direct.py +268 -0
  31. maxframe/dataframe/datastore/to_odps.py +6 -0
  32. maxframe/dataframe/extensions/flatjson.py +2 -1
  33. maxframe/dataframe/groupby/__init__.py +5 -1
  34. maxframe/dataframe/groupby/aggregation.py +10 -6
  35. maxframe/dataframe/groupby/apply_chunk.py +1 -3
  36. maxframe/dataframe/groupby/core.py +20 -4
  37. maxframe/dataframe/indexing/__init__.py +2 -1
  38. maxframe/dataframe/indexing/insert.py +45 -17
  39. maxframe/dataframe/merge/__init__.py +3 -0
  40. maxframe/dataframe/merge/combine.py +244 -0
  41. maxframe/dataframe/misc/__init__.py +14 -3
  42. maxframe/dataframe/misc/check_unique.py +41 -10
  43. maxframe/dataframe/misc/drop.py +31 -0
  44. maxframe/dataframe/misc/infer_dtypes.py +251 -0
  45. maxframe/dataframe/misc/map.py +31 -18
  46. maxframe/dataframe/misc/repeat.py +159 -0
  47. maxframe/dataframe/misc/tests/test_misc.py +35 -1
  48. maxframe/dataframe/missing/checkna.py +3 -2
  49. maxframe/dataframe/reduction/__init__.py +10 -5
  50. maxframe/dataframe/reduction/aggregation.py +6 -6
  51. maxframe/dataframe/reduction/argmax.py +7 -4
  52. maxframe/dataframe/reduction/argmin.py +7 -4
  53. maxframe/dataframe/reduction/core.py +18 -9
  54. maxframe/dataframe/reduction/mode.py +144 -0
  55. maxframe/dataframe/reduction/nunique.py +10 -3
  56. maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
  57. maxframe/dataframe/sort/__init__.py +9 -2
  58. maxframe/dataframe/sort/argsort.py +7 -1
  59. maxframe/dataframe/sort/core.py +1 -1
  60. maxframe/dataframe/sort/rank.py +147 -0
  61. maxframe/dataframe/tseries/__init__.py +19 -0
  62. maxframe/dataframe/tseries/at_time.py +61 -0
  63. maxframe/dataframe/tseries/between_time.py +122 -0
  64. maxframe/dataframe/utils.py +30 -26
  65. maxframe/learn/contrib/llm/core.py +16 -7
  66. maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
  67. maxframe/learn/contrib/llm/deploy/config.py +221 -0
  68. maxframe/learn/contrib/llm/deploy/core.py +247 -0
  69. maxframe/learn/contrib/llm/deploy/framework.py +35 -0
  70. maxframe/learn/contrib/llm/deploy/loader.py +360 -0
  71. maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
  73. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  74. maxframe/learn/contrib/llm/models/dashscope.py +12 -6
  75. maxframe/learn/contrib/llm/models/managed.py +76 -11
  76. maxframe/learn/contrib/llm/models/openai.py +72 -0
  77. maxframe/learn/contrib/llm/tests/__init__.py +13 -0
  78. maxframe/learn/contrib/llm/tests/test_core.py +34 -0
  79. maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
  80. maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
  81. maxframe/learn/contrib/llm/text.py +348 -42
  82. maxframe/learn/contrib/models.py +4 -1
  83. maxframe/learn/contrib/xgboost/classifier.py +2 -0
  84. maxframe/learn/contrib/xgboost/core.py +31 -7
  85. maxframe/learn/contrib/xgboost/predict.py +4 -2
  86. maxframe/learn/contrib/xgboost/regressor.py +5 -0
  87. maxframe/learn/contrib/xgboost/train.py +2 -0
  88. maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
  89. maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
  90. maxframe/learn/utils/__init__.py +1 -0
  91. maxframe/learn/utils/extmath.py +42 -9
  92. maxframe/learn/utils/odpsio.py +80 -11
  93. maxframe/lib/filesystem/_oss_lib/common.py +2 -0
  94. maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
  95. maxframe/opcodes.py +9 -1
  96. maxframe/remote/core.py +4 -0
  97. maxframe/serialization/core.cpython-38-darwin.so +0 -0
  98. maxframe/serialization/tests/test_serial.py +2 -2
  99. maxframe/tensor/arithmetic/__init__.py +1 -1
  100. maxframe/tensor/arithmetic/core.py +2 -2
  101. maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
  102. maxframe/tensor/core.py +3 -0
  103. maxframe/tensor/misc/copyto.py +1 -1
  104. maxframe/tests/test_udf.py +61 -0
  105. maxframe/tests/test_utils.py +8 -5
  106. maxframe/udf.py +103 -7
  107. maxframe/utils.py +61 -8
  108. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
  109. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
  110. maxframe_client/session/task.py +8 -1
  111. maxframe_client/tests/test_session.py +24 -0
  112. maxframe/dataframe/arrays.py +0 -864
  113. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
  114. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
@@ -12,18 +12,24 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from typing import Any, Dict, List
15
+ from typing import Any, Dict, List, Tuple
16
16
 
17
17
  import numpy as np
18
18
 
19
19
  from .... import opcodes
20
20
  from ....dataframe.core import DataFrame, Series
21
- from ....serialization.serializables import FieldTypes, ListField, StringField
21
+ from ....serialization.serializables import (
22
+ DictField,
23
+ FieldTypes,
24
+ ListField,
25
+ StringField,
26
+ )
22
27
  from .core import LLM, LLMTaskOperator
23
28
 
24
29
 
25
- class TextLLMSummarizeOperator(LLMTaskOperator):
30
+ class TextLLMSummarizeOp(LLMTaskOperator):
26
31
  _op_type_ = opcodes.LLM_TEXT_SUMMARIZE_TASK
32
+ _legacy_name = "TextLLMSummarizeOperator" # since v2.3.0
27
33
 
28
34
  def get_output_dtypes(self) -> Dict[str, np.dtype]:
29
35
  return {
@@ -32,21 +38,25 @@ class TextLLMSummarizeOperator(LLMTaskOperator):
32
38
  }
33
39
 
34
40
 
35
- class TextLLMTranslateOperator(LLMTaskOperator):
41
+ class TextLLMTranslateOp(LLMTaskOperator):
36
42
  _op_type_ = opcodes.LLM_TEXT_TRANSLATE_TASK
43
+ _legacy_name = "TextLLMTranslateOperator" # since v2.3.0
37
44
 
38
45
  source_language = StringField("source_language")
39
46
  target_language = StringField("target_language")
47
+ description = StringField("description", default=None)
48
+ examples = ListField("examples", FieldTypes.dict, default=None)
40
49
 
41
50
  def get_output_dtypes(self) -> Dict[str, np.dtype]:
42
51
  return {
43
- "target": np.dtype("O"),
52
+ "output": np.dtype("O"),
44
53
  "success": np.dtype("bool"),
45
54
  }
46
55
 
47
56
 
48
- class TextLLMClassifyOperator(LLMTaskOperator):
57
+ class TextLLMClassifyOp(LLMTaskOperator):
49
58
  _op_type_ = opcodes.LLM_TEXT_CLASSIFY_TASK
59
+ _legacy_name = "TextLLMClassifyOperator" # since v2.3.0
50
60
 
51
61
  labels = ListField("labels")
52
62
  description = StringField("description", default=None)
@@ -60,7 +70,24 @@ class TextLLMClassifyOperator(LLMTaskOperator):
60
70
  }
61
71
 
62
72
 
63
- class TextLLM(LLM):
73
+ class TextLLMExtractOp(LLMTaskOperator):
74
+ _op_type_ = opcodes.LLM_TEXT_EXTRACT_TASK
75
+ _legacy_name = "TextLLMExtractOperator" # since v2.3.0
76
+
77
+ schema = DictField("schema", FieldTypes.string, FieldTypes.any, default=None)
78
+ description = StringField("description", default=None)
79
+ examples = ListField("examples", FieldTypes.dict, default_factory=None)
80
+
81
+ def get_output_dtypes(self) -> Dict[str, np.dtype]:
82
+ return {
83
+ "output": np.dtype("O"),
84
+ "success": np.dtype("bool"),
85
+ }
86
+
87
+
88
+ class TextGenLLM(LLM):
89
+ _legacy_name = "TextLLM" # since v2.3.0
90
+
64
91
  def generate(
65
92
  self,
66
93
  data,
@@ -70,23 +97,25 @@ class TextLLM(LLM):
70
97
  raise NotImplementedError
71
98
 
72
99
  def summarize(self, series, index=None, **kw):
73
- return TextLLMSummarizeOperator(model=self, task="summarize", **kw)(
74
- series, index
75
- )
100
+ return TextLLMSummarizeOp(model=self, task="summarize", **kw)(series, index)
76
101
 
77
102
  def translate(
78
103
  self,
79
104
  series,
80
105
  target_language: str,
81
106
  source_language: str = None,
107
+ description: str = None,
108
+ examples: List[Dict[str, str]] = None,
82
109
  index=None,
83
110
  **kw
84
111
  ):
85
- return TextLLMTranslateOperator(
112
+ return TextLLMTranslateOp(
86
113
  model=self,
87
114
  task="translate",
88
115
  source_language=source_language,
89
116
  target_language=target_language,
117
+ description=description,
118
+ examples=examples,
90
119
  **kw
91
120
  )(series, index)
92
121
 
@@ -99,7 +128,7 @@ class TextLLM(LLM):
99
128
  index=None,
100
129
  **kw
101
130
  ):
102
- return TextLLMClassifyOperator(
131
+ return TextLLMClassifyOp(
103
132
  model=self,
104
133
  labels=labels,
105
134
  task="classify",
@@ -108,10 +137,51 @@ class TextLLM(LLM):
108
137
  **kw
109
138
  )(series, index)
110
139
 
140
+ def extract(
141
+ self,
142
+ series,
143
+ schema: Any,
144
+ description: str = None,
145
+ examples: List[Tuple[str, str]] = None,
146
+ index=None,
147
+ **kw
148
+ ):
149
+ import inspect
150
+
151
+ from pydantic import BaseModel
152
+
153
+ if inspect.isclass(schema) and issubclass(schema, BaseModel):
154
+ schema = schema.model_json_schema()
155
+
156
+ return TextLLMExtractOp(
157
+ model=self,
158
+ schema=schema,
159
+ task="extract",
160
+ description=description,
161
+ examples=examples,
162
+ **kw
163
+ )(series, index)
164
+
165
+
166
+ TextLLM = TextGenLLM # for old client compatibility
167
+
168
+
169
+ class TextEmbeddingModel(LLM):
170
+ def embed(
171
+ self,
172
+ data: Series,
173
+ dimensions: int,
174
+ encoding_format: str,
175
+ simple_output: bool,
176
+ params: Dict[str, Any],
177
+ **kw
178
+ ):
179
+ raise NotImplementedError
180
+
111
181
 
112
182
  def generate(
113
183
  data,
114
- model: TextLLM,
184
+ model: TextGenLLM,
115
185
  prompt_template: List[Dict[str, Any]],
116
186
  params: Dict[str, Any] = None,
117
187
  ):
@@ -141,11 +211,11 @@ def generate(
141
211
 
142
212
  Examples
143
213
  --------
144
- >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextLLM
214
+ >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
145
215
  >>> import maxframe.dataframe as md
146
216
  >>>
147
217
  >>> # Initialize the model
148
- >>> llm = ManagedTextLLM(name="Qwen2.5-0.5B-instruct")
218
+ >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
149
219
  >>>
150
220
  >>> # Prepare prompt template
151
221
  >>> messages = [
@@ -164,14 +234,14 @@ def generate(
164
234
  """
165
235
  if not isinstance(data, DataFrame) and not isinstance(data, Series):
166
236
  raise ValueError("data must be a maxframe dataframe or series object")
167
- if not isinstance(model, TextLLM):
237
+ if not isinstance(model, TextGenLLM):
168
238
  raise TypeError("model must be a TextLLM object")
169
239
  params = params if params is not None else dict()
170
240
  model.validate_params(params)
171
241
  return model.generate(data, prompt_template=prompt_template, params=params)
172
242
 
173
243
 
174
- def summary(series, model: TextLLM, index=None):
244
+ def summary(series, model: TextGenLLM, index=None):
175
245
  """
176
246
  Generate summaries for text content in a series using a language model.
177
247
 
@@ -180,15 +250,35 @@ def summary(series, model: TextLLM, index=None):
180
250
  series : Series
181
251
  A maxframe Series containing text data to be summarized.
182
252
  Each element should be a text string.
183
- model : TextLLM
253
+ model : TextGenLLM
184
254
  Language model instance used for text summarization.
185
255
  index : array-like, optional
186
256
  Index for the output series, by default None, will generate new index.
187
257
 
188
258
  Returns
189
259
  -------
190
- maxframe.Series
191
- A pandas Series containing the generated summaries and success status.
260
+ DataFrame
261
+ A DataFrame containing the generated summaries and success status.
262
+ Columns include 'summary' (generated summary text) and 'success' (boolean status).
263
+ If 'success' is False, the 'summary' column will contain error information instead of the expected output.
264
+
265
+ Examples
266
+ --------
267
+ >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
268
+ >>> import maxframe.dataframe as md
269
+ >>>
270
+ >>> # Initialize the model
271
+ >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
272
+ >>>
273
+ >>> # Create sample data
274
+ >>> texts = md.Series([
275
+ ... "Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed.",
276
+ ... "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data."
277
+ ... ])
278
+ >>>
279
+ >>> # Generate summaries
280
+ >>> result = summary(texts, llm)
281
+ >>> result.execute()
192
282
 
193
283
  Notes
194
284
  -----
@@ -205,35 +295,54 @@ def summary(series, model: TextLLM, index=None):
205
295
 
206
296
 
207
297
  def translate(
208
- series, model: TextLLM, source_language: str, target_language: str, index=None
298
+ series, model: TextGenLLM, source_language: str, target_language: str, index=None
209
299
  ):
210
300
  """
211
301
  Translate text content in a series using a language model from source language to target language.
212
302
 
213
303
  Parameters
214
304
  ----------
215
- series : pandas.Series
305
+ series : Series
216
306
  A maxframe Series containing text data to translate.
217
307
  Each element should be a text string.
218
- model : TextLLM
219
- Language model instance used for text summarization.
308
+ model : TextGenLLM
309
+ Language model instance used for text translation.
220
310
  source_language : str
221
- Source language of the text.
311
+ Source language of the text (e.g., 'en', 'zh', 'ja').
222
312
  target_language : str
223
- Target language of the text.
313
+ Target language for translation (e.g., 'en', 'zh', 'ja').
224
314
  index : array-like, optional
225
315
  Index for the output series, by default None, will generate new index.
226
316
 
227
317
  Returns
228
318
  -------
229
- maxframe.Series
230
- A pandas Series containing the generated translation and success status.
319
+ DataFrame
320
+ A DataFrame containing the generated translations and success status.
321
+ Columns include 'output' (translated text) and 'success' (boolean status).
322
+ If 'success' is False, the 'output' column will contain error information instead of the expected output.
323
+
324
+ Examples
325
+ --------
326
+ >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
327
+ >>> import maxframe.dataframe as md
328
+ >>>
329
+ >>> # Initialize the model
330
+ >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
331
+ >>>
332
+ >>> # Create sample data
333
+ >>> texts = md.Series([
334
+ ... "Hello, how are you?",
335
+ ... "Machine learning is fascinating."
336
+ ... ])
337
+ >>>
338
+ >>> # Translate from English to Chinese
339
+ >>> result = translate(texts, llm, source_language="en", target_language="zh")
340
+ >>> result.execute()
231
341
 
232
342
  Notes
233
343
  -----
234
344
  **Preview:** This API is in preview state and may be unstable.
235
345
  The interface may change in future releases.
236
-
237
346
  """
238
347
  if not isinstance(series, Series):
239
348
  raise ValueError("series must be a maxframe series object")
@@ -249,36 +358,63 @@ def translate(
249
358
 
250
359
  def classify(
251
360
  series,
252
- model: TextLLM,
361
+ model: TextGenLLM,
253
362
  labels: List[str],
254
363
  description: str = None,
255
364
  examples: List[Dict[str, str]] = None,
256
365
  index=None,
257
366
  ):
258
367
  """
259
- Classify text content in a series with given labels.
368
+ Classify text content in a series with given labels using a language model.
260
369
 
261
370
  Parameters
262
371
  ----------
263
- series : pandas.Series
372
+ series : Series
264
373
  A maxframe Series containing text data to be classified.
265
374
  Each element should be a text string.
266
- model : TextLLM
267
- Language model instance used for text summarization.
375
+ model : TextGenLLM
376
+ Language model instance used for text classification.
268
377
  labels : List[str]
269
- List of labels to classify the text.
270
- description : str
271
- Description of the classification task.
272
- examples : List[Dict[str, Dict[str, str]]]
273
- Examples of the classification task, like [{ "text": "text...", "label":"A", reason : "reason..."}], help
274
- LLM to better understand your rules.
378
+ List of labels to classify the text into.
379
+ description : str, optional
380
+ Description of the classification task to help the model understand the context.
381
+ examples : List[Dict[str, str]], optional
382
+ Examples of the classification task, like [{"text": "text...", "label": "A", "reason": "reason..."}],
383
+ to help LLM better understand your classification rules.
275
384
  index : array-like, optional
276
385
  Index for the output series, by default None, will generate new index.
277
386
 
278
387
  Returns
279
388
  -------
280
- maxframe.Series
281
- A pandas Series containing the generated classification results and success status.
389
+ DataFrame
390
+ A DataFrame containing the generated classification results and success status.
391
+ Columns include 'label' (predicted label), 'reason' (reasoning), and 'success' (boolean status).
392
+ If 'success' is False, the 'label' and 'reason' columns will contain error information instead of the expected output.
393
+
394
+ Examples
395
+ --------
396
+ >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
397
+ >>> import maxframe.dataframe as md
398
+ >>>
399
+ >>> # Initialize the model
400
+ >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
401
+ >>>
402
+ >>> # Create sample data
403
+ >>> texts = md.Series([
404
+ ... "I love this product! It's amazing!",
405
+ ... "This is terrible, worst purchase ever.",
406
+ ... "It's okay, nothing special."
407
+ ... ])
408
+ >>>
409
+ >>> # Classify sentiment
410
+ >>> labels = ["positive", "negative", "neutral"]
411
+ >>> description = "Classify the sentiment of customer reviews"
412
+ >>> examples = [
413
+ ... {"text": "Great product!", "label": "positive", "reason": "Expresses satisfaction"},
414
+ ... {"text": "Poor quality", "label": "negative", "reason": "Expresses dissatisfaction"}
415
+ ... ]
416
+ >>> result = classify(texts, llm, labels=labels, description=description, examples=examples)
417
+ >>> result.execute()
282
418
 
283
419
  Notes
284
420
  -----
@@ -300,3 +436,173 @@ def classify(
300
436
  return model.classify(
301
437
  series, labels=labels, description=description, examples=examples, index=index
302
438
  )
439
+
440
+
441
+ def extract(
442
+ series,
443
+ model: TextGenLLM,
444
+ schema: Any,
445
+ description: str = None,
446
+ examples: List[Tuple[str, str]] = None,
447
+ index=None,
448
+ ):
449
+ """
450
+ Extract structured information from text content in a series using a language model.
451
+
452
+ Parameters
453
+ ----------
454
+ series : Series
455
+ A maxframe Series containing text data to extract information from.
456
+ Each element should be a text string.
457
+ model : TextGenLLM
458
+ Language model instance used for information extraction.
459
+ schema : Any
460
+ Schema definition for the extraction. Can be a dictionary defining the structure
461
+ or a Pydantic BaseModel class that will be converted to JSON schema.
462
+ description : str, optional
463
+ Description of the extraction task to help the model understand what to extract.
464
+ examples : List[Tuple[str, str]], optional
465
+ Examples of the extraction task in format [(input_text, expected_output), ...],
466
+ to help LLM better understand the extraction requirements.
467
+ index : array-like, optional
468
+ Index for the output series, by default None, will generate new index.
469
+
470
+ Returns
471
+ -------
472
+ DataFrame
473
+ A DataFrame containing the extracted information and success status.
474
+ Columns include 'output' (extracted structured data) and 'success' (boolean status).
475
+ If 'success' is False, the 'output' column will contain error information instead of the expected output.
476
+
477
+ Examples
478
+ --------
479
+ >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
480
+ >>> import maxframe.dataframe as md
481
+ >>>
482
+ >>> # Initialize the model
483
+ >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
484
+ >>>
485
+ >>> # Create sample data
486
+ >>> texts = md.Series([
487
+ ... "John Smith, age 30, works as a Software Engineer at Google.",
488
+ ... "Alice Johnson, 25 years old, is a Data Scientist at Microsoft."
489
+ ... ])
490
+ >>>
491
+ >>> # Define extraction schema
492
+ >>> schema = {
493
+ ... "name": "string",
494
+ ... "age": "integer",
495
+ ... "job_title": "string",
496
+ ... "company": "string"
497
+ ... }
498
+ >>>
499
+ >>> # Extract structured information
500
+ >>> description = "Extract person information from text"
501
+ >>> examples = [
502
+ ... ("Bob Brown, 35, Manager at Apple", '{"name": "Bob Brown", "age": 35, "job_title": "Manager", "company": "Apple"}')
503
+ ... ]
504
+ >>> result = extract(texts, llm, schema=schema, description=description, examples=examples)
505
+ >>> result.execute()
506
+
507
+ Notes
508
+ -----
509
+ **Preview:** This API is in preview state and may be unstable.
510
+ The interface may change in future releases.
511
+ """
512
+ if not isinstance(series, Series):
513
+ raise ValueError("series must be a maxframe series object")
514
+ if series.dtype != np.str_:
515
+ raise ValueError("extract input must be a string series")
516
+ if not schema:
517
+ raise ValueError("schema must not be empty")
518
+ if (
519
+ examples
520
+ and not isinstance(examples, list)
521
+ or not any(isinstance(x, Tuple) for x in examples)
522
+ ):
523
+ raise ValueError("examples must be a list of tuples, format is (input, output)")
524
+ return model.extract(
525
+ series, schema=schema, description=description, examples=examples, index=index
526
+ )
527
+
528
+
529
+ def embed(
530
+ series,
531
+ model: TextEmbeddingModel,
532
+ dimensions: int = None,
533
+ encoding_format: str = None,
534
+ simple_output: bool = False,
535
+ params: Dict[str, Any] = None,
536
+ index=None,
537
+ ):
538
+ """
539
+ Embed text content in a series using a text embedding model.
540
+
541
+ Parameters
542
+ ----------
543
+ series : Series
544
+ A maxframe Series containing text data to be embedded.
545
+ Each element should be a text string.
546
+ model : TextEmbeddingModel
547
+ Text embedding model instance used for generating embeddings.
548
+ dimensions : int, optional
549
+ Dimensions of the embedding vectors. If not specified, uses model default.
550
+ encoding_format : str, optional
551
+ Encoding format of the embedding (e.g., 'float', 'base64'). If not specified, uses model default.
552
+ simple_output : bool, optional
553
+ Whether to return the embedding data directly without additional metadata, by default False.
554
+ params : Dict[str, Any], optional
555
+ Additional parameters for embedding configuration, by default None.
556
+ Can include model-specific settings.
557
+ index : array-like, optional
558
+ Index for the output series, by default None, will generate new index.
559
+
560
+ Returns
561
+ -------
562
+ DataFrame
563
+ A DataFrame containing the generated embeddings and success status.
564
+ Columns include 'response' (embedding vectors) and 'success' (boolean status).
565
+ If 'success' is False, the 'response' column will contain error information instead of the expected output.
566
+
567
+ Examples
568
+ --------
569
+ >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextEmbeddingModel
570
+ >>> import maxframe.dataframe as md
571
+ >>>
572
+ >>> # Initialize the embedding model
573
+ >>> embedding_model = ManagedTextEmbeddingModel(name="text-embedding-ada-002")
574
+ >>>
575
+ >>> # Create sample data
576
+ >>> texts = md.Series([
577
+ ... "Machine learning is a powerful technology.",
578
+ ... "Natural language processing enables computers to understand text.",
579
+ ... "Deep learning uses neural networks for pattern recognition."
580
+ ... ])
581
+ >>>
582
+ >>> # Generate embeddings
583
+ >>> result = embed(texts, embedding_model, simple_output=True)
584
+ >>> result.execute()
585
+
586
+ Notes
587
+ -----
588
+ **Preview:** This API is in preview state and may be unstable.
589
+ The interface may change in future releases.
590
+ """
591
+ if not isinstance(series, Series):
592
+ raise ValueError("series must be a maxframe series object")
593
+ if series.dtype != np.str_:
594
+ raise ValueError("embed input must be a string series")
595
+ return model.embed(
596
+ series,
597
+ dimensions=dimensions,
598
+ encoding_format=encoding_format,
599
+ simple_output=simple_output,
600
+ params=params,
601
+ index=index,
602
+ )
603
+
604
+
605
+ TextLLMExtractOperator = TextLLMExtractOp
606
+ TextLLMSummarizeOperator = TextLLMSummarizeOp
607
+ TextLLMTranslateOperator = TextLLMTranslateOp
608
+ TextLLMClassifyOperator = TextLLMClassifyOp
@@ -40,7 +40,10 @@ class ModelWithEvalData(ModelData):
40
40
  def execute(self, session=None, **kw):
41
41
  # The evals_result should be fetched when BoosterData.execute() is called.
42
42
  result = super().execute(session=session, **kw)
43
- if self.op.has_evals_result and self.key == self.op.outputs[0].key:
43
+ if (
44
+ getattr(self.op, "has_evals_result", None)
45
+ and self.key == self.op.outputs[0].key
46
+ ):
44
47
  self._evals_result.update(self.op.outputs[1].fetch(session=session))
45
48
  return result
46
49
 
@@ -18,6 +18,7 @@ import numpy as np
18
18
 
19
19
  from .... import tensor as mt
20
20
  from ....tensor.merge.vstack import _vstack
21
+ from ...utils.odpsio import register_odps_model
21
22
  from ..utils import make_import_error_func
22
23
  from .core import XGBScikitLearnBase, xgboost
23
24
 
@@ -28,6 +29,7 @@ else:
28
29
 
29
30
  from .predict import predict
30
31
 
32
+ @register_odps_model
31
33
  class XGBClassifier(XGBScikitLearnBase, XGBClassifierBase):
32
34
  """
33
35
  Implementation of the scikit-learn API for XGBoost classification.
@@ -24,11 +24,11 @@ from ....udf import builtin_function
24
24
 
25
25
  try:
26
26
  import xgboost
27
- except ImportError:
27
+ except ImportError: # pragma: no cover
28
28
  xgboost = None
29
29
 
30
- from ....core import OutputType
31
- from ...utils.odpsio import ToODPSModelMixin
30
+ from ....core import OutputType, enter_mode, is_kernel_mode
31
+ from ...utils.odpsio import ODPSModelMixin, ReadODPSModel
32
32
  from ..models import ModelApplyChunk, ModelWithEval, ModelWithEvalData, to_remote_model
33
33
  from .dmatrix import DMatrix
34
34
 
@@ -40,6 +40,14 @@ _xgb_type_to_np_type = {
40
40
 
41
41
 
42
42
  class BoosterData(ModelWithEvalData):
43
+ def save_config(self) -> str:
44
+ try:
45
+ return self.fetch().save_config()
46
+ except:
47
+ if is_kernel_mode():
48
+ return "{}"
49
+ raise
50
+
43
51
  @staticmethod
44
52
  def _get_booster_score(bst, fmap=None, importance_type="weight"):
45
53
  if not fmap:
@@ -157,7 +165,7 @@ if not xgboost:
157
165
  XGBScikitLearnBase = None
158
166
  else:
159
167
 
160
- class XGBScikitLearnBase(xgboost.XGBModel, ToODPSModelMixin):
168
+ class XGBScikitLearnBase(xgboost.XGBModel, ODPSModelMixin):
161
169
  """
162
170
  Base class for implementing scikit-learn interface
163
171
  """
@@ -218,7 +226,8 @@ else:
218
226
  sample_weight_eval_set,
219
227
  base_margin_eval_set,
220
228
  )
221
- params = self.get_xgb_params()
229
+ with enter_mode(kernel=True):
230
+ params = self.get_xgb_params()
222
231
  if not params.get("objective"):
223
232
  params["objective"] = "reg:squarederror"
224
233
  self.evals_result_ = dict()
@@ -351,16 +360,31 @@ else:
351
360
  evals_result=self.evals_result_t_, local_info=local_info
352
361
  )
353
362
 
354
- def _get_odps_model_info(self) -> ToODPSModelMixin.ODPSModelInfo:
363
+ def _get_odps_model_info(self) -> ODPSModelMixin.ODPSModelInfo:
355
364
  model_format = (
356
365
  "BOOSTED_TREE_CLASSIFIER"
357
366
  if hasattr(self, "predict_proba")
358
367
  else "BOOSTED_TREE_REGRESSOR"
359
368
  )
360
- return ToODPSModelMixin.ODPSModelInfo(
369
+ return ODPSModelMixin.ODPSModelInfo(
361
370
  model_format=model_format, model_params=self._Booster
362
371
  )
363
372
 
373
+ @classmethod
374
+ def _build_odps_source_model(cls, op: ReadODPSModel) -> Any:
375
+ if not (
376
+ op.format == "BOOSTED_TREE_CLASSIFIER" and hasattr(cls, "predict_proba")
377
+ ) and not (
378
+ op.format == "BOOSTED_TREE_REGRESSOR"
379
+ and not hasattr(cls, "predict_proba")
380
+ ):
381
+ return None
382
+ op._output_types = [OutputType.object]
383
+ booster = op.new_tileable(None, object_class=Booster)
384
+ estimator = cls()
385
+ estimator._Booster = booster
386
+ return estimator
387
+
364
388
  def wrap_evaluation_matrices(
365
389
  missing: float,
366
390
  X: Any,
@@ -15,6 +15,7 @@
15
15
  from typing import List
16
16
 
17
17
  import numpy as np
18
+ import pandas as pd
18
19
 
19
20
  from .... import opcodes
20
21
  from ....core import EntityData
@@ -62,9 +63,10 @@ class XGBPredict(Operator, TileableOperatorMixin):
62
63
 
63
64
  def __call__(self):
64
65
  num_class = getattr(self.model.op, "num_class", None)
65
- if num_class is not None:
66
+ output_ndim = getattr(self.model.op, "output_ndim", None)
67
+ if num_class is not None and not pd.isna(num_class):
66
68
  num_class = int(num_class)
67
- if num_class is not None and num_class > 2:
69
+ if num_class is not None and (num_class > 2 or output_ndim == 2):
68
70
  shape = (self.data.shape[0], num_class)
69
71
  else:
70
72
  shape = (self.data.shape[0],)