PyPI - maxframe - Versions diffs - 2.2.0__cp38-cp38-macosx_10_9_universal2.whl → 2.3.0rc1__cp38-cp38-macosx_10_9_universal2.whl - Mend

maxframe 2.2.0__cp38-cp38-macosx_10_9_universal2.whl → 2.3.0rc1__cp38-cp38-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic. Click here for more details.

Files changed (114)

maxframe/_utils.cpython-38-darwin.so +0 -0
maxframe/codegen/core.py +3 -2
maxframe/codegen/spe/dataframe/merge.py +4 -0
maxframe/codegen/spe/dataframe/misc.py +2 -0
maxframe/codegen/spe/dataframe/reduction.py +18 -0
maxframe/codegen/spe/dataframe/sort.py +9 -1
maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
maxframe/codegen/spe/dataframe/tseries.py +9 -0
maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
maxframe/codegen/spe/tensor/datasource.py +1 -0
maxframe/config/config.py +3 -0
maxframe/conftest.py +10 -0
maxframe/core/base.py +2 -1
maxframe/core/entity/tileables.py +2 -0
maxframe/core/graph/core.cpython-38-darwin.so +0 -0
maxframe/core/graph/entity.py +7 -1
maxframe/core/mode.py +6 -1
maxframe/dataframe/__init__.py +2 -2
maxframe/dataframe/arithmetic/__init__.py +4 -0
maxframe/dataframe/arithmetic/maximum.py +33 -0
maxframe/dataframe/arithmetic/minimum.py +33 -0
maxframe/dataframe/core.py +98 -106
maxframe/dataframe/datasource/core.py +6 -0
maxframe/dataframe/datasource/direct.py +57 -0
maxframe/dataframe/datasource/read_csv.py +19 -11
maxframe/dataframe/datasource/read_odps_query.py +29 -6
maxframe/dataframe/datasource/read_odps_table.py +32 -10
maxframe/dataframe/datasource/read_parquet.py +38 -39
maxframe/dataframe/datastore/__init__.py +6 -0
maxframe/dataframe/datastore/direct.py +268 -0
maxframe/dataframe/datastore/to_odps.py +6 -0
maxframe/dataframe/extensions/flatjson.py +2 -1
maxframe/dataframe/groupby/__init__.py +5 -1
maxframe/dataframe/groupby/aggregation.py +10 -6
maxframe/dataframe/groupby/apply_chunk.py +1 -3
maxframe/dataframe/groupby/core.py +20 -4
maxframe/dataframe/indexing/__init__.py +2 -1
maxframe/dataframe/indexing/insert.py +45 -17
maxframe/dataframe/merge/__init__.py +3 -0
maxframe/dataframe/merge/combine.py +244 -0
maxframe/dataframe/misc/__init__.py +14 -3
maxframe/dataframe/misc/check_unique.py +41 -10
maxframe/dataframe/misc/drop.py +31 -0
maxframe/dataframe/misc/infer_dtypes.py +251 -0
maxframe/dataframe/misc/map.py +31 -18
maxframe/dataframe/misc/repeat.py +159 -0
maxframe/dataframe/misc/tests/test_misc.py +35 -1
maxframe/dataframe/missing/checkna.py +3 -2
maxframe/dataframe/reduction/__init__.py +10 -5
maxframe/dataframe/reduction/aggregation.py +6 -6
maxframe/dataframe/reduction/argmax.py +7 -4
maxframe/dataframe/reduction/argmin.py +7 -4
maxframe/dataframe/reduction/core.py +18 -9
maxframe/dataframe/reduction/mode.py +144 -0
maxframe/dataframe/reduction/nunique.py +10 -3
maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
maxframe/dataframe/sort/__init__.py +9 -2
maxframe/dataframe/sort/argsort.py +7 -1
maxframe/dataframe/sort/core.py +1 -1
maxframe/dataframe/sort/rank.py +147 -0
maxframe/dataframe/tseries/__init__.py +19 -0
maxframe/dataframe/tseries/at_time.py +61 -0
maxframe/dataframe/tseries/between_time.py +122 -0
maxframe/dataframe/utils.py +30 -26
maxframe/learn/contrib/llm/core.py +16 -7
maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
maxframe/learn/contrib/llm/deploy/config.py +221 -0
maxframe/learn/contrib/llm/deploy/core.py +247 -0
maxframe/learn/contrib/llm/deploy/framework.py +35 -0
maxframe/learn/contrib/llm/deploy/loader.py +360 -0
maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
maxframe/learn/contrib/llm/models/__init__.py +1 -0
maxframe/learn/contrib/llm/models/dashscope.py +12 -6
maxframe/learn/contrib/llm/models/managed.py +76 -11
maxframe/learn/contrib/llm/models/openai.py +72 -0
maxframe/learn/contrib/llm/tests/__init__.py +13 -0
maxframe/learn/contrib/llm/tests/test_core.py +34 -0
maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
maxframe/learn/contrib/llm/text.py +348 -42
maxframe/learn/contrib/models.py +4 -1
maxframe/learn/contrib/xgboost/classifier.py +2 -0
maxframe/learn/contrib/xgboost/core.py +31 -7
maxframe/learn/contrib/xgboost/predict.py +4 -2
maxframe/learn/contrib/xgboost/regressor.py +5 -0
maxframe/learn/contrib/xgboost/train.py +2 -0
maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
maxframe/learn/utils/__init__.py +1 -0
maxframe/learn/utils/extmath.py +42 -9
maxframe/learn/utils/odpsio.py +80 -11
maxframe/lib/filesystem/_oss_lib/common.py +2 -0
maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
maxframe/opcodes.py +9 -1
maxframe/remote/core.py +4 -0
maxframe/serialization/core.cpython-38-darwin.so +0 -0
maxframe/serialization/tests/test_serial.py +2 -2
maxframe/tensor/arithmetic/__init__.py +1 -1
maxframe/tensor/arithmetic/core.py +2 -2
maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
maxframe/tensor/core.py +3 -0
maxframe/tensor/misc/copyto.py +1 -1
maxframe/tests/test_udf.py +61 -0
maxframe/tests/test_utils.py +8 -5
maxframe/udf.py +103 -7
maxframe/utils.py +61 -8
{maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
{maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
maxframe_client/session/task.py +8 -1
maxframe_client/tests/test_session.py +24 -0
maxframe/dataframe/arrays.py +0 -864
{maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
{maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0

maxframe/learn/contrib/llm/text.py CHANGED

@@ -12,18 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Tuple
 import numpy as np
 from .... import opcodes
 from ....dataframe.core import DataFrame, Series
-from ....serialization.serializables import FieldTypes, ListField, StringField
+from ....serialization.serializables import (
+    DictField,
+    FieldTypes,
+    ListField,
+    StringField,
+)
 from .core import LLM, LLMTaskOperator
-class TextLLMSummarizeOperator(LLMTaskOperator):
+class TextLLMSummarizeOp(LLMTaskOperator):
     _op_type_ = opcodes.LLM_TEXT_SUMMARIZE_TASK
+    _legacy_name = "TextLLMSummarizeOperator"  # since v2.3.0
     def get_output_dtypes(self) -> Dict[str, np.dtype]:
         return {
@@ -32,21 +38,25 @@ class TextLLMSummarizeOperator(LLMTaskOperator):
         }
-class TextLLMTranslateOperator(LLMTaskOperator):
+class TextLLMTranslateOp(LLMTaskOperator):
     _op_type_ = opcodes.LLM_TEXT_TRANSLATE_TASK
+    _legacy_name = "TextLLMTranslateOperator"  # since v2.3.0
     source_language = StringField("source_language")
     target_language = StringField("target_language")
+    description = StringField("description", default=None)
+    examples = ListField("examples", FieldTypes.dict, default=None)
     def get_output_dtypes(self) -> Dict[str, np.dtype]:
         return {
-            "target": np.dtype("O"),
+            "output": np.dtype("O"),
             "success": np.dtype("bool"),
         }
-class TextLLMClassifyOperator(LLMTaskOperator):
+class TextLLMClassifyOp(LLMTaskOperator):
     _op_type_ = opcodes.LLM_TEXT_CLASSIFY_TASK
+    _legacy_name = "TextLLMClassifyOperator"  # since v2.3.0
     labels = ListField("labels")
     description = StringField("description", default=None)
@@ -60,7 +70,24 @@ class TextLLMClassifyOperator(LLMTaskOperator):
         }
-class TextLLM(LLM):
+class TextLLMExtractOp(LLMTaskOperator):
+    _op_type_ = opcodes.LLM_TEXT_EXTRACT_TASK
+    _legacy_name = "TextLLMExtractOperator"  # since v2.3.0
+    schema = DictField("schema", FieldTypes.string, FieldTypes.any, default=None)
+    description = StringField("description", default=None)
+    examples = ListField("examples", FieldTypes.dict, default_factory=None)
+    def get_output_dtypes(self) -> Dict[str, np.dtype]:
+        return {
+            "output": np.dtype("O"),
+            "success": np.dtype("bool"),
+        }
+class TextGenLLM(LLM):
+    _legacy_name = "TextLLM"  # since v2.3.0
     def generate(
         self,
         data,
@@ -70,23 +97,25 @@ class TextLLM(LLM):
         raise NotImplementedError
     def summarize(self, series, index=None, **kw):
-        return TextLLMSummarizeOperator(model=self, task="summarize", **kw)(
-            series, index
-        )
+        return TextLLMSummarizeOp(model=self, task="summarize", **kw)(series, index)
     def translate(
         self,
         series,
         target_language: str,
         source_language: str = None,
+        description: str = None,
+        examples: List[Dict[str, str]] = None,
         index=None,
         **kw
     ):
-        return TextLLMTranslateOperator(
+        return TextLLMTranslateOp(
             model=self,
             task="translate",
             source_language=source_language,
             target_language=target_language,
+            description=description,
+            examples=examples,
             **kw
         )(series, index)
@@ -99,7 +128,7 @@ class TextLLM(LLM):
         index=None,
         **kw
     ):
-        return TextLLMClassifyOperator(
+        return TextLLMClassifyOp(
             model=self,
             labels=labels,
             task="classify",
@@ -108,10 +137,51 @@ class TextLLM(LLM):
             **kw
         )(series, index)
+    def extract(
+        self,
+        series,
+        schema: Any,
+        description: str = None,
+        examples: List[Tuple[str, str]] = None,
+        index=None,
+        **kw
+    ):
+        import inspect
+        from pydantic import BaseModel
+        if inspect.isclass(schema) and issubclass(schema, BaseModel):
+            schema = schema.model_json_schema()
+        return TextLLMExtractOp(
+            model=self,
+            schema=schema,
+            task="extract",
+            description=description,
+            examples=examples,
+            **kw
+        )(series, index)
+TextLLM = TextGenLLM  # for old client compatibility
+class TextEmbeddingModel(LLM):
+    def embed(
+        self,
+        data: Series,
+        dimensions: int,
+        encoding_format: str,
+        simple_output: bool,
+        params: Dict[str, Any],
+        **kw
+    ):
+        raise NotImplementedError
 def generate(
     data,
-    model: TextLLM,
+    model: TextGenLLM,
     prompt_template: List[Dict[str, Any]],
     params: Dict[str, Any] = None,
 ):
@@ -141,11 +211,11 @@ def generate(
     Examples
     --------
-    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextLLM
+    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
     >>> import maxframe.dataframe as md
     >>>
     >>> # Initialize the model
-    >>> llm = ManagedTextLLM(name="Qwen2.5-0.5B-instruct")
+    >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
     >>>
     >>> # Prepare prompt template
     >>> messages = [
@@ -164,14 +234,14 @@ def generate(
     """
     if not isinstance(data, DataFrame) and not isinstance(data, Series):
         raise ValueError("data must be a maxframe dataframe or series object")
-    if not isinstance(model, TextLLM):
+    if not isinstance(model, TextGenLLM):
         raise TypeError("model must be a TextLLM object")
     params = params if params is not None else dict()
     model.validate_params(params)
     return model.generate(data, prompt_template=prompt_template, params=params)
-def summary(series, model: TextLLM, index=None):
+def summary(series, model: TextGenLLM, index=None):
     """
     Generate summaries for text content in a series using a language model.
@@ -180,15 +250,35 @@ def summary(series, model: TextLLM, index=None):
     series : Series
         A maxframe Series containing text data to be summarized.
         Each element should be a text string.
-    model : TextLLM
+    model : TextGenLLM
         Language model instance used for text summarization.
     index : array-like, optional
         Index for the output series, by default None, will generate new index.
     Returns
     -------
-    maxframe.Series
-        A pandas Series containing the generated summaries and success status.
+    DataFrame
+        A DataFrame containing the generated summaries and success status.
+        Columns include 'summary' (generated summary text) and 'success' (boolean status).
+        If 'success' is False, the 'summary' column will contain error information instead of the expected output.
+    Examples
+    --------
+    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
+    >>> import maxframe.dataframe as md
+    >>>
+    >>> # Initialize the model
+    >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
+    >>>
+    >>> # Create sample data
+    >>> texts = md.Series([
+    ...     "Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed.",
+    ...     "Deep learning uses neural networks with multiple layers to model and understand complex patterns in data."
+    ... ])
+    >>>
+    >>> # Generate summaries
+    >>> result = summary(texts, llm)
+    >>> result.execute()
     Notes
     -----
@@ -205,35 +295,54 @@ def summary(series, model: TextLLM, index=None):
 def translate(
-    series, model: TextLLM, source_language: str, target_language: str, index=None
+    series, model: TextGenLLM, source_language: str, target_language: str, index=None
 ):
     """
     Translate text content in a series using a language model from source language to target language.
     Parameters
     ----------
-    series : pandas.Series
+    series : Series
         A maxframe Series containing text data to translate.
         Each element should be a text string.
-    model : TextLLM
-        Language model instance used for text summarization.
+    model : TextGenLLM
+        Language model instance used for text translation.
     source_language : str
-        Source language of the text.
+        Source language of the text (e.g., 'en', 'zh', 'ja').
     target_language : str
-        Target language of the text.
+        Target language for translation (e.g., 'en', 'zh', 'ja').
     index : array-like, optional
         Index for the output series, by default None, will generate new index.
     Returns
     -------
-    maxframe.Series
-        A pandas Series containing the generated translation and success status.
+    DataFrame
+        A DataFrame containing the generated translations and success status.
+        Columns include 'output' (translated text) and 'success' (boolean status).
+        If 'success' is False, the 'output' column will contain error information instead of the expected output.
+    Examples
+    --------
+    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
+    >>> import maxframe.dataframe as md
+    >>>
+    >>> # Initialize the model
+    >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
+    >>>
+    >>> # Create sample data
+    >>> texts = md.Series([
+    ...     "Hello, how are you?",
+    ...     "Machine learning is fascinating."
+    ... ])
+    >>>
+    >>> # Translate from English to Chinese
+    >>> result = translate(texts, llm, source_language="en", target_language="zh")
+    >>> result.execute()
     Notes
     -----
       **Preview:** This API is in preview state and may be unstable.
       The interface may change in future releases.
     """
     if not isinstance(series, Series):
         raise ValueError("series must be a maxframe series object")
@@ -249,36 +358,63 @@ def translate(
 def classify(
     series,
-    model: TextLLM,
+    model: TextGenLLM,
     labels: List[str],
     description: str = None,
     examples: List[Dict[str, str]] = None,
     index=None,
 ):
     """
-    Classify text content in a series with given labels.
+    Classify text content in a series with given labels using a language model.
     Parameters
     ----------
-    series : pandas.Series
+    series : Series
         A maxframe Series containing text data to be classified.
         Each element should be a text string.
-    model : TextLLM
-        Language model instance used for text summarization.
+    model : TextGenLLM
+        Language model instance used for text classification.
     labels : List[str]
-        List of labels to classify the text.
-    description : str
-        Description of the classification task.
-    examples : List[Dict[str, Dict[str, str]]]
-        Examples of the classification task, like [{ "text": "text...", "label":"A", reason : "reason..."}], help
-        LLM to better understand your rules.
+        List of labels to classify the text into.
+    description : str, optional
+        Description of the classification task to help the model understand the context.
+    examples : List[Dict[str, str]], optional
+        Examples of the classification task, like [{"text": "text...", "label": "A", "reason": "reason..."}],
+        to help LLM better understand your classification rules.
     index : array-like, optional
         Index for the output series, by default None, will generate new index.
     Returns
     -------
-    maxframe.Series
-        A pandas Series containing the generated classification results and success status.
+    DataFrame
+        A DataFrame containing the generated classification results and success status.
+        Columns include 'label' (predicted label), 'reason' (reasoning), and 'success' (boolean status).
+        If 'success' is False, the 'label' and 'reason' columns will contain error information instead of the expected output.
+    Examples
+    --------
+    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
+    >>> import maxframe.dataframe as md
+    >>>
+    >>> # Initialize the model
+    >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
+    >>>
+    >>> # Create sample data
+    >>> texts = md.Series([
+    ...     "I love this product! It's amazing!",
+    ...     "This is terrible, worst purchase ever.",
+    ...     "It's okay, nothing special."
+    ... ])
+    >>>
+    >>> # Classify sentiment
+    >>> labels = ["positive", "negative", "neutral"]
+    >>> description = "Classify the sentiment of customer reviews"
+    >>> examples = [
+    ...     {"text": "Great product!", "label": "positive", "reason": "Expresses satisfaction"},
+    ...     {"text": "Poor quality", "label": "negative", "reason": "Expresses dissatisfaction"}
+    ... ]
+    >>> result = classify(texts, llm, labels=labels, description=description, examples=examples)
+    >>> result.execute()
     Notes
     -----
@@ -300,3 +436,173 @@ def classify(
     return model.classify(
         series, labels=labels, description=description, examples=examples, index=index
     )
+def extract(
+    series,
+    model: TextGenLLM,
+    schema: Any,
+    description: str = None,
+    examples: List[Tuple[str, str]] = None,
+    index=None,
+):
+    """
+    Extract structured information from text content in a series using a language model.
+    Parameters
+    ----------
+    series : Series
+        A maxframe Series containing text data to extract information from.
+        Each element should be a text string.
+    model : TextGenLLM
+        Language model instance used for information extraction.
+    schema : Any
+        Schema definition for the extraction. Can be a dictionary defining the structure
+        or a Pydantic BaseModel class that will be converted to JSON schema.
+    description : str, optional
+        Description of the extraction task to help the model understand what to extract.
+    examples : List[Tuple[str, str]], optional
+        Examples of the extraction task in format [(input_text, expected_output), ...],
+        to help LLM better understand the extraction requirements.
+    index : array-like, optional
+        Index for the output series, by default None, will generate new index.
+    Returns
+    -------
+    DataFrame
+        A DataFrame containing the extracted information and success status.
+        Columns include 'output' (extracted structured data) and 'success' (boolean status).
+        If 'success' is False, the 'output' column will contain error information instead of the expected output.
+    Examples
+    --------
+    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextGenLLM
+    >>> import maxframe.dataframe as md
+    >>>
+    >>> # Initialize the model
+    >>> llm = ManagedTextGenLLM(name="Qwen3-0.6B")
+    >>>
+    >>> # Create sample data
+    >>> texts = md.Series([
+    ...     "John Smith, age 30, works as a Software Engineer at Google.",
+    ...     "Alice Johnson, 25 years old, is a Data Scientist at Microsoft."
+    ... ])
+    >>>
+    >>> # Define extraction schema
+    >>> schema = {
+    ...     "name": "string",
+    ...     "age": "integer",
+    ...     "job_title": "string",
+    ...     "company": "string"
+    ... }
+    >>>
+    >>> # Extract structured information
+    >>> description = "Extract person information from text"
+    >>> examples = [
+    ...     ("Bob Brown, 35, Manager at Apple", '{"name": "Bob Brown", "age": 35, "job_title": "Manager", "company": "Apple"}')
+    ... ]
+    >>> result = extract(texts, llm, schema=schema, description=description, examples=examples)
+    >>> result.execute()
+    Notes
+    -----
+      **Preview:** This API is in preview state and may be unstable.
+      The interface may change in future releases.
+    """
+    if not isinstance(series, Series):
+        raise ValueError("series must be a maxframe series object")
+    if series.dtype != np.str_:
+        raise ValueError("extract input must be a string series")
+    if not schema:
+        raise ValueError("schema must not be empty")
+    if (
+        examples
+        and not isinstance(examples, list)
+        or not any(isinstance(x, Tuple) for x in examples)
+    ):
+        raise ValueError("examples must be a list of tuples, format is (input, output)")
+    return model.extract(
+        series, schema=schema, description=description, examples=examples, index=index
+    )
+def embed(
+    series,
+    model: TextEmbeddingModel,
+    dimensions: int = None,
+    encoding_format: str = None,
+    simple_output: bool = False,
+    params: Dict[str, Any] = None,
+    index=None,
+):
+    """
+    Embed text content in a series using a text embedding model.
+    Parameters
+    ----------
+    series : Series
+        A maxframe Series containing text data to be embedded.
+        Each element should be a text string.
+    model : TextEmbeddingModel
+        Text embedding model instance used for generating embeddings.
+    dimensions : int, optional
+        Dimensions of the embedding vectors. If not specified, uses model default.
+    encoding_format : str, optional
+        Encoding format of the embedding (e.g., 'float', 'base64'). If not specified, uses model default.
+    simple_output : bool, optional
+        Whether to return the embedding data directly without additional metadata, by default False.
+    params : Dict[str, Any], optional
+        Additional parameters for embedding configuration, by default None.
+        Can include model-specific settings.
+    index : array-like, optional
+        Index for the output series, by default None, will generate new index.
+    Returns
+    -------
+    DataFrame
+        A DataFrame containing the generated embeddings and success status.
+        Columns include 'response' (embedding vectors) and 'success' (boolean status).
+        If 'success' is False, the 'response' column will contain error information instead of the expected output.
+    Examples
+    --------
+    >>> from maxframe.learn.contrib.llm.models.managed import ManagedTextEmbeddingModel
+    >>> import maxframe.dataframe as md
+    >>>
+    >>> # Initialize the embedding model
+    >>> embedding_model = ManagedTextEmbeddingModel(name="text-embedding-ada-002")
+    >>>
+    >>> # Create sample data
+    >>> texts = md.Series([
+    ...     "Machine learning is a powerful technology.",
+    ...     "Natural language processing enables computers to understand text.",
+    ...     "Deep learning uses neural networks for pattern recognition."
+    ... ])
+    >>>
+    >>> # Generate embeddings
+    >>> result = embed(texts, embedding_model, simple_output=True)
+    >>> result.execute()
+    Notes
+    -----
+      **Preview:** This API is in preview state and may be unstable.
+      The interface may change in future releases.
+    """
+    if not isinstance(series, Series):
+        raise ValueError("series must be a maxframe series object")
+    if series.dtype != np.str_:
+        raise ValueError("embed input must be a string series")
+    return model.embed(
+        series,
+        dimensions=dimensions,
+        encoding_format=encoding_format,
+        simple_output=simple_output,
+        params=params,
+        index=index,
+    )
+TextLLMExtractOperator = TextLLMExtractOp
+TextLLMSummarizeOperator = TextLLMSummarizeOp
+TextLLMTranslateOperator = TextLLMTranslateOp
+TextLLMClassifyOperator = TextLLMClassifyOp

maxframe/learn/contrib/models.py CHANGED

@@ -40,7 +40,10 @@ class ModelWithEvalData(ModelData):
     def execute(self, session=None, **kw):
         # The evals_result should be fetched when BoosterData.execute() is called.
         result = super().execute(session=session, **kw)
-        if self.op.has_evals_result and self.key == self.op.outputs[0].key:
+        if (
+            getattr(self.op, "has_evals_result", None)
+            and self.key == self.op.outputs[0].key
+        ):
             self._evals_result.update(self.op.outputs[1].fetch(session=session))
         return result

maxframe/learn/contrib/xgboost/classifier.py CHANGED

@@ -18,6 +18,7 @@ import numpy as np
 from .... import tensor as mt
 from ....tensor.merge.vstack import _vstack
+from ...utils.odpsio import register_odps_model
 from ..utils import make_import_error_func
 from .core import XGBScikitLearnBase, xgboost
@@ -28,6 +29,7 @@ else:
     from .predict import predict
+    @register_odps_model
     class XGBClassifier(XGBScikitLearnBase, XGBClassifierBase):
         """
         Implementation of the scikit-learn API for XGBoost classification.

maxframe/learn/contrib/xgboost/core.py CHANGED

@@ -24,11 +24,11 @@ from ....udf import builtin_function
 try:
     import xgboost
-except ImportError:
+except ImportError:  # pragma: no cover
     xgboost = None
-from ....core import OutputType
-from ...utils.odpsio import ToODPSModelMixin
+from ....core import OutputType, enter_mode, is_kernel_mode
+from ...utils.odpsio import ODPSModelMixin, ReadODPSModel
 from ..models import ModelApplyChunk, ModelWithEval, ModelWithEvalData, to_remote_model
 from .dmatrix import DMatrix
@@ -40,6 +40,14 @@ _xgb_type_to_np_type = {
 class BoosterData(ModelWithEvalData):
+    def save_config(self) -> str:
+        try:
+            return self.fetch().save_config()
+        except:
+            if is_kernel_mode():
+                return "{}"
+            raise
     @staticmethod
     def _get_booster_score(bst, fmap=None, importance_type="weight"):
         if not fmap:
@@ -157,7 +165,7 @@ if not xgboost:
     XGBScikitLearnBase = None
 else:
-    class XGBScikitLearnBase(xgboost.XGBModel, ToODPSModelMixin):
+    class XGBScikitLearnBase(xgboost.XGBModel, ODPSModelMixin):
         """
         Base class for implementing scikit-learn interface
         """
@@ -218,7 +226,8 @@ else:
                 sample_weight_eval_set,
                 base_margin_eval_set,
             )
-            params = self.get_xgb_params()
+            with enter_mode(kernel=True):
+                params = self.get_xgb_params()
             if not params.get("objective"):
                 params["objective"] = "reg:squarederror"
             self.evals_result_ = dict()
@@ -351,16 +360,31 @@ else:
                 evals_result=self.evals_result_t_, local_info=local_info
             )
-        def _get_odps_model_info(self) -> ToODPSModelMixin.ODPSModelInfo:
+        def _get_odps_model_info(self) -> ODPSModelMixin.ODPSModelInfo:
             model_format = (
                 "BOOSTED_TREE_CLASSIFIER"
                 if hasattr(self, "predict_proba")
                 else "BOOSTED_TREE_REGRESSOR"
             )
-            return ToODPSModelMixin.ODPSModelInfo(
+            return ODPSModelMixin.ODPSModelInfo(
                 model_format=model_format, model_params=self._Booster
             )
+        @classmethod
+        def _build_odps_source_model(cls, op: ReadODPSModel) -> Any:
+            if not (
+                op.format == "BOOSTED_TREE_CLASSIFIER" and hasattr(cls, "predict_proba")
+            ) and not (
+                op.format == "BOOSTED_TREE_REGRESSOR"
+                and not hasattr(cls, "predict_proba")
+            ):
+                return None
+            op._output_types = [OutputType.object]
+            booster = op.new_tileable(None, object_class=Booster)
+            estimator = cls()
+            estimator._Booster = booster
+            return estimator
     def wrap_evaluation_matrices(
         missing: float,
         X: Any,

maxframe/learn/contrib/xgboost/predict.py CHANGED

@@ -15,6 +15,7 @@
 from typing import List
 import numpy as np
+import pandas as pd
 from .... import opcodes
 from ....core import EntityData
@@ -62,9 +63,10 @@ class XGBPredict(Operator, TileableOperatorMixin):
     def __call__(self):
         num_class = getattr(self.model.op, "num_class", None)
-        if num_class is not None:
+        output_ndim = getattr(self.model.op, "output_ndim", None)
+        if num_class is not None and not pd.isna(num_class):
             num_class = int(num_class)
-        if num_class is not None and num_class > 2:
+        if num_class is not None and (num_class > 2 or output_ndim == 2):
             shape = (self.data.shape[0], num_class)
         else:
             shape = (self.data.shape[0],)