maxframe 2.2.0__cp312-cp312-macosx_10_9_universal2.whl → 2.3.0rc1__cp312-cp312-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (113) hide show
  1. maxframe/_utils.cpython-312-darwin.so +0 -0
  2. maxframe/codegen/core.py +3 -2
  3. maxframe/codegen/spe/dataframe/merge.py +4 -0
  4. maxframe/codegen/spe/dataframe/misc.py +2 -0
  5. maxframe/codegen/spe/dataframe/reduction.py +18 -0
  6. maxframe/codegen/spe/dataframe/sort.py +9 -1
  7. maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
  8. maxframe/codegen/spe/dataframe/tseries.py +9 -0
  9. maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
  10. maxframe/codegen/spe/tensor/datasource.py +1 -0
  11. maxframe/config/config.py +3 -0
  12. maxframe/conftest.py +10 -0
  13. maxframe/core/base.py +2 -1
  14. maxframe/core/entity/tileables.py +2 -0
  15. maxframe/core/graph/entity.py +7 -1
  16. maxframe/core/mode.py +6 -1
  17. maxframe/dataframe/__init__.py +2 -2
  18. maxframe/dataframe/arithmetic/__init__.py +4 -0
  19. maxframe/dataframe/arithmetic/maximum.py +33 -0
  20. maxframe/dataframe/arithmetic/minimum.py +33 -0
  21. maxframe/dataframe/core.py +98 -106
  22. maxframe/dataframe/datasource/core.py +6 -0
  23. maxframe/dataframe/datasource/direct.py +57 -0
  24. maxframe/dataframe/datasource/read_csv.py +19 -11
  25. maxframe/dataframe/datasource/read_odps_query.py +29 -6
  26. maxframe/dataframe/datasource/read_odps_table.py +32 -10
  27. maxframe/dataframe/datasource/read_parquet.py +38 -39
  28. maxframe/dataframe/datastore/__init__.py +6 -0
  29. maxframe/dataframe/datastore/direct.py +268 -0
  30. maxframe/dataframe/datastore/to_odps.py +6 -0
  31. maxframe/dataframe/extensions/flatjson.py +2 -1
  32. maxframe/dataframe/groupby/__init__.py +5 -1
  33. maxframe/dataframe/groupby/aggregation.py +10 -6
  34. maxframe/dataframe/groupby/apply_chunk.py +1 -3
  35. maxframe/dataframe/groupby/core.py +20 -4
  36. maxframe/dataframe/indexing/__init__.py +2 -1
  37. maxframe/dataframe/indexing/insert.py +45 -17
  38. maxframe/dataframe/merge/__init__.py +3 -0
  39. maxframe/dataframe/merge/combine.py +244 -0
  40. maxframe/dataframe/misc/__init__.py +14 -3
  41. maxframe/dataframe/misc/check_unique.py +41 -10
  42. maxframe/dataframe/misc/drop.py +31 -0
  43. maxframe/dataframe/misc/infer_dtypes.py +251 -0
  44. maxframe/dataframe/misc/map.py +31 -18
  45. maxframe/dataframe/misc/repeat.py +159 -0
  46. maxframe/dataframe/misc/tests/test_misc.py +35 -1
  47. maxframe/dataframe/missing/checkna.py +3 -2
  48. maxframe/dataframe/reduction/__init__.py +10 -5
  49. maxframe/dataframe/reduction/aggregation.py +6 -6
  50. maxframe/dataframe/reduction/argmax.py +7 -4
  51. maxframe/dataframe/reduction/argmin.py +7 -4
  52. maxframe/dataframe/reduction/core.py +18 -9
  53. maxframe/dataframe/reduction/mode.py +144 -0
  54. maxframe/dataframe/reduction/nunique.py +10 -3
  55. maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
  56. maxframe/dataframe/sort/__init__.py +9 -2
  57. maxframe/dataframe/sort/argsort.py +7 -1
  58. maxframe/dataframe/sort/core.py +1 -1
  59. maxframe/dataframe/sort/rank.py +147 -0
  60. maxframe/dataframe/tseries/__init__.py +19 -0
  61. maxframe/dataframe/tseries/at_time.py +61 -0
  62. maxframe/dataframe/tseries/between_time.py +122 -0
  63. maxframe/dataframe/utils.py +30 -26
  64. maxframe/learn/contrib/llm/core.py +16 -7
  65. maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
  66. maxframe/learn/contrib/llm/deploy/config.py +221 -0
  67. maxframe/learn/contrib/llm/deploy/core.py +247 -0
  68. maxframe/learn/contrib/llm/deploy/framework.py +35 -0
  69. maxframe/learn/contrib/llm/deploy/loader.py +360 -0
  70. maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
  71. maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
  72. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  73. maxframe/learn/contrib/llm/models/dashscope.py +12 -6
  74. maxframe/learn/contrib/llm/models/managed.py +76 -11
  75. maxframe/learn/contrib/llm/models/openai.py +72 -0
  76. maxframe/learn/contrib/llm/tests/__init__.py +13 -0
  77. maxframe/learn/contrib/llm/tests/test_core.py +34 -0
  78. maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
  79. maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
  80. maxframe/learn/contrib/llm/text.py +348 -42
  81. maxframe/learn/contrib/models.py +4 -1
  82. maxframe/learn/contrib/xgboost/classifier.py +2 -0
  83. maxframe/learn/contrib/xgboost/core.py +31 -7
  84. maxframe/learn/contrib/xgboost/predict.py +4 -2
  85. maxframe/learn/contrib/xgboost/regressor.py +5 -0
  86. maxframe/learn/contrib/xgboost/train.py +2 -0
  87. maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
  88. maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
  89. maxframe/learn/utils/__init__.py +1 -0
  90. maxframe/learn/utils/extmath.py +42 -9
  91. maxframe/learn/utils/odpsio.py +80 -11
  92. maxframe/lib/filesystem/_oss_lib/common.py +2 -0
  93. maxframe/lib/mmh3.cpython-312-darwin.so +0 -0
  94. maxframe/opcodes.py +9 -1
  95. maxframe/remote/core.py +4 -0
  96. maxframe/serialization/core.cpython-312-darwin.so +0 -0
  97. maxframe/serialization/tests/test_serial.py +2 -2
  98. maxframe/tensor/arithmetic/__init__.py +1 -1
  99. maxframe/tensor/arithmetic/core.py +2 -2
  100. maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
  101. maxframe/tensor/core.py +3 -0
  102. maxframe/tensor/misc/copyto.py +1 -1
  103. maxframe/tests/test_udf.py +61 -0
  104. maxframe/tests/test_utils.py +8 -5
  105. maxframe/udf.py +103 -7
  106. maxframe/utils.py +61 -8
  107. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
  108. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +112 -89
  109. maxframe_client/session/task.py +8 -1
  110. maxframe_client/tests/test_session.py +24 -0
  111. maxframe/dataframe/arrays.py +0 -864
  112. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
  113. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
@@ -715,55 +715,6 @@ class IndexData(HasShapeTileableData, _ToPandasMixin):
715
715
  return from_index(self, dtype=dtype, extract_multi_index=extract_multi_index)
716
716
 
717
717
  def to_frame(self, index: bool = True, name=None):
718
- """
719
- Create a DataFrame with a column containing the Index.
720
-
721
- Parameters
722
- ----------
723
- index : bool, default True
724
- Set the index of the returned DataFrame as the original Index.
725
-
726
- name : object, default None
727
- The passed name should substitute for the index name (if it has
728
- one).
729
-
730
- Returns
731
- -------
732
- DataFrame
733
- DataFrame containing the original Index data.
734
-
735
- See Also
736
- --------
737
- Index.to_series : Convert an Index to a Series.
738
- Series.to_frame : Convert Series to DataFrame.
739
-
740
- Examples
741
- --------
742
- >>> import maxframe.dataframe as md
743
- >>> idx = md.Index(['Ant', 'Bear', 'Cow'], name='animal')
744
- >>> idx.to_frame().execute()
745
- animal
746
- animal
747
- Ant Ant
748
- Bear Bear
749
- Cow Cow
750
-
751
- By default, the original Index is reused. To enforce a new Index:
752
-
753
- >>> idx.to_frame(index=False).execute()
754
- animal
755
- 0 Ant
756
- 1 Bear
757
- 2 Cow
758
-
759
- To override the name of the resulting column, specify `name`:
760
-
761
- >>> idx.to_frame(index=False, name='zoo').execute()
762
- zoo
763
- 0 Ant
764
- 1 Bear
765
- 2 Cow
766
- """
767
718
  from . import dataframe_from_tensor
768
719
 
769
720
  if isinstance(self.index_value.value, IndexValue.MultiIndex):
@@ -789,34 +740,20 @@ class IndexData(HasShapeTileableData, _ToPandasMixin):
789
740
  columns = [name or self.name or 0]
790
741
  index_ = self if index else None
791
742
  return dataframe_from_tensor(
792
- self._to_maxframe_tensor(self, extract_multi_index=True),
743
+ self._to_maxframe_tensor(extract_multi_index=True),
793
744
  index=index_,
794
745
  columns=columns,
795
746
  )
796
747
 
797
748
  def to_series(self, index=None, name=None):
798
- """
799
- Create a Series with both index and values equal to the index keys.
800
-
801
- Useful with map for returning an indexer based on an index.
802
-
803
- Parameters
804
- ----------
805
- index : Index, optional
806
- Index of resulting Series. If None, defaults to original index.
807
- name : str, optional
808
- Name of resulting Series. If None, defaults to name of original
809
- index.
810
-
811
- Returns
812
- -------
813
- Series
814
- The dtype will be based on the type of the Index values.
815
- """
816
749
  from . import series_from_index
817
750
 
818
751
  return series_from_index(self, index=index, name=name)
819
752
 
753
+ @property
754
+ def hasnans(self):
755
+ return self.isna().any()
756
+
820
757
 
821
758
  class Index(HasShapeTileable, _ToPandasMixin):
822
759
  __slots__ = "_df_or_series", "_parent_key", "_axis"
@@ -887,6 +824,99 @@ class Index(HasShapeTileable, _ToPandasMixin):
887
824
  def values(self):
888
825
  return self.to_tensor()
889
826
 
827
+ def to_frame(self, index: bool = True, name=None):
828
+ """
829
+ Create a DataFrame with a column containing the Index.
830
+
831
+ Parameters
832
+ ----------
833
+ index : bool, default True
834
+ Set the index of the returned DataFrame as the original Index.
835
+
836
+ name : object, default None
837
+ The passed name should substitute for the index name (if it has
838
+ one).
839
+
840
+ Returns
841
+ -------
842
+ DataFrame
843
+ DataFrame containing the original Index data.
844
+
845
+ See Also
846
+ --------
847
+ Index.to_series : Convert an Index to a Series.
848
+ Series.to_frame : Convert Series to DataFrame.
849
+
850
+ Examples
851
+ --------
852
+ >>> import maxframe.dataframe as md
853
+ >>> idx = md.Index(['Ant', 'Bear', 'Cow'], name='animal')
854
+ >>> idx.to_frame().execute()
855
+ animal
856
+ animal
857
+ Ant Ant
858
+ Bear Bear
859
+ Cow Cow
860
+
861
+ By default, the original Index is reused. To enforce a new Index:
862
+
863
+ >>> idx.to_frame(index=False).execute()
864
+ animal
865
+ 0 Ant
866
+ 1 Bear
867
+ 2 Cow
868
+
869
+ To override the name of the resulting column, specify `name`:
870
+
871
+ >>> idx.to_frame(index=False, name='zoo').execute()
872
+ zoo
873
+ 0 Ant
874
+ 1 Bear
875
+ 2 Cow
876
+ """
877
+ return self._data.to_frame(index=index, name=name)
878
+
879
+ def to_series(self, index=None, name=None):
880
+ """
881
+ Create a Series with both index and values equal to the index keys.
882
+
883
+ Useful with map for returning an indexer based on an index.
884
+
885
+ Parameters
886
+ ----------
887
+ index : Index, optional
888
+ Index of resulting Series. If None, defaults to original index.
889
+ name : str, optional
890
+ Dame of resulting Series. If None, defaults to name of original
891
+ index.
892
+
893
+ Returns
894
+ -------
895
+ Series
896
+ The dtype will be based on the type of the Index values.
897
+ """
898
+ return self._data.to_series(index=index, name=name)
899
+
900
+ @property
901
+ def hasnans(self):
902
+ """
903
+ Return True if there are any NaNs.
904
+
905
+ Returns
906
+ -------
907
+ bool
908
+
909
+ Examples
910
+ --------
911
+ >>> import maxframe.dataframe as md
912
+ >>> idx = md.Index([1, 2, 3, None])
913
+ >>> idx.execute()
914
+ Index([1.0, 2.0, 3.0, nan], dtype='float64')
915
+ >>> idx.hasnans.execute()
916
+ True
917
+ """
918
+ return self._data.hasnans
919
+
890
920
 
891
921
  class RangeIndex(Index):
892
922
  __slots__ = ()
@@ -1085,12 +1115,6 @@ class SeriesData(_BatchedFetcher, BaseSeriesData):
1085
1115
 
1086
1116
  items = iteritems
1087
1117
 
1088
- def to_dict(self, into=dict, batch_size=10000, session=None):
1089
- fetch_kwargs = dict(batch_size=batch_size)
1090
- return self.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_dict(
1091
- into=into
1092
- )
1093
-
1094
1118
  def to_frame(self, name=None):
1095
1119
  from . import dataframe_from_tensor
1096
1120
 
@@ -1285,38 +1309,6 @@ class Series(HasShapeTileable, _ToPandasMixin):
1285
1309
 
1286
1310
  items = iteritems
1287
1311
 
1288
- def to_dict(self, into=dict, batch_size=10000, session=None):
1289
- """
1290
- Convert Series to {label -> value} dict or dict-like object.
1291
-
1292
- Parameters
1293
- ----------
1294
- into : class, default dict
1295
- The collections.abc.Mapping subclass to use as the return
1296
- object. Can be the actual class or an empty
1297
- instance of the mapping type you want. If you want a
1298
- collections.defaultdict, you must pass it initialized.
1299
-
1300
- Returns
1301
- -------
1302
- collections.abc.Mapping
1303
- Key-value representation of Series.
1304
-
1305
- Examples
1306
- --------
1307
- >>> import maxframe.dataframe as md
1308
- >>> s = md.Series([1, 2, 3, 4])
1309
- >>> s.to_dict()
1310
- {0: 1, 1: 2, 2: 3, 3: 4}
1311
- >>> from collections import OrderedDict, defaultdict
1312
- >>> s.to_dict(OrderedDict)
1313
- OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
1314
- >>> dd = defaultdict(list)
1315
- >>> s.to_dict(dd)
1316
- defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
1317
- """
1318
- return self._data.to_dict(into=into, batch_size=batch_size, session=session)
1319
-
1320
1312
  def to_frame(self, name=None):
1321
1313
  """
1322
1314
  Convert Series to DataFrame.
@@ -18,6 +18,7 @@ from typing import List, MutableMapping, Optional, Union
18
18
  from ...serialization.serializables import Int64Field, StringField
19
19
  from ...utils import estimate_pandas_size
20
20
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
21
+ from ..utils import validate_dtype_backend
21
22
 
22
23
 
23
24
  class HeadOptimizedDataSource(DataFrameOperator, DataFrameOperatorMixin):
@@ -86,3 +87,8 @@ class PandasDataSourceOperator(DataFrameOperator):
86
87
  cls, ctx: MutableMapping[str, Union[int, float]], op: "PandasDataSourceOperator"
87
88
  ):
88
89
  ctx[op.outputs[0].key] = estimate_pandas_size(op.get_data())
90
+
91
+
92
+ class DtypeBackendCompatibleMixin:
93
+ def __on_deserialize__(self):
94
+ self.dtype_backend = validate_dtype_backend(self.dtype_backend)
@@ -0,0 +1,57 @@
1
+ # Copyright 1999-2025 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import pandas as pd
16
+
17
+
18
+ def read_clipboard(sep=None, **kwargs):
19
+ """
20
+ Read text from clipboard and pass to :func:`~pandas.read_csv`.
21
+
22
+ Parses clipboard contents similar to how CSV files are parsed
23
+ using :func:`~pandas.read_csv`.
24
+
25
+ Parameters
26
+ ----------
27
+ sep : str, default '\\s+'
28
+ A string or regex delimiter. The default of ``'\\s+'`` denotes
29
+ one or more whitespace characters.
30
+
31
+ **kwargs
32
+ See :func:`~pandas.read_csv` for the full argument list.
33
+
34
+ Returns
35
+ -------
36
+ DataFrame
37
+ A parsed :class:`DataFrame` object.
38
+
39
+ See Also
40
+ --------
41
+ DataFrame.to_clipboard : Copy object to the system clipboard.
42
+ read_csv : Read a comma-separated values (csv) file into DataFrame.
43
+ read_fwf : Read a table of fixed-width formatted lines into DataFrame.
44
+
45
+ Examples
46
+ --------
47
+ >>> import maxframe.dataframe as md
48
+ >>> df = md.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
49
+ >>> df.to_clipboard() # doctest: +SKIP
50
+ >>> md.read_clipboard().execute()  # doctest: +SKIP
51
+ A B C
52
+ 0 1 2 3
53
+ 1 4 5 6
54
+ """
55
+ from ..initializer import DataFrame
56
+
57
+ return DataFrame(pd.read_clipboard(sep=sep, **kwargs))
@@ -38,8 +38,12 @@ from ...serialization.serializables import (
38
38
  StringField,
39
39
  )
40
40
  from ...utils import lazy_import, parse_readable_size
41
- from ..utils import parse_index, to_arrow_dtypes
42
- from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
41
+ from ..utils import parse_index, to_arrow_dtypes, validate_dtype_backend
42
+ from .core import (
43
+ ColumnPruneSupportedDataSourceMixin,
44
+ DtypeBackendCompatibleMixin,
45
+ IncrementalIndexDatasource,
46
+ )
43
47
 
44
48
  cudf = lazy_import("cudf")
45
49
 
@@ -88,6 +92,7 @@ def _find_chunk_start_end(f, offset, size):
88
92
  class DataFrameReadCSV(
89
93
  IncrementalIndexDatasource,
90
94
  ColumnPruneSupportedDataSourceMixin,
95
+ DtypeBackendCompatibleMixin,
91
96
  ):
92
97
  _op_type_ = opcodes.READ_CSV
93
98
 
@@ -101,7 +106,7 @@ class DataFrameReadCSV(
101
106
  offset = Int64Field("offset")
102
107
  size = Int64Field("size")
103
108
  incremental_index = BoolField("incremental_index")
104
- use_arrow_dtype = BoolField("use_arrow_dtype")
109
+ dtype_backend = StringField("dtype_backend", default=None)
105
110
  keep_usecols_order = BoolField("keep_usecols_order", default=None)
106
111
  storage_options = DictField("storage_options")
107
112
  merge_small_files = BoolField("merge_small_files")
@@ -151,7 +156,7 @@ def read_csv(
151
156
  head_bytes="100k",
152
157
  head_lines=None,
153
158
  incremental_index: bool = True,
154
- use_arrow_dtype: bool = None,
159
+ dtype_backend: str = None,
155
160
  storage_options: dict = None,
156
161
  memory_scale: int = None,
157
162
  merge_small_files: bool = True,
@@ -419,8 +424,8 @@ def read_csv(
419
424
  incremental_index: bool, default True
420
425
  If index_col not specified, ensure range index incremental,
421
426
  gain a slightly better performance if setting False.
422
- use_arrow_dtype: bool, default None
423
- If True, use arrow dtype to store columns.
427
+ dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
428
+ Back-end data type applied to the resultant DataFrame (still experimental).
424
429
  storage_options: dict, optional
425
430
  Options for storage connection.
426
431
  merge_small_files: bool, default True
@@ -509,7 +514,7 @@ def read_csv(
509
514
  compression=compression,
510
515
  gpu=gpu,
511
516
  incremental_index=incremental_index,
512
- use_arrow_dtype=use_arrow_dtype,
517
+ dtype_backend=dtype_backend,
513
518
  storage_options=storage_options,
514
519
  memory_scale=memory_scale,
515
520
  merge_small_files=merge_small_files,
@@ -518,10 +523,13 @@ def read_csv(
518
523
  )
519
524
  chunk_bytes = chunk_bytes or options.chunk_store_limit
520
525
  dtypes = mini_df.dtypes
521
- if use_arrow_dtype is None:
522
- use_arrow_dtype = options.dataframe.use_arrow_dtype
523
- if not gpu and use_arrow_dtype:
524
- dtypes = to_arrow_dtypes(dtypes, test_df=mini_df)
526
+
527
+ dtype_backend = validate_dtype_backend(
528
+ dtype_backend or options.dataframe.dtype_backend
529
+ )
530
+
531
+ if not gpu and dtype_backend == "pyarrow":
532
+ dtypes = to_arrow_dtypes(dtypes)
525
533
  ret = op(
526
534
  index_value=index_value,
527
535
  columns_value=columns_value,
@@ -29,7 +29,7 @@ from odps.types import Column, OdpsSchema, validate_data_type
29
29
  from odps.utils import split_sql_by_semicolon
30
30
 
31
31
  from ... import opcodes
32
- from ...config import options
32
+ from ...config import option_context, options
33
33
  from ...core import OutputType
34
34
  from ...core.graph import DAG
35
35
  from ...io.odpsio import odps_schema_to_pandas_dtypes
@@ -44,8 +44,12 @@ from ...serialization.serializables import (
44
44
  StringField,
45
45
  )
46
46
  from ...utils import is_empty
47
- from ..utils import parse_index
48
- from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
47
+ from ..utils import parse_index, validate_dtype_backend
48
+ from .core import (
49
+ ColumnPruneSupportedDataSourceMixin,
50
+ DtypeBackendCompatibleMixin,
51
+ IncrementalIndexDatasource,
52
+ )
49
53
 
50
54
  logger = logging.getLogger(__name__)
51
55
 
@@ -266,6 +270,7 @@ def _build_explain_sql(
266
270
  class DataFrameReadODPSQuery(
267
271
  IncrementalIndexDatasource,
268
272
  ColumnPruneSupportedDataSourceMixin,
273
+ DtypeBackendCompatibleMixin,
269
274
  ):
270
275
  _op_type_ = opcodes.READ_ODPS_QUERY
271
276
 
@@ -273,12 +278,16 @@ class DataFrameReadODPSQuery(
273
278
  dtypes = SeriesField("dtypes", default=None)
274
279
  columns = AnyField("columns", default=None)
275
280
  nrows = Int64Field("nrows", default=None)
276
- use_arrow_dtype = BoolField("use_arrow_dtype", default=None)
281
+ dtype_backend = StringField("dtype_backend", default=None)
277
282
  string_as_binary = BoolField("string_as_binary", default=None)
278
283
  index_columns = ListField("index_columns", FieldTypes.string, default=None)
279
284
  index_dtypes = SeriesField("index_dtypes", default=None)
280
285
  column_renames = DictField("column_renames", default=None)
281
286
 
287
+ def __init__(self, dtype_backend=None, **kw):
288
+ dtype_backend = validate_dtype_backend(dtype_backend)
289
+ super().__init__(dtype_backend=dtype_backend, **kw)
290
+
282
291
  def get_columns(self):
283
292
  return self.columns or list(self.dtypes.index)
284
293
 
@@ -404,6 +413,7 @@ def read_odps_query(
404
413
  sql_hints: Dict[str, str] = None,
405
414
  anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
406
415
  skip_schema: bool = False,
416
+ dtype_backend: str = None,
407
417
  **kw,
408
418
  ):
409
419
  """
@@ -428,6 +438,8 @@ def read_odps_query(
428
438
  Skip resolving output schema before execution. Once this is configured,
429
439
  the output DataFrame cannot be inputs of other DataFrame operators
430
440
  before execution.
441
+ dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
442
+ Back-end data type applied to the resultant DataFrame (still experimental).
431
443
 
432
444
  Returns
433
445
  -------
@@ -459,6 +471,14 @@ def read_odps_query(
459
471
  if odps_entry is None:
460
472
  raise ValueError("Missing odps_entry parameter")
461
473
 
474
+ if "use_arrow_dtype" in kw:
475
+ dtype_backend = dtype_backend or validate_dtype_backend(
476
+ kw.pop("use_arrow_dtype")
477
+ )
478
+ dtype_backend = validate_dtype_backend(
479
+ dtype_backend or options.dataframe.dtype_backend
480
+ )
481
+
462
482
  col_renames = {}
463
483
  if not skip_schema:
464
484
  odps_schema = _resolve_query_schema(
@@ -479,7 +499,9 @@ def read_odps_query(
479
499
  else:
480
500
  new_columns.append(col)
481
501
 
482
- dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
502
+ with option_context():
503
+ options.dataframe.dtype_backend = dtype_backend
504
+ dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
483
505
  else:
484
506
  dtypes = None
485
507
 
@@ -500,10 +522,11 @@ def read_odps_query(
500
522
 
501
523
  chunk_bytes = kw.pop("chunk_bytes", None)
502
524
  chunk_size = kw.pop("chunk_size", None)
525
+
503
526
  op = DataFrameReadODPSQuery(
504
527
  query=query,
505
528
  dtypes=dtypes,
506
- use_arrow_dtype=kw.pop("use_arrow_dtype", True),
529
+ dtype_backend=dtype_backend,
507
530
  string_as_binary=string_as_binary,
508
531
  index_columns=index_col,
509
532
  index_dtypes=index_dtypes,
@@ -22,7 +22,7 @@ from odps.models import Table
22
22
  from odps.utils import to_timestamp
23
23
 
24
24
  from ... import opcodes
25
- from ...config import options
25
+ from ...config import option_context, options
26
26
  from ...core import OutputType
27
27
  from ...io.odpsio import odps_schema_to_pandas_dtypes
28
28
  from ...serialization.serializables import (
@@ -36,8 +36,12 @@ from ...serialization.serializables import (
36
36
  )
37
37
  from ...utils import estimate_table_size, is_empty
38
38
  from ..core import DataFrame # noqa: F401
39
- from ..utils import parse_index
40
- from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
39
+ from ..utils import parse_index, validate_dtype_backend
40
+ from .core import (
41
+ ColumnPruneSupportedDataSourceMixin,
42
+ DtypeBackendCompatibleMixin,
43
+ IncrementalIndexDatasource,
44
+ )
41
45
 
42
46
  logger = logging.getLogger(__name__)
43
47
 
@@ -45,6 +49,7 @@ logger = logging.getLogger(__name__)
45
49
  class DataFrameReadODPSTable(
46
50
  IncrementalIndexDatasource,
47
51
  ColumnPruneSupportedDataSourceMixin,
52
+ DtypeBackendCompatibleMixin,
48
53
  ):
49
54
  __slots__ = ("_odps_entry",)
50
55
  _op_type_ = opcodes.READ_ODPS_TABLE
@@ -54,18 +59,22 @@ class DataFrameReadODPSTable(
54
59
  dtypes = SeriesField("dtypes", default=None)
55
60
  columns = AnyField("columns", default=None)
56
61
  nrows = Int64Field("nrows", default=None)
57
- use_arrow_dtype = BoolField("use_arrow_dtype", default=None)
62
+ dtype_backend = StringField("dtype_backend", default=None)
58
63
  string_as_binary = BoolField("string_as_binary", default=None)
59
64
  append_partitions = BoolField("append_partitions", default=None)
60
65
  last_modified_time = Int64Field("last_modified_time", default=None)
61
66
  index_columns = ListField("index_columns", FieldTypes.string, default=None)
62
67
  index_dtypes = SeriesField("index_dtypes", default=None)
63
68
 
64
- def __init__(self, memory_scale=None, **kw):
69
+ def __init__(self, memory_scale=None, dtype_backend=None, **kw):
65
70
  output_type = kw.pop("output_type", OutputType.dataframe)
66
71
  self._odps_entry = kw.pop("odps_entry", None)
72
+ dtype_backend = validate_dtype_backend(dtype_backend)
67
73
  super(DataFrameReadODPSTable, self).__init__(
68
- memory_scale=memory_scale, _output_types=[output_type], **kw
74
+ memory_scale=memory_scale,
75
+ dtype_backend=dtype_backend,
76
+ _output_types=[output_type],
77
+ **kw,
69
78
  )
70
79
 
71
80
  @property
@@ -153,6 +162,7 @@ def read_odps_table(
153
162
  odps_entry: ODPS = None,
154
163
  string_as_binary: bool = None,
155
164
  append_partitions: bool = False,
165
+ dtype_backend: str = None,
156
166
  **kw,
157
167
  ):
158
168
  """
@@ -176,6 +186,8 @@ def read_odps_table(
176
186
  append_partitions: bool
177
187
  If True, will add all partition columns as selected columns when
178
188
  `columns` is not specified,
189
+ dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
190
+ Back-end data type applied to the resultant DataFrame (still experimental).
179
191
 
180
192
  Returns
181
193
  -------
@@ -202,9 +214,20 @@ def read_odps_table(
202
214
  else table.table_schema.simple_columns
203
215
  )
204
216
  table_columns = [c.name.lower() for c in cols]
205
- table_dtypes = odps_schema_to_pandas_dtypes(
206
- table.table_schema, with_partitions=True
217
+
218
+ if "use_arrow_dtype" in kw:
219
+ dtype_backend = dtype_backend or validate_dtype_backend(
220
+ kw.pop("use_arrow_dtype")
221
+ )
222
+ dtype_backend = validate_dtype_backend(
223
+ dtype_backend or options.dataframe.dtype_backend
207
224
  )
225
+
226
+ with option_context():
227
+ options.dataframe.dtype_backend = dtype_backend
228
+ table_dtypes = odps_schema_to_pandas_dtypes(
229
+ table.table_schema, with_partitions=True
230
+ )
208
231
  df_types = [table_dtypes[c] for c in table_columns]
209
232
 
210
233
  if isinstance(index_col, str):
@@ -246,7 +269,6 @@ def read_odps_table(
246
269
  dtypes = pd.Series(df_types, index=table_columns)
247
270
  chunk_bytes = kw.pop("chunk_bytes", None)
248
271
  chunk_size = kw.pop("chunk_size", None)
249
- use_arrow_dtype = kw.pop("use_arrow_dtype", True)
250
272
 
251
273
  partitions = partitions or kw.get("partition")
252
274
  if isinstance(partitions, str):
@@ -261,7 +283,7 @@ def read_odps_table(
261
283
  partitions=partitions,
262
284
  dtypes=dtypes,
263
285
  columns=columns,
264
- use_arrow_dtype=use_arrow_dtype,
286
+ dtype_backend=dtype_backend,
265
287
  string_as_binary=string_as_binary,
266
288
  append_partitions=append_partitions,
267
289
  last_modified_time=to_timestamp(table.last_data_modified_time),