maxframe 2.2.0__cp312-cp312-macosx_10_9_universal2.whl → 2.3.0rc1__cp312-cp312-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (113) hide show
  1. maxframe/_utils.cpython-312-darwin.so +0 -0
  2. maxframe/codegen/core.py +3 -2
  3. maxframe/codegen/spe/dataframe/merge.py +4 -0
  4. maxframe/codegen/spe/dataframe/misc.py +2 -0
  5. maxframe/codegen/spe/dataframe/reduction.py +18 -0
  6. maxframe/codegen/spe/dataframe/sort.py +9 -1
  7. maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
  8. maxframe/codegen/spe/dataframe/tseries.py +9 -0
  9. maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
  10. maxframe/codegen/spe/tensor/datasource.py +1 -0
  11. maxframe/config/config.py +3 -0
  12. maxframe/conftest.py +10 -0
  13. maxframe/core/base.py +2 -1
  14. maxframe/core/entity/tileables.py +2 -0
  15. maxframe/core/graph/entity.py +7 -1
  16. maxframe/core/mode.py +6 -1
  17. maxframe/dataframe/__init__.py +2 -2
  18. maxframe/dataframe/arithmetic/__init__.py +4 -0
  19. maxframe/dataframe/arithmetic/maximum.py +33 -0
  20. maxframe/dataframe/arithmetic/minimum.py +33 -0
  21. maxframe/dataframe/core.py +98 -106
  22. maxframe/dataframe/datasource/core.py +6 -0
  23. maxframe/dataframe/datasource/direct.py +57 -0
  24. maxframe/dataframe/datasource/read_csv.py +19 -11
  25. maxframe/dataframe/datasource/read_odps_query.py +29 -6
  26. maxframe/dataframe/datasource/read_odps_table.py +32 -10
  27. maxframe/dataframe/datasource/read_parquet.py +38 -39
  28. maxframe/dataframe/datastore/__init__.py +6 -0
  29. maxframe/dataframe/datastore/direct.py +268 -0
  30. maxframe/dataframe/datastore/to_odps.py +6 -0
  31. maxframe/dataframe/extensions/flatjson.py +2 -1
  32. maxframe/dataframe/groupby/__init__.py +5 -1
  33. maxframe/dataframe/groupby/aggregation.py +10 -6
  34. maxframe/dataframe/groupby/apply_chunk.py +1 -3
  35. maxframe/dataframe/groupby/core.py +20 -4
  36. maxframe/dataframe/indexing/__init__.py +2 -1
  37. maxframe/dataframe/indexing/insert.py +45 -17
  38. maxframe/dataframe/merge/__init__.py +3 -0
  39. maxframe/dataframe/merge/combine.py +244 -0
  40. maxframe/dataframe/misc/__init__.py +14 -3
  41. maxframe/dataframe/misc/check_unique.py +41 -10
  42. maxframe/dataframe/misc/drop.py +31 -0
  43. maxframe/dataframe/misc/infer_dtypes.py +251 -0
  44. maxframe/dataframe/misc/map.py +31 -18
  45. maxframe/dataframe/misc/repeat.py +159 -0
  46. maxframe/dataframe/misc/tests/test_misc.py +35 -1
  47. maxframe/dataframe/missing/checkna.py +3 -2
  48. maxframe/dataframe/reduction/__init__.py +10 -5
  49. maxframe/dataframe/reduction/aggregation.py +6 -6
  50. maxframe/dataframe/reduction/argmax.py +7 -4
  51. maxframe/dataframe/reduction/argmin.py +7 -4
  52. maxframe/dataframe/reduction/core.py +18 -9
  53. maxframe/dataframe/reduction/mode.py +144 -0
  54. maxframe/dataframe/reduction/nunique.py +10 -3
  55. maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
  56. maxframe/dataframe/sort/__init__.py +9 -2
  57. maxframe/dataframe/sort/argsort.py +7 -1
  58. maxframe/dataframe/sort/core.py +1 -1
  59. maxframe/dataframe/sort/rank.py +147 -0
  60. maxframe/dataframe/tseries/__init__.py +19 -0
  61. maxframe/dataframe/tseries/at_time.py +61 -0
  62. maxframe/dataframe/tseries/between_time.py +122 -0
  63. maxframe/dataframe/utils.py +30 -26
  64. maxframe/learn/contrib/llm/core.py +16 -7
  65. maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
  66. maxframe/learn/contrib/llm/deploy/config.py +221 -0
  67. maxframe/learn/contrib/llm/deploy/core.py +247 -0
  68. maxframe/learn/contrib/llm/deploy/framework.py +35 -0
  69. maxframe/learn/contrib/llm/deploy/loader.py +360 -0
  70. maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
  71. maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
  72. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  73. maxframe/learn/contrib/llm/models/dashscope.py +12 -6
  74. maxframe/learn/contrib/llm/models/managed.py +76 -11
  75. maxframe/learn/contrib/llm/models/openai.py +72 -0
  76. maxframe/learn/contrib/llm/tests/__init__.py +13 -0
  77. maxframe/learn/contrib/llm/tests/test_core.py +34 -0
  78. maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
  79. maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
  80. maxframe/learn/contrib/llm/text.py +348 -42
  81. maxframe/learn/contrib/models.py +4 -1
  82. maxframe/learn/contrib/xgboost/classifier.py +2 -0
  83. maxframe/learn/contrib/xgboost/core.py +31 -7
  84. maxframe/learn/contrib/xgboost/predict.py +4 -2
  85. maxframe/learn/contrib/xgboost/regressor.py +5 -0
  86. maxframe/learn/contrib/xgboost/train.py +2 -0
  87. maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
  88. maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
  89. maxframe/learn/utils/__init__.py +1 -0
  90. maxframe/learn/utils/extmath.py +42 -9
  91. maxframe/learn/utils/odpsio.py +80 -11
  92. maxframe/lib/filesystem/_oss_lib/common.py +2 -0
  93. maxframe/lib/mmh3.cpython-312-darwin.so +0 -0
  94. maxframe/opcodes.py +9 -1
  95. maxframe/remote/core.py +4 -0
  96. maxframe/serialization/core.cpython-312-darwin.so +0 -0
  97. maxframe/serialization/tests/test_serial.py +2 -2
  98. maxframe/tensor/arithmetic/__init__.py +1 -1
  99. maxframe/tensor/arithmetic/core.py +2 -2
  100. maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
  101. maxframe/tensor/core.py +3 -0
  102. maxframe/tensor/misc/copyto.py +1 -1
  103. maxframe/tests/test_udf.py +61 -0
  104. maxframe/tests/test_utils.py +8 -5
  105. maxframe/udf.py +103 -7
  106. maxframe/utils.py +61 -8
  107. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
  108. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +112 -89
  109. maxframe_client/session/task.py +8 -1
  110. maxframe_client/tests/test_session.py +24 -0
  111. maxframe/dataframe/arrays.py +0 -864
  112. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
  113. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
@@ -715,55 +715,6 @@ class IndexData(HasShapeTileableData, _ToPandasMixin):
715
715
  return from_index(self, dtype=dtype, extract_multi_index=extract_multi_index)
716
716
 
717
717
  def to_frame(self, index: bool = True, name=None):
718
- """
719
- Create a DataFrame with a column containing the Index.
720
-
721
- Parameters
722
- ----------
723
- index : bool, default True
724
- Set the index of the returned DataFrame as the original Index.
725
-
726
- name : object, default None
727
- The passed name should substitute for the index name (if it has
728
- one).
729
-
730
- Returns
731
- -------
732
- DataFrame
733
- DataFrame containing the original Index data.
734
-
735
- See Also
736
- --------
737
- Index.to_series : Convert an Index to a Series.
738
- Series.to_frame : Convert Series to DataFrame.
739
-
740
- Examples
741
- --------
742
- >>> import maxframe.dataframe as md
743
- >>> idx = md.Index(['Ant', 'Bear', 'Cow'], name='animal')
744
- >>> idx.to_frame().execute()
745
- animal
746
- animal
747
- Ant Ant
748
- Bear Bear
749
- Cow Cow
750
-
751
- By default, the original Index is reused. To enforce a new Index:
752
-
753
- >>> idx.to_frame(index=False).execute()
754
- animal
755
- 0 Ant
756
- 1 Bear
757
- 2 Cow
758
-
759
- To override the name of the resulting column, specify `name`:
760
-
761
- >>> idx.to_frame(index=False, name='zoo').execute()
762
- zoo
763
- 0 Ant
764
- 1 Bear
765
- 2 Cow
766
- """
767
718
  from . import dataframe_from_tensor
768
719
 
769
720
  if isinstance(self.index_value.value, IndexValue.MultiIndex):
@@ -789,34 +740,20 @@ class IndexData(HasShapeTileableData, _ToPandasMixin):
789
740
  columns = [name or self.name or 0]
790
741
  index_ = self if index else None
791
742
  return dataframe_from_tensor(
792
- self._to_maxframe_tensor(self, extract_multi_index=True),
743
+ self._to_maxframe_tensor(extract_multi_index=True),
793
744
  index=index_,
794
745
  columns=columns,
795
746
  )
796
747
 
797
748
  def to_series(self, index=None, name=None):
798
- """
799
- Create a Series with both index and values equal to the index keys.
800
-
801
- Useful with map for returning an indexer based on an index.
802
-
803
- Parameters
804
- ----------
805
- index : Index, optional
806
- Index of resulting Series. If None, defaults to original index.
807
- name : str, optional
808
- Name of resulting Series. If None, defaults to name of original
809
- index.
810
-
811
- Returns
812
- -------
813
- Series
814
- The dtype will be based on the type of the Index values.
815
- """
816
749
  from . import series_from_index
817
750
 
818
751
  return series_from_index(self, index=index, name=name)
819
752
 
753
+ @property
754
+ def hasnans(self):
755
+ return self.isna().any()
756
+
820
757
 
821
758
  class Index(HasShapeTileable, _ToPandasMixin):
822
759
  __slots__ = "_df_or_series", "_parent_key", "_axis"
@@ -887,6 +824,99 @@ class Index(HasShapeTileable, _ToPandasMixin):
887
824
  def values(self):
888
825
  return self.to_tensor()
889
826
 
827
+ def to_frame(self, index: bool = True, name=None):
828
+ """
829
+ Create a DataFrame with a column containing the Index.
830
+
831
+ Parameters
832
+ ----------
833
+ index : bool, default True
834
+ Set the index of the returned DataFrame as the original Index.
835
+
836
+ name : object, default None
837
+ The passed name should substitute for the index name (if it has
838
+ one).
839
+
840
+ Returns
841
+ -------
842
+ DataFrame
843
+ DataFrame containing the original Index data.
844
+
845
+ See Also
846
+ --------
847
+ Index.to_series : Convert an Index to a Series.
848
+ Series.to_frame : Convert Series to DataFrame.
849
+
850
+ Examples
851
+ --------
852
+ >>> import maxframe.dataframe as md
853
+ >>> idx = md.Index(['Ant', 'Bear', 'Cow'], name='animal')
854
+ >>> idx.to_frame().execute()
855
+ animal
856
+ animal
857
+ Ant Ant
858
+ Bear Bear
859
+ Cow Cow
860
+
861
+ By default, the original Index is reused. To enforce a new Index:
862
+
863
+ >>> idx.to_frame(index=False).execute()
864
+ animal
865
+ 0 Ant
866
+ 1 Bear
867
+ 2 Cow
868
+
869
+ To override the name of the resulting column, specify `name`:
870
+
871
+ >>> idx.to_frame(index=False, name='zoo').execute()
872
+ zoo
873
+ 0 Ant
874
+ 1 Bear
875
+ 2 Cow
876
+ """
877
+ return self._data.to_frame(index=index, name=name)
878
+
879
+ def to_series(self, index=None, name=None):
880
+ """
881
+ Create a Series with both index and values equal to the index keys.
882
+
883
+ Useful with map for returning an indexer based on an index.
884
+
885
+ Parameters
886
+ ----------
887
+ index : Index, optional
888
+ Index of resulting Series. If None, defaults to original index.
889
+ name : str, optional
890
+ Dame of resulting Series. If None, defaults to name of original
891
+ index.
892
+
893
+ Returns
894
+ -------
895
+ Series
896
+ The dtype will be based on the type of the Index values.
897
+ """
898
+ return self._data.to_series(index=index, name=name)
899
+
900
+ @property
901
+ def hasnans(self):
902
+ """
903
+ Return True if there are any NaNs.
904
+
905
+ Returns
906
+ -------
907
+ bool
908
+
909
+ Examples
910
+ --------
911
+ >>> import maxframe.dataframe as md
912
+ >>> idx = md.Index([1, 2, 3, None])
913
+ >>> idx.execute()
914
+ Index([1.0, 2.0, 3.0, nan], dtype='float64')
915
+ >>> idx.hasnans.execute()
916
+ True
917
+ """
918
+ return self._data.hasnans
919
+
890
920
 
891
921
  class RangeIndex(Index):
892
922
  __slots__ = ()
@@ -1085,12 +1115,6 @@ class SeriesData(_BatchedFetcher, BaseSeriesData):
1085
1115
 
1086
1116
  items = iteritems
1087
1117
 
1088
- def to_dict(self, into=dict, batch_size=10000, session=None):
1089
- fetch_kwargs = dict(batch_size=batch_size)
1090
- return self.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_dict(
1091
- into=into
1092
- )
1093
-
1094
1118
  def to_frame(self, name=None):
1095
1119
  from . import dataframe_from_tensor
1096
1120
 
@@ -1285,38 +1309,6 @@ class Series(HasShapeTileable, _ToPandasMixin):
1285
1309
 
1286
1310
  items = iteritems
1287
1311
 
1288
- def to_dict(self, into=dict, batch_size=10000, session=None):
1289
- """
1290
- Convert Series to {label -> value} dict or dict-like object.
1291
-
1292
- Parameters
1293
- ----------
1294
- into : class, default dict
1295
- The collections.abc.Mapping subclass to use as the return
1296
- object. Can be the actual class or an empty
1297
- instance of the mapping type you want. If you want a
1298
- collections.defaultdict, you must pass it initialized.
1299
-
1300
- Returns
1301
- -------
1302
- collections.abc.Mapping
1303
- Key-value representation of Series.
1304
-
1305
- Examples
1306
- --------
1307
- >>> import maxframe.dataframe as md
1308
- >>> s = md.Series([1, 2, 3, 4])
1309
- >>> s.to_dict()
1310
- {0: 1, 1: 2, 2: 3, 3: 4}
1311
- >>> from collections import OrderedDict, defaultdict
1312
- >>> s.to_dict(OrderedDict)
1313
- OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
1314
- >>> dd = defaultdict(list)
1315
- >>> s.to_dict(dd)
1316
- defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
1317
- """
1318
- return self._data.to_dict(into=into, batch_size=batch_size, session=session)
1319
-
1320
1312
  def to_frame(self, name=None):
1321
1313
  """
1322
1314
  Convert Series to DataFrame.
@@ -18,6 +18,7 @@ from typing import List, MutableMapping, Optional, Union
18
18
  from ...serialization.serializables import Int64Field, StringField
19
19
  from ...utils import estimate_pandas_size
20
20
  from ..operators import DataFrameOperator, DataFrameOperatorMixin
21
+ from ..utils import validate_dtype_backend
21
22
 
22
23
 
23
24
  class HeadOptimizedDataSource(DataFrameOperator, DataFrameOperatorMixin):
@@ -86,3 +87,8 @@ class PandasDataSourceOperator(DataFrameOperator):
86
87
  cls, ctx: MutableMapping[str, Union[int, float]], op: "PandasDataSourceOperator"
87
88
  ):
88
89
  ctx[op.outputs[0].key] = estimate_pandas_size(op.get_data())
90
+
91
+
92
+ class DtypeBackendCompatibleMixin:
93
+ def __on_deserialize__(self):
94
+ self.dtype_backend = validate_dtype_backend(self.dtype_backend)
@@ -0,0 +1,57 @@
1
+ # Copyright 1999-2025 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import pandas as pd
16
+
17
+
18
+ def read_clipboard(sep=None, **kwargs):
19
+ """
20
+ Read text from clipboard and pass to :func:`~pandas.read_csv`.
21
+
22
+ Parses clipboard contents similar to how CSV files are parsed
23
+ using :func:`~pandas.read_csv`.
24
+
25
+ Parameters
26
+ ----------
27
+ sep : str, default '\\s+'
28
+ A string or regex delimiter. The default of ``'\\s+'`` denotes
29
+ one or more whitespace characters.
30
+
31
+ **kwargs
32
+ See :func:`~pandas.read_csv` for the full argument list.
33
+
34
+ Returns
35
+ -------
36
+ DataFrame
37
+ A parsed :class:`DataFrame` object.
38
+
39
+ See Also
40
+ --------
41
+ DataFrame.to_clipboard : Copy object to the system clipboard.
42
+ read_csv : Read a comma-separated values (csv) file into DataFrame.
43
+ read_fwf : Read a table of fixed-width formatted lines into DataFrame.
44
+
45
+ Examples
46
+ --------
47
+ >>> import maxframe.dataframe as md
48
+ >>> df = md.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
49
+ >>> df.to_clipboard() # doctest: +SKIP
50
+ >>> md.read_clipboard().execute()  # doctest: +SKIP
51
+ A B C
52
+ 0 1 2 3
53
+ 1 4 5 6
54
+ """
55
+ from ..initializer import DataFrame
56
+
57
+ return DataFrame(pd.read_clipboard(sep=sep, **kwargs))
@@ -38,8 +38,12 @@ from ...serialization.serializables import (
38
38
  StringField,
39
39
  )
40
40
  from ...utils import lazy_import, parse_readable_size
41
- from ..utils import parse_index, to_arrow_dtypes
42
- from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
41
+ from ..utils import parse_index, to_arrow_dtypes, validate_dtype_backend
42
+ from .core import (
43
+ ColumnPruneSupportedDataSourceMixin,
44
+ DtypeBackendCompatibleMixin,
45
+ IncrementalIndexDatasource,
46
+ )
43
47
 
44
48
  cudf = lazy_import("cudf")
45
49
 
@@ -88,6 +92,7 @@ def _find_chunk_start_end(f, offset, size):
88
92
  class DataFrameReadCSV(
89
93
  IncrementalIndexDatasource,
90
94
  ColumnPruneSupportedDataSourceMixin,
95
+ DtypeBackendCompatibleMixin,
91
96
  ):
92
97
  _op_type_ = opcodes.READ_CSV
93
98
 
@@ -101,7 +106,7 @@ class DataFrameReadCSV(
101
106
  offset = Int64Field("offset")
102
107
  size = Int64Field("size")
103
108
  incremental_index = BoolField("incremental_index")
104
- use_arrow_dtype = BoolField("use_arrow_dtype")
109
+ dtype_backend = StringField("dtype_backend", default=None)
105
110
  keep_usecols_order = BoolField("keep_usecols_order", default=None)
106
111
  storage_options = DictField("storage_options")
107
112
  merge_small_files = BoolField("merge_small_files")
@@ -151,7 +156,7 @@ def read_csv(
151
156
  head_bytes="100k",
152
157
  head_lines=None,
153
158
  incremental_index: bool = True,
154
- use_arrow_dtype: bool = None,
159
+ dtype_backend: str = None,
155
160
  storage_options: dict = None,
156
161
  memory_scale: int = None,
157
162
  merge_small_files: bool = True,
@@ -419,8 +424,8 @@ def read_csv(
419
424
  incremental_index: bool, default True
420
425
  If index_col not specified, ensure range index incremental,
421
426
  gain a slightly better performance if setting False.
422
- use_arrow_dtype: bool, default None
423
- If True, use arrow dtype to store columns.
427
+ dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
428
+ Back-end data type applied to the resultant DataFrame (still experimental).
424
429
  storage_options: dict, optional
425
430
  Options for storage connection.
426
431
  merge_small_files: bool, default True
@@ -509,7 +514,7 @@ def read_csv(
509
514
  compression=compression,
510
515
  gpu=gpu,
511
516
  incremental_index=incremental_index,
512
- use_arrow_dtype=use_arrow_dtype,
517
+ dtype_backend=dtype_backend,
513
518
  storage_options=storage_options,
514
519
  memory_scale=memory_scale,
515
520
  merge_small_files=merge_small_files,
@@ -518,10 +523,13 @@ def read_csv(
518
523
  )
519
524
  chunk_bytes = chunk_bytes or options.chunk_store_limit
520
525
  dtypes = mini_df.dtypes
521
- if use_arrow_dtype is None:
522
- use_arrow_dtype = options.dataframe.use_arrow_dtype
523
- if not gpu and use_arrow_dtype:
524
- dtypes = to_arrow_dtypes(dtypes, test_df=mini_df)
526
+
527
+ dtype_backend = validate_dtype_backend(
528
+ dtype_backend or options.dataframe.dtype_backend
529
+ )
530
+
531
+ if not gpu and dtype_backend == "pyarrow":
532
+ dtypes = to_arrow_dtypes(dtypes)
525
533
  ret = op(
526
534
  index_value=index_value,
527
535
  columns_value=columns_value,
@@ -29,7 +29,7 @@ from odps.types import Column, OdpsSchema, validate_data_type
29
29
  from odps.utils import split_sql_by_semicolon
30
30
 
31
31
  from ... import opcodes
32
- from ...config import options
32
+ from ...config import option_context, options
33
33
  from ...core import OutputType
34
34
  from ...core.graph import DAG
35
35
  from ...io.odpsio import odps_schema_to_pandas_dtypes
@@ -44,8 +44,12 @@ from ...serialization.serializables import (
44
44
  StringField,
45
45
  )
46
46
  from ...utils import is_empty
47
- from ..utils import parse_index
48
- from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
47
+ from ..utils import parse_index, validate_dtype_backend
48
+ from .core import (
49
+ ColumnPruneSupportedDataSourceMixin,
50
+ DtypeBackendCompatibleMixin,
51
+ IncrementalIndexDatasource,
52
+ )
49
53
 
50
54
  logger = logging.getLogger(__name__)
51
55
 
@@ -266,6 +270,7 @@ def _build_explain_sql(
266
270
  class DataFrameReadODPSQuery(
267
271
  IncrementalIndexDatasource,
268
272
  ColumnPruneSupportedDataSourceMixin,
273
+ DtypeBackendCompatibleMixin,
269
274
  ):
270
275
  _op_type_ = opcodes.READ_ODPS_QUERY
271
276
 
@@ -273,12 +278,16 @@ class DataFrameReadODPSQuery(
273
278
  dtypes = SeriesField("dtypes", default=None)
274
279
  columns = AnyField("columns", default=None)
275
280
  nrows = Int64Field("nrows", default=None)
276
- use_arrow_dtype = BoolField("use_arrow_dtype", default=None)
281
+ dtype_backend = StringField("dtype_backend", default=None)
277
282
  string_as_binary = BoolField("string_as_binary", default=None)
278
283
  index_columns = ListField("index_columns", FieldTypes.string, default=None)
279
284
  index_dtypes = SeriesField("index_dtypes", default=None)
280
285
  column_renames = DictField("column_renames", default=None)
281
286
 
287
+ def __init__(self, dtype_backend=None, **kw):
288
+ dtype_backend = validate_dtype_backend(dtype_backend)
289
+ super().__init__(dtype_backend=dtype_backend, **kw)
290
+
282
291
  def get_columns(self):
283
292
  return self.columns or list(self.dtypes.index)
284
293
 
@@ -404,6 +413,7 @@ def read_odps_query(
404
413
  sql_hints: Dict[str, str] = None,
405
414
  anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
406
415
  skip_schema: bool = False,
416
+ dtype_backend: str = None,
407
417
  **kw,
408
418
  ):
409
419
  """
@@ -428,6 +438,8 @@ def read_odps_query(
428
438
  Skip resolving output schema before execution. Once this is configured,
429
439
  the output DataFrame cannot be inputs of other DataFrame operators
430
440
  before execution.
441
+ dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
442
+ Back-end data type applied to the resultant DataFrame (still experimental).
431
443
 
432
444
  Returns
433
445
  -------
@@ -459,6 +471,14 @@ def read_odps_query(
459
471
  if odps_entry is None:
460
472
  raise ValueError("Missing odps_entry parameter")
461
473
 
474
+ if "use_arrow_dtype" in kw:
475
+ dtype_backend = dtype_backend or validate_dtype_backend(
476
+ kw.pop("use_arrow_dtype")
477
+ )
478
+ dtype_backend = validate_dtype_backend(
479
+ dtype_backend or options.dataframe.dtype_backend
480
+ )
481
+
462
482
  col_renames = {}
463
483
  if not skip_schema:
464
484
  odps_schema = _resolve_query_schema(
@@ -479,7 +499,9 @@ def read_odps_query(
479
499
  else:
480
500
  new_columns.append(col)
481
501
 
482
- dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
502
+ with option_context():
503
+ options.dataframe.dtype_backend = dtype_backend
504
+ dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
483
505
  else:
484
506
  dtypes = None
485
507
 
@@ -500,10 +522,11 @@ def read_odps_query(
500
522
 
501
523
  chunk_bytes = kw.pop("chunk_bytes", None)
502
524
  chunk_size = kw.pop("chunk_size", None)
525
+
503
526
  op = DataFrameReadODPSQuery(
504
527
  query=query,
505
528
  dtypes=dtypes,
506
- use_arrow_dtype=kw.pop("use_arrow_dtype", True),
529
+ dtype_backend=dtype_backend,
507
530
  string_as_binary=string_as_binary,
508
531
  index_columns=index_col,
509
532
  index_dtypes=index_dtypes,
@@ -22,7 +22,7 @@ from odps.models import Table
22
22
  from odps.utils import to_timestamp
23
23
 
24
24
  from ... import opcodes
25
- from ...config import options
25
+ from ...config import option_context, options
26
26
  from ...core import OutputType
27
27
  from ...io.odpsio import odps_schema_to_pandas_dtypes
28
28
  from ...serialization.serializables import (
@@ -36,8 +36,12 @@ from ...serialization.serializables import (
36
36
  )
37
37
  from ...utils import estimate_table_size, is_empty
38
38
  from ..core import DataFrame # noqa: F401
39
- from ..utils import parse_index
40
- from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
39
+ from ..utils import parse_index, validate_dtype_backend
40
+ from .core import (
41
+ ColumnPruneSupportedDataSourceMixin,
42
+ DtypeBackendCompatibleMixin,
43
+ IncrementalIndexDatasource,
44
+ )
41
45
 
42
46
  logger = logging.getLogger(__name__)
43
47
 
@@ -45,6 +49,7 @@ logger = logging.getLogger(__name__)
45
49
  class DataFrameReadODPSTable(
46
50
  IncrementalIndexDatasource,
47
51
  ColumnPruneSupportedDataSourceMixin,
52
+ DtypeBackendCompatibleMixin,
48
53
  ):
49
54
  __slots__ = ("_odps_entry",)
50
55
  _op_type_ = opcodes.READ_ODPS_TABLE
@@ -54,18 +59,22 @@ class DataFrameReadODPSTable(
54
59
  dtypes = SeriesField("dtypes", default=None)
55
60
  columns = AnyField("columns", default=None)
56
61
  nrows = Int64Field("nrows", default=None)
57
- use_arrow_dtype = BoolField("use_arrow_dtype", default=None)
62
+ dtype_backend = StringField("dtype_backend", default=None)
58
63
  string_as_binary = BoolField("string_as_binary", default=None)
59
64
  append_partitions = BoolField("append_partitions", default=None)
60
65
  last_modified_time = Int64Field("last_modified_time", default=None)
61
66
  index_columns = ListField("index_columns", FieldTypes.string, default=None)
62
67
  index_dtypes = SeriesField("index_dtypes", default=None)
63
68
 
64
- def __init__(self, memory_scale=None, **kw):
69
+ def __init__(self, memory_scale=None, dtype_backend=None, **kw):
65
70
  output_type = kw.pop("output_type", OutputType.dataframe)
66
71
  self._odps_entry = kw.pop("odps_entry", None)
72
+ dtype_backend = validate_dtype_backend(dtype_backend)
67
73
  super(DataFrameReadODPSTable, self).__init__(
68
- memory_scale=memory_scale, _output_types=[output_type], **kw
74
+ memory_scale=memory_scale,
75
+ dtype_backend=dtype_backend,
76
+ _output_types=[output_type],
77
+ **kw,
69
78
  )
70
79
 
71
80
  @property
@@ -153,6 +162,7 @@ def read_odps_table(
153
162
  odps_entry: ODPS = None,
154
163
  string_as_binary: bool = None,
155
164
  append_partitions: bool = False,
165
+ dtype_backend: str = None,
156
166
  **kw,
157
167
  ):
158
168
  """
@@ -176,6 +186,8 @@ def read_odps_table(
176
186
  append_partitions: bool
177
187
  If True, will add all partition columns as selected columns when
178
188
  `columns` is not specified,
189
+ dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
190
+ Back-end data type applied to the resultant DataFrame (still experimental).
179
191
 
180
192
  Returns
181
193
  -------
@@ -202,9 +214,20 @@ def read_odps_table(
202
214
  else table.table_schema.simple_columns
203
215
  )
204
216
  table_columns = [c.name.lower() for c in cols]
205
- table_dtypes = odps_schema_to_pandas_dtypes(
206
- table.table_schema, with_partitions=True
217
+
218
+ if "use_arrow_dtype" in kw:
219
+ dtype_backend = dtype_backend or validate_dtype_backend(
220
+ kw.pop("use_arrow_dtype")
221
+ )
222
+ dtype_backend = validate_dtype_backend(
223
+ dtype_backend or options.dataframe.dtype_backend
207
224
  )
225
+
226
+ with option_context():
227
+ options.dataframe.dtype_backend = dtype_backend
228
+ table_dtypes = odps_schema_to_pandas_dtypes(
229
+ table.table_schema, with_partitions=True
230
+ )
208
231
  df_types = [table_dtypes[c] for c in table_columns]
209
232
 
210
233
  if isinstance(index_col, str):
@@ -246,7 +269,6 @@ def read_odps_table(
246
269
  dtypes = pd.Series(df_types, index=table_columns)
247
270
  chunk_bytes = kw.pop("chunk_bytes", None)
248
271
  chunk_size = kw.pop("chunk_size", None)
249
- use_arrow_dtype = kw.pop("use_arrow_dtype", True)
250
272
 
251
273
  partitions = partitions or kw.get("partition")
252
274
  if isinstance(partitions, str):
@@ -261,7 +283,7 @@ def read_odps_table(
261
283
  partitions=partitions,
262
284
  dtypes=dtypes,
263
285
  columns=columns,
264
- use_arrow_dtype=use_arrow_dtype,
286
+ dtype_backend=dtype_backend,
265
287
  string_as_binary=string_as_binary,
266
288
  append_partitions=append_partitions,
267
289
  last_modified_time=to_timestamp(table.last_data_modified_time),