maxframe 2.2.0__cp38-cp38-macosx_10_9_universal2.whl → 2.3.0rc1__cp38-cp38-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cpython-38-darwin.so +0 -0
- maxframe/codegen/core.py +3 -2
- maxframe/codegen/spe/dataframe/merge.py +4 -0
- maxframe/codegen/spe/dataframe/misc.py +2 -0
- maxframe/codegen/spe/dataframe/reduction.py +18 -0
- maxframe/codegen/spe/dataframe/sort.py +9 -1
- maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
- maxframe/codegen/spe/dataframe/tseries.py +9 -0
- maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
- maxframe/codegen/spe/tensor/datasource.py +1 -0
- maxframe/config/config.py +3 -0
- maxframe/conftest.py +10 -0
- maxframe/core/base.py +2 -1
- maxframe/core/entity/tileables.py +2 -0
- maxframe/core/graph/core.cpython-38-darwin.so +0 -0
- maxframe/core/graph/entity.py +7 -1
- maxframe/core/mode.py +6 -1
- maxframe/dataframe/__init__.py +2 -2
- maxframe/dataframe/arithmetic/__init__.py +4 -0
- maxframe/dataframe/arithmetic/maximum.py +33 -0
- maxframe/dataframe/arithmetic/minimum.py +33 -0
- maxframe/dataframe/core.py +98 -106
- maxframe/dataframe/datasource/core.py +6 -0
- maxframe/dataframe/datasource/direct.py +57 -0
- maxframe/dataframe/datasource/read_csv.py +19 -11
- maxframe/dataframe/datasource/read_odps_query.py +29 -6
- maxframe/dataframe/datasource/read_odps_table.py +32 -10
- maxframe/dataframe/datasource/read_parquet.py +38 -39
- maxframe/dataframe/datastore/__init__.py +6 -0
- maxframe/dataframe/datastore/direct.py +268 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/flatjson.py +2 -1
- maxframe/dataframe/groupby/__init__.py +5 -1
- maxframe/dataframe/groupby/aggregation.py +10 -6
- maxframe/dataframe/groupby/apply_chunk.py +1 -3
- maxframe/dataframe/groupby/core.py +20 -4
- maxframe/dataframe/indexing/__init__.py +2 -1
- maxframe/dataframe/indexing/insert.py +45 -17
- maxframe/dataframe/merge/__init__.py +3 -0
- maxframe/dataframe/merge/combine.py +244 -0
- maxframe/dataframe/misc/__init__.py +14 -3
- maxframe/dataframe/misc/check_unique.py +41 -10
- maxframe/dataframe/misc/drop.py +31 -0
- maxframe/dataframe/misc/infer_dtypes.py +251 -0
- maxframe/dataframe/misc/map.py +31 -18
- maxframe/dataframe/misc/repeat.py +159 -0
- maxframe/dataframe/misc/tests/test_misc.py +35 -1
- maxframe/dataframe/missing/checkna.py +3 -2
- maxframe/dataframe/reduction/__init__.py +10 -5
- maxframe/dataframe/reduction/aggregation.py +6 -6
- maxframe/dataframe/reduction/argmax.py +7 -4
- maxframe/dataframe/reduction/argmin.py +7 -4
- maxframe/dataframe/reduction/core.py +18 -9
- maxframe/dataframe/reduction/mode.py +144 -0
- maxframe/dataframe/reduction/nunique.py +10 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
- maxframe/dataframe/sort/__init__.py +9 -2
- maxframe/dataframe/sort/argsort.py +7 -1
- maxframe/dataframe/sort/core.py +1 -1
- maxframe/dataframe/sort/rank.py +147 -0
- maxframe/dataframe/tseries/__init__.py +19 -0
- maxframe/dataframe/tseries/at_time.py +61 -0
- maxframe/dataframe/tseries/between_time.py +122 -0
- maxframe/dataframe/utils.py +30 -26
- maxframe/learn/contrib/llm/core.py +16 -7
- maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/config.py +221 -0
- maxframe/learn/contrib/llm/deploy/core.py +247 -0
- maxframe/learn/contrib/llm/deploy/framework.py +35 -0
- maxframe/learn/contrib/llm/deploy/loader.py +360 -0
- maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +12 -6
- maxframe/learn/contrib/llm/models/managed.py +76 -11
- maxframe/learn/contrib/llm/models/openai.py +72 -0
- maxframe/learn/contrib/llm/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/tests/test_core.py +34 -0
- maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
- maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
- maxframe/learn/contrib/llm/text.py +348 -42
- maxframe/learn/contrib/models.py +4 -1
- maxframe/learn/contrib/xgboost/classifier.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +31 -7
- maxframe/learn/contrib/xgboost/predict.py +4 -2
- maxframe/learn/contrib/xgboost/regressor.py +5 -0
- maxframe/learn/contrib/xgboost/train.py +2 -0
- maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
- maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
- maxframe/learn/utils/__init__.py +1 -0
- maxframe/learn/utils/extmath.py +42 -9
- maxframe/learn/utils/odpsio.py +80 -11
- maxframe/lib/filesystem/_oss_lib/common.py +2 -0
- maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
- maxframe/opcodes.py +9 -1
- maxframe/remote/core.py +4 -0
- maxframe/serialization/core.cpython-38-darwin.so +0 -0
- maxframe/serialization/tests/test_serial.py +2 -2
- maxframe/tensor/arithmetic/__init__.py +1 -1
- maxframe/tensor/arithmetic/core.py +2 -2
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
- maxframe/tensor/core.py +3 -0
- maxframe/tensor/misc/copyto.py +1 -1
- maxframe/tests/test_udf.py +61 -0
- maxframe/tests/test_utils.py +8 -5
- maxframe/udf.py +103 -7
- maxframe/utils.py +61 -8
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
- maxframe_client/session/task.py +8 -1
- maxframe_client/tests/test_session.py +24 -0
- maxframe/dataframe/arrays.py +0 -864
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
maxframe/dataframe/core.py
CHANGED
|
@@ -715,55 +715,6 @@ class IndexData(HasShapeTileableData, _ToPandasMixin):
|
|
|
715
715
|
return from_index(self, dtype=dtype, extract_multi_index=extract_multi_index)
|
|
716
716
|
|
|
717
717
|
def to_frame(self, index: bool = True, name=None):
|
|
718
|
-
"""
|
|
719
|
-
Create a DataFrame with a column containing the Index.
|
|
720
|
-
|
|
721
|
-
Parameters
|
|
722
|
-
----------
|
|
723
|
-
index : bool, default True
|
|
724
|
-
Set the index of the returned DataFrame as the original Index.
|
|
725
|
-
|
|
726
|
-
name : object, default None
|
|
727
|
-
The passed name should substitute for the index name (if it has
|
|
728
|
-
one).
|
|
729
|
-
|
|
730
|
-
Returns
|
|
731
|
-
-------
|
|
732
|
-
DataFrame
|
|
733
|
-
DataFrame containing the original Index data.
|
|
734
|
-
|
|
735
|
-
See Also
|
|
736
|
-
--------
|
|
737
|
-
Index.to_series : Convert an Index to a Series.
|
|
738
|
-
Series.to_frame : Convert Series to DataFrame.
|
|
739
|
-
|
|
740
|
-
Examples
|
|
741
|
-
--------
|
|
742
|
-
>>> import maxframe.dataframe as md
|
|
743
|
-
>>> idx = md.Index(['Ant', 'Bear', 'Cow'], name='animal')
|
|
744
|
-
>>> idx.to_frame().execute()
|
|
745
|
-
animal
|
|
746
|
-
animal
|
|
747
|
-
Ant Ant
|
|
748
|
-
Bear Bear
|
|
749
|
-
Cow Cow
|
|
750
|
-
|
|
751
|
-
By default, the original Index is reused. To enforce a new Index:
|
|
752
|
-
|
|
753
|
-
>>> idx.to_frame(index=False).execute()
|
|
754
|
-
animal
|
|
755
|
-
0 Ant
|
|
756
|
-
1 Bear
|
|
757
|
-
2 Cow
|
|
758
|
-
|
|
759
|
-
To override the name of the resulting column, specify `name`:
|
|
760
|
-
|
|
761
|
-
>>> idx.to_frame(index=False, name='zoo').execute()
|
|
762
|
-
zoo
|
|
763
|
-
0 Ant
|
|
764
|
-
1 Bear
|
|
765
|
-
2 Cow
|
|
766
|
-
"""
|
|
767
718
|
from . import dataframe_from_tensor
|
|
768
719
|
|
|
769
720
|
if isinstance(self.index_value.value, IndexValue.MultiIndex):
|
|
@@ -789,34 +740,20 @@ class IndexData(HasShapeTileableData, _ToPandasMixin):
|
|
|
789
740
|
columns = [name or self.name or 0]
|
|
790
741
|
index_ = self if index else None
|
|
791
742
|
return dataframe_from_tensor(
|
|
792
|
-
self._to_maxframe_tensor(
|
|
743
|
+
self._to_maxframe_tensor(extract_multi_index=True),
|
|
793
744
|
index=index_,
|
|
794
745
|
columns=columns,
|
|
795
746
|
)
|
|
796
747
|
|
|
797
748
|
def to_series(self, index=None, name=None):
|
|
798
|
-
"""
|
|
799
|
-
Create a Series with both index and values equal to the index keys.
|
|
800
|
-
|
|
801
|
-
Useful with map for returning an indexer based on an index.
|
|
802
|
-
|
|
803
|
-
Parameters
|
|
804
|
-
----------
|
|
805
|
-
index : Index, optional
|
|
806
|
-
Index of resulting Series. If None, defaults to original index.
|
|
807
|
-
name : str, optional
|
|
808
|
-
Name of resulting Series. If None, defaults to name of original
|
|
809
|
-
index.
|
|
810
|
-
|
|
811
|
-
Returns
|
|
812
|
-
-------
|
|
813
|
-
Series
|
|
814
|
-
The dtype will be based on the type of the Index values.
|
|
815
|
-
"""
|
|
816
749
|
from . import series_from_index
|
|
817
750
|
|
|
818
751
|
return series_from_index(self, index=index, name=name)
|
|
819
752
|
|
|
753
|
+
@property
|
|
754
|
+
def hasnans(self):
|
|
755
|
+
return self.isna().any()
|
|
756
|
+
|
|
820
757
|
|
|
821
758
|
class Index(HasShapeTileable, _ToPandasMixin):
|
|
822
759
|
__slots__ = "_df_or_series", "_parent_key", "_axis"
|
|
@@ -887,6 +824,99 @@ class Index(HasShapeTileable, _ToPandasMixin):
|
|
|
887
824
|
def values(self):
|
|
888
825
|
return self.to_tensor()
|
|
889
826
|
|
|
827
|
+
def to_frame(self, index: bool = True, name=None):
|
|
828
|
+
"""
|
|
829
|
+
Create a DataFrame with a column containing the Index.
|
|
830
|
+
|
|
831
|
+
Parameters
|
|
832
|
+
----------
|
|
833
|
+
index : bool, default True
|
|
834
|
+
Set the index of the returned DataFrame as the original Index.
|
|
835
|
+
|
|
836
|
+
name : object, default None
|
|
837
|
+
The passed name should substitute for the index name (if it has
|
|
838
|
+
one).
|
|
839
|
+
|
|
840
|
+
Returns
|
|
841
|
+
-------
|
|
842
|
+
DataFrame
|
|
843
|
+
DataFrame containing the original Index data.
|
|
844
|
+
|
|
845
|
+
See Also
|
|
846
|
+
--------
|
|
847
|
+
Index.to_series : Convert an Index to a Series.
|
|
848
|
+
Series.to_frame : Convert Series to DataFrame.
|
|
849
|
+
|
|
850
|
+
Examples
|
|
851
|
+
--------
|
|
852
|
+
>>> import maxframe.dataframe as md
|
|
853
|
+
>>> idx = md.Index(['Ant', 'Bear', 'Cow'], name='animal')
|
|
854
|
+
>>> idx.to_frame().execute()
|
|
855
|
+
animal
|
|
856
|
+
animal
|
|
857
|
+
Ant Ant
|
|
858
|
+
Bear Bear
|
|
859
|
+
Cow Cow
|
|
860
|
+
|
|
861
|
+
By default, the original Index is reused. To enforce a new Index:
|
|
862
|
+
|
|
863
|
+
>>> idx.to_frame(index=False).execute()
|
|
864
|
+
animal
|
|
865
|
+
0 Ant
|
|
866
|
+
1 Bear
|
|
867
|
+
2 Cow
|
|
868
|
+
|
|
869
|
+
To override the name of the resulting column, specify `name`:
|
|
870
|
+
|
|
871
|
+
>>> idx.to_frame(index=False, name='zoo').execute()
|
|
872
|
+
zoo
|
|
873
|
+
0 Ant
|
|
874
|
+
1 Bear
|
|
875
|
+
2 Cow
|
|
876
|
+
"""
|
|
877
|
+
return self._data.to_frame(index=index, name=name)
|
|
878
|
+
|
|
879
|
+
def to_series(self, index=None, name=None):
|
|
880
|
+
"""
|
|
881
|
+
Create a Series with both index and values equal to the index keys.
|
|
882
|
+
|
|
883
|
+
Useful with map for returning an indexer based on an index.
|
|
884
|
+
|
|
885
|
+
Parameters
|
|
886
|
+
----------
|
|
887
|
+
index : Index, optional
|
|
888
|
+
Index of resulting Series. If None, defaults to original index.
|
|
889
|
+
name : str, optional
|
|
890
|
+
Name of resulting Series. If None, defaults to name of original
|
|
891
|
+
index.
|
|
892
|
+
|
|
893
|
+
Returns
|
|
894
|
+
-------
|
|
895
|
+
Series
|
|
896
|
+
The dtype will be based on the type of the Index values.
|
|
897
|
+
"""
|
|
898
|
+
return self._data.to_series(index=index, name=name)
|
|
899
|
+
|
|
900
|
+
@property
|
|
901
|
+
def hasnans(self):
|
|
902
|
+
"""
|
|
903
|
+
Return True if there are any NaNs.
|
|
904
|
+
|
|
905
|
+
Returns
|
|
906
|
+
-------
|
|
907
|
+
bool
|
|
908
|
+
|
|
909
|
+
Examples
|
|
910
|
+
--------
|
|
911
|
+
>>> import maxframe.dataframe as md
|
|
912
|
+
>>> idx = md.Index([1, 2, 3, None])
|
|
913
|
+
>>> idx.execute()
|
|
914
|
+
Index([1.0, 2.0, 3.0, nan], dtype='float64')
|
|
915
|
+
>>> idx.hasnans.execute()
|
|
916
|
+
True
|
|
917
|
+
"""
|
|
918
|
+
return self._data.hasnans
|
|
919
|
+
|
|
890
920
|
|
|
891
921
|
class RangeIndex(Index):
|
|
892
922
|
__slots__ = ()
|
|
@@ -1085,12 +1115,6 @@ class SeriesData(_BatchedFetcher, BaseSeriesData):
|
|
|
1085
1115
|
|
|
1086
1116
|
items = iteritems
|
|
1087
1117
|
|
|
1088
|
-
def to_dict(self, into=dict, batch_size=10000, session=None):
|
|
1089
|
-
fetch_kwargs = dict(batch_size=batch_size)
|
|
1090
|
-
return self.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_dict(
|
|
1091
|
-
into=into
|
|
1092
|
-
)
|
|
1093
|
-
|
|
1094
1118
|
def to_frame(self, name=None):
|
|
1095
1119
|
from . import dataframe_from_tensor
|
|
1096
1120
|
|
|
@@ -1285,38 +1309,6 @@ class Series(HasShapeTileable, _ToPandasMixin):
|
|
|
1285
1309
|
|
|
1286
1310
|
items = iteritems
|
|
1287
1311
|
|
|
1288
|
-
def to_dict(self, into=dict, batch_size=10000, session=None):
|
|
1289
|
-
"""
|
|
1290
|
-
Convert Series to {label -> value} dict or dict-like object.
|
|
1291
|
-
|
|
1292
|
-
Parameters
|
|
1293
|
-
----------
|
|
1294
|
-
into : class, default dict
|
|
1295
|
-
The collections.abc.Mapping subclass to use as the return
|
|
1296
|
-
object. Can be the actual class or an empty
|
|
1297
|
-
instance of the mapping type you want. If you want a
|
|
1298
|
-
collections.defaultdict, you must pass it initialized.
|
|
1299
|
-
|
|
1300
|
-
Returns
|
|
1301
|
-
-------
|
|
1302
|
-
collections.abc.Mapping
|
|
1303
|
-
Key-value representation of Series.
|
|
1304
|
-
|
|
1305
|
-
Examples
|
|
1306
|
-
--------
|
|
1307
|
-
>>> import maxframe.dataframe as md
|
|
1308
|
-
>>> s = md.Series([1, 2, 3, 4])
|
|
1309
|
-
>>> s.to_dict()
|
|
1310
|
-
{0: 1, 1: 2, 2: 3, 3: 4}
|
|
1311
|
-
>>> from collections import OrderedDict, defaultdict
|
|
1312
|
-
>>> s.to_dict(OrderedDict)
|
|
1313
|
-
OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
|
|
1314
|
-
>>> dd = defaultdict(list)
|
|
1315
|
-
>>> s.to_dict(dd)
|
|
1316
|
-
defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
|
|
1317
|
-
"""
|
|
1318
|
-
return self._data.to_dict(into=into, batch_size=batch_size, session=session)
|
|
1319
|
-
|
|
1320
1312
|
def to_frame(self, name=None):
|
|
1321
1313
|
"""
|
|
1322
1314
|
Convert Series to DataFrame.
|
|
@@ -18,6 +18,7 @@ from typing import List, MutableMapping, Optional, Union
|
|
|
18
18
|
from ...serialization.serializables import Int64Field, StringField
|
|
19
19
|
from ...utils import estimate_pandas_size
|
|
20
20
|
from ..operators import DataFrameOperator, DataFrameOperatorMixin
|
|
21
|
+
from ..utils import validate_dtype_backend
|
|
21
22
|
|
|
22
23
|
|
|
23
24
|
class HeadOptimizedDataSource(DataFrameOperator, DataFrameOperatorMixin):
|
|
@@ -86,3 +87,8 @@ class PandasDataSourceOperator(DataFrameOperator):
|
|
|
86
87
|
cls, ctx: MutableMapping[str, Union[int, float]], op: "PandasDataSourceOperator"
|
|
87
88
|
):
|
|
88
89
|
ctx[op.outputs[0].key] = estimate_pandas_size(op.get_data())
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class DtypeBackendCompatibleMixin:
|
|
93
|
+
def __on_deserialize__(self):
|
|
94
|
+
self.dtype_backend = validate_dtype_backend(self.dtype_backend)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Copyright 1999-2025 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def read_clipboard(sep=None, **kwargs):
|
|
19
|
+
"""
|
|
20
|
+
Read text from clipboard and pass to :func:`~pandas.read_csv`.
|
|
21
|
+
|
|
22
|
+
Parses clipboard contents similar to how CSV files are parsed
|
|
23
|
+
using :func:`~pandas.read_csv`.
|
|
24
|
+
|
|
25
|
+
Parameters
|
|
26
|
+
----------
|
|
27
|
+
sep : str, default '\\s+'
|
|
28
|
+
A string or regex delimiter. The default of ``'\\s+'`` denotes
|
|
29
|
+
one or more whitespace characters.
|
|
30
|
+
|
|
31
|
+
**kwargs
|
|
32
|
+
See :func:`~pandas.read_csv` for the full argument list.
|
|
33
|
+
|
|
34
|
+
Returns
|
|
35
|
+
-------
|
|
36
|
+
DataFrame
|
|
37
|
+
A parsed :class:`DataFrame` object.
|
|
38
|
+
|
|
39
|
+
See Also
|
|
40
|
+
--------
|
|
41
|
+
DataFrame.to_clipboard : Copy object to the system clipboard.
|
|
42
|
+
read_csv : Read a comma-separated values (csv) file into DataFrame.
|
|
43
|
+
read_fwf : Read a table of fixed-width formatted lines into DataFrame.
|
|
44
|
+
|
|
45
|
+
Examples
|
|
46
|
+
--------
|
|
47
|
+
>>> import maxframe.dataframe as md
|
|
48
|
+
>>> df = md.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
|
|
49
|
+
>>> df.to_clipboard() # doctest: +SKIP
|
|
50
|
+
>>> md.read_clipboard().execute()  # doctest: +SKIP
|
|
51
|
+
A B C
|
|
52
|
+
0 1 2 3
|
|
53
|
+
1 4 5 6
|
|
54
|
+
"""
|
|
55
|
+
from ..initializer import DataFrame
|
|
56
|
+
|
|
57
|
+
return DataFrame(pd.read_clipboard(sep=sep, **kwargs))
|
|
@@ -38,8 +38,12 @@ from ...serialization.serializables import (
|
|
|
38
38
|
StringField,
|
|
39
39
|
)
|
|
40
40
|
from ...utils import lazy_import, parse_readable_size
|
|
41
|
-
from ..utils import parse_index, to_arrow_dtypes
|
|
42
|
-
from .core import
|
|
41
|
+
from ..utils import parse_index, to_arrow_dtypes, validate_dtype_backend
|
|
42
|
+
from .core import (
|
|
43
|
+
ColumnPruneSupportedDataSourceMixin,
|
|
44
|
+
DtypeBackendCompatibleMixin,
|
|
45
|
+
IncrementalIndexDatasource,
|
|
46
|
+
)
|
|
43
47
|
|
|
44
48
|
cudf = lazy_import("cudf")
|
|
45
49
|
|
|
@@ -88,6 +92,7 @@ def _find_chunk_start_end(f, offset, size):
|
|
|
88
92
|
class DataFrameReadCSV(
|
|
89
93
|
IncrementalIndexDatasource,
|
|
90
94
|
ColumnPruneSupportedDataSourceMixin,
|
|
95
|
+
DtypeBackendCompatibleMixin,
|
|
91
96
|
):
|
|
92
97
|
_op_type_ = opcodes.READ_CSV
|
|
93
98
|
|
|
@@ -101,7 +106,7 @@ class DataFrameReadCSV(
|
|
|
101
106
|
offset = Int64Field("offset")
|
|
102
107
|
size = Int64Field("size")
|
|
103
108
|
incremental_index = BoolField("incremental_index")
|
|
104
|
-
|
|
109
|
+
dtype_backend = StringField("dtype_backend", default=None)
|
|
105
110
|
keep_usecols_order = BoolField("keep_usecols_order", default=None)
|
|
106
111
|
storage_options = DictField("storage_options")
|
|
107
112
|
merge_small_files = BoolField("merge_small_files")
|
|
@@ -151,7 +156,7 @@ def read_csv(
|
|
|
151
156
|
head_bytes="100k",
|
|
152
157
|
head_lines=None,
|
|
153
158
|
incremental_index: bool = True,
|
|
154
|
-
|
|
159
|
+
dtype_backend: str = None,
|
|
155
160
|
storage_options: dict = None,
|
|
156
161
|
memory_scale: int = None,
|
|
157
162
|
merge_small_files: bool = True,
|
|
@@ -419,8 +424,8 @@ def read_csv(
|
|
|
419
424
|
incremental_index: bool, default True
|
|
420
425
|
If index_col not specified, ensure range index incremental,
|
|
421
426
|
gain a slightly better performance if setting False.
|
|
422
|
-
|
|
423
|
-
|
|
427
|
+
dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
|
|
428
|
+
Back-end data type applied to the resultant DataFrame (still experimental).
|
|
424
429
|
storage_options: dict, optional
|
|
425
430
|
Options for storage connection.
|
|
426
431
|
merge_small_files: bool, default True
|
|
@@ -509,7 +514,7 @@ def read_csv(
|
|
|
509
514
|
compression=compression,
|
|
510
515
|
gpu=gpu,
|
|
511
516
|
incremental_index=incremental_index,
|
|
512
|
-
|
|
517
|
+
dtype_backend=dtype_backend,
|
|
513
518
|
storage_options=storage_options,
|
|
514
519
|
memory_scale=memory_scale,
|
|
515
520
|
merge_small_files=merge_small_files,
|
|
@@ -518,10 +523,13 @@ def read_csv(
|
|
|
518
523
|
)
|
|
519
524
|
chunk_bytes = chunk_bytes or options.chunk_store_limit
|
|
520
525
|
dtypes = mini_df.dtypes
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
526
|
+
|
|
527
|
+
dtype_backend = validate_dtype_backend(
|
|
528
|
+
dtype_backend or options.dataframe.dtype_backend
|
|
529
|
+
)
|
|
530
|
+
|
|
531
|
+
if not gpu and dtype_backend == "pyarrow":
|
|
532
|
+
dtypes = to_arrow_dtypes(dtypes)
|
|
525
533
|
ret = op(
|
|
526
534
|
index_value=index_value,
|
|
527
535
|
columns_value=columns_value,
|
|
@@ -29,7 +29,7 @@ from odps.types import Column, OdpsSchema, validate_data_type
|
|
|
29
29
|
from odps.utils import split_sql_by_semicolon
|
|
30
30
|
|
|
31
31
|
from ... import opcodes
|
|
32
|
-
from ...config import options
|
|
32
|
+
from ...config import option_context, options
|
|
33
33
|
from ...core import OutputType
|
|
34
34
|
from ...core.graph import DAG
|
|
35
35
|
from ...io.odpsio import odps_schema_to_pandas_dtypes
|
|
@@ -44,8 +44,12 @@ from ...serialization.serializables import (
|
|
|
44
44
|
StringField,
|
|
45
45
|
)
|
|
46
46
|
from ...utils import is_empty
|
|
47
|
-
from ..utils import parse_index
|
|
48
|
-
from .core import
|
|
47
|
+
from ..utils import parse_index, validate_dtype_backend
|
|
48
|
+
from .core import (
|
|
49
|
+
ColumnPruneSupportedDataSourceMixin,
|
|
50
|
+
DtypeBackendCompatibleMixin,
|
|
51
|
+
IncrementalIndexDatasource,
|
|
52
|
+
)
|
|
49
53
|
|
|
50
54
|
logger = logging.getLogger(__name__)
|
|
51
55
|
|
|
@@ -266,6 +270,7 @@ def _build_explain_sql(
|
|
|
266
270
|
class DataFrameReadODPSQuery(
|
|
267
271
|
IncrementalIndexDatasource,
|
|
268
272
|
ColumnPruneSupportedDataSourceMixin,
|
|
273
|
+
DtypeBackendCompatibleMixin,
|
|
269
274
|
):
|
|
270
275
|
_op_type_ = opcodes.READ_ODPS_QUERY
|
|
271
276
|
|
|
@@ -273,12 +278,16 @@ class DataFrameReadODPSQuery(
|
|
|
273
278
|
dtypes = SeriesField("dtypes", default=None)
|
|
274
279
|
columns = AnyField("columns", default=None)
|
|
275
280
|
nrows = Int64Field("nrows", default=None)
|
|
276
|
-
|
|
281
|
+
dtype_backend = StringField("dtype_backend", default=None)
|
|
277
282
|
string_as_binary = BoolField("string_as_binary", default=None)
|
|
278
283
|
index_columns = ListField("index_columns", FieldTypes.string, default=None)
|
|
279
284
|
index_dtypes = SeriesField("index_dtypes", default=None)
|
|
280
285
|
column_renames = DictField("column_renames", default=None)
|
|
281
286
|
|
|
287
|
+
def __init__(self, dtype_backend=None, **kw):
|
|
288
|
+
dtype_backend = validate_dtype_backend(dtype_backend)
|
|
289
|
+
super().__init__(dtype_backend=dtype_backend, **kw)
|
|
290
|
+
|
|
282
291
|
def get_columns(self):
|
|
283
292
|
return self.columns or list(self.dtypes.index)
|
|
284
293
|
|
|
@@ -404,6 +413,7 @@ def read_odps_query(
|
|
|
404
413
|
sql_hints: Dict[str, str] = None,
|
|
405
414
|
anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
|
|
406
415
|
skip_schema: bool = False,
|
|
416
|
+
dtype_backend: str = None,
|
|
407
417
|
**kw,
|
|
408
418
|
):
|
|
409
419
|
"""
|
|
@@ -428,6 +438,8 @@ def read_odps_query(
|
|
|
428
438
|
Skip resolving output schema before execution. Once this is configured,
|
|
429
439
|
the output DataFrame cannot be inputs of other DataFrame operators
|
|
430
440
|
before execution.
|
|
441
|
+
dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
|
|
442
|
+
Back-end data type applied to the resultant DataFrame (still experimental).
|
|
431
443
|
|
|
432
444
|
Returns
|
|
433
445
|
-------
|
|
@@ -459,6 +471,14 @@ def read_odps_query(
|
|
|
459
471
|
if odps_entry is None:
|
|
460
472
|
raise ValueError("Missing odps_entry parameter")
|
|
461
473
|
|
|
474
|
+
if "use_arrow_dtype" in kw:
|
|
475
|
+
dtype_backend = dtype_backend or validate_dtype_backend(
|
|
476
|
+
kw.pop("use_arrow_dtype")
|
|
477
|
+
)
|
|
478
|
+
dtype_backend = validate_dtype_backend(
|
|
479
|
+
dtype_backend or options.dataframe.dtype_backend
|
|
480
|
+
)
|
|
481
|
+
|
|
462
482
|
col_renames = {}
|
|
463
483
|
if not skip_schema:
|
|
464
484
|
odps_schema = _resolve_query_schema(
|
|
@@ -479,7 +499,9 @@ def read_odps_query(
|
|
|
479
499
|
else:
|
|
480
500
|
new_columns.append(col)
|
|
481
501
|
|
|
482
|
-
|
|
502
|
+
with option_context():
|
|
503
|
+
options.dataframe.dtype_backend = dtype_backend
|
|
504
|
+
dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))
|
|
483
505
|
else:
|
|
484
506
|
dtypes = None
|
|
485
507
|
|
|
@@ -500,10 +522,11 @@ def read_odps_query(
|
|
|
500
522
|
|
|
501
523
|
chunk_bytes = kw.pop("chunk_bytes", None)
|
|
502
524
|
chunk_size = kw.pop("chunk_size", None)
|
|
525
|
+
|
|
503
526
|
op = DataFrameReadODPSQuery(
|
|
504
527
|
query=query,
|
|
505
528
|
dtypes=dtypes,
|
|
506
|
-
|
|
529
|
+
dtype_backend=dtype_backend,
|
|
507
530
|
string_as_binary=string_as_binary,
|
|
508
531
|
index_columns=index_col,
|
|
509
532
|
index_dtypes=index_dtypes,
|
|
@@ -22,7 +22,7 @@ from odps.models import Table
|
|
|
22
22
|
from odps.utils import to_timestamp
|
|
23
23
|
|
|
24
24
|
from ... import opcodes
|
|
25
|
-
from ...config import options
|
|
25
|
+
from ...config import option_context, options
|
|
26
26
|
from ...core import OutputType
|
|
27
27
|
from ...io.odpsio import odps_schema_to_pandas_dtypes
|
|
28
28
|
from ...serialization.serializables import (
|
|
@@ -36,8 +36,12 @@ from ...serialization.serializables import (
|
|
|
36
36
|
)
|
|
37
37
|
from ...utils import estimate_table_size, is_empty
|
|
38
38
|
from ..core import DataFrame # noqa: F401
|
|
39
|
-
from ..utils import parse_index
|
|
40
|
-
from .core import
|
|
39
|
+
from ..utils import parse_index, validate_dtype_backend
|
|
40
|
+
from .core import (
|
|
41
|
+
ColumnPruneSupportedDataSourceMixin,
|
|
42
|
+
DtypeBackendCompatibleMixin,
|
|
43
|
+
IncrementalIndexDatasource,
|
|
44
|
+
)
|
|
41
45
|
|
|
42
46
|
logger = logging.getLogger(__name__)
|
|
43
47
|
|
|
@@ -45,6 +49,7 @@ logger = logging.getLogger(__name__)
|
|
|
45
49
|
class DataFrameReadODPSTable(
|
|
46
50
|
IncrementalIndexDatasource,
|
|
47
51
|
ColumnPruneSupportedDataSourceMixin,
|
|
52
|
+
DtypeBackendCompatibleMixin,
|
|
48
53
|
):
|
|
49
54
|
__slots__ = ("_odps_entry",)
|
|
50
55
|
_op_type_ = opcodes.READ_ODPS_TABLE
|
|
@@ -54,18 +59,22 @@ class DataFrameReadODPSTable(
|
|
|
54
59
|
dtypes = SeriesField("dtypes", default=None)
|
|
55
60
|
columns = AnyField("columns", default=None)
|
|
56
61
|
nrows = Int64Field("nrows", default=None)
|
|
57
|
-
|
|
62
|
+
dtype_backend = StringField("dtype_backend", default=None)
|
|
58
63
|
string_as_binary = BoolField("string_as_binary", default=None)
|
|
59
64
|
append_partitions = BoolField("append_partitions", default=None)
|
|
60
65
|
last_modified_time = Int64Field("last_modified_time", default=None)
|
|
61
66
|
index_columns = ListField("index_columns", FieldTypes.string, default=None)
|
|
62
67
|
index_dtypes = SeriesField("index_dtypes", default=None)
|
|
63
68
|
|
|
64
|
-
def __init__(self, memory_scale=None, **kw):
|
|
69
|
+
def __init__(self, memory_scale=None, dtype_backend=None, **kw):
|
|
65
70
|
output_type = kw.pop("output_type", OutputType.dataframe)
|
|
66
71
|
self._odps_entry = kw.pop("odps_entry", None)
|
|
72
|
+
dtype_backend = validate_dtype_backend(dtype_backend)
|
|
67
73
|
super(DataFrameReadODPSTable, self).__init__(
|
|
68
|
-
memory_scale=memory_scale,
|
|
74
|
+
memory_scale=memory_scale,
|
|
75
|
+
dtype_backend=dtype_backend,
|
|
76
|
+
_output_types=[output_type],
|
|
77
|
+
**kw,
|
|
69
78
|
)
|
|
70
79
|
|
|
71
80
|
@property
|
|
@@ -153,6 +162,7 @@ def read_odps_table(
|
|
|
153
162
|
odps_entry: ODPS = None,
|
|
154
163
|
string_as_binary: bool = None,
|
|
155
164
|
append_partitions: bool = False,
|
|
165
|
+
dtype_backend: str = None,
|
|
156
166
|
**kw,
|
|
157
167
|
):
|
|
158
168
|
"""
|
|
@@ -176,6 +186,8 @@ def read_odps_table(
|
|
|
176
186
|
append_partitions: bool
|
|
177
187
|
If True, will add all partition columns as selected columns when
|
|
178
188
|
`columns` is not specified.
|
|
189
|
+
dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
|
|
190
|
+
Back-end data type applied to the resultant DataFrame (still experimental).
|
|
179
191
|
|
|
180
192
|
Returns
|
|
181
193
|
-------
|
|
@@ -202,9 +214,20 @@ def read_odps_table(
|
|
|
202
214
|
else table.table_schema.simple_columns
|
|
203
215
|
)
|
|
204
216
|
table_columns = [c.name.lower() for c in cols]
|
|
205
|
-
|
|
206
|
-
|
|
217
|
+
|
|
218
|
+
if "use_arrow_dtype" in kw:
|
|
219
|
+
dtype_backend = dtype_backend or validate_dtype_backend(
|
|
220
|
+
kw.pop("use_arrow_dtype")
|
|
221
|
+
)
|
|
222
|
+
dtype_backend = validate_dtype_backend(
|
|
223
|
+
dtype_backend or options.dataframe.dtype_backend
|
|
207
224
|
)
|
|
225
|
+
|
|
226
|
+
with option_context():
|
|
227
|
+
options.dataframe.dtype_backend = dtype_backend
|
|
228
|
+
table_dtypes = odps_schema_to_pandas_dtypes(
|
|
229
|
+
table.table_schema, with_partitions=True
|
|
230
|
+
)
|
|
208
231
|
df_types = [table_dtypes[c] for c in table_columns]
|
|
209
232
|
|
|
210
233
|
if isinstance(index_col, str):
|
|
@@ -246,7 +269,6 @@ def read_odps_table(
|
|
|
246
269
|
dtypes = pd.Series(df_types, index=table_columns)
|
|
247
270
|
chunk_bytes = kw.pop("chunk_bytes", None)
|
|
248
271
|
chunk_size = kw.pop("chunk_size", None)
|
|
249
|
-
use_arrow_dtype = kw.pop("use_arrow_dtype", True)
|
|
250
272
|
|
|
251
273
|
partitions = partitions or kw.get("partition")
|
|
252
274
|
if isinstance(partitions, str):
|
|
@@ -261,7 +283,7 @@ def read_odps_table(
|
|
|
261
283
|
partitions=partitions,
|
|
262
284
|
dtypes=dtypes,
|
|
263
285
|
columns=columns,
|
|
264
|
-
|
|
286
|
+
dtype_backend=dtype_backend,
|
|
265
287
|
string_as_binary=string_as_binary,
|
|
266
288
|
append_partitions=append_partitions,
|
|
267
289
|
last_modified_time=to_timestamp(table.last_data_modified_time),
|