maxframe 2.2.0__cp38-cp38-macosx_10_9_universal2.whl → 2.3.0rc1__cp38-cp38-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maxframe/_utils.cpython-38-darwin.so +0 -0
- maxframe/codegen/core.py +3 -2
- maxframe/codegen/spe/dataframe/merge.py +4 -0
- maxframe/codegen/spe/dataframe/misc.py +2 -0
- maxframe/codegen/spe/dataframe/reduction.py +18 -0
- maxframe/codegen/spe/dataframe/sort.py +9 -1
- maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
- maxframe/codegen/spe/dataframe/tseries.py +9 -0
- maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
- maxframe/codegen/spe/tensor/datasource.py +1 -0
- maxframe/config/config.py +3 -0
- maxframe/conftest.py +10 -0
- maxframe/core/base.py +2 -1
- maxframe/core/entity/tileables.py +2 -0
- maxframe/core/graph/core.cpython-38-darwin.so +0 -0
- maxframe/core/graph/entity.py +7 -1
- maxframe/core/mode.py +6 -1
- maxframe/dataframe/__init__.py +2 -2
- maxframe/dataframe/arithmetic/__init__.py +4 -0
- maxframe/dataframe/arithmetic/maximum.py +33 -0
- maxframe/dataframe/arithmetic/minimum.py +33 -0
- maxframe/dataframe/core.py +98 -106
- maxframe/dataframe/datasource/core.py +6 -0
- maxframe/dataframe/datasource/direct.py +57 -0
- maxframe/dataframe/datasource/read_csv.py +19 -11
- maxframe/dataframe/datasource/read_odps_query.py +29 -6
- maxframe/dataframe/datasource/read_odps_table.py +32 -10
- maxframe/dataframe/datasource/read_parquet.py +38 -39
- maxframe/dataframe/datastore/__init__.py +6 -0
- maxframe/dataframe/datastore/direct.py +268 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/flatjson.py +2 -1
- maxframe/dataframe/groupby/__init__.py +5 -1
- maxframe/dataframe/groupby/aggregation.py +10 -6
- maxframe/dataframe/groupby/apply_chunk.py +1 -3
- maxframe/dataframe/groupby/core.py +20 -4
- maxframe/dataframe/indexing/__init__.py +2 -1
- maxframe/dataframe/indexing/insert.py +45 -17
- maxframe/dataframe/merge/__init__.py +3 -0
- maxframe/dataframe/merge/combine.py +244 -0
- maxframe/dataframe/misc/__init__.py +14 -3
- maxframe/dataframe/misc/check_unique.py +41 -10
- maxframe/dataframe/misc/drop.py +31 -0
- maxframe/dataframe/misc/infer_dtypes.py +251 -0
- maxframe/dataframe/misc/map.py +31 -18
- maxframe/dataframe/misc/repeat.py +159 -0
- maxframe/dataframe/misc/tests/test_misc.py +35 -1
- maxframe/dataframe/missing/checkna.py +3 -2
- maxframe/dataframe/reduction/__init__.py +10 -5
- maxframe/dataframe/reduction/aggregation.py +6 -6
- maxframe/dataframe/reduction/argmax.py +7 -4
- maxframe/dataframe/reduction/argmin.py +7 -4
- maxframe/dataframe/reduction/core.py +18 -9
- maxframe/dataframe/reduction/mode.py +144 -0
- maxframe/dataframe/reduction/nunique.py +10 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
- maxframe/dataframe/sort/__init__.py +9 -2
- maxframe/dataframe/sort/argsort.py +7 -1
- maxframe/dataframe/sort/core.py +1 -1
- maxframe/dataframe/sort/rank.py +147 -0
- maxframe/dataframe/tseries/__init__.py +19 -0
- maxframe/dataframe/tseries/at_time.py +61 -0
- maxframe/dataframe/tseries/between_time.py +122 -0
- maxframe/dataframe/utils.py +30 -26
- maxframe/learn/contrib/llm/core.py +16 -7
- maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/config.py +221 -0
- maxframe/learn/contrib/llm/deploy/core.py +247 -0
- maxframe/learn/contrib/llm/deploy/framework.py +35 -0
- maxframe/learn/contrib/llm/deploy/loader.py +360 -0
- maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +12 -6
- maxframe/learn/contrib/llm/models/managed.py +76 -11
- maxframe/learn/contrib/llm/models/openai.py +72 -0
- maxframe/learn/contrib/llm/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/tests/test_core.py +34 -0
- maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
- maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
- maxframe/learn/contrib/llm/text.py +348 -42
- maxframe/learn/contrib/models.py +4 -1
- maxframe/learn/contrib/xgboost/classifier.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +31 -7
- maxframe/learn/contrib/xgboost/predict.py +4 -2
- maxframe/learn/contrib/xgboost/regressor.py +5 -0
- maxframe/learn/contrib/xgboost/train.py +2 -0
- maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
- maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
- maxframe/learn/utils/__init__.py +1 -0
- maxframe/learn/utils/extmath.py +42 -9
- maxframe/learn/utils/odpsio.py +80 -11
- maxframe/lib/filesystem/_oss_lib/common.py +2 -0
- maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
- maxframe/opcodes.py +9 -1
- maxframe/remote/core.py +4 -0
- maxframe/serialization/core.cpython-38-darwin.so +0 -0
- maxframe/serialization/tests/test_serial.py +2 -2
- maxframe/tensor/arithmetic/__init__.py +1 -1
- maxframe/tensor/arithmetic/core.py +2 -2
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
- maxframe/tensor/core.py +3 -0
- maxframe/tensor/misc/copyto.py +1 -1
- maxframe/tests/test_udf.py +61 -0
- maxframe/tests/test_utils.py +8 -5
- maxframe/udf.py +103 -7
- maxframe/utils.py +61 -8
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
- maxframe_client/session/task.py +8 -1
- maxframe_client/tests/test_session.py +24 -0
- maxframe/dataframe/arrays.py +0 -864
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
maxframe/dataframe/datasource/read_parquet.py

@@ -32,6 +32,7 @@ except ImportError:
 
 from ... import opcodes
 from ...config import options
+from ...lib.dtypes_extension import ArrowDtype
 from ...lib.filesystem import FileSystem, get_fs, glob, open_file
 from ...serialization.serializables import (
     AnyField,
@@ -43,10 +44,13 @@ from ...serialization.serializables import (
     StringField,
 )
 from ...utils import lazy_import
-from ..arrays import ArrowStringDtype
 from ..operators import OutputType
 from ..utils import parse_index, to_arrow_dtypes
-from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
+from .core import (
+    ColumnPruneSupportedDataSourceMixin,
+    DtypeBackendCompatibleMixin,
+    IncrementalIndexDatasource,
+)
 
 PARQUET_MEMORY_SCALE = 15
 STRING_FIELD_OVERHEAD = 50
@@ -89,13 +93,11 @@ class ParquetEngine:
     def read_dtypes(self, f, **kwargs):
         raise NotImplementedError
 
-    def read_to_pandas(
-        self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
-    ):
+    def read_to_pandas(self, f, columns=None, nrows=None, dtype_backend=None, **kwargs):
         raise NotImplementedError
 
     def read_group_to_pandas(
-        self, f, group_index, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
+        self, f, group_index, columns=None, nrows=None, dtype_backend=None, **kwargs
     ):
         raise NotImplementedError
 
@@ -106,11 +108,11 @@ class ParquetEngine:
         partition_keys: Dict,
         columns=None,
         nrows=None,
-        use_arrow_dtype=None,
+        dtype_backend=None,
         **kwargs,
     ):
         raw_df = self.read_to_pandas(
-            f, columns=columns, nrows=nrows, use_arrow_dtype=use_arrow_dtype, **kwargs
+            f, columns=columns, nrows=nrows, dtype_backend=dtype_backend, **kwargs
         )
         for col, value in partition_keys.items():
             dictionary = partitions[col]
@@ -169,28 +171,26 @@ class ArrowEngine(ParquetEngine):
         return file.schema_arrow.empty_table().to_pandas().dtypes
 
     @classmethod
-    def _table_to_pandas(cls, t, nrows=None, use_arrow_dtype=None):
+    def _table_to_pandas(cls, t, nrows=None, dtype_backend=None):
         if nrows is not None:
             t = t.slice(0, nrows)
-        if use_arrow_dtype:
-            df = t.to_pandas(types_mapper={pa.string(): ArrowStringDtype()}.get)
+        if dtype_backend == "pyarrow":
+            df = t.to_pandas(types_mapper={pa.string(): ArrowDtype(pa.string())}.get)
         else:
             df = t.to_pandas()
         return df
 
-    def read_to_pandas(
-        self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
-    ):
+    def read_to_pandas(self, f, columns=None, nrows=None, dtype_backend=None, **kwargs):
         file = pq.ParquetFile(f)
         t = file.read(columns=columns, **kwargs)
-        return self._table_to_pandas(t, nrows=nrows, use_arrow_dtype=use_arrow_dtype)
+        return self._table_to_pandas(t, nrows=nrows, dtype_backend=dtype_backend)
 
     def read_group_to_pandas(
-        self, f, group_index, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
+        self, f, group_index, columns=None, nrows=None, dtype_backend=None, **kwargs
     ):
         file = pq.ParquetFile(f)
         t = file.read_row_group(group_index, columns=columns, **kwargs)
-        return self._table_to_pandas(t, nrows=nrows, use_arrow_dtype=use_arrow_dtype)
+        return self._table_to_pandas(t, nrows=nrows, dtype_backend=dtype_backend)
 
 
 class FastpaquetEngine(ParquetEngine):
@@ -203,14 +203,12 @@ class FastpaquetEngine(ParquetEngine):
         dtypes_dict = file._dtypes()
         return pd.Series(dict((c, dtypes_dict[c]) for c in file.columns))
 
-    def read_to_pandas(
-        self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
-    ):
+    def read_to_pandas(self, f, columns=None, nrows=None, dtype_backend=None, **kwargs):
         file = fastparquet.ParquetFile(f)
         df = file.to_pandas(columns, **kwargs)
         if nrows is not None:
             df = df.head(nrows)
-        if use_arrow_dtype:
+        if dtype_backend == "pyarrow":
             df = df.astype(to_arrow_dtypes(df.dtypes).to_dict())
         return df
 
@@ -265,29 +263,30 @@ class CudfEngine:
 class DataFrameReadParquet(
     IncrementalIndexDatasource,
     ColumnPruneSupportedDataSourceMixin,
+    DtypeBackendCompatibleMixin,
 ):
     _op_type_ = opcodes.READ_PARQUET
 
     path = AnyField("path")
     engine = StringField("engine")
     columns = ListField("columns")
-    use_arrow_dtype = BoolField("use_arrow_dtype")
-    groups_as_chunks = BoolField("groups_as_chunks")
-    group_index = Int32Field("group_index")
-    read_kwargs = DictField("read_kwargs")
-    incremental_index = BoolField("incremental_index")
-    storage_options = DictField("storage_options")
-    is_partitioned = BoolField("is_partitioned")
-    merge_small_files = BoolField("merge_small_files")
-    merge_small_file_options = DictField("merge_small_file_options")
+    dtype_backend = StringField("dtype_backend", default=None)
+    groups_as_chunks = BoolField("groups_as_chunks", default=None)
+    group_index = Int32Field("group_index", default=None)
+    read_kwargs = DictField("read_kwargs", default=None)
+    incremental_index = BoolField("incremental_index", default=None)
+    storage_options = DictField("storage_options", default=None)
+    is_partitioned = BoolField("is_partitioned", default=None)
+    merge_small_files = BoolField("merge_small_files", default=None)
+    merge_small_file_options = DictField("merge_small_file_options", default=None)
     # for chunk
     partitions = DictField("partitions", default=None)
    partition_keys = DictField("partition_keys", default=None)
     num_group_rows = Int64Field("num_group_rows", default=None)
     # as read meta may be too time-consuming when number of files is large,
     # thus we only read first file to get row number and raw file size
-    first_chunk_row_num = Int64Field("first_chunk_row_num")
-    first_chunk_raw_bytes = Int64Field("first_chunk_raw_bytes")
+    first_chunk_row_num = Int64Field("first_chunk_row_num", default=None)
+    first_chunk_raw_bytes = Int64Field("first_chunk_raw_bytes", default=None)
 
     def get_columns(self):
         return self.columns
@@ -319,7 +318,7 @@ def read_parquet(
     engine: str = "auto",
     columns: list = None,
     groups_as_chunks: bool = False,
-    use_arrow_dtype: bool = None,
+    dtype_backend: str = None,
     incremental_index: bool = False,
     storage_options: dict = None,
     memory_scale: int = None,
@@ -356,8 +355,8 @@ def read_parquet(
     incremental_index: bool, default False
         If index_col not specified, ensure range index incremental,
         gain a slightly better performance if setting False.
-    use_arrow_dtype: bool, default None
-        If True, use arrow dtype to store columns.
+    dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
+        Back-end data type applied to the resultant DataFrame (still experimental).
     storage_options: dict, optional
         Options for storage connection.
     memory_scale: int, optional
@@ -401,9 +400,9 @@ def read_parquet(
     if columns:
         dtypes = dtypes[columns]
 
-    if use_arrow_dtype is None:
-        use_arrow_dtype = options.dataframe.use_arrow_dtype
-    if use_arrow_dtype:
+    if dtype_backend is None:
+        dtype_backend = options.dataframe.dtype_backend
+    if dtype_backend == "pyarrow":
        dtypes = to_arrow_dtypes(dtypes)
 
     index_value = parse_index(pd.RangeIndex(-1))
@@ -413,7 +412,7 @@ def read_parquet(
         engine=engine_type,
         columns=columns,
         groups_as_chunks=groups_as_chunks,
-        use_arrow_dtype=use_arrow_dtype,
+        dtype_backend=dtype_backend,
         read_kwargs=kwargs,
         incremental_index=incremental_index,
         storage_options=storage_options,
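
The practical upshot: read_parquet drops the boolean use_arrow_dtype flag in favor of the pandas-2.x-style dtype_backend string, with the default taken from options.dataframe.dtype_backend when the argument is None. A minimal usage sketch of the new parameter (the OSS path below is illustrative, not from the diff):

    import maxframe.dataframe as md
    from maxframe.config import options

    # per call: request pyarrow-backed dtypes for the result
    df = md.read_parquet("oss://bucket/path/data.parquet", dtype_backend="pyarrow")

    # or set the global default that read_parquet consults when
    # dtype_backend is left as None
    options.dataframe.dtype_backend = "pyarrow"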

maxframe/dataframe/datastore/__init__.py

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .direct import df_to_dict, series_to_dict, series_to_list, to_clipboard
 from .to_csv import to_csv
 from .to_odps import to_odps_table
 
@@ -20,10 +21,15 @@ def _install():
     from ..core import DATAFRAME_TYPE, SERIES_TYPE
 
     for t in DATAFRAME_TYPE:
+        t.to_clipboard = to_clipboard
         t.to_csv = to_csv
+        t.to_dict = df_to_dict
         t.to_odps_table = to_odps_table
     for t in SERIES_TYPE:
+        t.to_clipboard = to_clipboard
         t.to_csv = to_csv
+        t.to_dict = series_to_dict
+        t.to_list = series_to_list
 
 
 _install()
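
For context, _install() works by plain attribute assignment: binding a module-level function onto the DataFrame and Series classes makes it a bound method on every instance through Python's descriptor protocol. A self-contained sketch of the same pattern (names here are illustrative, not maxframe's):

    class Frame:
        pass

    def to_dict(self):
        # `self` receives the instance once the function is a class attribute
        return {"type": type(self).__name__}

    Frame.to_dict = to_dict      # same trick _install() uses
    print(Frame().to_dict())     # {'type': 'Frame'}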

maxframe/dataframe/datastore/direct.py (new file)

@@ -0,0 +1,268 @@
+# Copyright 1999-2025 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils import pd_release_version
+
+_to_dict_has_index = pd_release_version[0] >= 2
+
+
+def df_to_dict(
+    df, orient="dict", into=dict, index=True, batch_size=10000, session=None
+):
+    """
+    Convert the DataFrame to a dictionary.
+
+    The type of the key-value pairs can be customized with the parameters
+    (see below).
+
+    Parameters
+    ----------
+    orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
+        Determines the type of the values of the dictionary.
+
+        - 'dict' (default) : dict like {column -> {index -> value}}
+        - 'list' : dict like {column -> [values]}
+        - 'series' : dict like {column -> Series(values)}
+        - 'split' : dict like
+          {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
+        - 'tight' : dict like
+          {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
+          'index_names' -> [index.names], 'column_names' -> [column.names]}
+        - 'records' : list like
+          [{column -> value}, ... , {column -> value}]
+        - 'index' : dict like {index -> {column -> value}}
+
+    into : class, default dict
+        The collections.abc.MutableMapping subclass used for all Mappings
+        in the return value. Can be the actual class or an empty
+        instance of the mapping type you want. If you want a
+        collections.defaultdict, you must pass it initialized.
+
+    index : bool, default True
+        Whether to include the index item (and index_names item if `orient`
+        is 'tight') in the returned dictionary. Can only be ``False``
+        when `orient` is 'split' or 'tight'.
+
+    Returns
+    -------
+    dict, list or collections.abc.MutableMapping
+        Return a collections.abc.MutableMapping object representing the
+        DataFrame. The resulting transformation depends on the `orient`
+        parameter.
+
+    See Also
+    --------
+    DataFrame.from_dict: Create a DataFrame from a dictionary.
+    DataFrame.to_json: Convert a DataFrame to JSON format.
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> df = md.DataFrame({'col1': [1, 2],
+    ...                    'col2': [0.5, 0.75]},
+    ...                   index=['row1', 'row2'])
+    >>> df.execute()
+          col1  col2
+    row1     1  0.50
+    row2     2  0.75
+    >>> df.to_dict()
+    {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
+
+    You can specify the return orientation.
+
+    >>> df.to_dict('series')
+    {'col1': row1    1
+             row2    2
+    Name: col1, dtype: int64,
+    'col2': row1    0.50
+            row2    0.75
+    Name: col2, dtype: float64}
+
+    >>> df.to_dict('split')
+    {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+     'data': [[1, 0.5], [2, 0.75]]}
+
+    >>> df.to_dict('records')
+    [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
+
+    >>> df.to_dict('index')
+    {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
+
+    >>> df.to_dict('tight')
+    {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+     'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
+
+    You can also specify the mapping type.
+
+    >>> from collections import OrderedDict, defaultdict
+    >>> df.to_dict(into=OrderedDict)
+    OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
+                 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
+
+    If you want a `defaultdict`, you need to initialize it:
+
+    >>> dd = defaultdict(list)
+    >>> df.to_dict('records', into=dd)
+    [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
+     defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
+    """
+    fetch_kwargs = dict(batch_size=batch_size)
+    to_dict_kw = dict(orient=orient, into=into)
+    if _to_dict_has_index:
+        to_dict_kw["index"] = index
+    return df.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_dict(
+        **to_dict_kw
+    )
+
+
+def series_to_dict(series, into=dict, batch_size=10000, session=None):
+    """
+    Convert Series to {label -> value} dict or dict-like object.
+
+    Parameters
+    ----------
+    into : class, default dict
+        The collections.abc.Mapping subclass to use as the return
+        object. Can be the actual class or an empty
+        instance of the mapping type you want. If you want a
+        collections.defaultdict, you must pass it initialized.
+
+    Returns
+    -------
+    collections.abc.Mapping
+        Key-value representation of Series.
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> s = md.Series([1, 2, 3, 4])
+    >>> s.to_dict()
+    {0: 1, 1: 2, 2: 3, 3: 4}
+    >>> from collections import OrderedDict, defaultdict
+    >>> s.to_dict(OrderedDict)
+    OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
+    >>> dd = defaultdict(list)
+    >>> s.to_dict(dd)
+    defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
+    """
+    fetch_kwargs = dict(batch_size=batch_size)
+    return series.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_dict(
+        into=into
+    )
+
+
+def series_to_list(series, batch_size=10000, session=None):
+    """
+    Return a list of the values.
+
+    These are each a scalar type, which is a Python scalar
+    (for str, int, float) or a pandas scalar
+    (for Timestamp/Timedelta/Interval/Period)
+
+    Returns
+    -------
+    list
+
+    See Also
+    --------
+    numpy.ndarray.tolist : Return the array as an a.ndim-levels deep
+        nested list of Python scalars.
+
+    Examples
+    --------
+    For Series
+
+    >>> import maxframe.dataframe as md
+    >>> s = md.Series([1, 2, 3])
+    >>> s.to_list()
+    [1, 2, 3]
+
+    For Index:
+
+    >>> idx = md.Index([1, 2, 3])
+    >>> idx.execute()
+    Index([1, 2, 3], dtype='int64')
+
+    >>> idx.to_list()
+    [1, 2, 3]
+    """
+    fetch_kwargs = dict(batch_size=batch_size)
+    return series.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_list()
+
+
+def to_clipboard(
+    obj, *, excel=True, sep=None, batch_size=10000, session=None, **kwargs
+):
+    """
+    Copy object to the system clipboard.
+
+    Write a text representation of object to the system clipboard.
+    This can be pasted into Excel, for example.
+
+    Parameters
+    ----------
+    excel : bool, default True
+        Produce output in a csv format for easy pasting into excel.
+
+        - True, use the provided separator for csv pasting.
+        - False, write a string representation of the object to the clipboard.
+
+    sep : str, default ``'\t'``
+        Field delimiter.
+    **kwargs
+        These parameters will be passed to DataFrame.to_csv.
+
+    See Also
+    --------
+    DataFrame.to_csv : Write a DataFrame to a comma-separated values
+        (csv) file.
+    read_clipboard : Read text from clipboard and pass to read_csv.
+
+    Notes
+    -----
+    Requirements for your platform.
+
+    - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
+    - Windows : none
+    - macOS : none
+
+    This method uses the processes developed for the package `pyperclip`. A
+    solution to render any output string format is given in the examples.
+
+    Examples
+    --------
+    Copy the contents of a DataFrame to the clipboard.
+
+    >>> import maxframe.dataframe as md
+    >>> df = md.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
+
+    >>> df.to_clipboard(sep=',')  # doctest: +SKIP
+    ... # Wrote the following to the system clipboard:
+    ... # ,A,B,C
+    ... # 0,1,2,3
+    ... # 1,4,5,6
+
+    We can omit the index by passing the keyword `index` and setting
+    it to false.
+
+    >>> df.to_clipboard(sep=',', index=False)  # doctest: +SKIP
+    ... # Wrote the following to the system clipboard:
+    ... # A,B,C
+    ... # 1,2,3
+    ... # 4,5,6
+    """
+    fetch_kwargs = dict(batch_size=batch_size)
+    return obj.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_clipboard(
+        excel=excel, sep=sep, **kwargs
+    )
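
All four helpers funnel through to_pandas(session=..., fetch_kwargs=...): the remote result is pulled to the client in batches (10000 rows by default) and the plain pandas method then runs locally. A hedged sketch of the tuning knobs:

    import maxframe.dataframe as md

    s = md.Series(range(100000))
    # larger batches mean fewer fetch round trips; both arguments are optional
    values = s.to_list(batch_size=50000)
    mapping = s.to_dict(batch_size=50000)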

maxframe/dataframe/datastore/to_odps.py

@@ -57,10 +57,16 @@ class DataFrameToODPSTable(DataFrameDataStore):
     lifecycle = Int64Field("lifecycle", default=None)
     table_properties = DictField("table_properties", default=None)
     primary_key = ListField("primary_key", FieldTypes.string, default=None)
+    use_generated_table_meta = BoolField("use_generated_table_meta", default=False)
 
     def __init__(self, **kw):
         super().__init__(_output_types=[OutputType.dataframe], **kw)
 
+    def check_inputs(self, inputs: List[TileableType]):
+        if self.use_generated_table_meta:
+            return None
+        return super().check_inputs(inputs)
+
     def __call__(self, x):
         shape = (0,) * len(x.shape)
         index_value = parse_index(x.index_value.to_pandas()[:0], x.key, "index")
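
The check_inputs override is a guard: when use_generated_table_meta is set, the operator skips the base class's input validation so the store step can rely on table metadata generated at run time. A minimal standalone sketch of the pattern (class names here are illustrative, not maxframe's):

    class BaseOp:
        def check_inputs(self, inputs):
            if not inputs:
                raise ValueError("at least one input is required")

    class StoreOp(BaseOp):
        use_generated_meta = False

        def check_inputs(self, inputs):
            # bypass validation when metadata is produced later
            if self.use_generated_meta:
                return None
            return super().check_inputs(inputs)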

maxframe/dataframe/extensions/flatjson.py

@@ -39,12 +39,13 @@ class SeriesFlatJSONOperator(DataFrameOperator, DataFrameOperatorMixin):
             name=name,
             dtype=make_dtype(dtype),
         )
+        dtypes = make_dtypes(dtypes)
         return self.new_dataframe(
             [series],
             shape=(series.shape[0], len(dtypes)),
             index_value=series.index_value,
             columns_value=parse_index(dtypes.index, store_data=True),
-            dtypes=make_dtypes(dtypes),
+            dtypes=dtypes,
         )
 
 
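
The point of the change: dtypes is normalized through make_dtypes once, up front, so len(dtypes), dtypes.index, and the dtypes= keyword all see the same pandas Series rather than whatever mapping the caller passed. A small pandas-only illustration of why that matters (make_dtypes is internal to maxframe; this stand-in mimics its dict-to-Series behavior):

    import numpy as np
    import pandas as pd

    def make_dtypes_standin(dtypes):
        # mimic the helper: accept a mapping, return a pd.Series of dtypes
        if isinstance(dtypes, dict):
            return pd.Series({k: np.dtype(v) for k, v in dtypes.items()})
        return dtypes

    raw = {"a": "int64", "b": "float64"}
    dtypes = make_dtypes_standin(raw)
    print(len(dtypes), list(dtypes.index))  # 2 ['a', 'b'] -- a plain dict has no .index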

maxframe/dataframe/groupby/__init__.py

@@ -14,7 +14,7 @@
 
 # noinspection PyUnresolvedReferences
 from ..core import DataFrameGroupBy, GroupBy, SeriesGroupBy
-from .core import NamedAgg
+from .core import _make_named_agg_compat
 from .expanding import ExpandingGroupby
 from .rolling import RollingGroupby
 
@@ -99,3 +99,7 @@ def _install():
 
 _install()
 del _install
+
+
+__getattr__ = _make_named_agg_compat
+del _make_named_agg_compat

maxframe/dataframe/groupby/aggregation.py

@@ -21,7 +21,7 @@ import pandas as pd
 
 from ... import opcodes
 from ...config import options
-from ...core import ENTITY_TYPE, EntityData, OutputType
+from ...core import ENTITY_TYPE, EntityData, OutputType, enter_mode
 from ...serialization import PickleContainer
 from ...serialization.serializables import (
     AnyField,
@@ -34,7 +34,7 @@ from ...serialization.serializables import (
     StringField,
 )
 from ...udf import BuiltinFunction
-from ...utils import find_objects, lazy_import, pd_release_version
+from ...utils import find_objects, get_pd_option, lazy_import, pd_release_version
 from ..core import GROUPBY_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..reduction.aggregation import (
@@ -116,7 +116,10 @@ def build_mock_agg_result(
     **raw_func_kw,
 ):
     try:
-        agg_result = groupby.op.build_mock_groupby().aggregate(raw_func, **raw_func_kw)
+        with enter_mode(mock=True):
+            agg_result = groupby.op.build_mock_groupby().aggregate(
+                raw_func, **raw_func_kw
+            )
     except ValueError:
         if (
             groupby_params.get("as_index") or _support_get_group_without_as_index
@@ -377,9 +380,10 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
     1  1  2  0.590715
     2  3  4  0.704907
 
-    To control the output names with different aggregations per column,
+    To control the output names with different aggregations per column,
+    MaxFrame supports “named aggregation”
 
-    >>> from maxframe.dataframe
+    >>> from maxframe.dataframe import NamedAgg
     >>> df.groupby("A").agg(
     ...     b_min=NamedAgg(column="B", aggfunc="min"),
     ...     c_sum=NamedAgg(column="C", aggfunc="sum")).execute()
@@ -432,6 +436,6 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
         groupby_params=groupby.op.groupby_params,
         combine_size=combine_size,
         chunk_store_limit=options.chunk_store_limit,
-        use_inf_as_na=
+        use_inf_as_na=get_pd_option("mode.use_inf_as_na", False),
     )
     return agg_op(groupby)
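
Net effect for users: NamedAgg keeps working in groupby aggregation, but its documented import path moves to the top-level dataframe package (the old location is shimmed; see groupby/core.py below). A minimal usage sketch, following the docstring above:

    import maxframe.dataframe as md
    from maxframe.dataframe import NamedAgg

    df = md.DataFrame({"A": [1, 1, 2], "B": [1, 2, 3], "C": [0.5, 0.25, 0.75]})
    result = df.groupby("A").agg(
        b_min=NamedAgg(column="B", aggfunc="min"),
        c_sum=NamedAgg(column="C", aggfunc="sum"),
    ).execute()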

maxframe/dataframe/groupby/apply_chunk.py

@@ -29,7 +29,7 @@ from ...serialization.serializables import (
     TupleField,
 )
 from ...udf import BuiltinFunction, MarkedFunction
-from ...utils import copy_if_possible
+from ...utils import copy_if_possible, make_dtype, make_dtypes
 from ..core import (
     DATAFRAME_GROUPBY_TYPE,
     GROUPBY_TYPE,
@@ -45,8 +45,6 @@ from ..utils import (
     copy_func_scheduling_hints,
     infer_dataframe_return_value,
     make_column_list,
-    make_dtype,
-    make_dtypes,
     parse_index,
     validate_output_types,
 )

maxframe/dataframe/groupby/core.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from collections import namedtuple
+import os
+import warnings
 from typing import Any, Dict, List
 
 import pandas as pd
@@ -20,6 +21,7 @@ import pandas as pd
 from ... import opcodes
 from ...core import ENTITY_TYPE, Entity, EntityData, OutputType
 from ...core.operator import MapReduceOperator
+from ...env import MAXFRAME_INSIDE_TASK
 from ...serialization import PickleContainer
 from ...serialization.serializables import AnyField, BoolField, DictField, Int32Field
 from ...udf import BuiltinFunction
@@ -38,9 +40,6 @@ from ..utils import (
 cudf = lazy_import("cudf")
 
 
-NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
-
-
 class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.GROUPBY
     _legacy_name = "DataFrameGroupByOperator"  # since v2.0.0
@@ -324,3 +323,20 @@ class BaseGroupByWindowOp(DataFrameOperatorMixin, DataFrameOperator):
         name, dtype = out_dtypes
         kw.update(dtype=dtype, name=name, shape=(groupby.shape[0],))
         return self.new_tileable([in_df], **kw)
+
+
+def _make_named_agg_compat(name):  # pragma: no cover
+    # to make imports compatible
+    from ..reduction import NamedAgg
+
+    if name == "NamedAgg":
+        if MAXFRAME_INSIDE_TASK not in os.environ:
+            warnings.warn(
+                "Please import NamedAgg from maxframe.dataframe",
+                DeprecationWarning,
+            )
+        return NamedAgg
+    raise AttributeError(f"module {__name__} has no attribute {name}")
+
+
+__getattr__ = _make_named_agg_compat
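
_make_named_agg_compat is a module-level __getattr__ (PEP 562): legacy imports of NamedAgg from the old groupby modules still resolve, but emit a DeprecationWarning unless running inside a MaxFrame task, while the object itself now lives in the reduction package. A self-contained sketch of the same mechanism (module contents here are illustrative):

    # compat.py -- PEP 562: module __getattr__ intercepts missing attributes
    import warnings
    from collections import namedtuple

    _NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])  # stand-in object

    def __getattr__(name):
        if name == "NamedAgg":
            warnings.warn(
                "Please import NamedAgg from its new location", DeprecationWarning
            )
            return _NamedAgg
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")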

maxframe/dataframe/indexing/__init__.py

@@ -29,7 +29,7 @@ def _install():
     from .getitem import dataframe_getitem, series_getitem
     from .iat import iat
     from .iloc import head, iloc, index_getitem, index_setitem, tail
-    from .insert import df_insert
+    from .insert import df_insert, index_insert
     from .loc import loc
     from .reindex import reindex, reindex_like
     from .rename import df_rename, index_rename, index_set_names, series_rename
@@ -94,6 +94,7 @@ def _install():
     setattr(cls, "droplevel", index_droplevel)
     setattr(cls, "get_level_values", get_level_values)
     setattr(cls, "__getitem__", index_getitem)
+    setattr(cls, "insert", index_insert)
     setattr(cls, "rename", index_rename)
     setattr(cls, "__setitem__", index_setitem)
     setattr(cls, "set_names", index_set_names)
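
The newly wired index_insert gives maxframe Index objects a pandas-style Index.insert(loc, item) surface. A hedged usage sketch, assuming it mirrors pandas semantics as the installation above suggests:

    import maxframe.dataframe as md

    idx = md.Index([1, 2, 4])
    # insert the value 3 before position 2, returning a new Index
    new_idx = idx.insert(2, 3)
    print(new_idx.execute())  # expected: Index([1, 2, 3, 4], dtype='int64')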