maxframe 2.2.0-cp39-cp39-win_amd64.whl → 2.3.0rc1-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (114)
  1. maxframe/_utils.cp39-win_amd64.pyd +0 -0
  2. maxframe/codegen/core.py +3 -2
  3. maxframe/codegen/spe/dataframe/merge.py +4 -0
  4. maxframe/codegen/spe/dataframe/misc.py +2 -0
  5. maxframe/codegen/spe/dataframe/reduction.py +18 -0
  6. maxframe/codegen/spe/dataframe/sort.py +9 -1
  7. maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
  8. maxframe/codegen/spe/dataframe/tseries.py +9 -0
  9. maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
  10. maxframe/codegen/spe/tensor/datasource.py +1 -0
  11. maxframe/config/config.py +3 -0
  12. maxframe/conftest.py +10 -0
  13. maxframe/core/base.py +2 -1
  14. maxframe/core/entity/tileables.py +2 -0
  15. maxframe/core/graph/core.cp39-win_amd64.pyd +0 -0
  16. maxframe/core/graph/entity.py +7 -1
  17. maxframe/core/mode.py +6 -1
  18. maxframe/dataframe/__init__.py +2 -2
  19. maxframe/dataframe/arithmetic/__init__.py +4 -0
  20. maxframe/dataframe/arithmetic/maximum.py +33 -0
  21. maxframe/dataframe/arithmetic/minimum.py +33 -0
  22. maxframe/dataframe/core.py +98 -106
  23. maxframe/dataframe/datasource/core.py +6 -0
  24. maxframe/dataframe/datasource/direct.py +57 -0
  25. maxframe/dataframe/datasource/read_csv.py +19 -11
  26. maxframe/dataframe/datasource/read_odps_query.py +29 -6
  27. maxframe/dataframe/datasource/read_odps_table.py +32 -10
  28. maxframe/dataframe/datasource/read_parquet.py +38 -39
  29. maxframe/dataframe/datastore/__init__.py +6 -0
  30. maxframe/dataframe/datastore/direct.py +268 -0
  31. maxframe/dataframe/datastore/to_odps.py +6 -0
  32. maxframe/dataframe/extensions/flatjson.py +2 -1
  33. maxframe/dataframe/groupby/__init__.py +5 -1
  34. maxframe/dataframe/groupby/aggregation.py +10 -6
  35. maxframe/dataframe/groupby/apply_chunk.py +1 -3
  36. maxframe/dataframe/groupby/core.py +20 -4
  37. maxframe/dataframe/indexing/__init__.py +2 -1
  38. maxframe/dataframe/indexing/insert.py +45 -17
  39. maxframe/dataframe/merge/__init__.py +3 -0
  40. maxframe/dataframe/merge/combine.py +244 -0
  41. maxframe/dataframe/misc/__init__.py +14 -3
  42. maxframe/dataframe/misc/check_unique.py +41 -10
  43. maxframe/dataframe/misc/drop.py +31 -0
  44. maxframe/dataframe/misc/infer_dtypes.py +251 -0
  45. maxframe/dataframe/misc/map.py +31 -18
  46. maxframe/dataframe/misc/repeat.py +159 -0
  47. maxframe/dataframe/misc/tests/test_misc.py +35 -1
  48. maxframe/dataframe/missing/checkna.py +3 -2
  49. maxframe/dataframe/reduction/__init__.py +10 -5
  50. maxframe/dataframe/reduction/aggregation.py +6 -6
  51. maxframe/dataframe/reduction/argmax.py +7 -4
  52. maxframe/dataframe/reduction/argmin.py +7 -4
  53. maxframe/dataframe/reduction/core.py +18 -9
  54. maxframe/dataframe/reduction/mode.py +144 -0
  55. maxframe/dataframe/reduction/nunique.py +10 -3
  56. maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
  57. maxframe/dataframe/sort/__init__.py +9 -2
  58. maxframe/dataframe/sort/argsort.py +7 -1
  59. maxframe/dataframe/sort/core.py +1 -1
  60. maxframe/dataframe/sort/rank.py +147 -0
  61. maxframe/dataframe/tseries/__init__.py +19 -0
  62. maxframe/dataframe/tseries/at_time.py +61 -0
  63. maxframe/dataframe/tseries/between_time.py +122 -0
  64. maxframe/dataframe/utils.py +30 -26
  65. maxframe/learn/contrib/llm/core.py +16 -7
  66. maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
  67. maxframe/learn/contrib/llm/deploy/config.py +221 -0
  68. maxframe/learn/contrib/llm/deploy/core.py +247 -0
  69. maxframe/learn/contrib/llm/deploy/framework.py +35 -0
  70. maxframe/learn/contrib/llm/deploy/loader.py +360 -0
  71. maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
  72. maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
  73. maxframe/learn/contrib/llm/models/__init__.py +1 -0
  74. maxframe/learn/contrib/llm/models/dashscope.py +12 -6
  75. maxframe/learn/contrib/llm/models/managed.py +76 -11
  76. maxframe/learn/contrib/llm/models/openai.py +72 -0
  77. maxframe/learn/contrib/llm/tests/__init__.py +13 -0
  78. maxframe/learn/contrib/llm/tests/test_core.py +34 -0
  79. maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
  80. maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
  81. maxframe/learn/contrib/llm/text.py +348 -42
  82. maxframe/learn/contrib/models.py +4 -1
  83. maxframe/learn/contrib/xgboost/classifier.py +2 -0
  84. maxframe/learn/contrib/xgboost/core.py +31 -7
  85. maxframe/learn/contrib/xgboost/predict.py +4 -2
  86. maxframe/learn/contrib/xgboost/regressor.py +5 -0
  87. maxframe/learn/contrib/xgboost/train.py +2 -0
  88. maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
  89. maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
  90. maxframe/learn/utils/__init__.py +1 -0
  91. maxframe/learn/utils/extmath.py +42 -9
  92. maxframe/learn/utils/odpsio.py +80 -11
  93. maxframe/lib/filesystem/_oss_lib/common.py +2 -0
  94. maxframe/lib/mmh3.cp39-win_amd64.pyd +0 -0
  95. maxframe/opcodes.py +9 -1
  96. maxframe/remote/core.py +4 -0
  97. maxframe/serialization/core.cp39-win_amd64.pyd +0 -0
  98. maxframe/serialization/tests/test_serial.py +2 -2
  99. maxframe/tensor/arithmetic/__init__.py +1 -1
  100. maxframe/tensor/arithmetic/core.py +2 -2
  101. maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
  102. maxframe/tensor/core.py +3 -0
  103. maxframe/tensor/misc/copyto.py +1 -1
  104. maxframe/tests/test_udf.py +61 -0
  105. maxframe/tests/test_utils.py +8 -5
  106. maxframe/udf.py +103 -7
  107. maxframe/utils.py +61 -8
  108. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
  109. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
  110. maxframe_client/session/task.py +8 -1
  111. maxframe_client/tests/test_session.py +24 -0
  112. maxframe/dataframe/arrays.py +0 -864
  113. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
  114. {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
maxframe/dataframe/datasource/read_parquet.py

@@ -32,6 +32,7 @@ except ImportError:
 
 from ... import opcodes
 from ...config import options
+from ...lib.dtypes_extension import ArrowDtype
 from ...lib.filesystem import FileSystem, get_fs, glob, open_file
 from ...serialization.serializables import (
     AnyField,
@@ -43,10 +44,13 @@ from ...serialization.serializables import (
     StringField,
 )
 from ...utils import lazy_import
-from ..arrays import ArrowStringDtype
 from ..operators import OutputType
 from ..utils import parse_index, to_arrow_dtypes
-from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
+from .core import (
+    ColumnPruneSupportedDataSourceMixin,
+    DtypeBackendCompatibleMixin,
+    IncrementalIndexDatasource,
+)
 
 PARQUET_MEMORY_SCALE = 15
 STRING_FIELD_OVERHEAD = 50
@@ -89,13 +93,11 @@ class ParquetEngine:
     def read_dtypes(self, f, **kwargs):
         raise NotImplementedError
 
-    def read_to_pandas(
-        self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
-    ):
+    def read_to_pandas(self, f, columns=None, nrows=None, dtype_backend=None, **kwargs):
         raise NotImplementedError
 
     def read_group_to_pandas(
-        self, f, group_index, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
+        self, f, group_index, columns=None, nrows=None, dtype_backend=None, **kwargs
     ):
         raise NotImplementedError
 
@@ -106,11 +108,11 @@ class ParquetEngine:
         partition_keys: Dict,
         columns=None,
         nrows=None,
-        use_arrow_dtype=None,
+        dtype_backend=None,
         **kwargs,
     ):
         raw_df = self.read_to_pandas(
-            f, columns=columns, nrows=nrows, use_arrow_dtype=use_arrow_dtype, **kwargs
+            f, columns=columns, nrows=nrows, dtype_backend=dtype_backend, **kwargs
        )
         for col, value in partition_keys.items():
             dictionary = partitions[col]
@@ -169,28 +171,26 @@ class ArrowEngine(ParquetEngine):
         return file.schema_arrow.empty_table().to_pandas().dtypes
 
     @classmethod
-    def _table_to_pandas(cls, t, nrows=None, use_arrow_dtype=None):
+    def _table_to_pandas(cls, t, nrows=None, dtype_backend=None):
         if nrows is not None:
             t = t.slice(0, nrows)
-        if use_arrow_dtype:
-            df = t.to_pandas(types_mapper={pa.string(): ArrowStringDtype()}.get)
+        if dtype_backend == "pyarrow":
+            df = t.to_pandas(types_mapper={pa.string(): ArrowDtype(pa.string())}.get)
         else:
             df = t.to_pandas()
         return df
 
-    def read_to_pandas(
-        self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
-    ):
+    def read_to_pandas(self, f, columns=None, nrows=None, dtype_backend=None, **kwargs):
         file = pq.ParquetFile(f)
         t = file.read(columns=columns, **kwargs)
-        return self._table_to_pandas(t, nrows=nrows, use_arrow_dtype=use_arrow_dtype)
+        return self._table_to_pandas(t, nrows=nrows, dtype_backend=dtype_backend)
 
     def read_group_to_pandas(
-        self, f, group_index, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
+        self, f, group_index, columns=None, nrows=None, dtype_backend=None, **kwargs
     ):
         file = pq.ParquetFile(f)
         t = file.read_row_group(group_index, columns=columns, **kwargs)
-        return self._table_to_pandas(t, nrows=nrows, use_arrow_dtype=use_arrow_dtype)
+        return self._table_to_pandas(t, nrows=nrows, dtype_backend=dtype_backend)
 
 
 class FastpaquetEngine(ParquetEngine):
@@ -203,14 +203,12 @@ class FastpaquetEngine(ParquetEngine):
         dtypes_dict = file._dtypes()
         return pd.Series(dict((c, dtypes_dict[c]) for c in file.columns))
 
-    def read_to_pandas(
-        self, f, columns=None, nrows=None, use_arrow_dtype=None, **kwargs
-    ):
+    def read_to_pandas(self, f, columns=None, nrows=None, dtype_backend=None, **kwargs):
         file = fastparquet.ParquetFile(f)
         df = file.to_pandas(columns, **kwargs)
         if nrows is not None:
             df = df.head(nrows)
-        if use_arrow_dtype:
+        if dtype_backend == "pyarrow":
             df = df.astype(to_arrow_dtypes(df.dtypes).to_dict())
         return df
 
@@ -265,29 +263,30 @@ class CudfEngine:
 class DataFrameReadParquet(
     IncrementalIndexDatasource,
     ColumnPruneSupportedDataSourceMixin,
+    DtypeBackendCompatibleMixin,
 ):
     _op_type_ = opcodes.READ_PARQUET
 
     path = AnyField("path")
     engine = StringField("engine")
     columns = ListField("columns")
-    use_arrow_dtype = BoolField("use_arrow_dtype")
-    groups_as_chunks = BoolField("groups_as_chunks")
-    group_index = Int32Field("group_index")
-    read_kwargs = DictField("read_kwargs")
-    incremental_index = BoolField("incremental_index")
-    storage_options = DictField("storage_options")
-    is_partitioned = BoolField("is_partitioned")
-    merge_small_files = BoolField("merge_small_files")
-    merge_small_file_options = DictField("merge_small_file_options")
+    dtype_backend = StringField("dtype_backend", default=None)
+    groups_as_chunks = BoolField("groups_as_chunks", default=None)
+    group_index = Int32Field("group_index", default=None)
+    read_kwargs = DictField("read_kwargs", default=None)
+    incremental_index = BoolField("incremental_index", default=None)
+    storage_options = DictField("storage_options", default=None)
+    is_partitioned = BoolField("is_partitioned", default=None)
+    merge_small_files = BoolField("merge_small_files", default=None)
+    merge_small_file_options = DictField("merge_small_file_options", default=None)
     # for chunk
     partitions = DictField("partitions", default=None)
    partition_keys = DictField("partition_keys", default=None)
     num_group_rows = Int64Field("num_group_rows", default=None)
     # as read meta may be too time-consuming when number of files is large,
     # thus we only read first file to get row number and raw file size
-    first_chunk_row_num = Int64Field("first_chunk_row_num")
-    first_chunk_raw_bytes = Int64Field("first_chunk_raw_bytes")
+    first_chunk_row_num = Int64Field("first_chunk_row_num", default=None)
+    first_chunk_raw_bytes = Int64Field("first_chunk_raw_bytes", default=None)
 
     def get_columns(self):
         return self.columns
@@ -319,7 +318,7 @@ def read_parquet(
     engine: str = "auto",
     columns: list = None,
     groups_as_chunks: bool = False,
-    use_arrow_dtype: bool = None,
+    dtype_backend: str = None,
     incremental_index: bool = False,
     storage_options: dict = None,
     memory_scale: int = None,
@@ -356,8 +355,8 @@ def read_parquet(
     incremental_index: bool, default False
         If index_col not specified, ensure range index incremental,
         gain a slightly better performance if setting False.
-    use_arrow_dtype: bool, default None
-        If True, use arrow dtype to store columns.
+    dtype_backend: {'numpy', 'pyarrow'}, default 'numpy'
+        Back-end data type applied to the resultant DataFrame (still experimental).
     storage_options: dict, optional
         Options for storage connection.
     memory_scale: int, optional
@@ -401,9 +400,9 @@ def read_parquet(
     if columns:
         dtypes = dtypes[columns]
 
-    if use_arrow_dtype is None:
-        use_arrow_dtype = options.dataframe.use_arrow_dtype
-    if use_arrow_dtype:
+    if dtype_backend is None:
+        dtype_backend = options.dataframe.dtype_backend
+    if dtype_backend == "pyarrow":
         dtypes = to_arrow_dtypes(dtypes)
 
     index_value = parse_index(pd.RangeIndex(-1))
@@ -413,7 +412,7 @@ def read_parquet(
         engine=engine_type,
         columns=columns,
         groups_as_chunks=groups_as_chunks,
-        use_arrow_dtype=use_arrow_dtype,
+        dtype_backend=dtype_backend,
         read_kwargs=kwargs,
         incremental_index=incremental_index,
         storage_options=storage_options,
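
With this change, Arrow-backed dtypes in read_parquet are requested through the pandas-style dtype_backend argument rather than the removed use_arrow_dtype flag. A minimal sketch of the new call (the file name and its contents are assumed for illustration):

    >>> import maxframe.dataframe as md
    >>> df = md.read_parquet("data.parquet", dtype_backend="pyarrow")  # string columns map to ArrowDtype
    >>> df.execute()  # doctest: +SKIP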
maxframe/dataframe/datastore/__init__.py

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .direct import df_to_dict, series_to_dict, series_to_list, to_clipboard
 from .to_csv import to_csv
 from .to_odps import to_odps_table
 
@@ -20,10 +21,15 @@ def _install():
     from ..core import DATAFRAME_TYPE, SERIES_TYPE
 
     for t in DATAFRAME_TYPE:
+        t.to_clipboard = to_clipboard
         t.to_csv = to_csv
+        t.to_dict = df_to_dict
         t.to_odps_table = to_odps_table
     for t in SERIES_TYPE:
+        t.to_clipboard = to_clipboard
         t.to_csv = to_csv
+        t.to_dict = series_to_dict
+        t.to_list = series_to_list
 
 
 _install()
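
Once _install() has run, these helpers are exposed as methods on MaxFrame DataFrame and Series objects, so the direct-fetching accessors can be called like their pandas counterparts (a small sketch mirroring the docstrings in the new module below):

    >>> import maxframe.dataframe as md
    >>> s = md.Series([1, 2, 3])
    >>> s.to_list()   # fetches via to_pandas() under the hood
    [1, 2, 3]
    >>> s.to_dict()
    {0: 1, 1: 2, 2: 3}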
maxframe/dataframe/datastore/direct.py (new file)

@@ -0,0 +1,268 @@
+# Copyright 1999-2025 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils import pd_release_version
+
+_to_dict_has_index = pd_release_version[0] >= 2
+
+
+def df_to_dict(
+    df, orient="dict", into=dict, index=True, batch_size=10000, session=None
+):
+    """
+    Convert the DataFrame to a dictionary.
+
+    The type of the key-value pairs can be customized with the parameters
+    (see below).
+
+    Parameters
+    ----------
+    orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
+        Determines the type of the values of the dictionary.
+
+        - 'dict' (default) : dict like {column -> {index -> value}}
+        - 'list' : dict like {column -> [values]}
+        - 'series' : dict like {column -> Series(values)}
+        - 'split' : dict like
+          {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
+        - 'tight' : dict like
+          {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
+          'index_names' -> [index.names], 'column_names' -> [column.names]}
+        - 'records' : list like
+          [{column -> value}, ... , {column -> value}]
+        - 'index' : dict like {index -> {column -> value}}
+
+    into : class, default dict
+        The collections.abc.MutableMapping subclass used for all Mappings
+        in the return value. Can be the actual class or an empty
+        instance of the mapping type you want. If you want a
+        collections.defaultdict, you must pass it initialized.
+
+    index : bool, default True
+        Whether to include the index item (and index_names item if `orient`
+        is 'tight') in the returned dictionary. Can only be ``False``
+        when `orient` is 'split' or 'tight'.
+
+    Returns
+    -------
+    dict, list or collections.abc.MutableMapping
+        Return a collections.abc.MutableMapping object representing the
+        DataFrame. The resulting transformation depends on the `orient`
+        parameter.
+
+    See Also
+    --------
+    DataFrame.from_dict: Create a DataFrame from a dictionary.
+    DataFrame.to_json: Convert a DataFrame to JSON format.
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> df = md.DataFrame({'col1': [1, 2],
+    ...                    'col2': [0.5, 0.75]},
+    ...                   index=['row1', 'row2'])
+    >>> df.execute()
+          col1  col2
+    row1     1  0.50
+    row2     2  0.75
+    >>> df.to_dict()
+    {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
+
+    You can specify the return orientation.
+
+    >>> df.to_dict('series')
+    {'col1': row1    1
+             row2    2
+    Name: col1, dtype: int64,
+    'col2': row1    0.50
+            row2    0.75
+    Name: col2, dtype: float64}
+
+    >>> df.to_dict('split')
+    {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+     'data': [[1, 0.5], [2, 0.75]]}
+
+    >>> df.to_dict('records')
+    [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
+
+    >>> df.to_dict('index')
+    {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
+
+    >>> df.to_dict('tight')
+    {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+     'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
+
+    You can also specify the mapping type.
+
+    >>> from collections import OrderedDict, defaultdict
+    >>> df.to_dict(into=OrderedDict)
+    OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
+                 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
+
+    If you want a `defaultdict`, you need to initialize it:
+
+    >>> dd = defaultdict(list)
+    >>> df.to_dict('records', into=dd)
+    [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
+     defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
+    """
+    fetch_kwargs = dict(batch_size=batch_size)
+    to_dict_kw = dict(orient=orient, into=into)
+    if _to_dict_has_index:
+        to_dict_kw["index"] = index
+    return df.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_dict(
+        **to_dict_kw
+    )
+
+
+def series_to_dict(series, into=dict, batch_size=10000, session=None):
+    """
+    Convert Series to {label -> value} dict or dict-like object.
+
+    Parameters
+    ----------
+    into : class, default dict
+        The collections.abc.Mapping subclass to use as the return
+        object. Can be the actual class or an empty
+        instance of the mapping type you want. If you want a
+        collections.defaultdict, you must pass it initialized.
+
+    Returns
+    -------
+    collections.abc.Mapping
+        Key-value representation of Series.
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+    >>> s = md.Series([1, 2, 3, 4])
+    >>> s.to_dict()
+    {0: 1, 1: 2, 2: 3, 3: 4}
+    >>> from collections import OrderedDict, defaultdict
+    >>> s.to_dict(OrderedDict)
+    OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
+    >>> dd = defaultdict(list)
+    >>> s.to_dict(dd)
+    defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
+    """
+    fetch_kwargs = dict(batch_size=batch_size)
+    return series.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_dict(
+        into=into
+    )
+
+
+def series_to_list(series, batch_size=10000, session=None):
+    """
+    Return a list of the values.
+
+    These are each a scalar type, which is a Python scalar
+    (for str, int, float) or a pandas scalar
+    (for Timestamp/Timedelta/Interval/Period)
+
+    Returns
+    -------
+    list
+
+    See Also
+    --------
+    numpy.ndarray.tolist : Return the array as an a.ndim-levels deep
+        nested list of Python scalars.
+
+    Examples
+    --------
+    For Series
+
+    >>> import maxframe.dataframe as md
+    >>> s = md.Series([1, 2, 3])
+    >>> s.to_list()
+    [1, 2, 3]
+
+    For Index:
+
+    >>> idx = md.Index([1, 2, 3])
+    >>> idx.execute()
+    Index([1, 2, 3], dtype='int64')
+
+    >>> idx.to_list()
+    [1, 2, 3]
+    """
+    fetch_kwargs = dict(batch_size=batch_size)
+    return series.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_list()
+
+
+def to_clipboard(
+    obj, *, excel=True, sep=None, batch_size=10000, session=None, **kwargs
+):
+    """
+    Copy object to the system clipboard.
+
+    Write a text representation of object to the system clipboard.
+    This can be pasted into Excel, for example.
+
+    Parameters
+    ----------
+    excel : bool, default True
+        Produce output in a csv format for easy pasting into excel.
+
+        - True, use the provided separator for csv pasting.
+        - False, write a string representation of the object to the clipboard.
+
+    sep : str, default ``'\t'``
+        Field delimiter.
+    **kwargs
+        These parameters will be passed to DataFrame.to_csv.
+
+    See Also
+    --------
+    DataFrame.to_csv : Write a DataFrame to a comma-separated values
+        (csv) file.
+    read_clipboard : Read text from clipboard and pass to read_csv.
+
+    Notes
+    -----
+    Requirements for your platform.
+
+    - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
+    - Windows : none
+    - macOS : none
+
+    This method uses the processes developed for the package `pyperclip`. A
+    solution to render any output string format is given in the examples.
+
+    Examples
+    --------
+    Copy the contents of a DataFrame to the clipboard.
+
+    >>> import maxframe.dataframe as md
+    >>> df = md.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
+
+    >>> df.to_clipboard(sep=',')  # doctest: +SKIP
+    ... # Wrote the following to the system clipboard:
+    ... # ,A,B,C
+    ... # 0,1,2,3
+    ... # 1,4,5,6
+
+    We can omit the index by passing the keyword `index` and setting
+    it to false.
+
+    >>> df.to_clipboard(sep=',', index=False)  # doctest: +SKIP
+    ... # Wrote the following to the system clipboard:
+    ... # A,B,C
+    ... # 1,2,3
+    ... # 4,5,6
+    """
+    fetch_kwargs = dict(batch_size=batch_size)
+    return obj.to_pandas(session=session, fetch_kwargs=fetch_kwargs).to_clipboard(
+        excel=excel, sep=sep, **kwargs
+    )
maxframe/dataframe/datastore/to_odps.py

@@ -57,10 +57,16 @@ class DataFrameToODPSTable(DataFrameDataStore):
     lifecycle = Int64Field("lifecycle", default=None)
     table_properties = DictField("table_properties", default=None)
     primary_key = ListField("primary_key", FieldTypes.string, default=None)
+    use_generated_table_meta = BoolField("use_generated_table_meta", default=False)
 
     def __init__(self, **kw):
         super().__init__(_output_types=[OutputType.dataframe], **kw)
 
+    def check_inputs(self, inputs: List[TileableType]):
+        if self.use_generated_table_meta:
+            return None
+        return super().check_inputs(inputs)
+
     def __call__(self, x):
         shape = (0,) * len(x.shape)
         index_value = parse_index(x.index_value.to_pandas()[:0], x.key, "index")
maxframe/dataframe/extensions/flatjson.py

@@ -39,12 +39,13 @@ class SeriesFlatJSONOperator(DataFrameOperator, DataFrameOperatorMixin):
                 name=name,
                 dtype=make_dtype(dtype),
             )
+        dtypes = make_dtypes(dtypes)
         return self.new_dataframe(
             [series],
             shape=(series.shape[0], len(dtypes)),
             index_value=series.index_value,
             columns_value=parse_index(dtypes.index, store_data=True),
-            dtypes=make_dtypes(dtypes),
+            dtypes=dtypes,
         )
 
 
maxframe/dataframe/groupby/__init__.py

@@ -14,7 +14,7 @@
 
 # noinspection PyUnresolvedReferences
 from ..core import DataFrameGroupBy, GroupBy, SeriesGroupBy
-from .core import NamedAgg
+from .core import _make_named_agg_compat
 from .expanding import ExpandingGroupby
 from .rolling import RollingGroupby
 
@@ -99,3 +99,7 @@ def _install():
 
 _install()
 del _install
+
+
+__getattr__ = _make_named_agg_compat
+del _make_named_agg_compat
maxframe/dataframe/groupby/aggregation.py

@@ -21,7 +21,7 @@ import pandas as pd
 
 from ... import opcodes
 from ...config import options
-from ...core import ENTITY_TYPE, EntityData, OutputType
+from ...core import ENTITY_TYPE, EntityData, OutputType, enter_mode
 from ...serialization import PickleContainer
 from ...serialization.serializables import (
     AnyField,
@@ -34,7 +34,7 @@ from ...serialization.serializables import (
     StringField,
 )
 from ...udf import BuiltinFunction
-from ...utils import find_objects, lazy_import, pd_release_version
+from ...utils import find_objects, get_pd_option, lazy_import, pd_release_version
 from ..core import GROUPBY_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..reduction.aggregation import (
@@ -116,7 +116,10 @@ def build_mock_agg_result(
     **raw_func_kw,
 ):
     try:
-        agg_result = groupby.op.build_mock_groupby().aggregate(raw_func, **raw_func_kw)
+        with enter_mode(mock=True):
+            agg_result = groupby.op.build_mock_groupby().aggregate(
+                raw_func, **raw_func_kw
+            )
     except ValueError:
         if (
             groupby_params.get("as_index") or _support_get_group_without_as_index
@@ -377,9 +380,10 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
    1  1  2  0.590715
    2  3  4  0.704907
 
-    To control the output names with different aggregations per column, pandas supports “named aggregation”
+    To control the output names with different aggregations per column,
+    MaxFrame supports “named aggregation”
 
-    >>> from maxframe.dataframe.groupby import NamedAgg
+    >>> from maxframe.dataframe import NamedAgg
     >>> df.groupby("A").agg(
     ...     b_min=NamedAgg(column="B", aggfunc="min"),
     ...     c_sum=NamedAgg(column="C", aggfunc="sum")).execute()
@@ -432,6 +436,6 @@ def agg(groupby, func=None, method="auto", *args, **kwargs):
         groupby_params=groupby.op.groupby_params,
         combine_size=combine_size,
         chunk_store_limit=options.chunk_store_limit,
-        use_inf_as_na=pd.get_option("mode.use_inf_as_na"),
+        use_inf_as_na=get_pd_option("mode.use_inf_as_na", False),
     )
     return agg_op(groupby)
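
The swap from pd.get_option to get_pd_option guards against pandas builds where the mode.use_inf_as_na option no longer exists; pd.get_option raises OptionError for an unknown key, so a default is supplied instead. The helper is maxframe's own (maxframe/utils.py changes in this diff); a minimal sketch of what such a fallback presumably looks like, not necessarily the actual implementation:

    import pandas as pd

    def get_pd_option(key, default=None):
        # Return the pandas option value if the key still exists,
        # otherwise fall back to the supplied default.
        try:
            return pd.get_option(key)
        except (KeyError, pd.errors.OptionError):
            return default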
maxframe/dataframe/groupby/apply_chunk.py

@@ -29,7 +29,7 @@ from ...serialization.serializables import (
     TupleField,
 )
 from ...udf import BuiltinFunction, MarkedFunction
-from ...utils import copy_if_possible
+from ...utils import copy_if_possible, make_dtype, make_dtypes
 from ..core import (
     DATAFRAME_GROUPBY_TYPE,
     GROUPBY_TYPE,
@@ -45,8 +45,6 @@ from ..utils import (
     copy_func_scheduling_hints,
     infer_dataframe_return_value,
     make_column_list,
-    make_dtype,
-    make_dtypes,
     parse_index,
     validate_output_types,
 )
maxframe/dataframe/groupby/core.py

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from collections import namedtuple
+import os
+import warnings
 from typing import Any, Dict, List
 
 import pandas as pd
@@ -20,6 +21,7 @@ import pandas as pd
 from ... import opcodes
 from ...core import ENTITY_TYPE, Entity, EntityData, OutputType
 from ...core.operator import MapReduceOperator
+from ...env import MAXFRAME_INSIDE_TASK
 from ...serialization import PickleContainer
 from ...serialization.serializables import AnyField, BoolField, DictField, Int32Field
 from ...udf import BuiltinFunction
@@ -38,9 +40,6 @@ from ..utils import (
 cudf = lazy_import("cudf")
 
 
-NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
-
-
 class DataFrameGroupByOp(MapReduceOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.GROUPBY
     _legacy_name = "DataFrameGroupByOperator"  # since v2.0.0
@@ -324,3 +323,20 @@ class BaseGroupByWindowOp(DataFrameOperatorMixin, DataFrameOperator):
         name, dtype = out_dtypes
         kw.update(dtype=dtype, name=name, shape=(groupby.shape[0],))
         return self.new_tileable([in_df], **kw)
+
+
+def _make_named_agg_compat(name):  # pragma: no cover
+    # to make imports compatible
+    from ..reduction import NamedAgg
+
+    if name == "NamedAgg":
+        if MAXFRAME_INSIDE_TASK not in os.environ:
+            warnings.warn(
+                "Please import NamedAgg from maxframe.dataframe",
+                DeprecationWarning,
+            )
+        return NamedAgg
+    raise AttributeError(f"module {__name__} has no attribute {name}")
+
+
+__getattr__ = _make_named_agg_compat
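
The net effect is that NamedAgg now lives in the reduction module and is re-exported from maxframe.dataframe, while the old import path keeps working through the module-level __getattr__ hook above, emitting a DeprecationWarning outside task execution:

    >>> from maxframe.dataframe import NamedAgg          # preferred import path
    >>> from maxframe.dataframe.groupby import NamedAgg  # still works, warns DeprecationWarning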
maxframe/dataframe/indexing/__init__.py

@@ -29,7 +29,7 @@ def _install():
     from .getitem import dataframe_getitem, series_getitem
     from .iat import iat
     from .iloc import head, iloc, index_getitem, index_setitem, tail
-    from .insert import df_insert
+    from .insert import df_insert, index_insert
     from .loc import loc
     from .reindex import reindex, reindex_like
     from .rename import df_rename, index_rename, index_set_names, series_rename
@@ -94,6 +94,7 @@ def _install():
         setattr(cls, "droplevel", index_droplevel)
         setattr(cls, "get_level_values", get_level_values)
         setattr(cls, "__getitem__", index_getitem)
+        setattr(cls, "insert", index_insert)
         setattr(cls, "rename", index_rename)
         setattr(cls, "__setitem__", index_setitem)
         setattr(cls, "set_names", index_set_names)