maxframe 1.0.0rc2__cp38-cp38-win_amd64.whl → 1.0.0rc4__cp38-cp38-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic.

Files changed (134)
  1. maxframe/_utils.cp38-win_amd64.pyd +0 -0
  2. maxframe/codegen.py +4 -2
  3. maxframe/config/config.py +28 -9
  4. maxframe/config/validators.py +42 -12
  5. maxframe/conftest.py +56 -14
  6. maxframe/core/__init__.py +2 -13
  7. maxframe/core/entity/__init__.py +0 -4
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +45 -2
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cp38-win_amd64.pyd +0 -0
  19. maxframe/core/graph/entity.py +9 -33
  20. maxframe/core/operator/__init__.py +2 -9
  21. maxframe/core/operator/base.py +3 -5
  22. maxframe/core/operator/objects.py +0 -9
  23. maxframe/core/operator/utils.py +55 -0
  24. maxframe/dataframe/arithmetic/docstring.py +26 -2
  25. maxframe/dataframe/arithmetic/equal.py +4 -2
  26. maxframe/dataframe/arithmetic/greater.py +4 -2
  27. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  28. maxframe/dataframe/arithmetic/less.py +2 -2
  29. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  30. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  31. maxframe/dataframe/core.py +2 -0
  32. maxframe/dataframe/datasource/read_odps_query.py +67 -8
  33. maxframe/dataframe/datasource/read_odps_table.py +4 -2
  34. maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
  35. maxframe/dataframe/datastore/to_odps.py +8 -1
  36. maxframe/dataframe/extensions/__init__.py +3 -0
  37. maxframe/dataframe/extensions/flatmap.py +326 -0
  38. maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
  39. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  40. maxframe/dataframe/indexing/rename.py +11 -0
  41. maxframe/dataframe/initializer.py +11 -1
  42. maxframe/dataframe/misc/drop_duplicates.py +18 -1
  43. maxframe/dataframe/operators.py +1 -17
  44. maxframe/dataframe/reduction/core.py +2 -2
  45. maxframe/dataframe/tests/test_initializer.py +33 -2
  46. maxframe/io/objects/__init__.py +24 -0
  47. maxframe/io/objects/core.py +140 -0
  48. maxframe/io/objects/tensor.py +76 -0
  49. maxframe/io/objects/tests/__init__.py +13 -0
  50. maxframe/io/objects/tests/test_object_io.py +97 -0
  51. maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
  52. maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
  53. maxframe/{odpsio → io/odpsio}/schema.py +10 -8
  54. maxframe/{odpsio → io/odpsio}/tableio.py +50 -38
  55. maxframe/io/odpsio/tests/__init__.py +13 -0
  56. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -7
  57. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
  58. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  59. maxframe/io/odpsio/volumeio.py +63 -0
  60. maxframe/learn/contrib/__init__.py +2 -1
  61. maxframe/learn/contrib/graph/__init__.py +15 -0
  62. maxframe/learn/contrib/graph/connected_components.py +215 -0
  63. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  64. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  65. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  66. maxframe/learn/contrib/xgboost/core.py +87 -2
  67. maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
  68. maxframe/learn/contrib/xgboost/predict.py +27 -44
  69. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  70. maxframe/learn/contrib/xgboost/train.py +27 -16
  71. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  72. maxframe/lib/mmh3.cp38-win_amd64.pyd +0 -0
  73. maxframe/opcodes.py +3 -0
  74. maxframe/protocol.py +7 -16
  75. maxframe/remote/core.py +4 -8
  76. maxframe/serialization/__init__.py +1 -0
  77. maxframe/serialization/core.cp38-win_amd64.pyd +0 -0
  78. maxframe/session.py +9 -2
  79. maxframe/tensor/__init__.py +10 -2
  80. maxframe/tensor/arithmetic/isclose.py +1 -0
  81. maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
  82. maxframe/tensor/core.py +5 -136
  83. maxframe/tensor/datasource/array.py +3 -0
  84. maxframe/tensor/datasource/full.py +1 -1
  85. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  86. maxframe/tensor/indexing/flatnonzero.py +1 -1
  87. maxframe/tensor/indexing/getitem.py +2 -0
  88. maxframe/tensor/merge/__init__.py +2 -0
  89. maxframe/tensor/merge/concatenate.py +101 -0
  90. maxframe/tensor/merge/tests/test_merge.py +30 -1
  91. maxframe/tensor/merge/vstack.py +74 -0
  92. maxframe/tensor/{base → misc}/__init__.py +2 -0
  93. maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
  94. maxframe/tensor/misc/atleast_2d.py +70 -0
  95. maxframe/tensor/misc/atleast_3d.py +85 -0
  96. maxframe/tensor/misc/tests/__init__.py +13 -0
  97. maxframe/tensor/{base → misc}/transpose.py +22 -18
  98. maxframe/tensor/operators.py +1 -7
  99. maxframe/tensor/random/core.py +1 -1
  100. maxframe/tensor/reduction/count_nonzero.py +1 -0
  101. maxframe/tensor/reduction/mean.py +1 -0
  102. maxframe/tensor/reduction/nanmean.py +1 -0
  103. maxframe/tensor/reduction/nanvar.py +2 -0
  104. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  105. maxframe/tensor/reduction/var.py +2 -0
  106. maxframe/tensor/utils.py +2 -22
  107. maxframe/typing_.py +4 -1
  108. maxframe/udf.py +8 -9
  109. maxframe/utils.py +49 -73
  110. maxframe-1.0.0rc4.dist-info/METADATA +104 -0
  111. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +129 -114
  112. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
  113. maxframe_client/fetcher.py +33 -50
  114. maxframe_client/session/consts.py +3 -0
  115. maxframe_client/session/graph.py +8 -2
  116. maxframe_client/session/odps.py +134 -27
  117. maxframe_client/session/task.py +58 -20
  118. maxframe_client/tests/test_fetcher.py +1 -1
  119. maxframe_client/tests/test_session.py +27 -3
  120. maxframe/core/entity/chunks.py +0 -68
  121. maxframe/core/entity/fuse.py +0 -73
  122. maxframe/core/graph/builder/chunk.py +0 -430
  123. maxframe/odpsio/volumeio.py +0 -95
  124. maxframe-1.0.0rc2.dist-info/METADATA +0 -177
  125. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  126. /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
  127. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  128. /maxframe/tensor/{base → misc}/astype.py +0 -0
  129. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  130. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  131. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  132. /maxframe/tensor/{base → misc}/unique.py +0 -0
  133. /maxframe/tensor/{base → misc}/where.py +0 -0
  134. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0
@@ -19,6 +19,7 @@ import numpy as np
19
19
  import pandas as pd
20
20
  import pytest
21
21
  from odps import ODPS
22
+ from odps import types as odps_types
22
23
 
23
24
  from .... import tensor as mt
24
25
  from ....core import OutputType
@@ -35,7 +36,7 @@ from ..from_tensor import (
35
36
  )
36
37
  from ..index import from_pandas as from_pandas_index
37
38
  from ..index import from_tileable
38
- from ..read_odps_query import ColumnSchema, _resolve_task_sector
39
+ from ..read_odps_query import ColumnSchema, _parse_simple_explain, _resolve_task_sector
39
40
  from ..series import from_pandas as from_pandas_series
40
41
 
41
42
  ray = lazy_import("ray")
@@ -329,10 +330,6 @@ def test_from_odps_query():
329
330
  read_odps_query(f"CREATE TABLE dummy_table AS SELECT * FROM {table1_name}")
330
331
  assert "instant query" in err_info.value.args[0]
331
332
 
332
- with pytest.raises(ValueError) as err_info:
333
- read_odps_query(f"SELECT col1, col2 + col3 FROM {table1_name}")
334
- assert "names" in err_info.value.args[0]
335
-
336
333
  query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
337
334
  df = read_odps_query(query1)
338
335
  assert df.op.query == query1
@@ -401,7 +398,9 @@ def test_date_range():
401
398
 
402
399
 
403
400
  def test_resolve_task_sector():
404
- input_path = os.path.join(os.path.dirname(__file__), "test-data", "task-input.txt")
401
+ input_path = os.path.join(
402
+ os.path.dirname(__file__), "test-data", "task-input-full.txt"
403
+ )
405
404
  with open(input_path, "r") as f:
406
405
  sector = f.read()
407
406
  actual_sector = _resolve_task_sector("job0", sector)
@@ -413,3 +412,33 @@ def test_resolve_task_sector():
413
412
  assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
414
413
  assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
415
414
  assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
415
+
416
+
417
+ def test_resolve_task_odps2():
418
+ input_path = os.path.join(
419
+ os.path.dirname(__file__), "test-data", "task-input-odps2.txt"
420
+ )
421
+ with open(input_path, "r") as f:
422
+ sector = f.read()
423
+ actual_sector = _resolve_task_sector("job0", sector)
424
+
425
+ assert actual_sector.job_name == "job0"
426
+ assert actual_sector.task_name == "M1"
427
+ assert actual_sector.output_target == "Screen"
428
+ assert len(actual_sector.schema) == 2
429
+ assert actual_sector.schema[0] == ColumnSchema("key", "varchar(2048)", "")
430
+ assert actual_sector.schema[1] == ColumnSchema("data", "binary", "")
431
+
432
+
433
+ def test_resolve_simple_explain():
434
+ input_path = os.path.join(
435
+ os.path.dirname(__file__), "test-data", "task-input-simple.txt"
436
+ )
437
+ with open(input_path, "r") as f:
438
+ sector = f.read()
439
+
440
+ schema = _parse_simple_explain(sector)
441
+ assert schema.columns[0].name == "memberid"
442
+ assert schema.columns[0].type == odps_types.string
443
+ assert schema.columns[1].name == "createdate"
444
+ assert schema.columns[1].type == odps_types.bigint
@@ -17,13 +17,14 @@
17
17
  import logging
18
18
  from typing import List, Optional, Union
19
19
 
20
+ from odps import ODPS
20
21
  from odps.models import Table as ODPSTable
21
22
  from odps.types import PartitionSpec
22
23
 
23
24
  from ... import opcodes
24
25
  from ...config import options
25
26
  from ...core import OutputType
26
- from ...odpsio import build_dataframe_table_meta
27
+ from ...io.odpsio import build_dataframe_table_meta
27
28
  from ...serialization.serializables import (
28
29
  BoolField,
29
30
  FieldTypes,
@@ -136,8 +137,14 @@ def to_odps_table(
136
137
  --------
137
138
 
138
139
  """
140
+ odps_entry = ODPS.from_global() or ODPS.from_environments()
139
141
  if isinstance(table, ODPSTable):
140
142
  table = table.full_table_name
143
+ elif options.session.enable_schema and "." not in table:
144
+ default_schema = (
145
+ options.session.default_schema or odps_entry.schema or "default"
146
+ )
147
+ table = default_schema + "." + table
141
148
 
142
149
  if isinstance(index_label, str):
143
150
  index_label = [index_label]
@@ -18,6 +18,7 @@ from .accessor import (
18
18
  IndexMaxFrameAccessor,
19
19
  SeriesMaxFrameAccessor,
20
20
  )
21
+ from .flatmap import df_flatmap, series_flatmap
21
22
  from .reshuffle import DataFrameReshuffle, df_reshuffle
22
23
 
23
24
 
@@ -25,6 +26,8 @@ def _install():
25
26
  from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
26
27
 
27
28
  DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
29
+ DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
30
+ SeriesMaxFrameAccessor._register("flatmap", series_flatmap)
28
31
 
29
32
  if DataFrameMaxFrameAccessor._api_count:
30
33
  for t in DATAFRAME_TYPE:
@@ -0,0 +1,326 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Callable
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+
20
+ from maxframe import opcodes
21
+ from maxframe.core import OutputType
22
+ from maxframe.dataframe.core import DataFrame, IndexValue
23
+ from maxframe.dataframe.operators import DataFrameOperator, DataFrameOperatorMixin
24
+ from maxframe.dataframe.utils import make_dtypes, parse_index
25
+ from maxframe.serialization.serializables import (
26
+ BoolField,
27
+ DictField,
28
+ FunctionField,
29
+ TupleField,
30
+ )
31
+
32
+
33
+ class DataFrameFlatMapOperator(DataFrameOperator, DataFrameOperatorMixin):
34
+ _op_type_ = opcodes.FLATMAP
35
+
36
+ func = FunctionField("func")
37
+ raw = BoolField("raw", default=False)
38
+ args = TupleField("args", default=())
39
+ kwargs = DictField("kwargs", default={})
40
+
41
+ def __init__(self, output_types=None, **kw):
42
+ super().__init__(_output_types=output_types, **kw)
43
+
44
+ @staticmethod
45
+ def _gen_flattening_index_value(index_value, *args) -> IndexValue:
46
+ pd_index = index_value.to_pandas()
47
+ if not isinstance(pd_index, pd.MultiIndex):
48
+ # for func return multi rows, will copy indexes
49
+ return parse_index(pd.Index([], dtype=pd_index.dtype), *args)
50
+ # multi index will keep the same level and types
51
+ return parse_index(
52
+ pd.MultiIndex.from_arrays([c[:0] for c in pd_index.levels]), *args
53
+ )
54
+
55
+ def _call_dataframe(self, df: DataFrame, dtypes: pd.Series):
56
+ dtypes = make_dtypes(dtypes)
57
+ index_value = self._gen_flattening_index_value(
58
+ df.index_value,
59
+ (df.key, df.index_value.key, self.func),
60
+ )
61
+ return self.new_dataframe(
62
+ [df],
63
+ shape=(np.nan, len(dtypes)),
64
+ index_value=index_value,
65
+ columns_value=parse_index(dtypes.index, store_data=True),
66
+ dtypes=dtypes,
67
+ )
68
+
69
+ def _call_series_or_index(self, series, dtypes=None):
70
+ index_value = self._gen_flattening_index_value(
71
+ series.index_value,
72
+ (series.key, series.index_value.key, self.func),
73
+ )
74
+
75
+ if self.output_types[0] == OutputType.series:
76
+ name, dtype = dtypes
77
+ return self.new_series(
78
+ [series],
79
+ dtype=dtype,
80
+ shape=(np.nan,),
81
+ index_value=index_value,
82
+ name=name,
83
+ )
84
+
85
+ dtypes = make_dtypes(dtypes)
86
+ columns_value = parse_index(dtypes.index, store_data=True)
87
+ return self.new_dataframe(
88
+ [series],
89
+ shape=(np.nan, len(dtypes)),
90
+ index_value=index_value,
91
+ columns_value=columns_value,
92
+ dtypes=dtypes,
93
+ )
94
+
95
+ def __call__(
96
+ self,
97
+ df_or_series,
98
+ dtypes=None,
99
+ output_type=None,
100
+ ):
101
+ if df_or_series.op.output_types[0] == OutputType.dataframe:
102
+ return self._call_dataframe(df_or_series, dtypes=dtypes)
103
+ else:
104
+ return self._call_series_or_index(df_or_series, dtypes=dtypes)
105
+
106
+
107
+ def df_flatmap(dataframe, func: Callable, dtypes=None, raw=False, args=(), **kwargs):
108
+ """
109
+ Apply the given function to each row and then flatten results. Use this method if your transformation returns
110
+ multiple rows for each input row.
111
+
112
+ This function applies a transformation to each row of the DataFrame, where the transformation can return zero
113
+ or multiple values, effectively flattening Python generators, list-like collections, and DataFrames.
114
+
115
+ Parameters
116
+ ----------
117
+ dataframe : DataFrame
118
+ The DataFrame to which the function will be applied.
119
+
120
+ func : Callable
121
+ Function to apply to each row of the DataFrame. It should accept a Series (or an array if `raw=True`)
122
+ representing a row and return a list or iterable of values.
123
+
124
+ dtypes : Series, dict or list
125
+ Specify dtypes of returned DataFrame.
126
+
127
+ raw : bool, default False
128
+ Determines if the row is passed as a Series or as a numpy array:
129
+
130
+ * ``False`` : passes each row as a Series to the function.
131
+ * ``True`` : the passed function will receive numpy array objects instead.
132
+
133
+ args : tuple
134
+ Positional arguments to pass to `func`.
135
+
136
+ **kwargs
137
+ Additional keyword arguments to pass as keywords arguments to `func`.
138
+
139
+ Returns
140
+ -------
141
+ DataFrame
142
+ Return DataFrame with specified `dtypes`.
143
+
144
+ Notes
145
+ -----
146
+ The `func` must return an iterable of values for each input row. The index of the resulting DataFrame will be
147
+ repeated based on the number of output rows generated by `func`.
148
+
149
+ Examples
150
+ --------
151
+ >>> import numpy as np
152
+ >>> import maxframe.dataframe as md
153
+ >>> df = md.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
154
+ >>> df.execute()
155
+ A B
156
+ 0 1 4
157
+ 1 2 5
158
+ 2 3 6
159
+
160
+ Define a function that takes a number and returns a list of two numbers:
161
+
162
+ >>> def generate_values_array(row):
163
+ ... return [row['A'] * 2, row['B'] * 3]
164
+
165
+ Define a function that takes a row and return two rows and two columns:
166
+
167
+ >>> def generate_values_in_generator(row):
168
+ ... yield [row[0] * 2, row[1] * 4]
169
+ ... yield [row[0] * 3, row[1] * 5]
170
+
171
+ Which equals to the following function return a dataframe:
172
+
173
+ >>> def generate_values_in_dataframe(row):
174
+ ... return pd.DataFrame([[row[0] * 2, row[1] * 4], [row[0] * 3, row[1] * 5]])
175
+
176
+ Specify `dtypes` with a function which returns a DataFrame:
177
+
178
+ >>> df.mf.flatmap(generate_values_array, dtypes=pd.Series({'A': 'int'})).execute()
179
+ A
180
+ 0 2
181
+ 0 12
182
+ 1 4
183
+ 1 15
184
+ 2 6
185
+ 2 18
186
+
187
+ Specify raw=True to pass input row as array:
188
+
189
+ >>> df.mf.flatmap(generate_values_in_generator, dtypes={"A": "int", "B": "int"}, raw=True).execute()
190
+ A B
191
+ 0 2 16
192
+ 0 3 20
193
+ 1 4 20
194
+ 1 6 25
195
+ 2 6 24
196
+ 2 9 30
197
+ """
198
+ if dtypes is None or len(dtypes) == 0:
199
+ raise TypeError(
200
+ "Cannot determine {dtypes} by calculating with enumerate data, "
201
+ "please specify it as arguments"
202
+ )
203
+
204
+ if not isinstance(func, Callable):
205
+ raise TypeError("function must be a callable object")
206
+
207
+ output_types = [OutputType.dataframe]
208
+ op = DataFrameFlatMapOperator(
209
+ func=func, raw=raw, output_types=output_types, args=args, kwargs=kwargs
210
+ )
211
+ return op(
212
+ dataframe,
213
+ dtypes=dtypes,
214
+ )
215
+
216
+
217
+ def series_flatmap(
218
+ series, func: Callable, dtypes=None, dtype=None, name=None, args=(), **kwargs
219
+ ):
220
+ """
221
+ Apply the given function to each row and then flatten results. Use this method if your transformation returns
222
+ multiple rows for each input row.
223
+
224
+ This function applies a transformation to each element of the Series, where the transformation can return zero
225
+ or multiple values, effectively flattening Python generator, list-liked collections and DataFrame.
226
+
227
+ Parameters
228
+ ----------
229
+ series : Series
230
+ The series to which the function will be applied.
231
+
232
+ func : Callable
233
+ Function to apply to each element of the Series. It should accept a scalar value
234
+ (or an array if `raw=True`) and return a list or iterable of values.
235
+
236
+ dtypes : Series, default None
237
+ Specify dtypes of returned DataFrame. Can't work with dtype.
238
+
239
+ dtype : numpy.dtype, default None
240
+ Specify dtype of returned Series. Can't work with dtypes.
241
+
242
+ name : str, default None
243
+ Specify name of the returned Series.
244
+
245
+ args : tuple
246
+ Positional arguments to pass to `func`.
247
+
248
+ **kwargs
249
+ Additional keyword arguments to pass as keywords arguments to `func`.
250
+
251
+ Returns
252
+ -------
253
+ DataFrame or Series
254
+ Result of DataFrame when dtypes specified, else Series.
255
+
256
+ Notes
257
+ -----
258
+ The `func` must return an iterable of values for each input element. If `dtypes` is specified,
259
+ `flatmap` will return a DataFrame, if `dtype` and `name` is specified, a Series will be returned. The index of
260
+ the resulting DataFrame/Series will be repeated based on the number of output rows generated by `func`.
261
+
262
+ Examples
263
+ --------
264
+ >>> import numpy as np
265
+ >>> import maxframe.dataframe as md
266
+ >>> df = md.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
267
+ >>> df.execute()
268
+ A B
269
+ 0 1 4
270
+ 1 2 5
271
+ 2 3 6
272
+
273
+ Define a function that takes a number and returns a list of two numbers:
274
+
275
+ >>> def generate_values_array(x):
276
+ ... return [x * 2, x * 3]
277
+
278
+ >>> def generate_values_in_generator(x):
279
+ ... yield pd.Series([x * 2, x * 4])
280
+ ... yield pd.Series([x * 3, x * 5])
281
+
282
+ Specify `dtype` with a function which returns list to return more than one elements as a Series:
283
+
284
+ >>> df['A'].mf.flatmap(generate_values_array, dtype="int", name="C").execute()
285
+ 0 2
286
+ 0 3
287
+ 1 4
288
+ 1 6
289
+ 2 6
290
+ 2 9
291
+ Name: C, dtype: int64
292
+
293
+ Specify `dtypes` to return multi columns as a DataFrame:
294
+
295
+ >>> df['A'].mf.flatmap(generate_values_in_generator, dtypes={"A": "int", "B": "int"}).execute()
296
+ A B
297
+ 0 2 4
298
+ 0 3 5
299
+ 1 4 8
300
+ 1 6 10
301
+ 2 6 12
302
+ 2 9 15
303
+ """
304
+
305
+ if dtypes and dtype:
306
+ raise ValueError("Both dtypes and dtype cannot be specified at the same time.")
307
+
308
+ dtypes = (name, dtype) if dtype is not None else dtypes
309
+ if dtypes is None:
310
+ raise TypeError(
311
+ "Cannot determine {dtypes} or {dtype} by calculating with enumerate data, "
312
+ "please specify it as arguments"
313
+ )
314
+
315
+ if not isinstance(func, Callable):
316
+ raise TypeError("function must be a callable object")
317
+
318
+ output_type = OutputType.series if dtype is not None else OutputType.dataframe
319
+
320
+ op = DataFrameFlatMapOperator(
321
+ func=func, raw=False, output_types=[output_type], args=args, kwargs=kwargs
322
+ )
323
+ return op(
324
+ series,
325
+ dtypes=dtypes,
326
+ )
@@ -11,11 +11,12 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
14
+ import numpy as np
15
15
  import pandas as pd
16
16
  import pytest
17
17
 
18
18
  from .... import dataframe as md
19
+ from ... import DataFrame
19
20
  from ...core import IndexValue
20
21
  from ..reshuffle import DataFrameReshuffle
21
22
 
@@ -36,3 +37,63 @@ def test_reshuffle():
36
37
  r = mdf.mf.reshuffle(ignore_index=True)
37
38
  assert isinstance(r.op, DataFrameReshuffle)
38
39
  assert isinstance(r.index_value.value, IndexValue.RangeIndex)
40
+
41
+
42
+ @pytest.fixture
43
+ def df1():
44
+ return DataFrame({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
45
+
46
+
47
+ @pytest.fixture
48
+ def df2():
49
+ return DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["a", "b", "c"])
50
+
51
+
52
+ @pytest.fixture
53
+ def df3():
54
+ return DataFrame(
55
+ [[1, 2, 3], [1, 2, 3], [1, 2, 3]],
56
+ columns=["a", "b", "c"],
57
+ index=pd.MultiIndex.from_arrays([[1, 2, 3], [1, 2, 3]], names=["A", "B"]),
58
+ )
59
+
60
+
61
+ def test_flatmap(df1, df2, df3):
62
+ def f(x, keys):
63
+ if x["a"] in keys:
64
+ yield [1, 0]
65
+ yield [0, 1]
66
+
67
+ apply_df = df1[["a"]].mf.flatmap(
68
+ f,
69
+ dtypes={"a": "int64", "b": "int64"},
70
+ )
71
+ assert apply_df.shape == (np.nan, 2)
72
+ assert df1.index_value.key != apply_df.index_value.key
73
+ assert isinstance(df1.index_value.to_pandas(), pd.RangeIndex)
74
+ assert not isinstance(apply_df.index_value.to_pandas(), pd.RangeIndex)
75
+ apply_df = df2[["a"]].mf.flatmap(
76
+ f,
77
+ dtypes=pd.Series(["int64", "int64"]),
78
+ )
79
+ assert apply_df.shape == (np.nan, 2)
80
+ assert df2.index_value.key != apply_df.index_value.key
81
+ with pytest.raises(TypeError):
82
+ apply_s = df3["a"].mf.flatmap(
83
+ f,
84
+ )
85
+ apply_s = df3["a"].mf.flatmap(
86
+ f,
87
+ dtype="int64",
88
+ )
89
+ assert apply_s.shape == (np.nan,)
90
+ assert df3.index_value.key != apply_s.index_value.key
91
+ assert df3.key != apply_s.index_value.key
92
+ apply_s = df3["a"].mf.flatmap(
93
+ f,
94
+ output_type="dataframe",
95
+ dtypes=["int64", "int64"],
96
+ )
97
+ assert apply_s.shape == (np.nan, 2)
98
+ assert df3.index_value.key != apply_s.index_value.key
99
+ assert df3.key != apply_s.index_value.key
@@ -51,7 +51,7 @@ def _get_prefix_suffix_docs(is_prefix: bool):
51
51
  Examples
52
52
  --------
53
53
  >>> import maxframe.dataframe as md
54
- >>> s = md.Series([1, 2, 3, 4])
54
+ >>> s = md.Series([1, 2, 3, 4])
55
55
  >>> s.execute()
56
56
  0 1
57
57
  1 2
@@ -248,6 +248,7 @@ def df_rename(
248
248
  )
249
249
 
250
250
 
251
+ # fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/58
251
252
  def series_rename(
252
253
  series,
253
254
  index=None,
@@ -382,6 +383,7 @@ def index_rename(index, name, inplace=False):
382
383
  return ret
383
384
 
384
385
 
386
+ # fixme https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/59
385
387
  def index_set_names(index, names, level=None, inplace=False):
386
388
  """
387
389
  Set Index or MultiIndex name.
@@ -407,6 +409,15 @@ def index_set_names(index, names, level=None, inplace=False):
407
409
  See Also
408
410
  --------
409
411
  Index.rename : Able to set new names without level.
412
+
413
+ Examples
414
+ --------
415
+ >>> import maxframe.dataframe as md
416
+ >>> idx = md.Index([1, 2, 3, 4])
417
+ >>> idx.execute()
418
+ Int64Index([1, 2, 3, 4], dtype='int64')
419
+ >>> idx.set_names('quarter').execute()
420
+ Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
410
421
  """
411
422
  op = DataFrameRename(
412
423
  index_mapper=names, level=level, output_types=get_output_types(index)
@@ -15,6 +15,7 @@
15
15
  from typing import Union
16
16
 
17
17
  import pandas as pd
18
+ from pandas.api.types import is_list_like
18
19
  from pandas.core.dtypes.common import pandas_dtype
19
20
 
20
21
  from ..core import ENTITY_TYPE
@@ -61,6 +62,8 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
61
62
  num_partitions=None,
62
63
  ):
63
64
  need_repart = False
65
+ if columns is not None and not is_list_like(columns):
66
+ raise ValueError("columns must be a list-like object")
64
67
  if isinstance(data, TENSOR_TYPE):
65
68
  if chunk_size is not None:
66
69
  data = data.rechunk(chunk_size)
@@ -69,7 +72,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
69
72
  )
70
73
  need_repart = num_partitions is not None
71
74
  elif isinstance(data, SERIES_TYPE):
72
- df = data.to_frame()
75
+ if columns is not None and len(columns) != 1:
76
+ raise ValueError("columns' length must be 1 when data is Series")
77
+ col_name = columns[0] if columns else None
78
+ df = data.to_frame(name=col_name)
73
79
  need_repart = num_partitions is not None
74
80
  elif isinstance(data, DATAFRAME_TYPE):
75
81
  if not hasattr(data, "data"):
@@ -77,6 +83,10 @@ class DataFrame(_Frame, metaclass=InitializerMeta):
77
83
  df = _Frame(data)
78
84
  else:
79
85
  df = data
86
+ if columns is not None:
87
+ if len(df.columns) != len(columns):
88
+ raise ValueError("columns' length must be equal to the data's")
89
+ df.columns = columns
80
90
  need_repart = num_partitions is not None
81
91
  elif isinstance(data, dict) and self._can_process_by_1d_tileables(data):
82
92
  # data is a dict and some value is tensor
@@ -104,7 +104,6 @@ def df_drop_duplicates(
104
104
  def series_drop_duplicates(
105
105
  series, keep="first", inplace=False, ignore_index=False, method="auto"
106
106
  ):
107
- # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/12
108
107
  """
109
108
  Return Series with duplicate values removed.
110
109
 
@@ -148,6 +147,24 @@ def series_drop_duplicates(
148
147
  5 hippo
149
148
  Name: animal, dtype: object
150
149
 
150
+ With the 'keep' parameter, the selection behaviour of duplicated values
151
+ can be changed. The value 'first' keeps the first occurrence for each
152
+ set of duplicated entries. The default value of keep is 'first'.
153
+ >>> s.drop_duplicates().execute()
154
+ 0 lame
155
+ 1 cow
156
+ 3 beetle
157
+ 5 hippo
158
+ Name: animal, dtype: object
159
+ The value 'last' for parameter 'keep' keeps the last occurrence for
160
+ each set of duplicated entries.
161
+ >>> s.drop_duplicates(keep='last').execute()
162
+ 1 cow
163
+ 3 beetle
164
+ 4 lame
165
+ 5 hippo
166
+ Name: animal, dtype: object
167
+
151
168
  The value ``False`` for parameter 'keep' discards all sets of
152
169
  duplicated entries. Setting the value of 'inplace' to ``True`` performs
153
170
  the operation inplace and returns ``None``.
@@ -16,13 +16,7 @@ import numpy as np
16
16
  import pandas as pd
17
17
 
18
18
  from ..core import ENTITY_TYPE, OutputType
19
- from ..core.operator import (
20
- Fuse,
21
- FuseChunkMixin,
22
- Operator,
23
- ShuffleProxy,
24
- TileableOperatorMixin,
25
- )
19
+ from ..core.operator import Operator, ShuffleProxy, TileableOperatorMixin
26
20
  from ..tensor.core import TENSOR_TYPE
27
21
  from ..tensor.datasource import tensor as astensor
28
22
  from .core import DATAFRAME_TYPE, SERIES_TYPE
@@ -261,13 +255,3 @@ DataFrameOperator = Operator
261
255
  class DataFrameShuffleProxy(ShuffleProxy, DataFrameOperatorMixin):
262
256
  def __init__(self, sparse=None, output_types=None, **kwargs):
263
257
  super().__init__(sparse=sparse, _output_types=output_types, **kwargs)
264
-
265
-
266
- class DataFrameFuseChunkMixin(FuseChunkMixin, DataFrameOperatorMixin):
267
- __slots__ = ()
268
-
269
-
270
- class DataFrameFuseChunk(Fuse, DataFrameFuseChunkMixin):
271
- @property
272
- def output_types(self):
273
- return self.outputs[-1].chunk.op.output_types
@@ -552,7 +552,7 @@ class ReductionCompiler:
552
552
  @enter_mode(build=True)
553
553
  def _compile_function(self, func, func_name=None, ndim=1) -> ReductionSteps:
554
554
  from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
555
- from ...tensor.base import TensorWhere
555
+ from ...tensor.misc import TensorWhere
556
556
  from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
557
557
  from ..datasource.dataframe import DataFrameDataSource
558
558
  from ..datasource.series import SeriesDataSource
@@ -679,8 +679,8 @@ class ReductionCompiler:
679
679
  ]
680
680
  """
681
681
  from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
682
- from ...tensor.base import TensorWhere
683
682
  from ...tensor.datasource import Scalar
683
+ from ...tensor.misc import TensorWhere
684
684
  from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
685
685
  from ..datasource.dataframe import DataFrameDataSource
686
686
  from ..datasource.series import SeriesDataSource