maxframe 1.0.0rc1-cp38-cp38-win32.whl → 1.0.0rc2-cp38-cp38-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic.

Files changed (64)
  1. maxframe/_utils.cp38-win32.pyd +0 -0
  2. maxframe/codegen.py +0 -4
  3. maxframe/config/config.py +34 -2
  4. maxframe/config/validators.py +1 -0
  5. maxframe/conftest.py +2 -0
  6. maxframe/core/entity/objects.py +1 -1
  7. maxframe/core/graph/core.cp38-win32.pyd +0 -0
  8. maxframe/dataframe/__init__.py +1 -1
  9. maxframe/dataframe/arithmetic/around.py +5 -17
  10. maxframe/dataframe/arithmetic/core.py +15 -7
  11. maxframe/dataframe/arithmetic/docstring.py +5 -55
  12. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
  13. maxframe/dataframe/core.py +5 -5
  14. maxframe/dataframe/datasource/date_range.py +2 -2
  15. maxframe/dataframe/datasource/read_odps_query.py +6 -0
  16. maxframe/dataframe/datasource/read_odps_table.py +2 -1
  17. maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
  18. maxframe/dataframe/groupby/cum.py +0 -1
  19. maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
  20. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  21. maxframe/dataframe/indexing/rename.py +3 -37
  22. maxframe/dataframe/indexing/sample.py +0 -1
  23. maxframe/dataframe/indexing/set_index.py +68 -1
  24. maxframe/dataframe/merge/merge.py +236 -2
  25. maxframe/dataframe/merge/tests/test_merge.py +123 -0
  26. maxframe/dataframe/misc/apply.py +3 -10
  27. maxframe/dataframe/misc/case_when.py +1 -1
  28. maxframe/dataframe/misc/describe.py +2 -2
  29. maxframe/dataframe/misc/drop_duplicates.py +4 -25
  30. maxframe/dataframe/misc/eval.py +4 -0
  31. maxframe/dataframe/misc/pct_change.py +1 -83
  32. maxframe/dataframe/misc/transform.py +1 -30
  33. maxframe/dataframe/misc/value_counts.py +4 -17
  34. maxframe/dataframe/missing/dropna.py +1 -1
  35. maxframe/dataframe/missing/fillna.py +5 -5
  36. maxframe/dataframe/sort/sort_values.py +1 -11
  37. maxframe/dataframe/statistics/quantile.py +5 -17
  38. maxframe/dataframe/utils.py +4 -7
  39. maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
  40. maxframe/learn/contrib/xgboost/predict.py +2 -2
  41. maxframe/learn/contrib/xgboost/train.py +2 -2
  42. maxframe/lib/mmh3.cp38-win32.pyd +0 -0
  43. maxframe/odpsio/__init__.py +1 -1
  44. maxframe/odpsio/arrow.py +8 -4
  45. maxframe/odpsio/schema.py +10 -7
  46. maxframe/odpsio/tableio.py +388 -14
  47. maxframe/odpsio/tests/test_schema.py +16 -15
  48. maxframe/odpsio/tests/test_tableio.py +48 -21
  49. maxframe/protocol.py +40 -2
  50. maxframe/serialization/core.cp38-win32.pyd +0 -0
  51. maxframe/serialization/serializables/core.py +48 -9
  52. maxframe/tensor/__init__.py +59 -0
  53. maxframe/tensor/base/unique.py +2 -2
  54. maxframe/tensor/statistics/quantile.py +2 -2
  55. maxframe/tests/utils.py +11 -2
  56. maxframe/utils.py +17 -9
  57. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +74 -1
  58. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +64 -64
  59. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
  60. maxframe_client/fetcher.py +38 -27
  61. maxframe_client/session/odps.py +5 -5
  62. maxframe_client/tests/test_fetcher.py +21 -3
  63. maxframe_client/tests/test_session.py +13 -2
  64. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/_utils.cp38-win32.pyd CHANGED
Binary file
maxframe/codegen.py CHANGED
@@ -205,12 +205,8 @@ class BigDagCodeContext(metaclass=abc.ABCMeta):
         return self._session_id
 
     def register_udf(self, udf: AbstractUDF):
-        from maxframe_framedriver.services.session import SessionManager
-
         udf.session_id = self._session_id
         self._udfs[udf.name] = udf
-        if self._session_id and SessionManager.initialized():
-            SessionManager.instance().register_udf(self._session_id, udf)
 
     def get_udfs(self) -> List[AbstractUDF]:
         return list(self._udfs.values())
maxframe/config/config.py CHANGED
@@ -19,6 +19,15 @@ import warnings
 from copy import deepcopy
 from typing import Any, Dict, Optional, Union
 
+from odps.lib import tzlocal
+
+try:
+    from zoneinfo import available_timezones
+except ImportError:
+    from pytz import all_timezones
+
+    available_timezones = lambda: all_timezones
+
 from ..utils import get_python_tag
 from .validators import (
     ValidatorType,
@@ -28,6 +37,7 @@ from .validators import (
     is_dict,
     is_in,
     is_integer,
+    is_non_negative_integer,
     is_null,
     is_numeric,
     is_string,
@@ -37,11 +47,12 @@ _DEFAULT_REDIRECT_WARN = "Option {source} has been replaced by {target} and migh
 _DEFAULT_MAX_ALIVE_SECONDS = 3 * 24 * 3600
 _DEFAULT_MAX_IDLE_SECONDS = 3600
 _DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS = 120
+_DEFAULT_SPE_FAILURE_RETRY_TIMES = 5
 _DEFAULT_UPLOAD_BATCH_SIZE = 4096
 _DEFAULT_TEMP_LIFECYCLE = 1
 _DEFAULT_TASK_START_TIMEOUT = 60
 _DEFAULT_TASK_RESTART_TIMEOUT = 300
-_DEFAULT_LOGVIEW_HOURS = 24 * 60
+_DEFAULT_LOGVIEW_HOURS = 24 * 30
 
 
 class OptionError(Exception):
@@ -297,13 +308,28 @@ class Config:
         return {k: v for k, v in res.items() if k in self._remote_options}
 
 
+def _get_legal_local_tz_name() -> Optional[str]:
+    """Sometimes we may get illegal tz name from tzlocal.get_localzone()"""
+    tz_name = str(tzlocal.get_localzone())
+    if tz_name not in available_timezones():
+        return None
+    return tz_name
+
+
 default_options = Config()
 default_options.register_option(
     "execution_mode", "trigger", validator=is_in(["trigger", "eager"])
 )
+default_options.register_option("use_common_table", False, validator=is_bool)
 default_options.register_option(
     "python_tag", get_python_tag(), validator=is_string, remote=True
 )
+default_options.register_option(
+    "local_timezone",
+    _get_legal_local_tz_name(),
+    validator=any_validator(is_null, is_in(set(available_timezones()))),
+    remote=True,
+)
 default_options.register_option(
     "session.logview_hours", _DEFAULT_LOGVIEW_HOURS, validator=is_integer, remote=True
 )
@@ -378,7 +404,13 @@ default_options.register_option(
 default_options.register_option(
     "spe.operation_timeout_seconds",
     _DEFAULT_SPE_OPERATION_TIMEOUT_SECONDS,
-    validator=is_integer,
+    validator=is_non_negative_integer,
+    remote=True,
+)
+default_options.register_option(
+    "spe.failure_retry_times",
+    _DEFAULT_SPE_FAILURE_RETRY_TIMES,
+    validator=is_non_negative_integer,
     remote=True,
 )
 
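Note on the local_timezone option added above: the zoneinfo/pytz fallback and the tz-name guard can be exercised on their own. The following is a minimal standalone sketch; it swaps the vendored odps.lib.tzlocal for the standalone tzlocal package and renames the helper (both assumptions), so treat it as illustrative rather than the package's code.

from typing import Optional

try:
    from zoneinfo import available_timezones  # Python 3.9+
except ImportError:
    from pytz import all_timezones  # fallback for older interpreters

    available_timezones = lambda: all_timezones

import tzlocal  # stand-in for the vendored odps.lib.tzlocal used in the diff


def get_legal_local_tz_name() -> Optional[str]:
    # tzlocal may report a name (e.g. some Windows zones) that zoneinfo/pytz
    # does not know; treat those as "no usable local timezone" and return None.
    tz_name = str(tzlocal.get_localzone())
    if tz_name not in available_timezones():
        return None
    return tz_name


print(get_legal_local_tz_name())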
maxframe/config/validators.py CHANGED
@@ -40,6 +40,7 @@ is_numeric = lambda x: isinstance(x, (int, float))
 is_string = lambda x: isinstance(x, str)
 is_dict = lambda x: isinstance(x, dict)
 is_positive_integer = lambda x: is_integer(x) and x > 0
+is_non_negative_integer = lambda x: is_integer(x) and x >= 0
 
 
 def is_in(vals):
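The new validator only relaxes the lower bound; a quick illustration of the difference, with the lambdas copied from the diff:

is_integer = lambda x: isinstance(x, int)
is_positive_integer = lambda x: is_integer(x) and x > 0
is_non_negative_integer = lambda x: is_integer(x) and x >= 0

# 0 now passes for options validated with the new lambda
# (spe.operation_timeout_seconds, spe.failure_retry_times),
# while negative values are still rejected.
assert is_non_negative_integer(0)
assert not is_non_negative_integer(-1)
assert not is_positive_integer(0)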
maxframe/conftest.py CHANGED
@@ -87,6 +87,7 @@ def oss_config():
         oss_secret_access_key = config.get("oss", "secret_access_key")
         oss_bucket_name = config.get("oss", "bucket_name")
         oss_endpoint = config.get("oss", "endpoint")
+        oss_rolearn = config.get("oss", "rolearn")
 
         config.oss_config = (
             oss_access_id,
@@ -99,6 +100,7 @@ def oss_config():
 
         auth = oss2.Auth(oss_access_id, oss_secret_access_key)
         config.oss_bucket = oss2.Bucket(auth, oss_endpoint, oss_bucket_name)
+        config.oss_rolearn = oss_rolearn
         return config
     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, ImportError):
         return None
maxframe/core/entity/objects.py CHANGED
@@ -23,7 +23,7 @@ class ObjectData(TileableData, _ToObjectMixin):
     __slots__ = ()
     type_name = "Object"
     # workaround for removed field since v0.1.0b5
-    # todo remove this when all versions below v0.1.0b5 is eliminated
+    # todo remove this when all versions below v1.0.0rc1 is eliminated
     _legacy_deprecated_non_primitives = ["_chunks"]
 
     def __init__(self, op=None, nsplits=None, **kw):
maxframe/core/graph/core.cp38-win32.pyd CHANGED
Binary file
maxframe/dataframe/__init__.py CHANGED
@@ -54,7 +54,7 @@ from .reduction import CustomReduction, unique
 from .tseries.to_datetime import to_datetime
 
 try:
-    from pandas import NA, Timestamp
+    from pandas import NA, NaT, Timestamp
 except ImportError:  # pragma: no cover
     pass
 
maxframe/dataframe/arithmetic/around.py CHANGED
@@ -43,20 +43,20 @@ def around(df, decimals=0, *args, **kwargs):
     return op(df)
 
 
+# FIXME Series input of decimals not supported yet
 around.__frame_doc__ = """
 Round a DataFrame to a variable number of decimal places.
 
 Parameters
 ----------
-decimals : int, dict, Series
+decimals : int, dict
     Number of decimal places to round each column to. If an int is
     given, round each column to the same number of places.
     Otherwise dict and Series round to variable numbers of places.
     Column names should be in the keys if `decimals` is a
-    dict-like, or in the index if `decimals` is a Series. Any
-    columns not included in `decimals` will be left as is. Elements
-    of `decimals` which are not columns of the input will be
-    ignored.
+    dict-like. Any columns not included in `decimals` will be left
+    as is. Elements of `decimals` which are not columns of the
+    input will be ignored.
 *args
     Additional keywords have no effect but might be accepted for
     compatibility with numpy.
@@ -107,18 +107,6 @@ places as value
 1   0.0   1.0
 2   0.7   0.0
 3   0.2   0.0
-
-Using a Series, the number of places for specific columns can be
-specified with the column names as index and the number of
-decimal places as value
-
->>> decimals = md.Series([0, 1], index=['cats', 'dogs'])
->>> df.round(decimals).execute()
-   dogs  cats
-0   0.2   0.0
-1   0.0   1.0
-2   0.7   0.0
-3   0.2   0.0
 """
 around.__series_doc__ = """
 Round each value in a Series to the given number of decimals.
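For reference, the dict form that the docstring keeps documenting behaves the same way in plain pandas; a small sketch using the docstring's own data, illustrative only:

import pandas as pd

df = pd.DataFrame(
    [(0.21, 0.32), (0.01, 0.67), (0.66, 0.03), (0.21, 0.18)],
    columns=["dogs", "cats"],
)
# Per-column precision through a dict keyed by column name still works;
# passing a Series of decimals is the case the FIXME above marks as unsupported.
print(df.round({"dogs": 1, "cats": 0}))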
maxframe/dataframe/arithmetic/core.py CHANGED
@@ -39,7 +39,7 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
         raise NotImplementedError
 
     @classmethod
-    def _calc_properties(cls, x1, x2=None, axis="columns"):
+    def _calc_properties(cls, x1, x2=None, axis="columns", level=None):
         if isinstance(x1, DATAFRAME_TYPE) and (
             x2 is None or pd.api.types.is_scalar(x2) or isinstance(x2, TENSOR_TYPE)
         ):
@@ -108,7 +108,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
                 index = copy.copy(x1.index_value)
                 index_shape = x1.shape[0]
             else:
-                index = infer_index_value(x1.index_value, x2.index_value)
+                index = infer_index_value(
+                    x1.index_value, x2.index_value, level=level
+                )
             if index.key == x1.index_value.key == x2.index_value.key and (
                 not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0])
             ):
@@ -141,7 +143,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
                 column_shape = len(dtypes)
             else:  # pragma: no cover
                 dtypes = x1.dtypes  # FIXME
-                columns = infer_index_value(x1.columns_value, x2.index_value)
+                columns = infer_index_value(
+                    x1.columns_value, x2.index_value, level=level
+                )
                 column_shape = np.nan
         else:
             assert axis == "index" or axis == 0
@@ -169,7 +173,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
                 ],
                 index=x1.dtypes.index,
             )
-            index = infer_index_value(x1.index_value, x2.index_value)
+            index = infer_index_value(
+                x1.index_value, x2.index_value, level=level
+            )
             index_shape = np.nan
         return {
             "shape": (index_shape, column_shape),
@@ -187,7 +193,9 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
             index = copy.copy(x1.index_value)
             index_shape = x1.shape[0]
         else:
-            index = infer_index_value(x1.index_value, x2.index_value)
+            index = infer_index_value(
+                x1.index_value, x2.index_value, level=level
+            )
         if index.key == x1.index_value.key == x2.index_value.key and (
             not np.isnan(x1.shape[0]) or not np.isnan(x2.shape[0])
         ):
@@ -237,14 +245,14 @@ class DataFrameBinOpMixin(DataFrameOperatorMixin):
         self._check_inputs(x1, x2)
         if isinstance(x1, DATAFRAME_TYPE) or isinstance(x2, DATAFRAME_TYPE):
             df1, df2 = (x1, x2) if isinstance(x1, DATAFRAME_TYPE) else (x2, x1)
-            kw = self._calc_properties(df1, df2, axis=self.axis)
+            kw = self._calc_properties(df1, df2, axis=self.axis, level=self.level)
            if not pd.api.types.is_scalar(df2):
                return self.new_dataframe([x1, x2], **kw)
            else:
                return self.new_dataframe([df1], **kw)
        if isinstance(x1, SERIES_TYPE) or isinstance(x2, SERIES_TYPE):
            s1, s2 = (x1, x2) if isinstance(x1, SERIES_TYPE) else (x2, x1)
-            kw = self._calc_properties(s1, s2)
+            kw = self._calc_properties(s1, s2, level=self.level)
            if not pd.api.types.is_scalar(s2):
                return self.new_series([x1, x2], **kw)
            else:
maxframe/dataframe/arithmetic/docstring.py CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# FIXME:https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/17
 _flex_doc_FRAME = """
 Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
 Equivalent to ``{equiv}``, but with support to substitute a fill_value
@@ -127,44 +128,15 @@ circle 0
 triangle        3
 rectangle       4
 
->>> (df * other).execute()
-           angles  degrees
-circle          0      NaN
-triangle        9      NaN
-rectangle      16      NaN
-
 >>> df.mul(other, fill_value=0).execute()
            angles  degrees
 circle          0      0.0
 triangle        9      0.0
 rectangle      16      0.0
 
-Divide by a MultiIndex by level.
-
->>> df_multindex = md.DataFrame({{'angles': [0, 3, 4, 4, 5, 6],
-...                              'degrees': [360, 180, 360, 360, 540, 720]}},
-...                             index=[['A', 'A', 'A', 'B', 'B', 'B'],
-...                                    ['circle', 'triangle', 'rectangle',
-...                                     'square', 'pentagon', 'hexagon']])
->>> df_multindex.execute()
-             angles  degrees
-A circle          0      360
-  triangle        3      180
-  rectangle       4      360
-B square          4      360
-  pentagon        5      540
-  hexagon         6      720
-
->>> df.div(df_multindex, level=1, fill_value=0).execute()
-             angles  degrees
-A circle        NaN      1.0
-  triangle      1.0      1.0
-  rectangle     1.0      1.0
-B square        0.0      0.0
-  pentagon      0.0      0.0
-  hexagon       0.0      0.0
 """
 
+# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/28
 _flex_doc_SERIES = """
 Return {desc} of series and other, element-wise (binary operator `{op_name}`).
 
@@ -213,6 +185,7 @@ e NaN
 dtype: float64
 """
 
+# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/48
 _flex_comp_doc_FRAME = """
 Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
 Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
@@ -257,7 +230,8 @@ Mismatched indices will be unioned together.
 
 Examples
 --------
->>> df = pd.DataFrame({{'cost': [250, 150, 100],
+>>> import maxframe.dataframe as md
+>>> df = md.DataFrame({{'cost': [250, 150, 100],
 ...                    'revenue': [100, 250, 300]}},
 ...                   index=['A', 'B', 'C'])
 >>> df.execute()
@@ -332,30 +306,6 @@ A False False
 B  False  False
 C  False   True
 D  False  False
-
-Compare to a MultiIndex by level.
-
->>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220],
-...                              'revenue': [100, 250, 300, 200, 175, 225]}},
-...                             index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
-...                                    ['A', 'B', 'C', 'A', 'B', 'C']])
->>> df_multindex.execute()
-      cost  revenue
-Q1 A   250      100
-   B   150      250
-   C   100      300
-Q2 A   150      200
-   B   300      175
-   C   220      225
-
->>> df.le(df_multindex, level=1).execute()
-      cost  revenue
-Q1 A   True     True
-   B   True     True
-   C   True     True
-Q2 A  False     True
-   B   True    False
-   C   True    False
 """
 
 
maxframe/dataframe/arithmetic/tests/test_arithmetic.py CHANGED
@@ -239,6 +239,28 @@ def test_dataframe_and_series_with_shuffle(func_name, func_opts):
     assert df2.columns_value.key != df1.columns_value.key
 
 
+@pytest.mark.parametrize("func_name, func_opts", binary_functions.items())
+def test_dataframe_and_series_with_multiindex(func_name, func_opts):
+    data1 = pd.DataFrame(
+        np.random.rand(10, 10),
+        index=pd.MultiIndex.from_arrays(
+            [list("AAAAABBBBB"), [4, 9, 3, 2, 1, 5, 8, 6, 7, 10]]
+        ),
+        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
+    )
+    data1 = to_boolean_if_needed(func_opts.func_name, data1)
+    df1 = from_pandas(data1, chunk_size=5)
+    s1 = from_pandas_series(data1[10].reset_index(level=0, drop=True), chunk_size=6)
+
+    df2 = getattr(df1, func_opts.func_name)(s1, level=1, axis=0)
+
+    # test df2's index and columns
+    assert df2.shape == (np.nan, df1.shape[1])
+    assert df2.index_value.key != df1.index_value.key
+    assert df2.index_value.names == df1.index_value.names
+    assert df2.columns_value.key == df1.columns_value.key
+
+
 @pytest.mark.parametrize("func_name, func_opts", binary_functions.items())
 def test_series_and_series_with_align_map(func_name, func_opts):
     data1 = pd.DataFrame(
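The level/axis combination exercised by the new test mirrors pandas alignment semantics. A plain-pandas sketch of what level=1, axis=0 means; the data here is hypothetical and not taken from the test:

import pandas as pd

df = pd.DataFrame(
    {"x": [1, 2, 3, 4]},
    index=pd.MultiIndex.from_arrays([list("AABB"), [0, 1, 0, 1]]),
)
s = pd.Series([10, 100], index=[0, 1])

# Broadcast the Series across level 1 of the row MultiIndex, analogous to
# getattr(df1, func_name)(s1, level=1, axis=0) in the MaxFrame test above.
print(df.mul(s, level=1, axis=0))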
maxframe/dataframe/core.py CHANGED
@@ -1086,11 +1086,11 @@ class Series(HasShapeTileable, _ToPandasMixin):
         --------
         >>> import maxframe.dataframe as md
         >>> s = md.Series({'a': 1, 'b': 2, 'c': 3})
-        >>> s.ndim.execute()
+        >>> s.ndim
         1
 
         >>> df = md.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
-        >>> df.ndim.execute()
+        >>> df.ndim
         2
         """
         return super().ndim
@@ -1520,7 +1520,7 @@ class BaseDataFrameData(HasShapeTileableData, _ToPandasMixin):
         self._columns_value = parse_index(dtypes.index, store_data=True)
         self._dtypes_value = DtypesValue(key=tokenize(dtypes), value=dtypes)
         new_shape = list(self._shape)
-        new_shape[0] = len(dtypes)
+        new_shape[-1] = len(dtypes)
         self._shape = tuple(new_shape)
 
     @property
@@ -1761,11 +1761,11 @@ class DataFrame(HasShapeTileable, _ToPandasMixin):
         --------
         >>> import maxframe.dataframe as md
         >>> s = md.Series({'a': 1, 'b': 2, 'c': 3})
-        >>> s.ndim.execute()
+        >>> s.ndim
         1
 
         >>> df = md.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
-        >>> df.ndim.execute()
+        >>> df.ndim
         2
         """
         return super().ndim
maxframe/dataframe/datasource/date_range.py CHANGED
@@ -22,7 +22,7 @@ from pandas._libs.tslibs import timezones
 from pandas.tseries.frequencies import to_offset
 from pandas.tseries.offsets import Tick
 
-from ... import opcodes as OperandDef
+from ... import opcodes
 from ...core import OutputType
 from ...serialization.serializables import AnyField, BoolField, Int64Field, StringField
 from ...utils import no_default, pd_release_version
@@ -117,7 +117,7 @@ def generate_range_count(
 
 
 class DataFrameDateRange(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ = OperandDef.DATE_RANGE
+    _op_type_ = opcodes.DATE_RANGE
 
     start = AnyField("start")
     end = AnyField("end")
maxframe/dataframe/datasource/read_odps_query.py CHANGED
@@ -47,6 +47,7 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
     re.MULTILINE,
 )
 _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+_ANONYMOUS_COL_REGEX = re.compile(r"^_c\d+$")
 
 
 @dataclasses.dataclass
@@ -272,6 +273,11 @@ def read_odps_query(
     explain_str = list(inst.get_task_results().values())[0]
 
     odps_schema = _parse_explained_schema(explain_str)
+
+    for col in odps_schema.columns:
+        if _ANONYMOUS_COL_REGEX.match(col.name) and col.name not in query:
+            raise ValueError("Need to specify names for all columns in SELECT clause.")
+
     dtypes = odps_schema_to_pandas_dtypes(odps_schema)
 
     if not index_col:
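The anonymous-column check added above can be illustrated without a MaxCompute connection. has_unnamed_result_columns below is a hypothetical helper that mirrors the diff's logic; only the regex is taken verbatim:

import re

_ANONYMOUS_COL_REGEX = re.compile(r"^_c\d+$")


def has_unnamed_result_columns(column_names, query):
    # A column whose name was auto-generated by the engine (_c0, _c1, ...)
    # and that does not appear literally in the query text has no user alias.
    return any(
        _ANONYMOUS_COL_REGEX.match(name) and name not in query
        for name in column_names
    )


# `col2 + col3` comes back as an auto-named column such as _c1, so the check trips:
assert has_unnamed_result_columns(["col1", "_c1"], "SELECT col1, col2 + col3 FROM t")
# An explicit alias avoids the ValueError raised in read_odps_query:
assert not has_unnamed_result_columns(
    ["col1", "total"], "SELECT col1, col2 + col3 AS total FROM t"
)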
maxframe/dataframe/datasource/read_odps_table.py CHANGED
@@ -119,9 +119,10 @@ class DataFrameReadODPSTable(
         return self.new_tileable(
             [],
             None,
-            shape=shape,
+            shape=shape[:1],
             name=getattr(index_value, "name", None),
             names=getattr(index_value, "names", None),
+            dtype=self.index_dtypes.iloc[0],
             index_value=index_value,
             chunk_bytes=chunk_bytes,
             chunk_size=chunk_size,
maxframe/dataframe/datasource/tests/test_datasource.py CHANGED
@@ -21,6 +21,7 @@ import pytest
 from odps import ODPS
 
 from .... import tensor as mt
+from ....core import OutputType
 from ....tests.utils import tn
 from ....utils import lazy_import
 from ... import read_odps_query, read_odps_table
@@ -295,6 +296,15 @@ def test_from_odps_table():
         ),
     )
 
+    out_idx = read_odps_table(
+        test_table,
+        columns=[],
+        index_col=["col1", "col2"],
+        output_type=OutputType.index,
+    )
+    assert out_idx.names == ["col1", "col2"]
+    assert out_idx.shape == (np.nan,)
+
     test_table.drop()
     test_parted_table.drop()
 
@@ -319,6 +329,10 @@ def test_from_odps_query():
         read_odps_query(f"CREATE TABLE dummy_table AS SELECT * FROM {table1_name}")
     assert "instant query" in err_info.value.args[0]
 
+    with pytest.raises(ValueError) as err_info:
+        read_odps_query(f"SELECT col1, col2 + col3 FROM {table1_name}")
+    assert "names" in err_info.value.args[0]
+
     query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
     df = read_odps_query(query1)
     assert df.op.query == query1
maxframe/dataframe/groupby/cum.py CHANGED
@@ -59,7 +59,6 @@ class GroupByCumReductionOperator(DataFrameOperatorMixin, DataFrameOperator):
         out_dtypes = self._calc_out_dtypes(groupby)
 
         kw = in_df.params.copy()
-        kw["index_value"] = parse_index(pd.RangeIndex(-1), groupby.key)
         if self.output_types[0] == OutputType.dataframe:
             kw.update(
                 dict(
maxframe/dataframe/groupby/tests/test_groupby.py CHANGED
@@ -282,14 +282,17 @@ def test_groupby_cum():
         r = getattr(mdf.groupby("b"), fun)()
         assert r.op.output_types[0] == OutputType.dataframe
         assert r.shape == (len(df1), 2)
+        assert r.index_value.key == mdf.index_value.key
 
         r = getattr(mdf.groupby("b"), fun)(axis=1)
         assert r.op.output_types[0] == OutputType.dataframe
         assert r.shape == (len(df1), 3)
+        assert r.index_value.key == mdf.index_value.key
 
     r = mdf.groupby("b").cumcount()
     assert r.op.output_types[0] == OutputType.series
     assert r.shape == (len(df1),)
+    assert r.index_value.key == mdf.index_value.key
 
     series1 = pd.Series([2, 2, 5, 7, 3, 7, 8, 8, 5, 6])
     ms1 = md.Series(series1, chunk_size=3)
@@ -298,6 +301,7 @@ def test_groupby_cum():
         r = getattr(ms1.groupby(lambda x: x % 2), fun)()
         assert r.op.output_types[0] == OutputType.series
         assert r.shape == (len(series1),)
+        assert r.index_value.key == ms1.index_value.key
 
 
 def test_groupby_fill():
maxframe/dataframe/indexing/add_prefix_suffix.py CHANGED
@@ -51,7 +51,7 @@ def _get_prefix_suffix_docs(is_prefix: bool):
     Examples
     --------
     >>> import maxframe.dataframe as md
-    >>> s = md.Series([1, 2, 3, 4])
+    >>> s = md.Series([1, 2, 3, 4])
     >>> s.execute()
     0    1
     1    2
maxframe/dataframe/indexing/rename.py CHANGED
@@ -17,7 +17,7 @@ import warnings
 from ... import opcodes
 from ...core import get_output_types
 from ...serialization.serializables import AnyField, StringField
-from ..core import SERIES_TYPE
+from ..core import INDEX_TYPE, SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import build_df, build_series, parse_index, validate_axis
 
@@ -73,6 +73,8 @@ class DataFrameRename(DataFrameOperator, DataFrameOperatorMixin):
             params["index_value"] = parse_index(new_index)
         if df.ndim == 1:
             params["name"] = new_df.name
+        if isinstance(df, INDEX_TYPE):
+            params["names"] = new_df.names
         return self.new_tileable([df], **params)
 
 
@@ -303,11 +305,6 @@ def series_rename(
     1    2
     2    3
     Name: my_name, dtype: int64
-    >>> s.rename(lambda x: x ** 2).execute()  # function, changes labels.execute()
-    0    1
-    1    2
-    4    3
-    dtype: int64
     >>> s.rename({1: 3, 2: 5}).execute()  # mapping, changes labels.execute()
     0    1
     3    2
@@ -410,37 +407,6 @@ def index_set_names(index, names, level=None, inplace=False):
     See Also
     --------
     Index.rename : Able to set new names without level.
-
-    Examples
-    --------
-    >>> import maxframe.dataframe as md
-    >>> idx = md.Index([1, 2, 3, 4])
-    >>> idx.execute()
-    Int64Index([1, 2, 3, 4], dtype='int64')
-    >>> idx.set_names('quarter').execute()
-    Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')
-
-    >>> idx = md.MultiIndex.from_product([['python', 'cobra'],
-    ...                                   [2018, 2019]])
-    >>> idx.execute()
-    MultiIndex([('python', 2018),
-                ('python', 2019),
-                ( 'cobra', 2018),
-                ( 'cobra', 2019)],
-               )
-    >>> idx.set_names(['kind', 'year'], inplace=True)
-    >>> idx.execute()
-    MultiIndex([('python', 2018),
-                ('python', 2019),
-                ( 'cobra', 2018),
-                ( 'cobra', 2019)],
-               names=['kind', 'year'])
-    >>> idx.set_names('species', level=0).execute()
-    MultiIndex([('python', 2018),
-                ('python', 2019),
-                ( 'cobra', 2018),
-                ( 'cobra', 2019)],
-               names=['species', 'year'])
     """
     op = DataFrameRename(
         index_mapper=names, level=level, output_types=get_output_types(index)
maxframe/dataframe/indexing/sample.py CHANGED
@@ -195,7 +195,6 @@ def sample(
             num_legs  num_wings  num_specimen_seen
     falcon         2          2                 10
     fish           0          0                  8
-
     """
     axis = validate_axis(axis or 0, df_or_series)
     if axis == 1: