maxframe-1.0.0rc1-cp38-cp38-macosx_10_9_universal2.whl → maxframe-1.0.0rc2-cp38-cp38-macosx_10_9_universal2.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

Note: this version of maxframe has been flagged as potentially problematic.

Files changed (64)
  1. maxframe/_utils.cpython-38-darwin.so +0 -0
  2. maxframe/codegen.py +0 -4
  3. maxframe/config/config.py +34 -2
  4. maxframe/config/validators.py +1 -0
  5. maxframe/conftest.py +2 -0
  6. maxframe/core/entity/objects.py +1 -1
  7. maxframe/core/graph/core.cpython-38-darwin.so +0 -0
  8. maxframe/dataframe/__init__.py +1 -1
  9. maxframe/dataframe/arithmetic/around.py +5 -17
  10. maxframe/dataframe/arithmetic/core.py +15 -7
  11. maxframe/dataframe/arithmetic/docstring.py +5 -55
  12. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
  13. maxframe/dataframe/core.py +5 -5
  14. maxframe/dataframe/datasource/date_range.py +2 -2
  15. maxframe/dataframe/datasource/read_odps_query.py +6 -0
  16. maxframe/dataframe/datasource/read_odps_table.py +2 -1
  17. maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
  18. maxframe/dataframe/groupby/cum.py +0 -1
  19. maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
  20. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  21. maxframe/dataframe/indexing/rename.py +3 -37
  22. maxframe/dataframe/indexing/sample.py +0 -1
  23. maxframe/dataframe/indexing/set_index.py +68 -1
  24. maxframe/dataframe/merge/merge.py +236 -2
  25. maxframe/dataframe/merge/tests/test_merge.py +123 -0
  26. maxframe/dataframe/misc/apply.py +3 -10
  27. maxframe/dataframe/misc/case_when.py +1 -1
  28. maxframe/dataframe/misc/describe.py +2 -2
  29. maxframe/dataframe/misc/drop_duplicates.py +4 -25
  30. maxframe/dataframe/misc/eval.py +4 -0
  31. maxframe/dataframe/misc/pct_change.py +1 -83
  32. maxframe/dataframe/misc/transform.py +1 -30
  33. maxframe/dataframe/misc/value_counts.py +4 -17
  34. maxframe/dataframe/missing/dropna.py +1 -1
  35. maxframe/dataframe/missing/fillna.py +5 -5
  36. maxframe/dataframe/sort/sort_values.py +1 -11
  37. maxframe/dataframe/statistics/quantile.py +5 -17
  38. maxframe/dataframe/utils.py +4 -7
  39. maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
  40. maxframe/learn/contrib/xgboost/predict.py +2 -2
  41. maxframe/learn/contrib/xgboost/train.py +2 -2
  42. maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
  43. maxframe/odpsio/__init__.py +1 -1
  44. maxframe/odpsio/arrow.py +8 -4
  45. maxframe/odpsio/schema.py +10 -7
  46. maxframe/odpsio/tableio.py +388 -14
  47. maxframe/odpsio/tests/test_schema.py +16 -15
  48. maxframe/odpsio/tests/test_tableio.py +48 -21
  49. maxframe/protocol.py +40 -2
  50. maxframe/serialization/core.cpython-38-darwin.so +0 -0
  51. maxframe/serialization/serializables/core.py +48 -9
  52. maxframe/tensor/__init__.py +59 -0
  53. maxframe/tensor/base/unique.py +2 -2
  54. maxframe/tensor/statistics/quantile.py +2 -2
  55. maxframe/tests/utils.py +11 -2
  56. maxframe/utils.py +17 -9
  57. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +74 -1
  58. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +64 -64
  59. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
  60. maxframe_client/fetcher.py +38 -27
  61. maxframe_client/session/odps.py +5 -5
  62. maxframe_client/tests/test_fetcher.py +21 -3
  63. maxframe_client/tests/test_session.py +13 -2
  64. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/dataframe/indexing/set_index.py

@@ -31,7 +31,7 @@ class DataFrameSetIndex(DataFrameOperator, DataFrameOperatorMixin):
         super().__init__(_output_types=output_types, **kw)
 
     def __call__(self, df):
-        new_df = build_empty_df(df.dtypes).set_index(
+        new_df = build_empty_df(df.dtypes, index=df.index_value.to_pandas()).set_index(
             keys=self.keys,
             drop=self.drop,
             append=self.append,
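
The one-line change above affects metadata inference: the empty probe frame now carries the source frame's real index, so a set_index with append=True sees the correct index dtypes. A minimal pandas sketch of the effect (illustration only, not code from the package):

    import pandas as pd

    # An empty probe frame whose index mirrors the real frame's index dtype.
    empty = pd.DataFrame(
        {"month": pd.Series(dtype="int64")},
        index=pd.Index([], dtype="datetime64[ns]", name="ts"),
    )
    # With append=True the existing index participates in the result, so the
    # inferred MultiIndex dtypes are only right if the probe index was right.
    probe = empty.set_index("month", append=True)
    print(probe.index.dtypes)  # ts: datetime64[ns], month: int64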
@@ -47,6 +47,73 @@ class DataFrameSetIndex(DataFrameOperator, DataFrameOperatorMixin):
 
 
 def set_index(df, keys, drop=True, append=False, inplace=False, verify_integrity=False):
+    # TODO add support for set index by series, index, mt.ndarray, etc.
+    """
+    Set the DataFrame index using existing columns.
+
+    Set the DataFrame index (row labels) using one or more existing
+    columns. The index can replace the existing index or expand on it.
+
+    Parameters
+    ----------
+    keys : label or array-like or list of labels
+        This parameter can be either a single column key, or a list containing column keys.
+    drop : bool, default True
+        Delete columns to be used as the new index.
+    append : bool, default False
+        Whether to append columns to existing index.
+    inplace : bool, default False
+        If True, modifies the DataFrame in place (do not create a new object).
+    verify_integrity : bool, default False
+        Check the new index for duplicates. Otherwise defer the check until
+        necessary. Setting to False will improve the performance of this
+        method.
+
+    Returns
+    -------
+    DataFrame or None
+        Changed row labels or None if ``inplace=True``.
+
+    See Also
+    --------
+    DataFrame.reset_index : Opposite of set_index.
+    DataFrame.reindex : Change to new indices or expand indices.
+    DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+    Examples
+    --------
+    >>> import maxframe.dataframe as md
+
+    >>> df = md.DataFrame({'month': [1, 4, 7, 10],
+    ...                    'year': [2012, 2014, 2013, 2014],
+    ...                    'sale': [55, 40, 84, 31]})
+    >>> df
+       month  year  sale
+    0      1  2012    55
+    1      4  2014    40
+    2      7  2013    84
+    3     10  2014    31
+
+    Set the index to become the 'month' column:
+
+    >>> df.set_index('month')
+           year  sale
+    month
+    1      2012    55
+    4      2014    40
+    7      2013    84
+    10     2014    31
+
+    Create a MultiIndex using columns 'year' and 'month':
+
+    >>> df.set_index(['year', 'month'])
+                sale
+    year  month
+    2012  1       55
+    2014  4       40
+    2013  7       84
+    2014  10      31
+    """
     op = DataFrameSetIndex(
         keys=keys,
         drop=drop,
maxframe/dataframe/merge/merge.py

@@ -11,12 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import logging
+from abc import abstractmethod
 from collections import namedtuple
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
+from pandas import Index
 
 from ... import opcodes
 from ...core import OutputType
@@ -28,6 +29,7 @@ from ...serialization.serializables import (
     Int32Field,
     KeyField,
     NamedTupleField,
+    Serializable,
     StringField,
     TupleField,
 )
@@ -73,9 +75,208 @@ class DataFrameMergeAlign(MapReduceOperator, DataFrameOperatorMixin):
 MergeSplitInfo = namedtuple("MergeSplitInfo", "split_side, split_index, nsplits")
 
 
+class JoinHint(Serializable):
+    @abstractmethod
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        pass
+
+    @abstractmethod
+    def verify_can_work_with(self, other: "JoinHint"):
+        pass
+
+
+class MapJoinHint(JoinHint):
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        if how in ("cross", "outer"):
+            raise ValueError(
+                "Invalid join hint, MapJoinHint is not supported in cross and outer join"
+            )
+
+    def verify_can_work_with(self, other: JoinHint):
+        if isinstance(other, SkewJoinHint):
+            raise ValueError(
+                "Invalid join hint, SkewJoinHint cannot work with MapJoinHint"
+            )
+
+
+class DistributedMapJoinHint(JoinHint):
+    shard_count = Int32Field("shard_count")
+    replica_count = Int32Field("replica_count", default=1)
+
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        if how in ("cross", "outer"):
+            raise ValueError(
+                "Invalid join hint, DistributedMapJoinHint is not supported in cross and outer join"
+            )
+        if not hasattr(self, "shard_count"):
+            raise ValueError(
+                "Invalid DistributedMapJoinHint, shard_count must be specified"
+            )
+        if self.shard_count <= 0 or self.replica_count <= 0:
+            raise ValueError(
+                "Invalid DistributedMapJoinHint, shard_count and replica_count must be greater than 0"
+            )
+
+    def verify_can_work_with(self, other: JoinHint):
+        pass
+
+
+class SkewJoinHint(JoinHint):
+    columns = AnyField("columns", default=None)
+
+    @staticmethod
+    def _check_index_levels(index, level_list):
+        selected_levels = set()
+        valid_levels = set(range(index.nlevels))
+        valid_level_names = set(index.names)
+
+        for item in level_list:
+            if isinstance(item, int):
+                if item not in valid_levels:
+                    raise ValueError(f"Level {item} is not a valid index level")
+                if item in selected_levels:
+                    raise ValueError(f"Level {item} is selected multiple times")
+                selected_levels.add(item)
+            elif isinstance(item, str):
+                if item not in valid_level_names:
+                    raise ValueError(f"'{item}' is not a valid index level name")
+                level = index.names.index(item)
+                if level in selected_levels:
+                    raise ValueError(
+                        f"'{item}' (Level {level}) is selected multiple times"
+                    )
+                selected_levels.add(level)
+            else:
+                raise ValueError(f"Invalid input type: {type(item)}")
+
+    @staticmethod
+    def _check_columns(join_on_columns, column_list):
+        selected_columns = set()
+        valid_columns = set(join_on_columns)
+
+        for item in column_list:
+            if isinstance(item, int):
+                if item < 0 or item >= len(join_on_columns):
+                    raise ValueError(f"Column index {item} is out of range")
+                col_name = join_on_columns[item]
+                if col_name in selected_columns:
+                    raise ValueError(
+                        f"Column '{col_name}' (index {item}) is selected multiple times"
+                    )
+                selected_columns.add(col_name)
+            elif isinstance(item, str):
+                if item not in valid_columns:
+                    raise ValueError(f"'{item}' is not a valid column name")
+                if item in selected_columns:
+                    raise ValueError(f"Column '{item}' is selected multiple times")
+                selected_columns.add(item)
+            else:
+                raise ValueError(f"Invalid input type: {type(item)}")
+
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        if how in ("cross", "outer"):
+            raise ValueError(
+                "Invalid join hint, SkewJoinHint is not supported in cross and outer join"
+            )
+        if is_hint_for_left and how == "right":
+            raise ValueError(
+                "Invalid join hint, right join can only use SkewJoinHint on right frame"
+            )
+        elif not is_hint_for_left and how == "left":
+            raise ValueError(
+                "Invalid join hint, left join can only use SkewJoinHint on left frame"
+            )
+
+        # check columns
+        if self.columns is None:
+            return
+
+        if not isinstance(self.columns, list):
+            raise TypeError("Invalid SkewJoinHint, `columns` must be a list")
+
+        if all(isinstance(item, (int, str)) for item in self.columns):
+            # elements are int (levels) or str (index names or column names)
+            self._verify_valid_index_or_columns(
+                self.columns, hint_on_df.index_value.to_pandas(), on, is_on_index
+            )
+        elif all(isinstance(c, dict) for c in self.columns):
+            # dicts with column names and values
+            cols_set = set(self.columns[0].keys())
+            if any(cols_set != set(c.keys()) for c in self.columns):
+                raise ValueError(
+                    "Invalid SkewJoinHint, all values in `columns` need to have same columns"
+                )
+
+            self._verify_valid_index_or_columns(
+                cols_set, hint_on_df.index_value.to_pandas(), on, is_on_index
+            )
+        else:
+            raise TypeError("Invalid SkewJoinHint, cannot accept `columns` type")
+
+    def verify_can_work_with(self, other: JoinHint):
+        if isinstance(other, SkewJoinHint):
+            raise ValueError(
+                "Invalid join hint, SkewJoinHint cannot work with another SkewJoinHint"
+            )
+
+    @staticmethod
+    def _verify_valid_index_or_columns(
+        skew_join_columns: Iterable[Union[int, str]],
+        frame_index: Index,
+        on: Union[str, List[str]],
+        is_on_index: bool,
+    ):
+        if isinstance(on, str):
+            on = [on]
+        on_columns = set(frame_index.names if is_on_index else on)
+        for col in skew_join_columns:
+            if isinstance(col, int):
+                if col < 0 or col >= len(on_columns):
+                    raise ValueError(
+                        f"Invalid SkewJoinHint, `{col}` is out of join on columns range"
+                    )
+            else:
+                if col not in on_columns:
+                    raise ValueError(
+                        f"Invalid SkewJoinHint, '{col}' is not a valid column name"
+                    )
+
+
 class DataFrameMerge(DataFrameOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.DATAFRAME_MERGE
 
+    # workaround for new fields since v1.0.0rc2
+    # todo: remove this when all versions below v1.0.0rc1 are eliminated
+    _legacy_new_non_primitives = ["left_hint", "right_hint"]
+
     how = StringField("how")
     on = AnyField("on")
     left_on = AnyField("left_on")
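
For orientation, the three hint types are constructed like this in this release's own tests (a sketch limited to what test_merge.py, shown further below, actually exercises):

    from maxframe.dataframe.merge.merge import (
        DistributedMapJoinHint,
        MapJoinHint,
        SkewJoinHint,
    )

    map_hint = MapJoinHint()                           # broadcast-style map join
    dist_hint = DistributedMapJoinHint(shard_count=5)  # shard_count required; replica_count defaults to 1
    skew_any = SkewJoinHint()                          # no columns: any join key value may be skewed
    skew_col = SkewJoinHint(columns=[0])               # skewed join-on column, by position
    skew_val = SkewJoinHint(columns=[{"a": 4}, {"a": 6}])  # known hot values of join key "a"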
@@ -95,6 +296,8 @@ class DataFrameMerge(DataFrameOperator, DataFrameOperatorMixin):
 
     # only for broadcast merge
     split_info = NamedTupleField("split_info")
+    left_hint = AnyField("left_hint", default=None)
+    right_hint = AnyField("right_hint", default=None)
 
     def __init__(self, copy=None, **kwargs):
         super().__init__(copy_=copy, **kwargs)
@@ -165,6 +368,8 @@ def merge(
     auto_merge_threshold: int = 8,
     bloom_filter: Union[bool, str] = "auto",
     bloom_filter_options: Dict[str, Any] = None,
+    left_hint: JoinHint = None,
+    right_hint: JoinHint = None,
 ) -> DataFrame:
     """
     Merge DataFrame or named Series objects with a database-style join.
@@ -267,6 +472,12 @@ def merge(
           when chunk size of left and right is greater than this threshold, apply bloom filter
         * "filter": "large", "small", "both", default "large"
           decides to filter on large, small or both DataFrames.
+    left_hint : JoinHint, default None
+        Join strategy to use for the left frame. When data skew occurs, consider
+        these strategies to avoid long-tail issues, but use them cautiously to
+        prevent OOM and unnecessary overhead.
+    right_hint : JoinHint, default None
+        Join strategy to use for the right frame.
+
 
     Returns
     -------
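
A hedged usage sketch combining the new parameters (it mirrors calls in the new tests; the hint import path is the internal module those tests use, not necessarily a public re-export):

    import maxframe.dataframe as md
    from maxframe.dataframe.merge.merge import MapJoinHint, SkewJoinHint

    left = md.DataFrame({"a": [4, 4, 4, 6, 1], "b": list(range(5))})
    right = md.DataFrame({"a": [4, 6], "x": ["p", "q"]})

    # Broadcast the small right frame to every worker:
    r1 = left.merge(right, on="a", right_hint=MapJoinHint())

    # Split the two hot key values out of the skewed left frame:
    r2 = left.merge(right, on="a", left_hint=SkewJoinHint(columns=[{"a": 4}, {"a": 6}]))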
@@ -381,6 +592,18 @@ def merge(
             raise ValueError(
                 f"Invalid filter {k}, available: {BLOOM_FILTER_ON_OPTIONS}"
             )
+
+    if left_hint:
+        if not isinstance(left_hint, JoinHint):
+            raise TypeError(f"left_hint must be a JoinHint, got {type(left_hint)}")
+        left_hint.verify_can_work_with(right_hint)
+        left_hint.verify_params(df, on or left_on, left_index, how, True)
+
+    if right_hint:
+        if not isinstance(right_hint, JoinHint):
+            raise TypeError(f"right_hint must be a JoinHint, got {type(right_hint)}")
+        right_hint.verify_params(right, on or right_on, right_index, how, False)
+
     op = DataFrameMerge(
         how=how,
         on=on,
@@ -399,6 +622,8 @@ def merge(
         bloom_filter=bloom_filter,
         bloom_filter_options=bloom_filter_options,
         output_types=[OutputType.dataframe],
+        left_hint=left_hint,
+        right_hint=right_hint,
     )
     return op(df, right)
 
@@ -416,6 +641,8 @@ def join(
     auto_merge_threshold: int = 8,
     bloom_filter: Union[bool, Dict] = True,
     bloom_filter_options: Dict[str, Any] = None,
+    left_hint: JoinHint = None,
+    right_hint: JoinHint = None,
 ) -> DataFrame:
     """
     Join columns of another DataFrame.
@@ -480,6 +707,11 @@ def join(
           when chunk size of left and right is greater than this threshold, apply bloom filter
         * "filter": "large", "small", "both", default "large"
           decides to filter on large, small or both DataFrames.
+    left_hint : JoinHint, default None
+        Join strategy to use for the left frame. When data skew occurs, consider
+        these strategies to avoid long-tail issues, but use them cautiously to
+        prevent OOM and unnecessary overhead.
+    right_hint : JoinHint, default None
+        Join strategy to use for the right frame.
 
     Returns
     -------
@@ -590,4 +822,6 @@ def join(
         auto_merge_threshold=auto_merge_threshold,
         bloom_filter=bloom_filter,
         bloom_filter_options=bloom_filter_options,
+        left_hint=left_hint,
+        right_hint=right_hint,
     )
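
join() forwards the same two hints, so an equivalent hedged sketch applies (lsuffix/rsuffix are needed here because the "key" column overlaps):

    import maxframe.dataframe as md
    from maxframe.dataframe.merge.merge import DistributedMapJoinHint

    df1 = md.DataFrame({"key": [4, 4, 6], "v": [1, 2, 3]})
    df2 = md.DataFrame({"key": [4, 6], "w": [10, 20]})
    joined = df1.join(
        df2, lsuffix="_l", rsuffix="_r",
        right_hint=DistributedMapJoinHint(shard_count=5),
    )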
maxframe/dataframe/merge/tests/test_merge.py

@@ -19,6 +19,7 @@ import pytest
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
 from .. import DataFrameMerge, concat
+from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint
 
 
 def test_merge():
@@ -30,14 +31,39 @@ def test_merge():
     mdf1 = from_pandas(df1, chunk_size=2)
     mdf2 = from_pandas(df2, chunk_size=3)
 
+    mapjoin = MapJoinHint()
+    dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
+    skew_join1 = SkewJoinHint()
+    skew_join2 = SkewJoinHint(columns=[0])
+    skew_join3 = SkewJoinHint(columns=[{"a": 4}, {"a": 6}])
+    skew_join4 = SkewJoinHint(columns=[{"a": 4, "b": "test"}, {"a": 5, "b": "hello"}])
+
     parameters = [
         {},
         {"how": "left", "right_on": "x", "left_index": True},
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": mapjoin,
+            "right_hint": mapjoin,
+        },
         {"how": "right", "left_on": "a", "right_index": True},
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "left_hint": mapjoin,
+            "right_hint": dist_mapjoin1,
+        },
         {"how": "left", "left_on": "a", "right_on": "x"},
+        {"how": "left", "left_on": "a", "right_on": "x", "left_hint": skew_join1},
         {"how": "right", "left_on": "a", "right_index": True},
+        {"how": "right", "left_on": "a", "right_index": True, "right_hint": skew_join2},
         {"how": "right", "on": "a"},
+        {"how": "right", "on": "a", "right_hint": skew_join3},
         {"how": "inner", "on": ["a", "b"]},
+        {"how": "inner", "on": ["a", "b"], "left_hint": skew_join4},
     ]
 
     for kw in parameters:
@@ -213,3 +239,100 @@ def test_concat():
     mdf2 = from_pandas(df2, chunk_size=3)
     r = concat([mdf1, mdf2], join="inner")
     assert r.shape == (20, 3)
+
+
+def test_invalid_join_hint():
+    df1 = pd.DataFrame(
+        np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
+    )
+    df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
+
+    mdf1 = from_pandas(df1, chunk_size=2)
+    mdf2 = from_pandas(df2, chunk_size=3)
+
+    # type error
+    parameters = [
+        {"how": "left", "right_on": "x", "left_index": True, "left_hint": [1]},
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": {"key": "value"},
+        },
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=2),
+        },
+        {
+            "how": "left",
+            "left_on": "a",
+            "right_on": "x",
+            "left_hint": SkewJoinHint(columns="a"),
+        },
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=["0", []]),
+        },
+    ]
+
+    for kw in parameters:
+        with pytest.raises(TypeError):
+            mdf1.merge(mdf2, **kw)
+
+    # value error
+    parameters = [
+        # map join can't work with skew join
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": MapJoinHint(),
+            "right_hint": SkewJoinHint(),
+        },
+        # right join can't use SkewJoinHint on the left frame
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "left_hint": SkewJoinHint(),
+        },
+        # invalid columns
+        {
+            "how": "left",
+            "left_on": "a",
+            "right_on": "x",
+            "left_hint": SkewJoinHint(columns=["b"]),
+        },
+        # invalid index level
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=[5]),
+        },
+        # unmatched skew join columns
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=[{0: "value1"}, {1: "value2"}]),
+        },
+        # invalid dist_mapjoin shard_count
+        {"how": "right", "on": "a", "right_hint": DistributedMapJoinHint()},
+        # none of the hints work with outer join
+        {"how": "outer", "on": ["a", "b"], "left_hint": MapJoinHint()},
+        {
+            "how": "outer",
+            "on": ["a", "b"],
+            "left_hint": DistributedMapJoinHint(shard_count=5),
+        },
+        {"how": "outer", "on": ["a", "b"], "left_hint": SkewJoinHint()},
+    ]
+    for kw in parameters:
+        with pytest.raises(ValueError):
+            mdf1.merge(mdf2, **kw)
maxframe/dataframe/misc/apply.py

@@ -319,6 +319,7 @@ def df_apply(
     skip_infer=False,
     **kwds,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/50
     """
     Apply a function along an axis of the DataFrame.
 
@@ -444,20 +445,12 @@ def df_apply(
     B    27
     dtype: int64
 
-    >>> df.apply(np.sum, axis=1).execute()
+    >>> df.apply(lambda row: int(np.sum(row)), axis=1).execute()
     0    13
     1    13
     2    13
     dtype: int64
 
-    Returning a list-like will result in a Series
-
-    >>> df.apply(lambda x: [1, 2], axis=1).execute()
-    0    [1, 2]
-    1    [1, 2]
-    2    [1, 2]
-    dtype: object
-
     Passing ``result_type='expand'`` will expand list-like results
     to columns of a Dataframe
 
@@ -471,7 +464,7 @@ def df_apply(
     ``result_type='expand'``. The resulting column names
     will be the Series index.
 
-    >>> df.apply(lambda x: md.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
+    >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
        foo  bar
     0    1    2
     1    1    2
maxframe/dataframe/misc/case_when.py

@@ -99,7 +99,7 @@ def case_when(series, caselist):
     >>> b = md.Series([0, 3, 4, 5])
 
     >>> c.case_when(caselist=[(a.gt(0), a),  # condition, replacement
-    ...                       (b.gt(0), b)])
+    ...                       (b.gt(0), b)]).execute()
     0    6
     1    3
     2    1
maxframe/dataframe/misc/describe.py

@@ -15,7 +15,7 @@
 import numpy as np
 import pandas as pd
 
-from ... import opcodes as OperandDef
+from ... import opcodes
 from ...serialization.serializables import AnyField, FieldTypes, KeyField, ListField
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -23,7 +23,7 @@ from ..utils import build_empty_df, parse_index
 
 
 class DataFrameDescribe(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ = OperandDef.DESCRIBE
+    _op_type_ = opcodes.DESCRIBE
 
     input = KeyField("input", default=None)
     percentiles = ListField("percentiles", FieldTypes.float64, default=None)
maxframe/dataframe/misc/drop_duplicates.py

@@ -37,16 +37,15 @@ class DataFrameDropDuplicates(DuplicateOperand):
             shape += (3,)
         return shape
 
-    @classmethod
-    def _gen_tileable_params(cls, op: "DataFrameDropDuplicates", input_params):
+    def _gen_tileable_params(self, op: "DataFrameDropDuplicates", input_params):
         params = input_params.copy()
-        if op.ignore_index:
+        if op.ignore_index and self._output_types[0] != OutputType.index:
             params["index_value"] = parse_index(pd.RangeIndex(-1))
         else:
             params["index_value"] = gen_unknown_index_value(
                 input_params["index_value"], op.keep, op.subset, type(op).__name__
             )
-        params["shape"] = cls._get_shape(input_params["shape"], op)
+        params["shape"] = self._get_shape(input_params["shape"], op)
         return params
 
     def __call__(self, inp, inplace=False):
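
The `_gen_tileable_params` change stops forcing a RangeIndex when ignore_index is set but the output is itself an Index: an Index result carries the deduplicated values, so there is nothing to reset. A pandas illustration of the distinction (not package code):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2]}, index=[10, 11, 12])
    print(df.drop_duplicates(ignore_index=True).index)  # RangeIndex(start=0, stop=2, step=1)
    print(pd.Index([1, 1, 2]).drop_duplicates())        # Index([1, 2], dtype='int64')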
@@ -105,6 +104,7 @@ def df_drop_duplicates(
 def series_drop_duplicates(
     series, keep="first", inplace=False, ignore_index=False, method="auto"
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/12
     """
     Return Series with duplicate values removed.
 
@@ -148,27 +148,6 @@ def series_drop_duplicates(
     5     hippo
     Name: animal, dtype: object
 
-    With the 'keep' parameter, the selection behaviour of duplicated values
-    can be changed. The value 'first' keeps the first occurrence for each
-    set of duplicated entries. The default value of keep is 'first'.
-
-    >>> s.drop_duplicates().execute()
-    0      lame
-    1       cow
-    3    beetle
-    5     hippo
-    Name: animal, dtype: object
-
-    The value 'last' for parameter 'keep' keeps the last occurrence for
-    each set of duplicated entries.
-
-    >>> s.drop_duplicates(keep='last').execute()
-    1       cow
-    3    beetle
-    4      lame
-    5     hippo
-    Name: animal, dtype: object
-
     The value ``False`` for parameter 'keep' discards all sets of
     duplicated entries. Setting the value of 'inplace' to ``True`` performs
     the operation inplace and returns ``None``.
maxframe/dataframe/misc/eval.py

@@ -120,6 +120,10 @@ class CollectionVisitor(ast.NodeVisitor):
         if obj_name in self.env:
             self.referenced_vars.add(obj_name)
             return self.env[obj_name]
+        try:
+            return self.target[obj_name]
+        except KeyError:
+            pass
         raise KeyError(f"name {obj_name} is not defined")
 
     def visit(self, node):
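
The added fallback lets bare names in an expression resolve against the evaluation target's own columns before raising. A hedged sketch of what this enables, assuming maxframe's pandas-style eval (only the fallback itself appears in the diff):

    import maxframe.dataframe as md

    df = md.DataFrame({"a": [1, 2], "b": [3, 4]})
    # "a" and "b" are not in the local environment; with the fallback they
    # are looked up via self.target[...] and resolve to the frame's columns.
    r = df.eval("a + b")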