maxframe 0.1.0b5__cp310-cp310-macosx_10_9_universal2.whl → 1.0.0__cp310-cp310-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of maxframe might be problematic.

Files changed (203)
  1. maxframe/_utils.cpython-310-darwin.so +0 -0
  2. maxframe/codegen.py +10 -4
  3. maxframe/config/config.py +68 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +58 -14
  6. maxframe/core/__init__.py +2 -16
  7. maxframe/core/entity/__init__.py +1 -12
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +46 -45
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cpython-310-darwin.so +0 -0
  19. maxframe/core/graph/core.pyx +4 -4
  20. maxframe/core/graph/entity.py +9 -33
  21. maxframe/core/operator/__init__.py +2 -9
  22. maxframe/core/operator/base.py +3 -5
  23. maxframe/core/operator/objects.py +0 -9
  24. maxframe/core/operator/utils.py +55 -0
  25. maxframe/dataframe/__init__.py +1 -1
  26. maxframe/dataframe/arithmetic/around.py +5 -17
  27. maxframe/dataframe/arithmetic/core.py +15 -7
  28. maxframe/dataframe/arithmetic/docstring.py +7 -33
  29. maxframe/dataframe/arithmetic/equal.py +4 -2
  30. maxframe/dataframe/arithmetic/greater.py +4 -2
  31. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  32. maxframe/dataframe/arithmetic/less.py +2 -2
  33. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  34. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  36. maxframe/dataframe/core.py +31 -7
  37. maxframe/dataframe/datasource/date_range.py +2 -2
  38. maxframe/dataframe/datasource/read_odps_query.py +117 -23
  39. maxframe/dataframe/datasource/read_odps_table.py +6 -3
  40. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  41. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  42. maxframe/dataframe/datastore/to_odps.py +28 -0
  43. maxframe/dataframe/extensions/__init__.py +5 -0
  44. maxframe/dataframe/extensions/flatjson.py +131 -0
  45. maxframe/dataframe/extensions/flatmap.py +317 -0
  46. maxframe/dataframe/extensions/reshuffle.py +1 -1
  47. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  48. maxframe/dataframe/groupby/core.py +1 -1
  49. maxframe/dataframe/groupby/cum.py +0 -1
  50. maxframe/dataframe/groupby/fill.py +4 -1
  51. maxframe/dataframe/groupby/getitem.py +6 -0
  52. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  53. maxframe/dataframe/groupby/transform.py +5 -1
  54. maxframe/dataframe/indexing/align.py +1 -1
  55. maxframe/dataframe/indexing/loc.py +6 -4
  56. maxframe/dataframe/indexing/rename.py +5 -28
  57. maxframe/dataframe/indexing/sample.py +0 -1
  58. maxframe/dataframe/indexing/set_index.py +68 -1
  59. maxframe/dataframe/initializer.py +11 -1
  60. maxframe/dataframe/merge/__init__.py +9 -1
  61. maxframe/dataframe/merge/concat.py +41 -31
  62. maxframe/dataframe/merge/merge.py +237 -3
  63. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  64. maxframe/dataframe/misc/apply.py +5 -10
  65. maxframe/dataframe/misc/case_when.py +1 -1
  66. maxframe/dataframe/misc/describe.py +2 -2
  67. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  68. maxframe/dataframe/misc/eval.py +4 -0
  69. maxframe/dataframe/misc/memory_usage.py +2 -2
  70. maxframe/dataframe/misc/pct_change.py +1 -83
  71. maxframe/dataframe/misc/tests/test_misc.py +33 -2
  72. maxframe/dataframe/misc/transform.py +1 -30
  73. maxframe/dataframe/misc/value_counts.py +4 -17
  74. maxframe/dataframe/missing/dropna.py +1 -1
  75. maxframe/dataframe/missing/fillna.py +5 -5
  76. maxframe/dataframe/operators.py +1 -17
  77. maxframe/dataframe/reduction/core.py +2 -2
  78. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  79. maxframe/dataframe/sort/sort_values.py +1 -11
  80. maxframe/dataframe/statistics/corr.py +3 -3
  81. maxframe/dataframe/statistics/quantile.py +13 -19
  82. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  83. maxframe/dataframe/tests/test_initializer.py +33 -2
  84. maxframe/dataframe/utils.py +26 -11
  85. maxframe/dataframe/window/expanding.py +5 -3
  86. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  87. maxframe/errors.py +13 -0
  88. maxframe/extension.py +12 -0
  89. maxframe/io/__init__.py +13 -0
  90. maxframe/io/objects/__init__.py +24 -0
  91. maxframe/io/objects/core.py +140 -0
  92. maxframe/io/objects/tensor.py +76 -0
  93. maxframe/io/objects/tests/__init__.py +13 -0
  94. maxframe/io/objects/tests/test_object_io.py +97 -0
  95. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  96. maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
  97. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  98. maxframe/io/odpsio/tableio.py +719 -0
  99. maxframe/io/odpsio/tests/__init__.py +13 -0
  100. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
  101. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  102. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  103. maxframe/io/odpsio/volumeio.py +63 -0
  104. maxframe/learn/contrib/__init__.py +3 -1
  105. maxframe/learn/contrib/graph/__init__.py +15 -0
  106. maxframe/learn/contrib/graph/connected_components.py +215 -0
  107. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  108. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  109. maxframe/learn/contrib/llm/__init__.py +16 -0
  110. maxframe/learn/contrib/llm/core.py +54 -0
  111. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  112. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  113. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  114. maxframe/learn/contrib/llm/text.py +42 -0
  115. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  116. maxframe/learn/contrib/xgboost/core.py +87 -2
  117. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  118. maxframe/learn/contrib/xgboost/predict.py +29 -46
  119. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  120. maxframe/learn/contrib/xgboost/train.py +29 -18
  121. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  122. maxframe/lib/mmh3.cpython-310-darwin.so +0 -0
  123. maxframe/lib/mmh3.pyi +43 -0
  124. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  125. maxframe/lib/wrapped_pickle.py +2 -1
  126. maxframe/opcodes.py +8 -0
  127. maxframe/protocol.py +154 -27
  128. maxframe/remote/core.py +4 -8
  129. maxframe/serialization/__init__.py +1 -0
  130. maxframe/serialization/core.cpython-310-darwin.so +0 -0
  131. maxframe/serialization/core.pxd +3 -0
  132. maxframe/serialization/core.pyi +3 -0
  133. maxframe/serialization/core.pyx +67 -26
  134. maxframe/serialization/exception.py +1 -1
  135. maxframe/serialization/pandas.py +52 -17
  136. maxframe/serialization/serializables/core.py +180 -15
  137. maxframe/serialization/serializables/field_type.py +4 -1
  138. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  139. maxframe/serialization/tests/test_serial.py +2 -1
  140. maxframe/session.py +9 -2
  141. maxframe/tensor/__init__.py +81 -2
  142. maxframe/tensor/arithmetic/isclose.py +1 -0
  143. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  144. maxframe/tensor/core.py +5 -136
  145. maxframe/tensor/datasource/array.py +3 -0
  146. maxframe/tensor/datasource/full.py +1 -1
  147. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  148. maxframe/tensor/indexing/flatnonzero.py +1 -1
  149. maxframe/tensor/indexing/getitem.py +2 -0
  150. maxframe/tensor/merge/__init__.py +2 -0
  151. maxframe/tensor/merge/concatenate.py +101 -0
  152. maxframe/tensor/merge/tests/test_merge.py +30 -1
  153. maxframe/tensor/merge/vstack.py +74 -0
  154. maxframe/tensor/{base → misc}/__init__.py +2 -0
  155. maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
  156. maxframe/tensor/misc/atleast_2d.py +70 -0
  157. maxframe/tensor/misc/atleast_3d.py +85 -0
  158. maxframe/tensor/misc/tests/__init__.py +13 -0
  159. maxframe/tensor/{base → misc}/transpose.py +22 -18
  160. maxframe/tensor/{base → misc}/unique.py +3 -3
  161. maxframe/tensor/operators.py +1 -7
  162. maxframe/tensor/random/core.py +1 -1
  163. maxframe/tensor/reduction/count_nonzero.py +2 -1
  164. maxframe/tensor/reduction/mean.py +1 -0
  165. maxframe/tensor/reduction/nanmean.py +1 -0
  166. maxframe/tensor/reduction/nanvar.py +2 -0
  167. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  168. maxframe/tensor/reduction/var.py +2 -0
  169. maxframe/tensor/statistics/quantile.py +2 -2
  170. maxframe/tensor/utils.py +2 -22
  171. maxframe/tests/test_protocol.py +34 -0
  172. maxframe/tests/test_utils.py +0 -12
  173. maxframe/tests/utils.py +17 -2
  174. maxframe/typing_.py +4 -1
  175. maxframe/udf.py +8 -9
  176. maxframe/utils.py +106 -86
  177. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  178. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
  179. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  180. maxframe_client/__init__.py +0 -1
  181. maxframe_client/clients/framedriver.py +4 -1
  182. maxframe_client/fetcher.py +81 -74
  183. maxframe_client/session/consts.py +3 -0
  184. maxframe_client/session/graph.py +8 -2
  185. maxframe_client/session/odps.py +194 -40
  186. maxframe_client/session/task.py +94 -39
  187. maxframe_client/tests/test_fetcher.py +21 -3
  188. maxframe_client/tests/test_session.py +109 -8
  189. maxframe/core/entity/chunks.py +0 -68
  190. maxframe/core/entity/fuse.py +0 -73
  191. maxframe/core/graph/builder/chunk.py +0 -430
  192. maxframe/odpsio/tableio.py +0 -322
  193. maxframe/odpsio/volumeio.py +0 -95
  194. maxframe_client/clients/spe.py +0 -104
  195. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  196. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  197. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  198. /maxframe/tensor/{base → misc}/astype.py +0 -0
  199. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  200. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  201. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  202. /maxframe/tensor/{base → misc}/where.py +0 -0
  203. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
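Note on the moved I/O modules: the odpsio entries in the list above record that the former top-level maxframe/odpsio package now lives under maxframe/io/odpsio, with tableio.py and volumeio.py rewritten rather than moved. A hedged sketch of what the move implies for imports follows; the submodule names are taken from the file list, and whether 1.0.0 keeps a compatibility alias for the old path is not shown in this diff:

    # old layout (0.1.0b5): the package sat at maxframe.odpsio
    # from maxframe.odpsio import arrow, schema
    # new layout (1.0.0): the same submodules sit under maxframe.io
    from maxframe.io.odpsio import arrow, schema  # sketch only; paths inferred from the file list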
maxframe/dataframe/merge/merge.py
@@ -11,12 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import logging
+from abc import abstractmethod
 from collections import namedtuple
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
+from pandas import Index
 
 from ... import opcodes
 from ...core import OutputType
@@ -28,6 +29,7 @@ from ...serialization.serializables import (
     Int32Field,
     KeyField,
     NamedTupleField,
+    Serializable,
     StringField,
     TupleField,
 )
@@ -73,9 +75,208 @@ class DataFrameMergeAlign(MapReduceOperator, DataFrameOperatorMixin):
 MergeSplitInfo = namedtuple("MergeSplitInfo", "split_side, split_index, nsplits")
 
 
+class JoinHint(Serializable):
+    @abstractmethod
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        pass
+
+    @abstractmethod
+    def verify_can_work_with(self, other: "JoinHint"):
+        pass
+
+
+class MapJoinHint(JoinHint):
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        if how in ("cross", "outer"):
+            raise ValueError(
+                "Invalid join hint, MapJoinHint is not support in cross and outer join"
+            )
+
+    def verify_can_work_with(self, other: JoinHint):
+        if isinstance(other, SkewJoinHint):
+            raise ValueError(
+                "Invalid join hint, SkewJoinHint cannot work with MapJoinHint"
+            )
+
+
+class DistributedMapJoinHint(JoinHint):
+    shard_count = Int32Field("shard_count")
+    replica_count = Int32Field("replica_count", default=1)
+
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        if how in ("cross", "outer"):
+            raise ValueError(
+                "Invalid join hint, DistributedMapJoinHint is not support in cross and outer join"
+            )
+        if not hasattr(self, "shard_count"):
+            raise ValueError(
+                "Invalid DistributedMapJoinHint, shard_count must be specified"
+            )
+        if self.shard_count <= 0 or self.replica_count <= 0:
+            raise ValueError(
+                "Invalid DistributedMapJoinHint, shard_count and replica_count must be greater than 0"
+            )
+
+    def verify_can_work_with(self, other: JoinHint):
+        pass
+
+
+class SkewJoinHint(JoinHint):
+    columns = AnyField("columns", default=None)
+
+    @staticmethod
+    def _check_index_levels(index, level_list):
+        selected_levels = set()
+        valid_levels = set(range(index.nlevels))
+        valid_level_names = set(index.names)
+
+        for item in level_list:
+            if isinstance(item, int):
+                if item not in valid_levels:
+                    raise ValueError(f"Level {item} is not a valid index level")
+                if item in selected_levels:
+                    raise ValueError(f"Level {item} is selected multiple times")
+                selected_levels.add(item)
+            elif isinstance(item, str):
+                if item not in valid_level_names:
+                    raise ValueError(f"'{item}' is not a valid index level name")
+                level = index.names.index(item)
+                if level in selected_levels:
+                    raise ValueError(
+                        f"'{item}' (Level {level}) is selected multiple times"
+                    )
+                selected_levels.add(level)
+            else:
+                raise ValueError(f"Invalid input type: {type(item)}")
+
+    @staticmethod
+    def _check_columns(join_on_columns, column_list):
+        selected_columns = set()
+        valid_columns = set(join_on_columns)
+
+        for item in column_list:
+            if isinstance(item, int):
+                if item < 0 or item >= len(join_on_columns):
+                    raise ValueError(f"Column index {item} is out of range")
+                col_name = join_on_columns[item]
+                if col_name in selected_columns:
+                    raise ValueError(
+                        f"Column '{col_name}' (index {item}) is selected multiple times"
+                    )
+                selected_columns.add(col_name)
+            elif isinstance(item, str):
+                if item not in valid_columns:
+                    raise ValueError(f"'{item}' is not a valid column name")
+                if item in selected_columns:
+                    raise ValueError(f"Column '{item}' is selected multiple times")
+                selected_columns.add(item)
+            else:
+                raise ValueError(f"Invalid input type: {type(item)}")
+
+    def verify_params(
+        self,
+        hint_on_df: Union[DataFrame, Series],
+        on: str,
+        is_on_index: bool,
+        how: str,
+        is_hint_for_left: bool,
+    ):
+        if how in ("cross", "outer"):
+            raise ValueError(
+                "Invalid join hint, map join is not support in cross and outer join"
+            )
+        if is_hint_for_left and how == "right":
+            raise ValueError(
+                "Invalid join hint, right join can only use SkewJoinHint on right frame"
+            )
+        elif not is_hint_for_left and how == "left":
+            raise ValueError(
+                "Invalid join hint, left join can only use SkewJoinHint on left frame"
+            )
+
+        # check columns
+        if self.columns is None:
+            return
+
+        if not isinstance(self.columns, list):
+            raise TypeError("Invalid SkewJoinHint, `columns` must be a list")
+
+        if all(isinstance(item, (int, str)) for item in self.columns):
+            # if elements are int (levels) or str (index names or column names)
+            self._verify_valid_index_or_columns(
+                self.columns, hint_on_df.index_value.to_pandas(), on, is_on_index
+            )
+        elif all(isinstance(c, dict) for c in self.columns):
+            # dict with column names and values
+            cols_set = set(self.columns[0].keys())
+            if any(cols_set != set(c.keys()) for c in self.columns):
+                raise ValueError(
+                    "Invalid SkewJoinHint, all values in `columns` need to have same columns"
+                )
+
+            self._verify_valid_index_or_columns(
+                cols_set, hint_on_df.index_value.to_pandas(), on, is_on_index
+            )
+        else:
+            raise TypeError("Invalid SkewJoinHint, annot accept `columns` type")
+
+    def verify_can_work_with(self, other: JoinHint):
+        if isinstance(other, SkewJoinHint):
+            raise ValueError(
+                "Invalid join hint, SkewJoinHint cannot work with MapJoinHint"
+            )
+
+    @staticmethod
+    def _verify_valid_index_or_columns(
+        skew_join_columns: Iterable[Union[int, str]],
+        frame_index: Index,
+        on: Union[str, List[str]],
+        is_on_index: bool,
+    ):
+        if isinstance(on, str):
+            on = [on]
+        on_columns = set(frame_index.names if is_on_index else on)
+        for col in skew_join_columns:
+            if isinstance(col, int):
+                if col < 0 or col >= len(on_columns):
+                    raise ValueError(
+                        f"Invalid, SkeJoinHint, `{col}` is out of join on columns range"
+                    )
+            else:
+                if col not in on_columns:
+                    raise ValueError(
+                        f"Invalid, SkeJoinHint, '{col}' is not a valid column name"
+                    )
+
+
 class DataFrameMerge(DataFrameOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.DATAFRAME_MERGE
 
+    # workaround for new field since v1.0.0rc2
+    # todo remove this when all versions below v1.0.0rc1 is eliminated
+    _legacy_new_non_primitives = ["left_hint", "right_hint"]
+
     how = StringField("how")
     on = AnyField("on")
     left_on = AnyField("left_on")
@@ -95,6 +296,8 @@ class DataFrameMerge(DataFrameOperator, DataFrameOperatorMixin):
 
     # only for broadcast merge
     split_info = NamedTupleField("split_info")
+    left_hint = AnyField("left_hint", default=None)
+    right_hint = AnyField("right_hint", default=None)
 
     def __init__(self, copy=None, **kwargs):
         super().__init__(copy_=copy, **kwargs)
@@ -150,7 +353,7 @@ def merge(
     df: Union[DataFrame, Series],
     right: Union[DataFrame, Series],
     how: str = "inner",
-    on: str = None,
+    on: Union[str, List[str]] = None,
     left_on: str = None,
     right_on: str = None,
     left_index: bool = False,
@@ -165,6 +368,8 @@
     auto_merge_threshold: int = 8,
     bloom_filter: Union[bool, str] = "auto",
     bloom_filter_options: Dict[str, Any] = None,
+    left_hint: JoinHint = None,
+    right_hint: JoinHint = None,
 ) -> DataFrame:
     """
     Merge DataFrame or named Series objects with a database-style join.
@@ -267,6 +472,12 @@ def merge(
          when chunk size of left and right is greater than this threshold, apply bloom filter
        * "filter": "large", "small", "both", default "large"
          decides to filter on large, small or both DataFrames.
+    left_hint: JoinHint, default None
+        Join strategy to use for left frame. When data skew occurs, consider these strategies to avoid long-tail issues,
+        but use them cautiously to prevent OOM and unnecessary overhead.
+    right_hint: JoinHint, default None
+        Join strategy to use for right frame.
+
 
     Returns
     -------
@@ -381,6 +592,18 @@ def merge(
            raise ValueError(
                f"Invalid filter {k}, available: {BLOOM_FILTER_ON_OPTIONS}"
            )
+
+    if left_hint:
+        if not isinstance(left_hint, JoinHint):
+            raise TypeError(f"left_hint must be a JoinHint, got {type(left_hint)}")
+        left_hint.verify_can_work_with(right_hint)
+        left_hint.verify_params(df, on or left_on, left_index, how, True)
+
+    if right_hint:
+        if not isinstance(right_hint, JoinHint):
+            raise TypeError(f"right_hint must be a JoinHint, got {type(right_hint)}")
+        right_hint.verify_params(right, on or right_on, right_index, how, False)
+
     op = DataFrameMerge(
         how=how,
         on=on,
@@ -399,6 +622,8 @@ def merge(
         bloom_filter=bloom_filter,
         bloom_filter_options=bloom_filter_options,
         output_types=[OutputType.dataframe],
+        left_hint=left_hint,
+        right_hint=right_hint,
     )
     return op(df, right)
 
@@ -416,6 +641,8 @@ def join(
     auto_merge_threshold: int = 8,
     bloom_filter: Union[bool, Dict] = True,
     bloom_filter_options: Dict[str, Any] = None,
+    left_hint: JoinHint = None,
+    right_hint: JoinHint = None,
 ) -> DataFrame:
     """
     Join columns of another DataFrame.
@@ -480,6 +707,11 @@ def join(
          when chunk size of left and right is greater than this threshold, apply bloom filter
        * "filter": "large", "small", "both", default "large"
          decides to filter on large, small or both DataFrames.
+    left_hint: JoinHint, default None
+        Join strategy to use for left frame. When data skew occurs, consider these strategies to avoid long-tail issues,
+        but use them cautiously to prevent OOM and unnecessary overhead.
+    right_hint: JoinHint, default None
+        Join strategy to use for right frame.
 
     Returns
     -------
@@ -590,4 +822,6 @@ def join(
         auto_merge_threshold=auto_merge_threshold,
         bloom_filter=bloom_filter,
         bloom_filter_options=bloom_filter_options,
+        left_hint=left_hint,
+        right_hint=right_hint,
     )
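Taken together with the test cases that follow, the new signature suggests usage along the lines of the sketch below. This is a hedged example, not code from the package: the md alias and DataFrame initializer follow MaxFrame's documented conventions, and the data and column names are illustrative only.

    import pandas as pd
    import maxframe.dataframe as md
    from maxframe.dataframe.merge.merge import MapJoinHint, SkewJoinHint

    left = md.DataFrame(pd.DataFrame({"a": [1, 1, 2, 3], "b": [4, 5, 6, 7]}))
    right = md.DataFrame(pd.DataFrame({"a": [1, 2, 2], "x": [7, 8, 9]}))

    # broadcast the small right table with a map join
    small_broadcast = left.merge(right, on="a", how="inner", right_hint=MapJoinHint())

    # mark the left side's hot key values for skew handling in a left join
    skew_handled = left.merge(
        right, on="a", how="left", left_hint=SkewJoinHint(columns=[{"a": 1}])
    )

    # results are lazy; an execute() call submits the job when a session is configured
    # small_broadcast.execute()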
maxframe/dataframe/merge/tests/test_merge.py
@@ -16,9 +16,11 @@ import numpy as np
 import pandas as pd
 import pytest
 
+from ....tests.utils import assert_mf_index_dtype
 from ...core import IndexValue
 from ...datasource.dataframe import from_pandas
 from .. import DataFrameMerge, concat
+from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint
 
 
 def test_merge():
@@ -30,14 +32,39 @@ def test_merge():
     mdf1 = from_pandas(df1, chunk_size=2)
     mdf2 = from_pandas(df2, chunk_size=3)
 
+    mapjoin = MapJoinHint()
+    dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
+    skew_join1 = SkewJoinHint()
+    skew_join2 = SkewJoinHint(columns=[0])
+    skew_join3 = SkewJoinHint(columns=[{"a": 4}, {"a": 6}])
+    skew_join4 = SkewJoinHint(columns=[{"a": 4, "b": "test"}, {"a": 5, "b": "hello"}])
+
     parameters = [
         {},
         {"how": "left", "right_on": "x", "left_index": True},
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": mapjoin,
+            "right_hint": mapjoin,
+        },
         {"how": "right", "left_on": "a", "right_index": True},
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "left_hint": mapjoin,
+            "right_hint": dist_mapjoin1,
+        },
         {"how": "left", "left_on": "a", "right_on": "x"},
+        {"how": "left", "left_on": "a", "right_on": "x", "left_hint": skew_join1},
         {"how": "right", "left_on": "a", "right_index": True},
+        {"how": "right", "left_on": "a", "right_index": True, "right_hint": skew_join2},
         {"how": "right", "on": "a"},
+        {"how": "right", "on": "a", "right_hint": skew_join3},
         {"how": "inner", "on": ["a", "b"]},
+        {"how": "inner", "on": ["a", "b"], "left_hint": skew_join4},
     ]
 
     for kw in parameters:
@@ -135,7 +162,7 @@ def test_append():
     adf = mdf1.append(mdf2)
 
     assert adf.shape == (20, 4)
-    assert isinstance(adf.index_value.value, IndexValue.Int64Index)
+    assert_mf_index_dtype(adf.index_value.value, np.int64)
     mdf1 = from_pandas(df1, chunk_size=3)
     mdf2 = from_pandas(df2, chunk_size=3)
@@ -155,6 +182,7 @@ def test_concat():
     r = concat([mdf1, mdf2], axis="index")
 
     assert r.shape == (20, 4)
+    assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
     pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
 
     df3 = pd.DataFrame(
@@ -213,3 +241,100 @@ def test_concat():
     mdf2 = from_pandas(df2, chunk_size=3)
     r = concat([mdf1, mdf2], join="inner")
     assert r.shape == (20, 3)
+
+
+def test_invalid_join_hint():
+    df1 = pd.DataFrame(
+        np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
+    )
+    df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
+
+    mdf1 = from_pandas(df1, chunk_size=2)
+    mdf2 = from_pandas(df2, chunk_size=3)
+
+    # type error
+    parameters = [
+        {"how": "left", "right_on": "x", "left_index": True, "left_hint": [1]},
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": {"key": "value"},
+        },
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=2),
+        },
+        {
+            "how": "left",
+            "left_on": "a",
+            "right_on": "x",
+            "left_hint": SkewJoinHint(columns="a"),
+        },
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=["0", []]),
+        },
+    ]
+
+    for kw in parameters:
+        print(kw)
+        with pytest.raises(TypeError):
+            mdf1.merge(mdf2, **kw)
+
+    # value error
+    parameters = [
+        # mapjoin can't working with skew join
+        {
+            "how": "left",
+            "right_on": "x",
+            "left_index": True,
+            "left_hint": MapJoinHint(),
+            "right_hint": SkewJoinHint(),
+        },
+        # right join can't apply to skew join left frame
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "left_hint": SkewJoinHint(),
+        },
+        # invalid columns
+        {
+            "how": "left",
+            "left_on": "a",
+            "right_on": "x",
+            "left_hint": SkewJoinHint(columns=["b"]),
+        },
+        # invalid index level
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=[5]),
+        },
+        # unmatched skew join columns
+        {
+            "how": "right",
+            "left_on": "a",
+            "right_index": True,
+            "right_hint": SkewJoinHint(columns=[{0: "value1"}, {1: "value2"}]),
+        },
+        # invalid dist_mapjoin shard_count
+        {"how": "right", "on": "a", "right_hint": DistributedMapJoinHint()},
+        # all can't work with outer join
+        {"how": "outer", "on": ["a", "b"], "left_hint": MapJoinHint()},
+        {
+            "how": "outer",
+            "on": ["a", "b"],
+            "left_hint": DistributedMapJoinHint(shard_count=5),
+        },
+        {"how": "outer", "on": ["a", "b"], "left_hint": SkewJoinHint()},
+    ]
+    for kw in parameters:
+        with pytest.raises(ValueError):
+            mdf1.merge(mdf2, **kw)
maxframe/dataframe/misc/apply.py
@@ -170,6 +170,8 @@ class ApplyOperator(
         elif self.output_types[0] == OutputType.dataframe:
             shape = [np.nan, np.nan]
             shape[1 - self.axis] = df.shape[1 - self.axis]
+            if self.axis == 1:
+                shape[1] = len(dtypes)
             shape = tuple(shape)
         else:
             shape = (df.shape[1 - self.axis],)
@@ -317,6 +319,7 @@
     skip_infer=False,
     **kwds,
 ):
+    # FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/50
     """
     Apply a function along an axis of the DataFrame.
 
@@ -442,20 +445,12 @@
     B 27
     dtype: int64
 
-    >>> df.apply(np.sum, axis=1).execute()
+    >>> df.apply(lambda row: int(np.sum(row)), axis=1).execute()
     0 13
     1 13
     2 13
     dtype: int64
 
-    Returning a list-like will result in a Series
-
-    >>> df.apply(lambda x: [1, 2], axis=1).execute()
-    0 [1, 2]
-    1 [1, 2]
-    2 [1, 2]
-    dtype: object
-
     Passing ``result_type='expand'`` will expand list-like results
     to columns of a Dataframe
 
@@ -469,7 +464,7 @@
     ``result_type='expand'``. The resulting column names
     will be the Series index.
 
-    >>> df.apply(lambda x: md.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
+    >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
       foo bar
     0 1 2
     1 1 2
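The corrected doctests above return plain Python and pandas objects from the applied callable (an int for a reduced row, a pd.Series when columns should be expanded) instead of maxframe objects. A small hedged sketch of the same pattern outside the docstring, with illustrative data matching the doctest:

    import numpy as np
    import pandas as pd
    import maxframe.dataframe as md

    df = md.DataFrame(pd.DataFrame([[4, 9]] * 3, columns=["A", "B"]))

    # reduce each row to a scalar
    row_sums = df.apply(lambda row: int(np.sum(row)), axis=1)

    # expand each row into named columns by returning a pandas Series
    expanded = df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1)

    # results are lazy; call .execute() to run them, as the doctests do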
maxframe/dataframe/misc/case_when.py
@@ -99,7 +99,7 @@ def case_when(series, caselist):
     >>> b = md.Series([0, 3, 4, 5])
 
     >>> c.case_when(caselist=[(a.gt(0), a), # condition, replacement
-    ...                       (b.gt(0), b)])
+    ...                       (b.gt(0), b)]).execute()
     0 6
     1 3
     2 1
maxframe/dataframe/misc/describe.py
@@ -15,7 +15,7 @@
 import numpy as np
 import pandas as pd
 
-from ... import opcodes as OperandDef
+from ... import opcodes
 from ...serialization.serializables import AnyField, FieldTypes, KeyField, ListField
 from ..core import SERIES_TYPE
 from ..operators import DataFrameOperator, DataFrameOperatorMixin
@@ -23,7 +23,7 @@ from ..utils import build_empty_df, parse_index
 
 
 class DataFrameDescribe(DataFrameOperator, DataFrameOperatorMixin):
-    _op_type_ = OperandDef.DESCRIBE
+    _op_type_ = opcodes.DESCRIBE
 
     input = KeyField("input", default=None)
     percentiles = ListField("percentiles", FieldTypes.float64, default=None)
maxframe/dataframe/misc/drop_duplicates.py
@@ -37,16 +37,19 @@ class DataFrameDropDuplicates(DuplicateOperand):
             shape += (3,)
         return shape
 
-    @classmethod
-    def _gen_tileable_params(cls, op: "DataFrameDropDuplicates", input_params):
+    def _gen_tileable_params(self, op: "DataFrameDropDuplicates", input_params):
         params = input_params.copy()
-        if op.ignore_index:
+        if op.ignore_index and self._output_types[0] != OutputType.index:
             params["index_value"] = parse_index(pd.RangeIndex(-1))
         else:
             params["index_value"] = gen_unknown_index_value(
-                input_params["index_value"], op.keep, op.subset, type(op).__name__
+                input_params["index_value"],
+                op.keep,
+                op.subset,
+                type(op).__name__,
+                normalize_range_index=True,
             )
-        params["shape"] = cls._get_shape(input_params["shape"], op)
+        params["shape"] = self._get_shape(input_params["shape"], op)
         return params
 
     def __call__(self, inp, inplace=False):
@@ -151,17 +154,14 @@ def series_drop_duplicates(
     With the 'keep' parameter, the selection behaviour of duplicated values
     can be changed. The value 'first' keeps the first occurrence for each
     set of duplicated entries. The default value of keep is 'first'.
-
     >>> s.drop_duplicates().execute()
     0 lame
     1 cow
     3 beetle
     5 hippo
     Name: animal, dtype: object
-
     The value 'last' for parameter 'keep' keeps the last occurrence for
     each set of duplicated entries.
-
     >>> s.drop_duplicates(keep='last').execute()
     1 cow
     3 beetle
maxframe/dataframe/misc/eval.py
@@ -120,6 +120,10 @@ class CollectionVisitor(ast.NodeVisitor):
         if obj_name in self.env:
             self.referenced_vars.add(obj_name)
             return self.env[obj_name]
+        try:
+            return self.target[obj_name]
+        except KeyError:
+            pass
         raise KeyError(f"name {obj_name} is not defined")
 
     def visit(self, node):
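The added fallback changes the name-resolution order inside CollectionVisitor: a name is first looked up in the local environment, then as an item of the target collection, and only then reported as undefined. A standalone illustration of that order (plain Python, not maxframe code):

    def resolve(obj_name, env, target):
        if obj_name in env:
            return env[obj_name]
        try:
            return target[obj_name]  # new fallback introduced by the hunk above
        except KeyError:
            pass
        raise KeyError(f"name {obj_name} is not defined")

    print(resolve("x", {"x": 1}, {}))            # 1, found in env
    print(resolve("col", {}, {"col": [1, 2]}))   # [1, 2], found on the target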
maxframe/dataframe/misc/memory_usage.py
@@ -58,7 +58,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
         """
         if df_or_series.ndim == 1:
             # the input data is a series, a Scalar will be returned
-            return self.new_scalar([df_or_series], dtype=np.dtype(np.int_))
+            return self.new_scalar([df_or_series], dtype=np.dtype(int))
         else:
             # the input data is a DataFrame, a Scalar will be returned
             # calculate shape of returning series given ``op.index``
@@ -71,7 +71,7 @@ class DataFrameMemoryUsage(DataFrameOperator, DataFrameOperatorMixin):
             [df_or_series],
             index_value=self._adapt_index(df_or_series.columns_value),
             shape=new_shape,
-            dtype=np.dtype(np.int_),
+            dtype=np.dtype(int),
         )
 