maxframe 0.1.0b4__cp39-cp39-win32.whl → 1.0.0__cp39-cp39-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp39-win32.pyd +0 -0
- maxframe/codegen.py +56 -5
- maxframe/config/config.py +78 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +58 -14
- maxframe/core/__init__.py +2 -16
- maxframe/core/entity/__init__.py +1 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +46 -45
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp39-win32.pyd +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +2 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +7 -33
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
- maxframe/dataframe/core.py +58 -12
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +120 -24
- maxframe/dataframe/datasource/read_odps_table.py +9 -4
- maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +28 -0
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +317 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
- maxframe/dataframe/groupby/transform.py +5 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +5 -28
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +237 -3
- maxframe/dataframe/merge/tests/test_merge.py +126 -1
- maxframe/dataframe/misc/__init__.py +4 -0
- maxframe/dataframe/misc/apply.py +6 -11
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +8 -8
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/tests/test_misc.py +93 -1
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/plotting/core.py +2 -2
- maxframe/dataframe/reduction/core.py +4 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +13 -19
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/utils.py +33 -11
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/io/__init__.py +13 -0
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
- maxframe/{odpsio → io/odpsio}/schema.py +38 -16
- maxframe/io/odpsio/tableio.py +719 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +110 -0
- maxframe/learn/contrib/xgboost/core.py +241 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
- maxframe/learn/contrib/xgboost/predict.py +121 -0
- maxframe/learn/contrib/xgboost/regressor.py +71 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +132 -0
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/mmh3.cp39-win32.pyd +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/opcodes.py +11 -0
- maxframe/protocol.py +154 -27
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp39-win32.pyd +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +64 -0
- maxframe/serialization/core.pyx +67 -26
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +52 -17
- maxframe/serialization/serializables/core.py +180 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +54 -5
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +37 -2
- maxframe/tensor/__init__.py +81 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +7 -2
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/scalar.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +4 -0
- maxframe/tensor/misc/atleast_1d.py +72 -0
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/misc/unique.py +205 -0
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +2 -1
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +17 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +62 -3
- maxframe/utils.py +112 -86
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +123 -54
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +223 -40
- maxframe_client/session/task.py +108 -80
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +136 -8
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -300
- maxframe/odpsio/volumeio.py +0 -95
- maxframe_client/clients/spe.py +0 -104
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -11,12 +11,13 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
15
14
|
import logging
|
|
15
|
+
from abc import abstractmethod
|
|
16
16
|
from collections import namedtuple
|
|
17
|
-
from typing import Any, Dict, Optional, Tuple, Union
|
|
17
|
+
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
|
18
18
|
|
|
19
19
|
import numpy as np
|
|
20
|
+
from pandas import Index
|
|
20
21
|
|
|
21
22
|
from ... import opcodes
|
|
22
23
|
from ...core import OutputType
|
|
@@ -28,6 +29,7 @@ from ...serialization.serializables import (
|
|
|
28
29
|
Int32Field,
|
|
29
30
|
KeyField,
|
|
30
31
|
NamedTupleField,
|
|
32
|
+
Serializable,
|
|
31
33
|
StringField,
|
|
32
34
|
TupleField,
|
|
33
35
|
)
|
|
@@ -73,9 +75,208 @@ class DataFrameMergeAlign(MapReduceOperator, DataFrameOperatorMixin):
|
|
|
73
75
|
MergeSplitInfo = namedtuple("MergeSplitInfo", "split_side, split_index, nsplits")
|
|
74
76
|
|
|
75
77
|
|
|
78
|
+
class JoinHint(Serializable):
|
|
79
|
+
@abstractmethod
|
|
80
|
+
def verify_params(
|
|
81
|
+
self,
|
|
82
|
+
hint_on_df: Union[DataFrame, Series],
|
|
83
|
+
on: str,
|
|
84
|
+
is_on_index: bool,
|
|
85
|
+
how: str,
|
|
86
|
+
is_hint_for_left: bool,
|
|
87
|
+
):
|
|
88
|
+
pass
|
|
89
|
+
|
|
90
|
+
@abstractmethod
|
|
91
|
+
def verify_can_work_with(self, other: "JoinHint"):
|
|
92
|
+
pass
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class MapJoinHint(JoinHint):
|
|
96
|
+
def verify_params(
|
|
97
|
+
self,
|
|
98
|
+
hint_on_df: Union[DataFrame, Series],
|
|
99
|
+
on: str,
|
|
100
|
+
is_on_index: bool,
|
|
101
|
+
how: str,
|
|
102
|
+
is_hint_for_left: bool,
|
|
103
|
+
):
|
|
104
|
+
if how in ("cross", "outer"):
|
|
105
|
+
raise ValueError(
|
|
106
|
+
"Invalid join hint, MapJoinHint is not support in cross and outer join"
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
def verify_can_work_with(self, other: JoinHint):
|
|
110
|
+
if isinstance(other, SkewJoinHint):
|
|
111
|
+
raise ValueError(
|
|
112
|
+
"Invalid join hint, SkewJoinHint cannot work with MapJoinHint"
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class DistributedMapJoinHint(JoinHint):
|
|
117
|
+
shard_count = Int32Field("shard_count")
|
|
118
|
+
replica_count = Int32Field("replica_count", default=1)
|
|
119
|
+
|
|
120
|
+
def verify_params(
|
|
121
|
+
self,
|
|
122
|
+
hint_on_df: Union[DataFrame, Series],
|
|
123
|
+
on: str,
|
|
124
|
+
is_on_index: bool,
|
|
125
|
+
how: str,
|
|
126
|
+
is_hint_for_left: bool,
|
|
127
|
+
):
|
|
128
|
+
if how in ("cross", "outer"):
|
|
129
|
+
raise ValueError(
|
|
130
|
+
"Invalid join hint, DistributedMapJoinHint is not support in cross and outer join"
|
|
131
|
+
)
|
|
132
|
+
if not hasattr(self, "shard_count"):
|
|
133
|
+
raise ValueError(
|
|
134
|
+
"Invalid DistributedMapJoinHint, shard_count must be specified"
|
|
135
|
+
)
|
|
136
|
+
if self.shard_count <= 0 or self.replica_count <= 0:
|
|
137
|
+
raise ValueError(
|
|
138
|
+
"Invalid DistributedMapJoinHint, shard_count and replica_count must be greater than 0"
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
def verify_can_work_with(self, other: JoinHint):
|
|
142
|
+
pass
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class SkewJoinHint(JoinHint):
|
|
146
|
+
columns = AnyField("columns", default=None)
|
|
147
|
+
|
|
148
|
+
@staticmethod
|
|
149
|
+
def _check_index_levels(index, level_list):
|
|
150
|
+
selected_levels = set()
|
|
151
|
+
valid_levels = set(range(index.nlevels))
|
|
152
|
+
valid_level_names = set(index.names)
|
|
153
|
+
|
|
154
|
+
for item in level_list:
|
|
155
|
+
if isinstance(item, int):
|
|
156
|
+
if item not in valid_levels:
|
|
157
|
+
raise ValueError(f"Level {item} is not a valid index level")
|
|
158
|
+
if item in selected_levels:
|
|
159
|
+
raise ValueError(f"Level {item} is selected multiple times")
|
|
160
|
+
selected_levels.add(item)
|
|
161
|
+
elif isinstance(item, str):
|
|
162
|
+
if item not in valid_level_names:
|
|
163
|
+
raise ValueError(f"'{item}' is not a valid index level name")
|
|
164
|
+
level = index.names.index(item)
|
|
165
|
+
if level in selected_levels:
|
|
166
|
+
raise ValueError(
|
|
167
|
+
f"'{item}' (Level {level}) is selected multiple times"
|
|
168
|
+
)
|
|
169
|
+
selected_levels.add(level)
|
|
170
|
+
else:
|
|
171
|
+
raise ValueError(f"Invalid input type: {type(item)}")
|
|
172
|
+
|
|
173
|
+
@staticmethod
|
|
174
|
+
def _check_columns(join_on_columns, column_list):
|
|
175
|
+
selected_columns = set()
|
|
176
|
+
valid_columns = set(join_on_columns)
|
|
177
|
+
|
|
178
|
+
for item in column_list:
|
|
179
|
+
if isinstance(item, int):
|
|
180
|
+
if item < 0 or item >= len(join_on_columns):
|
|
181
|
+
raise ValueError(f"Column index {item} is out of range")
|
|
182
|
+
col_name = join_on_columns[item]
|
|
183
|
+
if col_name in selected_columns:
|
|
184
|
+
raise ValueError(
|
|
185
|
+
f"Column '{col_name}' (index {item}) is selected multiple times"
|
|
186
|
+
)
|
|
187
|
+
selected_columns.add(col_name)
|
|
188
|
+
elif isinstance(item, str):
|
|
189
|
+
if item not in valid_columns:
|
|
190
|
+
raise ValueError(f"'{item}' is not a valid column name")
|
|
191
|
+
if item in selected_columns:
|
|
192
|
+
raise ValueError(f"Column '{item}' is selected multiple times")
|
|
193
|
+
selected_columns.add(item)
|
|
194
|
+
else:
|
|
195
|
+
raise ValueError(f"Invalid input type: {type(item)}")
|
|
196
|
+
|
|
197
|
+
def verify_params(
|
|
198
|
+
self,
|
|
199
|
+
hint_on_df: Union[DataFrame, Series],
|
|
200
|
+
on: str,
|
|
201
|
+
is_on_index: bool,
|
|
202
|
+
how: str,
|
|
203
|
+
is_hint_for_left: bool,
|
|
204
|
+
):
|
|
205
|
+
if how in ("cross", "outer"):
|
|
206
|
+
raise ValueError(
|
|
207
|
+
"Invalid join hint, map join is not support in cross and outer join"
|
|
208
|
+
)
|
|
209
|
+
if is_hint_for_left and how == "right":
|
|
210
|
+
raise ValueError(
|
|
211
|
+
"Invalid join hint, right join can only use SkewJoinHint on right frame"
|
|
212
|
+
)
|
|
213
|
+
elif not is_hint_for_left and how == "left":
|
|
214
|
+
raise ValueError(
|
|
215
|
+
"Invalid join hint, left join can only use SkewJoinHint on left frame"
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
# check columns
|
|
219
|
+
if self.columns is None:
|
|
220
|
+
return
|
|
221
|
+
|
|
222
|
+
if not isinstance(self.columns, list):
|
|
223
|
+
raise TypeError("Invalid SkewJoinHint, `columns` must be a list")
|
|
224
|
+
|
|
225
|
+
if all(isinstance(item, (int, str)) for item in self.columns):
|
|
226
|
+
# if elements are int (levels) or str (index names or column names)
|
|
227
|
+
self._verify_valid_index_or_columns(
|
|
228
|
+
self.columns, hint_on_df.index_value.to_pandas(), on, is_on_index
|
|
229
|
+
)
|
|
230
|
+
elif all(isinstance(c, dict) for c in self.columns):
|
|
231
|
+
# dict with column names and values
|
|
232
|
+
cols_set = set(self.columns[0].keys())
|
|
233
|
+
if any(cols_set != set(c.keys()) for c in self.columns):
|
|
234
|
+
raise ValueError(
|
|
235
|
+
"Invalid SkewJoinHint, all values in `columns` need to have same columns"
|
|
236
|
+
)
|
|
237
|
+
|
|
238
|
+
self._verify_valid_index_or_columns(
|
|
239
|
+
cols_set, hint_on_df.index_value.to_pandas(), on, is_on_index
|
|
240
|
+
)
|
|
241
|
+
else:
|
|
242
|
+
raise TypeError("Invalid SkewJoinHint, annot accept `columns` type")
|
|
243
|
+
|
|
244
|
+
def verify_can_work_with(self, other: JoinHint):
|
|
245
|
+
if isinstance(other, SkewJoinHint):
|
|
246
|
+
raise ValueError(
|
|
247
|
+
"Invalid join hint, SkewJoinHint cannot work with MapJoinHint"
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
@staticmethod
|
|
251
|
+
def _verify_valid_index_or_columns(
|
|
252
|
+
skew_join_columns: Iterable[Union[int, str]],
|
|
253
|
+
frame_index: Index,
|
|
254
|
+
on: Union[str, List[str]],
|
|
255
|
+
is_on_index: bool,
|
|
256
|
+
):
|
|
257
|
+
if isinstance(on, str):
|
|
258
|
+
on = [on]
|
|
259
|
+
on_columns = set(frame_index.names if is_on_index else on)
|
|
260
|
+
for col in skew_join_columns:
|
|
261
|
+
if isinstance(col, int):
|
|
262
|
+
if col < 0 or col >= len(on_columns):
|
|
263
|
+
raise ValueError(
|
|
264
|
+
f"Invalid, SkeJoinHint, `{col}` is out of join on columns range"
|
|
265
|
+
)
|
|
266
|
+
else:
|
|
267
|
+
if col not in on_columns:
|
|
268
|
+
raise ValueError(
|
|
269
|
+
f"Invalid, SkeJoinHint, '{col}' is not a valid column name"
|
|
270
|
+
)
|
|
271
|
+
|
|
272
|
+
|
|
76
273
|
class DataFrameMerge(DataFrameOperator, DataFrameOperatorMixin):
|
|
77
274
|
_op_type_ = opcodes.DATAFRAME_MERGE
|
|
78
275
|
|
|
276
|
+
# workaround for new field since v1.0.0rc2
|
|
277
|
+
# todo remove this when all versions below v1.0.0rc1 is eliminated
|
|
278
|
+
_legacy_new_non_primitives = ["left_hint", "right_hint"]
|
|
279
|
+
|
|
79
280
|
how = StringField("how")
|
|
80
281
|
on = AnyField("on")
|
|
81
282
|
left_on = AnyField("left_on")
|
|
@@ -95,6 +296,8 @@ class DataFrameMerge(DataFrameOperator, DataFrameOperatorMixin):
|
|
|
95
296
|
|
|
96
297
|
# only for broadcast merge
|
|
97
298
|
split_info = NamedTupleField("split_info")
|
|
299
|
+
left_hint = AnyField("left_hint", default=None)
|
|
300
|
+
right_hint = AnyField("right_hint", default=None)
|
|
98
301
|
|
|
99
302
|
def __init__(self, copy=None, **kwargs):
|
|
100
303
|
super().__init__(copy_=copy, **kwargs)
|
|
@@ -150,7 +353,7 @@ def merge(
|
|
|
150
353
|
df: Union[DataFrame, Series],
|
|
151
354
|
right: Union[DataFrame, Series],
|
|
152
355
|
how: str = "inner",
|
|
153
|
-
on: str = None,
|
|
356
|
+
on: Union[str, List[str]] = None,
|
|
154
357
|
left_on: str = None,
|
|
155
358
|
right_on: str = None,
|
|
156
359
|
left_index: bool = False,
|
|
@@ -165,6 +368,8 @@ def merge(
|
|
|
165
368
|
auto_merge_threshold: int = 8,
|
|
166
369
|
bloom_filter: Union[bool, str] = "auto",
|
|
167
370
|
bloom_filter_options: Dict[str, Any] = None,
|
|
371
|
+
left_hint: JoinHint = None,
|
|
372
|
+
right_hint: JoinHint = None,
|
|
168
373
|
) -> DataFrame:
|
|
169
374
|
"""
|
|
170
375
|
Merge DataFrame or named Series objects with a database-style join.
|
|
@@ -267,6 +472,12 @@ def merge(
|
|
|
267
472
|
when chunk size of left and right is greater than this threshold, apply bloom filter
|
|
268
473
|
* "filter": "large", "small", "both", default "large"
|
|
269
474
|
decides to filter on large, small or both DataFrames.
|
|
475
|
+
left_hint: JoinHint, default None
|
|
476
|
+
Join strategy to use for left frame. When data skew occurs, consider these strategies to avoid long-tail issues,
|
|
477
|
+
but use them cautiously to prevent OOM and unnecessary overhead.
|
|
478
|
+
right_hint: JoinHint, default None
|
|
479
|
+
Join strategy to use for right frame.
|
|
480
|
+
|
|
270
481
|
|
|
271
482
|
Returns
|
|
272
483
|
-------
|
|
@@ -381,6 +592,18 @@ def merge(
|
|
|
381
592
|
raise ValueError(
|
|
382
593
|
f"Invalid filter {k}, available: {BLOOM_FILTER_ON_OPTIONS}"
|
|
383
594
|
)
|
|
595
|
+
|
|
596
|
+
if left_hint:
|
|
597
|
+
if not isinstance(left_hint, JoinHint):
|
|
598
|
+
raise TypeError(f"left_hint must be a JoinHint, got {type(left_hint)}")
|
|
599
|
+
left_hint.verify_can_work_with(right_hint)
|
|
600
|
+
left_hint.verify_params(df, on or left_on, left_index, how, True)
|
|
601
|
+
|
|
602
|
+
if right_hint:
|
|
603
|
+
if not isinstance(right_hint, JoinHint):
|
|
604
|
+
raise TypeError(f"right_hint must be a JoinHint, got {type(right_hint)}")
|
|
605
|
+
right_hint.verify_params(right, on or right_on, right_index, how, False)
|
|
606
|
+
|
|
384
607
|
op = DataFrameMerge(
|
|
385
608
|
how=how,
|
|
386
609
|
on=on,
|
|
@@ -399,6 +622,8 @@ def merge(
|
|
|
399
622
|
bloom_filter=bloom_filter,
|
|
400
623
|
bloom_filter_options=bloom_filter_options,
|
|
401
624
|
output_types=[OutputType.dataframe],
|
|
625
|
+
left_hint=left_hint,
|
|
626
|
+
right_hint=right_hint,
|
|
402
627
|
)
|
|
403
628
|
return op(df, right)
|
|
404
629
|
|
|
@@ -416,6 +641,8 @@ def join(
|
|
|
416
641
|
auto_merge_threshold: int = 8,
|
|
417
642
|
bloom_filter: Union[bool, Dict] = True,
|
|
418
643
|
bloom_filter_options: Dict[str, Any] = None,
|
|
644
|
+
left_hint: JoinHint = None,
|
|
645
|
+
right_hint: JoinHint = None,
|
|
419
646
|
) -> DataFrame:
|
|
420
647
|
"""
|
|
421
648
|
Join columns of another DataFrame.
|
|
@@ -480,6 +707,11 @@ def join(
|
|
|
480
707
|
when chunk size of left and right is greater than this threshold, apply bloom filter
|
|
481
708
|
* "filter": "large", "small", "both", default "large"
|
|
482
709
|
decides to filter on large, small or both DataFrames.
|
|
710
|
+
left_hint: JoinHint, default None
|
|
711
|
+
Join strategy to use for left frame. When data skew occurs, consider these strategies to avoid long-tail issues,
|
|
712
|
+
but use them cautiously to prevent OOM and unnecessary overhead.
|
|
713
|
+
right_hint: JoinHint, default None
|
|
714
|
+
Join strategy to use for right frame.
|
|
483
715
|
|
|
484
716
|
Returns
|
|
485
717
|
-------
|
|
@@ -590,4 +822,6 @@ def join(
|
|
|
590
822
|
auto_merge_threshold=auto_merge_threshold,
|
|
591
823
|
bloom_filter=bloom_filter,
|
|
592
824
|
bloom_filter_options=bloom_filter_options,
|
|
825
|
+
left_hint=left_hint,
|
|
826
|
+
right_hint=right_hint,
|
|
593
827
|
)
|
|
@@ -16,9 +16,11 @@ import numpy as np
|
|
|
16
16
|
import pandas as pd
|
|
17
17
|
import pytest
|
|
18
18
|
|
|
19
|
+
from ....tests.utils import assert_mf_index_dtype
|
|
19
20
|
from ...core import IndexValue
|
|
20
21
|
from ...datasource.dataframe import from_pandas
|
|
21
22
|
from .. import DataFrameMerge, concat
|
|
23
|
+
from ..merge import DistributedMapJoinHint, MapJoinHint, SkewJoinHint
|
|
22
24
|
|
|
23
25
|
|
|
24
26
|
def test_merge():
|
|
@@ -30,14 +32,39 @@ def test_merge():
|
|
|
30
32
|
mdf1 = from_pandas(df1, chunk_size=2)
|
|
31
33
|
mdf2 = from_pandas(df2, chunk_size=3)
|
|
32
34
|
|
|
35
|
+
mapjoin = MapJoinHint()
|
|
36
|
+
dist_mapjoin1 = DistributedMapJoinHint(shard_count=5)
|
|
37
|
+
skew_join1 = SkewJoinHint()
|
|
38
|
+
skew_join2 = SkewJoinHint(columns=[0])
|
|
39
|
+
skew_join3 = SkewJoinHint(columns=[{"a": 4}, {"a": 6}])
|
|
40
|
+
skew_join4 = SkewJoinHint(columns=[{"a": 4, "b": "test"}, {"a": 5, "b": "hello"}])
|
|
41
|
+
|
|
33
42
|
parameters = [
|
|
34
43
|
{},
|
|
35
44
|
{"how": "left", "right_on": "x", "left_index": True},
|
|
45
|
+
{
|
|
46
|
+
"how": "left",
|
|
47
|
+
"right_on": "x",
|
|
48
|
+
"left_index": True,
|
|
49
|
+
"left_hint": mapjoin,
|
|
50
|
+
"right_hint": mapjoin,
|
|
51
|
+
},
|
|
36
52
|
{"how": "right", "left_on": "a", "right_index": True},
|
|
53
|
+
{
|
|
54
|
+
"how": "right",
|
|
55
|
+
"left_on": "a",
|
|
56
|
+
"right_index": True,
|
|
57
|
+
"left_hint": mapjoin,
|
|
58
|
+
"right_hint": dist_mapjoin1,
|
|
59
|
+
},
|
|
37
60
|
{"how": "left", "left_on": "a", "right_on": "x"},
|
|
61
|
+
{"how": "left", "left_on": "a", "right_on": "x", "left_hint": skew_join1},
|
|
38
62
|
{"how": "right", "left_on": "a", "right_index": True},
|
|
63
|
+
{"how": "right", "left_on": "a", "right_index": True, "right_hint": skew_join2},
|
|
39
64
|
{"how": "right", "on": "a"},
|
|
65
|
+
{"how": "right", "on": "a", "right_hint": skew_join3},
|
|
40
66
|
{"how": "inner", "on": ["a", "b"]},
|
|
67
|
+
{"how": "inner", "on": ["a", "b"], "left_hint": skew_join4},
|
|
41
68
|
]
|
|
42
69
|
|
|
43
70
|
for kw in parameters:
|
|
@@ -135,7 +162,7 @@ def test_append():
|
|
|
135
162
|
adf = mdf1.append(mdf2)
|
|
136
163
|
|
|
137
164
|
assert adf.shape == (20, 4)
|
|
138
|
-
|
|
165
|
+
assert_mf_index_dtype(adf.index_value.value, np.int64)
|
|
139
166
|
|
|
140
167
|
mdf1 = from_pandas(df1, chunk_size=3)
|
|
141
168
|
mdf2 = from_pandas(df2, chunk_size=3)
|
|
@@ -155,6 +182,7 @@ def test_concat():
|
|
|
155
182
|
r = concat([mdf1, mdf2], axis="index")
|
|
156
183
|
|
|
157
184
|
assert r.shape == (20, 4)
|
|
185
|
+
assert not isinstance(r.index_value.to_pandas(), pd.RangeIndex)
|
|
158
186
|
pd.testing.assert_series_equal(r.dtypes, df1.dtypes)
|
|
159
187
|
|
|
160
188
|
df3 = pd.DataFrame(
|
|
@@ -213,3 +241,100 @@ def test_concat():
|
|
|
213
241
|
mdf2 = from_pandas(df2, chunk_size=3)
|
|
214
242
|
r = concat([mdf1, mdf2], join="inner")
|
|
215
243
|
assert r.shape == (20, 3)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def test_invalid_join_hint():
|
|
247
|
+
df1 = pd.DataFrame(
|
|
248
|
+
np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
|
|
249
|
+
)
|
|
250
|
+
df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"])
|
|
251
|
+
|
|
252
|
+
mdf1 = from_pandas(df1, chunk_size=2)
|
|
253
|
+
mdf2 = from_pandas(df2, chunk_size=3)
|
|
254
|
+
|
|
255
|
+
# type error
|
|
256
|
+
parameters = [
|
|
257
|
+
{"how": "left", "right_on": "x", "left_index": True, "left_hint": [1]},
|
|
258
|
+
{
|
|
259
|
+
"how": "left",
|
|
260
|
+
"right_on": "x",
|
|
261
|
+
"left_index": True,
|
|
262
|
+
"left_hint": {"key": "value"},
|
|
263
|
+
},
|
|
264
|
+
{
|
|
265
|
+
"how": "right",
|
|
266
|
+
"left_on": "a",
|
|
267
|
+
"right_index": True,
|
|
268
|
+
"right_hint": SkewJoinHint(columns=2),
|
|
269
|
+
},
|
|
270
|
+
{
|
|
271
|
+
"how": "left",
|
|
272
|
+
"left_on": "a",
|
|
273
|
+
"right_on": "x",
|
|
274
|
+
"left_hint": SkewJoinHint(columns="a"),
|
|
275
|
+
},
|
|
276
|
+
{
|
|
277
|
+
"how": "right",
|
|
278
|
+
"left_on": "a",
|
|
279
|
+
"right_index": True,
|
|
280
|
+
"right_hint": SkewJoinHint(columns=["0", []]),
|
|
281
|
+
},
|
|
282
|
+
]
|
|
283
|
+
|
|
284
|
+
for kw in parameters:
|
|
285
|
+
print(kw)
|
|
286
|
+
with pytest.raises(TypeError):
|
|
287
|
+
mdf1.merge(mdf2, **kw)
|
|
288
|
+
|
|
289
|
+
# value error
|
|
290
|
+
parameters = [
|
|
291
|
+
# mapjoin can't working with skew join
|
|
292
|
+
{
|
|
293
|
+
"how": "left",
|
|
294
|
+
"right_on": "x",
|
|
295
|
+
"left_index": True,
|
|
296
|
+
"left_hint": MapJoinHint(),
|
|
297
|
+
"right_hint": SkewJoinHint(),
|
|
298
|
+
},
|
|
299
|
+
# right join can't apply to skew join left frame
|
|
300
|
+
{
|
|
301
|
+
"how": "right",
|
|
302
|
+
"left_on": "a",
|
|
303
|
+
"right_index": True,
|
|
304
|
+
"left_hint": SkewJoinHint(),
|
|
305
|
+
},
|
|
306
|
+
# invalid columns
|
|
307
|
+
{
|
|
308
|
+
"how": "left",
|
|
309
|
+
"left_on": "a",
|
|
310
|
+
"right_on": "x",
|
|
311
|
+
"left_hint": SkewJoinHint(columns=["b"]),
|
|
312
|
+
},
|
|
313
|
+
# invalid index level
|
|
314
|
+
{
|
|
315
|
+
"how": "right",
|
|
316
|
+
"left_on": "a",
|
|
317
|
+
"right_index": True,
|
|
318
|
+
"right_hint": SkewJoinHint(columns=[5]),
|
|
319
|
+
},
|
|
320
|
+
# unmatched skew join columns
|
|
321
|
+
{
|
|
322
|
+
"how": "right",
|
|
323
|
+
"left_on": "a",
|
|
324
|
+
"right_index": True,
|
|
325
|
+
"right_hint": SkewJoinHint(columns=[{0: "value1"}, {1: "value2"}]),
|
|
326
|
+
},
|
|
327
|
+
# invalid dist_mapjoin shard_count
|
|
328
|
+
{"how": "right", "on": "a", "right_hint": DistributedMapJoinHint()},
|
|
329
|
+
# all can't work with outer join
|
|
330
|
+
{"how": "outer", "on": ["a", "b"], "left_hint": MapJoinHint()},
|
|
331
|
+
{
|
|
332
|
+
"how": "outer",
|
|
333
|
+
"on": ["a", "b"],
|
|
334
|
+
"left_hint": DistributedMapJoinHint(shard_count=5),
|
|
335
|
+
},
|
|
336
|
+
{"how": "outer", "on": ["a", "b"], "left_hint": SkewJoinHint()},
|
|
337
|
+
]
|
|
338
|
+
for kw in parameters:
|
|
339
|
+
with pytest.raises(ValueError):
|
|
340
|
+
mdf1.merge(mdf2, **kw)
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
from .apply import df_apply, series_apply
|
|
16
16
|
from .astype import astype, index_astype
|
|
17
|
+
from .case_when import case_when
|
|
17
18
|
from .check_monotonic import (
|
|
18
19
|
check_monotonic,
|
|
19
20
|
is_monotonic,
|
|
@@ -37,6 +38,7 @@ from .map import index_map, series_map
|
|
|
37
38
|
from .melt import melt
|
|
38
39
|
from .memory_usage import df_memory_usage, index_memory_usage, series_memory_usage
|
|
39
40
|
from .pct_change import pct_change
|
|
41
|
+
from .pivot_table import pivot_table
|
|
40
42
|
from .qcut import qcut
|
|
41
43
|
from .select_dtypes import select_dtypes
|
|
42
44
|
from .shift import shift, tshift
|
|
@@ -69,6 +71,7 @@ def _install():
|
|
|
69
71
|
setattr(t, "melt", melt)
|
|
70
72
|
setattr(t, "memory_usage", df_memory_usage)
|
|
71
73
|
setattr(t, "pct_change", pct_change)
|
|
74
|
+
setattr(t, "pivot_table", pivot_table)
|
|
72
75
|
setattr(t, "pop", df_pop)
|
|
73
76
|
setattr(t, "query", df_query)
|
|
74
77
|
setattr(t, "select_dtypes", select_dtypes)
|
|
@@ -81,6 +84,7 @@ def _install():
|
|
|
81
84
|
for t in SERIES_TYPE:
|
|
82
85
|
setattr(t, "apply", series_apply)
|
|
83
86
|
setattr(t, "astype", astype)
|
|
87
|
+
setattr(t, "case_when", case_when)
|
|
84
88
|
setattr(t, "check_monotonic", check_monotonic)
|
|
85
89
|
setattr(t, "describe", describe)
|
|
86
90
|
setattr(t, "diff", series_diff)
|
maxframe/dataframe/misc/apply.py
CHANGED
|
@@ -170,6 +170,8 @@ class ApplyOperator(
|
|
|
170
170
|
elif self.output_types[0] == OutputType.dataframe:
|
|
171
171
|
shape = [np.nan, np.nan]
|
|
172
172
|
shape[1 - self.axis] = df.shape[1 - self.axis]
|
|
173
|
+
if self.axis == 1:
|
|
174
|
+
shape[1] = len(dtypes)
|
|
173
175
|
shape = tuple(shape)
|
|
174
176
|
else:
|
|
175
177
|
shape = (df.shape[1 - self.axis],)
|
|
@@ -225,7 +227,7 @@ class ApplyOperator(
|
|
|
225
227
|
else: # pragma: no cover
|
|
226
228
|
index_value = parse_index(infer_series.index)
|
|
227
229
|
else:
|
|
228
|
-
index_value = parse_index(
|
|
230
|
+
index_value = parse_index(series.index_value)
|
|
229
231
|
|
|
230
232
|
if output_type == OutputType.dataframe:
|
|
231
233
|
if dtypes is None:
|
|
@@ -317,6 +319,7 @@ def df_apply(
|
|
|
317
319
|
skip_infer=False,
|
|
318
320
|
**kwds,
|
|
319
321
|
):
|
|
322
|
+
# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/50
|
|
320
323
|
"""
|
|
321
324
|
Apply a function along an axis of the DataFrame.
|
|
322
325
|
|
|
@@ -442,20 +445,12 @@ def df_apply(
|
|
|
442
445
|
B 27
|
|
443
446
|
dtype: int64
|
|
444
447
|
|
|
445
|
-
>>> df.apply(np.sum, axis=1).execute()
|
|
448
|
+
>>> df.apply(lambda row: int(np.sum(row)), axis=1).execute()
|
|
446
449
|
0 13
|
|
447
450
|
1 13
|
|
448
451
|
2 13
|
|
449
452
|
dtype: int64
|
|
450
453
|
|
|
451
|
-
Returning a list-like will result in a Series
|
|
452
|
-
|
|
453
|
-
>>> df.apply(lambda x: [1, 2], axis=1).execute()
|
|
454
|
-
0 [1, 2]
|
|
455
|
-
1 [1, 2]
|
|
456
|
-
2 [1, 2]
|
|
457
|
-
dtype: object
|
|
458
|
-
|
|
459
454
|
Passing ``result_type='expand'`` will expand list-like results
|
|
460
455
|
to columns of a Dataframe
|
|
461
456
|
|
|
@@ -469,7 +464,7 @@ def df_apply(
|
|
|
469
464
|
``result_type='expand'``. The resulting column names
|
|
470
465
|
will be the Series index.
|
|
471
466
|
|
|
472
|
-
>>> df.apply(lambda x:
|
|
467
|
+
>>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1).execute()
|
|
473
468
|
foo bar
|
|
474
469
|
0 1 2
|
|
475
470
|
1 1 2
|