maxframe 1.0.0rc1-cp310-cp310-win_amd64.whl → 1.0.0rc2-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maxframe/_utils.cp310-win_amd64.pyd +0 -0
- maxframe/codegen.py +0 -4
- maxframe/config/config.py +34 -2
- maxframe/config/validators.py +1 -0
- maxframe/conftest.py +2 -0
- maxframe/core/entity/objects.py +1 -1
- maxframe/core/graph/core.cp310-win_amd64.pyd +0 -0
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +5 -5
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +6 -0
- maxframe/dataframe/datasource/read_odps_table.py +2 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/apply.py +3 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +4 -7
- maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
- maxframe/learn/contrib/xgboost/predict.py +2 -2
- maxframe/learn/contrib/xgboost/train.py +2 -2
- maxframe/lib/mmh3.cp310-win_amd64.pyd +0 -0
- maxframe/odpsio/__init__.py +1 -1
- maxframe/odpsio/arrow.py +8 -4
- maxframe/odpsio/schema.py +10 -7
- maxframe/odpsio/tableio.py +388 -14
- maxframe/odpsio/tests/test_schema.py +16 -15
- maxframe/odpsio/tests/test_tableio.py +48 -21
- maxframe/protocol.py +40 -2
- maxframe/serialization/core.cp310-win_amd64.pyd +0 -0
- maxframe/serialization/serializables/core.py +48 -9
- maxframe/tensor/__init__.py +59 -0
- maxframe/tensor/base/unique.py +2 -2
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tests/utils.py +11 -2
- maxframe/utils.py +17 -9
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +74 -1
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +64 -64
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +38 -27
- maxframe_client/session/odps.py +5 -5
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +13 -2
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/odpsio/tests/test_tableio.py
CHANGED

@@ -12,22 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime
+
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pytest
 from odps import ODPS
 
+from ...config import options
 from ...tests.utils import flaky, tn
 from ...utils import config_odps_default_options
-from ..tableio import
+from ..tableio import ODPSTableIO
+
+
+@pytest.fixture
+def switch_table_io(request):
+    old_use_common_table = options.use_common_table
+    try:
+        options.use_common_table = request.param
+        yield
+    finally:
+        options.use_common_table = old_use_common_table
 
 
 @flaky(max_runs=3)
-def test_empty_table_io():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_empty_table_io(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-
+    table_io = ODPSTableIO(o)
 
     # test read from empty table
     empty_table_name = tn("test_empty_table_halo_read")
@@ -35,42 +50,53 @@ def test_empty_table_io():
     tb = o.create_table(empty_table_name, "col1 string", lifecycle=1)
 
     try:
-        with
+        with table_io.open_reader(empty_table_name) as reader:
             assert len(reader.read_all()) == 0
     finally:
         tb.drop()
 
 
 @flaky(max_runs=3)
-def test_table_io_without_parts():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_without_parts(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-
+    table_io = ODPSTableIO(o)
 
     # test read and write tables without partition
     no_part_table_name = tn("test_no_part_halo_write")
     o.delete_table(no_part_table_name, if_exists=True)
-
-
-    )
+    col_desc = ",".join(f"{c} double" for c in "abcde") + ", f datetime"
+    tb = o.create_table(no_part_table_name, col_desc, lifecycle=1)
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-
+        date_val = [
+            (
+                datetime.datetime.now().replace(microsecond=0)
+                + datetime.timedelta(seconds=i)
+            )
+            for i in range(100)
+        ]
+        pd_data["f"] = pd.Series(date_val, dtype="datetime64[ms]").dt.tz_localize(
+            options.local_timezone
+        )
+        with table_io.open_writer(no_part_table_name) as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-        with
+        with table_io.open_reader(no_part_table_name) as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
     finally:
         tb.drop()
 
 
 @flaky(max_runs=3)
-def test_table_io_with_range_reader():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_with_range_reader(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-
+    table_io = ODPSTableIO(o)
 
     # test read and write tables without partition
     no_part_table_name = tn("test_no_part_halo_write")
@@ -81,15 +107,15 @@ def test_table_io_with_range_reader():
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with
+        with table_io.open_writer(no_part_table_name) as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
 
-        with
+        with table_io.open_reader(
             no_part_table_name, start=None, stop=100, row_batch_size=10
         ) as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
 
-        with
+        with table_io.open_reader(
             no_part_table_name,
             start=-2,
             stop=-52,
@@ -105,11 +131,12 @@ def test_table_io_with_range_reader():
 
 
 @flaky(max_runs=3)
-def test_table_io_with_parts():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_with_parts(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-
+    table_io = ODPSTableIO(o)
 
     # test read and write tables with partition
     parted_table_name = tn("test_parted_halo_write")
@@ -122,11 +149,11 @@ def test_table_io_with_parts():
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with
+        with table_io.open_writer(parted_table_name, "pt=test") as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-        with
+        with table_io.open_reader(parted_table_name, "pt=test") as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
-        with
+        with table_io.open_reader(
             parted_table_name, "pt=test", partition_columns=True
         ) as reader:
             expected_data = pd_data.copy()
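The new switch_table_io fixture above relies on pytest indirect parametrization: the values in the parametrize list are routed to the fixture through request.param instead of the test function itself, so each test body runs once per option value while the fixture handles setup and teardown of the flag. A minimal self-contained sketch of the same pattern (the _Options class below is a hypothetical stand-in for maxframe.config.options):

    import pytest

    class _Options:
        use_common_table = False

    options = _Options()  # hypothetical stand-in for maxframe.config.options

    @pytest.fixture
    def switch_table_io(request):
        # request.param carries the current value from the parametrize list
        old_value = options.use_common_table
        try:
            options.use_common_table = request.param
            yield
        finally:
            options.use_common_table = old_value

    @pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
    def test_runs_once_per_value(switch_table_io):
        # executes twice: once with use_common_table=False, once with True
        assert options.use_common_table in (False, True)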
maxframe/protocol.py
CHANGED

@@ -154,6 +154,9 @@ class ODPSTableResultInfo(ResultInfo):
     partition_specs: Optional[List[str]] = ListField(
         "partition_specs", FieldTypes.string, default=None
     )
+    table_meta: Optional["DataFrameTableMeta"] = ReferenceField(
+        "table_meta", default=None
+    )
 
     def __init__(self, result_type: ResultType = None, **kw):
         result_type = result_type or ResultType.ODPS_TABLE
@@ -164,8 +167,17 @@ class ODPSTableResultInfo(ResultInfo):
         ret["full_table_name"] = self.full_table_name
         if self.partition_specs:
             ret["partition_specs"] = self.partition_specs
+        if self.table_meta:
+            ret["table_meta"] = self.table_meta.to_json()
         return ret
 
+    @classmethod
+    def _json_to_kwargs(cls, serialized: dict) -> dict:
+        kw = super()._json_to_kwargs(serialized)
+        if "table_meta" in kw:
+            kw["table_meta"] = DataFrameTableMeta.from_json(kw["table_meta"])
+        return kw
+
 
 class ODPSVolumeResultInfo(ResultInfo):
     _result_type = ResultType.ODPS_VOLUME
@@ -469,7 +481,7 @@ class DecrefRequest(Serializable):
     keys: List[str] = ListField("keys", FieldTypes.string, default=None)
 
 
-class DataFrameTableMeta(Serializable):
+class DataFrameTableMeta(JsonSerializable):
     __slots__ = "_pd_column_names", "_pd_index_level_names"
 
     table_name: Optional[str] = StringField("table_name", default=None)
@@ -500,7 +512,7 @@ class DataFrameTableMeta(Serializable):
         self._pd_index_level_names = self.pd_index_dtypes.index.tolist()
         return self._pd_index_level_names
 
-    def __eq__(self, other: "
+    def __eq__(self, other: "DataFrameTableMeta") -> bool:
         if not isinstance(other, type(self)):
             return False
         for k in self._FIELDS:
@@ -511,3 +523,29 @@ class DataFrameTableMeta(Serializable):
         if not is_same:
             return False
         return True
+
+    def to_json(self) -> dict:
+        b64_pk = lambda x: base64.b64encode(pickle.dumps(x))
+        ret = {
+            "table_name": self.table_name,
+            "type": self.type.value,
+            "table_column_names": self.table_column_names,
+            "table_index_column_names": self.table_index_column_names,
+            "pd_column_dtypes": b64_pk(self.pd_column_dtypes),
+            "pd_column_level_names": b64_pk(self.pd_column_level_names),
+            "pd_index_dtypes": b64_pk(self.pd_index_dtypes),
+        }
+        return ret
+
+    @classmethod
+    def from_json(cls, serialized: dict) -> "DataFrameTableMeta":
+        b64_upk = lambda x: pickle.loads(base64.b64decode(x))
+        serialized.update(
+            {
+                "type": OutputType(serialized["type"]),
+                "pd_column_dtypes": b64_upk(serialized["pd_column_dtypes"]),
+                "pd_column_level_names": b64_upk(serialized["pd_column_level_names"]),
+                "pd_index_dtypes": b64_upk(serialized["pd_index_dtypes"]),
+            }
+        )
+        return DataFrameTableMeta(**serialized)
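The to_json/from_json pair makes DataFrameTableMeta embeddable in the JSON result payload. Pandas dtype objects are not JSON-native, so the diff pickles them and base64-encodes the bytes. A minimal sketch of that encode/decode round trip (the helper names below are illustrative, not from the codebase):

    import base64
    import pickle

    import pandas as pd

    def encode_dtypes(dtypes: pd.Series) -> bytes:
        # pickle the pandas dtype objects, then base64-encode the result
        # so it can be embedded in the serialized dict
        return base64.b64encode(pickle.dumps(dtypes))

    def decode_dtypes(value: bytes) -> pd.Series:
        return pickle.loads(base64.b64decode(value))

    dtypes = pd.Series(
        [pd.api.types.pandas_dtype("int64"), pd.api.types.pandas_dtype("float64")],
        index=["a", "b"],
    )
    assert decode_dtypes(encode_dtypes(dtypes)).equals(dtypes)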
maxframe/serialization/core.cp310-win_amd64.pyd
CHANGED
Binary file

maxframe/serialization/serializables/core.py
CHANGED

@@ -51,7 +51,10 @@ def _is_field_primitive_compound(field: Field):
 class SerializableMeta(type):
     def __new__(mcs, name: str, bases: Tuple[Type], properties: Dict):
         # All the fields including misc fields.
-        name_hash = hash(f"{properties.get('__module__')}.{name}")
+        legacy_name_hash = hash(f"{properties.get('__module__')}.{name}")
+        name_hash = hash(
+            f"{properties.get('__module__')}.{properties.get('__qualname__')}"
+        )
         all_fields = dict()
         # mapping field names to base classes
         field_to_cls_hash = dict()
@@ -107,6 +110,10 @@ class SerializableMeta(type):
         slots.update(properties_field_slot_names)
 
         properties = properties_without_fields
+
+        # todo remove this prop when all versions below v1.0.0rc1 is eliminated
+        properties["_LEGACY_NAME_HASH"] = legacy_name_hash
+
         properties["_NAME_HASH"] = name_hash
         properties["_FIELDS"] = all_fields
         properties["_FIELD_ORDER"] = field_order
@@ -210,8 +217,8 @@ class SerializableSerializer(Serializer):
     """
 
     @classmethod
-    def _get_obj_field_count_key(cls, obj: Serializable):
-        return f"FC_{obj._NAME_HASH}"
+    def _get_obj_field_count_key(cls, obj: Serializable, legacy: bool = False):
+        return f"FC_{obj._NAME_HASH if not legacy else obj._LEGACY_NAME_HASH}"
 
     @classmethod
     def _get_field_values(cls, obj: Serializable, fields):
@@ -290,6 +297,12 @@ class SerializableSerializer(Serializer):
         server_cls_to_field_count = obj_class._CLS_TO_NON_PRIMITIVE_FIELD_COUNT
         server_fields = obj_class._NON_PRIMITIVE_FIELDS
 
+        legacy_to_new_hash = {
+            c._LEGACY_NAME_HASH: c._NAME_HASH
+            for c in obj_class.__mro__
+            if hasattr(c, "_NAME_HASH") and c._LEGACY_NAME_HASH != c._NAME_HASH
+        }
+
         if client_cls_to_field_count:
             field_num, server_field_num = 0, 0
             for cls_hash, count in client_cls_to_field_count.items():
@@ -301,20 +314,40 @@ class SerializableSerializer(Serializer):
                 if not is_primitive or value != {}:
                     cls._set_field_value(obj, field, value)
                 field_num += count
-
+                try:
+                    server_field_num += server_cls_to_field_count[cls_hash]
+                except KeyError:
+                    try:
+                        # todo remove this fallback when all
+                        # versions below v1.0.0rc1 is eliminated
+                        server_field_num += server_cls_to_field_count[
+                            legacy_to_new_hash[cls_hash]
+                        ]
+                    except KeyError:
+                        # it is possible that certain type of field does not exist
+                        # at server side
+                        pass
         else:
+            # handle legacy serialization style, with all fields sorted by name
             # todo remove this branch when all versions below v0.1.0b5 is eliminated
             from .field import AnyField
 
-            # legacy serialization style, with all fields sorted by name
             if is_primitive:
-
+                new_field_attr = "_legacy_new_primitives"
+                deprecated_field_attr = "_legacy_deprecated_primitives"
             else:
-
+                new_field_attr = "_legacy_new_non_primitives"
+                deprecated_field_attr = "_legacy_deprecated_non_primitives"
+
+            # remove fields added on later releases
+            new_names = set(getattr(obj_class, new_field_attr, None) or [])
+            server_fields = [f for f in server_fields if f.name not in new_names]
+
+            # fill fields deprecated on later releases
             deprecated_fields = []
             deprecated_names = set()
-            if hasattr(obj_class,
-                deprecated_names = set(getattr(obj_class,
+            if hasattr(obj_class, deprecated_field_attr):
+                deprecated_names = set(getattr(obj_class, deprecated_field_attr))
             for field_name in deprecated_names:
                 field = AnyField(tag=field_name)
                 field.name = field_name
@@ -342,6 +375,12 @@ class SerializableSerializer(Serializer):
         field_count_data = self.get_public_data(
            context, self._get_obj_field_count_key(obj)
         )
+        if field_count_data is None:
+            # todo remove this fallback when all
+            # versions below v1.0.0rc1 is eliminated
+            field_count_data = self.get_public_data(
+                context, self._get_obj_field_count_key(obj, legacy=True)
+            )
         if field_count_data is not None:
             cls_to_prim_key, cls_to_non_prim_key = msgpack.loads(field_count_data)
             cls_to_prim_key = dict(cls_to_prim_key)
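The switch from hashing __module__ plus the bare class name to __module__ plus __qualname__ matters for nested classes: two inner classes with the same name in one module collide under the legacy hash, while __qualname__ includes the enclosing scope. A small illustration of the difference (assuming nested-class name collisions are the case being fixed):

    class Wrapper1:
        class Inner:
            pass

    class Wrapper2:
        class Inner:
            pass

    # the legacy hash over module + bare name cannot tell the two apart
    legacy1 = hash(f"{Wrapper1.Inner.__module__}.{Wrapper1.Inner.__name__}")
    legacy2 = hash(f"{Wrapper2.Inner.__module__}.{Wrapper2.Inner.__name__}")
    assert legacy1 == legacy2

    # __qualname__ keeps the enclosing class, so the new hashes differ
    new1 = hash(f"{Wrapper1.Inner.__module__}.{Wrapper1.Inner.__qualname__}")
    new2 = hash(f"{Wrapper2.Inner.__module__}.{Wrapper2.Inner.__qualname__}")
    assert new1 != new2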
maxframe/tensor/__init__.py
CHANGED

@@ -180,4 +180,63 @@ from .reduction import std, sum, var
 from .reshape import reshape
 from .ufunc import ufunc
 
+# isort: off
+# noinspection PyUnresolvedReferences
+from numpy import (
+    NAN,
+    NINF,
+    AxisError,
+    Inf,
+    NaN,
+    e,
+    errstate,
+    geterr,
+    inf,
+    nan,
+    newaxis,
+    pi,
+    seterr,
+)
+
+# import numpy types
+# noinspection PyUnresolvedReferences
+from numpy import (
+    bool_ as bool,
+    bytes_,
+    cfloat,
+    character,
+    complex64,
+    complex128,
+    complexfloating,
+    datetime64,
+    double,
+    dtype,
+    flexible,
+    float16,
+    float32,
+    float64,
+    floating,
+    generic,
+    inexact,
+    int8,
+    int16,
+    int32,
+    int64,
+    intc,
+    intp,
+    number,
+    integer,
+    object_ as object,
+    signedinteger,
+    timedelta64,
+    uint,
+    uint8,
+    uint16,
+    uint32,
+    uint64,
+    unicode_,
+    unsignedinteger,
+    void,
+)
+
 del fetch, ufunc
maxframe/tensor/base/unique.py
CHANGED

@@ -15,7 +15,7 @@
 
 import numpy as np
 
-from ... import opcodes
+from ... import opcodes
 from ...serialization.serializables import BoolField, Int32Field
 from ..core import TensorOrder
 from ..operators import TensorHasInput, TensorOperatorMixin
@@ -23,7 +23,7 @@ from ..utils import validate_axis
 
 
 class TensorUnique(TensorHasInput, TensorOperatorMixin):
-    _op_type_ =
+    _op_type_ = opcodes.UNIQUE
 
     return_index = BoolField("return_index", default=False)
     return_inverse = BoolField("return_inverse", default=False)

maxframe/tensor/statistics/quantile.py
CHANGED

@@ -16,7 +16,7 @@ from collections.abc import Iterable
 
 import numpy as np
 
-from ... import opcodes
+from ... import opcodes
 from ...core import ENTITY_TYPE
 from ...serialization.serializables import AnyField, BoolField, KeyField, StringField
 from ..core import TENSOR_TYPE, TensorOrder
@@ -43,7 +43,7 @@ q_error_msg = "Quantiles must be in the range [0, 1]"
 
 class TensorQuantile(TensorOperator, TensorOperatorMixin):
     __slots__ = ("q_error_msg",)
-    _op_type_ =
+    _op_type_ = opcodes.QUANTILE
 
     a = KeyField("a")
     q = AnyField("q")
maxframe/tests/utils.py
CHANGED

@@ -14,6 +14,7 @@
 
 import asyncio
 import functools
+import hashlib
 import os
 import queue
 import socket
@@ -25,7 +26,7 @@ import pytest
 from tornado import netutil
 
 from ..core import Tileable, TileableGraph
-from ..utils import
+from ..utils import create_sync_primitive, lazy_import, to_binary
 
 try:
     from flaky import flaky
@@ -102,7 +103,7 @@ def run_app_in_thread(app_func):
     def fixture_func(*args, **kwargs):
         app_loop = asyncio.new_event_loop()
         q = queue.Queue()
-        exit_event =
+        exit_event = create_sync_primitive(asyncio.Event, app_loop)
         app_thread = Thread(
             name="TestAppThread",
             target=app_thread_func,
@@ -162,3 +163,11 @@ def require_hadoop(func):
         not os.environ.get("WITH_HADOOP"), reason="Only run when hadoop is installed"
     )(func)
     return func
+
+
+def get_test_unique_name(size=None):
+    test_name = os.getenv("PYTEST_CURRENT_TEST", "pyodps_test")
+    digest = hashlib.md5(to_binary(test_name)).hexdigest()
+    if size:
+        digest = digest[:size]
+    return digest + "_" + str(os.getpid())
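get_test_unique_name derives a name that is stable for a given test (an MD5 digest of pytest's PYTEST_CURRENT_TEST environment variable) but unique per worker process (the PID suffix), which is useful for temporary table names that must not clash across concurrent test runs. A usage sketch with the helper's logic inlined (to_binary replaced by str.encode; the table-name prefix is made up):

    import hashlib
    import os

    def get_test_unique_name(size=None):
        # pytest sets PYTEST_CURRENT_TEST to e.g.
        # "tests/test_mod.py::test_func (call)"
        test_name = os.getenv("PYTEST_CURRENT_TEST", "pyodps_test")
        digest = hashlib.md5(test_name.encode("utf-8")).hexdigest()
        if size:
            digest = digest[:size]
        return digest + "_" + str(os.getpid())

    # a per-test, per-process temporary table name
    table_name = "tmp_mf_" + get_test_unique_name(8)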
maxframe/utils.py
CHANGED

@@ -436,19 +436,27 @@ async def to_thread_pool(func, *args, pool=None, **kwargs):
     return await loop.run_in_executor(pool, func_call)
 
 
-
+_PrimitiveType = TypeVar("_PrimitiveType")
+
+
+def create_sync_primitive(
+    cls: Type[_PrimitiveType], loop: asyncio.AbstractEventLoop
+) -> _PrimitiveType:
     """
-    Create an asyncio
+    Create an asyncio sync primitive (locks, events, etc.)
+    in a certain event loop.
     """
-    if sys.version_info[1] < 10
-        return
+    if sys.version_info[1] < 10:
+        return cls(loop=loop)
 
     # From Python3.10 the loop parameter has been removed. We should work around here.
-    old_loop = asyncio.
-
-
-
-
+    old_loop = asyncio.get_event_loop()
+    try:
+        asyncio.set_event_loop(loop)
+        primitive = cls()
+    finally:
+        asyncio.set_event_loop(old_loop)
+    return primitive
 
 
 class ToThreadCancelledError(asyncio.CancelledError):
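create_sync_primitive exists because asyncio primitives lost their loop parameter in Python 3.10, while code that builds an event for a loop running on another thread still needs to bind it explicitly: below 3.10 the function passes loop= directly, and on 3.10+ it temporarily installs the target loop as the current loop around the constructor call. A usage sketch consistent with the run_app_in_thread helper above (the surrounding thread code is illustrative):

    import asyncio
    from threading import Thread

    from maxframe.utils import create_sync_primitive

    app_loop = asyncio.new_event_loop()
    # bind the Event to app_loop even though we create it on this thread
    exit_event = create_sync_primitive(asyncio.Event, app_loop)

    def app_thread_func():
        async def main():
            await exit_event.wait()  # parks until another thread signals

        app_loop.run_until_complete(main())

    t = Thread(name="TestAppThread", target=app_thread_func)
    t.start()
    # signal shutdown from the main thread in a loop-safe way
    app_loop.call_soon_threadsafe(exit_event.set)
    t.join()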
{maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: maxframe
-Version: 1.0.0rc1
+Version: 1.0.0rc2
 Summary: MaxFrame operator-based data analyze framework
 Requires-Dist: numpy <2.0.0,>=1.19.0
 Requires-Dist: pandas >=1.0.0
@@ -102,3 +102,76 @@ License
 
 Licensed under the `Apache License
 2.0 <https://www.apache.org/licenses/LICENSE-2.0.html>`__.
+MaxCompute MaxFrame Client
+==========================
+
+MaxFrame is a computational framework created by Alibaba Cloud to
+provide a way for Python developers to parallelize their code with
+MaxCompute. It creates a runnable computation graph locally, submits it
+to MaxCompute to execute and obtains results from MaxCompute.
+
+MaxFrame client is the client of MaxFrame. Currently it provides a
+DataFrame-based SDK with compatible APIs for pandas. In future, other
+common Python libraries like numpy and scikit-learn will be added as
+well. Python 3.7 is recommended for MaxFrame client to enable all
+functionalities while supports for higher Python versions are on the
+way.
+
+Installation
+------------
+
+You may install MaxFrame client through PIP:
+
+.. code:: bash
+
+   pip install maxframe
+
+Latest beta version can be installed with ``--pre`` argument:
+
+.. code:: bash
+
+   pip install --pre maxframe
+
+You can also install MaxFrame client from source code:
+
+.. code:: bash
+
+   pip install git+https://github.com/aliyun/alibabacloud-odps-maxframe-client.git
+
+Getting started
+---------------
+
+We show a simple code example of MaxFrame client which read data from a
+MaxCompute table, performs some simple data transform and writes back
+into MaxCompute.
+
+.. code:: python
+
+   import maxframe.dataframe as md
+   import os
+   from maxframe import new_session
+   from odps import ODPS
+
+   o = ODPS(
+       os.getenv('ALIBABA_CLOUD_ACCESS_KEY_ID'),
+       os.getenv('ALIBABA_CLOUD_ACCESS_KEY_SECRET'),
+       project='your-default-project',
+       endpoint='your-end-point',
+   )
+   session = new_session(o)
+
+   df = md.read_odps_table("source_table")
+   df["A"] = "prefix_" + df["A"]
+   md.to_odps_table(df, "prefix_source_table")
+
+Documentation
+-------------
+
+Detailed documentations can be found
+`here <https://maxframe.readthedocs.io>`__.
+
+License
+-------
+
+Licensed under the `Apache License
+2.0 <https://www.apache.org/licenses/LICENSE-2.0.html>`__.