maxframe 1.0.0rc4__cp38-cp38-win32.whl → 1.1.0__cp38-cp38-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp38-win32.pyd +0 -0
- maxframe/config/config.py +3 -0
- maxframe/conftest.py +9 -2
- maxframe/core/graph/core.cp38-win32.pyd +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +24 -2
- maxframe/dataframe/datasource/read_odps_query.py +63 -34
- maxframe/dataframe/datasource/tests/test_datasource.py +59 -7
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +28 -40
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +46 -2
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +5 -1
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/arrow.py +30 -2
- maxframe/io/odpsio/schema.py +23 -5
- maxframe/io/odpsio/tableio.py +26 -110
- maxframe/io/odpsio/tests/test_schema.py +40 -0
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +27 -3
- maxframe/learn/contrib/__init__.py +3 -2
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/lib/mmh3.cp38-win32.pyd +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +7 -1
- maxframe/serialization/core.cp38-win32.pyd +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +24 -5
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +8 -1
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/tensor/__init__.py +19 -7
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +36 -8
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/RECORD +83 -72
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +18 -2
- maxframe_client/session/odps.py +23 -10
- maxframe_client/session/task.py +2 -24
- maxframe_client/session/tests/test_task.py +0 -4
- maxframe_client/tests/test_session.py +30 -10
- {maxframe-1.0.0rc4.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from typing import Any, Dict
|
|
15
|
+
|
|
16
|
+
from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
|
|
17
|
+
from .core import LLM
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class MultiModalLLM(LLM):
    """``LLM`` subclass whose ``generate`` is an unimplemented placeholder.

    The module-level ``generate`` helper dispatches to ``model.generate``,
    so concrete models are presumably expected to override this method
    (NOTE(review): confirm against the concrete model classes).
    """

    def generate(
        self,
        data,
        prompt_template: Dict[str, Any],
        params: Dict[str, Any] = None,
    ):
        """Placeholder entry point; always raises in this class.

        Parameters
        ----------
        data
            Input object handed over by the caller.
        prompt_template : Dict[str, Any]
            Prompt template description.
        params : Dict[str, Any], optional
            Extra generation parameters.

        Raises
        ------
        NotImplementedError
            Always, since this class provides no implementation.
        """
        raise NotImplementedError()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def generate(
    data,
    model: MultiModalLLM,
    prompt_template: Dict[str, Any],
    params: Dict[str, Any] = None,
):
    """Validate the arguments and delegate to ``model.generate``.

    Parameters
    ----------
    data
        Must be an instance of ``DATAFRAME_TYPE`` or ``SERIES_TYPE``.
    model : MultiModalLLM
        Model object that performs the generation.
    prompt_template : Dict[str, Any]
        Passed through to ``model.generate`` untouched.
    params : Dict[str, Any], optional
        Generation parameters; an empty dict is used when omitted.

    Returns
    -------
    Whatever ``model.generate(data, prompt_template, params)`` returns.

    Raises
    ------
    ValueError
        If ``data`` or ``model`` has an unsupported type.
    """
    is_frame_like = isinstance(data, DATAFRAME_TYPE) or isinstance(
        data, SERIES_TYPE
    )
    if not is_frame_like:
        raise ValueError("data must be a maxframe dataframe or series object")
    if not isinstance(model, MultiModalLLM):
        raise ValueError("model must be a MultiModalLLM object")

    if params is None:
        params = dict()
    # the model checks the parameters before any generation is attempted
    model.validate_params(params)
    return model.generate(data, prompt_template, params)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
from typing import Any, Dict
|
|
15
|
+
|
|
16
|
+
from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
|
|
17
|
+
from .core import LLM
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class TextLLM(LLM):
    """``LLM`` subclass whose ``generate`` is an unimplemented placeholder.

    The module-level ``generate`` helper dispatches to ``model.generate``,
    so concrete models are presumably expected to override this method
    (NOTE(review): confirm against the concrete model classes).
    """

    def generate(
        self,
        data,
        prompt_template: Dict[str, Any],
        params: Dict[str, Any] = None,
    ):
        """Placeholder entry point; always raises in this class.

        Parameters
        ----------
        data
            Input object handed over by the caller.
        prompt_template : Dict[str, Any]
            Prompt template description.
        params : Dict[str, Any], optional
            Extra generation parameters.

        Raises
        ------
        NotImplementedError
            Always, since this class provides no implementation.
        """
        raise NotImplementedError()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def generate(
    data,
    model: TextLLM,
    prompt_template: Dict[str, Any],
    params: Dict[str, Any] = None,
):
    """Validate the arguments and delegate to ``model.generate``.

    Parameters
    ----------
    data
        Must be an instance of ``DATAFRAME_TYPE`` or ``SERIES_TYPE``.
    model : TextLLM
        Model object that performs the generation.
    prompt_template : Dict[str, Any]
        Passed through to ``model.generate`` untouched.
    params : Dict[str, Any], optional
        Generation parameters; an empty dict is used when omitted.

    Returns
    -------
    Whatever ``model.generate(data, prompt_template, params)`` returns.

    Raises
    ------
    ValueError
        If ``data`` or ``model`` has an unsupported type.
    """
    is_frame_like = isinstance(data, DATAFRAME_TYPE) or isinstance(
        data, SERIES_TYPE
    )
    if not is_frame_like:
        raise ValueError("data must be a maxframe dataframe or series object")
    if not isinstance(model, TextLLM):
        raise ValueError("model must be a TextLLM object")

    if params is None:
        params = dict()
    # the model checks the parameters before any generation is attempted
    model.validate_params(params)
    return model.generate(data, prompt_template, params)
|
maxframe/lib/mmh3.cp38-win32.pyd
CHANGED
|
Binary file
|
|
@@ -55,13 +55,13 @@ def test_sparse_creation():
|
|
|
55
55
|
s = SparseNDArray(s1_data)
|
|
56
56
|
assert s.ndim == 2
|
|
57
57
|
assert isinstance(s, SparseMatrix)
|
|
58
|
-
assert_array_equal(s.toarray(), s1_data.
|
|
59
|
-
assert_array_equal(s.todense(), s1_data.
|
|
58
|
+
assert_array_equal(s.toarray(), s1_data.toarray())
|
|
59
|
+
assert_array_equal(s.todense(), s1_data.toarray())
|
|
60
60
|
|
|
61
61
|
ss = pickle.loads(pickle.dumps(s))
|
|
62
62
|
assert s == ss
|
|
63
|
-
assert_array_equal(ss.toarray(), s1_data.
|
|
64
|
-
assert_array_equal(ss.todense(), s1_data.
|
|
63
|
+
assert_array_equal(ss.toarray(), s1_data.toarray())
|
|
64
|
+
assert_array_equal(ss.todense(), s1_data.toarray())
|
|
65
65
|
|
|
66
66
|
v = SparseNDArray(v1, shape=(3,))
|
|
67
67
|
assert s.ndim
|
|
@@ -331,12 +331,12 @@ def test_sparse_dot():
|
|
|
331
331
|
|
|
332
332
|
assert_array_equal(mls.dot(s1, v1_s), s1.dot(v1_data))
|
|
333
333
|
assert_array_equal(mls.dot(s2, v1_s), s2.dot(v1_data))
|
|
334
|
-
assert_array_equal(mls.dot(v2_s, s1), v2_data.dot(s1_data.
|
|
335
|
-
assert_array_equal(mls.dot(v2_s, s2), v2_data.dot(s2_data.
|
|
334
|
+
assert_array_equal(mls.dot(v2_s, s1), v2_data.dot(s1_data.toarray()))
|
|
335
|
+
assert_array_equal(mls.dot(v2_s, s2), v2_data.dot(s2_data.toarray()))
|
|
336
336
|
assert_array_equal(mls.dot(v1_s, v1_s), v1_data.dot(v1_data), almost=True)
|
|
337
337
|
assert_array_equal(mls.dot(v2_s, v2_s), v2_data.dot(v2_data), almost=True)
|
|
338
338
|
|
|
339
|
-
assert_array_equal(mls.dot(v2_s, s1, sparse=False), v2_data.dot(s1_data.
|
|
339
|
+
assert_array_equal(mls.dot(v2_s, s1, sparse=False), v2_data.dot(s1_data.toarray()))
|
|
340
340
|
assert_array_equal(mls.dot(v1_s, v1_s, sparse=False), v1_data.dot(v1_data))
|
|
341
341
|
|
|
342
342
|
|
|
@@ -390,7 +390,7 @@ def test_sparse_fill_diagonal():
|
|
|
390
390
|
arr = SparseNDArray(s1)
|
|
391
391
|
arr.fill_diagonal(3)
|
|
392
392
|
|
|
393
|
-
expected = s1.copy().
|
|
393
|
+
expected = s1.copy().toarray()
|
|
394
394
|
np.fill_diagonal(expected, 3)
|
|
395
395
|
|
|
396
396
|
np.testing.assert_array_equal(arr.toarray(), expected)
|
|
@@ -399,7 +399,7 @@ def test_sparse_fill_diagonal():
|
|
|
399
399
|
arr = SparseNDArray(s1)
|
|
400
400
|
arr.fill_diagonal(3, wrap=True)
|
|
401
401
|
|
|
402
|
-
expected = s1.copy().
|
|
402
|
+
expected = s1.copy().toarray()
|
|
403
403
|
np.fill_diagonal(expected, 3, wrap=True)
|
|
404
404
|
|
|
405
405
|
np.testing.assert_array_equal(arr.toarray(), expected)
|
|
@@ -408,7 +408,7 @@ def test_sparse_fill_diagonal():
|
|
|
408
408
|
arr = SparseNDArray(s1)
|
|
409
409
|
arr.fill_diagonal([1, 2, 3])
|
|
410
410
|
|
|
411
|
-
expected = s1.copy().
|
|
411
|
+
expected = s1.copy().toarray()
|
|
412
412
|
np.fill_diagonal(expected, [1, 2, 3])
|
|
413
413
|
|
|
414
414
|
np.testing.assert_array_equal(arr.toarray(), expected)
|
|
@@ -417,7 +417,7 @@ def test_sparse_fill_diagonal():
|
|
|
417
417
|
arr = SparseNDArray(s1)
|
|
418
418
|
arr.fill_diagonal([1, 2, 3], wrap=True)
|
|
419
419
|
|
|
420
|
-
expected = s1.copy().
|
|
420
|
+
expected = s1.copy().toarray()
|
|
421
421
|
np.fill_diagonal(expected, [1, 2, 3], wrap=True)
|
|
422
422
|
|
|
423
423
|
np.testing.assert_array_equal(arr.toarray(), expected)
|
|
@@ -427,7 +427,7 @@ def test_sparse_fill_diagonal():
|
|
|
427
427
|
arr = SparseNDArray(s1)
|
|
428
428
|
arr.fill_diagonal(val)
|
|
429
429
|
|
|
430
|
-
expected = s1.copy().
|
|
430
|
+
expected = s1.copy().toarray()
|
|
431
431
|
np.fill_diagonal(expected, val)
|
|
432
432
|
|
|
433
433
|
np.testing.assert_array_equal(arr.toarray(), expected)
|
|
@@ -437,7 +437,7 @@ def test_sparse_fill_diagonal():
|
|
|
437
437
|
arr = SparseNDArray(s1)
|
|
438
438
|
arr.fill_diagonal(val, wrap=True)
|
|
439
439
|
|
|
440
|
-
expected = s1.copy().
|
|
440
|
+
expected = s1.copy().toarray()
|
|
441
441
|
np.fill_diagonal(expected, val, wrap=True)
|
|
442
442
|
|
|
443
443
|
np.testing.assert_array_equal(arr.toarray(), expected)
|
|
@@ -447,7 +447,7 @@ def test_sparse_fill_diagonal():
|
|
|
447
447
|
arr = SparseNDArray(s1)
|
|
448
448
|
arr.fill_diagonal(val)
|
|
449
449
|
|
|
450
|
-
expected = s1.copy().
|
|
450
|
+
expected = s1.copy().toarray()
|
|
451
451
|
np.fill_diagonal(expected, val)
|
|
452
452
|
|
|
453
453
|
np.testing.assert_array_equal(arr.toarray(), expected)
|
|
@@ -457,7 +457,7 @@ def test_sparse_fill_diagonal():
|
|
|
457
457
|
arr = SparseNDArray(s1)
|
|
458
458
|
arr.fill_diagonal(val, wrap=True)
|
|
459
459
|
|
|
460
|
-
expected = s1.copy().
|
|
460
|
+
expected = s1.copy().toarray()
|
|
461
461
|
np.fill_diagonal(expected, val, wrap=True)
|
|
462
462
|
|
|
463
463
|
np.testing.assert_array_equal(arr.toarray(), expected)
|
maxframe/opcodes.py
CHANGED
|
@@ -270,6 +270,7 @@ KURTOSIS = 351
|
|
|
270
270
|
SEM = 352
|
|
271
271
|
STR_CONCAT = 353
|
|
272
272
|
MAD = 354
|
|
273
|
+
MEDIAN = 355
|
|
273
274
|
|
|
274
275
|
# tensor operator
|
|
275
276
|
RESHAPE = 401
|
|
@@ -377,7 +378,6 @@ DROP_DUPLICATES = 728
|
|
|
377
378
|
MELT = 729
|
|
378
379
|
RENAME = 731
|
|
379
380
|
INSERT = 732
|
|
380
|
-
MAP_CHUNK = 733
|
|
381
381
|
CARTESIAN_CHUNK = 734
|
|
382
382
|
EXPLODE = 735
|
|
383
383
|
REPLACE = 736
|
|
@@ -392,6 +392,10 @@ PIVOT_TABLE = 744
|
|
|
392
392
|
|
|
393
393
|
FUSE = 801
|
|
394
394
|
|
|
395
|
+
# LLM
|
|
396
|
+
DASHSCOPE_TEXT_GENERATION = 810
|
|
397
|
+
DASHSCOPE_MULTI_MODAL_GENERATION = 811
|
|
398
|
+
|
|
395
399
|
# table like input for tensor
|
|
396
400
|
TABLE_COO = 1003
|
|
397
401
|
# store tensor as coo format
|
|
@@ -569,6 +573,8 @@ CHOLESKY_FUSE = 999988
|
|
|
569
573
|
# MaxFrame-dedicated functions
|
|
570
574
|
DATAFRAME_RESHUFFLE = 10001
|
|
571
575
|
FLATMAP = 10002
|
|
576
|
+
FLATJSON = 10003
|
|
577
|
+
APPLY_CHUNK = 10004
|
|
572
578
|
|
|
573
579
|
# MaxFrame internal operators
|
|
574
580
|
DATAFRAME_PROJECTION_SAME_INDEX_MERGE = 100001
|
|
Binary file
|
maxframe/serialization/core.pyx
CHANGED
|
@@ -37,7 +37,7 @@ from .._utils import NamedType
|
|
|
37
37
|
from .._utils cimport TypeDispatcher
|
|
38
38
|
|
|
39
39
|
from ..lib import wrapped_pickle as pickle
|
|
40
|
-
from ..utils import arrow_type_from_str
|
|
40
|
+
from ..utils import NoDefault, arrow_type_from_str, no_default
|
|
41
41
|
|
|
42
42
|
try:
|
|
43
43
|
from pandas import ArrowDtype
|
|
@@ -94,6 +94,7 @@ cdef:
|
|
|
94
94
|
int COMPLEX_SERIALIZER = 12
|
|
95
95
|
int SLICE_SERIALIZER = 13
|
|
96
96
|
int REGEX_SERIALIZER = 14
|
|
97
|
+
int NO_DEFAULT_SERIALIZER = 15
|
|
97
98
|
int PLACEHOLDER_SERIALIZER = 4096
|
|
98
99
|
|
|
99
100
|
|
|
@@ -803,6 +804,16 @@ cdef class RegexSerializer(Serializer):
|
|
|
803
804
|
return re.compile((<bytes>(subs[0])).decode(), serialized[0])
|
|
804
805
|
|
|
805
806
|
|
|
807
|
+
cdef class NoDefaultSerializer(Serializer):
    """Serializer registered under ``NO_DEFAULT_SERIALIZER``.

    ``serial`` emits no payload and ``deserial`` ignores its inputs,
    always handing back the ``no_default`` object.
    """
    serializer_id = NO_DEFAULT_SERIALIZER

    cpdef serial(self, object obj, dict context):
        # Neither header values nor sub-objects are produced for ``obj``.
        # NOTE(review): the trailing True presumably marks the result as
        # final, matching other serializers in this module -- confirm.
        header = []
        children = []
        return header, children, True

    cpdef deserial(self, list obj, dict context, list subs):
        # The serialized content is irrelevant: the result is always the
        # module-level ``no_default`` object.
        return no_default
|
|
815
|
+
|
|
816
|
+
|
|
806
817
|
cdef class Placeholder:
|
|
807
818
|
"""
|
|
808
819
|
Placeholder object to reduce duplicated serialization
|
|
@@ -857,6 +868,7 @@ DtypeSerializer.register(ExtensionDtype)
|
|
|
857
868
|
ComplexSerializer.register(complex)
|
|
858
869
|
SliceSerializer.register(slice)
|
|
859
870
|
RegexSerializer.register(re.Pattern)
|
|
871
|
+
NoDefaultSerializer.register(NoDefault)
|
|
860
872
|
PlaceholderSerializer.register(Placeholder)
|
|
861
873
|
|
|
862
874
|
|
maxframe/serialization/pandas.py
CHANGED
|
@@ -134,8 +134,10 @@ class ArraySerializer(Serializer):
|
|
|
134
134
|
data_parts = [obj.tolist()]
|
|
135
135
|
else:
|
|
136
136
|
data_parts = [obj.to_numpy().tolist()]
|
|
137
|
-
|
|
137
|
+
elif hasattr(obj, "_data"):
|
|
138
138
|
data_parts = [getattr(obj, "_data")]
|
|
139
|
+
else:
|
|
140
|
+
data_parts = [getattr(obj, "_pa_array")]
|
|
139
141
|
return [ser_type], [dtype] + data_parts, False
|
|
140
142
|
|
|
141
143
|
def deserial(self, serialized: List, context: Dict, subs: List):
|
|
@@ -155,38 +157,66 @@ class PdTimestampSerializer(Serializer):
|
|
|
155
157
|
else:
|
|
156
158
|
zone_info = []
|
|
157
159
|
ts = obj.to_pydatetime().timestamp()
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
)
|
|
160
|
+
elements = [int(ts), obj.microsecond, obj.nanosecond]
|
|
161
|
+
if hasattr(obj, "unit"):
|
|
162
|
+
elements.append(str(obj.unit))
|
|
163
|
+
return elements, zone_info, bool(zone_info)
|
|
163
164
|
|
|
164
165
|
def deserial(self, serialized: List, context: Dict, subs: List):
    """Rebuild a ``pd.Timestamp`` from its serialized elements.

    Parameters
    ----------
    serialized : List
        ``[epoch_seconds, microsecond, nanosecond]``, optionally followed
        by a unit string as the fourth element.
    context : Dict
        Deserialization context (not used here).
    subs : List
        Empty for naive timestamps; otherwise ``subs[0]`` is the time zone
        the result is converted to.

    Returns
    -------
    pd.Timestamp
        Time zone aware when ``subs`` is non-empty, naive otherwise.
    """
    has_tz = bool(subs)
    if has_tz:
        # Decode the epoch seconds as UTC wall-clock fields. This is the
        # non-deprecated equivalent of ``datetime.utcfromtimestamp``.
        pydt = datetime.datetime.fromtimestamp(
            serialized[0], datetime.timezone.utc
        )
    else:
        # Naive timestamps are decoded in local time.
        pydt = datetime.datetime.fromtimestamp(serialized[0])

    # Sub-second precision is carried separately from the epoch seconds.
    kwargs = {
        "year": pydt.year,
        "month": pydt.month,
        "day": pydt.day,
        "hour": pydt.hour,
        "minute": pydt.minute,
        "second": pydt.second,
        "microsecond": serialized[1],
        "nanosecond": serialized[2],
    }
    if has_tz:
        kwargs["tzinfo"] = datetime.timezone.utc
    # The unit is only present in payloads that recorded it.
    if len(serialized) > 3:
        kwargs["unit"] = serialized[3]

    val = pd.Timestamp(**kwargs)
    if has_tz:
        val = val.tz_convert(subs[0])
    return val
|
|
175
198
|
|
|
176
199
|
|
|
177
200
|
class PdTimedeltaSerializer(Serializer):
    """Serializer that stores a ``pd.Timedelta`` as its integer components."""

    def serial(self, obj: pd.Timedelta, context: Dict):
        """Return ``[seconds, microseconds, nanoseconds, days]`` plus the
        unit string when ``obj`` exposes a ``unit`` attribute."""
        parts = [int(obj.seconds), obj.microseconds, obj.nanoseconds, obj.days]
        # not every Timedelta object exposes ``unit``, hence the guard
        if hasattr(obj, "unit"):
            parts.append(str(obj.unit))
        return parts, [], True

    def deserial(self, serialized: List, context: Dict, subs: List):
        """Rebuild the ``pd.Timedelta`` from the serialized components.

        Payloads with fewer than four elements get ``days=0``; payloads with
        fewer than five elements carry no unit.
        """
        n_items = len(serialized)
        days = serialized[3] if n_items >= 4 else 0
        unit = serialized[4] if n_items >= 5 else None
        seconds, microseconds, nanoseconds = serialized[:3]

        td_kwargs = dict(
            days=days,
            seconds=seconds,
            microseconds=microseconds,
            nanoseconds=nanoseconds,
        )
        if unit is not None:
            td_kwargs["unit"] = unit
        return pd.Timedelta(**td_kwargs)
|
|
190
220
|
|
|
191
221
|
|
|
192
222
|
class NoDefaultSerializer(Serializer):
|
|
@@ -19,6 +19,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type
|
|
|
19
19
|
import msgpack
|
|
20
20
|
|
|
21
21
|
from ...lib.mmh3 import hash
|
|
22
|
+
from ...utils import no_default
|
|
22
23
|
from ..core import Placeholder, Serializer, buffered, load_type
|
|
23
24
|
from .field import Field
|
|
24
25
|
from .field_type import DictType, ListType, PrimitiveFieldType, TupleType
|
|
@@ -211,6 +212,22 @@ class _NoFieldValue:
|
|
|
211
212
|
_no_field_value = _NoFieldValue()
|
|
212
213
|
|
|
213
214
|
|
|
215
|
+
def _to_primitive_placeholder(v: Any) -> Any:
    """Return ``{}`` for the ``_no_field_value`` / ``no_default`` sentinels.

    Every other value is returned unchanged. The check is by identity.
    """
    is_sentinel = v is _no_field_value or v is no_default
    return {} if is_sentinel else v
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _restore_primitive_placeholder(v: Any) -> Any:
    """Turn an empty plain ``dict`` back into ``_no_field_value``.

    Only objects whose exact type is ``dict`` are considered; non-empty
    dicts and all other values are returned unchanged.
    """
    if type(v) is dict and not v:
        return _no_field_value
    return v
|
|
229
|
+
|
|
230
|
+
|
|
214
231
|
class SerializableSerializer(Serializer):
|
|
215
232
|
"""
|
|
216
233
|
Leverage DictSerializer to perform serde.
|
|
@@ -241,9 +258,7 @@ class SerializableSerializer(Serializer):
|
|
|
241
258
|
else:
|
|
242
259
|
primitive_vals = self._get_field_values(obj, obj._PRIMITIVE_FIELDS)
|
|
243
260
|
# replace _no_field_value as {} to make them msgpack-serializable
|
|
244
|
-
primitive_vals = [
|
|
245
|
-
v if v is not _no_field_value else {} for v in primitive_vals
|
|
246
|
-
]
|
|
261
|
+
primitive_vals = [_to_primitive_placeholder(v) for v in primitive_vals]
|
|
247
262
|
if obj._cache_primitive_serial:
|
|
248
263
|
primitive_vals = msgpack.dumps(primitive_vals)
|
|
249
264
|
_primitive_serial_cache[obj] = primitive_vals
|
|
@@ -311,7 +326,9 @@ class SerializableSerializer(Serializer):
|
|
|
311
326
|
cls_fields = server_fields[server_field_num : field_num + count]
|
|
312
327
|
cls_values = values[field_num : field_num + count]
|
|
313
328
|
for field, value in zip(cls_fields, cls_values):
|
|
314
|
-
if
|
|
329
|
+
if is_primitive:
|
|
330
|
+
value = _restore_primitive_placeholder(value)
|
|
331
|
+
if not is_primitive or value is not _no_field_value:
|
|
315
332
|
cls._set_field_value(obj, field, value)
|
|
316
333
|
field_num += count
|
|
317
334
|
try:
|
|
@@ -356,7 +373,9 @@ class SerializableSerializer(Serializer):
|
|
|
356
373
|
server_fields + deprecated_fields, key=lambda f: f.name
|
|
357
374
|
)
|
|
358
375
|
for field, value in zip(server_fields, values):
|
|
359
|
-
if
|
|
376
|
+
if is_primitive:
|
|
377
|
+
value = _restore_primitive_placeholder(value)
|
|
378
|
+
if not is_primitive or value is not _no_field_value:
|
|
360
379
|
try:
|
|
361
380
|
cls._set_field_value(obj, field, value)
|
|
362
381
|
except AttributeError: # pragma: no cover
|
|
@@ -46,6 +46,9 @@ class PrimitiveType(Enum):
|
|
|
46
46
|
complex128 = 25
|
|
47
47
|
|
|
48
48
|
|
|
49
|
+
_np_unicode = np.unicode_ if hasattr(np, "unicode_") else np.str_
|
|
50
|
+
|
|
51
|
+
|
|
49
52
|
_primitive_type_to_valid_types = {
|
|
50
53
|
PrimitiveType.bool: (bool, np.bool_),
|
|
51
54
|
PrimitiveType.int8: (int, np.int8),
|
|
@@ -60,7 +63,7 @@ _primitive_type_to_valid_types = {
|
|
|
60
63
|
PrimitiveType.float32: (float, np.float32),
|
|
61
64
|
PrimitiveType.float64: (float, np.float64),
|
|
62
65
|
PrimitiveType.bytes: (bytes, np.bytes_),
|
|
63
|
-
PrimitiveType.string: (str,
|
|
66
|
+
PrimitiveType.string: (str, _np_unicode),
|
|
64
67
|
PrimitiveType.complex64: (complex, np.complex64),
|
|
65
68
|
PrimitiveType.complex128: (complex, np.complex128),
|
|
66
69
|
}
|
|
@@ -21,6 +21,7 @@ import pytest
|
|
|
21
21
|
|
|
22
22
|
from ....core import EntityData
|
|
23
23
|
from ....lib.wrapped_pickle import switch_unpickle
|
|
24
|
+
from ....utils import no_default
|
|
24
25
|
from ... import deserialize, serialize
|
|
25
26
|
from .. import (
|
|
26
27
|
AnyField,
|
|
@@ -143,6 +144,7 @@ class MySerializable(Serializable):
|
|
|
143
144
|
oneof1_val=f"{__name__}.MySerializable",
|
|
144
145
|
oneof2_val=MySimpleSerializable,
|
|
145
146
|
)
|
|
147
|
+
_no_default_val = Float64Field("no_default_val", default=no_default)
|
|
146
148
|
|
|
147
149
|
|
|
148
150
|
@pytest.mark.parametrize("set_is_ci", [False, True], indirect=True)
|
|
@@ -187,6 +189,7 @@ def test_serializable(set_is_ci):
|
|
|
187
189
|
_dict_val={"a": b"bytes_value"},
|
|
188
190
|
_ref_val=MySerializable(),
|
|
189
191
|
_oneof_val=MySerializable(_id="2"),
|
|
192
|
+
_no_default_val=no_default,
|
|
190
193
|
)
|
|
191
194
|
|
|
192
195
|
header, buffers = serialize(my_serializable)
|
|
@@ -234,7 +237,11 @@ def _assert_serializable_eq(my_serializable, my_serializable2):
|
|
|
234
237
|
if not hasattr(my_serializable, field.name):
|
|
235
238
|
continue
|
|
236
239
|
expect_value = getattr(my_serializable, field_name)
|
|
237
|
-
|
|
240
|
+
if expect_value is no_default:
|
|
241
|
+
assert not hasattr(my_serializable2, field.name)
|
|
242
|
+
continue
|
|
243
|
+
else:
|
|
244
|
+
actual_value = getattr(my_serializable2, field_name)
|
|
238
245
|
if isinstance(expect_value, np.ndarray):
|
|
239
246
|
np.testing.assert_array_equal(expect_value, actual_value)
|
|
240
247
|
elif isinstance(expect_value, pd.DataFrame):
|
|
@@ -42,7 +42,7 @@ except ImportError:
|
|
|
42
42
|
from ...lib.sparse import SparseMatrix
|
|
43
43
|
from ...lib.wrapped_pickle import switch_unpickle
|
|
44
44
|
from ...tests.utils import require_cudf, require_cupy
|
|
45
|
-
from ...utils import lazy_import
|
|
45
|
+
from ...utils import lazy_import, no_default
|
|
46
46
|
from .. import (
|
|
47
47
|
PickleContainer,
|
|
48
48
|
RemoteException,
|
|
@@ -90,6 +90,7 @@ class CustomNamedTuple(NamedTuple):
|
|
|
90
90
|
pd.Timedelta(102.234154131),
|
|
91
91
|
{"abc": 5.6, "def": [3.4], "gh": None, "ijk": {}},
|
|
92
92
|
OrderedDict([("abcd", 5.6)]),
|
|
93
|
+
no_default,
|
|
93
94
|
],
|
|
94
95
|
)
|
|
95
96
|
@switch_unpickle
|
maxframe/tensor/__init__.py
CHANGED
|
@@ -191,11 +191,6 @@ from .ufunc import ufunc
|
|
|
191
191
|
# isort: off
|
|
192
192
|
# noinspection PyUnresolvedReferences
|
|
193
193
|
from numpy import (
|
|
194
|
-
NAN,
|
|
195
|
-
NINF,
|
|
196
|
-
AxisError,
|
|
197
|
-
Inf,
|
|
198
|
-
NaN,
|
|
199
194
|
e,
|
|
200
195
|
errstate,
|
|
201
196
|
geterr,
|
|
@@ -206,12 +201,21 @@ from numpy import (
|
|
|
206
201
|
seterr,
|
|
207
202
|
)
|
|
208
203
|
|
|
204
|
+
try:
|
|
205
|
+
from numpy.exceptions import AxisError
|
|
206
|
+
except ImportError:
|
|
207
|
+
from numpy import AxisError
|
|
208
|
+
|
|
209
|
+
NAN = nan
|
|
210
|
+
NINF = -inf
|
|
211
|
+
Inf = inf
|
|
212
|
+
NaN = nan
|
|
213
|
+
|
|
209
214
|
# import numpy types
|
|
210
215
|
# noinspection PyUnresolvedReferences
|
|
211
216
|
from numpy import (
|
|
212
217
|
bool_ as bool,
|
|
213
218
|
bytes_,
|
|
214
|
-
cfloat,
|
|
215
219
|
character,
|
|
216
220
|
complex64,
|
|
217
221
|
complex128,
|
|
@@ -242,9 +246,17 @@ from numpy import (
|
|
|
242
246
|
uint16,
|
|
243
247
|
uint32,
|
|
244
248
|
uint64,
|
|
245
|
-
unicode_,
|
|
246
249
|
unsignedinteger,
|
|
247
250
|
void,
|
|
248
251
|
)
|
|
249
252
|
|
|
253
|
+
try:
|
|
254
|
+
from numpy import cfloat
|
|
255
|
+
except ImportError:
|
|
256
|
+
from numpy import cdouble as cfloat
|
|
257
|
+
try:
|
|
258
|
+
from numpy import str_ as unicode_
|
|
259
|
+
except ImportError:
|
|
260
|
+
from numpy import unicode_
|
|
261
|
+
|
|
250
262
|
del fetch, ufunc
|
maxframe/tests/utils.py
CHANGED
|
@@ -18,11 +18,13 @@ import hashlib
|
|
|
18
18
|
import os
|
|
19
19
|
import queue
|
|
20
20
|
import socket
|
|
21
|
+
import time
|
|
21
22
|
import types
|
|
22
23
|
from threading import Thread
|
|
23
24
|
from typing import Dict, List, Optional, Set, Tuple
|
|
24
25
|
|
|
25
26
|
import pytest
|
|
27
|
+
from odps import ODPS
|
|
26
28
|
from tornado import netutil
|
|
27
29
|
|
|
28
30
|
from ..core import Tileable, TileableGraph
|
|
@@ -171,3 +173,17 @@ def get_test_unique_name(size=None):
|
|
|
171
173
|
if size:
|
|
172
174
|
digest = digest[:size]
|
|
173
175
|
return digest + "_" + str(os.getpid())
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def assert_mf_index_dtype(idx_obj, dtype):
    """Assert that ``idx_obj`` is an ``IndexValue.IndexBase`` with ``dtype``.

    Raises ``AssertionError`` when the object has a different type or its
    ``dtype`` attribute does not compare equal to ``dtype``.
    """
    # imported inside the function, as in the original helper
    from ..dataframe.core import IndexValue

    assert isinstance(idx_obj, IndexValue.IndexBase)
    assert idx_obj.dtype == dtype
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def ensure_table_deleted(odps_entry: ODPS, table_name: str) -> None:
    """Wait until ``table_name`` no longer exists, then assert that it is gone.

    Polls ``odps_entry.exist_table`` with a one-second pause between checks,
    sleeping at most 20 times, and raises ``AssertionError`` if the table
    still exists afterwards.
    """
    attempts_left = 20
    while True:
        still_there = odps_entry.exist_table(table_name)
        if not still_there or attempts_left <= 0:
            break
        time.sleep(1)
        attempts_left -= 1
    assert not odps_entry.exist_table(table_name)
|
maxframe/udf.py
CHANGED
|
@@ -19,6 +19,7 @@ from odps.models import Resource
|
|
|
19
19
|
|
|
20
20
|
from .serialization.serializables import (
|
|
21
21
|
BoolField,
|
|
22
|
+
DictField,
|
|
22
23
|
FieldTypes,
|
|
23
24
|
FunctionField,
|
|
24
25
|
ListField,
|
|
@@ -54,6 +55,10 @@ class MarkedFunction(Serializable):
|
|
|
54
55
|
func = FunctionField("func")
|
|
55
56
|
resources = ListField("resources", FieldTypes.string, default_factory=list)
|
|
56
57
|
pythonpacks = ListField("pythonpacks", FieldTypes.reference, default_factory=list)
|
|
58
|
+
expect_engine = StringField("expect_engine", default=None)
|
|
59
|
+
expect_resources = DictField(
|
|
60
|
+
"expect_resources", FieldTypes.string, default_factory=dict
|
|
61
|
+
)
|
|
57
62
|
|
|
58
63
|
def __init__(self, func: Optional[Callable] = None, **kw):
|
|
59
64
|
super().__init__(func=func, **kw)
|
|
@@ -120,6 +125,28 @@ def with_python_requirements(
|
|
|
120
125
|
return func_wrapper
|
|
121
126
|
|
|
122
127
|
|
|
128
|
+
def with_running_options(
    *,
    engine: Optional[str] = None,
    cpu: Optional[int] = None,
    memory: Optional[int] = None,
    **kwargs,
):
    """Build a decorator that attaches running options to a function.

    Parameters
    ----------
    engine : str, optional
        Expected engine name; stored upper-cased. Empty strings count as
        "not specified".
    cpu : int, optional
        Expected CPU resource.
    memory : int, optional
        Expected memory resource.
    **kwargs
        Additional resource expectations, stored next to ``cpu`` and
        ``memory`` in ``expect_resources``.

    Returns
    -------
    Callable
        A decorator. When no option is specified at all it returns the
        function unchanged. Otherwise it returns a ``MarkedFunction`` with
        ``expect_engine`` and ``expect_resources`` set; an existing
        ``MarkedFunction`` is updated in place and returned.
    """
    engine = engine.upper() if engine else None
    resources = {"cpu": cpu, "memory": memory, **kwargs}
    # Resources passed only through **kwargs also count as options; looking
    # at cpu/memory alone would silently discard them.
    has_options = engine is not None or any(
        v is not None for v in resources.values()
    )

    def func_wrapper(func):
        if not has_options:
            return func
        # Each marked function gets its own copy, so mutating one function's
        # expectations cannot leak into another decorated by this wrapper.
        expected = dict(resources)
        if isinstance(func, MarkedFunction):
            func.expect_engine = engine
            func.expect_resources = expected
            return func
        return MarkedFunction(func, expect_engine=engine, expect_resources=expected)

    return func_wrapper
|
|
148
|
+
|
|
149
|
+
|
|
123
150
|
with_resource_libraries = with_resources
|
|
124
151
|
|
|
125
152
|
|