maxframe-1.0.0rc3-cp38-cp38-macosx_10_9_universal2.whl → maxframe-1.1.0-cp38-cp38-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- maxframe/_utils.cpython-38-darwin.so +0 -0
- maxframe/codegen.py +1 -0
- maxframe/config/config.py +16 -1
- maxframe/conftest.py +52 -14
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/graph/core.cpython-38-darwin.so +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +26 -2
- maxframe/dataframe/datasource/read_odps_query.py +116 -28
- maxframe/dataframe/datasource/read_odps_table.py +3 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
- maxframe/dataframe/datastore/to_odps.py +7 -0
- maxframe/dataframe/extensions/__init__.py +8 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +314 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +23 -2
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/arrow.py +30 -2
- maxframe/io/odpsio/schema.py +28 -8
- maxframe/io/odpsio/tableio.py +55 -133
- maxframe/io/odpsio/tests/test_schema.py +40 -4
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +36 -6
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/xgboost/classifier.py +3 -3
- maxframe/learn/contrib/xgboost/predict.py +8 -39
- maxframe/learn/contrib/xgboost/train.py +4 -3
- maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +10 -1
- maxframe/protocol.py +6 -1
- maxframe/serialization/core.cpython-38-darwin.so +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +24 -5
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +8 -1
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/concatenate.py +23 -20
- maxframe/tensor/merge/vstack.py +5 -1
- maxframe/tensor/misc/transpose.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +64 -14
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +28 -10
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/odps.py +104 -20
- maxframe_client/session/task.py +42 -26
- maxframe_client/session/tests/test_task.py +0 -4
- maxframe_client/tests/test_session.py +44 -12
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
maxframe/learn/contrib/llm/multi_modal.py
ADDED
@@ -0,0 +1,42 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict
+
+from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
+from .core import LLM
+
+
+class MultiModalLLM(LLM):
+    def generate(
+        self,
+        data,
+        prompt_template: Dict[str, Any],
+        params: Dict[str, Any] = None,
+    ):
+        raise NotImplementedError
+
+
+def generate(
+    data,
+    model: MultiModalLLM,
+    prompt_template: Dict[str, Any],
+    params: Dict[str, Any] = None,
+):
+    if not isinstance(data, DATAFRAME_TYPE) and not isinstance(data, SERIES_TYPE):
+        raise ValueError("data must be a maxframe dataframe or series object")
+    if not isinstance(model, MultiModalLLM):
+        raise ValueError("model must be a MultiModalLLM object")
+    params = params if params is not None else dict()
+    model.validate_params(params)
+    return model.generate(data, prompt_template, params)
maxframe/learn/contrib/llm/text.py
ADDED
@@ -0,0 +1,42 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict
+
+from ....dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
+from .core import LLM
+
+
+class TextLLM(LLM):
+    def generate(
+        self,
+        data,
+        prompt_template: Dict[str, Any],
+        params: Dict[str, Any] = None,
+    ):
+        raise NotImplementedError
+
+
+def generate(
+    data,
+    model: TextLLM,
+    prompt_template: Dict[str, Any],
+    params: Dict[str, Any] = None,
+):
+    if not isinstance(data, DATAFRAME_TYPE) and not isinstance(data, SERIES_TYPE):
+        raise ValueError("data must be a maxframe dataframe or series object")
+    if not isinstance(model, TextLLM):
+        raise ValueError("model must be a TextLLM object")
+    params = params if params is not None else dict()
+    model.validate_params(params)
+    return model.generate(data, prompt_template, params)
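Both new modules follow the same dispatch shape: a module-level `generate` validates that `data` is a MaxFrame DataFrame or Series and that `model` is the right LLM subclass, normalizes `params`, then delegates to the model. A minimal usage sketch under stated assumptions: `EchoLLM` is a hypothetical stand-in (a real model would come from `models/dashscope.py`), and the base-class constructor is assumed to take no required arguments.

from typing import Any, Dict

import maxframe.dataframe as md
from maxframe.learn.contrib.llm.text import TextLLM, generate


class EchoLLM(TextLLM):
    # hypothetical model: echoes its input instead of calling a service
    def validate_params(self, params: Dict[str, Any]) -> None:
        pass

    def generate(self, data, prompt_template, params):
        return data


df = md.DataFrame({"question": ["What is MaxFrame?"]})
# passes both isinstance guards, then dispatches to EchoLLM.generate
result = generate(df, EchoLLM(), prompt_template={"content": "{question}"})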
maxframe/learn/contrib/xgboost/classifier.py
CHANGED
@@ -14,7 +14,8 @@
 
 import numpy as np
 
-from ....tensor import argmax, transpose
+from ....tensor import argmax, transpose
+from ....tensor.merge.vstack import _vstack
 from ..utils import make_import_error_func
 from .core import XGBScikitLearnBase, xgboost
 
@@ -89,7 +90,6 @@ else:
            if ntree_limit is not None:
                raise NotImplementedError("ntree_limit is not currently supported")
            prediction = predict(self.get_booster(), data, flag=flag, **kw)
-
            if len(prediction.shape) == 2 and prediction.shape[1] == self.n_classes_:
                # multi-class
                return prediction
@@ -103,7 +103,7 @@ else:
            # binary logistic function
            classone_probs = prediction
            classzero_probs = 1.0 - classone_probs
-            return transpose(
+            return transpose(_vstack((classzero_probs, classone_probs)))
 
        @property
        def classes_(self) -> np.ndarray:
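For a binary objective the booster emits a single vector of positive-class probabilities; stacking it against its complement and transposing produces the `(n_samples, 2)` matrix that `predict_proba` callers expect. The same arithmetic in plain NumPy, standing in for the tensor-level `_vstack`/`transpose` above:

import numpy as np

# 1-D vector of P(class=1) from a binary logistic model
classone_probs = np.array([0.9, 0.2, 0.6])
classzero_probs = 1.0 - classone_probs

# vstack -> (2, n); transpose -> (n, 2), one row per sample
proba = np.transpose(np.vstack((classzero_probs, classone_probs)))
assert proba.shape == (3, 2)
assert np.allclose(proba.sum(axis=1), 1.0)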
maxframe/learn/contrib/xgboost/predict.py
CHANGED
@@ -14,20 +14,18 @@
 
 
 import numpy as np
-import pandas as pd
 
 from .... import opcodes
 from ....core.entity.output_types import OutputType
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
-from ....dataframe.utils import parse_index
 from ....serialization.serializables import (
     BoolField,
     KeyField,
     ReferenceField,
     TupleField,
 )
-from ....tensor.core import
+from ....tensor.core import TensorOrder
 from .core import BoosterData
 from .dmatrix import check_data
 
@@ -65,35 +63,12 @@ class XGBPredict(Operator, TileableOperatorMixin):
         else:
             shape = (self.data.shape[0],)
         inputs = [self.data, self.model]
-
-
-
-
-
-
-                order=TensorOrder.C_ORDER,
-            )
-        elif self.output_types[0] == OutputType.dataframe:
-            # dataframe
-            dtypes = pd.DataFrame(
-                np.random.rand(0, num_class), dtype=self.output_dtype
-            ).dtypes
-            return self.new_tileable(
-                inputs,
-                shape=shape,
-                dtypes=dtypes,
-                columns_value=parse_index(dtypes.index),
-                index_value=self.data.index_value,
-            )
-        else:
-            # series
-            return self.new_tileable(
-                inputs,
-                shape=shape,
-                index_value=self.data.index_value,
-                name="predictions",
-                dtype=self.output_dtype,
-            )
+        return self.new_tileable(
+            inputs,
+            shape=shape,
+            dtype=self.output_dtype,
+            order=TensorOrder.C_ORDER,
+        )
 
 
 def predict(
@@ -124,13 +99,7 @@ def predict(
     data = check_data(data)
     # TODO: check model datatype
 
-
-    if isinstance(data, TENSOR_TYPE):
-        output_types = [OutputType.tensor]
-    elif num_class is not None:
-        output_types = [OutputType.dataframe]
-    else:
-        output_types = [OutputType.series]
+    output_types = [OutputType.tensor]
 
     iteration_range = iteration_range or (0, 0)
 
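With the dataframe and series branches removed, `XGBPredict` always declares one tensor output: shape `(n, num_class)` for multi-class models, `(n,)` otherwise, in C order. A small NumPy sketch of just those resulting shapes (illustration only, not the MaxFrame API):

import numpy as np

n_samples, num_class = 4, 3
multi = np.zeros((n_samples, num_class), order="C")  # multi-class output
flat = np.zeros((n_samples,), order="C")             # binary/regression output
assert multi.shape == (4, 3) and flat.shape == (4,)
assert multi.flags["C_CONTIGUOUS"]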
maxframe/learn/contrib/xgboost/train.py
CHANGED
@@ -102,7 +102,7 @@ def train(params, dtrain, evals=None, evals_result=None, num_class=None, **kwarg
     Parameters
     ----------
     Parameters are the same as `xgboost.train`. Note that train is an eager-execution
-    API
+    API if evals is passed, thus the call will be blocked until training finished.
 
     Returns
     -------
@@ -121,11 +121,12 @@ def train(params, dtrain, evals=None, evals_result=None, num_class=None, **kwarg
             processed_evals.append((eval_dmatrix, name))
         else:
             processed_evals.append((to_dmatrix(eval_dmatrix), name))
-
+    data = XGBTrain(
         params=params,
         dtrain=dtrain,
         evals=processed_evals,
         evals_result=evals_result,
         num_class=num_class,
         **kwargs,
-    )(evals_result)
+    )(evals_result)
+    return data.execute(session=session, **run_kwargs) if evals else data
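The new last line makes the execution policy explicit: the result is executed eagerly only when `evals` is supplied, otherwise the lazy tileable is returned. A self-contained sketch of that return pattern; `LazyResult` and `train_like` are illustrative names, not MaxFrame API:

class LazyResult:
    # stand-in for the tileable produced by XGBTrain
    def execute(self, **run_kwargs):
        print("blocking until training finishes")
        return self


def train_like(data, evals=None, **run_kwargs):
    # mirror of the diff: eager only when evals is provided
    return data.execute(**run_kwargs) if evals else data


deferred = train_like(LazyResult())                    # stays lazy
finished = train_like(LazyResult(), evals=[object()])  # executes now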
maxframe/lib/mmh3.cpython-38-darwin.so
CHANGED
Binary file
maxframe/lib/sparse/tests/test_sparse.py
CHANGED
@@ -55,13 +55,13 @@ def test_sparse_creation():
     s = SparseNDArray(s1_data)
     assert s.ndim == 2
     assert isinstance(s, SparseMatrix)
-    assert_array_equal(s.toarray(), s1_data.
-    assert_array_equal(s.todense(), s1_data.
+    assert_array_equal(s.toarray(), s1_data.toarray())
+    assert_array_equal(s.todense(), s1_data.toarray())
 
     ss = pickle.loads(pickle.dumps(s))
     assert s == ss
-    assert_array_equal(ss.toarray(), s1_data.
-    assert_array_equal(ss.todense(), s1_data.
+    assert_array_equal(ss.toarray(), s1_data.toarray())
+    assert_array_equal(ss.todense(), s1_data.toarray())
 
     v = SparseNDArray(v1, shape=(3,))
     assert s.ndim
@@ -331,12 +331,12 @@ def test_sparse_dot():
 
     assert_array_equal(mls.dot(s1, v1_s), s1.dot(v1_data))
     assert_array_equal(mls.dot(s2, v1_s), s2.dot(v1_data))
-    assert_array_equal(mls.dot(v2_s, s1), v2_data.dot(s1_data.
-    assert_array_equal(mls.dot(v2_s, s2), v2_data.dot(s2_data.
+    assert_array_equal(mls.dot(v2_s, s1), v2_data.dot(s1_data.toarray()))
+    assert_array_equal(mls.dot(v2_s, s2), v2_data.dot(s2_data.toarray()))
     assert_array_equal(mls.dot(v1_s, v1_s), v1_data.dot(v1_data), almost=True)
     assert_array_equal(mls.dot(v2_s, v2_s), v2_data.dot(v2_data), almost=True)
 
-    assert_array_equal(mls.dot(v2_s, s1, sparse=False), v2_data.dot(s1_data.
+    assert_array_equal(mls.dot(v2_s, s1, sparse=False), v2_data.dot(s1_data.toarray()))
     assert_array_equal(mls.dot(v1_s, v1_s, sparse=False), v1_data.dot(v1_data))
 
 
@@ -390,7 +390,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(3)
 
-    expected = s1.copy().
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, 3)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -399,7 +399,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(3, wrap=True)
 
-    expected = s1.copy().
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, 3, wrap=True)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -408,7 +408,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal([1, 2, 3])
 
-    expected = s1.copy().
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, [1, 2, 3])
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -417,7 +417,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal([1, 2, 3], wrap=True)
 
-    expected = s1.copy().
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, [1, 2, 3], wrap=True)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -427,7 +427,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(val)
 
-    expected = s1.copy().
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, val)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -437,7 +437,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(val, wrap=True)
 
-    expected = s1.copy().
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, val, wrap=True)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -447,7 +447,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(val)
 
-    expected = s1.copy().
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, val)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
@@ -457,7 +457,7 @@ def test_sparse_fill_diagonal():
     arr = SparseNDArray(s1)
     arr.fill_diagonal(val, wrap=True)
 
-    expected = s1.copy().
+    expected = s1.copy().toarray()
     np.fill_diagonal(expected, val, wrap=True)
 
     np.testing.assert_array_equal(arr.toarray(), expected)
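Every hunk in this file replaces a truncated accessor on the underlying sparse data with an explicit `toarray()` call. In scipy, `toarray()` returns a plain `numpy.ndarray` while `todense()` returns the legacy `numpy.matrix`, so normalizing the expected values on `toarray()` keeps the comparisons ndarray-typed; a standalone illustration, assuming scipy-backed matrices as `SparseMatrix` suggests:

import numpy as np
import scipy.sparse as sps

m = sps.csr_matrix(np.eye(3))
assert isinstance(m.toarray(), np.ndarray)  # plain ndarray
assert isinstance(m.todense(), np.matrix)   # legacy matrix type
np.testing.assert_array_equal(m.toarray(), np.asarray(m.todense()))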
maxframe/opcodes.py
CHANGED
@@ -270,6 +270,7 @@ KURTOSIS = 351
 SEM = 352
 STR_CONCAT = 353
 MAD = 354
+MEDIAN = 355
 
 # tensor operator
 RESHAPE = 401
@@ -377,7 +378,6 @@ DROP_DUPLICATES = 728
 MELT = 729
 RENAME = 731
 INSERT = 732
-MAP_CHUNK = 733
 CARTESIAN_CHUNK = 734
 EXPLODE = 735
 REPLACE = 736
@@ -392,6 +392,10 @@ PIVOT_TABLE = 744
 
 FUSE = 801
 
+# LLM
+DASHSCOPE_TEXT_GENERATION = 810
+DASHSCOPE_MULTI_MODAL_GENERATION = 811
+
 # table like input for tensor
 TABLE_COO = 1003
 # store tensor as coo format
@@ -532,6 +536,8 @@ STATSMODELS_TRAIN = 3012
 STATSMODELS_PREDICT = 3013
 
 # learn
+CONNECTED_COMPONENTS = 3100
+
 # checks
 CHECK_NON_NEGATIVE = 3300
 # classifier check targets
@@ -566,6 +572,9 @@ CHOLESKY_FUSE = 999988
 
 # MaxFrame-dedicated functions
 DATAFRAME_RESHUFFLE = 10001
+FLATMAP = 10002
+FLATJSON = 10003
+APPLY_CHUNK = 10004
 
 # MaxFrame internal operators
 DATAFRAME_PROJECTION_SAME_INDEX_MERGE = 100001
maxframe/protocol.py
CHANGED
@@ -375,6 +375,11 @@ class ExecuteDagRequest(Serializable):
         value_type=FieldTypes.reference,
         default=None,
     )
+    new_settings: Dict[str, Any] = DictField(
+        "new_settings",
+        key_type=FieldTypes.string,
+        default=None,
+    )
 
 
 class SubDagSubmitInstanceInfo(JsonSerializable):
@@ -511,7 +516,7 @@ class DataFrameTableMeta(JsonSerializable):
         return True
 
     def to_json(self) -> dict:
-        b64_pk = lambda x: base64.b64encode(pickle.dumps(x))
+        b64_pk = lambda x: base64.b64encode(pickle.dumps(x)).decode()
         ret = {
             "table_name": self.table_name,
             "type": self.type.value,
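The `to_json` change fixes a type mismatch: `base64.b64encode` returns `bytes`, which the standard `json` encoder rejects, so the helper now decodes to `str` before the value is placed in the JSON payload. A minimal reproduction:

import base64
import json
import pickle

raw = base64.b64encode(pickle.dumps({"x": 1}))
try:
    json.dumps({"meta": raw})  # TypeError: bytes are not JSON serializable
except TypeError as exc:
    print(exc)

json.dumps({"meta": raw.decode()})  # base64 output is ASCII-safe as str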
maxframe/serialization/core.cpython-38-darwin.so
CHANGED
Binary file
maxframe/serialization/core.pyx
CHANGED
@@ -37,7 +37,7 @@ from .._utils import NamedType
 from .._utils cimport TypeDispatcher
 
 from ..lib import wrapped_pickle as pickle
-from ..utils import arrow_type_from_str
+from ..utils import NoDefault, arrow_type_from_str, no_default
 
 try:
     from pandas import ArrowDtype
@@ -94,6 +94,7 @@ cdef:
     int COMPLEX_SERIALIZER = 12
     int SLICE_SERIALIZER = 13
     int REGEX_SERIALIZER = 14
+    int NO_DEFAULT_SERIALIZER = 15
     int PLACEHOLDER_SERIALIZER = 4096
 
 
@@ -803,6 +804,16 @@ cdef class RegexSerializer(Serializer):
         return re.compile((<bytes>(subs[0])).decode(), serialized[0])
 
 
+cdef class NoDefaultSerializer(Serializer):
+    serializer_id = NO_DEFAULT_SERIALIZER
+
+    cpdef serial(self, object obj, dict context):
+        return [], [], True
+
+    cpdef deserial(self, list obj, dict context, list subs):
+        return no_default
+
+
 cdef class Placeholder:
     """
     Placeholder object to reduce duplicated serialization
@@ -857,6 +868,7 @@ DtypeSerializer.register(ExtensionDtype)
 ComplexSerializer.register(complex)
 SliceSerializer.register(slice)
 RegexSerializer.register(re.Pattern)
+NoDefaultSerializer.register(NoDefault)
 PlaceholderSerializer.register(Placeholder)
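Because `no_default` is a process-wide singleton, the serializer carries no payload: `serial` emits empty frames and `deserial` returns the module-level instance, so identity checks (`value is no_default`) survive a round trip. The same pattern in plain Python (a sketch, not the Cython implementation above):

class _NoDefault:
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance


no_default = _NoDefault()


def serial(obj):
    return []  # nothing to encode; the type alone identifies the value


def deserial(payload):
    return no_default  # always the same singleton


assert deserial(serial(no_default)) is no_default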
maxframe/serialization/pandas.py
CHANGED
@@ -134,8 +134,10 @@ class ArraySerializer(Serializer):
             data_parts = [obj.tolist()]
         else:
             data_parts = [obj.to_numpy().tolist()]
-
+        elif hasattr(obj, "_data"):
             data_parts = [getattr(obj, "_data")]
+        else:
+            data_parts = [getattr(obj, "_pa_array")]
         return [ser_type], [dtype] + data_parts, False
 
     def deserial(self, serialized: List, context: Dict, subs: List):
@@ -155,38 +157,66 @@ class PdTimestampSerializer(Serializer):
         else:
             zone_info = []
         ts = obj.to_pydatetime().timestamp()
-
-
-
-
-        )
+        elements = [int(ts), obj.microsecond, obj.nanosecond]
+        if hasattr(obj, "unit"):
+            elements.append(str(obj.unit))
+        return elements, zone_info, bool(zone_info)
 
     def deserial(self, serialized: List, context: Dict, subs: List):
         if subs:
-
-
-
-
+            pydt = datetime.datetime.utcfromtimestamp(serialized[0])
+            kwargs = {
+                "year": pydt.year,
+                "month": pydt.month,
+                "day": pydt.day,
+                "hour": pydt.hour,
+                "minute": pydt.minute,
+                "second": pydt.second,
+                "microsecond": serialized[1],
+                "nanosecond": serialized[2],
+                "tzinfo": datetime.timezone.utc,
+            }
+            if len(serialized) > 3:
+                kwargs["unit"] = serialized[3]
+            val = pd.Timestamp(**kwargs).tz_convert(subs[0])
         else:
-
-
-
+            pydt = datetime.datetime.fromtimestamp(serialized[0])
+            kwargs = {
+                "year": pydt.year,
+                "month": pydt.month,
+                "day": pydt.day,
+                "hour": pydt.hour,
+                "minute": pydt.minute,
+                "second": pydt.second,
+                "microsecond": serialized[1],
+                "nanosecond": serialized[2],
+            }
+            if len(serialized) >= 4:
+                kwargs["unit"] = serialized[3]
+            val = pd.Timestamp(**kwargs)
         return val
 
 
 class PdTimedeltaSerializer(Serializer):
     def serial(self, obj: pd.Timedelta, context: Dict):
-
+        elements = [int(obj.seconds), obj.microseconds, obj.nanoseconds, obj.days]
+        if hasattr(obj, "unit"):
+            elements.append(str(obj.unit))
+        return elements, [], True
 
     def deserial(self, serialized: List, context: Dict, subs: List):
         days = 0 if len(serialized) < 4 else serialized[3]
+        unit = None if len(serialized) < 5 else serialized[4]
         seconds, microseconds, nanoseconds = serialized[:3]
-
-        days
-        seconds
-        microseconds
-        nanoseconds
-
+        kwargs = {
+            "days": days,
+            "seconds": seconds,
+            "microseconds": microseconds,
+            "nanoseconds": nanoseconds,
+        }
+        if unit is not None:
+            kwargs["unit"] = unit
+        return pd.Timedelta(**kwargs)
 
 
 class NoDefaultSerializer(Serializer):
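The timedelta codec now ships explicit components (plus an optional unit on newer pandas) instead of the truncated constructor call it replaced. Rebuilding from those components is lossless down to nanoseconds, as this standalone pandas check shows:

import pandas as pd

td = pd.Timedelta(days=1, hours=2, microseconds=3, nanoseconds=4)

# same decomposition as PdTimedeltaSerializer.serial
parts = [int(td.seconds), td.microseconds, td.nanoseconds, td.days]

rebuilt = pd.Timedelta(
    days=parts[3],
    seconds=parts[0],
    microseconds=parts[1],
    nanoseconds=parts[2],
)
assert rebuilt == td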
maxframe/serialization/serializables/core.py
CHANGED
@@ -19,6 +19,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type
 import msgpack
 
 from ...lib.mmh3 import hash
+from ...utils import no_default
 from ..core import Placeholder, Serializer, buffered, load_type
 from .field import Field
 from .field_type import DictType, ListType, PrimitiveFieldType, TupleType
@@ -211,6 +212,22 @@ class _NoFieldValue:
 _no_field_value = _NoFieldValue()
 
 
+def _to_primitive_placeholder(v: Any) -> Any:
+    if v is _no_field_value or v is no_default:
+        return {}
+    return v
+
+
+def _restore_primitive_placeholder(v: Any) -> Any:
+    if type(v) is dict:
+        if v == {}:
+            return _no_field_value
+        else:
+            return v
+    else:
+        return v
+
+
 class SerializableSerializer(Serializer):
     """
     Leverage DictSerializer to perform serde.
@@ -241,9 +258,7 @@ class SerializableSerializer(Serializer):
         else:
             primitive_vals = self._get_field_values(obj, obj._PRIMITIVE_FIELDS)
             # replace _no_field_value as {} to make them msgpack-serializable
-            primitive_vals = [
-                v if v is not _no_field_value else {} for v in primitive_vals
-            ]
+            primitive_vals = [_to_primitive_placeholder(v) for v in primitive_vals]
         if obj._cache_primitive_serial:
             primitive_vals = msgpack.dumps(primitive_vals)
             _primitive_serial_cache[obj] = primitive_vals
@@ -311,7 +326,9 @@ class SerializableSerializer(Serializer):
             cls_fields = server_fields[server_field_num : field_num + count]
             cls_values = values[field_num : field_num + count]
             for field, value in zip(cls_fields, cls_values):
-                if
+                if is_primitive:
+                    value = _restore_primitive_placeholder(value)
+                if not is_primitive or value is not _no_field_value:
                     cls._set_field_value(obj, field, value)
             field_num += count
         try:
@@ -356,7 +373,9 @@ class SerializableSerializer(Serializer):
             server_fields + deprecated_fields, key=lambda f: f.name
         )
         for field, value in zip(server_fields, values):
-            if
+            if is_primitive:
+                value = _restore_primitive_placeholder(value)
+            if not is_primitive or value is not _no_field_value:
                 try:
                     cls._set_field_value(obj, field, value)
                 except AttributeError:  # pragma: no cover
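The `{}` placeholder exists because msgpack encodes only plain data, not arbitrary Python sentinels; both `_no_field_value` and `no_default` therefore collapse to an empty dict on the wire and are mapped back on deserialization. A standalone demonstration of the constraint and the workaround:

import msgpack

sentinel = object()
values = [1, sentinel, "a"]

try:
    msgpack.dumps(values)  # sentinels are not msgpack-encodable
except TypeError as exc:
    print(exc)

packed = msgpack.dumps([{} if v is sentinel else v for v in values])
restored = [sentinel if v == {} else v for v in msgpack.loads(packed)]
assert restored[1] is sentinel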
maxframe/serialization/serializables/field_type.py
CHANGED
@@ -46,6 +46,9 @@ class PrimitiveType(Enum):
     complex128 = 25
 
 
+_np_unicode = np.unicode_ if hasattr(np, "unicode_") else np.str_
+
+
 _primitive_type_to_valid_types = {
     PrimitiveType.bool: (bool, np.bool_),
     PrimitiveType.int8: (int, np.int8),
@@ -60,7 +63,7 @@ _primitive_type_to_valid_types = {
     PrimitiveType.float32: (float, np.float32),
     PrimitiveType.float64: (float, np.float64),
     PrimitiveType.bytes: (bytes, np.bytes_),
-    PrimitiveType.string: (str,
+    PrimitiveType.string: (str, _np_unicode),
     PrimitiveType.complex64: (complex, np.complex64),
     PrimitiveType.complex128: (complex, np.complex128),
 }
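`_np_unicode` papers over a NumPy rename: `np.unicode_` was an alias of `np.str_` that NumPy 2.0 removed, so the `hasattr` probe keeps the string field type valid on both old and new NumPy. Either way the resolved type is the NumPy unicode scalar, a `str` subclass:

import numpy as np

np_unicode = np.unicode_ if hasattr(np, "unicode_") else np.str_

assert issubclass(np_unicode, str)
assert isinstance(np_unicode("abc"), str)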
maxframe/serialization/serializables/tests/test_serializable.py
CHANGED
@@ -21,6 +21,7 @@ import pytest
 
 from ....core import EntityData
 from ....lib.wrapped_pickle import switch_unpickle
+from ....utils import no_default
 from ... import deserialize, serialize
 from .. import (
     AnyField,
@@ -143,6 +144,7 @@ class MySerializable(Serializable):
         oneof1_val=f"{__name__}.MySerializable",
         oneof2_val=MySimpleSerializable,
     )
+    _no_default_val = Float64Field("no_default_val", default=no_default)
 
 
 @pytest.mark.parametrize("set_is_ci", [False, True], indirect=True)
@@ -187,6 +189,7 @@ def test_serializable(set_is_ci):
         _dict_val={"a": b"bytes_value"},
         _ref_val=MySerializable(),
         _oneof_val=MySerializable(_id="2"),
+        _no_default_val=no_default,
     )
 
     header, buffers = serialize(my_serializable)
@@ -234,7 +237,11 @@ def _assert_serializable_eq(my_serializable, my_serializable2):
         if not hasattr(my_serializable, field.name):
             continue
         expect_value = getattr(my_serializable, field_name)
-
+        if expect_value is no_default:
+            assert not hasattr(my_serializable2, field.name)
+            continue
+        else:
+            actual_value = getattr(my_serializable2, field_name)
         if isinstance(expect_value, np.ndarray):
             np.testing.assert_array_equal(expect_value, actual_value)
         elif isinstance(expect_value, pd.DataFrame):
maxframe/serialization/tests/test_serial.py
CHANGED
@@ -42,7 +42,7 @@ except ImportError:
 from ...lib.sparse import SparseMatrix
 from ...lib.wrapped_pickle import switch_unpickle
 from ...tests.utils import require_cudf, require_cupy
-from ...utils import lazy_import
+from ...utils import lazy_import, no_default
 from .. import (
     PickleContainer,
     RemoteException,
@@ -90,6 +90,7 @@ class CustomNamedTuple(NamedTuple):
         pd.Timedelta(102.234154131),
         {"abc": 5.6, "def": [3.4], "gh": None, "ijk": {}},
         OrderedDict([("abcd", 5.6)]),
+        no_default,
     ],
 )
 @switch_unpickle