maxframe 1.2.1-cp37-cp37m-win_amd64.whl → 1.3.1-cp37-cp37m-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic.
- maxframe/_utils.cp37-win_amd64.pyd +0 -0
- maxframe/codegen.py +70 -21
- maxframe/config/config.py +6 -0
- maxframe/core/accessor.py +1 -0
- maxframe/core/graph/core.cp37-win_amd64.pyd +0 -0
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/dict_/accessor.py +1 -0
- maxframe/dataframe/accessors/dict_/length.py +1 -0
- maxframe/dataframe/accessors/dict_/setitem.py +1 -0
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
- maxframe/dataframe/accessors/list_/__init__.py +37 -0
- maxframe/dataframe/accessors/list_/accessor.py +39 -0
- maxframe/dataframe/accessors/list_/getitem.py +135 -0
- maxframe/dataframe/accessors/list_/length.py +73 -0
- maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
- maxframe/dataframe/accessors/plotting/__init__.py +2 -0
- maxframe/dataframe/accessors/string_/__init__.py +1 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/accessor.py +1 -0
- maxframe/dataframe/extensions/apply_chunk.py +34 -21
- maxframe/dataframe/extensions/flatmap.py +8 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +53 -1
- maxframe/dataframe/merge/concat.py +7 -4
- maxframe/dataframe/merge/merge.py +1 -0
- maxframe/dataframe/merge/tests/test_merge.py +97 -47
- maxframe/dataframe/missing/tests/test_missing.py +1 -0
- maxframe/dataframe/reduction/aggregation.py +63 -0
- maxframe/dataframe/reduction/core.py +17 -5
- maxframe/dataframe/tests/test_utils.py +7 -0
- maxframe/dataframe/ufunc/ufunc.py +1 -0
- maxframe/dataframe/utils.py +3 -0
- maxframe/io/odpsio/schema.py +1 -0
- maxframe/learn/contrib/__init__.py +2 -4
- maxframe/learn/contrib/llm/__init__.py +1 -0
- maxframe/learn/contrib/llm/core.py +31 -10
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +38 -3
- maxframe/learn/contrib/llm/models/managed.py +54 -0
- maxframe/learn/contrib/llm/multi_modal.py +93 -0
- maxframe/learn/contrib/llm/text.py +268 -8
- maxframe/learn/contrib/models.py +77 -0
- maxframe/learn/contrib/utils.py +1 -0
- maxframe/learn/contrib/xgboost/__init__.py +8 -1
- maxframe/learn/contrib/xgboost/classifier.py +15 -4
- maxframe/learn/contrib/xgboost/core.py +108 -1
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
- maxframe/learn/contrib/xgboost/predict.py +6 -3
- maxframe/learn/contrib/xgboost/regressor.py +15 -1
- maxframe/learn/contrib/xgboost/train.py +5 -4
- maxframe/lib/dtypes_extension/__init__.py +2 -1
- maxframe/lib/dtypes_extension/dtypes.py +21 -0
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +13 -3
- maxframe/lib/mmh3.cp37-win_amd64.pyd +0 -0
- maxframe/opcodes.py +19 -0
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp37-win_amd64.pyd +0 -0
- maxframe/serialization/core.pyx +12 -1
- maxframe/serialization/numpy.py +12 -4
- maxframe/serialization/serializables/tests/test_serializable.py +13 -2
- maxframe/serialization/tests/test_serial.py +2 -0
- maxframe/tensor/merge/concatenate.py +1 -0
- maxframe/tensor/misc/unique.py +11 -10
- maxframe/tensor/reshape/reshape.py +4 -1
- maxframe/utils.py +4 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/METADATA +2 -1
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/RECORD +73 -65
- maxframe_client/session/odps.py +3 -0
- maxframe_client/session/tests/test_task.py +1 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/WHEEL +0 -0
- {maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/top_level.txt +0 -0
maxframe/learn/contrib/xgboost/core.py CHANGED

@@ -12,14 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+import json
+import os
+import tempfile
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
 
 try:
     import xgboost
 except ImportError:
     xgboost = None
 
+from ....core import OutputType
 from ...core import Model, ModelData
+from ..models import ModelApplyChunk, to_remote_model
 from .dmatrix import DMatrix
 
 
@@ -32,6 +39,33 @@ class BoosterData(ModelData):
         super().__init__(*args, **kwargs)
         self._evals_result = evals_result if evals_result is not None else dict()
 
+    @staticmethod
+    def _get_booster_score(bst, fmap=None, importance_type="weight"):
+        if not fmap:
+            tmp_file_name = ""
+        else:
+            tmp_file = tempfile.NamedTemporaryFile(delete=False)
+            tmp_file.write(fmap)
+            tmp_file.close()
+            tmp_file_name = tmp_file.name
+
+        try:
+            return bst.get_score(fmap=tmp_file_name, importance_type=importance_type)
+        finally:
+            if tmp_file_name:
+                os.unlink(tmp_file_name)
+
+    def get_score(self, fmap="", importance_type="weight"):
+        op = ModelApplyChunk(
+            func=self._get_booster_score, output_types=[OutputType.object]
+        )
+        if not fmap:
+            fmap_data = None
+        else:
+            with open(fmap, "rb") as fmap_file:
+                fmap_data = fmap_file.read()
+        return op(self, [{}], fmap=fmap_data, importance_type=importance_type)[0]
+
     def execute(self, session=None, **kw):
         # The evals_result should be fetched when BoosterData.execute() is called.
         result = super().execute(session=session, **kw)
@@ -82,6 +116,30 @@ else:
         Base class for implementing scikit-learn interface
         """
 
+        def _set_model(
+            self, xgb_model: Union[xgboost.XGBModel, xgboost.Booster] = None
+        ):
+            booster = None
+            if isinstance(xgb_model, xgboost.XGBModel):
+                booster = xgb_model.get_booster()
+            elif isinstance(xgb_model, xgboost.Booster):
+                booster = xgb_model
+
+            if booster is not None:
+                self._Booster = to_remote_model(booster, model_cls=Booster)
+
+        @classmethod
+        def _get_param_names(cls):
+            # make sure `xgb_model` not treated as a model param
+            names = super()._get_param_names()
+            if names:
+                names = [p for p in names if p != "xgb_model"]
+            return names
+
+        def __repr__(self):
+            local_model = self.fetch()
+            return repr(local_model)
+
         def fit(
             self,
             X,
@@ -157,6 +215,55 @@ else:
             self._Booster.execute(session=session, **run_kwargs)
             return super().evals_result()
 
+        def execute(self, session=None, run_kwargs=None):
+            self._Booster.execute(session=session, run_kwargs=run_kwargs)
+            return self
+
+        def fetch(self, session=None, run_kwargs=None):
+            from xgboost import sklearn as xgb_sklearn
+
+            booster = self._Booster.fetch(session=session, run_kwargs=run_kwargs)
+            remote_bst, self._Booster = self._Booster, booster
+            try:
+                local_cls = getattr(xgb_sklearn, type(self).__name__)
+                local_model = local_cls(**self.get_params(deep=True))
+                local_model._Booster = booster
+                return local_model
+            finally:
+                self._Booster = remote_bst
+
+        @staticmethod
+        def _calc_feature_importance(bst, importance_type, n_features):
+            config = json.loads(bst.save_config())
+            bst_type = config["learner"]["gradient_booster"]["name"]
+            dft = "weight" if bst_type == "gblinear" else "gain"
+            importance_type = importance_type or dft
+            score = bst.get_score(importance_type=importance_type)
+            if bst.feature_names is None:
+                feature_names = [f"f{i}" for i in range(n_features)]
+            else:
+                feature_names = bst.feature_names
+            # gblinear returns all features so the `get` in next line is only for gbtree.
+            all_features = [score.get(f, 0.0) for f in feature_names]
+            all_features_arr = np.array(all_features, dtype=np.float32)
+            total = all_features_arr.sum()
+            if total == 0:
+                return all_features_arr
+            return all_features_arr / total
+
+        @property
+        def feature_importances_(self):
+            op = ModelApplyChunk(
+                func=self._calc_feature_importance, output_types=[OutputType.tensor]
+            )
+            params = {"shape": (self._n_features_in,), "dtype": np.dtype(np.float32)}
+            return op(
+                self.get_booster(),
+                [params],
+                importance_type=self.importance_type,
+                n_features=self._n_features_in,
+            )[0]
+
     def wrap_evaluation_matrices(
         missing: float,
         X: Any,
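The new BoosterData.get_score above ships the optional feature-map bytes along with the call and runs xgboost.Booster.get_score remotely through ModelApplyChunk. A minimal usage sketch; the sample data, the DMatrix/train imports and the execute/fetch calls follow the usual MaxFrame workflow and are illustrative, not taken from this diff:

import maxframe.dataframe as md
from maxframe.learn.contrib.xgboost import DMatrix, train

# illustrative toy data; in practice this would be a MaxCompute-backed DataFrame
df = md.DataFrame({"f0": [1.0, 2.0, 3.0], "f1": [0.5, 0.1, 0.9], "label": [0, 1, 1]})
dtrain = DMatrix(df[["f0", "f1"]], label=df["label"])

booster = train({"objective": "binary:logistic"}, dtrain)   # remote booster tileable
scores = booster.get_score(importance_type="gain")          # lazy object-typed result
scores.execute()
print(scores.fetch())                                       # mapping of feature name -> score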
maxframe/learn/contrib/xgboost/dmatrix.py CHANGED

@@ -141,8 +141,8 @@ def to_dmatrix(
         label_lower_bound=label_lower_bound,
         label_upper_bound=label_upper_bound,
         gpu=data.op.gpu,
-        enable_categorical=enable_categorical,
         _output_types=get_output_types(data),
+        enable_categorical=enable_categorical,
     )
     return op()
 
maxframe/learn/contrib/xgboost/predict.py CHANGED

@@ -26,7 +26,8 @@ from ....serialization.serializables import (
     TupleField,
 )
 from ....tensor.core import TensorOrder
-from
+from ..models import to_remote_model
+from .core import Booster, BoosterData
 from .dmatrix import check_data
 
 
@@ -96,11 +97,13 @@ def predict(
     -------
     results: Booster
     """
+    import xgboost
+
     data = check_data(data)
-
+    if isinstance(model, xgboost.Booster):
+        model = to_remote_model(model, model_cls=Booster)
 
     output_types = [OutputType.tensor]
-
     iteration_range = iteration_range or (0, 0)
 
     return XGBPredict(
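With the isinstance branch above, predict also accepts a locally trained xgboost.Booster and wraps it with to_remote_model before building the XGBPredict operator. A hedged sketch; the random training data is purely illustrative, and the predict import and its (model, data) argument order are assumed to match how the contrib package exposed it in earlier releases:

import numpy as np
import xgboost

import maxframe.tensor as mt
from maxframe.learn.contrib.xgboost import predict

# train a plain, local xgboost model first (illustrative data)
local_dtrain = xgboost.DMatrix(np.random.rand(100, 4), label=np.random.randint(0, 2, 100))
local_bst = xgboost.train({"objective": "binary:logistic"}, local_dtrain, num_boost_round=5)

# the local Booster is converted through to_remote_model inside predict()
result = predict(local_bst, mt.tensor(np.random.rand(10, 4)))
result.execute()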
maxframe/learn/contrib/xgboost/regressor.py CHANGED

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Union
 
 from ..utils import make_import_error_func
 from .core import XGBScikitLearnBase, xgboost
@@ -19,15 +20,25 @@ from .core import XGBScikitLearnBase, xgboost
 if not xgboost:
     XGBRegressor = make_import_error_func("xgboost")
 else:
+    from xgboost.sklearn import XGBRegressorBase
+
     from .core import wrap_evaluation_matrices
     from .predict import predict
     from .train import train
 
-    class XGBRegressor(XGBScikitLearnBase):
+    class XGBRegressor(XGBScikitLearnBase, XGBRegressorBase):
         """
         Implementation of the scikit-learn API for XGBoost regressor.
         """
 
+        def __init__(
+            self,
+            xgb_model: Union[xgboost.XGBRegressor, xgboost.Booster] = None,
+            **kwargs,
+        ):
+            super().__init__(**kwargs)
+            self._set_model(xgb_model)
+
         def fit(
             self,
             X,
@@ -41,6 +52,9 @@ else:
         ):
             session = kw.pop("session", None)
             run_kwargs = kw.pop("run_kwargs", dict())
+
+            self._n_features_in = X.shape[1]
+
             dtrain, evals = wrap_evaluation_matrices(
                 None,
                 X,
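Together with _set_model in core.py above, the new xgb_model argument lets a locally fitted estimator be lifted into the MaxFrame wrapper, while fetch() goes the other way and rebuilds a plain xgboost.sklearn object. A hedged sketch with illustrative random data (not taken from the diff):

import numpy as np
import xgboost

import maxframe.tensor as mt
from maxframe.learn.contrib.xgboost import XGBRegressor

# a plain, locally fitted model
local = xgboost.XGBRegressor(n_estimators=5)
local.fit(np.random.rand(50, 3), np.random.rand(50))

remote = XGBRegressor(xgb_model=local)                   # booster uploaded via to_remote_model
pred = remote.predict(mt.tensor(np.random.rand(10, 3)))  # scikit-learn-style predict (assumed)
pred.execute()
# remote.fetch() would materialize a plain xgboost.sklearn.XGBRegressor again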
maxframe/learn/contrib/xgboost/train.py CHANGED

@@ -82,9 +82,8 @@ class XGBTrain(Operator, TileableOperatorMixin):
         inputs = [self.dtrain]
         if self.has_evals_result:
             inputs.extend(e[0] for e in self.evals)
-
-
-        )[0]
+        kws = [{"object_class": Booster}, {}]
+        return self.new_tileables(inputs, kws=kws, evals_result=evals_result)[0]
 
     @property
     def output_limit(self):
@@ -129,4 +128,6 @@ def train(params, dtrain, evals=None, evals_result=None, num_class=None, **kwarg
         num_class=num_class,
         **kwargs,
     )(evals_result)
-
+    if evals:
+        data.execute(session=session, **run_kwargs)
+    return data
maxframe/lib/dtypes_extension/__init__.py CHANGED

@@ -11,4 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+
+from .dtypes import ArrowDtype, dict_, is_list_dtype, is_map_dtype, list_
maxframe/lib/dtypes_extension/dtypes.py CHANGED

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from typing import Union
 
 import pandas as pd
@@ -36,6 +37,15 @@ def dict_(
     return pd.ArrowDtype(pa.map_(key_type, item_type))
 
 
+def list_(value_type: Union[pa.DataType, pa.Field]):
+    """
+    Create ``pd.ArrowDtype(pa.ListType)`` instance from a list or field.
+    """
+    if ArrowDtype is None:
+        raise ImportError("ArrowDtype is not supported in current environment")
+    return pd.ArrowDtype(pa.list_(value_type))
+
+
 def is_map_dtype(dtype: ArrowDtype) -> bool:
     """
     Check whether the dtype is a map type.
@@ -43,3 +53,14 @@ def is_map_dtype(dtype: ArrowDtype) -> bool:
     if ArrowDtype is None:
         raise ImportError("ArrowDtype is not supported in current environment")
     return isinstance(dtype, ArrowDtype) and isinstance(dtype.pyarrow_dtype, pa.MapType)
+
+
+def is_list_dtype(dtype: ArrowDtype) -> bool:
+    """
+    Check whether the dtype is a list dtype.
+    """
+    if ArrowDtype is None:
+        raise ImportError("ArrowDtype is not supported in current environment")
+    return isinstance(dtype, ArrowDtype) and isinstance(
+        dtype.pyarrow_dtype, pa.ListType
+    )
maxframe/lib/dtypes_extension/tests/test_dtypes.py CHANGED

@@ -17,18 +17,19 @@ import pyarrow as pa
 import pytest
 
 from ....utils import ARROW_DTYPE_NOT_SUPPORTED
-from ..dtypes import dict_, is_map_dtype
+from ..dtypes import dict_, is_list_dtype, is_map_dtype, list_
 
 try:
     from pandas import ArrowDtype
 except:
     ArrowDtype = None
 
-
-@pytest.mark.skipif(
+pytestmark = pytest.mark.skipif(
     ARROW_DTYPE_NOT_SUPPORTED,
     reason="pandas doesn't support ArrowDtype",
 )
+
+
 def test_map_dtype():
     dt = dict_(pa.int64(), pa.string())
     assert is_map_dtype(dt)
@@ -36,3 +37,12 @@ def test_map_dtype():
     dt = pd.ArrowDtype(pa.list_(pa.int64()))
     assert not is_map_dtype(dt)
     assert not is_map_dtype(pd.Int64Dtype)
+
+
+def test_list_dtype():
+    dt = list_(pa.int64())
+    assert is_list_dtype(dt)
+
+    dt = pd.ArrowDtype(pa.map_(pa.int64(), pa.string()))
+    assert not is_list_dtype(dt)
+    assert not is_list_dtype(pd.Int64Dtype)
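Beyond the unit tests above, the new helpers are exported from maxframe.lib.dtypes_extension (see the __init__.py change earlier), so they can be used to declare Arrow-backed list columns. A small sketch, assuming a pandas version with ArrowDtype support and pyarrow installed:

import pandas as pd
import pyarrow as pa

from maxframe.lib.dtypes_extension import is_list_dtype, list_

dtype = list_(pa.int64())                        # pd.ArrowDtype wrapping pa.list_(pa.int64())
s = pd.Series([[1, 2], [3], None], dtype=dtype)  # list-valued column backed by Arrow
assert is_list_dtype(s.dtype)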
maxframe/lib/mmh3.cp37-win_amd64.pyd CHANGED

Binary file
maxframe/opcodes.py CHANGED

@@ -395,6 +395,11 @@ FUSE = 801
 # LLM
 DASHSCOPE_TEXT_GENERATION = 810
 DASHSCOPE_MULTI_MODAL_GENERATION = 811
+MANAGED_TEXT_MODAL_GENERATION = 812
+MANAGED_MULTI_MODAL_GENERATION = 813
+LLM_TEXT_SUMMARIZE_TASK = 814
+LLM_TEXT_TRANSLATE_TASK = 815
+LLM_TEXT_CLASSIFY_TASK = 816
 
 # table like input for tensor
 TABLE_COO = 1003
@@ -575,16 +580,30 @@ DATAFRAME_RESHUFFLE = 10001
 FLATMAP = 10002
 FLATJSON = 10003
 APPLY_CHUNK = 10004
+
 SERIES_DICT_GETITEM = 10005
 SERIES_DICT_SETITEM = 10006
 SERIES_DICT_LENGTH = 10007
 SERIES_DICT_REMOVE = 10008
 SERIES_DICT_CONTAINS = 10009
+SERIES_DICT_FLATTEN = 10010
+
+SERIES_LIST_GETITEM = 10020
+SERIES_LIST_SETITEM = 10021
+SERIES_LIST_CONTAINS = 10022
+SERIES_LIST_LENGTH = 10023
+SERIES_LIST_INSERT = 10024
+SERIES_LIST_EXTEND = 10025
+SERIES_LIST_POP = 10026
+SERIES_LIST_SORT = 10027
+SERIES_LIST_FLATTEN = 10028
 
 # MaxFrame internal operators
 DATAFRAME_PROJECTION_SAME_INDEX_MERGE = 100001
 GROUPBY_AGGR_SAME_INDEX_MERGE = 100002
 DATAFRAME_ILOC_GET_AND_RENAME_ITEM = 100003
+COLLECT_MODEL_RESULT = 100004
+MODEL_DATA_SOURCE = 100005
 
 # fetches
 FETCH_SHUFFLE = 999998
maxframe/serialization/core.cp37-win_amd64.pyd CHANGED

Binary file
maxframe/serialization/core.pyx CHANGED

@@ -112,7 +112,14 @@ cpdef object load_type(str class_name, object parent_class):
 
     mod_name, cls_name = class_name.rsplit("#", 1)
 
-
+    try:
+        cls = importlib.import_module(mod_name)
+    except ImportError as ex:
+        raise ImportError(
+            f"Failed to import {mod_name} when loading "
+            f"class {class_name}, {ex}"
+        ) from None
+
     for sub_cls_name in cls_name.split("."):
         cls = getattr(cls, sub_cls_name)
     _type_cache[class_name] = cls
@@ -122,6 +129,10 @@ cpdef object load_type(str class_name, object parent_class):
     return cls
 
 
+cpdef void clear_type_cache():
+    _type_cache.clear()
+
+
 cdef Serializer get_deserializer(int32_t deserializer_id):
     return _deserializers[deserializer_id]
 
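clear_type_cache simply drops the module-level memoization used by load_type; the serializable tests later in this diff call it so classes redefined between round trips are resolved afresh. A one-line sketch, assuming the helper is re-exported from maxframe.serialization as the new test import suggests:

from maxframe.serialization import clear_type_cache

clear_type_cache()   # forget memoized class lookups before redefining Serializable classes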
maxframe/serialization/numpy.py CHANGED

@@ -70,10 +70,18 @@ class NDArraySerializer(Serializer):
         if dtype_new_order:
             dtype = dtype[dtype_new_order]
         if dtype.hasobject:
-            shape = header["shape"]
-
-
-
+            shape = tuple(header["shape"])
+            if shape == ():
+                val = np.array(subs[0]).reshape(shape)
+            else:
+                # fill empty object array
+                val = np.empty(shape, dtype=dtype)
+                try:
+                    val[(slice(None),) * len(shape)] = subs[0]
+                except ValueError:
+                    val[(slice(None),) * len(shape)] = np.array(
+                        subs[0], dtype=dtype
+                    ).reshape(shape)
         else:
             val = np.ndarray(
                 shape=tuple(header["shape"]),
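The rewritten object-dtype branch above rebuilds the array with np.empty plus slice assignment, falling back to an explicit np.array(...).reshape when plain assignment raises ValueError, instead of constructing the array from a raw buffer. A hedged round-trip sketch, assuming plain ndarrays go through the same serialize/deserialize entry points the tests below use:

import numpy as np

from maxframe.serialization import deserialize, serialize

# an object-dtype array whose elements are Python lists of uneven length
arr = np.empty((2,), dtype=object)
arr[0], arr[1] = [1, 2], [3, 4, 5]

header, buffers = serialize(arr)        # exercises the NDArraySerializer path touched above
restored = deserialize(header, buffers)
assert restored.shape == (2,) and restored.dtype == object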
maxframe/serialization/serializables/tests/test_serializable.py CHANGED

@@ -22,7 +22,7 @@ import pytest
 from ....core import EntityData
 from ....lib.wrapped_pickle import switch_unpickle
 from ....utils import no_default
-from ... import deserialize, serialize
+from ... import clear_type_cache, deserialize, serialize
 from .. import (
     AnyField,
     BoolField,
@@ -202,6 +202,7 @@ def test_serializable(set_is_ci):
 def test_compatible_serializable(set_is_ci):
     global MySimpleSerializable, MySubSerializable
 
+    clear_type_cache()
     old_base, old_sub = MySimpleSerializable, MySubSerializable
 
     try:
@@ -231,13 +232,23 @@ def test_compatible_serializable(set_is_ci):
         my_sub_serializable2 = deserialize(header, buffers)
         assert type(my_sub_serializable) is not type(my_sub_serializable2)
         _assert_serializable_eq(my_sub_serializable, my_sub_serializable2)
+
+        header, buffers = serialize(my_sub_serializable2)
     finally:
         MySimpleSerializable, MySubSerializable = old_base, old_sub
+        MyMidSerializable = None
+        clear_type_cache()
+
+    my_sub_serializable3 = deserialize(header, buffers)
+    assert type(my_sub_serializable2) is not type(my_sub_serializable3)
+    _assert_serializable_eq(my_sub_serializable2, my_sub_serializable3)
 
 
 def _assert_serializable_eq(my_serializable, my_serializable2):
     for field_name, field in my_serializable._FIELDS.items():
-        if not hasattr(my_serializable, field.name)
+        if not hasattr(my_serializable, field.name) or not hasattr(
+            my_serializable2, field.name
+        ):
             continue
         expect_value = getattr(my_serializable, field_name)
         if expect_value is no_default:

maxframe/tensor/misc/unique.py CHANGED

@@ -32,23 +32,24 @@ class TensorUnique(TensorHasInput, TensorOperatorMixin):
 
     @property
     def output_limit(self):
-        return 1
+        return 1 + self.return_index + self.return_inverse + self.return_counts
 
-
+    @classmethod
+    def _gen_kws(cls, op: "TensorUnique", input_obj, chunk=False, chunk_index=None):
         kws = []
 
         # unique tensor
         shape = list(input_obj.shape)
-        shape[
+        shape[op.axis] = np.nan
         kw = {"shape": tuple(shape), "dtype": input_obj.dtype, "gpu": input_obj.op.gpu}
         if chunk:
             idx = [0] * len(shape)
-            idx[
+            idx[op.axis] = chunk_index or 0
             kw["index"] = tuple(idx)
         kws.append(kw)
 
         # unique indices tensor
-        if
+        if op.return_index:
             kw = {
                 "shape": (np.nan,),
                 "dtype": np.dtype(np.intp),
@@ -60,9 +61,9 @@ class TensorUnique(TensorHasInput, TensorOperatorMixin):
             kws.append(kw)
 
         # unique inverse tensor
-        if
+        if op.return_inverse:
             kw = {
-                "shape": (input_obj.shape[
+                "shape": (input_obj.shape[op.axis],),
                 "dtype": np.dtype(np.intp),
                 "gpu": input_obj.op.gpu,
                 "type": "inverse",
@@ -72,7 +73,7 @@ class TensorUnique(TensorHasInput, TensorOperatorMixin):
             kws.append(kw)
 
         # unique counts tensor
-        if
+        if op.return_counts:
             kw = {
                 "shape": (np.nan,),
                 "dtype": np.dtype(int),
@@ -92,9 +93,9 @@ class TensorUnique(TensorHasInput, TensorOperatorMixin):
         if self.axis is None:
             if ar.ndim > 1:
                 ar = ar.flatten()
-            self.
+            self.axis = 0
         else:
-            self.
+            self.axis = validate_axis(ar.ndim, self.axis)
 
         kws = self._gen_kws(self, ar)
         tensors = self.new_tensors([ar], kws=kws, order=TensorOrder.C_ORDER)
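With output_limit now counting the requested extras, unique can hand back index/inverse/counts outputs with consistent metadata. A sketch, assuming the numpy-style mt.unique wrapper is exposed as in earlier releases:

import maxframe.tensor as mt

t = mt.tensor([1, 1, 2, 2, 2, 3])
values, counts = mt.unique(t, return_counts=True)   # two outputs, so output_limit == 2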
maxframe/tensor/reshape/reshape.py CHANGED

@@ -181,7 +181,10 @@ def _reshape(a, newshape, order="C", tensor_order=None, out_shape=None):
     if tensor_order is None:
         tensor_order = get_order(order, a.order, available_options="CFA")
     op = TensorReshape(
-        newshape
+        newshape=newshape,
+        order=order,
+        dtype=a.dtype,
+        create_view=tensor_order == a.order,
     )
     if out_shape is None:
         out_shape = newshape
{maxframe-1.2.1.dist-info → maxframe-1.3.1.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: maxframe
-Version: 1.2.1
+Version: 1.3.1
 Summary: MaxFrame operator-based data analyze framework
 Requires-Dist: numpy <2.0.0,>=1.19.0
 Requires-Dist: pandas >=1.0.0
@@ -28,6 +28,7 @@ Requires-Dist: pytest-cov >=4.1.0 ; extra == 'test'
 Requires-Dist: pytest-asyncio >=0.21.0 ; extra == 'test'
 Requires-Dist: pytest-timeout >=2.1.0 ; extra == 'test'
 Requires-Dist: matplotlib >=2.0.0 ; extra == 'test'
+Requires-Dist: xgboost <3.0.0,>=1.4.0 ; extra == 'test'
 
 MaxCompute MaxFrame Client
 ==========================