maxframe-1.2.0-cp39-cp39-win_amd64.whl → maxframe-1.3.0-cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of maxframe has been flagged as possibly problematic.
- maxframe/_utils.cp39-win_amd64.pyd +0 -0
- maxframe/codegen.py +70 -21
- maxframe/config/config.py +6 -0
- maxframe/core/accessor.py +1 -0
- maxframe/core/graph/core.cp39-win_amd64.pyd +0 -0
- maxframe/dataframe/accessors/__init__.py +1 -1
- maxframe/dataframe/accessors/dict_/accessor.py +1 -0
- maxframe/dataframe/accessors/dict_/length.py +1 -0
- maxframe/dataframe/accessors/dict_/setitem.py +1 -0
- maxframe/dataframe/accessors/dict_/tests/test_dict_accessor.py +5 -7
- maxframe/dataframe/accessors/list_/__init__.py +37 -0
- maxframe/dataframe/accessors/list_/accessor.py +39 -0
- maxframe/dataframe/accessors/list_/getitem.py +135 -0
- maxframe/dataframe/accessors/list_/length.py +73 -0
- maxframe/dataframe/accessors/list_/tests/__init__.py +13 -0
- maxframe/dataframe/accessors/list_/tests/test_list_accessor.py +79 -0
- maxframe/dataframe/accessors/plotting/__init__.py +2 -0
- maxframe/dataframe/accessors/string_/__init__.py +1 -0
- maxframe/dataframe/datasource/read_odps_query.py +1 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +4 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/accessor.py +1 -0
- maxframe/dataframe/extensions/apply_chunk.py +34 -21
- maxframe/dataframe/extensions/flatmap.py +8 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +2 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +1 -0
- maxframe/dataframe/merge/concat.py +7 -4
- maxframe/dataframe/merge/merge.py +1 -0
- maxframe/dataframe/merge/tests/test_merge.py +97 -47
- maxframe/dataframe/missing/tests/test_missing.py +1 -0
- maxframe/dataframe/tests/test_utils.py +7 -0
- maxframe/dataframe/ufunc/ufunc.py +1 -0
- maxframe/dataframe/utils.py +3 -0
- maxframe/io/odpsio/schema.py +1 -0
- maxframe/learn/contrib/__init__.py +2 -4
- maxframe/learn/contrib/llm/__init__.py +1 -0
- maxframe/learn/contrib/llm/core.py +31 -10
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +4 -3
- maxframe/learn/contrib/llm/models/managed.py +39 -0
- maxframe/learn/contrib/llm/multi_modal.py +1 -0
- maxframe/learn/contrib/llm/text.py +252 -8
- maxframe/learn/contrib/models.py +77 -0
- maxframe/learn/contrib/utils.py +1 -0
- maxframe/learn/contrib/xgboost/__init__.py +8 -1
- maxframe/learn/contrib/xgboost/classifier.py +15 -4
- maxframe/learn/contrib/xgboost/core.py +108 -1
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -1
- maxframe/learn/contrib/xgboost/predict.py +8 -3
- maxframe/learn/contrib/xgboost/regressor.py +15 -1
- maxframe/learn/contrib/xgboost/train.py +5 -4
- maxframe/lib/dtypes_extension/__init__.py +2 -1
- maxframe/lib/dtypes_extension/dtypes.py +17 -42
- maxframe/lib/dtypes_extension/tests/test_dtypes.py +11 -31
- maxframe/lib/mmh3.cp39-win_amd64.pyd +0 -0
- maxframe/opcodes.py +19 -0
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp39-win_amd64.pyd +0 -0
- maxframe/serialization/core.pyx +12 -1
- maxframe/serialization/numpy.py +12 -4
- maxframe/serialization/serializables/tests/test_serializable.py +13 -2
- maxframe/serialization/tests/test_serial.py +2 -0
- maxframe/tensor/merge/concatenate.py +1 -0
- maxframe/tensor/misc/unique.py +11 -10
- maxframe/tensor/reshape/reshape.py +4 -1
- maxframe/utils.py +4 -0
- {maxframe-1.2.0.dist-info → maxframe-1.3.0.dist-info}/METADATA +2 -2
- {maxframe-1.2.0.dist-info → maxframe-1.3.0.dist-info}/RECORD +72 -64
- {maxframe-1.2.0.dist-info → maxframe-1.3.0.dist-info}/WHEEL +1 -1
- maxframe_client/session/odps.py +3 -0
- maxframe_client/session/tests/test_task.py +1 -0
- {maxframe-1.2.0.dist-info → maxframe-1.3.0.dist-info}/top_level.txt +0 -0
maxframe/learn/contrib/xgboost/classifier.py
CHANGED

@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Union
+
 import numpy as np
 
-from ....
+from .... import tensor as mt
 from ....tensor.merge.vstack import _vstack
 from ..utils import make_import_error_func
 from .core import XGBScikitLearnBase, xgboost
@@ -33,6 +35,14 @@ else:
         Implementation of the scikit-learn API for XGBoost classification.
         """
 
+        def __init__(
+            self,
+            xgb_model: Union[xgboost.XGBClassifier, xgboost.Booster] = None,
+            **kwargs,
+        ):
+            super().__init__(**kwargs)
+            self._set_model(xgb_model)
+
         def fit(
             self,
             X,
@@ -46,7 +56,7 @@ else:
             **kw,
         ):
             session = kw.pop("session", None)
-            run_kwargs = kw.pop("run_kwargs", dict())
+            run_kwargs = kw.pop("run_kwargs", None) or dict()
             dtrain, evals = wrap_evaluation_matrices(
                 None,
                 X,
@@ -58,6 +68,7 @@ else:
                 base_margin_eval_set,
             )
             params = self.get_xgb_params()
+            self._n_features_in = X.shape[1]
             self.n_classes_ = num_class or 1
             if self.n_classes_ > 2:
                 params["objective"] = "multi:softprob"
@@ -81,7 +92,7 @@ else:
         def predict(self, data, **kw):
             prob = self.predict_proba(data, flag=True, **kw)
             if prob.ndim > 1:
-                prediction = argmax(prob, axis=1)
+                prediction = mt.argmax(prob, axis=1)
             else:
                 prediction = (prob > 0.5).astype(np.int64)
             return prediction
@@ -103,7 +114,7 @@ else:
             # binary logistic function
             classone_probs = prediction
             classzero_probs = 1.0 - classone_probs
-            return transpose(_vstack((classzero_probs, classone_probs)))
+            return mt.transpose(_vstack((classzero_probs, classone_probs)))
 
         @property
         def classes_(self) -> np.ndarray:
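The classifier (and, further below, the regressor) gains an optional `xgb_model` constructor argument that seeds the remote estimator from a locally trained model. A minimal usage sketch, assuming `XGBClassifier` is importable from `maxframe.learn.contrib.xgboost` as in earlier releases; the sample data here is illustrative only:

```python
import numpy as np
import xgboost as xgb

from maxframe.learn.contrib.xgboost import XGBClassifier

# train a plain xgboost model locally on a small sample
X_sample = np.random.rand(100, 4)
y_sample = np.random.randint(0, 2, size=100)
local_clf = xgb.XGBClassifier(n_estimators=5).fit(X_sample, y_sample)

# seed the MaxFrame estimator with the local model; per the diff,
# _set_model() accepts either a fitted XGBModel or a raw Booster
clf = XGBClassifier(xgb_model=local_clf)
clf2 = XGBClassifier(xgb_model=local_clf.get_booster())
```

Note that the new `_get_param_names()` override strips `xgb_model` from the parameter list, so the wrapped model is not treated as a hyperparameter by `get_params()`.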
maxframe/learn/contrib/xgboost/core.py
CHANGED

@@ -12,14 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+import json
+import os
+import tempfile
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
 
 try:
     import xgboost
 except ImportError:
     xgboost = None
 
+from ....core import OutputType
 from ...core import Model, ModelData
+from ..models import ModelApplyChunk, to_remote_model
 from .dmatrix import DMatrix
 
 
@@ -32,6 +39,33 @@ class BoosterData(ModelData):
         super().__init__(*args, **kwargs)
         self._evals_result = evals_result if evals_result is not None else dict()
 
+    @staticmethod
+    def _get_booster_score(bst, fmap=None, importance_type="weight"):
+        if not fmap:
+            tmp_file_name = ""
+        else:
+            tmp_file = tempfile.NamedTemporaryFile(delete=False)
+            tmp_file.write(fmap)
+            tmp_file.close()
+            tmp_file_name = tmp_file.name
+
+        try:
+            return bst.get_score(fmap=tmp_file_name, importance_type=importance_type)
+        finally:
+            if tmp_file_name:
+                os.unlink(tmp_file_name)
+
+    def get_score(self, fmap="", importance_type="weight"):
+        op = ModelApplyChunk(
+            func=self._get_booster_score, output_types=[OutputType.object]
+        )
+        if not fmap:
+            fmap_data = None
+        else:
+            with open(fmap, "rb") as fmap_file:
+                fmap_data = fmap_file.read()
+        return op(self, [{}], fmap=fmap_data, importance_type=importance_type)[0]
+
     def execute(self, session=None, **kw):
         # The evals_result should be fetched when BoosterData.execute() is called.
         result = super().execute(session=session, **kw)
@@ -82,6 +116,30 @@ else:
         Base class for implementing scikit-learn interface
         """
 
+        def _set_model(
+            self, xgb_model: Union[xgboost.XGBModel, xgboost.Booster] = None
+        ):
+            booster = None
+            if isinstance(xgb_model, xgboost.XGBModel):
+                booster = xgb_model.get_booster()
+            elif isinstance(xgb_model, xgboost.Booster):
+                booster = xgb_model
+
+            if booster is not None:
+                self._Booster = to_remote_model(booster, model_cls=Booster)
+
+        @classmethod
+        def _get_param_names(cls):
+            # make sure `xgb_model` not treated as a model param
+            names = super()._get_param_names()
+            if names:
+                names = [p for p in names if p != "xgb_model"]
+            return names
+
+        def __repr__(self):
+            local_model = self.fetch()
+            return repr(local_model)
+
         def fit(
             self,
             X,
@@ -157,6 +215,55 @@ else:
             self._Booster.execute(session=session, **run_kwargs)
             return super().evals_result()
 
+        def execute(self, session=None, run_kwargs=None):
+            self._Booster.execute(session=session, run_kwargs=run_kwargs)
+            return self
+
+        def fetch(self, session=None, run_kwargs=None):
+            from xgboost import sklearn as xgb_sklearn
+
+            booster = self._Booster.fetch(session=session, run_kwargs=run_kwargs)
+            remote_bst, self._Booster = self._Booster, booster
+            try:
+                local_cls = getattr(xgb_sklearn, type(self).__name__)
+                local_model = local_cls(**self.get_params(deep=True))
+                local_model._Booster = booster
+                return local_model
+            finally:
+                self._Booster = remote_bst
+
+        @staticmethod
+        def _calc_feature_importance(bst, importance_type, n_features):
+            config = json.loads(bst.save_config())
+            bst_type = config["learner"]["gradient_booster"]["name"]
+            dft = "weight" if bst_type == "gblinear" else "gain"
+            importance_type = importance_type or dft
+            score = bst.get_score(importance_type=importance_type)
+            if bst.feature_names is None:
+                feature_names = [f"f{i}" for i in range(n_features)]
+            else:
+                feature_names = bst.feature_names
+            # gblinear returns all features so the `get` in next line is only for gbtree.
+            all_features = [score.get(f, 0.0) for f in feature_names]
+            all_features_arr = np.array(all_features, dtype=np.float32)
+            total = all_features_arr.sum()
+            if total == 0:
+                return all_features_arr
+            return all_features_arr / total
+
+        @property
+        def feature_importances_(self):
+            op = ModelApplyChunk(
+                func=self._calc_feature_importance, output_types=[OutputType.tensor]
+            )
+            params = {"shape": (self._n_features_in,), "dtype": np.dtype(np.float32)}
+            return op(
+                self.get_booster(),
+                [params],
+                importance_type=self.importance_type,
+                n_features=self._n_features_in,
+            )[0]
+
 def wrap_evaluation_matrices(
     missing: float,
     X: Any,
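Taken together, these additions give remote boosters a feature-inspection and download story. A hedged sketch of how the new methods compose, assuming `clf` is a fitted MaxFrame `XGBClassifier` and that the returned objects follow the usual MaxFrame `execute()`/`fetch()` semantics:

```python
# feature scores are computed service-side through ModelApplyChunk
booster = clf.get_booster()
scores = booster.get_score(importance_type="gain").execute().fetch()

# feature_importances_ mirrors xgboost's property; per _calc_feature_importance
# it is normalized to sum to 1 unless all scores are zero
importances = clf.feature_importances_.execute().fetch()

# fetch() materializes an equivalent local xgboost.sklearn estimator,
# temporarily swapping in the downloaded booster; __repr__ relies on it
local_model = clf.fetch()
```

Note that `feature_importances_` depends on `self._n_features_in`, which is why both `fit()` implementations in this release now record `X.shape[1]`.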
maxframe/learn/contrib/xgboost/dmatrix.py
CHANGED

@@ -141,8 +141,8 @@ def to_dmatrix(
         label_lower_bound=label_lower_bound,
         label_upper_bound=label_upper_bound,
         gpu=data.op.gpu,
-        enable_categorical=enable_categorical,
         _output_types=get_output_types(data),
+        enable_categorical=enable_categorical,
     )
     return op()
 
maxframe/learn/contrib/xgboost/predict.py
CHANGED

@@ -26,7 +26,8 @@ from ....serialization.serializables import (
     TupleField,
 )
 from ....tensor.core import TensorOrder
-from
+from ..models import to_remote_model
+from .core import Booster, BoosterData
 from .dmatrix import check_data
 
 
@@ -96,11 +97,15 @@ def predict(
     -------
     results: Booster
     """
+    import xgboost
+
     data = check_data(data)
-
+    if not isinstance(model, (Booster, BoosterData, xgboost.Booster)):
+        raise TypeError(f"model has to be a xgboost.Booster, got {type(model)} instead")
+    elif isinstance(model, xgboost.Booster):
+        model = to_remote_model(model, model_cls=Booster)
 
     output_types = [OutputType.tensor]
-
     iteration_range = iteration_range or (0, 0)
 
     return XGBPredict(
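`predict()` now accepts a locally trained `xgboost.Booster` and uploads it transparently via `to_remote_model()`; anything else that is not a remote `Booster`/`BoosterData` raises `TypeError`. A sketch under the assumption that `predict` is re-exported from `maxframe.learn.contrib.xgboost` and that `maxframe.dataframe` provides the usual `DataFrame` constructor:

```python
import numpy as np
import xgboost as xgb

import maxframe.dataframe as md
from maxframe.learn.contrib.xgboost import predict

X = np.random.rand(100, 4)
y = np.random.rand(100)
local_bst = xgb.train(
    {"objective": "reg:squarederror"}, xgb.DMatrix(X, label=y), num_boost_round=5
)

df = md.DataFrame(X)            # data to score remotely
pred = predict(local_bst, df)   # the booster is wrapped into a remote Booster
pred.execute()
```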
maxframe/learn/contrib/xgboost/regressor.py
CHANGED

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Union
 
 from ..utils import make_import_error_func
 from .core import XGBScikitLearnBase, xgboost
@@ -19,15 +20,25 @@ from .core import XGBScikitLearnBase, xgboost
 if not xgboost:
     XGBRegressor = make_import_error_func("xgboost")
 else:
+    from xgboost.sklearn import XGBRegressorBase
+
     from .core import wrap_evaluation_matrices
     from .predict import predict
     from .train import train
 
-    class XGBRegressor(XGBScikitLearnBase):
+    class XGBRegressor(XGBScikitLearnBase, XGBRegressorBase):
         """
         Implementation of the scikit-learn API for XGBoost regressor.
         """
 
+        def __init__(
+            self,
+            xgb_model: Union[xgboost.XGBRegressor, xgboost.Booster] = None,
+            **kwargs,
+        ):
+            super().__init__(**kwargs)
+            self._set_model(xgb_model)
+
         def fit(
             self,
             X,
@@ -41,6 +52,9 @@ else:
         ):
             session = kw.pop("session", None)
             run_kwargs = kw.pop("run_kwargs", dict())
+
+            self._n_features_in = X.shape[1]
+
             dtrain, evals = wrap_evaluation_matrices(
                 None,
                 X,
maxframe/learn/contrib/xgboost/train.py
CHANGED

@@ -82,9 +82,8 @@ class XGBTrain(Operator, TileableOperatorMixin):
         inputs = [self.dtrain]
         if self.has_evals_result:
             inputs.extend(e[0] for e in self.evals)
-
-
-        )[0]
+        kws = [{"object_class": Booster}, {}]
+        return self.new_tileables(inputs, kws=kws, evals_result=evals_result)[0]
 
     @property
     def output_limit(self):
@@ -129,4 +128,6 @@ def train(params, dtrain, evals=None, evals_result=None, num_class=None, **kwargs):
         num_class=num_class,
         **kwargs,
     )(evals_result)
-
+    if evals:
+        data.execute(session=session, **run_kwargs)
+    return data
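The new tail of `train()` eagerly executes the training graph whenever `evals` is supplied, so the evaluation history is materialized by the time the call returns (core.py notes that `evals_result` is fetched when `BoosterData.execute()` runs). A hedged sketch, assuming `DMatrix` and `train` are re-exported from the package and that the input frames already exist:

```python
from maxframe.learn.contrib.xgboost import DMatrix, train

# df_train / df_val and their label series are assumed to be
# existing MaxFrame objects; names here are illustrative only
dtrain = DMatrix(df_train, label=train_labels)
dval = DMatrix(df_val, label=val_labels)

evals_result = {}
booster = train(
    {"objective": "binary:logistic"},
    dtrain,
    evals=[(dval, "validation")],
    evals_result=evals_result,
)
# because of the added execute() call, evals_result is populated here
# rather than only after an explicit booster.execute()
```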
maxframe/lib/dtypes_extension/__init__.py
CHANGED

@@ -11,4 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+
+from .dtypes import ArrowDtype, dict_, is_list_dtype, is_map_dtype, list_
maxframe/lib/dtypes_extension/dtypes.py
CHANGED

@@ -11,12 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 from typing import Union
 
-import numpy as np
 import pandas as pd
 import pyarrow as pa
-from pandas.api.extensions import ExtensionDtype
 
 try:
     from pandas import ArrowDtype
@@ -38,6 +37,15 @@ def dict_(
     return pd.ArrowDtype(pa.map_(key_type, item_type))
 
 
+def list_(value_type: Union[pa.DataType, pa.Field]):
+    """
+    Create ``pd.ArrowDtype(pa.ListType)`` instance from a list or field.
+    """
+    if ArrowDtype is None:
+        raise ImportError("ArrowDtype is not supported in current environment")
+    return pd.ArrowDtype(pa.list_(value_type))
+
+
 def is_map_dtype(dtype: ArrowDtype) -> bool:
     """
     Check whether the dtype is a map type.
@@ -47,45 +55,12 @@ def is_map_dtype(dtype: ArrowDtype) -> bool:
     return isinstance(dtype, ArrowDtype) and isinstance(dtype.pyarrow_dtype, pa.MapType)
 
 
-_dtype_mapping = {
-    pd.Int8Dtype(): lambda x: pa.int8(),
-    pd.Int16Dtype(): lambda x: pa.int16(),
-    pd.Int32Dtype(): lambda x: pa.int32(),
-    pd.Int64Dtype(): lambda x: pa.int64(),
-    pd.UInt8Dtype(): lambda x: pa.uint8(),
-    pd.UInt16Dtype(): lambda x: pa.uint16(),
-    pd.UInt32Dtype(): lambda x: pa.uint32(),
-    pd.UInt64Dtype(): lambda x: pa.uint64(),
-    pd.BooleanDtype(): lambda x: pa.bool_(),
-    pd.Float32Dtype(): lambda x: pa.float32(),
-    pd.Float64Dtype(): lambda x: pa.float64(),
-    pd.StringDtype(): lambda x: pa.string(),
-}
-
-
-def infer_arrow_dtype(
-    dtype: Union[np.dtype, pa.DataType, ExtensionDtype]
-) -> Union[ArrowDtype, ExtensionDtype]:
+def is_list_dtype(dtype: ArrowDtype) -> bool:
     """
-
-
-    Parameters
-    ----------
-    dtype : Union[np.dtype, pa.DataType, ExtensionDtype]
-        The dtype instance, can be np.dtype, pa.DataType or ExtensionDtype
-
-    Returns
-    -------
-    Union[pd.ArrowDtype, ExtensionDtype]: The converted pd.ArrowDtype, or ExtensionDtype if conversion failed.
+    Check whether the dtype is a list dtype.
     """
-    if
-
-
-
-
-    if isinstance(dtype, pd.DatetimeTZDtype):
-        return pa.timestamp(dtype.unit, dtype.tz)
-
-    if dtype in _dtype_mapping:
-        return ArrowDtype(_dtype_mapping[dtype](dtype))
-    return dtype
+    if ArrowDtype is None:
+        raise ImportError("ArrowDtype is not supported in current environment")
+    return isinstance(dtype, ArrowDtype) and isinstance(
+        dtype.pyarrow_dtype, pa.ListType
+    )
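The `list_()`/`is_list_dtype()` pair completes the helper set alongside the existing `dict_()`/`is_map_dtype()`. A quick sketch, requiring a pandas build where `ArrowDtype` is available (otherwise both helpers raise `ImportError`):

```python
import pandas as pd
import pyarrow as pa

from maxframe.lib.dtypes_extension import is_list_dtype, list_

dt = list_(pa.int64())                 # pd.ArrowDtype wrapping pa.list_(pa.int64())
s = pd.Series([[1, 2], [3]], dtype=dt)

assert is_list_dtype(dt)
assert not is_list_dtype(pd.Int64Dtype())
```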
maxframe/lib/dtypes_extension/tests/test_dtypes.py
CHANGED

@@ -12,24 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
 
 from ....utils import ARROW_DTYPE_NOT_SUPPORTED
-from ..dtypes import dict_,
+from ..dtypes import dict_, is_list_dtype, is_map_dtype, list_
 
 try:
     from pandas import ArrowDtype
 except:
     ArrowDtype = None
 
-
-@pytest.mark.skipif(
+pytestmark = pytest.mark.skipif(
     ARROW_DTYPE_NOT_SUPPORTED,
     reason="pandas doesn't support ArrowDtype",
 )
+
+
 def test_map_dtype():
     dt = dict_(pa.int64(), pa.string())
     assert is_map_dtype(dt)
@@ -39,30 +39,10 @@ def test_map_dtype():
     assert not is_map_dtype(pd.Int64Dtype)
 
 
-
-
-
-
-
-
-
-        (
-            ArrowDtype(pa.int64()) if ArrowDtype else None,
-            ArrowDtype,
-            pa.int64(),
-        ),  # pd.ArrowDtype
-        (np.dtype("int64"), ArrowDtype, pa.int64()),  # np.dtype
-        (pd.CategoricalDtype(), pd.CategoricalDtype, None),  # pa.DataType
-        (pd.Int64Dtype(), ArrowDtype, pa.int64()),  # pd.ExtensionDtype
-        (
-            pd.DatetimeTZDtype("ns", "Asia/Shanghai"),
-            pa.TimestampType,
-            pa.timestamp("ns", "Asia/Shanghai"),
-        ),
-    ],
-)
-def test_infer_arrow_dtype(input_dtype, expected_type, expected_pa_dtype):
-    result = infer_arrow_dtype(input_dtype)
-    assert isinstance(result, expected_type)
-    if expected_type == ArrowDtype:
-        assert result.pyarrow_dtype == expected_pa_dtype
+def test_list_dtype():
+    dt = list_(pa.int64())
+    assert is_list_dtype(dt)
+
+    dt = pd.ArrowDtype(pa.map_(pa.int64(), pa.string()))
+    assert not is_list_dtype(dt)
+    assert not is_list_dtype(pd.Int64Dtype)
Binary file

maxframe/opcodes.py
CHANGED

@@ -395,6 +395,11 @@ FUSE = 801
 # LLM
 DASHSCOPE_TEXT_GENERATION = 810
 DASHSCOPE_MULTI_MODAL_GENERATION = 811
+MANAGED_TEXT_MODAL_GENERATION = 812
+MANAGED_MULTI_MODAL_GENERATION = 813
+LLM_TEXT_SUMMARIZE_TASK = 814
+LLM_TEXT_TRANSLATE_TASK = 815
+LLM_TEXT_CLASSIFY_TASK = 816
 
 # table like input for tensor
 TABLE_COO = 1003
@@ -575,16 +580,30 @@ DATAFRAME_RESHUFFLE = 10001
 FLATMAP = 10002
 FLATJSON = 10003
 APPLY_CHUNK = 10004
+
 SERIES_DICT_GETITEM = 10005
 SERIES_DICT_SETITEM = 10006
 SERIES_DICT_LENGTH = 10007
 SERIES_DICT_REMOVE = 10008
 SERIES_DICT_CONTAINS = 10009
+SERIES_DICT_FLATTEN = 10010
+
+SERIES_LIST_GETITEM = 10020
+SERIES_LIST_SETITEM = 10021
+SERIES_LIST_CONTAINS = 10022
+SERIES_LIST_LENGTH = 10023
+SERIES_LIST_INSERT = 10024
+SERIES_LIST_EXTEND = 10025
+SERIES_LIST_POP = 10026
+SERIES_LIST_SORT = 10027
+SERIES_LIST_FLATTEN = 10028
 
 # MaxFrame internal operators
 DATAFRAME_PROJECTION_SAME_INDEX_MERGE = 100001
 GROUPBY_AGGR_SAME_INDEX_MERGE = 100002
 DATAFRAME_ILOC_GET_AND_RENAME_ITEM = 100003
+COLLECT_MODEL_RESULT = 100004
+MODEL_DATA_SOURCE = 100005
 
 # fetches
 FETCH_SHUFFLE = 999998
Binary file

maxframe/serialization/core.pyx
CHANGED

@@ -112,7 +112,14 @@ cpdef object load_type(str class_name, object parent_class):
 
     mod_name, cls_name = class_name.rsplit("#", 1)
 
-    cls = importlib.import_module(mod_name)
+    try:
+        cls = importlib.import_module(mod_name)
+    except ImportError as ex:
+        raise ImportError(
+            f"Failed to import {mod_name} when loading "
+            f"class {class_name}, {ex}"
+        ) from None
+
     for sub_cls_name in cls_name.split("."):
         cls = getattr(cls, sub_cls_name)
     _type_cache[class_name] = cls
@@ -122,6 +129,10 @@ cpdef object load_type(str class_name, object parent_class):
     return cls
 
 
+cpdef void clear_type_cache():
+    _type_cache.clear()
+
+
 cdef Serializer get_deserializer(int32_t deserializer_id):
     return _deserializers[deserializer_id]
 
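`clear_type_cache()` empties the module-level `_type_cache` consulted by `load_type()`, forcing class names to be re-resolved on the next deserialization; the updated serializable test below uses it for exactly that. A minimal sketch, assuming `obj` is an existing MaxFrame `Serializable` instance:

```python
from maxframe.serialization import clear_type_cache, deserialize, serialize

header, buffers = serialize(obj)   # obj: any Serializable (assumed to exist)
clear_type_cache()                 # drop cached classes so load_type() re-imports
restored = deserialize(header, buffers)
```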
maxframe/serialization/numpy.py
CHANGED

@@ -70,10 +70,18 @@ class NDArraySerializer(Serializer):
         if dtype_new_order:
             dtype = dtype[dtype_new_order]
         if dtype.hasobject:
-            shape = header["shape"]
-
-
-
+            shape = tuple(header["shape"])
+            if shape == ():
+                val = np.array(subs[0]).reshape(shape)
+            else:
+                # fill empty object array
+                val = np.empty(shape, dtype=dtype)
+                try:
+                    val[(slice(None),) * len(shape)] = subs[0]
+                except ValueError:
+                    val[(slice(None),) * len(shape)] = np.array(
+                        subs[0], dtype=dtype
+                    ).reshape(shape)
         else:
             val = np.ndarray(
                 shape=tuple(header["shape"]),
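The rewritten object-dtype branch pre-allocates with `np.empty()` and slice-assigns, rather than rebuilding with a bare `np.array()` call, so that element objects survive the round trip. A pure-NumPy illustration of why that pattern matters (the names here are illustrative):

```python
import numpy as np

items = [[1, 2], [3]]                 # ragged payload: elements must stay lists
val = np.empty((2,), dtype=object)    # pre-allocate, as the new branch does
val[(slice(None),) * 1] = items       # same slice-assignment form as the diff
assert val.shape == (2,) and val[1] == [3]

# with rectangular payloads NumPy broadcasts the nested list to shape (2, 2)
# and the assignment raises ValueError -- the except branch in the diff then
# rebuilds the array explicitly with np.array(..., dtype=dtype).reshape(shape)
```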
maxframe/serialization/serializables/tests/test_serializable.py
CHANGED

@@ -22,7 +22,7 @@ import pytest
 from ....core import EntityData
 from ....lib.wrapped_pickle import switch_unpickle
 from ....utils import no_default
-from ... import deserialize, serialize
+from ... import clear_type_cache, deserialize, serialize
 from .. import (
     AnyField,
     BoolField,
@@ -202,6 +202,7 @@ def test_serializable(set_is_ci):
 def test_compatible_serializable(set_is_ci):
     global MySimpleSerializable, MySubSerializable
 
+    clear_type_cache()
     old_base, old_sub = MySimpleSerializable, MySubSerializable
 
     try:
@@ -231,13 +232,23 @@ def test_compatible_serializable(set_is_ci):
         my_sub_serializable2 = deserialize(header, buffers)
         assert type(my_sub_serializable) is not type(my_sub_serializable2)
         _assert_serializable_eq(my_sub_serializable, my_sub_serializable2)
+
+        header, buffers = serialize(my_sub_serializable2)
     finally:
         MySimpleSerializable, MySubSerializable = old_base, old_sub
+        MyMidSerializable = None
+        clear_type_cache()
+
+    my_sub_serializable3 = deserialize(header, buffers)
+    assert type(my_sub_serializable2) is not type(my_sub_serializable3)
+    _assert_serializable_eq(my_sub_serializable2, my_sub_serializable3)
 
 
 def _assert_serializable_eq(my_serializable, my_serializable2):
     for field_name, field in my_serializable._FIELDS.items():
-        if not hasattr(my_serializable, field.name):
+        if not hasattr(my_serializable, field.name) or not hasattr(
+            my_serializable2, field.name
+        ):
             continue
         expect_value = getattr(my_serializable, field_name)
         if expect_value is no_default: