maxframe 2.2.0__cp39-cp39-win32.whl → 2.3.0rc1__cp39-cp39-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp39-win32.pyd +0 -0
- maxframe/codegen/core.py +3 -2
- maxframe/codegen/spe/dataframe/merge.py +4 -0
- maxframe/codegen/spe/dataframe/misc.py +2 -0
- maxframe/codegen/spe/dataframe/reduction.py +18 -0
- maxframe/codegen/spe/dataframe/sort.py +9 -1
- maxframe/codegen/spe/dataframe/tests/test_reduction.py +13 -0
- maxframe/codegen/spe/dataframe/tseries.py +9 -0
- maxframe/codegen/spe/learn/contrib/lightgbm.py +4 -3
- maxframe/codegen/spe/tensor/datasource.py +1 -0
- maxframe/config/config.py +3 -0
- maxframe/conftest.py +10 -0
- maxframe/core/base.py +2 -1
- maxframe/core/entity/tileables.py +2 -0
- maxframe/core/graph/core.cp39-win32.pyd +0 -0
- maxframe/core/graph/entity.py +7 -1
- maxframe/core/mode.py +6 -1
- maxframe/dataframe/__init__.py +2 -2
- maxframe/dataframe/arithmetic/__init__.py +4 -0
- maxframe/dataframe/arithmetic/maximum.py +33 -0
- maxframe/dataframe/arithmetic/minimum.py +33 -0
- maxframe/dataframe/core.py +98 -106
- maxframe/dataframe/datasource/core.py +6 -0
- maxframe/dataframe/datasource/direct.py +57 -0
- maxframe/dataframe/datasource/read_csv.py +19 -11
- maxframe/dataframe/datasource/read_odps_query.py +29 -6
- maxframe/dataframe/datasource/read_odps_table.py +32 -10
- maxframe/dataframe/datasource/read_parquet.py +38 -39
- maxframe/dataframe/datastore/__init__.py +6 -0
- maxframe/dataframe/datastore/direct.py +268 -0
- maxframe/dataframe/datastore/to_odps.py +6 -0
- maxframe/dataframe/extensions/flatjson.py +2 -1
- maxframe/dataframe/groupby/__init__.py +5 -1
- maxframe/dataframe/groupby/aggregation.py +10 -6
- maxframe/dataframe/groupby/apply_chunk.py +1 -3
- maxframe/dataframe/groupby/core.py +20 -4
- maxframe/dataframe/indexing/__init__.py +2 -1
- maxframe/dataframe/indexing/insert.py +45 -17
- maxframe/dataframe/merge/__init__.py +3 -0
- maxframe/dataframe/merge/combine.py +244 -0
- maxframe/dataframe/misc/__init__.py +14 -3
- maxframe/dataframe/misc/check_unique.py +41 -10
- maxframe/dataframe/misc/drop.py +31 -0
- maxframe/dataframe/misc/infer_dtypes.py +251 -0
- maxframe/dataframe/misc/map.py +31 -18
- maxframe/dataframe/misc/repeat.py +159 -0
- maxframe/dataframe/misc/tests/test_misc.py +35 -1
- maxframe/dataframe/missing/checkna.py +3 -2
- maxframe/dataframe/reduction/__init__.py +10 -5
- maxframe/dataframe/reduction/aggregation.py +6 -6
- maxframe/dataframe/reduction/argmax.py +7 -4
- maxframe/dataframe/reduction/argmin.py +7 -4
- maxframe/dataframe/reduction/core.py +18 -9
- maxframe/dataframe/reduction/mode.py +144 -0
- maxframe/dataframe/reduction/nunique.py +10 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +12 -0
- maxframe/dataframe/sort/__init__.py +9 -2
- maxframe/dataframe/sort/argsort.py +7 -1
- maxframe/dataframe/sort/core.py +1 -1
- maxframe/dataframe/sort/rank.py +147 -0
- maxframe/dataframe/tseries/__init__.py +19 -0
- maxframe/dataframe/tseries/at_time.py +61 -0
- maxframe/dataframe/tseries/between_time.py +122 -0
- maxframe/dataframe/utils.py +30 -26
- maxframe/learn/contrib/llm/core.py +16 -7
- maxframe/learn/contrib/llm/deploy/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/config.py +221 -0
- maxframe/learn/contrib/llm/deploy/core.py +247 -0
- maxframe/learn/contrib/llm/deploy/framework.py +35 -0
- maxframe/learn/contrib/llm/deploy/loader.py +360 -0
- maxframe/learn/contrib/llm/deploy/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/deploy/tests/test_register_models.py +359 -0
- maxframe/learn/contrib/llm/models/__init__.py +1 -0
- maxframe/learn/contrib/llm/models/dashscope.py +12 -6
- maxframe/learn/contrib/llm/models/managed.py +76 -11
- maxframe/learn/contrib/llm/models/openai.py +72 -0
- maxframe/learn/contrib/llm/tests/__init__.py +13 -0
- maxframe/learn/contrib/llm/tests/test_core.py +34 -0
- maxframe/learn/contrib/llm/tests/test_openai.py +187 -0
- maxframe/learn/contrib/llm/tests/test_text_gen.py +155 -0
- maxframe/learn/contrib/llm/text.py +348 -42
- maxframe/learn/contrib/models.py +4 -1
- maxframe/learn/contrib/xgboost/classifier.py +2 -0
- maxframe/learn/contrib/xgboost/core.py +31 -7
- maxframe/learn/contrib/xgboost/predict.py +4 -2
- maxframe/learn/contrib/xgboost/regressor.py +5 -0
- maxframe/learn/contrib/xgboost/train.py +2 -0
- maxframe/learn/preprocessing/_data/min_max_scaler.py +34 -23
- maxframe/learn/preprocessing/_data/standard_scaler.py +34 -25
- maxframe/learn/utils/__init__.py +1 -0
- maxframe/learn/utils/extmath.py +42 -9
- maxframe/learn/utils/odpsio.py +80 -11
- maxframe/lib/filesystem/_oss_lib/common.py +2 -0
- maxframe/lib/mmh3.cp39-win32.pyd +0 -0
- maxframe/opcodes.py +9 -1
- maxframe/remote/core.py +4 -0
- maxframe/serialization/core.cp39-win32.pyd +0 -0
- maxframe/serialization/tests/test_serial.py +2 -2
- maxframe/tensor/arithmetic/__init__.py +1 -1
- maxframe/tensor/arithmetic/core.py +2 -2
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +0 -9
- maxframe/tensor/core.py +3 -0
- maxframe/tensor/misc/copyto.py +1 -1
- maxframe/tests/test_udf.py +61 -0
- maxframe/tests/test_utils.py +8 -5
- maxframe/udf.py +103 -7
- maxframe/utils.py +61 -8
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/METADATA +1 -2
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/RECORD +113 -90
- maxframe_client/session/task.py +8 -1
- maxframe_client/tests/test_session.py +24 -0
- maxframe/dataframe/arrays.py +0 -864
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/WHEEL +0 -0
- {maxframe-2.2.0.dist-info → maxframe-2.3.0rc1.dist-info}/top_level.txt +0 -0
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
from typing import Union
|
|
16
16
|
|
|
17
|
+
from ...utils.odpsio import register_odps_model
|
|
17
18
|
from ..utils import make_import_error_func
|
|
18
19
|
from .core import XGBScikitLearnBase, xgboost
|
|
19
20
|
|
|
@@ -24,6 +25,7 @@ else:
|
|
|
24
25
|
|
|
25
26
|
from .predict import predict
|
|
26
27
|
|
|
28
|
+
@register_odps_model
|
|
27
29
|
class XGBRegressor(XGBScikitLearnBase, XGBRegressorBase):
|
|
28
30
|
"""
|
|
29
31
|
Implementation of the scikit-learn API for XGBoost regressor.
|
|
@@ -69,6 +71,9 @@ else:
|
|
|
69
71
|
A list of the form [L_1, L_2, ..., L_n], where each L_i is a list
|
|
70
72
|
of group weights on the i-th validation set.
|
|
71
73
|
"""
|
|
74
|
+
if y.ndim == 2:
|
|
75
|
+
kw["num_class"] = y.shape[1]
|
|
76
|
+
kw["output_ndim"] = 2
|
|
72
77
|
super().fit(
|
|
73
78
|
X,
|
|
74
79
|
y,
|
|
@@ -25,6 +25,7 @@ from ....serialization.serializables import (
|
|
|
25
25
|
DictField,
|
|
26
26
|
FieldTypes,
|
|
27
27
|
FunctionField,
|
|
28
|
+
Int16Field,
|
|
28
29
|
Int64Field,
|
|
29
30
|
KeyField,
|
|
30
31
|
ListField,
|
|
@@ -65,6 +66,7 @@ class XGBTrain(ObjectOperator, ObjectOperatorMixin):
|
|
|
65
66
|
num_boost_round = Int64Field("num_boost_round", default=10)
|
|
66
67
|
num_class = Int64Field("num_class", default=None)
|
|
67
68
|
_has_evals_result = BoolField("has_evals_result", default=False)
|
|
69
|
+
output_ndim = Int16Field("output_ndim", default=None)
|
|
68
70
|
|
|
69
71
|
def __init__(self, gpu=None, **kw):
|
|
70
72
|
if kw.get("evals_result") is not None:
|
|
@@ -106,10 +106,11 @@ class MinMaxScaler(TransformerMixin, BaseEstimator):
|
|
|
106
106
|
<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
|
|
107
107
|
"""
|
|
108
108
|
|
|
109
|
-
def __init__(self, feature_range=(0, 1), copy=True, clip=False):
|
|
109
|
+
def __init__(self, feature_range=(0, 1), copy=True, clip=False, validate=True):
|
|
110
110
|
self.feature_range = feature_range
|
|
111
111
|
self.copy = copy
|
|
112
112
|
self.clip = clip
|
|
113
|
+
self.validate = validate
|
|
113
114
|
|
|
114
115
|
def _reset(self): # pragma: no cover
|
|
115
116
|
"""Reset internal data-dependent state of the scaler, if necessary.
|
|
@@ -186,13 +187,14 @@ class MinMaxScaler(TransformerMixin, BaseEstimator):
|
|
|
186
187
|
)
|
|
187
188
|
|
|
188
189
|
first_pass = not hasattr(self, "n_samples_seen_")
|
|
189
|
-
|
|
190
|
-
X
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
190
|
+
if self.validate:
|
|
191
|
+
X = self._validate_data(
|
|
192
|
+
X,
|
|
193
|
+
reset=first_pass,
|
|
194
|
+
estimator=self,
|
|
195
|
+
dtype=FLOAT_DTYPES,
|
|
196
|
+
force_all_finite="allow-nan",
|
|
197
|
+
)
|
|
196
198
|
|
|
197
199
|
if isinstance(X, (DATAFRAME_TYPE, SERIES_TYPE, INDEX_TYPE)):
|
|
198
200
|
data_min = X.min(axis=0)
|
|
@@ -239,13 +241,14 @@ class MinMaxScaler(TransformerMixin, BaseEstimator):
|
|
|
239
241
|
"""
|
|
240
242
|
check_is_fitted(self)
|
|
241
243
|
|
|
242
|
-
|
|
243
|
-
X
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
244
|
+
if self.validate:
|
|
245
|
+
X = self._validate_data(
|
|
246
|
+
X,
|
|
247
|
+
copy=self.copy,
|
|
248
|
+
dtype=FLOAT_DTYPES,
|
|
249
|
+
force_all_finite="allow-nan",
|
|
250
|
+
reset=False,
|
|
251
|
+
)
|
|
249
252
|
|
|
250
253
|
X *= self.scale_
|
|
251
254
|
X += self.min_
|
|
@@ -290,6 +293,7 @@ def minmax_scale(
|
|
|
290
293
|
*,
|
|
291
294
|
axis=0,
|
|
292
295
|
copy=True,
|
|
296
|
+
validate=True,
|
|
293
297
|
execute=False,
|
|
294
298
|
session=None,
|
|
295
299
|
run_kwargs=None
|
|
@@ -368,21 +372,28 @@ def minmax_scale(
|
|
|
368
372
|
""" # noqa
|
|
369
373
|
# Unlike the scaler object, this function allows 1d input.
|
|
370
374
|
# If copy is required, it will be done inside the scaler object.
|
|
371
|
-
|
|
372
|
-
X
|
|
373
|
-
|
|
374
|
-
|
|
375
|
+
if validate:
|
|
376
|
+
X = check_array(
|
|
377
|
+
X,
|
|
378
|
+
copy=False,
|
|
379
|
+
ensure_2d=False,
|
|
380
|
+
dtype=FLOAT_DTYPES,
|
|
381
|
+
force_all_finite="allow-nan",
|
|
382
|
+
)
|
|
383
|
+
original_ndim = X.ndim
|
|
375
384
|
|
|
376
|
-
|
|
377
|
-
|
|
385
|
+
if original_ndim == 1:
|
|
386
|
+
X = X.reshape(X.shape[0], 1)
|
|
387
|
+
else:
|
|
388
|
+
original_ndim = X.ndim
|
|
378
389
|
|
|
379
|
-
s = MinMaxScaler(feature_range=feature_range, copy=copy)
|
|
390
|
+
s = MinMaxScaler(feature_range=feature_range, copy=copy, validate=validate)
|
|
380
391
|
if axis == 0:
|
|
381
392
|
X = s.fit_transform(X)
|
|
382
393
|
else:
|
|
383
394
|
X = s.fit_transform(X.T).T
|
|
384
395
|
|
|
385
|
-
if original_ndim == 1:
|
|
396
|
+
if validate and original_ndim == 1:
|
|
386
397
|
X = X.ravel()
|
|
387
398
|
|
|
388
399
|
if not execute:
|
|
@@ -156,10 +156,11 @@ class StandardScaler(TransformerMixin, BaseEstimator):
|
|
|
156
156
|
[[3. 3.]]
|
|
157
157
|
"""
|
|
158
158
|
|
|
159
|
-
def __init__(self, *, copy=True, with_mean=True, with_std=True):
|
|
159
|
+
def __init__(self, *, copy=True, with_mean=True, with_std=True, validate=True):
|
|
160
160
|
self.with_mean = with_mean
|
|
161
161
|
self.with_std = with_std
|
|
162
162
|
self.copy = copy
|
|
163
|
+
self.validate = validate
|
|
163
164
|
|
|
164
165
|
def _reset(self):
|
|
165
166
|
"""Reset internal data-dependent state of the scaler, if necessary.
|
|
@@ -246,14 +247,15 @@ class StandardScaler(TransformerMixin, BaseEstimator):
|
|
|
246
247
|
Fitted scaler.
|
|
247
248
|
"""
|
|
248
249
|
first_call = not hasattr(self, "n_samples_seen_")
|
|
249
|
-
|
|
250
|
-
X
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
250
|
+
if self.validate:
|
|
251
|
+
X = self._validate_data(
|
|
252
|
+
X,
|
|
253
|
+
accept_sparse=("csr", "csc"),
|
|
254
|
+
dtype=FLOAT_DTYPES,
|
|
255
|
+
force_all_finite="allow-nan",
|
|
256
|
+
reset=first_call,
|
|
257
|
+
)
|
|
258
|
+
n_features = X.shape[1] if X.ndim == 2 else 1
|
|
257
259
|
|
|
258
260
|
if sample_weight is not None:
|
|
259
261
|
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
|
|
@@ -267,7 +269,9 @@ class StandardScaler(TransformerMixin, BaseEstimator):
|
|
|
267
269
|
# incr_mean_variance_axis and _incremental_variance_axis
|
|
268
270
|
dtype = np.int64 if sample_weight is None else X.dtype
|
|
269
271
|
if not hasattr(self, "n_samples_seen_"):
|
|
270
|
-
self.n_samples_seen_ =
|
|
272
|
+
self.n_samples_seen_ = (
|
|
273
|
+
mt.zeros(n_features, dtype=dtype) if X.ndim == 2 else 0
|
|
274
|
+
)
|
|
271
275
|
# elif np.size(self.n_samples_seen_) == 1:
|
|
272
276
|
# self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1])
|
|
273
277
|
# self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False)
|
|
@@ -309,9 +313,11 @@ class StandardScaler(TransformerMixin, BaseEstimator):
|
|
|
309
313
|
constant_mask = _is_constant_feature(
|
|
310
314
|
self.var_, self.mean_, self.n_samples_seen_
|
|
311
315
|
)
|
|
312
|
-
self.scale_ =
|
|
313
|
-
|
|
314
|
-
|
|
316
|
+
self.scale_ = mt.sqrt(self.var_)
|
|
317
|
+
if self.validate:
|
|
318
|
+
self.scale_ = _handle_zeros_in_scale(
|
|
319
|
+
self.scale_, copy=False, constant_mask=constant_mask
|
|
320
|
+
)
|
|
315
321
|
else:
|
|
316
322
|
self.scale_ = None
|
|
317
323
|
|
|
@@ -337,14 +343,15 @@ class StandardScaler(TransformerMixin, BaseEstimator):
|
|
|
337
343
|
check_is_fitted(self)
|
|
338
344
|
|
|
339
345
|
copy = copy if copy is not None else self.copy
|
|
340
|
-
|
|
341
|
-
X
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
346
|
+
if self.validate:
|
|
347
|
+
X = self._validate_data(
|
|
348
|
+
X,
|
|
349
|
+
reset=False,
|
|
350
|
+
accept_sparse="csr",
|
|
351
|
+
copy=copy,
|
|
352
|
+
dtype=FLOAT_DTYPES,
|
|
353
|
+
force_all_finite="allow-nan",
|
|
354
|
+
)
|
|
348
355
|
|
|
349
356
|
if sparse.issparse(X):
|
|
350
357
|
raise NotImplementedError("Scaling on sparse tensors is not supported")
|
|
@@ -397,7 +404,7 @@ class StandardScaler(TransformerMixin, BaseEstimator):
|
|
|
397
404
|
return X
|
|
398
405
|
|
|
399
406
|
|
|
400
|
-
def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True):
|
|
407
|
+
def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True, validate=True):
|
|
401
408
|
"""Standardize a dataset along any axis.
|
|
402
409
|
|
|
403
410
|
Center to the mean and component wise scale to unit variance.
|
|
@@ -488,16 +495,18 @@ def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True):
|
|
|
488
495
|
X = mt.tensor(X)
|
|
489
496
|
|
|
490
497
|
ndim = X.ndim
|
|
491
|
-
if ndim == 1:
|
|
498
|
+
if validate and ndim == 1:
|
|
492
499
|
X = X.reshape((X.shape[0], 1))
|
|
493
500
|
if axis == 1:
|
|
494
501
|
X = X.T
|
|
495
502
|
|
|
496
|
-
scaler = StandardScaler(
|
|
503
|
+
scaler = StandardScaler(
|
|
504
|
+
with_mean=with_mean, with_std=with_std, copy=copy, validate=validate
|
|
505
|
+
)
|
|
497
506
|
transformed = scaler.fit_transform(X)
|
|
498
507
|
|
|
499
508
|
if axis == 1:
|
|
500
509
|
transformed = transformed.T
|
|
501
|
-
if ndim == 1:
|
|
510
|
+
if validate and ndim == 1:
|
|
502
511
|
transformed = transformed.reshape(transformed.shape[0])
|
|
503
512
|
return transformed
|
maxframe/learn/utils/__init__.py
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
from .core import convert_to_tensor_or_dataframe
|
|
16
16
|
from .multiclass import check_classification_targets
|
|
17
|
+
from .odpsio import read_odps_model
|
|
17
18
|
from .shuffle import shuffle
|
|
18
19
|
from .sparsefuncs import count_nonzero
|
|
19
20
|
from .validation import check_array, check_consistent_length
|
maxframe/learn/utils/extmath.py
CHANGED
|
@@ -15,6 +15,9 @@
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
|
|
17
17
|
from ... import tensor as mt
|
|
18
|
+
from ...core import ENTITY_TYPE
|
|
19
|
+
from ...dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
|
|
20
|
+
from ...tensor.datasource import TensorZeros
|
|
18
21
|
|
|
19
22
|
|
|
20
23
|
# Use at least float64 for the accumulating functions to avoid precision issue
|
|
@@ -42,7 +45,11 @@ def _safe_accumulator_op(op, x, *args, **kwargs):
|
|
|
42
45
|
-------
|
|
43
46
|
result : The output of the accumulator function passed to this function
|
|
44
47
|
"""
|
|
45
|
-
if
|
|
48
|
+
if (
|
|
49
|
+
hasattr(x, "dtype")
|
|
50
|
+
and np.issubdtype(x.dtype, np.floating)
|
|
51
|
+
and x.dtype.itemsize < 8
|
|
52
|
+
):
|
|
46
53
|
result = op(x, *args, **kwargs, dtype=np.float64)
|
|
47
54
|
else:
|
|
48
55
|
result = op(x, *args, **kwargs)
|
|
@@ -117,16 +124,31 @@ def _incremental_mean_and_var(
|
|
|
117
124
|
`utils.sparsefuncs.incr_mean_variance_axis` and
|
|
118
125
|
`utils.sparsefuncs_fast.incr_mean_variance_axis0`
|
|
119
126
|
"""
|
|
127
|
+
has_last_sample = isinstance(last_sample_count, ENTITY_TYPE) and not isinstance(
|
|
128
|
+
last_sample_count.op, TensorZeros
|
|
129
|
+
)
|
|
130
|
+
is_df_type = isinstance(X, (DATAFRAME_TYPE, SERIES_TYPE))
|
|
131
|
+
|
|
120
132
|
# old = stats until now
|
|
121
133
|
# new = the current increment
|
|
122
134
|
# updated = the aggregated stats
|
|
123
|
-
last_sum = last_mean * last_sample_count
|
|
135
|
+
last_sum = last_mean * last_sample_count if has_last_sample else 0
|
|
124
136
|
X_nan_mask = mt.isnan(X)
|
|
125
137
|
# if mt.any(X_nan_mask):
|
|
126
138
|
# sum_op = mt.nansum
|
|
127
139
|
# else:
|
|
128
140
|
# sum_op = mt.sum
|
|
129
|
-
|
|
141
|
+
|
|
142
|
+
def df_sum(val, **kw):
|
|
143
|
+
if "dtype" in kw:
|
|
144
|
+
val = val.astype(kw.pop("dtype"))
|
|
145
|
+
return val.sum(**kw)
|
|
146
|
+
|
|
147
|
+
if is_df_type:
|
|
148
|
+
sum_op = df_sum
|
|
149
|
+
else:
|
|
150
|
+
sum_op = mt.nansum
|
|
151
|
+
|
|
130
152
|
if sample_weight is not None:
|
|
131
153
|
# equivalent to np.nansum(X * sample_weight, axis=0)
|
|
132
154
|
# safer because np.float64(X*W) != np.float64(X)*np.float64(W)
|
|
@@ -138,10 +160,16 @@ def _incremental_mean_and_var(
|
|
|
138
160
|
)
|
|
139
161
|
else:
|
|
140
162
|
new_sum = _safe_accumulator_op(sum_op, X, axis=0)
|
|
141
|
-
|
|
142
|
-
|
|
163
|
+
if is_df_type:
|
|
164
|
+
new_sample_count = X.count()
|
|
165
|
+
else:
|
|
166
|
+
n_samples = X.shape[0]
|
|
167
|
+
new_sample_count = n_samples - mt.sum(X_nan_mask, axis=0)
|
|
143
168
|
|
|
144
|
-
|
|
169
|
+
if not has_last_sample:
|
|
170
|
+
updated_sample_count = new_sample_count
|
|
171
|
+
else:
|
|
172
|
+
updated_sample_count = last_sample_count + new_sample_count
|
|
145
173
|
|
|
146
174
|
updated_mean = (last_sum + new_sum) / updated_sample_count
|
|
147
175
|
|
|
@@ -170,7 +198,9 @@ def _incremental_mean_and_var(
|
|
|
170
198
|
# and recommendations", by Chan, Golub, and LeVeque.
|
|
171
199
|
new_unnormalized_variance -= correction**2 / new_sample_count
|
|
172
200
|
|
|
173
|
-
last_unnormalized_variance =
|
|
201
|
+
last_unnormalized_variance = (
|
|
202
|
+
last_variance * last_sample_count if has_last_sample else 0
|
|
203
|
+
)
|
|
174
204
|
|
|
175
205
|
with mt.errstate(divide="ignore", invalid="ignore"):
|
|
176
206
|
last_over_new_count = last_sample_count / new_sample_count
|
|
@@ -182,8 +212,11 @@ def _incremental_mean_and_var(
|
|
|
182
212
|
* (last_sum / last_over_new_count - new_sum) ** 2
|
|
183
213
|
)
|
|
184
214
|
|
|
185
|
-
|
|
186
|
-
|
|
215
|
+
if not has_last_sample:
|
|
216
|
+
updated_unnormalized_variance = new_unnormalized_variance
|
|
217
|
+
else:
|
|
218
|
+
zeros = last_sample_count == 0
|
|
219
|
+
updated_unnormalized_variance[zeros] = new_unnormalized_variance[zeros]
|
|
187
220
|
updated_variance = updated_unnormalized_variance / updated_sample_count
|
|
188
221
|
|
|
189
222
|
return updated_mean, updated_variance, updated_sample_count
|
maxframe/learn/utils/odpsio.py
CHANGED
|
@@ -12,7 +12,9 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from typing import Any, Dict, List, NamedTuple, Optional
|
|
15
|
+
from typing import Any, Dict, List, NamedTuple, Optional, Set
|
|
16
|
+
|
|
17
|
+
from odps import ODPS
|
|
16
18
|
|
|
17
19
|
from ... import opcodes
|
|
18
20
|
from ...core import ENTITY_TYPE, EntityData, OutputType
|
|
@@ -27,6 +29,36 @@ from ...serialization.serializables import (
|
|
|
27
29
|
from ...utils import find_objects, replace_objects
|
|
28
30
|
from ..core import LearnOperatorMixin
|
|
29
31
|
|
|
32
|
+
_odps_model_classes: Set["ODPSModelMixin"] = set()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def register_odps_model(model_cls: "ODPSModelMixin"):
|
|
36
|
+
_odps_model_classes.add(model_cls)
|
|
37
|
+
return model_cls
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ReadODPSModel(ObjectOperator, LearnOperatorMixin):
|
|
41
|
+
_op_type_ = opcodes.READ_ODPS_MODEL
|
|
42
|
+
|
|
43
|
+
model_name = StringField("model_name", default=None)
|
|
44
|
+
model_version = StringField("model_version", default=None)
|
|
45
|
+
format = StringField("format", default=None)
|
|
46
|
+
location = StringField("location", default=None)
|
|
47
|
+
storage_options = DictField("storage_options", default=None)
|
|
48
|
+
|
|
49
|
+
def has_custom_code(self) -> bool:
|
|
50
|
+
return True
|
|
51
|
+
|
|
52
|
+
def __call__(self):
|
|
53
|
+
if not self.format.startswith("BOOSTED_TREE_"):
|
|
54
|
+
# todo support more model formats
|
|
55
|
+
raise ValueError("Only support boosted tree format")
|
|
56
|
+
for model_cls in _odps_model_classes:
|
|
57
|
+
ret = model_cls._build_odps_source_model(self)
|
|
58
|
+
if ret is not None:
|
|
59
|
+
return ret
|
|
60
|
+
raise ValueError(f"Model {self.model_name} not supported")
|
|
61
|
+
|
|
30
62
|
|
|
31
63
|
class ToODPSModel(ObjectOperator, LearnOperatorMixin):
|
|
32
64
|
_op_type_ = opcodes.TO_ODPS_MODEL
|
|
@@ -74,17 +106,21 @@ class ToODPSModel(ObjectOperator, LearnOperatorMixin):
|
|
|
74
106
|
return self.new_tileable(inputs, shape=())
|
|
75
107
|
|
|
76
108
|
|
|
77
|
-
class
|
|
109
|
+
class ODPSModelMixin:
|
|
78
110
|
class ODPSModelInfo(NamedTuple):
|
|
79
111
|
model_format: str
|
|
80
112
|
model_params: Any
|
|
81
113
|
|
|
114
|
+
@classmethod
|
|
115
|
+
def _build_odps_source_model(cls, op: ReadODPSModel) -> Any:
|
|
116
|
+
return None
|
|
117
|
+
|
|
82
118
|
def _get_odps_model_info(self) -> ODPSModelInfo:
|
|
83
119
|
raise NotImplementedError
|
|
84
120
|
|
|
85
121
|
def to_odps_model(
|
|
86
122
|
self,
|
|
87
|
-
model_name: str
|
|
123
|
+
model_name: str,
|
|
88
124
|
model_version: str = None,
|
|
89
125
|
schema: str = None,
|
|
90
126
|
project: str = None,
|
|
@@ -167,14 +203,7 @@ class ToODPSModelMixin:
|
|
|
167
203
|
... "role_arn": "acs:ram::<user_id>:role/aliyunodpsdefaultrole"
|
|
168
204
|
... }).execute()
|
|
169
205
|
"""
|
|
170
|
-
|
|
171
|
-
if project and not schema:
|
|
172
|
-
schema = "default"
|
|
173
|
-
if schema:
|
|
174
|
-
model_name = f"{schema}.{model_name}"
|
|
175
|
-
if project:
|
|
176
|
-
model_name = f"{project}.{model_name}"
|
|
177
|
-
|
|
206
|
+
model_name = _build_odps_model_name(model_name, schema, project)
|
|
178
207
|
model_info = self._get_odps_model_info()
|
|
179
208
|
|
|
180
209
|
op = ToODPSModel(
|
|
@@ -191,3 +220,43 @@ class ToODPSModelMixin:
|
|
|
191
220
|
storage_options=storage_options,
|
|
192
221
|
)
|
|
193
222
|
return op(getattr(self, "training_info_"), model_info.model_params)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _build_odps_model_name(model_name: str, schema: str, project: str = None):
|
|
226
|
+
if "." not in model_name:
|
|
227
|
+
if project and not schema:
|
|
228
|
+
schema = "default"
|
|
229
|
+
if schema:
|
|
230
|
+
model_name = f"{schema}.{model_name}"
|
|
231
|
+
if project:
|
|
232
|
+
model_name = f"{project}.{model_name}"
|
|
233
|
+
return model_name
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def read_odps_model(
|
|
237
|
+
model_name: str,
|
|
238
|
+
schema: str = None,
|
|
239
|
+
project: str = None,
|
|
240
|
+
model_version: str = None,
|
|
241
|
+
odps_entry: ODPS = None,
|
|
242
|
+
):
|
|
243
|
+
odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
|
|
244
|
+
if not hasattr(odps_entry, "get_model"):
|
|
245
|
+
raise RuntimeError("Need to install pyodps>=0.11.5 to use read_odps_model")
|
|
246
|
+
|
|
247
|
+
model_obj = odps_entry.get_model(model_name, project, schema)
|
|
248
|
+
if model_version:
|
|
249
|
+
model_obj = model_obj.versions[model_version]
|
|
250
|
+
# check if model exists
|
|
251
|
+
model_obj.reload()
|
|
252
|
+
|
|
253
|
+
full_model_name = _build_odps_model_name(model_name, schema, project)
|
|
254
|
+
location = model_obj.path
|
|
255
|
+
format = model_obj.type.value
|
|
256
|
+
op = ReadODPSModel(
|
|
257
|
+
model_name=full_model_name,
|
|
258
|
+
model_version=model_version,
|
|
259
|
+
location=location,
|
|
260
|
+
format=format,
|
|
261
|
+
)
|
|
262
|
+
return op()
|
maxframe/lib/mmh3.cp39-win32.pyd
CHANGED
|
Binary file
|
maxframe/opcodes.py
CHANGED
|
@@ -271,9 +271,9 @@ SEM = 352
|
|
|
271
271
|
STR_CONCAT = 353
|
|
272
272
|
MAD = 354
|
|
273
273
|
MEDIAN = 355
|
|
274
|
-
RANK = 356
|
|
275
274
|
IDXMAX = 357
|
|
276
275
|
IDXMIN = 358
|
|
276
|
+
MODE = 359
|
|
277
277
|
|
|
278
278
|
# tensor operator
|
|
279
279
|
RESHAPE = 401
|
|
@@ -398,6 +398,9 @@ REORDER_LEVELS = 747
|
|
|
398
398
|
DATAFRAME_COMPARE = 748
|
|
399
399
|
DROPLEVEL = 749
|
|
400
400
|
DATAFRAME_UPDATE = 750
|
|
401
|
+
DATAFRAME_COMBINE = 751
|
|
402
|
+
DATAFRAME_INFER_DTYPES = 752
|
|
403
|
+
BETWEEN_TIME = 753
|
|
401
404
|
|
|
402
405
|
FUSE = 801
|
|
403
406
|
|
|
@@ -409,6 +412,9 @@ MANAGED_MULTI_MODAL_GENERATION = 813
|
|
|
409
412
|
LLM_TEXT_SUMMARIZE_TASK = 814
|
|
410
413
|
LLM_TEXT_TRANSLATE_TASK = 815
|
|
411
414
|
LLM_TEXT_CLASSIFY_TASK = 816
|
|
415
|
+
LLM_TEXT_EXTRACT_TASK = 817
|
|
416
|
+
LLM_TEXT_EMBEDDING_TASK = 818
|
|
417
|
+
OPENAI_COMPATIBLE_TEXT_GENERATION = 819
|
|
412
418
|
|
|
413
419
|
# table like input for tensor
|
|
414
420
|
TABLE_COO = 1003
|
|
@@ -456,6 +462,7 @@ PSRS_SORT_REGULAR_SAMPLE = 2040
|
|
|
456
462
|
PSRS_CONCAT_PIVOT = 2041
|
|
457
463
|
PSRS_SHUFFLE = 2042
|
|
458
464
|
PSRS_ALIGN = 2043
|
|
465
|
+
PSRS_RANK_SHUFFLE = 2044
|
|
459
466
|
# partition
|
|
460
467
|
CALC_PARTITIONS_INFO = 2046
|
|
461
468
|
PARTITION_MERGED = 2047
|
|
@@ -463,6 +470,7 @@ PARTITION_MERGED = 2047
|
|
|
463
470
|
# dataframe sort
|
|
464
471
|
SORT_VALUES = 2050
|
|
465
472
|
SORT_INDEX = 2051
|
|
473
|
+
RANK = 2052
|
|
466
474
|
|
|
467
475
|
# window
|
|
468
476
|
ROLLING_AGG = 2060
|
maxframe/remote/core.py
CHANGED
|
@@ -27,6 +27,7 @@ from ..serialization.serializables import (
|
|
|
27
27
|
ListField,
|
|
28
28
|
)
|
|
29
29
|
from ..tensor.core import TENSOR_TYPE
|
|
30
|
+
from ..typing_ import TileableType
|
|
30
31
|
from ..udf import BuiltinFunction
|
|
31
32
|
from ..utils import find_objects, replace_objects
|
|
32
33
|
|
|
@@ -59,6 +60,9 @@ class RemoteFunction(ObjectOperatorMixin, ObjectOperator):
|
|
|
59
60
|
def has_custom_code(self) -> bool:
|
|
60
61
|
return not isinstance(self.function, BuiltinFunction)
|
|
61
62
|
|
|
63
|
+
def check_inputs(self, inputs: List[TileableType]):
|
|
64
|
+
return
|
|
65
|
+
|
|
62
66
|
@classmethod
|
|
63
67
|
def _set_inputs(cls, op: "RemoteFunction", inputs: List[EntityData]):
|
|
64
68
|
raw_inputs = getattr(op, "_inputs", None)
|
|
Binary file
|
|
@@ -239,11 +239,11 @@ def test_pandas():
|
|
|
239
239
|
@pytest.mark.skipif(_arrow_dtype_supported, reason="pandas doesn't support ArrowDtype")
|
|
240
240
|
def test_fake_arrow_dtype_serde():
|
|
241
241
|
serializer = DtypeSerializer()
|
|
242
|
-
payload, data,
|
|
242
|
+
payload, data, is_leaf = serializer.serial(
|
|
243
243
|
FakeArrowDtype(pa.map_(pa.int64(), pa.string())), dict()
|
|
244
244
|
)
|
|
245
245
|
|
|
246
|
-
assert
|
|
246
|
+
assert is_leaf
|
|
247
247
|
assert data == []
|
|
248
248
|
assert payload == ["PA", "map<int64, string>"]
|
|
249
249
|
new_dtype = serializer.deserial(payload, dict(), list())
|
|
@@ -415,8 +415,8 @@ class TensorOutBinOp(TensorOperator, TensorElementWiseWithInputs):
|
|
|
415
415
|
dtype = [r.dtype for r in self._fun(np.empty(1, dtype=x.dtype))]
|
|
416
416
|
|
|
417
417
|
out = out or (None, None)
|
|
418
|
-
out1 = out1
|
|
419
|
-
out2 = out2
|
|
418
|
+
out1 = out1 if out1 is not None else out[0]
|
|
419
|
+
out2 = out2 if out2 is not None else out[1]
|
|
420
420
|
x, out1, out2, where = self._process_inputs(x, out1, out2, where)
|
|
421
421
|
shape = x.shape
|
|
422
422
|
order1 = self._calc_order(x, out1)
|
|
@@ -16,7 +16,6 @@ import numpy as np
|
|
|
16
16
|
import pytest
|
|
17
17
|
import scipy.sparse as sps
|
|
18
18
|
|
|
19
|
-
from ....core import enter_mode
|
|
20
19
|
from ....utils import collect_leaf_operators
|
|
21
20
|
from ...core import SparseTensor, Tensor
|
|
22
21
|
from ...datasource import array, empty, ones, tensor
|
|
@@ -391,14 +390,6 @@ def test_get_set_real():
|
|
|
391
390
|
a.real = [2, 4]
|
|
392
391
|
|
|
393
392
|
|
|
394
|
-
def test_build_mode():
|
|
395
|
-
t1 = ones((2, 3), chunk_size=2)
|
|
396
|
-
assert t1 == 2
|
|
397
|
-
|
|
398
|
-
with enter_mode(build=True):
|
|
399
|
-
assert t1 != 2
|
|
400
|
-
|
|
401
|
-
|
|
402
393
|
def test_unary_op_func_name():
|
|
403
394
|
# make sure all the unary op has defined the func name.
|
|
404
395
|
|
maxframe/tensor/core.py
CHANGED
maxframe/tensor/misc/copyto.py
CHANGED