maxframe 0.1.0b4__cp37-cp37m-win32.whl → 1.0.0__cp37-cp37m-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp37-win32.pyd +0 -0
- maxframe/codegen.py +56 -5
- maxframe/config/config.py +78 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +58 -14
- maxframe/core/__init__.py +2 -16
- maxframe/core/entity/__init__.py +1 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +46 -45
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp37-win32.pyd +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +2 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +7 -33
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
- maxframe/dataframe/core.py +58 -12
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +120 -24
- maxframe/dataframe/datasource/read_odps_table.py +9 -4
- maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +28 -0
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +317 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
- maxframe/dataframe/groupby/transform.py +5 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +5 -28
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +237 -3
- maxframe/dataframe/merge/tests/test_merge.py +126 -1
- maxframe/dataframe/misc/__init__.py +4 -0
- maxframe/dataframe/misc/apply.py +6 -11
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +8 -8
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/tests/test_misc.py +93 -1
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/plotting/core.py +2 -2
- maxframe/dataframe/reduction/core.py +4 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +13 -19
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/utils.py +33 -11
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/io/__init__.py +13 -0
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
- maxframe/{odpsio → io/odpsio}/schema.py +38 -16
- maxframe/io/odpsio/tableio.py +719 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +110 -0
- maxframe/learn/contrib/xgboost/core.py +241 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
- maxframe/learn/contrib/xgboost/predict.py +121 -0
- maxframe/learn/contrib/xgboost/regressor.py +71 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +132 -0
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/mmh3.cp37-win32.pyd +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/opcodes.py +11 -0
- maxframe/protocol.py +154 -27
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp37-win32.pyd +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +64 -0
- maxframe/serialization/core.pyx +67 -26
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +52 -17
- maxframe/serialization/serializables/core.py +180 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +54 -5
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +37 -2
- maxframe/tensor/__init__.py +81 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +7 -2
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/scalar.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +4 -0
- maxframe/tensor/misc/atleast_1d.py +72 -0
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/misc/unique.py +205 -0
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +2 -1
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +17 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +62 -3
- maxframe/utils.py +112 -86
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +4 -4
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
- maxframe_client/__init__.py +0 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +123 -54
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +223 -40
- maxframe_client/session/task.py +108 -80
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +136 -8
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -300
- maxframe/odpsio/volumeio.py +0 -95
- maxframe_client/clients/spe.py +0 -104
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +0 -0
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
from ....tensor import argmax, transpose
|
|
18
|
+
from ....tensor.merge.vstack import _vstack
|
|
19
|
+
from ..utils import make_import_error_func
|
|
20
|
+
from .core import XGBScikitLearnBase, xgboost
|
|
21
|
+
|
|
22
|
+
if not xgboost:
    # xgboost could not be imported: expose the object produced by
    # make_import_error_func under the classifier's name instead.
    XGBClassifier = make_import_error_func("xgboost")
else:
    from xgboost.sklearn import XGBClassifierBase

    from .core import wrap_evaluation_matrices
    from .predict import predict
    from .train import train

    class XGBClassifier(XGBScikitLearnBase, XGBClassifierBase):
        """
        Implementation of the scikit-learn API for XGBoost classification.
        """

        def fit(
            self,
            X,
            y,
            sample_weight=None,
            base_margin=None,
            eval_set=None,
            sample_weight_eval_set=None,
            base_margin_eval_set=None,
            num_class=None,
            **kw,
        ):
            """
            Train the classifier and keep the resulting booster on the instance.

            Parameters
            ----------
            X, y
                Training features and labels, wrapped into a DMatrix.
            sample_weight, base_margin
                Optional per-row metadata for the training matrix.
            eval_set, sample_weight_eval_set, base_margin_eval_set
                Optional validation sets and their per-set metadata.
            num_class
                Number of classes. Values above 2 select the
                ``multi:softprob`` objective, anything else selects
                ``binary:logistic``. Falls back to 1 when falsy.
            **kw
                Only ``session`` and ``run_kwargs`` are read; both are
                forwarded to ``train``. Other entries are not used here.

            Returns
            -------
            self
            """
            session = kw.pop("session", None)
            run_kwargs = kw.pop("run_kwargs", dict())

            # The first positional argument is the ``missing`` marker.
            dtrain, evals = wrap_evaluation_matrices(
                None,
                X,
                y,
                sample_weight,
                base_margin,
                eval_set,
                sample_weight_eval_set,
                base_margin_eval_set,
            )

            params = self.get_xgb_params()
            self.n_classes_ = num_class or 1
            if self.n_classes_ > 2:
                params.update(
                    objective="multi:softprob", num_class=self.n_classes_
                )
            else:
                params["objective"] = "binary:logistic"

            # train() receives this dict so it can be filled with metrics.
            self.evals_result_ = dict()
            self._Booster = train(
                params,
                dtrain,
                num_boost_round=self.get_num_boosting_rounds(),
                evals=evals,
                evals_result=self.evals_result_,
                num_class=num_class,
                session=session,
                run_kwargs=run_kwargs,
            )
            return self

        def predict(self, data, **kw):
            """
            Predict class labels for ``data``.

            Probabilities with more than one dimension are reduced with
            ``argmax`` along axis 1; one-dimensional probabilities are
            thresholded at 0.5 and cast to ``int64``.
            """
            prob = self.predict_proba(data, flag=True, **kw)
            if prob.ndim > 1:
                return argmax(prob, axis=1)
            return (prob > 0.5).astype(np.int64)

        def predict_proba(self, data, ntree_limit=None, flag=False, **kw):
            """
            Predict class probabilities for ``data``.

            Raises
            ------
            NotImplementedError
                If ``ntree_limit`` is given.
            """
            if ntree_limit is not None:
                raise NotImplementedError("ntree_limit is not currently supported")

            prediction = predict(self.get_booster(), data, flag=flag, **kw)

            if len(prediction.shape) == 2:
                n_columns = prediction.shape[1]
                if n_columns == self.n_classes_:
                    # multi-class
                    return prediction
                if self.n_classes_ == 2 and n_columns >= self.n_classes_:
                    # multi-label
                    return prediction

            # binary logistic function: build the (P(class 0), P(class 1)) pair
            classone_probs = prediction
            classzero_probs = 1.0 - classone_probs
            stacked = _vstack((classzero_probs, classone_probs))
            return transpose(stacked)

        @property
        def classes_(self) -> np.ndarray:
            """Class labels ``0 .. n_classes_ - 1`` as a numpy array."""
            return np.arange(self.n_classes_)
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
import xgboost
|
|
19
|
+
except ImportError:
|
|
20
|
+
xgboost = None
|
|
21
|
+
|
|
22
|
+
from ...core import Model, ModelData
|
|
23
|
+
from .dmatrix import DMatrix
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class BoosterData(ModelData):
    """
    Model data for a trained booster that also carries evaluation results.
    """

    __slots__ = ("_evals_result",)

    # Dict filled with evaluation results once the data has been executed.
    _evals_result: Dict

    def __init__(self, *args, evals_result=None, **kwargs):
        super().__init__(*args, **kwargs)
        if evals_result is None:
            evals_result = dict()
        # A caller-supplied dict is kept by reference and updated in place.
        self._evals_result = evals_result

    def execute(self, session=None, **kw):
        """
        Execute the booster and pull the evaluation results alongside it.

        The results are fetched only when the operator reports it has them
        and this object is the operator's first output.
        """
        # The evals_result should be fetched when BoosterData.execute() is called.
        result = super().execute(session=session, **kw)
        is_primary_output = (
            self.op.has_evals_result and self.key == self.op.outputs[0].key
        )
        if is_primary_output:
            fetched = self.op.outputs[1].fetch(session=session)
            self._evals_result.update(fetched)
        return result

    def predict(
        self,
        data,
        output_margin=False,
        pred_leaf=False,
        pred_contribs=False,
        approx_contribs=False,
        pred_interactions=False,
        validate_features=True,
        training=False,
        iteration_range=None,
        strict_shape=False,
    ):
        """
        Predict ``data`` with this booster.

        Thin wrapper that forwards every argument unchanged to the
        module-level ``predict`` function of the sibling ``predict`` module.
        """
        # Imported here rather than at module level; the predict module
        # itself imports BoosterData from this module.
        from .predict import predict as _predict

        options = dict(
            output_margin=output_margin,
            pred_leaf=pred_leaf,
            pred_contribs=pred_contribs,
            approx_contribs=approx_contribs,
            pred_interactions=pred_interactions,
            validate_features=validate_features,
            training=training,
            iteration_range=iteration_range,
            strict_shape=strict_shape,
        )
        return _predict(self, data, **options)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class Booster(Model):
    """Booster model type; adds nothing on top of ``Model``."""
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
if not xgboost:
    # Without xgboost there is no XGBModel to derive from.
    XGBScikitLearnBase = None
else:

    class XGBScikitLearnBase(xgboost.XGBModel):
        """
        Base class for implementing scikit-learn interface
        """

        def fit(
            self,
            X,
            y,
            sample_weights=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kw,
        ):
            """
            Fit the model. Note that fit() is an eager-execution
            API. The call will be blocked until training finished.

            Subclasses must override this method; the base implementation
            always raises ``NotImplementedError``.

            Parameters
            ----------
            X : array_like
                Feature matrix
            y : array_like
                Labels
            sample_weights : array_like
                instance weights
            eval_set : list, optional
                A list of (X, y) tuple pairs to use as validation sets, for which
                metrics will be computed.
                Validation metrics will help us track the performance of the model.
            sample_weight_eval_set : list, optional
                A list of the form [L_1, L_2, ..., L_n], where each L_i is a list
                of group weights on the i-th validation set.
            """
            raise NotImplementedError

        def predict(self, data, **kw):
            """
            Predict with `data`.

            Subclasses must override this method; the base implementation
            always raises ``NotImplementedError``.

            Parameters
            ----------
            data: data that can be used to perform prediction

            Returns
            -------
            prediction : maxframe.tensor.Tensor
            """
            raise NotImplementedError

        def evals_result(self, **kw) -> Dict:
            """Return the evaluation results.

            If **eval_set** is passed to the :py:meth:`fit` function, you can call
            ``evals_result()`` to get evaluation results for all passed **eval_sets**.
            When **eval_metric** is also passed to the :py:meth:`fit` function, the
            **evals_result** will contain the **eval_metrics** passed to the
            :py:meth:`fit` function.

            The returned evaluation result is a dictionary:

            .. code-block:: python

                {'validation_0': {'logloss': ['0.604835', '0.531479']},
                 'validation_1': {'logloss': ['0.41965', '0.17686']}}

            Note that evals_result() will be blocked until the train is finished.

            ``session`` and ``run_kwargs`` may be passed as keyword arguments;
            they are used when the booster has to be executed first.

            Returns
            -------
            evals_result

            """
            result = super().evals_result()
            # Execution is only needed when the booster has evaluation
            # results that have not been collected yet.
            needs_execution = (
                self._Booster.op.has_evals_result and len(result) == 0
            )
            if not needs_execution:
                return result

            session = kw.pop("session", None)
            run_kwargs = kw.pop("run_kwargs", dict())
            self._Booster.execute(session=session, **run_kwargs)
            return super().evals_result()
|
|
159
|
+
|
|
160
|
+
# NOTE(review): indentation was lost in the source this was recovered from;
# this function may originally have lived inside the xgboost-only ``else:``
# branch above -- confirm against the upstream file.
def wrap_evaluation_matrices(
    missing: float,
    X: Any,
    y: Any,
    sample_weight: Optional[Any],
    base_margin: Optional[Any],
    eval_set: Optional[List[Tuple[Any, Any]]],
    sample_weight_eval_set: Optional[List[Any]],
    base_margin_eval_set: Optional[List[Any]],
    label_transform: Callable = lambda x: x,
) -> Tuple[Any, Optional[List[Tuple[Any, str]]]]:
    """
    Convert array_like evaluation matrices into DMatrix.
    Perform validation on the way.

    Returns
    -------
    (train_dmatrix, evals)
        ``evals`` is a list of ``(dmatrix, "validation_<i>")`` pairs, one per
        entry of ``eval_set``; it is empty when ``eval_set`` is None.

    Raises
    ------
    ValueError
        If per-set metadata is given without ``eval_set``, or if its length
        differs from the length of ``eval_set``.
    """
    train_dmatrix = DMatrix(
        data=X,
        label=label_transform(y),
        weight=sample_weight,
        base_margin=base_margin,
        missing=missing,
    )

    if eval_set is None:
        # Per-set metadata is meaningless without the sets themselves.
        if sample_weight_eval_set is not None or base_margin_eval_set is not None:
            raise ValueError(
                "`eval_set` is not set but one of the other evaluation meta info is "
                "not None."
            )
        return train_dmatrix, []

    n_validation = len(eval_set)

    def validate_or_none(meta: Optional[List], name: str) -> List:
        # Missing metadata becomes one ``None`` per validation set.
        if meta is None:
            return [None] * n_validation
        if len(meta) != n_validation:
            raise ValueError(
                f"{name}'s length does not equal `eval_set`'s length, "
                f"expecting {n_validation}, got {len(meta)}"
            )
        return meta

    sample_weight_eval_set = validate_or_none(
        sample_weight_eval_set, "sample_weight_eval_set"
    )
    base_margin_eval_set = validate_or_none(
        base_margin_eval_set, "base_margin_eval_set"
    )

    evals = []
    for idx, (valid_X, valid_y) in enumerate(eval_set):
        valid_weight = sample_weight_eval_set[idx]
        valid_margin = base_margin_eval_set[idx]
        same_as_training = (
            valid_X is X,
            valid_y is y,
            valid_weight is sample_weight,
            valid_margin is base_margin,
        )
        if all(same_as_training):
            # Skip the duplicated entry: reuse the training matrix.
            matrix = train_dmatrix
        else:
            matrix = DMatrix(
                data=valid_X,
                label=label_transform(valid_y),
                weight=valid_weight,
                base_margin=valid_margin,
                missing=missing,
            )
        evals.append((matrix, f"validation_{idx}"))

    return train_dmatrix, evals
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
from .... import opcodes
|
|
17
|
+
from ....core.entity.output_types import get_output_types
|
|
18
|
+
from ....core.operator.base import Operator
|
|
19
|
+
from ....core.operator.core import TileableOperatorMixin
|
|
20
|
+
from ....dataframe.core import DATAFRAME_TYPE
|
|
21
|
+
from ....serialization.serializables import Float64Field, KeyField, ListField
|
|
22
|
+
from ....serialization.serializables.field import AnyField, Int64Field
|
|
23
|
+
from ....tensor import tensor as astensor
|
|
24
|
+
from ....tensor.core import TENSOR_TYPE
|
|
25
|
+
from ....typing_ import TileableType
|
|
26
|
+
from ...utils import convert_to_tensor_or_dataframe
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ToDMatrix(Operator, TileableOperatorMixin):
    """
    Operator (opcode ``TO_DMATRIX``) that bundles a data tileable with its
    optional label, weight and base margin.
    """

    _op_type_ = opcodes.TO_DMATRIX

    # Tileable-valued fields; these are re-bound to the operator's inputs.
    data = KeyField("data", default=None)
    label = KeyField("label", default=None)
    missing = Float64Field("missing", default=None)
    weight = KeyField("weight", default=None)
    base_margin = KeyField("base_margin", default=None)
    # Plain-valued options.
    feature_names = ListField("feature_names", default=None)
    feature_types = ListField("feature_types", default=None)
    feature_weights = AnyField("feature_weights", default=None)
    nthread = Int64Field("nthread", default=None)
    group = AnyField("group", default=None)
    qid = AnyField("qid", default=None)
    label_lower_bound = AnyField("label_lower_bound", default=None)
    label_upper_bound = AnyField("label_upper_bound", default=None)

    @property
    def output_limit(self):
        """The operator produces exactly one output."""
        return 1

    def _set_inputs(self, inputs):
        """
        Re-bind the tileable fields to the stored inputs.

        The positions follow the order used by ``__call__``: data, then
        label, weight and base margin, each present only when set.
        """
        super()._set_inputs(inputs)
        if self.data is not None:
            self.data = self._inputs[0]
        has_label = self.label is not None
        if has_label:
            self.label = self._inputs[1]
        if self.weight is not None:
            # weight follows the label when there is one, else the data
            weight_pos = 2 if has_label else 1
            self.weight = self._inputs[weight_pos]
        if self.base_margin is not None:
            # base_margin is always appended last
            self.base_margin = self._inputs[-1]

    @staticmethod
    def _get_kw(obj):
        """Return the metadata kwargs describing ``obj`` for a new tileable."""
        if not isinstance(obj, TENSOR_TYPE):
            return {
                "shape": obj.shape,
                "dtypes": obj.dtypes,
                "index_value": obj.index_value,
                "columns_value": obj.columns_value,
            }
        return {"shape": obj.shape, "dtype": obj.dtype, "order": obj.order}

    def __call__(self):
        """Create the output tileable, whose metadata mirrors ``data``."""
        kw = self._get_kw(self.data)
        optional_inputs = (self.label, self.weight, self.base_margin)
        inputs = [self.data]
        inputs.extend(item for item in optional_inputs if item is not None)

        return self.new_tileable(inputs, **kw)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def check_data(data):
    """
    Convert ``data`` with ``convert_to_tensor_or_dataframe`` and require 2-d.

    Raises
    ------
    ValueError
        If the converted data is not two-dimensional.
    """
    converted = convert_to_tensor_or_dataframe(data)
    if converted.ndim == 2:
        return converted

    raise ValueError(f"Expecting 2-d data, got: {converted.ndim}-d")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def check_array_like(y: TileableType, name: str) -> TileableType:
    """
    Convert an optional array-like argument into a tensor.

    ``None`` is passed through as ``None``. For a DataFrame only the first
    column is used. ``name`` is accepted but not used by this function.
    """
    if y is None:
        return None

    converted = convert_to_tensor_or_dataframe(y)
    if isinstance(converted, DATAFRAME_TYPE):
        # keep only the first column of a DataFrame
        converted = converted.iloc[:, 0]
    return astensor(converted)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def to_dmatrix(
    data,
    label=None,
    missing=None,
    weight=None,
    base_margin=None,
    feature_names=None,
    feature_types=None,
    feature_weights=None,
    nthread=None,
    group=None,
    qid=None,
    label_lower_bound=None,
    label_upper_bound=None,
):
    """
    Build a ``ToDMatrix`` operator from the arguments and return its output.

    ``data`` must be 2-d after conversion (see ``check_data``); ``label``,
    ``weight`` and ``base_margin`` are converted with ``check_array_like``.
    All remaining arguments are stored on the operator unchanged.
    """
    data = check_data(data)
    label = check_array_like(label, "label")
    weight = check_array_like(weight, "weight")
    base_margin = check_array_like(base_margin, "base_margin")

    # If not multiple outputs, try to collect the chunks on same worker into one
    # to feed the data into XGBoost for training.
    dmatrix_op = ToDMatrix(
        data=data,
        label=label,
        missing=missing,
        weight=weight,
        base_margin=base_margin,
        feature_names=feature_names,
        feature_types=feature_types,
        feature_weights=feature_weights,
        nthread=nthread,
        group=group,
        qid=qid,
        label_lower_bound=label_lower_bound,
        label_upper_bound=label_upper_bound,
        # GPU flag and output types are taken from the converted data.
        gpu=data.op.gpu,
        _output_types=get_output_types(data),
    )
    return dmatrix_op()


# Alias exposing the factory under the name ``DMatrix``.
DMatrix = to_dmatrix
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
from .... import opcodes
|
|
19
|
+
from ....core.entity.output_types import OutputType
|
|
20
|
+
from ....core.operator.base import Operator
|
|
21
|
+
from ....core.operator.core import TileableOperatorMixin
|
|
22
|
+
from ....serialization.serializables import (
|
|
23
|
+
BoolField,
|
|
24
|
+
KeyField,
|
|
25
|
+
ReferenceField,
|
|
26
|
+
TupleField,
|
|
27
|
+
)
|
|
28
|
+
from ....tensor.core import TensorOrder
|
|
29
|
+
from .core import BoosterData
|
|
30
|
+
from .dmatrix import check_data
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class XGBPredict(Operator, TileableOperatorMixin):
    """
    Operator (opcode ``XGBOOST_PREDICT``) producing predictions for ``data``
    from a booster ``model``.
    """

    _op_type_ = opcodes.XGBOOST_PREDICT
    # dtype declared for the output tensor
    output_dtype = np.dtype(np.float32)

    data = KeyField("data", default=None)
    model = ReferenceField("model", reference_type=BoosterData, default=None)
    pred_leaf = BoolField("pred_leaf", default=False)
    pred_contribs = BoolField("pred_contribs", default=False)
    approx_contribs = BoolField("approx_contribs", default=False)
    pred_interactions = BoolField("pred_interactions", default=False)
    validate_features = BoolField("validate_features", default=True)
    training = BoolField("training", default=False)
    # The factory accepts any number of positional arguments: the previous
    # ``lambda x: ...`` form fails if the factory is invoked with none.
    # NOTE(review): presumably the field machinery calls it without
    # arguments -- confirm against the Field implementation.
    iteration_range = TupleField(
        "iteration_range", default_factory=lambda *_: (0, 0)
    )
    strict_shape = BoolField("strict_shape", default=False)
    flag = BoolField("flag", default=False)
    # NOTE(review): predict() also passes ``output_margin`` to this operator,
    # but no such field is declared here -- confirm how undeclared keyword
    # arguments are handled by Operator.

    def __init__(self, output_types=None, gpu=None, **kw):
        super().__init__(_output_types=output_types, gpu=gpu, **kw)

    def _set_inputs(self, inputs):
        """Re-bind ``data`` and ``model`` to the first and second input."""
        super()._set_inputs(inputs)
        self.data = self._inputs[0]
        self.model = self._inputs[1]

    def __call__(self):
        """
        Create the output tensor.

        The output has shape ``(n_rows, num_class)`` when the model's operator
        carries a ``num_class`` value, and ``(n_rows,)`` otherwise.
        """
        num_class = getattr(self.model.op, "num_class", None)
        if num_class is not None:
            shape = (self.data.shape[0], int(num_class))
        else:
            shape = (self.data.shape[0],)
        inputs = [self.data, self.model]
        return self.new_tileable(
            inputs,
            shape=shape,
            dtype=self.output_dtype,
            order=TensorOrder.C_ORDER,
        )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def predict(
    model,
    data,
    output_margin=False,
    pred_leaf=False,
    pred_contribs=False,
    approx_contribs=False,
    pred_interactions=False,
    validate_features=True,
    training=False,
    iteration_range=None,
    strict_shape=False,
    flag=False,
):
    """
    Using MaxFrame XGBoost model to predict data.

    The call is lazy: it only builds an ``XGBPredict`` operator and returns
    its output tileable.

    Parameters
    ----------
    model
        Trained booster passed to the operator as its ``model`` input.
    data
        Data to predict on; normalized with ``check_data`` first.
    output_margin, pred_leaf, pred_contribs, approx_contribs, \
pred_interactions, validate_features, training, strict_shape : bool
        Prediction options with the same names as the keyword arguments of
        ``xgboost.Booster.predict``; forwarded unchanged to the operator.
    iteration_range : tuple, optional
        Forwarded to the operator; a falsy value (including ``None``) is
        replaced by ``(0, 0)``.
    flag : bool
        Forwarded unchanged to the operator. NOTE(review): its meaning is
        not visible in this file -- confirm with the operator's consumer.

    Returns
    -------
    results : tensor
        The tileable produced by calling the ``XGBPredict`` operator,
        declared with ``OutputType.tensor``.
    """
    data = check_data(data)
    # TODO: check model datatype

    output_types = [OutputType.tensor]

    iteration_range = iteration_range or (0, 0)

    # NOTE(review): ``output_margin`` has no matching field declared on
    # XGBPredict in this file; how the operator base class treats such a
    # keyword is not visible here -- confirm it reaches the backend.
    return XGBPredict(
        data=data,
        model=model,
        output_margin=output_margin,
        pred_leaf=pred_leaf,
        pred_contribs=pred_contribs,
        approx_contribs=approx_contribs,
        pred_interactions=pred_interactions,
        validate_features=validate_features,
        training=training,
        iteration_range=iteration_range,
        strict_shape=strict_shape,
        # The operator reuses the GPU setting of the op that produced ``data``.
        gpu=data.op.gpu,
        output_types=output_types,
        flag=flag,
    )()
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
from ..utils import make_import_error_func
|
|
17
|
+
from .core import XGBScikitLearnBase, xgboost
|
|
18
|
+
|
|
19
|
+
if xgboost:
    from .core import wrap_evaluation_matrices
    from .predict import predict
    from .train import train

    class XGBRegressor(XGBScikitLearnBase):
        """
        Scikit-learn style regressor backed by MaxFrame XGBoost.
        """

        def fit(
            self,
            X,
            y,
            sample_weight=None,
            base_margin=None,
            eval_set=None,
            sample_weight_eval_set=None,
            base_margin_eval_set=None,
            **kw,
        ):
            """
            Train the regressor on ``X`` / ``y`` and return ``self``.

            ``session`` and ``run_kwargs`` are taken out of ``kw`` and
            handed to ``train``. The trained result is stored on
            ``self._Booster`` and evaluation results are collected in
            ``self.evals_result_``.
            """
            session = kw.pop("session", None)
            run_kwargs = kw.pop("run_kwargs", {})

            train_matrix, eval_matrices = wrap_evaluation_matrices(
                None,
                X,
                y,
                sample_weight,
                base_margin,
                eval_set,
                sample_weight_eval_set,
                base_margin_eval_set,
            )

            xgb_params = self.get_xgb_params()
            # Use the squared-error objective when none is configured.
            xgb_params["objective"] = (
                xgb_params.get("objective") or "reg:squarederror"
            )

            self.evals_result_ = {}
            self._Booster = train(
                xgb_params,
                train_matrix,
                num_boost_round=self.get_num_boosting_rounds(),
                evals=eval_matrices,
                evals_result=self.evals_result_,
                session=session,
                run_kwargs=run_kwargs,
            )
            return self

        def predict(self, data, **kw):
            """Predict ``data`` with the fitted booster; ``kw`` is forwarded."""
            booster = self.get_booster()
            return predict(booster, data, **kw)

else:
    # xgboost is unavailable: expose the placeholder built by
    # make_import_error_func under the same name.
    XGBRegressor = make_import_error_func("xgboost")
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|