maxframe 0.1.0b4__cp39-cp39-win_amd64.whl → 1.0.0__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. More details are linked from the original diff page.

Files changed (214)
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cp39-win_amd64.pyd +0 -0
  3. maxframe/codegen.py +56 -5
  4. maxframe/config/config.py +78 -10
  5. maxframe/config/validators.py +42 -11
  6. maxframe/conftest.py +58 -14
  7. maxframe/core/__init__.py +2 -16
  8. maxframe/core/entity/__init__.py +1 -12
  9. maxframe/core/entity/executable.py +1 -1
  10. maxframe/core/entity/objects.py +46 -45
  11. maxframe/core/entity/output_types.py +0 -3
  12. maxframe/core/entity/tests/test_objects.py +43 -0
  13. maxframe/core/entity/tileables.py +5 -78
  14. maxframe/core/graph/__init__.py +2 -2
  15. maxframe/core/graph/builder/__init__.py +0 -1
  16. maxframe/core/graph/builder/base.py +5 -4
  17. maxframe/core/graph/builder/tileable.py +4 -4
  18. maxframe/core/graph/builder/utils.py +4 -8
  19. maxframe/core/graph/core.cp39-win_amd64.pyd +0 -0
  20. maxframe/core/graph/core.pyx +4 -4
  21. maxframe/core/graph/entity.py +9 -33
  22. maxframe/core/operator/__init__.py +2 -9
  23. maxframe/core/operator/base.py +3 -5
  24. maxframe/core/operator/objects.py +0 -9
  25. maxframe/core/operator/utils.py +55 -0
  26. maxframe/dataframe/__init__.py +2 -1
  27. maxframe/dataframe/arithmetic/around.py +5 -17
  28. maxframe/dataframe/arithmetic/core.py +15 -7
  29. maxframe/dataframe/arithmetic/docstring.py +7 -33
  30. maxframe/dataframe/arithmetic/equal.py +4 -2
  31. maxframe/dataframe/arithmetic/greater.py +4 -2
  32. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  33. maxframe/dataframe/arithmetic/less.py +2 -2
  34. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  36. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  37. maxframe/dataframe/core.py +58 -12
  38. maxframe/dataframe/datasource/date_range.py +2 -2
  39. maxframe/dataframe/datasource/read_odps_query.py +120 -24
  40. maxframe/dataframe/datasource/read_odps_table.py +9 -4
  41. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  42. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  43. maxframe/dataframe/datastore/to_odps.py +28 -0
  44. maxframe/dataframe/extensions/__init__.py +5 -0
  45. maxframe/dataframe/extensions/flatjson.py +131 -0
  46. maxframe/dataframe/extensions/flatmap.py +317 -0
  47. maxframe/dataframe/extensions/reshuffle.py +1 -1
  48. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  49. maxframe/dataframe/groupby/core.py +1 -1
  50. maxframe/dataframe/groupby/cum.py +0 -1
  51. maxframe/dataframe/groupby/fill.py +4 -1
  52. maxframe/dataframe/groupby/getitem.py +6 -0
  53. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  54. maxframe/dataframe/groupby/transform.py +5 -1
  55. maxframe/dataframe/indexing/align.py +1 -1
  56. maxframe/dataframe/indexing/loc.py +6 -4
  57. maxframe/dataframe/indexing/rename.py +5 -28
  58. maxframe/dataframe/indexing/sample.py +0 -1
  59. maxframe/dataframe/indexing/set_index.py +68 -1
  60. maxframe/dataframe/initializer.py +11 -1
  61. maxframe/dataframe/merge/__init__.py +9 -1
  62. maxframe/dataframe/merge/concat.py +41 -31
  63. maxframe/dataframe/merge/merge.py +237 -3
  64. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  65. maxframe/dataframe/misc/__init__.py +4 -0
  66. maxframe/dataframe/misc/apply.py +6 -11
  67. maxframe/dataframe/misc/case_when.py +141 -0
  68. maxframe/dataframe/misc/describe.py +2 -2
  69. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  70. maxframe/dataframe/misc/eval.py +4 -0
  71. maxframe/dataframe/misc/memory_usage.py +2 -2
  72. maxframe/dataframe/misc/pct_change.py +1 -83
  73. maxframe/dataframe/misc/pivot_table.py +262 -0
  74. maxframe/dataframe/misc/tests/test_misc.py +93 -1
  75. maxframe/dataframe/misc/transform.py +1 -30
  76. maxframe/dataframe/misc/value_counts.py +4 -17
  77. maxframe/dataframe/missing/dropna.py +1 -1
  78. maxframe/dataframe/missing/fillna.py +5 -5
  79. maxframe/dataframe/operators.py +1 -17
  80. maxframe/dataframe/plotting/core.py +2 -2
  81. maxframe/dataframe/reduction/core.py +4 -3
  82. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  83. maxframe/dataframe/sort/sort_values.py +1 -11
  84. maxframe/dataframe/statistics/corr.py +3 -3
  85. maxframe/dataframe/statistics/quantile.py +13 -19
  86. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  87. maxframe/dataframe/tests/test_initializer.py +33 -2
  88. maxframe/dataframe/utils.py +33 -11
  89. maxframe/dataframe/window/expanding.py +5 -3
  90. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  91. maxframe/errors.py +13 -0
  92. maxframe/extension.py +12 -0
  93. maxframe/io/__init__.py +13 -0
  94. maxframe/io/objects/__init__.py +24 -0
  95. maxframe/io/objects/core.py +140 -0
  96. maxframe/io/objects/tensor.py +76 -0
  97. maxframe/io/objects/tests/__init__.py +13 -0
  98. maxframe/io/objects/tests/test_object_io.py +97 -0
  99. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  100. maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
  101. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  102. maxframe/io/odpsio/tableio.py +719 -0
  103. maxframe/io/odpsio/tests/__init__.py +13 -0
  104. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
  105. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  106. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  107. maxframe/io/odpsio/volumeio.py +63 -0
  108. maxframe/learn/contrib/__init__.py +3 -1
  109. maxframe/learn/contrib/graph/__init__.py +15 -0
  110. maxframe/learn/contrib/graph/connected_components.py +215 -0
  111. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  112. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  113. maxframe/learn/contrib/llm/__init__.py +16 -0
  114. maxframe/learn/contrib/llm/core.py +54 -0
  115. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  116. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  117. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  118. maxframe/learn/contrib/llm/text.py +42 -0
  119. maxframe/learn/contrib/utils.py +52 -0
  120. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  121. maxframe/learn/contrib/xgboost/classifier.py +110 -0
  122. maxframe/learn/contrib/xgboost/core.py +241 -0
  123. maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
  124. maxframe/learn/contrib/xgboost/predict.py +121 -0
  125. maxframe/learn/contrib/xgboost/regressor.py +71 -0
  126. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  127. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  128. maxframe/learn/contrib/xgboost/train.py +132 -0
  129. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  130. maxframe/learn/utils/__init__.py +15 -0
  131. maxframe/learn/utils/core.py +29 -0
  132. maxframe/lib/mmh3.cp39-win_amd64.pyd +0 -0
  133. maxframe/lib/mmh3.pyi +43 -0
  134. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  135. maxframe/lib/wrapped_pickle.py +2 -1
  136. maxframe/opcodes.py +11 -0
  137. maxframe/protocol.py +154 -27
  138. maxframe/remote/core.py +4 -8
  139. maxframe/serialization/__init__.py +1 -0
  140. maxframe/serialization/core.cp39-win_amd64.pyd +0 -0
  141. maxframe/serialization/core.pxd +3 -0
  142. maxframe/serialization/core.pyi +64 -0
  143. maxframe/serialization/core.pyx +67 -26
  144. maxframe/serialization/exception.py +1 -1
  145. maxframe/serialization/pandas.py +52 -17
  146. maxframe/serialization/serializables/core.py +180 -15
  147. maxframe/serialization/serializables/field_type.py +4 -1
  148. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  149. maxframe/serialization/tests/test_serial.py +2 -1
  150. maxframe/session.py +37 -2
  151. maxframe/tensor/__init__.py +81 -2
  152. maxframe/tensor/arithmetic/isclose.py +1 -0
  153. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  154. maxframe/tensor/core.py +5 -136
  155. maxframe/tensor/datasource/array.py +7 -2
  156. maxframe/tensor/datasource/full.py +1 -1
  157. maxframe/tensor/datasource/scalar.py +1 -1
  158. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  159. maxframe/tensor/indexing/flatnonzero.py +1 -1
  160. maxframe/tensor/indexing/getitem.py +2 -0
  161. maxframe/tensor/merge/__init__.py +2 -0
  162. maxframe/tensor/merge/concatenate.py +101 -0
  163. maxframe/tensor/merge/tests/test_merge.py +30 -1
  164. maxframe/tensor/merge/vstack.py +74 -0
  165. maxframe/tensor/{base → misc}/__init__.py +4 -0
  166. maxframe/tensor/misc/atleast_1d.py +72 -0
  167. maxframe/tensor/misc/atleast_2d.py +70 -0
  168. maxframe/tensor/misc/atleast_3d.py +85 -0
  169. maxframe/tensor/misc/tests/__init__.py +13 -0
  170. maxframe/tensor/{base → misc}/transpose.py +22 -18
  171. maxframe/tensor/misc/unique.py +205 -0
  172. maxframe/tensor/operators.py +1 -7
  173. maxframe/tensor/random/core.py +1 -1
  174. maxframe/tensor/reduction/count_nonzero.py +2 -1
  175. maxframe/tensor/reduction/mean.py +1 -0
  176. maxframe/tensor/reduction/nanmean.py +1 -0
  177. maxframe/tensor/reduction/nanvar.py +2 -0
  178. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  179. maxframe/tensor/reduction/var.py +2 -0
  180. maxframe/tensor/statistics/quantile.py +2 -2
  181. maxframe/tensor/utils.py +2 -22
  182. maxframe/tests/test_protocol.py +34 -0
  183. maxframe/tests/test_utils.py +0 -12
  184. maxframe/tests/utils.py +17 -2
  185. maxframe/typing_.py +4 -1
  186. maxframe/udf.py +62 -3
  187. maxframe/utils.py +112 -86
  188. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  189. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
  190. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  191. maxframe_client/__init__.py +0 -1
  192. maxframe_client/clients/framedriver.py +4 -1
  193. maxframe_client/fetcher.py +123 -54
  194. maxframe_client/session/consts.py +3 -0
  195. maxframe_client/session/graph.py +8 -2
  196. maxframe_client/session/odps.py +223 -40
  197. maxframe_client/session/task.py +108 -80
  198. maxframe_client/tests/test_fetcher.py +21 -3
  199. maxframe_client/tests/test_session.py +136 -8
  200. maxframe/core/entity/chunks.py +0 -68
  201. maxframe/core/entity/fuse.py +0 -73
  202. maxframe/core/graph/builder/chunk.py +0 -430
  203. maxframe/odpsio/tableio.py +0 -300
  204. maxframe/odpsio/volumeio.py +0 -95
  205. maxframe_client/clients/spe.py +0 -104
  206. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  207. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  208. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  209. /maxframe/tensor/{base → misc}/astype.py +0 -0
  210. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  211. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  212. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  213. /maxframe/tensor/{base → misc}/where.py +0 -0
  214. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,110 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+
17
+ from ....tensor import argmax, transpose
18
+ from ....tensor.merge.vstack import _vstack
19
+ from ..utils import make_import_error_func
20
+ from .core import XGBScikitLearnBase, xgboost
21
+
22
+ if not xgboost:
23
+ XGBClassifier = make_import_error_func("xgboost")
24
+ else:
25
+ from xgboost.sklearn import XGBClassifierBase
26
+
27
+ from .core import wrap_evaluation_matrices
28
+ from .predict import predict
29
+ from .train import train
30
+
31
+ class XGBClassifier(XGBScikitLearnBase, XGBClassifierBase):
32
+ """
33
+ Implementation of the scikit-learn API for XGBoost classification.
34
+ """
35
+
36
+ def fit(
37
+ self,
38
+ X,
39
+ y,
40
+ sample_weight=None,
41
+ base_margin=None,
42
+ eval_set=None,
43
+ sample_weight_eval_set=None,
44
+ base_margin_eval_set=None,
45
+ num_class=None,
46
+ **kw,
47
+ ):
48
+ session = kw.pop("session", None)
49
+ run_kwargs = kw.pop("run_kwargs", dict())
50
+ dtrain, evals = wrap_evaluation_matrices(
51
+ None,
52
+ X,
53
+ y,
54
+ sample_weight,
55
+ base_margin,
56
+ eval_set,
57
+ sample_weight_eval_set,
58
+ base_margin_eval_set,
59
+ )
60
+ params = self.get_xgb_params()
61
+ self.n_classes_ = num_class or 1
62
+ if self.n_classes_ > 2:
63
+ params["objective"] = "multi:softprob"
64
+ params["num_class"] = self.n_classes_
65
+ else:
66
+ params["objective"] = "binary:logistic"
67
+ self.evals_result_ = dict()
68
+ result = train(
69
+ params,
70
+ dtrain,
71
+ num_boost_round=self.get_num_boosting_rounds(),
72
+ evals=evals,
73
+ evals_result=self.evals_result_,
74
+ num_class=num_class,
75
+ session=session,
76
+ run_kwargs=run_kwargs,
77
+ )
78
+ self._Booster = result
79
+ return self
80
+
81
+ def predict(self, data, **kw):
82
+ prob = self.predict_proba(data, flag=True, **kw)
83
+ if prob.ndim > 1:
84
+ prediction = argmax(prob, axis=1)
85
+ else:
86
+ prediction = (prob > 0.5).astype(np.int64)
87
+ return prediction
88
+
89
+ def predict_proba(self, data, ntree_limit=None, flag=False, **kw):
90
+ if ntree_limit is not None:
91
+ raise NotImplementedError("ntree_limit is not currently supported")
92
+ prediction = predict(self.get_booster(), data, flag=flag, **kw)
93
+ if len(prediction.shape) == 2 and prediction.shape[1] == self.n_classes_:
94
+ # multi-class
95
+ return prediction
96
+ if (
97
+ len(prediction.shape) == 2
98
+ and self.n_classes_ == 2
99
+ and prediction.shape[1] >= self.n_classes_
100
+ ):
101
+ # multi-label
102
+ return prediction
103
+ # binary logistic function
104
+ classone_probs = prediction
105
+ classzero_probs = 1.0 - classone_probs
106
+ return transpose(_vstack((classzero_probs, classone_probs)))
107
+
108
+ @property
109
+ def classes_(self) -> np.ndarray:
110
+ return np.arange(self.n_classes_)
@@ -0,0 +1,241 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Any, Callable, Dict, List, Optional, Tuple
16
+
17
+ try:
18
+ import xgboost
19
+ except ImportError:
20
+ xgboost = None
21
+
22
+ from ...core import Model, ModelData
23
+ from .dmatrix import DMatrix
24
+
25
+
26
+ class BoosterData(ModelData):
27
+ __slots__ = ("_evals_result",)
28
+
29
+ _evals_result: Dict
30
+
31
+ def __init__(self, *args, evals_result=None, **kwargs):
32
+ super().__init__(*args, **kwargs)
33
+ self._evals_result = evals_result if evals_result is not None else dict()
34
+
35
+ def execute(self, session=None, **kw):
36
+ # The evals_result should be fetched when BoosterData.execute() is called.
37
+ result = super().execute(session=session, **kw)
38
+ if self.op.has_evals_result and self.key == self.op.outputs[0].key:
39
+ self._evals_result.update(self.op.outputs[1].fetch(session=session))
40
+ return result
41
+
42
+ def predict(
43
+ self,
44
+ data,
45
+ output_margin=False,
46
+ pred_leaf=False,
47
+ pred_contribs=False,
48
+ approx_contribs=False,
49
+ pred_interactions=False,
50
+ validate_features=True,
51
+ training=False,
52
+ iteration_range=None,
53
+ strict_shape=False,
54
+ ):
55
+ from .predict import predict
56
+
57
+ return predict(
58
+ self,
59
+ data,
60
+ output_margin=output_margin,
61
+ pred_leaf=pred_leaf,
62
+ pred_contribs=pred_contribs,
63
+ approx_contribs=approx_contribs,
64
+ pred_interactions=pred_interactions,
65
+ validate_features=validate_features,
66
+ training=training,
67
+ iteration_range=iteration_range,
68
+ strict_shape=strict_shape,
69
+ )
70
+
71
+
72
+ class Booster(Model):
73
+ pass
74
+
75
+
76
+ if not xgboost:
77
+ XGBScikitLearnBase = None
78
+ else:
79
+
80
+ class XGBScikitLearnBase(xgboost.XGBModel):
81
+ """
82
+ Base class for implementing scikit-learn interface
83
+ """
84
+
85
+ def fit(
86
+ self,
87
+ X,
88
+ y,
89
+ sample_weights=None,
90
+ eval_set=None,
91
+ sample_weight_eval_set=None,
92
+ **kw,
93
+ ):
94
+ """
95
+ Fit the regressor. Note that fit() is an eager-execution
96
+ API. The call will be blocked until training finished.
97
+
98
+ Parameters
99
+ ----------
100
+ X : array_like
101
+ Feature matrix
102
+ y : array_like
103
+ Labels
104
+ sample_weight : array_like
105
+ instance weights
106
+ eval_set : list, optional
107
+ A list of (X, y) tuple pairs to use as validation sets, for which
108
+ metrics will be computed.
109
+ Validation metrics will help us track the performance of the model.
110
+ sample_weight_eval_set : list, optional
111
+ A list of the form [L_1, L_2, ..., L_n], where each L_i is a list
112
+ of group weights on the i-th validation set.
113
+ """
114
+ raise NotImplementedError
115
+
116
+ def predict(self, data, **kw):
117
+ """
118
+ Predict with `data`.
119
+
120
+ Parameters
121
+ ----------
122
+ data: data that can be used to perform prediction
123
+ Returns
124
+ -------
125
+ prediction : maxframe.tensor.Tensor
126
+ """
127
+ raise NotImplementedError
128
+
129
+ def evals_result(self, **kw) -> Dict:
130
+ """Return the evaluation results.
131
+
132
+ If **eval_set** is passed to the :py:meth:`fit` function, you can call
133
+ ``evals_result()`` to get evaluation results for all passed **eval_sets**. When
134
+ **eval_metric** is also passed to the :py:meth:`fit` function, the
135
+ **evals_result** will contain the **eval_metrics** passed to the :py:meth:`fit`
136
+ function.
137
+
138
+ The returned evaluation result is a dictionary:
139
+
140
+ .. code-block:: python
141
+
142
+ {'validation_0': {'logloss': ['0.604835', '0.531479']},
143
+ 'validation_1': {'logloss': ['0.41965', '0.17686']}}
144
+
145
+ Note that evals_result() will be blocked until the train is finished.
146
+
147
+ Returns
148
+ -------
149
+ evals_result
150
+
151
+ """
152
+ result = super().evals_result()
153
+ if not self._Booster.op.has_evals_result or len(result) != 0:
154
+ return result
155
+ session = kw.pop("session", None)
156
+ run_kwargs = kw.pop("run_kwargs", dict())
157
+ self._Booster.execute(session=session, **run_kwargs)
158
+ return super().evals_result()
159
+
160
+ def wrap_evaluation_matrices(
161
+ missing: float,
162
+ X: Any,
163
+ y: Any,
164
+ sample_weight: Optional[Any],
165
+ base_margin: Optional[Any],
166
+ eval_set: Optional[List[Tuple[Any, Any]]],
167
+ sample_weight_eval_set: Optional[List[Any]],
168
+ base_margin_eval_set: Optional[List[Any]],
169
+ label_transform: Callable = lambda x: x,
170
+ ) -> Tuple[Any, Optional[List[Tuple[Any, str]]]]:
171
+ """
172
+ Convert array_like evaluation matrices into DMatrix.
173
+ Perform validation on the way.
174
+ """
175
+ train_dmatrix = DMatrix(
176
+ data=X,
177
+ label=label_transform(y),
178
+ weight=sample_weight,
179
+ base_margin=base_margin,
180
+ missing=missing,
181
+ )
182
+
183
+ n_validation = 0 if eval_set is None else len(eval_set)
184
+
185
+ def validate_or_none(meta: Optional[List], name: str) -> List:
186
+ if meta is None:
187
+ return [None] * n_validation
188
+ if len(meta) != n_validation:
189
+ raise ValueError(
190
+ f"{name}'s length does not equal `eval_set`'s length, "
191
+ + f"expecting {n_validation}, got {len(meta)}"
192
+ )
193
+ return meta
194
+
195
+ if eval_set is not None:
196
+ sample_weight_eval_set = validate_or_none(
197
+ sample_weight_eval_set, "sample_weight_eval_set"
198
+ )
199
+ base_margin_eval_set = validate_or_none(
200
+ base_margin_eval_set, "base_margin_eval_set"
201
+ )
202
+
203
+ evals = []
204
+ for i, (valid_X, valid_y) in enumerate(eval_set):
205
+ # Skip the duplicated entry.
206
+ if all(
207
+ (
208
+ valid_X is X,
209
+ valid_y is y,
210
+ sample_weight_eval_set[i] is sample_weight,
211
+ base_margin_eval_set[i] is base_margin,
212
+ )
213
+ ):
214
+ evals.append(train_dmatrix)
215
+ else:
216
+ m = DMatrix(
217
+ data=valid_X,
218
+ label=label_transform(valid_y),
219
+ weight=sample_weight_eval_set[i],
220
+ base_margin=base_margin_eval_set[i],
221
+ missing=missing,
222
+ )
223
+ evals.append(m)
224
+ nevals = len(evals)
225
+ eval_names = [f"validation_{i}" for i in range(nevals)]
226
+ evals = list(zip(evals, eval_names))
227
+ else:
228
+ if any(
229
+ meta is not None
230
+ for meta in [
231
+ sample_weight_eval_set,
232
+ base_margin_eval_set,
233
+ ]
234
+ ):
235
+ raise ValueError(
236
+ "`eval_set` is not set but one of the other evaluation meta info is "
237
+ "not None."
238
+ )
239
+ evals = []
240
+
241
+ return train_dmatrix, evals
@@ -0,0 +1,147 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from .... import opcodes
17
+ from ....core.entity.output_types import get_output_types
18
+ from ....core.operator.base import Operator
19
+ from ....core.operator.core import TileableOperatorMixin
20
+ from ....dataframe.core import DATAFRAME_TYPE
21
+ from ....serialization.serializables import Float64Field, KeyField, ListField
22
+ from ....serialization.serializables.field import AnyField, Int64Field
23
+ from ....tensor import tensor as astensor
24
+ from ....tensor.core import TENSOR_TYPE
25
+ from ....typing_ import TileableType
26
+ from ...utils import convert_to_tensor_or_dataframe
27
+
28
+
29
+ class ToDMatrix(Operator, TileableOperatorMixin):
30
+ _op_type_ = opcodes.TO_DMATRIX
31
+
32
+ data = KeyField("data", default=None)
33
+ label = KeyField("label", default=None)
34
+ missing = Float64Field("missing", default=None)
35
+ weight = KeyField("weight", default=None)
36
+ base_margin = KeyField("base_margin", default=None)
37
+ feature_names = ListField("feature_names", default=None)
38
+ feature_types = ListField("feature_types", default=None)
39
+ feature_weights = AnyField("feature_weights", default=None)
40
+ nthread = Int64Field("nthread", default=None)
41
+ group = AnyField("group", default=None)
42
+ qid = AnyField("qid", default=None)
43
+ label_lower_bound = AnyField("label_lower_bound", default=None)
44
+ label_upper_bound = AnyField("label_upper_bound", default=None)
45
+
46
+ @property
47
+ def output_limit(self):
48
+ return 1
49
+
50
+ def _set_inputs(self, inputs):
51
+ super()._set_inputs(inputs)
52
+ if self.data is not None:
53
+ self.data = self._inputs[0]
54
+ has_label = self.label is not None
55
+ if has_label:
56
+ self.label = self._inputs[1]
57
+ if self.weight is not None:
58
+ i = 1 if not has_label else 2
59
+ self.weight = self._inputs[i]
60
+ if self.base_margin is not None:
61
+ self.base_margin = self._inputs[-1]
62
+
63
+ @staticmethod
64
+ def _get_kw(obj):
65
+ if isinstance(obj, TENSOR_TYPE):
66
+ return {"shape": obj.shape, "dtype": obj.dtype, "order": obj.order}
67
+ else:
68
+ return {
69
+ "shape": obj.shape,
70
+ "dtypes": obj.dtypes,
71
+ "index_value": obj.index_value,
72
+ "columns_value": obj.columns_value,
73
+ }
74
+
75
+ def __call__(self):
76
+ inputs = [self.data]
77
+ kw = self._get_kw(self.data)
78
+ if self.label is not None:
79
+ inputs.append(self.label)
80
+ if self.weight is not None:
81
+ inputs.append(self.weight)
82
+ if self.base_margin is not None:
83
+ inputs.append(self.base_margin)
84
+
85
+ return self.new_tileable(inputs, **kw)
86
+
87
+
88
+ def check_data(data):
89
+ data = convert_to_tensor_or_dataframe(data)
90
+ if data.ndim != 2:
91
+ raise ValueError(f"Expecting 2-d data, got: {data.ndim}-d")
92
+
93
+ return data
94
+
95
+
96
+ def check_array_like(y: TileableType, name: str) -> TileableType:
97
+ if y is None:
98
+ return
99
+ y = convert_to_tensor_or_dataframe(y)
100
+ if isinstance(y, DATAFRAME_TYPE):
101
+ y = y.iloc[:, 0]
102
+ return astensor(y)
103
+
104
+
105
+ def to_dmatrix(
106
+ data,
107
+ label=None,
108
+ missing=None,
109
+ weight=None,
110
+ base_margin=None,
111
+ feature_names=None,
112
+ feature_types=None,
113
+ feature_weights=None,
114
+ nthread=None,
115
+ group=None,
116
+ qid=None,
117
+ label_lower_bound=None,
118
+ label_upper_bound=None,
119
+ ):
120
+ data = check_data(data)
121
+ label = check_array_like(label, "label")
122
+ weight = check_array_like(weight, "weight")
123
+ base_margin = check_array_like(base_margin, "base_margin")
124
+
125
+ # If not multiple outputs, try to collect the chunks on same worker into one
126
+ # to feed the data into XGBoost for training.
127
+ op = ToDMatrix(
128
+ data=data,
129
+ label=label,
130
+ missing=missing,
131
+ weight=weight,
132
+ base_margin=base_margin,
133
+ feature_names=feature_names,
134
+ feature_types=feature_types,
135
+ feature_weights=feature_weights,
136
+ nthread=nthread,
137
+ group=group,
138
+ qid=qid,
139
+ label_lower_bound=label_lower_bound,
140
+ label_upper_bound=label_upper_bound,
141
+ gpu=data.op.gpu,
142
+ _output_types=get_output_types(data),
143
+ )
144
+ return op()
145
+
146
+
147
+ DMatrix = to_dmatrix
@@ -0,0 +1,121 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import numpy as np
17
+
18
+ from .... import opcodes
19
+ from ....core.entity.output_types import OutputType
20
+ from ....core.operator.base import Operator
21
+ from ....core.operator.core import TileableOperatorMixin
22
+ from ....serialization.serializables import (
23
+ BoolField,
24
+ KeyField,
25
+ ReferenceField,
26
+ TupleField,
27
+ )
28
+ from ....tensor.core import TensorOrder
29
+ from .core import BoosterData
30
+ from .dmatrix import check_data
31
+
32
+
33
+ class XGBPredict(Operator, TileableOperatorMixin):
34
+ _op_type_ = opcodes.XGBOOST_PREDICT
35
+ output_dtype = np.dtype(np.float32)
36
+
37
+ data = KeyField("data", default=None)
38
+ model = ReferenceField("model", reference_type=BoosterData, default=None)
39
+ pred_leaf = BoolField("pred_leaf", default=False)
40
+ pred_contribs = BoolField("pred_contribs", default=False)
41
+ approx_contribs = BoolField("approx_contribs", default=False)
42
+ pred_interactions = BoolField("pred_interactions", default=False)
43
+ validate_features = BoolField("validate_features", default=True)
44
+ training = BoolField("training", default=False)
45
+ iteration_range = TupleField("iteration_range", default_factory=lambda x: (0, 0))
46
+ strict_shape = BoolField("strict_shape", default=False)
47
+ flag = BoolField("flag", default=False)
48
+
49
+ def __init__(self, output_types=None, gpu=None, **kw):
50
+ super().__init__(_output_types=output_types, gpu=gpu, **kw)
51
+
52
+ def _set_inputs(self, inputs):
53
+ super()._set_inputs(inputs)
54
+ self.data = self._inputs[0]
55
+ self.model = self._inputs[1]
56
+
57
+ def __call__(self):
58
+ num_class = getattr(self.model.op, "num_class", None)
59
+ if num_class is not None:
60
+ num_class = int(num_class)
61
+ if num_class is not None:
62
+ shape = (self.data.shape[0], num_class)
63
+ else:
64
+ shape = (self.data.shape[0],)
65
+ inputs = [self.data, self.model]
66
+ return self.new_tileable(
67
+ inputs,
68
+ shape=shape,
69
+ dtype=self.output_dtype,
70
+ order=TensorOrder.C_ORDER,
71
+ )
72
+
73
+
74
+ def predict(
75
+ model,
76
+ data,
77
+ output_margin=False,
78
+ pred_leaf=False,
79
+ pred_contribs=False,
80
+ approx_contribs=False,
81
+ pred_interactions=False,
82
+ validate_features=True,
83
+ training=False,
84
+ iteration_range=None,
85
+ strict_shape=False,
86
+ flag=False,
87
+ ):
88
+ """
89
+ Using MaxFrame XGBoost model to predict data.
90
+
91
+ Parameters
92
+ ----------
93
+ Parameters are the same as `xgboost.train`. The predict() is lazy-execution mode.
94
+
95
+ Returns
96
+ -------
97
+ results: Booster
98
+ """
99
+ data = check_data(data)
100
+ # TODO: check model datatype
101
+
102
+ output_types = [OutputType.tensor]
103
+
104
+ iteration_range = iteration_range or (0, 0)
105
+
106
+ return XGBPredict(
107
+ data=data,
108
+ model=model,
109
+ output_margin=output_margin,
110
+ pred_leaf=pred_leaf,
111
+ pred_contribs=pred_contribs,
112
+ approx_contribs=approx_contribs,
113
+ pred_interactions=pred_interactions,
114
+ validate_features=validate_features,
115
+ training=training,
116
+ iteration_range=iteration_range,
117
+ strict_shape=strict_shape,
118
+ gpu=data.op.gpu,
119
+ output_types=output_types,
120
+ flag=flag,
121
+ )()
@@ -0,0 +1,71 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from ..utils import make_import_error_func
17
+ from .core import XGBScikitLearnBase, xgboost
18
+
19
+ if not xgboost:
20
+ XGBRegressor = make_import_error_func("xgboost")
21
+ else:
22
+ from .core import wrap_evaluation_matrices
23
+ from .predict import predict
24
+ from .train import train
25
+
26
+ class XGBRegressor(XGBScikitLearnBase):
27
+ """
28
+ Implementation of the scikit-learn API for XGBoost regressor.
29
+ """
30
+
31
+ def fit(
32
+ self,
33
+ X,
34
+ y,
35
+ sample_weight=None,
36
+ base_margin=None,
37
+ eval_set=None,
38
+ sample_weight_eval_set=None,
39
+ base_margin_eval_set=None,
40
+ **kw,
41
+ ):
42
+ session = kw.pop("session", None)
43
+ run_kwargs = kw.pop("run_kwargs", dict())
44
+ dtrain, evals = wrap_evaluation_matrices(
45
+ None,
46
+ X,
47
+ y,
48
+ sample_weight,
49
+ base_margin,
50
+ eval_set,
51
+ sample_weight_eval_set,
52
+ base_margin_eval_set,
53
+ )
54
+ params = self.get_xgb_params()
55
+ if not params.get("objective"):
56
+ params["objective"] = "reg:squarederror"
57
+ self.evals_result_ = dict()
58
+ result = train(
59
+ params,
60
+ dtrain,
61
+ num_boost_round=self.get_num_boosting_rounds(),
62
+ evals=evals,
63
+ evals_result=self.evals_result_,
64
+ session=session,
65
+ run_kwargs=run_kwargs,
66
+ )
67
+ self._Booster = result
68
+ return self
69
+
70
+ def predict(self, data, **kw):
71
+ return predict(self.get_booster(), data, **kw)
@@ -0,0 +1,13 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.