maxframe-1.0.0rc2-cp310-cp310-win_amd64.whl → maxframe-1.0.0rc3-cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (106)
  1. maxframe/_utils.cp310-win_amd64.pyd +0 -0
  2. maxframe/codegen.py +3 -2
  3. maxframe/config/config.py +16 -9
  4. maxframe/config/validators.py +42 -12
  5. maxframe/conftest.py +13 -2
  6. maxframe/core/__init__.py +2 -13
  7. maxframe/core/entity/__init__.py +0 -4
  8. maxframe/core/entity/objects.py +45 -2
  9. maxframe/core/entity/output_types.py +0 -3
  10. maxframe/core/entity/tests/test_objects.py +43 -0
  11. maxframe/core/entity/tileables.py +5 -78
  12. maxframe/core/graph/__init__.py +2 -2
  13. maxframe/core/graph/builder/__init__.py +0 -1
  14. maxframe/core/graph/builder/base.py +5 -4
  15. maxframe/core/graph/builder/tileable.py +4 -4
  16. maxframe/core/graph/builder/utils.py +4 -8
  17. maxframe/core/graph/core.cp310-win_amd64.pyd +0 -0
  18. maxframe/core/graph/entity.py +9 -33
  19. maxframe/core/operator/__init__.py +2 -9
  20. maxframe/core/operator/base.py +3 -5
  21. maxframe/core/operator/objects.py +0 -9
  22. maxframe/core/operator/utils.py +55 -0
  23. maxframe/dataframe/datasource/read_odps_query.py +1 -1
  24. maxframe/dataframe/datasource/read_odps_table.py +1 -1
  25. maxframe/dataframe/datastore/to_odps.py +1 -1
  26. maxframe/dataframe/operators.py +1 -17
  27. maxframe/dataframe/reduction/core.py +2 -2
  28. maxframe/io/objects/__init__.py +24 -0
  29. maxframe/io/objects/core.py +140 -0
  30. maxframe/io/objects/tensor.py +76 -0
  31. maxframe/io/objects/tests/__init__.py +13 -0
  32. maxframe/io/objects/tests/test_object_io.py +97 -0
  33. maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
  34. maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
  35. maxframe/{odpsio → io/odpsio}/schema.py +5 -5
  36. maxframe/{odpsio → io/odpsio}/tableio.py +10 -4
  37. maxframe/io/odpsio/tests/__init__.py +13 -0
  38. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -3
  39. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
  40. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  41. maxframe/io/odpsio/volumeio.py +57 -0
  42. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  43. maxframe/learn/contrib/xgboost/core.py +87 -2
  44. maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
  45. maxframe/learn/contrib/xgboost/predict.py +19 -5
  46. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  47. maxframe/learn/contrib/xgboost/train.py +25 -15
  48. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  49. maxframe/lib/mmh3.cp310-win_amd64.pyd +0 -0
  50. maxframe/protocol.py +1 -15
  51. maxframe/remote/core.py +4 -8
  52. maxframe/serialization/__init__.py +1 -0
  53. maxframe/serialization/core.cp310-win_amd64.pyd +0 -0
  54. maxframe/tensor/__init__.py +10 -2
  55. maxframe/tensor/arithmetic/isclose.py +1 -0
  56. maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
  57. maxframe/tensor/core.py +5 -136
  58. maxframe/tensor/datasource/array.py +3 -0
  59. maxframe/tensor/datasource/full.py +1 -1
  60. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  61. maxframe/tensor/indexing/flatnonzero.py +1 -1
  62. maxframe/tensor/merge/__init__.py +2 -0
  63. maxframe/tensor/merge/concatenate.py +98 -0
  64. maxframe/tensor/merge/tests/test_merge.py +30 -1
  65. maxframe/tensor/merge/vstack.py +70 -0
  66. maxframe/tensor/{base → misc}/__init__.py +2 -0
  67. maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
  68. maxframe/tensor/misc/atleast_2d.py +70 -0
  69. maxframe/tensor/misc/atleast_3d.py +85 -0
  70. maxframe/tensor/misc/tests/__init__.py +13 -0
  71. maxframe/tensor/{base → misc}/transpose.py +22 -18
  72. maxframe/tensor/operators.py +1 -7
  73. maxframe/tensor/random/core.py +1 -1
  74. maxframe/tensor/reduction/count_nonzero.py +1 -0
  75. maxframe/tensor/reduction/mean.py +1 -0
  76. maxframe/tensor/reduction/nanmean.py +1 -0
  77. maxframe/tensor/reduction/nanvar.py +2 -0
  78. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  79. maxframe/tensor/reduction/var.py +2 -0
  80. maxframe/tensor/utils.py +2 -22
  81. maxframe/typing_.py +4 -1
  82. maxframe/udf.py +8 -9
  83. maxframe/utils.py +15 -61
  84. maxframe-1.0.0rc3.dist-info/METADATA +104 -0
  85. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +101 -91
  86. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +1 -1
  87. maxframe_client/fetcher.py +23 -42
  88. maxframe_client/session/graph.py +8 -2
  89. maxframe_client/session/odps.py +54 -18
  90. maxframe_client/tests/test_fetcher.py +1 -1
  91. maxframe_client/tests/test_session.py +14 -2
  92. maxframe/core/entity/chunks.py +0 -68
  93. maxframe/core/entity/fuse.py +0 -73
  94. maxframe/core/graph/builder/chunk.py +0 -430
  95. maxframe/odpsio/volumeio.py +0 -95
  96. maxframe-1.0.0rc2.dist-info/METADATA +0 -177
  97. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  98. /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
  99. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  100. /maxframe/tensor/{base → misc}/astype.py +0 -0
  101. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  102. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  103. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  104. /maxframe/tensor/{base → misc}/unique.py +0 -0
  105. /maxframe/tensor/{base → misc}/where.py +0 -0
  106. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0

maxframe/{odpsio → io/odpsio}/schema.py CHANGED

@@ -21,9 +21,9 @@ import pyarrow as pa
 from odps import types as odps_types
 from pandas.api import types as pd_types
 
-from ..core import TILEABLE_TYPE, OutputType
-from ..protocol import DataFrameTableMeta
-from ..tensor.core import TENSOR_TYPE
+from ...core import TILEABLE_TYPE, OutputType
+from ...protocol import DataFrameTableMeta
+from ...tensor.core import TENSOR_TYPE
 
 _TEMP_TABLE_PREFIX = "tmp_mf_"
 
@@ -184,7 +184,7 @@ def pandas_to_odps_schema(
     unknown_as_string: bool = False,
     ignore_index=False,
 ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
-    from .. import dataframe as md
+    from ... import dataframe as md
     from .arrow import pandas_to_arrow
 
     if _is_scalar_object(df_obj):
@@ -278,7 +278,7 @@ def build_table_column_name(
 def build_dataframe_table_meta(
     df_obj: Any, ignore_index: bool = False
 ) -> DataFrameTableMeta:
-    from .. import dataframe as md
+    from ... import dataframe as md
 
     col_to_count = defaultdict(lambda: 0)
     col_to_idx = defaultdict(lambda: 0)

maxframe/{odpsio → io/odpsio}/tableio.py CHANGED

@@ -20,6 +20,7 @@ from typing import Dict, List, Optional, Union
 
 import pyarrow as pa
 from odps import ODPS
+from odps import __version__ as pyodps_version
 from odps.apis.storage_api import (
     StorageApiArrowClient,
     TableBatchScanResponse,
@@ -34,13 +35,15 @@ try:
 except ImportError:
     pac = None
 
-from ..config import options
-from ..env import ODPS_STORAGE_API_ENDPOINT
+from ...config import options
+from ...env import ODPS_STORAGE_API_ENDPOINT
+from ...lib.version import Version
 from .schema import odps_schema_to_arrow_schema
 
 PartitionsType = Union[List[str], str, None]
 
 _DEFAULT_ROW_BATCH_SIZE = 4096
+_need_convert_timezone = Version(pyodps_version) < Version("0.11.7")
 
 
 @contextmanager
@@ -191,7 +194,7 @@ class TunnelMultiPartitionReader:
         arrays = []
         for idx in range(batch.num_columns):
             col = batch.column(idx)
-            if isinstance(col.type, pa.TimestampType):
+            if _need_convert_timezone and isinstance(col.type, pa.TimestampType):
                 if col.type.tz is not None:
                     target_type = pa.timestamp(
                         self._schema.types[idx].unit, col.type.tz
@@ -354,7 +357,10 @@ class TunnelTableIO(ODPSTableIO):
            # fixme should yield writer directly once pyodps fixes
            # related arrow timestamp bug when provided schema and
            # table schema is identical.
-           yield TunnelWrappedWriter(writer)
+           if _need_convert_timezone:
+               yield TunnelWrappedWriter(writer)
+           else:
+               yield writer
 
 
 class HaloTableArrowReader:

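The timestamp workarounds above are now gated on the installed PyODPS version. A minimal sketch of that gate, assuming maxframe.lib.version.Version compares like packaging.version.Version:

    from odps import __version__ as pyodps_version

    from maxframe.lib.version import Version

    # PyODPS releases before 0.11.7 mishandle timezone-aware arrow
    # timestamps, so the conversion path and the wrapped writer are
    # only used for those versions.
    need_convert_timezone = Version(pyodps_version) < Version("0.11.7")
    print(need_convert_timezone)  # False on PyODPS 0.11.7 or newer
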
maxframe/io/odpsio/tests/__init__.py ADDED

@@ -0,0 +1,13 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

maxframe/{odpsio → io/odpsio}/tests/test_schema.py CHANGED

@@ -18,9 +18,9 @@ import pyarrow as pa
 import pytest
 from odps import types as odps_types
 
-from ... import dataframe as md
-from ... import tensor as mt
-from ...core import OutputType
+from .... import dataframe as md
+from .... import tensor as mt
+from ....core import OutputType
 from ..schema import (
     arrow_schema_to_odps_schema,
     build_dataframe_table_meta,

maxframe/{odpsio → io/odpsio}/tests/test_tableio.py CHANGED

@@ -20,9 +20,9 @@ import pyarrow as pa
 import pytest
 from odps import ODPS
 
-from ...config import options
-from ...tests.utils import flaky, tn
-from ...utils import config_odps_default_options
+from ....config import options
+from ....tests.utils import flaky, tn
+from ....utils import config_odps_default_options
 from ..tableio import ODPSTableIO
 
 
maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py CHANGED

@@ -15,7 +15,7 @@
 import pytest
 from odps import ODPS
 
-from ...tests.utils import tn
+from ....tests.utils import tn
 from ..volumeio import ODPSVolumeReader, ODPSVolumeWriter
 
 
@@ -69,19 +69,17 @@ def create_volume(request, oss_config):
        oss_config.oss_bucket.batch_delete_objects(keys)
 
 
-@pytest.mark.parametrize("create_volume", ["parted", "external"], indirect=True)
+@pytest.mark.parametrize("create_volume", ["external"], indirect=True)
 def test_read_write_volume(create_volume):
     test_vol_dir = "test_vol_dir"
 
     odps_entry = ODPS.from_environments()
 
     writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
-    write_session_id = writer.create_write_session()
 
     writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
-    writer.write_file("file1", b"content1", write_session_id)
-    writer.write_file("file2", b"content2", write_session_id)
-    writer.commit(["file1", "file2"], write_session_id)
+    writer.write_file("file1", b"content1")
+    writer.write_file("file2", b"content2")
 
     reader = ODPSVolumeReader(odps_entry, create_volume, test_vol_dir)
     assert reader.read_file("file1") == b"content1"

maxframe/io/odpsio/volumeio.py ADDED

@@ -0,0 +1,57 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Iterator, List, Union
+
+from odps import ODPS
+
+
+class ODPSVolumeReader:
+    def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
+        self._odps_entry = odps_entry
+        self._volume = odps_entry.get_volume(volume_name)
+        self._volume_dir = volume_dir
+
+    def list_files(self) -> List[str]:
+        def _get_file_name(vol_file):
+            if hasattr(vol_file, "name"):
+                return vol_file.name
+            return vol_file.path.rsplit("/", 1)[-1]
+
+        return [
+            _get_file_name(f)
+            for f in self._odps_entry.list_volume_files(
+                f"/{self._volume.name}/{self._volume_dir}"
+            )
+        ]
+
+    def read_file(self, file_name: str) -> bytes:
+        with self._volume.open_reader(self._volume_dir + "/" + file_name) as reader:
+            return reader.read()
+
+
+class ODPSVolumeWriter:
+    def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
+        self._odps_entry = odps_entry
+        self._volume = odps_entry.get_volume(volume_name)
+        self._volume_dir = volume_dir
+
+    def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
+        with self._volume.open_writer(self._volume_dir + "/" + file_name) as writer:
+            if not inspect.isgenerator(data):
+                writer.write(data)
+            else:
+                for chunk in data:
+                    writer.write(chunk)

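The rewritten volume IO drops the write-session workflow (create_write_session/commit) in favor of direct writes. A hedged usage sketch mirroring the updated test above; the volume name is a placeholder, and an external volume plus ODPS credentials in the environment are assumed:

    from odps import ODPS

    from maxframe.io.odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter

    odps_entry = ODPS.from_environments()

    # No write session or commit() step any more; files land directly.
    writer = ODPSVolumeWriter(odps_entry, "my_external_volume", "test_vol_dir")
    writer.write_file("file1", b"content1")

    reader = ODPSVolumeReader(odps_entry, "my_external_volume", "test_vol_dir")
    assert reader.read_file("file1") == b"content1"
    print(reader.list_files())  # ["file1", ...]
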
maxframe/learn/contrib/xgboost/classifier.py CHANGED

@@ -14,7 +14,7 @@
 
 import numpy as np
 
-from ....tensor import argmax
+from ....tensor import argmax, transpose, vstack
 from ..utils import make_import_error_func
 from .core import XGBScikitLearnBase, xgboost
 
@@ -42,7 +42,10 @@ else:
             sample_weight_eval_set=None,
             base_margin_eval_set=None,
             num_class=None,
+            **kw,
         ):
+            session = kw.pop("session", None)
+            run_kwargs = kw.pop("run_kwargs", dict())
             dtrain, evals = wrap_evaluation_matrices(
                 None,
                 X,
@@ -68,6 +71,8 @@ else:
                 evals=evals,
                 evals_result=self.evals_result_,
                 num_class=num_class,
+                session=session,
+                run_kwargs=run_kwargs,
             )
             self._Booster = result
             return self
@@ -83,4 +88,23 @@ else:
         def predict_proba(self, data, ntree_limit=None, flag=False, **kw):
             if ntree_limit is not None:
                 raise NotImplementedError("ntree_limit is not currently supported")
-            return predict(self.get_booster(), data, flag=flag, **kw)
+            prediction = predict(self.get_booster(), data, flag=flag, **kw)
+
+            if len(prediction.shape) == 2 and prediction.shape[1] == self.n_classes_:
+                # multi-class
+                return prediction
+            if (
+                len(prediction.shape) == 2
+                and self.n_classes_ == 2
+                and prediction.shape[1] >= self.n_classes_
+            ):
+                # multi-label
+                return prediction
+            # binary logistic function
+            classone_probs = prediction
+            classzero_probs = 1.0 - classone_probs
+            return transpose(vstack((classzero_probs, classone_probs)))
+
+        @property
+        def classes_(self) -> np.ndarray:
+            return np.arange(self.n_classes_)

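For binary classifiers, predict_proba() now expands a 1-D vector of class-one probabilities into the two-column layout scikit-learn expects. A numpy-only illustration of that branch (MaxFrame's vstack/transpose mirror numpy's semantics here; the probabilities are made up):

    import numpy as np

    classone_probs = np.array([0.9, 0.2, 0.6])  # raw binary predictions
    classzero_probs = 1.0 - classone_probs
    proba = np.transpose(np.vstack((classzero_probs, classone_probs)))
    print(proba.shape)  # (3, 2): one row per sample, columns [P(0), P(1)]
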
maxframe/learn/contrib/xgboost/core.py CHANGED

@@ -12,15 +12,67 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Callable, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 try:
     import xgboost
 except ImportError:
     xgboost = None
 
+from ...core import Model, ModelData
 from .dmatrix import DMatrix
 
+
+class BoosterData(ModelData):
+    __slots__ = ("_evals_result",)
+
+    _evals_result: Dict
+
+    def __init__(self, *args, evals_result=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._evals_result = evals_result if evals_result is not None else dict()
+
+    def execute(self, session=None, **kw):
+        # The evals_result should be fetched when BoosterData.execute() is called.
+        result = super().execute(session=session, **kw)
+        if self.op.has_evals_result and self.key == self.op.outputs[0].key:
+            self._evals_result.update(self.op.outputs[1].fetch(session=session))
+        return result
+
+    def predict(
+        self,
+        data,
+        output_margin=False,
+        pred_leaf=False,
+        pred_contribs=False,
+        approx_contribs=False,
+        pred_interactions=False,
+        validate_features=True,
+        training=False,
+        iteration_range=None,
+        strict_shape=False,
+    ):
+        from .predict import predict
+
+        return predict(
+            self,
+            data,
+            output_margin=output_margin,
+            pred_leaf=pred_leaf,
+            pred_contribs=pred_contribs,
+            approx_contribs=approx_contribs,
+            pred_interactions=pred_interactions,
+            validate_features=validate_features,
+            training=training,
+            iteration_range=iteration_range,
+            strict_shape=strict_shape,
+        )
+
+
+class Booster(Model):
+    pass
+
+
 if not xgboost:
     XGBScikitLearnBase = None
 else:
@@ -40,7 +92,9 @@ else:
             **kw,
         ):
             """
-            Fit the regressor.
+            Fit the regressor. Note that fit() is an eager-execution
+            API. The call will be blocked until training finished.
+
             Parameters
             ----------
             X : array_like
@@ -72,6 +126,37 @@ else:
             """
             raise NotImplementedError
 
+        def evals_result(self, **kw) -> Dict:
+            """Return the evaluation results.
+
+            If **eval_set** is passed to the :py:meth:`fit` function, you can call
+            ``evals_result()`` to get evaluation results for all passed **eval_sets**. When
+            **eval_metric** is also passed to the :py:meth:`fit` function, the
+            **evals_result** will contain the **eval_metrics** passed to the :py:meth:`fit`
+            function.
+
+            The returned evaluation result is a dictionary:
+
+            .. code-block:: python
+
+                {'validation_0': {'logloss': ['0.604835', '0.531479']},
+                 'validation_1': {'logloss': ['0.41965', '0.17686']}}
+
+            Note that evals_result() will be blocked until the train is finished.
+
+            Returns
+            -------
+            evals_result
+
+            """
+            result = super().evals_result()
+            if not self._Booster.op.has_evals_result or len(result) != 0:
+                return result
+            session = kw.pop("session", None)
+            run_kwargs = kw.pop("run_kwargs", dict())
+            self._Booster.execute(session=session, **run_kwargs)
+            return super().evals_result()
+
     def wrap_evaluation_matrices(
         missing: float,
         X: Any,

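A hedged sketch of the eager fit()/evals_result() pair added here, assuming XGBClassifier is exported from maxframe.learn.contrib.xgboost and a MaxFrame session is already configured; the data is illustrative:

    import numpy as np

    import maxframe.tensor as mt
    from maxframe.learn.contrib.xgboost import XGBClassifier

    X = mt.tensor(np.random.rand(100, 4))
    y = mt.tensor(np.random.randint(0, 2, 100))

    clf = XGBClassifier(n_estimators=2)
    clf.fit(X, y, eval_set=[(X, y)])  # eager: blocks until training finishes

    # Triggers execution if the results were not fetched yet, then returns
    # a dict shaped like the docstring above, e.g. {'validation_0': {...}}.
    print(clf.evals_result())
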
maxframe/learn/contrib/xgboost/dmatrix.py CHANGED

@@ -99,10 +99,7 @@ def check_array_like(y: TileableType, name: str) -> TileableType:
     y = convert_to_tensor_or_dataframe(y)
     if isinstance(y, DATAFRAME_TYPE):
         y = y.iloc[:, 0]
-    y = astensor(y)
-    if y.ndim != 1:
-        raise ValueError(f"Expecting 1-d {name}, got: {y.ndim}-d")
-    return y
+    return astensor(y)
 
 
 def to_dmatrix(
maxframe/learn/contrib/xgboost/predict.py CHANGED

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pickle
 
 import numpy as np
 import pandas as pd
@@ -22,8 +21,14 @@ from ....core.entity.output_types import OutputType
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
 from ....dataframe.utils import parse_index
-from ....serialization.serializables import BoolField, BytesField, KeyField, TupleField
+from ....serialization.serializables import (
+    BoolField,
+    KeyField,
+    ReferenceField,
+    TupleField,
+)
 from ....tensor.core import TENSOR_TYPE, TensorOrder
+from .core import BoosterData
 from .dmatrix import check_data
 
 
@@ -32,9 +37,7 @@ class XGBPredict(Operator, TileableOperatorMixin):
     output_dtype = np.dtype(np.float32)
 
     data = KeyField("data", default=None)
-    model = BytesField(
-        "model", on_serialize=pickle.dumps, on_deserialize=pickle.loads, default=None
-    )
+    model = ReferenceField("model", reference_type=BoosterData, default=None)
     pred_leaf = BoolField("pred_leaf", default=False)
     pred_contribs = BoolField("pred_contribs", default=False)
     approx_contribs = BoolField("approx_contribs", default=False)
@@ -107,6 +110,17 @@ def predict(
     strict_shape=False,
     flag=False,
 ):
+    """
+    Using MaxFrame XGBoost model to predict data.
+
+    Parameters
+    ----------
+    Parameters are the same as `xgboost.train`. The predict() is lazy-execution mode.
+
+    Returns
+    -------
+    results: Booster
+    """
    data = check_data(data)
    # TODO: check model datatype
 
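In contrast to train(), predict() stays lazy: it only builds an XGBPredict op referencing the trained model. A short sketch continuing the classifier example above (clf and X reused for brevity):

    booster = clf.get_booster()  # trained Booster tileable
    pred = booster.predict(X)    # lazy: no remote work happens yet
    pred.execute()               # runs the prediction job
    print(pred.fetch())
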
maxframe/learn/contrib/xgboost/regressor.py CHANGED

@@ -41,11 +41,6 @@ else:
         ):
             session = kw.pop("session", None)
             run_kwargs = kw.pop("run_kwargs", dict())
-            if kw:
-                raise TypeError(
-                    f"fit got an unexpected keyword argument '{next(iter(kw))}'"
-                )
-
             dtrain, evals = wrap_evaluation_matrices(
                 None,
                 X,
@@ -57,6 +52,8 @@ else:
                 base_margin_eval_set,
             )
             params = self.get_xgb_params()
+            if not params.get("objective"):
+                params["objective"] = "reg:squarederror"
             self.evals_result_ = dict()
             result = train(
                 params,
@@ -71,8 +68,4 @@ else:
             return self
 
         def predict(self, data, **kw):
-            session = kw.pop("session", None)
-            run_kwargs = kw.pop("run_kwargs", None)
-            return predict(
-                self.get_booster(), data, session=session, run_kwargs=run_kwargs, **kw
-            )
+            return predict(self.get_booster(), data, **kw)

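A small sketch of the default-objective behavior added to the regressor, assuming XGBRegressor is exported alongside XGBClassifier; get_xgb_params() is the standard xgboost scikit-learn accessor:

    from maxframe.learn.contrib.xgboost import XGBRegressor

    reg = XGBRegressor(n_estimators=2)
    params = reg.get_xgb_params()
    # fit() now fills in the default before calling train():
    if not params.get("objective"):
        params["objective"] = "reg:squarederror"
    print(params["objective"])  # reg:squarederror
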
maxframe/learn/contrib/xgboost/train.py CHANGED

@@ -29,6 +29,7 @@ from ....serialization.serializables import (
     KeyField,
     ListField,
 )
+from .core import Booster
 from .dmatrix import ToDMatrix, to_dmatrix
 
 logger = logging.getLogger(__name__)
@@ -59,49 +60,59 @@ class XGBTrain(Operator, TileableOperatorMixin):
     num_boost_round = Int64Field("num_boost_round", default=10)
     num_class = Int64Field("num_class", default=None)
 
-    # Store evals_result in local to store the remote evals_result
-    evals_result: dict = None
-
     def __init__(self, gpu=None, **kw):
         super().__init__(gpu=gpu, **kw)
         if self.output_types is None:
             self.output_types = [OutputType.object]
+            if self.has_evals_result:
+                self.output_types.append(OutputType.object)
 
     def _set_inputs(self, inputs):
         super()._set_inputs(inputs)
         self.dtrain = self._inputs[0]
         rest = self._inputs[1:]
-        if self.evals is not None:
+        if self.has_evals_result:
             evals_dict = OrderedDict(self.evals)
             new_evals_dict = OrderedDict()
             for new_key, val in zip(rest, evals_dict.values()):
                 new_evals_dict[new_key] = val
             self.evals = list(new_evals_dict.items())
 
-    def __call__(self):
+    def __call__(self, evals_result):
         inputs = [self.dtrain]
-        if self.evals is not None:
+        if self.has_evals_result:
             inputs.extend(e[0] for e in self.evals)
-        return self.new_tileable(inputs)
+        return self.new_tileables(
+            inputs, object_class=Booster, evals_result=evals_result
+        )[0]
+
+    @property
+    def output_limit(self):
+        return 2 if self.has_evals_result else 1
+
+    @property
+    def has_evals_result(self) -> bool:
+        return self.evals
 
 
 def train(params, dtrain, evals=None, evals_result=None, num_class=None, **kwargs):
     """
-    Train XGBoost model in Mars manner.
+    Train XGBoost model in MaxFrame manner.
 
     Parameters
     ----------
-    Parameters are the same as `xgboost.train`.
+    Parameters are the same as `xgboost.train`. Note that train is an eager-execution
+    API. The call will be blocked until training finished.
 
     Returns
     -------
     results: Booster
     """
 
-    evals_result = evals_result or dict()
-    evals = None or ()
-
+    evals_result = evals_result if evals_result is not None else dict()
     processed_evals = []
+    session = kwargs.pop("session", None)
+    run_kwargs = kwargs.pop("run_kwargs", dict())
     if evals:
         for eval_dmatrix, name in evals:
             if not isinstance(name, str):
@@ -110,12 +121,11 @@ def train(params, dtrain, evals=None, evals_result=None, num_class=None, **kwargs):
                 processed_evals.append((eval_dmatrix, name))
             else:
                 processed_evals.append((to_dmatrix(eval_dmatrix), name))
-
     return XGBTrain(
         params=params,
         dtrain=dtrain,
         evals=processed_evals,
         evals_result=evals_result,
        num_class=num_class,
-        **kwargs
-    )()
+        **kwargs,
+    )(evals_result).execute(session=session, **run_kwargs)

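An end-to-end sketch of the reworked train(): it now executes eagerly and fills the caller's evals_result dict when evals are passed. Import paths follow this diff; the to_dmatrix label keyword is assumed from the Mars-derived API:

    import numpy as np

    import maxframe.tensor as mt
    from maxframe.learn.contrib.xgboost.dmatrix import to_dmatrix
    from maxframe.learn.contrib.xgboost.train import train

    dtrain = to_dmatrix(
        mt.tensor(np.random.rand(100, 4)),
        label=mt.tensor(np.random.randint(0, 2, 100)),
    )
    evals_result = dict()
    booster = train(
        {"objective": "binary:logistic"},
        dtrain,
        evals=[(dtrain, "validation_0")],
        evals_result=evals_result,
    )  # blocks until the remote training job finishes
    print(evals_result["validation_0"])  # populated because evals were passed
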
maxframe/{core/operator/fuse.py → learn/core.py} RENAMED

@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ... import opcodes
-from ...serialization.serializables import ReferenceField
-from ..graph import ChunkGraph
-from .base import Operator
+from ..core.entity.objects import Object, ObjectData
 
 
-class Fuse(Operator):
-    __slots__ = ("_fuse_graph",)
-    _op_type_ = opcodes.FUSE
+class ModelData(ObjectData):
+    pass
 
-    fuse_graph = ReferenceField("fuse_graph", ChunkGraph)
 
+class Model(Object):
+    pass
 
-class FuseChunkMixin:
-    __slots__ = ()
+
+MODEL_TYPE = (Model, ModelData)

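The Fuse operator is gone; learn/core.py now only defines the Model/ModelData pair that the xgboost BoosterData builds on. A small sketch of the intended isinstance use:

    from maxframe.learn.core import MODEL_TYPE

    def is_model_like(obj) -> bool:
        # MODEL_TYPE bundles the Model wrapper and its ModelData payload,
        # so both sides of the pair pass a single isinstance() check.
        return isinstance(obj, MODEL_TYPE)
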
maxframe/lib/mmh3.cp310-win_amd64.pyd CHANGED

Binary file

maxframe/protocol.py CHANGED
@@ -15,7 +15,7 @@
 import base64
 import enum
 import uuid
-from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar
+from typing import Any, Dict, Generic, List, Optional, Type, TypeVar
 
 import pandas as pd
 
@@ -38,7 +38,6 @@ from .serialization.serializables import (
     Serializable,
     SeriesField,
     StringField,
-    TupleField,
 )
 
 pickling_support.install()
@@ -92,19 +91,6 @@ class DataSerializeType(enum.Enum):
     PICKLE = 0
 
 
-class VolumeDataMeta(Serializable):
-    output_type: OutputType = EnumField(
-        "output_type", OutputType, FieldTypes.int8, default=None
-    )
-    serial_type: DataSerializeType = EnumField(
-        "serial_type", DataSerializeType, FieldTypes.int8, default=None
-    )
-    shape: Tuple[int, ...] = TupleField("shape", FieldTypes.int64, default=None)
-    nsplits: Tuple[Tuple[int, ...], ...] = TupleField(
-        "nsplits", FieldTypes.tuple(FieldTypes.tuple(FieldTypes.int64)), default=None
-    )
-
-
 _result_type_to_info_cls: Dict[ResultType, Type["ResultInfo"]] = dict()
 
maxframe/remote/core.py CHANGED
@@ -15,7 +15,7 @@
 from functools import partial
 
 from .. import opcodes
-from ..core import ENTITY_TYPE, ChunkData
+from ..core import ENTITY_TYPE
 from ..core.operator import ObjectOperator, ObjectOperatorMixin
 from ..dataframe.core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
 from ..serialization.serializables import (
@@ -26,7 +26,7 @@ from ..serialization.serializables import (
     ListField,
 )
 from ..tensor.core import TENSOR_TYPE
-from ..utils import build_fetch_tileable, find_objects, replace_objects
+from ..utils import find_objects, replace_objects
 
 
 class RemoteFunction(ObjectOperatorMixin, ObjectOperator):
@@ -63,12 +63,8 @@ class RemoteFunction(ObjectOperatorMixin, ObjectOperator):
         if raw_inputs is not None:
             for raw_inp in raw_inputs:
                 if self._no_prepare(raw_inp):
-                    if not isinstance(self._inputs[0], ChunkData):
-                        # not in tile, set_inputs from tileable
-                        mapping[raw_inp] = next(function_inputs)
-                    else:
-                        # in tile, set_inputs from chunk
-                        mapping[raw_inp] = build_fetch_tileable(raw_inp)
+                    # not in tile, set_inputs from tileable
+                    mapping[raw_inp] = next(function_inputs)
                 else:
                     mapping[raw_inp] = next(function_inputs)
         self.function_args = replace_objects(self.function_args, mapping)

maxframe/serialization/__init__.py CHANGED

@@ -17,6 +17,7 @@ from .core import (
     PickleContainer,
     Serializer,
     deserialize,
+    load_type,
     pickle_buffers,
     serialize,
     serialize_with_spawn,