maxframe-1.0.0rc2-cp310-cp310-win_amd64.whl → maxframe-1.0.0rc3-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of maxframe has been flagged as potentially problematic.
- maxframe/_utils.cp310-win_amd64.pyd +0 -0
- maxframe/codegen.py +3 -2
- maxframe/config/config.py +16 -9
- maxframe/config/validators.py +42 -12
- maxframe/conftest.py +13 -2
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/objects.py +45 -2
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp310-win_amd64.pyd +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/datasource/read_odps_query.py +1 -1
- maxframe/dataframe/datasource/read_odps_table.py +1 -1
- maxframe/dataframe/datastore/to_odps.py +1 -1
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
- maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
- maxframe/{odpsio → io/odpsio}/schema.py +5 -5
- maxframe/{odpsio → io/odpsio}/tableio.py +10 -4
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -3
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +57 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
- maxframe/learn/contrib/xgboost/predict.py +19 -5
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +25 -15
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cp310-win_amd64.pyd +0 -0
- maxframe/protocol.py +1 -15
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp310-win_amd64.pyd +0 -0
- maxframe/tensor/__init__.py +10 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +98 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +70 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/utils.py +2 -22
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +15 -61
- maxframe-1.0.0rc3.dist-info/METADATA +104 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +101 -91
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +23 -42
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +54 -18
- maxframe_client/tests/test_fetcher.py +1 -1
- maxframe_client/tests/test_session.py +14 -2
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/volumeio.py +0 -95
- maxframe-1.0.0rc2.dist-info/METADATA +0 -177
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/unique.py +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0

maxframe/{odpsio → io/odpsio}/schema.py CHANGED

@@ -21,9 +21,9 @@ import pyarrow as pa
 from odps import types as odps_types
 from pandas.api import types as pd_types
 
-from …
-from …
-from …
+from ...core import TILEABLE_TYPE, OutputType
+from ...protocol import DataFrameTableMeta
+from ...tensor.core import TENSOR_TYPE
 
 _TEMP_TABLE_PREFIX = "tmp_mf_"

@@ -184,7 +184,7 @@ def pandas_to_odps_schema(
     unknown_as_string: bool = False,
     ignore_index=False,
 ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
-    from …
+    from ... import dataframe as md
     from .arrow import pandas_to_arrow
 
     if _is_scalar_object(df_obj):

@@ -278,7 +278,7 @@ def build_table_column_name(
 def build_dataframe_table_meta(
     df_obj: Any, ignore_index: bool = False
 ) -> DataFrameTableMeta:
-    from …
+    from ... import dataframe as md
 
     col_to_count = defaultdict(lambda: 0)
     col_to_idx = defaultdict(lambda: 0)

maxframe/{odpsio → io/odpsio}/tableio.py CHANGED

@@ -20,6 +20,7 @@ from typing import Dict, List, Optional, Union
 
 import pyarrow as pa
 from odps import ODPS
+from odps import __version__ as pyodps_version
 from odps.apis.storage_api import (
     StorageApiArrowClient,
     TableBatchScanResponse,

@@ -34,13 +35,15 @@ try:
 except ImportError:
     pac = None
 
-from …
-from …
+from ...config import options
+from ...env import ODPS_STORAGE_API_ENDPOINT
+from ...lib.version import Version
 from .schema import odps_schema_to_arrow_schema
 
 PartitionsType = Union[List[str], str, None]
 
 _DEFAULT_ROW_BATCH_SIZE = 4096
+_need_convert_timezone = Version(pyodps_version) < Version("0.11.7")
 
 
 @contextmanager

@@ -191,7 +194,7 @@ class TunnelMultiPartitionReader:
             arrays = []
             for idx in range(batch.num_columns):
                 col = batch.column(idx)
-                if isinstance(col.type, pa.TimestampType):
+                if _need_convert_timezone and isinstance(col.type, pa.TimestampType):
                     if col.type.tz is not None:
                         target_type = pa.timestamp(
                             self._schema.types[idx].unit, col.type.tz

@@ -354,7 +357,10 @@ class TunnelTableIO(ODPSTableIO):
             # fixme should yield writer directly once pyodps fixes
             # related arrow timestamp bug when provided schema and
             # table schema is identical.
-            …
+            if _need_convert_timezone:
+                yield TunnelWrappedWriter(writer)
+            else:
+                yield writer
 
 
 class HaloTableArrowReader:
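
The timestamp workaround is now applied only when the installed pyodps is older than 0.11.7; newer clients handle arrow timestamp timezones themselves, so the plain tunnel writer is yielded and the reader-side cast is skipped. Below is a minimal sketch of the reader-side cast that the `_need_convert_timezone` flag gates; `_localize_timestamps` and `target_unit` are illustrative names, not part of the package:

    import pyarrow as pa

    def _localize_timestamps(batch: pa.RecordBatch, target_unit: str) -> pa.RecordBatch:
        # Cast tz-aware timestamp columns to the schema's unit, mirroring the
        # conversion the reader above performs when _need_convert_timezone is True.
        arrays = []
        for idx in range(batch.num_columns):
            col = batch.column(idx)
            if isinstance(col.type, pa.TimestampType) and col.type.tz is not None:
                col = col.cast(pa.timestamp(target_unit, col.type.tz))
            arrays.append(col)
        return pa.RecordBatch.from_arrays(arrays, names=batch.schema.names)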

maxframe/io/odpsio/tests/__init__.py ADDED

@@ -0,0 +1,13 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

maxframe/{odpsio → io/odpsio}/tests/test_schema.py CHANGED

@@ -18,9 +18,9 @@ import pyarrow as pa
 import pytest
 from odps import types as odps_types
 
-from …
-from …
-from …
+from .... import dataframe as md
+from .... import tensor as mt
+from ....core import OutputType
 from ..schema import (
     arrow_schema_to_odps_schema,
     build_dataframe_table_meta,

maxframe/{odpsio → io/odpsio}/tests/test_tableio.py CHANGED

@@ -20,9 +20,9 @@ import pyarrow as pa
 import pytest
 from odps import ODPS
 
-from …
-from …
-from …
+from ....config import options
+from ....tests.utils import flaky, tn
+from ....utils import config_odps_default_options
 from ..tableio import ODPSTableIO

maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py CHANGED

@@ -15,7 +15,7 @@
 import pytest
 from odps import ODPS
 
-from …
+from ....tests.utils import tn
 from ..volumeio import ODPSVolumeReader, ODPSVolumeWriter

@@ -69,19 +69,17 @@ def create_volume(request, oss_config):
         oss_config.oss_bucket.batch_delete_objects(keys)
 
 
-@pytest.mark.parametrize("create_volume", ["…
+@pytest.mark.parametrize("create_volume", ["external"], indirect=True)
 def test_read_write_volume(create_volume):
     test_vol_dir = "test_vol_dir"
 
     odps_entry = ODPS.from_environments()
 
     writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
-    write_session_id = writer.create_write_session()
 
     writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
-    writer.write_file("file1", b"content1"…
-    writer.write_file("file2", b"content2"…
-    writer.commit(["file1", "file2"], write_session_id)
+    writer.write_file("file1", b"content1")
+    writer.write_file("file2", b"content2")
 
     reader = ODPSVolumeReader(odps_entry, create_volume, test_vol_dir)
     assert reader.read_file("file1") == b"content1"

maxframe/io/odpsio/volumeio.py ADDED

@@ -0,0 +1,57 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Iterator, List, Union
+
+from odps import ODPS
+
+
+class ODPSVolumeReader:
+    def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
+        self._odps_entry = odps_entry
+        self._volume = odps_entry.get_volume(volume_name)
+        self._volume_dir = volume_dir
+
+    def list_files(self) -> List[str]:
+        def _get_file_name(vol_file):
+            if hasattr(vol_file, "name"):
+                return vol_file.name
+            return vol_file.path.rsplit("/", 1)[-1]
+
+        return [
+            _get_file_name(f)
+            for f in self._odps_entry.list_volume_files(
+                f"/{self._volume.name}/{self._volume_dir}"
+            )
+        ]
+
+    def read_file(self, file_name: str) -> bytes:
+        with self._volume.open_reader(self._volume_dir + "/" + file_name) as reader:
+            return reader.read()
+
+
+class ODPSVolumeWriter:
+    def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
+        self._odps_entry = odps_entry
+        self._volume = odps_entry.get_volume(volume_name)
+        self._volume_dir = volume_dir
+
+    def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
+        with self._volume.open_writer(self._volume_dir + "/" + file_name) as writer:
+            if not inspect.isgenerator(data):
+                writer.write(data)
+            else:
+                for chunk in data:
+                    writer.write(chunk)
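
Compared with the removed maxframe/odpsio/volumeio.py (95 lines), the new implementation drops the explicit session/commit protocol: each file is committed when the underlying open_writer context exits. A usage sketch based on the classes above and the updated test; the volume name is illustrative:

    from odps import ODPS

    from maxframe.io.odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter

    odps_entry = ODPS.from_environments()

    writer = ODPSVolumeWriter(odps_entry, "my_external_volume", "test_vol_dir")
    writer.write_file("file1", b"content1")                      # plain bytes
    writer.write_file("file2", (c for c in (b"con", b"tent2")))  # or a generator

    reader = ODPSVolumeReader(odps_entry, "my_external_volume", "test_vol_dir")
    assert reader.read_file("file1") == b"content1"
    assert sorted(reader.list_files()) == ["file1", "file2"]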

maxframe/learn/contrib/xgboost/classifier.py CHANGED

@@ -14,7 +14,7 @@
 
 import numpy as np
 
-from ....tensor import argmax
+from ....tensor import argmax, transpose, vstack
 from ..utils import make_import_error_func
 from .core import XGBScikitLearnBase, xgboost

@@ -42,7 +42,10 @@ else:
             sample_weight_eval_set=None,
             base_margin_eval_set=None,
             num_class=None,
+            **kw,
         ):
+            session = kw.pop("session", None)
+            run_kwargs = kw.pop("run_kwargs", dict())
             dtrain, evals = wrap_evaluation_matrices(
                 None,
                 X,

@@ -68,6 +71,8 @@ else:
                 evals=evals,
                 evals_result=self.evals_result_,
                 num_class=num_class,
+                session=session,
+                run_kwargs=run_kwargs,
             )
             self._Booster = result
             return self
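
With these two hunks, fit() accepts session and run_kwargs through **kw and forwards them to train(), which now executes eagerly. A hedged sketch, assuming the estimator is exported as XGBClassifier under maxframe.learn.contrib.xgboost:

    import numpy as np

    from maxframe.learn.contrib.xgboost import XGBClassifier  # assumed export path

    X = np.random.rand(100, 4)
    y = np.random.randint(0, 2, 100)

    clf = XGBClassifier()
    # session/run_kwargs are popped from **kw and passed through to train()
    clf.fit(X, y, session=None, run_kwargs=dict())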

@@ -83,4 +88,23 @@ else:
         def predict_proba(self, data, ntree_limit=None, flag=False, **kw):
             if ntree_limit is not None:
                 raise NotImplementedError("ntree_limit is not currently supported")
-            …
+            prediction = predict(self.get_booster(), data, flag=flag, **kw)
+
+            if len(prediction.shape) == 2 and prediction.shape[1] == self.n_classes_:
+                # multi-class
+                return prediction
+            if (
+                len(prediction.shape) == 2
+                and self.n_classes_ == 2
+                and prediction.shape[1] >= self.n_classes_
+            ):
+                # multi-label
+                return prediction
+            # binary logistic function
+            classone_probs = prediction
+            classzero_probs = 1.0 - classone_probs
+            return transpose(vstack((classzero_probs, classone_probs)))
+
+        @property
+        def classes_(self) -> np.ndarray:
+            return np.arange(self.n_classes_)
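
In the binary case the booster returns a 1-D array of class-one probabilities; stacking it with its complement and transposing produces the (n_samples, 2) layout scikit-learn expects. The same arithmetic in plain numpy:

    import numpy as np

    classone_probs = np.array([0.9, 0.2, 0.6])   # P(class 1) from the booster
    classzero_probs = 1.0 - classone_probs       # complement gives P(class 0)
    proba = np.transpose(np.vstack((classzero_probs, classone_probs)))
    print(proba.shape)  # (3, 2); row i is [P(class 0), P(class 1)] for sample i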

maxframe/learn/contrib/xgboost/core.py CHANGED

@@ -12,15 +12,67 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Callable, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 try:
     import xgboost
 except ImportError:
     xgboost = None
 
+from ...core import Model, ModelData
 from .dmatrix import DMatrix
 
+
+class BoosterData(ModelData):
+    __slots__ = ("_evals_result",)
+
+    _evals_result: Dict
+
+    def __init__(self, *args, evals_result=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._evals_result = evals_result if evals_result is not None else dict()
+
+    def execute(self, session=None, **kw):
+        # The evals_result should be fetched when BoosterData.execute() is called.
+        result = super().execute(session=session, **kw)
+        if self.op.has_evals_result and self.key == self.op.outputs[0].key:
+            self._evals_result.update(self.op.outputs[1].fetch(session=session))
+        return result
+
+    def predict(
+        self,
+        data,
+        output_margin=False,
+        pred_leaf=False,
+        pred_contribs=False,
+        approx_contribs=False,
+        pred_interactions=False,
+        validate_features=True,
+        training=False,
+        iteration_range=None,
+        strict_shape=False,
+    ):
+        from .predict import predict
+
+        return predict(
+            self,
+            data,
+            output_margin=output_margin,
+            pred_leaf=pred_leaf,
+            pred_contribs=pred_contribs,
+            approx_contribs=approx_contribs,
+            pred_interactions=pred_interactions,
+            validate_features=validate_features,
+            training=training,
+            iteration_range=iteration_range,
+            strict_shape=strict_shape,
+        )
+
+
+class Booster(Model):
+    pass
+
+
 if not xgboost:
     XGBScikitLearnBase = None
 else:

@@ -40,7 +92,9 @@ else:
             **kw,
         ):
             """
-            Fit the regressor.
+            Fit the regressor. Note that fit() is an eager-execution
+            API; the call blocks until training finishes.
+
             Parameters
             ----------
             X : array_like

@@ -72,6 +126,37 @@ else:
             """
             raise NotImplementedError
 
+        def evals_result(self, **kw) -> Dict:
+            """Return the evaluation results.
+
+            If **eval_set** is passed to the :py:meth:`fit` function, you can call
+            ``evals_result()`` to get evaluation results for all passed **eval_sets**.
+            When **eval_metric** is also passed to the :py:meth:`fit` function, the
+            **evals_result** will contain the **eval_metrics** passed to the
+            :py:meth:`fit` function.
+
+            The returned evaluation result is a dictionary:
+
+            .. code-block:: python
+
+                {'validation_0': {'logloss': ['0.604835', '0.531479']},
+                 'validation_1': {'logloss': ['0.41965', '0.17686']}}
+
+            Note that evals_result() blocks until training has finished.
+
+            Returns
+            -------
+            evals_result
+            """
+            result = super().evals_result()
+            if not self._Booster.op.has_evals_result or len(result) != 0:
+                return result
+            session = kw.pop("session", None)
+            run_kwargs = kw.pop("run_kwargs", dict())
+            self._Booster.execute(session=session, **run_kwargs)
+            return super().evals_result()
+
     def wrap_evaluation_matrices(
         missing: float,
         X: Any,
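
A hedged round-trip sketch of the blocking evals_result() above, again assuming the XGBClassifier export path and that fit() accepts an eval_set keyword as in the standard xgboost sklearn API:

    import numpy as np

    from maxframe.learn.contrib.xgboost import XGBClassifier  # assumed export path

    X, y = np.random.rand(80, 4), np.random.randint(0, 2, 80)
    X_val, y_val = np.random.rand(20, 4), np.random.randint(0, 2, 20)

    clf = XGBClassifier(eval_metric="logloss")
    clf.fit(X, y, eval_set=[(X_val, y_val)])
    history = clf.evals_result()          # blocks until training has finished
    print(history["validation_0"]["logloss"])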

maxframe/learn/contrib/xgboost/dmatrix.py CHANGED

@@ -99,10 +99,7 @@ def check_array_like(y: TileableType, name: str) -> TileableType:
     y = convert_to_tensor_or_dataframe(y)
     if isinstance(y, DATAFRAME_TYPE):
         y = y.iloc[:, 0]
-
-    if y.ndim != 1:
-        raise ValueError(f"Expecting 1-d {name}, got: {y.ndim}-d")
-    return y
+    return astensor(y)
 
 
 def to_dmatrix(

maxframe/learn/contrib/xgboost/predict.py CHANGED

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pickle
 
 import numpy as np
 import pandas as pd

@@ -22,8 +21,14 @@ from ....core.entity.output_types import OutputType
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
 from ....dataframe.utils import parse_index
-from ....serialization.serializables import …
+from ....serialization.serializables import (
+    BoolField,
+    KeyField,
+    ReferenceField,
+    TupleField,
+)
 from ....tensor.core import TENSOR_TYPE, TensorOrder
+from .core import BoosterData
 from .dmatrix import check_data

@@ -32,9 +37,7 @@ class XGBPredict(Operator, TileableOperatorMixin):
     output_dtype = np.dtype(np.float32)
 
     data = KeyField("data", default=None)
-    model = …(
-        "model", on_serialize=pickle.dumps, on_deserialize=pickle.loads, default=None
-    )
+    model = ReferenceField("model", reference_type=BoosterData, default=None)
     pred_leaf = BoolField("pred_leaf", default=False)
     pred_contribs = BoolField("pred_contribs", default=False)
     approx_contribs = BoolField("approx_contribs", default=False)

@@ -107,6 +110,17 @@ def predict(
     strict_shape=False,
     flag=False,
 ):
+    """
+    Predict data with a MaxFrame XGBoost model.
+
+    Parameters
+    ----------
+    Parameters are the same as `xgboost.Booster.predict`. Note that predict()
+    is a lazy-execution API: the result is computed only when executed.
+
+    Returns
+    -------
+    results: Tensor
+    """
     data = check_data(data)
     # TODO: check model datatype
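
Unlike train(), predict() stays lazy: it only builds an XGBPredict tileable, and nothing runs until execute() is called. A hedged sketch; the export paths and the to_dmatrix(label=...) signature are assumptions:

    import numpy as np

    from maxframe.learn.contrib.xgboost import predict, train  # assumed export path
    from maxframe.learn.contrib.xgboost.dmatrix import to_dmatrix

    dtrain = to_dmatrix(np.random.rand(100, 4), label=np.random.randint(0, 2, 100))
    dtest = np.random.rand(10, 4)

    booster = train({"objective": "binary:logistic"}, dtrain)  # eager: blocks
    pred = predict(booster, dtest)                             # lazy: builds the graph
    print(pred.execute().fetch())                              # triggers execution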

maxframe/learn/contrib/xgboost/regressor.py CHANGED

@@ -41,11 +41,6 @@ else:
         ):
             session = kw.pop("session", None)
             run_kwargs = kw.pop("run_kwargs", dict())
-            if kw:
-                raise TypeError(
-                    f"fit got an unexpected keyword argument '{next(iter(kw))}'"
-                )
-
             dtrain, evals = wrap_evaluation_matrices(
                 None,
                 X,

@@ -57,6 +52,8 @@ else:
                 base_margin_eval_set,
             )
             params = self.get_xgb_params()
+            if not params.get("objective"):
+                params["objective"] = "reg:squarederror"
             self.evals_result_ = dict()
             result = train(
                 params,

@@ -71,8 +68,4 @@ else:
             return self
 
         def predict(self, data, **kw):
-            …
-            run_kwargs = kw.pop("run_kwargs", None)
-            return predict(
-                self.get_booster(), data, session=session, run_kwargs=run_kwargs, **kw
-            )
+            return predict(self.get_booster(), data, **kw)

maxframe/learn/contrib/xgboost/train.py CHANGED

@@ -29,6 +29,7 @@ from ....serialization.serializables import (
     KeyField,
     ListField,
 )
+from .core import Booster
 from .dmatrix import ToDMatrix, to_dmatrix
 
 logger = logging.getLogger(__name__)

@@ -59,49 +60,59 @@ class XGBTrain(Operator, TileableOperatorMixin):
     num_boost_round = Int64Field("num_boost_round", default=10)
     num_class = Int64Field("num_class", default=None)
 
-    # Store evals_result in local to store the remote evals_result
-    evals_result: dict = None
-
     def __init__(self, gpu=None, **kw):
         super().__init__(gpu=gpu, **kw)
         if self.output_types is None:
             self.output_types = [OutputType.object]
+            if self.has_evals_result:
+                self.output_types.append(OutputType.object)
 
     def _set_inputs(self, inputs):
         super()._set_inputs(inputs)
         self.dtrain = self._inputs[0]
         rest = self._inputs[1:]
-        if self.…
+        if self.has_evals_result:
             evals_dict = OrderedDict(self.evals)
             new_evals_dict = OrderedDict()
             for new_key, val in zip(rest, evals_dict.values()):
                 new_evals_dict[new_key] = val
             self.evals = list(new_evals_dict.items())
 
-    def __call__(self):
+    def __call__(self, evals_result):
         inputs = [self.dtrain]
-        if self.…
+        if self.has_evals_result:
             inputs.extend(e[0] for e in self.evals)
-        return self.…
+        return self.new_tileables(
+            inputs, object_class=Booster, evals_result=evals_result
+        )[0]
+
+    @property
+    def output_limit(self):
+        return 2 if self.has_evals_result else 1
+
+    @property
+    def has_evals_result(self) -> bool:
+        return self.evals
 
 
 def train(params, dtrain, evals=None, evals_result=None, num_class=None, **kwargs):
     """
-    Train XGBoost model in …
+    Train an XGBoost model in the MaxFrame manner.
 
     Parameters
     ----------
-    Parameters are the same as `xgboost.train`.
+    Parameters are the same as `xgboost.train`. Note that train() is an
+    eager-execution API; the call blocks until training finishes.
 
     Returns
     -------
     results: Booster
     """
-    evals_result = evals_result …
-    evals = None or ()
-
+    evals_result = evals_result if evals_result is not None else dict()
     processed_evals = []
+    session = kwargs.pop("session", None)
+    run_kwargs = kwargs.pop("run_kwargs", dict())
    if evals:
        for eval_dmatrix, name in evals:
            if not isinstance(name, str):

@@ -110,12 +121,11 @@ def train(params, dtrain, evals=None, evals_result=None, num_class=None, **kwargs):
             processed_evals.append((eval_dmatrix, name))
         else:
             processed_evals.append((to_dmatrix(eval_dmatrix), name))
-
     return XGBTrain(
         params=params,
         dtrain=dtrain,
         evals=processed_evals,
         evals_result=evals_result,
         num_class=num_class,
-        **kwargs
-    )()
+        **kwargs,
+    )(evals_result).execute(session=session, **run_kwargs)
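
With this change train() runs eagerly and fills the caller's evals_result dict in place through the operator's second output, which BoosterData.execute() fetches. A hedged sketch; the export path and the to_dmatrix(label=...) signature are assumptions:

    import numpy as np

    from maxframe.learn.contrib.xgboost import train  # assumed export path
    from maxframe.learn.contrib.xgboost.dmatrix import to_dmatrix

    dtrain = to_dmatrix(np.random.rand(100, 4), label=np.random.randint(0, 2, 100))
    dval = to_dmatrix(np.random.rand(20, 4), label=np.random.randint(0, 2, 20))

    evals_result = dict()
    booster = train(
        {"objective": "binary:logistic"},
        dtrain,
        evals=[(dval, "validation_0")],
        evals_result=evals_result,  # populated in place once training finishes
    )
    print(evals_result["validation_0"])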

maxframe/{core/operator/fuse.py → learn/core.py} CHANGED

@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from …
-from ...serialization.serializables import ReferenceField
-from ..graph import ChunkGraph
-from .base import Operator
+from ..core.entity.objects import Object, ObjectData
 
 
-class …
-
-    _op_type_ = opcodes.FUSE
+class ModelData(ObjectData):
+    pass
 
-    fuse_graph = ReferenceField("fuse_graph", ChunkGraph)
+
+class Model(Object):
+    pass
 
-
-
+
+MODEL_TYPE = (Model, ModelData)

maxframe/lib/mmh3.cp310-win_amd64.pyd CHANGED (binary file)

maxframe/protocol.py CHANGED

@@ -15,7 +15,7 @@
 import base64
 import enum
 import uuid
-from typing import Any, Dict, Generic, List, Optional, …
+from typing import Any, Dict, Generic, List, Optional, Type, TypeVar
 
 import pandas as pd

@@ -38,7 +38,6 @@ from .serialization.serializables import (
     Serializable,
     SeriesField,
     StringField,
-    TupleField,
 )
 
 pickling_support.install()

@@ -92,19 +91,6 @@ class DataSerializeType(enum.Enum):
     PICKLE = 0
 
 
-class VolumeDataMeta(Serializable):
-    output_type: OutputType = EnumField(
-        "output_type", OutputType, FieldTypes.int8, default=None
-    )
-    serial_type: DataSerializeType = EnumField(
-        "serial_type", DataSerializeType, FieldTypes.int8, default=None
-    )
-    shape: Tuple[int, ...] = TupleField("shape", FieldTypes.int64, default=None)
-    nsplits: Tuple[Tuple[int, ...], ...] = TupleField(
-        "nsplits", FieldTypes.tuple(FieldTypes.tuple(FieldTypes.int64)), default=None
-    )
-
-
 _result_type_to_info_cls: Dict[ResultType, Type["ResultInfo"]] = dict()

maxframe/remote/core.py CHANGED

@@ -15,7 +15,7 @@
 from functools import partial
 
 from .. import opcodes
-from ..core import ENTITY_TYPE, …
+from ..core import ENTITY_TYPE
 from ..core.operator import ObjectOperator, ObjectOperatorMixin
 from ..dataframe.core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
 from ..serialization.serializables import (

@@ -26,7 +26,7 @@ from ..serialization.serializables import (
     ListField,
 )
 from ..tensor.core import TENSOR_TYPE
-from ..utils import …
+from ..utils import find_objects, replace_objects

@@ -63,12 +63,8 @@ class RemoteFunction(ObjectOperatorMixin, ObjectOperator):
         if raw_inputs is not None:
             for raw_inp in raw_inputs:
                 if self._no_prepare(raw_inp):
-                    …
-                        …
-                        mapping[raw_inp] = next(function_inputs)
-                    else:
-                        # in tile, set_inputs from chunk
-                        mapping[raw_inp] = build_fetch_tileable(raw_inp)
+                    # not in tile, set_inputs from tileable
+                    mapping[raw_inp] = next(function_inputs)
                 else:
                     mapping[raw_inp] = next(function_inputs)
         self.function_args = replace_objects(self.function_args, mapping)