maxframe 1.0.0rc1__cp311-cp311-macosx_10_9_universal2.whl → 1.0.0rc3__cp311-cp311-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cpython-311-darwin.so +0 -0
- maxframe/codegen.py +3 -6
- maxframe/config/config.py +49 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +15 -2
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/objects.py +46 -3
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cpython-311-darwin.so +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +5 -5
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +7 -1
- maxframe/dataframe/datasource/read_odps_table.py +3 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/datastore/to_odps.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/apply.py +3 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +4 -7
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +12 -8
- maxframe/{odpsio → io/odpsio}/schema.py +15 -12
- maxframe/io/odpsio/tableio.py +702 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +19 -18
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +57 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
- maxframe/learn/contrib/xgboost/predict.py +21 -7
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +27 -17
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
- maxframe/protocol.py +41 -17
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cpython-311-darwin.so +0 -0
- maxframe/serialization/serializables/core.py +48 -9
- maxframe/tensor/__init__.py +69 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +98 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +70 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/{base → misc}/unique.py +2 -2
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/utils.py +11 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +32 -70
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/METADATA +25 -25
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +133 -123
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +60 -68
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +58 -22
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +27 -4
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -322
- maxframe/odpsio/volumeio.py +0 -95
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0
|
@@ -18,9 +18,9 @@ import pyarrow as pa
|
|
|
18
18
|
import pytest
|
|
19
19
|
from odps import types as odps_types
|
|
20
20
|
|
|
21
|
-
from
|
|
22
|
-
from
|
|
23
|
-
from
|
|
21
|
+
from .... import dataframe as md
|
|
22
|
+
from .... import tensor as mt
|
|
23
|
+
from ....core import OutputType
|
|
24
24
|
from ..schema import (
|
|
25
25
|
arrow_schema_to_odps_schema,
|
|
26
26
|
build_dataframe_table_meta,
|
|
@@ -143,17 +143,17 @@ def test_pandas_to_odps_schema_index(wrap_obj):
|
|
|
143
143
|
data = pd.Index(np.random.randint(0, 100, 100))
|
|
144
144
|
|
|
145
145
|
test_idx = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
146
|
+
for ignore_idx in (False, True):
|
|
147
|
+
schema, meta = pandas_to_odps_schema(
|
|
148
|
+
test_idx, unknown_as_string=True, ignore_index=ignore_idx
|
|
149
|
+
)
|
|
150
|
+
assert [c.name for c in schema.columns] == ["_idx_0"]
|
|
151
|
+
assert [c.type.name for c in schema.columns] == ["bigint"]
|
|
152
|
+
assert meta.type == OutputType.index
|
|
153
|
+
assert meta.table_column_names == []
|
|
154
|
+
assert meta.table_index_column_names == ["_idx_0"]
|
|
155
|
+
assert meta.pd_column_level_names == []
|
|
156
|
+
assert meta.pd_index_level_names == [None]
|
|
157
157
|
|
|
158
158
|
data = pd.MultiIndex.from_arrays(
|
|
159
159
|
[np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
|
|
@@ -177,6 +177,7 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
|
|
|
177
177
|
test_scalar = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
178
178
|
if wrap_obj != "no":
|
|
179
179
|
test_scalar.op.data = None
|
|
180
|
+
|
|
180
181
|
schema, meta = pandas_to_odps_schema(test_scalar, unknown_as_string=True)
|
|
181
182
|
assert schema.columns[0].name == "_idx_0"
|
|
182
183
|
assert schema.columns[0].type.name == "double"
|
|
@@ -186,9 +187,6 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
|
|
|
186
187
|
assert meta.pd_column_level_names == []
|
|
187
188
|
assert meta.pd_index_level_names == [None]
|
|
188
189
|
|
|
189
|
-
with pytest.raises(AssertionError):
|
|
190
|
-
pandas_to_odps_schema(test_scalar, unknown_as_string=True, ignore_index=True)
|
|
191
|
-
|
|
192
190
|
|
|
193
191
|
def test_odps_arrow_schema_conversion():
|
|
194
192
|
odps_schema = odps_types.OdpsSchema(
|
|
@@ -211,10 +209,11 @@ def test_odps_arrow_schema_conversion():
|
|
|
211
209
|
odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
|
|
212
210
|
odps_types.Column("col17", "CHAR(15)"),
|
|
213
211
|
odps_types.Column("col18", "VARCHAR(15)"),
|
|
212
|
+
odps_types.Column("col19", "decimal"),
|
|
214
213
|
]
|
|
215
214
|
)
|
|
216
215
|
arrow_schema = odps_schema_to_arrow_schema(odps_schema)
|
|
217
|
-
assert arrow_schema.names == [f"col{i}" for i in range(1,
|
|
216
|
+
assert arrow_schema.names == [f"col{i}" for i in range(1, 20)]
|
|
218
217
|
assert arrow_schema.types == [
|
|
219
218
|
pa.string(),
|
|
220
219
|
pa.binary(),
|
|
@@ -234,6 +233,7 @@ def test_odps_arrow_schema_conversion():
|
|
|
234
233
|
pa.struct([("a1", pa.string()), ("a2", pa.map_(pa.string(), pa.int64()))]),
|
|
235
234
|
pa.string(),
|
|
236
235
|
pa.string(),
|
|
236
|
+
pa.decimal128(38, 18),
|
|
237
237
|
]
|
|
238
238
|
|
|
239
239
|
expected_odps_schema = odps_types.OdpsSchema(
|
|
@@ -256,6 +256,7 @@ def test_odps_arrow_schema_conversion():
|
|
|
256
256
|
odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
|
|
257
257
|
odps_types.Column("col17", "string"),
|
|
258
258
|
odps_types.Column("col18", "string"),
|
|
259
|
+
odps_types.Column("col19", "decimal(38, 18)"),
|
|
259
260
|
]
|
|
260
261
|
)
|
|
261
262
|
|
|
@@ -12,22 +12,37 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import datetime
|
|
16
|
+
|
|
15
17
|
import numpy as np
|
|
16
18
|
import pandas as pd
|
|
17
19
|
import pyarrow as pa
|
|
20
|
+
import pytest
|
|
18
21
|
from odps import ODPS
|
|
19
22
|
|
|
20
|
-
from
|
|
21
|
-
from
|
|
22
|
-
from
|
|
23
|
+
from ....config import options
|
|
24
|
+
from ....tests.utils import flaky, tn
|
|
25
|
+
from ....utils import config_odps_default_options
|
|
26
|
+
from ..tableio import ODPSTableIO
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.fixture
|
|
30
|
+
def switch_table_io(request):
|
|
31
|
+
old_use_common_table = options.use_common_table
|
|
32
|
+
try:
|
|
33
|
+
options.use_common_table = request.param
|
|
34
|
+
yield
|
|
35
|
+
finally:
|
|
36
|
+
options.use_common_table = old_use_common_table
|
|
23
37
|
|
|
24
38
|
|
|
25
39
|
@flaky(max_runs=3)
|
|
26
|
-
|
|
40
|
+
@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
|
|
41
|
+
def test_empty_table_io(switch_table_io):
|
|
27
42
|
config_odps_default_options()
|
|
28
43
|
|
|
29
44
|
o = ODPS.from_environments()
|
|
30
|
-
|
|
45
|
+
table_io = ODPSTableIO(o)
|
|
31
46
|
|
|
32
47
|
# test read from empty table
|
|
33
48
|
empty_table_name = tn("test_empty_table_halo_read")
|
|
@@ -35,42 +50,53 @@ def test_empty_table_io():
|
|
|
35
50
|
tb = o.create_table(empty_table_name, "col1 string", lifecycle=1)
|
|
36
51
|
|
|
37
52
|
try:
|
|
38
|
-
with
|
|
53
|
+
with table_io.open_reader(empty_table_name) as reader:
|
|
39
54
|
assert len(reader.read_all()) == 0
|
|
40
55
|
finally:
|
|
41
56
|
tb.drop()
|
|
42
57
|
|
|
43
58
|
|
|
44
59
|
@flaky(max_runs=3)
|
|
45
|
-
|
|
60
|
+
@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
|
|
61
|
+
def test_table_io_without_parts(switch_table_io):
|
|
46
62
|
config_odps_default_options()
|
|
47
63
|
|
|
48
64
|
o = ODPS.from_environments()
|
|
49
|
-
|
|
65
|
+
table_io = ODPSTableIO(o)
|
|
50
66
|
|
|
51
67
|
# test read and write tables without partition
|
|
52
68
|
no_part_table_name = tn("test_no_part_halo_write")
|
|
53
69
|
o.delete_table(no_part_table_name, if_exists=True)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
)
|
|
70
|
+
col_desc = ",".join(f"{c} double" for c in "abcde") + ", f datetime"
|
|
71
|
+
tb = o.create_table(no_part_table_name, col_desc, lifecycle=1)
|
|
57
72
|
|
|
58
73
|
try:
|
|
59
74
|
pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
|
|
60
|
-
|
|
75
|
+
date_val = [
|
|
76
|
+
(
|
|
77
|
+
datetime.datetime.now().replace(microsecond=0)
|
|
78
|
+
+ datetime.timedelta(seconds=i)
|
|
79
|
+
)
|
|
80
|
+
for i in range(100)
|
|
81
|
+
]
|
|
82
|
+
pd_data["f"] = pd.Series(date_val, dtype="datetime64[ms]").dt.tz_localize(
|
|
83
|
+
options.local_timezone
|
|
84
|
+
)
|
|
85
|
+
with table_io.open_writer(no_part_table_name) as writer:
|
|
61
86
|
writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
|
|
62
|
-
with
|
|
87
|
+
with table_io.open_reader(no_part_table_name) as reader:
|
|
63
88
|
pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
|
|
64
89
|
finally:
|
|
65
90
|
tb.drop()
|
|
66
91
|
|
|
67
92
|
|
|
68
93
|
@flaky(max_runs=3)
|
|
69
|
-
|
|
94
|
+
@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
|
|
95
|
+
def test_table_io_with_range_reader(switch_table_io):
|
|
70
96
|
config_odps_default_options()
|
|
71
97
|
|
|
72
98
|
o = ODPS.from_environments()
|
|
73
|
-
|
|
99
|
+
table_io = ODPSTableIO(o)
|
|
74
100
|
|
|
75
101
|
# test read and write tables without partition
|
|
76
102
|
no_part_table_name = tn("test_no_part_halo_write")
|
|
@@ -81,15 +107,15 @@ def test_table_io_with_range_reader():
|
|
|
81
107
|
|
|
82
108
|
try:
|
|
83
109
|
pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
|
|
84
|
-
with
|
|
110
|
+
with table_io.open_writer(no_part_table_name) as writer:
|
|
85
111
|
writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
|
|
86
112
|
|
|
87
|
-
with
|
|
113
|
+
with table_io.open_reader(
|
|
88
114
|
no_part_table_name, start=None, stop=100, row_batch_size=10
|
|
89
115
|
) as reader:
|
|
90
116
|
pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
|
|
91
117
|
|
|
92
|
-
with
|
|
118
|
+
with table_io.open_reader(
|
|
93
119
|
no_part_table_name,
|
|
94
120
|
start=-2,
|
|
95
121
|
stop=-52,
|
|
@@ -105,11 +131,12 @@ def test_table_io_with_range_reader():
|
|
|
105
131
|
|
|
106
132
|
|
|
107
133
|
@flaky(max_runs=3)
|
|
108
|
-
|
|
134
|
+
@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
|
|
135
|
+
def test_table_io_with_parts(switch_table_io):
|
|
109
136
|
config_odps_default_options()
|
|
110
137
|
|
|
111
138
|
o = ODPS.from_environments()
|
|
112
|
-
|
|
139
|
+
table_io = ODPSTableIO(o)
|
|
113
140
|
|
|
114
141
|
# test read and write tables with partition
|
|
115
142
|
parted_table_name = tn("test_parted_halo_write")
|
|
@@ -122,11 +149,11 @@ def test_table_io_with_parts():
|
|
|
122
149
|
|
|
123
150
|
try:
|
|
124
151
|
pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
|
|
125
|
-
with
|
|
152
|
+
with table_io.open_writer(parted_table_name, "pt=test") as writer:
|
|
126
153
|
writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
|
|
127
|
-
with
|
|
154
|
+
with table_io.open_reader(parted_table_name, "pt=test") as reader:
|
|
128
155
|
pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
|
|
129
|
-
with
|
|
156
|
+
with table_io.open_reader(
|
|
130
157
|
parted_table_name, "pt=test", partition_columns=True
|
|
131
158
|
) as reader:
|
|
132
159
|
expected_data = pd_data.copy()
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
import pytest
|
|
16
16
|
from odps import ODPS
|
|
17
17
|
|
|
18
|
-
from
|
|
18
|
+
from ....tests.utils import tn
|
|
19
19
|
from ..volumeio import ODPSVolumeReader, ODPSVolumeWriter
|
|
20
20
|
|
|
21
21
|
|
|
@@ -69,19 +69,17 @@ def create_volume(request, oss_config):
|
|
|
69
69
|
oss_config.oss_bucket.batch_delete_objects(keys)
|
|
70
70
|
|
|
71
71
|
|
|
72
|
-
@pytest.mark.parametrize("create_volume", ["
|
|
72
|
+
@pytest.mark.parametrize("create_volume", ["external"], indirect=True)
|
|
73
73
|
def test_read_write_volume(create_volume):
|
|
74
74
|
test_vol_dir = "test_vol_dir"
|
|
75
75
|
|
|
76
76
|
odps_entry = ODPS.from_environments()
|
|
77
77
|
|
|
78
78
|
writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
|
|
79
|
-
write_session_id = writer.create_write_session()
|
|
80
79
|
|
|
81
80
|
writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
|
|
82
|
-
writer.write_file("file1", b"content1"
|
|
83
|
-
writer.write_file("file2", b"content2"
|
|
84
|
-
writer.commit(["file1", "file2"], write_session_id)
|
|
81
|
+
writer.write_file("file1", b"content1")
|
|
82
|
+
writer.write_file("file2", b"content2")
|
|
85
83
|
|
|
86
84
|
reader = ODPSVolumeReader(odps_entry, create_volume, test_vol_dir)
|
|
87
85
|
assert reader.read_file("file1") == b"content1"
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import inspect
|
|
16
|
+
from typing import Iterator, List, Union
|
|
17
|
+
|
|
18
|
+
from odps import ODPS
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ODPSVolumeReader:
|
|
22
|
+
def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
|
|
23
|
+
self._odps_entry = odps_entry
|
|
24
|
+
self._volume = odps_entry.get_volume(volume_name)
|
|
25
|
+
self._volume_dir = volume_dir
|
|
26
|
+
|
|
27
|
+
def list_files(self) -> List[str]:
|
|
28
|
+
def _get_file_name(vol_file):
|
|
29
|
+
if hasattr(vol_file, "name"):
|
|
30
|
+
return vol_file.name
|
|
31
|
+
return vol_file.path.rsplit("/", 1)[-1]
|
|
32
|
+
|
|
33
|
+
return [
|
|
34
|
+
_get_file_name(f)
|
|
35
|
+
for f in self._odps_entry.list_volume_files(
|
|
36
|
+
f"/{self._volume.name}/{self._volume_dir}"
|
|
37
|
+
)
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
def read_file(self, file_name: str) -> bytes:
|
|
41
|
+
with self._volume.open_reader(self._volume_dir + "/" + file_name) as reader:
|
|
42
|
+
return reader.read()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ODPSVolumeWriter:
|
|
46
|
+
def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
|
|
47
|
+
self._odps_entry = odps_entry
|
|
48
|
+
self._volume = odps_entry.get_volume(volume_name)
|
|
49
|
+
self._volume_dir = volume_dir
|
|
50
|
+
|
|
51
|
+
def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
|
|
52
|
+
with self._volume.open_writer(self._volume_dir + "/" + file_name) as writer:
|
|
53
|
+
if not inspect.isgenerator(data):
|
|
54
|
+
writer.write(data)
|
|
55
|
+
else:
|
|
56
|
+
for chunk in data:
|
|
57
|
+
writer.write(chunk)
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
|
|
17
|
-
from ....tensor import argmax
|
|
17
|
+
from ....tensor import argmax, transpose, vstack
|
|
18
18
|
from ..utils import make_import_error_func
|
|
19
19
|
from .core import XGBScikitLearnBase, xgboost
|
|
20
20
|
|
|
@@ -42,7 +42,10 @@ else:
|
|
|
42
42
|
sample_weight_eval_set=None,
|
|
43
43
|
base_margin_eval_set=None,
|
|
44
44
|
num_class=None,
|
|
45
|
+
**kw,
|
|
45
46
|
):
|
|
47
|
+
session = kw.pop("session", None)
|
|
48
|
+
run_kwargs = kw.pop("run_kwargs", dict())
|
|
46
49
|
dtrain, evals = wrap_evaluation_matrices(
|
|
47
50
|
None,
|
|
48
51
|
X,
|
|
@@ -68,6 +71,8 @@ else:
|
|
|
68
71
|
evals=evals,
|
|
69
72
|
evals_result=self.evals_result_,
|
|
70
73
|
num_class=num_class,
|
|
74
|
+
session=session,
|
|
75
|
+
run_kwargs=run_kwargs,
|
|
71
76
|
)
|
|
72
77
|
self._Booster = result
|
|
73
78
|
return self
|
|
@@ -83,4 +88,23 @@ else:
|
|
|
83
88
|
def predict_proba(self, data, ntree_limit=None, flag=False, **kw):
|
|
84
89
|
if ntree_limit is not None:
|
|
85
90
|
raise NotImplementedError("ntree_limit is not currently supported")
|
|
86
|
-
|
|
91
|
+
prediction = predict(self.get_booster(), data, flag=flag, **kw)
|
|
92
|
+
|
|
93
|
+
if len(prediction.shape) == 2 and prediction.shape[1] == self.n_classes_:
|
|
94
|
+
# multi-class
|
|
95
|
+
return prediction
|
|
96
|
+
if (
|
|
97
|
+
len(prediction.shape) == 2
|
|
98
|
+
and self.n_classes_ == 2
|
|
99
|
+
and prediction.shape[1] >= self.n_classes_
|
|
100
|
+
):
|
|
101
|
+
# multi-label
|
|
102
|
+
return prediction
|
|
103
|
+
# binary logistic function
|
|
104
|
+
classone_probs = prediction
|
|
105
|
+
classzero_probs = 1.0 - classone_probs
|
|
106
|
+
return transpose(vstack((classzero_probs, classone_probs)))
|
|
107
|
+
|
|
108
|
+
@property
|
|
109
|
+
def classes_(self) -> np.ndarray:
|
|
110
|
+
return np.arange(self.n_classes_)
|
|
@@ -12,15 +12,67 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
from typing import Any, Callable, List, Optional, Tuple
|
|
15
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
16
16
|
|
|
17
17
|
try:
|
|
18
18
|
import xgboost
|
|
19
19
|
except ImportError:
|
|
20
20
|
xgboost = None
|
|
21
21
|
|
|
22
|
+
from ...core import Model, ModelData
|
|
22
23
|
from .dmatrix import DMatrix
|
|
23
24
|
|
|
25
|
+
|
|
26
|
+
class BoosterData(ModelData):
|
|
27
|
+
__slots__ = ("_evals_result",)
|
|
28
|
+
|
|
29
|
+
_evals_result: Dict
|
|
30
|
+
|
|
31
|
+
def __init__(self, *args, evals_result=None, **kwargs):
|
|
32
|
+
super().__init__(*args, **kwargs)
|
|
33
|
+
self._evals_result = evals_result if evals_result is not None else dict()
|
|
34
|
+
|
|
35
|
+
def execute(self, session=None, **kw):
|
|
36
|
+
# The evals_result should be fetched when BoosterData.execute() is called.
|
|
37
|
+
result = super().execute(session=session, **kw)
|
|
38
|
+
if self.op.has_evals_result and self.key == self.op.outputs[0].key:
|
|
39
|
+
self._evals_result.update(self.op.outputs[1].fetch(session=session))
|
|
40
|
+
return result
|
|
41
|
+
|
|
42
|
+
def predict(
|
|
43
|
+
self,
|
|
44
|
+
data,
|
|
45
|
+
output_margin=False,
|
|
46
|
+
pred_leaf=False,
|
|
47
|
+
pred_contribs=False,
|
|
48
|
+
approx_contribs=False,
|
|
49
|
+
pred_interactions=False,
|
|
50
|
+
validate_features=True,
|
|
51
|
+
training=False,
|
|
52
|
+
iteration_range=None,
|
|
53
|
+
strict_shape=False,
|
|
54
|
+
):
|
|
55
|
+
from .predict import predict
|
|
56
|
+
|
|
57
|
+
return predict(
|
|
58
|
+
self,
|
|
59
|
+
data,
|
|
60
|
+
output_margin=output_margin,
|
|
61
|
+
pred_leaf=pred_leaf,
|
|
62
|
+
pred_contribs=pred_contribs,
|
|
63
|
+
approx_contribs=approx_contribs,
|
|
64
|
+
pred_interactions=pred_interactions,
|
|
65
|
+
validate_features=validate_features,
|
|
66
|
+
training=training,
|
|
67
|
+
iteration_range=iteration_range,
|
|
68
|
+
strict_shape=strict_shape,
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class Booster(Model):
|
|
73
|
+
pass
|
|
74
|
+
|
|
75
|
+
|
|
24
76
|
if not xgboost:
|
|
25
77
|
XGBScikitLearnBase = None
|
|
26
78
|
else:
|
|
@@ -40,7 +92,9 @@ else:
|
|
|
40
92
|
**kw,
|
|
41
93
|
):
|
|
42
94
|
"""
|
|
43
|
-
Fit the regressor.
|
|
95
|
+
Fit the regressor. Note that fit() is an eager-execution
|
|
96
|
+
API. The call will be blocked until training finished.
|
|
97
|
+
|
|
44
98
|
Parameters
|
|
45
99
|
----------
|
|
46
100
|
X : array_like
|
|
@@ -72,6 +126,37 @@ else:
|
|
|
72
126
|
"""
|
|
73
127
|
raise NotImplementedError
|
|
74
128
|
|
|
129
|
+
def evals_result(self, **kw) -> Dict:
|
|
130
|
+
"""Return the evaluation results.
|
|
131
|
+
|
|
132
|
+
If **eval_set** is passed to the :py:meth:`fit` function, you can call
|
|
133
|
+
``evals_result()`` to get evaluation results for all passed **eval_sets**. When
|
|
134
|
+
**eval_metric** is also passed to the :py:meth:`fit` function, the
|
|
135
|
+
**evals_result** will contain the **eval_metrics** passed to the :py:meth:`fit`
|
|
136
|
+
function.
|
|
137
|
+
|
|
138
|
+
The returned evaluation result is a dictionary:
|
|
139
|
+
|
|
140
|
+
.. code-block:: python
|
|
141
|
+
|
|
142
|
+
{'validation_0': {'logloss': ['0.604835', '0.531479']},
|
|
143
|
+
'validation_1': {'logloss': ['0.41965', '0.17686']}}
|
|
144
|
+
|
|
145
|
+
Note that evals_result() will be blocked until the train is finished.
|
|
146
|
+
|
|
147
|
+
Returns
|
|
148
|
+
-------
|
|
149
|
+
evals_result
|
|
150
|
+
|
|
151
|
+
"""
|
|
152
|
+
result = super().evals_result()
|
|
153
|
+
if not self._Booster.op.has_evals_result or len(result) != 0:
|
|
154
|
+
return result
|
|
155
|
+
session = kw.pop("session", None)
|
|
156
|
+
run_kwargs = kw.pop("run_kwargs", dict())
|
|
157
|
+
self._Booster.execute(session=session, **run_kwargs)
|
|
158
|
+
return super().evals_result()
|
|
159
|
+
|
|
75
160
|
def wrap_evaluation_matrices(
|
|
76
161
|
missing: float,
|
|
77
162
|
X: Any,
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
from .... import opcodes
|
|
16
|
+
from .... import opcodes
|
|
17
17
|
from ....core.entity.output_types import get_output_types
|
|
18
18
|
from ....core.operator.base import Operator
|
|
19
19
|
from ....core.operator.core import TileableOperatorMixin
|
|
@@ -27,7 +27,7 @@ from ...utils import convert_to_tensor_or_dataframe
|
|
|
27
27
|
|
|
28
28
|
|
|
29
29
|
class ToDMatrix(Operator, TileableOperatorMixin):
|
|
30
|
-
_op_type_ =
|
|
30
|
+
_op_type_ = opcodes.TO_DMATRIX
|
|
31
31
|
|
|
32
32
|
data = KeyField("data", default=None)
|
|
33
33
|
label = KeyField("label", default=None)
|
|
@@ -99,10 +99,7 @@ def check_array_like(y: TileableType, name: str) -> TileableType:
|
|
|
99
99
|
y = convert_to_tensor_or_dataframe(y)
|
|
100
100
|
if isinstance(y, DATAFRAME_TYPE):
|
|
101
101
|
y = y.iloc[:, 0]
|
|
102
|
-
|
|
103
|
-
if y.ndim != 1:
|
|
104
|
-
raise ValueError(f"Expecting 1-d {name}, got: {y.ndim}-d")
|
|
105
|
-
return y
|
|
102
|
+
return astensor(y)
|
|
106
103
|
|
|
107
104
|
|
|
108
105
|
def to_dmatrix(
|
|
@@ -12,29 +12,32 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
import pickle
|
|
16
15
|
|
|
17
16
|
import numpy as np
|
|
18
17
|
import pandas as pd
|
|
19
18
|
|
|
20
|
-
from .... import opcodes
|
|
19
|
+
from .... import opcodes
|
|
21
20
|
from ....core.entity.output_types import OutputType
|
|
22
21
|
from ....core.operator.base import Operator
|
|
23
22
|
from ....core.operator.core import TileableOperatorMixin
|
|
24
23
|
from ....dataframe.utils import parse_index
|
|
25
|
-
from ....serialization.serializables import
|
|
24
|
+
from ....serialization.serializables import (
|
|
25
|
+
BoolField,
|
|
26
|
+
KeyField,
|
|
27
|
+
ReferenceField,
|
|
28
|
+
TupleField,
|
|
29
|
+
)
|
|
26
30
|
from ....tensor.core import TENSOR_TYPE, TensorOrder
|
|
31
|
+
from .core import BoosterData
|
|
27
32
|
from .dmatrix import check_data
|
|
28
33
|
|
|
29
34
|
|
|
30
35
|
class XGBPredict(Operator, TileableOperatorMixin):
|
|
31
|
-
_op_type_ =
|
|
36
|
+
_op_type_ = opcodes.XGBOOST_PREDICT
|
|
32
37
|
output_dtype = np.dtype(np.float32)
|
|
33
38
|
|
|
34
39
|
data = KeyField("data", default=None)
|
|
35
|
-
model =
|
|
36
|
-
"model", on_serialize=pickle.dumps, on_deserialize=pickle.loads, default=None
|
|
37
|
-
)
|
|
40
|
+
model = ReferenceField("model", reference_type=BoosterData, default=None)
|
|
38
41
|
pred_leaf = BoolField("pred_leaf", default=False)
|
|
39
42
|
pred_contribs = BoolField("pred_contribs", default=False)
|
|
40
43
|
approx_contribs = BoolField("approx_contribs", default=False)
|
|
@@ -107,6 +110,17 @@ def predict(
|
|
|
107
110
|
strict_shape=False,
|
|
108
111
|
flag=False,
|
|
109
112
|
):
|
|
113
|
+
"""
|
|
114
|
+
Using MaxFrame XGBoost model to predict data.
|
|
115
|
+
|
|
116
|
+
Parameters
|
|
117
|
+
----------
|
|
118
|
+
Parameters are the same as `xgboost.train`. The predict() is lazy-execution mode.
|
|
119
|
+
|
|
120
|
+
Returns
|
|
121
|
+
-------
|
|
122
|
+
results: Booster
|
|
123
|
+
"""
|
|
110
124
|
data = check_data(data)
|
|
111
125
|
# TODO: check model datatype
|
|
112
126
|
|
|
@@ -41,11 +41,6 @@ else:
|
|
|
41
41
|
):
|
|
42
42
|
session = kw.pop("session", None)
|
|
43
43
|
run_kwargs = kw.pop("run_kwargs", dict())
|
|
44
|
-
if kw:
|
|
45
|
-
raise TypeError(
|
|
46
|
-
f"fit got an unexpected keyword argument '{next(iter(kw))}'"
|
|
47
|
-
)
|
|
48
|
-
|
|
49
44
|
dtrain, evals = wrap_evaluation_matrices(
|
|
50
45
|
None,
|
|
51
46
|
X,
|
|
@@ -57,6 +52,8 @@ else:
|
|
|
57
52
|
base_margin_eval_set,
|
|
58
53
|
)
|
|
59
54
|
params = self.get_xgb_params()
|
|
55
|
+
if not params.get("objective"):
|
|
56
|
+
params["objective"] = "reg:squarederror"
|
|
60
57
|
self.evals_result_ = dict()
|
|
61
58
|
result = train(
|
|
62
59
|
params,
|
|
@@ -71,8 +68,4 @@ else:
|
|
|
71
68
|
return self
|
|
72
69
|
|
|
73
70
|
def predict(self, data, **kw):
|
|
74
|
-
|
|
75
|
-
run_kwargs = kw.pop("run_kwargs", None)
|
|
76
|
-
return predict(
|
|
77
|
-
self.get_booster(), data, session=session, run_kwargs=run_kwargs, **kw
|
|
78
|
-
)
|
|
71
|
+
return predict(self.get_booster(), data, **kw)
|