maxframe-1.0.0rc1-cp38-cp38-macosx_10_9_universal2.whl → maxframe-1.0.0rc3-cp38-cp38-macosx_10_9_universal2.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release.

Files changed (138)
  1. maxframe/_utils.cpython-38-darwin.so +0 -0
  2. maxframe/codegen.py +3 -6
  3. maxframe/config/config.py +49 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +15 -2
  6. maxframe/core/__init__.py +2 -13
  7. maxframe/core/entity/__init__.py +0 -4
  8. maxframe/core/entity/objects.py +46 -3
  9. maxframe/core/entity/output_types.py +0 -3
  10. maxframe/core/entity/tests/test_objects.py +43 -0
  11. maxframe/core/entity/tileables.py +5 -78
  12. maxframe/core/graph/__init__.py +2 -2
  13. maxframe/core/graph/builder/__init__.py +0 -1
  14. maxframe/core/graph/builder/base.py +5 -4
  15. maxframe/core/graph/builder/tileable.py +4 -4
  16. maxframe/core/graph/builder/utils.py +4 -8
  17. maxframe/core/graph/core.cpython-38-darwin.so +0 -0
  18. maxframe/core/graph/entity.py +9 -33
  19. maxframe/core/operator/__init__.py +2 -9
  20. maxframe/core/operator/base.py +3 -5
  21. maxframe/core/operator/objects.py +0 -9
  22. maxframe/core/operator/utils.py +55 -0
  23. maxframe/dataframe/__init__.py +1 -1
  24. maxframe/dataframe/arithmetic/around.py +5 -17
  25. maxframe/dataframe/arithmetic/core.py +15 -7
  26. maxframe/dataframe/arithmetic/docstring.py +5 -55
  27. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
  28. maxframe/dataframe/core.py +5 -5
  29. maxframe/dataframe/datasource/date_range.py +2 -2
  30. maxframe/dataframe/datasource/read_odps_query.py +7 -1
  31. maxframe/dataframe/datasource/read_odps_table.py +3 -2
  32. maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
  33. maxframe/dataframe/datastore/to_odps.py +1 -1
  34. maxframe/dataframe/groupby/cum.py +0 -1
  35. maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
  36. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  37. maxframe/dataframe/indexing/rename.py +3 -37
  38. maxframe/dataframe/indexing/sample.py +0 -1
  39. maxframe/dataframe/indexing/set_index.py +68 -1
  40. maxframe/dataframe/merge/merge.py +236 -2
  41. maxframe/dataframe/merge/tests/test_merge.py +123 -0
  42. maxframe/dataframe/misc/apply.py +3 -10
  43. maxframe/dataframe/misc/case_when.py +1 -1
  44. maxframe/dataframe/misc/describe.py +2 -2
  45. maxframe/dataframe/misc/drop_duplicates.py +4 -25
  46. maxframe/dataframe/misc/eval.py +4 -0
  47. maxframe/dataframe/misc/pct_change.py +1 -83
  48. maxframe/dataframe/misc/transform.py +1 -30
  49. maxframe/dataframe/misc/value_counts.py +4 -17
  50. maxframe/dataframe/missing/dropna.py +1 -1
  51. maxframe/dataframe/missing/fillna.py +5 -5
  52. maxframe/dataframe/operators.py +1 -17
  53. maxframe/dataframe/reduction/core.py +2 -2
  54. maxframe/dataframe/sort/sort_values.py +1 -11
  55. maxframe/dataframe/statistics/quantile.py +5 -17
  56. maxframe/dataframe/utils.py +4 -7
  57. maxframe/io/objects/__init__.py +24 -0
  58. maxframe/io/objects/core.py +140 -0
  59. maxframe/io/objects/tensor.py +76 -0
  60. maxframe/io/objects/tests/__init__.py +13 -0
  61. maxframe/io/objects/tests/test_object_io.py +97 -0
  62. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  63. maxframe/{odpsio → io/odpsio}/arrow.py +12 -8
  64. maxframe/{odpsio → io/odpsio}/schema.py +15 -12
  65. maxframe/io/odpsio/tableio.py +702 -0
  66. maxframe/io/odpsio/tests/__init__.py +13 -0
  67. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +19 -18
  68. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  69. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  70. maxframe/io/odpsio/volumeio.py +57 -0
  71. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  72. maxframe/learn/contrib/xgboost/core.py +87 -2
  73. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  74. maxframe/learn/contrib/xgboost/predict.py +21 -7
  75. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  76. maxframe/learn/contrib/xgboost/train.py +27 -17
  77. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  78. maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
  79. maxframe/protocol.py +41 -17
  80. maxframe/remote/core.py +4 -8
  81. maxframe/serialization/__init__.py +1 -0
  82. maxframe/serialization/core.cpython-38-darwin.so +0 -0
  83. maxframe/serialization/serializables/core.py +48 -9
  84. maxframe/tensor/__init__.py +69 -2
  85. maxframe/tensor/arithmetic/isclose.py +1 -0
  86. maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
  87. maxframe/tensor/core.py +5 -136
  88. maxframe/tensor/datasource/array.py +3 -0
  89. maxframe/tensor/datasource/full.py +1 -1
  90. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  91. maxframe/tensor/indexing/flatnonzero.py +1 -1
  92. maxframe/tensor/merge/__init__.py +2 -0
  93. maxframe/tensor/merge/concatenate.py +98 -0
  94. maxframe/tensor/merge/tests/test_merge.py +30 -1
  95. maxframe/tensor/merge/vstack.py +70 -0
  96. maxframe/tensor/{base → misc}/__init__.py +2 -0
  97. maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
  98. maxframe/tensor/misc/atleast_2d.py +70 -0
  99. maxframe/tensor/misc/atleast_3d.py +85 -0
  100. maxframe/tensor/misc/tests/__init__.py +13 -0
  101. maxframe/tensor/{base → misc}/transpose.py +22 -18
  102. maxframe/tensor/{base → misc}/unique.py +2 -2
  103. maxframe/tensor/operators.py +1 -7
  104. maxframe/tensor/random/core.py +1 -1
  105. maxframe/tensor/reduction/count_nonzero.py +1 -0
  106. maxframe/tensor/reduction/mean.py +1 -0
  107. maxframe/tensor/reduction/nanmean.py +1 -0
  108. maxframe/tensor/reduction/nanvar.py +2 -0
  109. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  110. maxframe/tensor/reduction/var.py +2 -0
  111. maxframe/tensor/statistics/quantile.py +2 -2
  112. maxframe/tensor/utils.py +2 -22
  113. maxframe/tests/utils.py +11 -2
  114. maxframe/typing_.py +4 -1
  115. maxframe/udf.py +8 -9
  116. maxframe/utils.py +32 -70
  117. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/METADATA +25 -25
  118. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +133 -123
  119. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +1 -1
  120. maxframe_client/fetcher.py +60 -68
  121. maxframe_client/session/graph.py +8 -2
  122. maxframe_client/session/odps.py +58 -22
  123. maxframe_client/tests/test_fetcher.py +21 -3
  124. maxframe_client/tests/test_session.py +27 -4
  125. maxframe/core/entity/chunks.py +0 -68
  126. maxframe/core/entity/fuse.py +0 -73
  127. maxframe/core/graph/builder/chunk.py +0 -430
  128. maxframe/odpsio/tableio.py +0 -322
  129. maxframe/odpsio/volumeio.py +0 -95
  130. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  131. /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
  132. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  133. /maxframe/tensor/{base → misc}/astype.py +0 -0
  134. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  135. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  136. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  137. /maxframe/tensor/{base → misc}/where.py +0 -0
  138. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0
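
Most of the renames above move `maxframe/odpsio` under a new `maxframe/io` package, and `maxframe/tensor/base` becomes `maxframe/tensor/misc`. For code importing these modules by full path, the import moves with the file; a hedged sketch (the `pandas_to_odps_schema` helper is taken from the test diff below, and public re-exports are not confirmed by this diff):

    # against 1.0.0rc1
    from maxframe.odpsio.schema import pandas_to_odps_schema

    # against 1.0.0rc3
    from maxframe.io.odpsio.schema import pandas_to_odps_schema

The same move explains the mechanical `...` → `....` relative-import changes throughout the test diffs that follow.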
maxframe/{odpsio → io/odpsio}/tests/test_schema.py

@@ -18,9 +18,9 @@ import pyarrow as pa
 import pytest
 from odps import types as odps_types
 
-from ... import dataframe as md
-from ... import tensor as mt
-from ...core import OutputType
+from .... import dataframe as md
+from .... import tensor as mt
+from ....core import OutputType
 from ..schema import (
     arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
@@ -143,17 +143,17 @@ def test_pandas_to_odps_schema_index(wrap_obj):
     data = pd.Index(np.random.randint(0, 100, 100))
 
     test_idx = _wrap_maxframe_obj(data, wrap=wrap_obj)
-    schema, meta = pandas_to_odps_schema(test_idx, unknown_as_string=True)
-    assert [c.name for c in schema.columns] == ["_idx_0"]
-    assert [c.type.name for c in schema.columns] == ["bigint"]
-    assert meta.type == OutputType.index
-    assert meta.table_column_names == []
-    assert meta.table_index_column_names == ["_idx_0"]
-    assert meta.pd_column_level_names == []
-    assert meta.pd_index_level_names == [None]
-
-    with pytest.raises(AssertionError):
-        pandas_to_odps_schema(test_idx, unknown_as_string=True, ignore_index=True)
+    for ignore_idx in (False, True):
+        schema, meta = pandas_to_odps_schema(
+            test_idx, unknown_as_string=True, ignore_index=ignore_idx
+        )
+        assert [c.name for c in schema.columns] == ["_idx_0"]
+        assert [c.type.name for c in schema.columns] == ["bigint"]
+        assert meta.type == OutputType.index
+        assert meta.table_column_names == []
+        assert meta.table_index_column_names == ["_idx_0"]
+        assert meta.pd_column_level_names == []
+        assert meta.pd_index_level_names == [None]
 
     data = pd.MultiIndex.from_arrays(
         [np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
@@ -177,6 +177,7 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
     test_scalar = _wrap_maxframe_obj(data, wrap=wrap_obj)
     if wrap_obj != "no":
         test_scalar.op.data = None
+
     schema, meta = pandas_to_odps_schema(test_scalar, unknown_as_string=True)
     assert schema.columns[0].name == "_idx_0"
     assert schema.columns[0].type.name == "double"
@@ -186,9 +187,6 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
     assert meta.pd_column_level_names == []
     assert meta.pd_index_level_names == [None]
 
-    with pytest.raises(AssertionError):
-        pandas_to_odps_schema(test_scalar, unknown_as_string=True, ignore_index=True)
-
 
 def test_odps_arrow_schema_conversion():
     odps_schema = odps_types.OdpsSchema(
@@ -211,10 +209,11 @@ def test_odps_arrow_schema_conversion():
            odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
            odps_types.Column("col17", "CHAR(15)"),
            odps_types.Column("col18", "VARCHAR(15)"),
+           odps_types.Column("col19", "decimal"),
        ]
    )
    arrow_schema = odps_schema_to_arrow_schema(odps_schema)
-   assert arrow_schema.names == [f"col{i}" for i in range(1, 19)]
+   assert arrow_schema.names == [f"col{i}" for i in range(1, 20)]
    assert arrow_schema.types == [
        pa.string(),
        pa.binary(),
@@ -234,6 +233,7 @@ def test_odps_arrow_schema_conversion():
        pa.struct([("a1", pa.string()), ("a2", pa.map_(pa.string(), pa.int64()))]),
        pa.string(),
        pa.string(),
+       pa.decimal128(38, 18),
    ]
 
    expected_odps_schema = odps_types.OdpsSchema(
@@ -256,6 +256,7 @@ def test_odps_arrow_schema_conversion():
            odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
            odps_types.Column("col17", "string"),
            odps_types.Column("col18", "string"),
+           odps_types.Column("col19", "decimal(38, 18)"),
        ]
    )
 
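The new `col19` assertions pin down how a plain ODPS `decimal` column (no explicit precision) round-trips: it becomes Arrow `decimal128(38, 18)` and converts back as the explicit ODPS type `decimal(38, 18)`. The Arrow side of the mapping can be checked standalone, without an ODPS connection:

    import decimal

    import pyarrow as pa

    # ODPS "decimal" → Arrow decimal128 with ODPS's default precision and scale
    arrow_type = pa.decimal128(38, 18)
    arr = pa.array([decimal.Decimal("1.5")], type=arrow_type)
    assert arr.type.precision == 38 and arr.type.scale == 18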
maxframe/{odpsio → io/odpsio}/tests/test_tableio.py

@@ -12,22 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime
+
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pytest
 from odps import ODPS
 
-from ...tests.utils import flaky, tn
-from ...utils import config_odps_default_options
-from ..tableio import HaloTableIO
+from ....config import options
+from ....tests.utils import flaky, tn
+from ....utils import config_odps_default_options
+from ..tableio import ODPSTableIO
+
+
+@pytest.fixture
+def switch_table_io(request):
+    old_use_common_table = options.use_common_table
+    try:
+        options.use_common_table = request.param
+        yield
+    finally:
+        options.use_common_table = old_use_common_table
 
 
 @flaky(max_runs=3)
-def test_empty_table_io():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_empty_table_io(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-    halo_table_io = HaloTableIO(o)
+    table_io = ODPSTableIO(o)
 
     # test read from empty table
     empty_table_name = tn("test_empty_table_halo_read")
@@ -35,42 +50,53 @@ def test_empty_table_io():
     tb = o.create_table(empty_table_name, "col1 string", lifecycle=1)
 
     try:
-        with halo_table_io.open_reader(empty_table_name) as reader:
+        with table_io.open_reader(empty_table_name) as reader:
             assert len(reader.read_all()) == 0
     finally:
         tb.drop()
 
 
 @flaky(max_runs=3)
-def test_table_io_without_parts():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_without_parts(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-    halo_table_io = HaloTableIO(o)
+    table_io = ODPSTableIO(o)
 
     # test read and write tables without partition
     no_part_table_name = tn("test_no_part_halo_write")
     o.delete_table(no_part_table_name, if_exists=True)
-    tb = o.create_table(
-        no_part_table_name, ",".join(f"{c} double" for c in "abcde"), lifecycle=1
-    )
+    col_desc = ",".join(f"{c} double" for c in "abcde") + ", f datetime"
+    tb = o.create_table(no_part_table_name, col_desc, lifecycle=1)
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with halo_table_io.open_writer(no_part_table_name) as writer:
+        date_val = [
+            (
+                datetime.datetime.now().replace(microsecond=0)
+                + datetime.timedelta(seconds=i)
+            )
+            for i in range(100)
+        ]
+        pd_data["f"] = pd.Series(date_val, dtype="datetime64[ms]").dt.tz_localize(
+            options.local_timezone
+        )
+        with table_io.open_writer(no_part_table_name) as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-        with halo_table_io.open_reader(no_part_table_name) as reader:
+        with table_io.open_reader(no_part_table_name) as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
     finally:
         tb.drop()
 
 
 @flaky(max_runs=3)
-def test_table_io_with_range_reader():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_with_range_reader(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-    halo_table_io = HaloTableIO(o)
+    table_io = ODPSTableIO(o)
 
     # test read and write tables without partition
     no_part_table_name = tn("test_no_part_halo_write")
@@ -81,15 +107,15 @@ def test_table_io_with_range_reader():
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with halo_table_io.open_writer(no_part_table_name) as writer:
+        with table_io.open_writer(no_part_table_name) as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
 
-        with halo_table_io.open_reader(
+        with table_io.open_reader(
             no_part_table_name, start=None, stop=100, row_batch_size=10
         ) as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
 
-        with halo_table_io.open_reader(
+        with table_io.open_reader(
             no_part_table_name,
             start=-2,
             stop=-52,
@@ -105,11 +131,12 @@ def test_table_io_with_range_reader():
 
 
 @flaky(max_runs=3)
-def test_table_io_with_parts():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_with_parts(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-    halo_table_io = HaloTableIO(o)
+    table_io = ODPSTableIO(o)
 
     # test read and write tables with partition
     parted_table_name = tn("test_parted_halo_write")
@@ -122,11 +149,11 @@ def test_table_io_with_parts():
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with halo_table_io.open_writer(parted_table_name, "pt=test") as writer:
+        with table_io.open_writer(parted_table_name, "pt=test") as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-        with halo_table_io.open_reader(parted_table_name, "pt=test") as reader:
+        with table_io.open_reader(parted_table_name, "pt=test") as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
-        with halo_table_io.open_reader(
+        with table_io.open_reader(
             parted_table_name, "pt=test", partition_columns=True
         ) as reader:
             expected_data = pd_data.copy()
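
Every test in this module now runs twice via the `switch_table_io` fixture, once per value of `options.use_common_table`, so both table I/O backends are covered by the same test bodies. The mechanism is ordinary pytest indirect parametrization; a self-contained sketch with a stand-in for `maxframe.config.options`:

    import pytest

    class _Options:  # stand-in for maxframe.config.options
        use_common_table = False

    options = _Options()

    @pytest.fixture
    def switch_table_io(request):
        old_value = options.use_common_table
        try:
            options.use_common_table = request.param  # injected by parametrize
            yield
        finally:
            options.use_common_table = old_value  # restored even on failure

    @pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
    def test_runs_once_per_backend(switch_table_io):
        assert isinstance(options.use_common_table, bool)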
maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py

@@ -15,7 +15,7 @@
 import pytest
 from odps import ODPS
 
-from ...tests.utils import tn
+from ....tests.utils import tn
 from ..volumeio import ODPSVolumeReader, ODPSVolumeWriter
 
 
@@ -69,19 +69,17 @@ def create_volume(request, oss_config):
        oss_config.oss_bucket.batch_delete_objects(keys)
 
 
-@pytest.mark.parametrize("create_volume", ["parted", "external"], indirect=True)
+@pytest.mark.parametrize("create_volume", ["external"], indirect=True)
 def test_read_write_volume(create_volume):
     test_vol_dir = "test_vol_dir"
 
     odps_entry = ODPS.from_environments()
 
     writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
-    write_session_id = writer.create_write_session()
 
     writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
-    writer.write_file("file1", b"content1", write_session_id)
-    writer.write_file("file2", b"content2", write_session_id)
-    writer.commit(["file1", "file2"], write_session_id)
+    writer.write_file("file1", b"content1")
+    writer.write_file("file2", b"content2")
 
     reader = ODPSVolumeReader(odps_entry, create_volume, test_vol_dir)
     assert reader.read_file("file1") == b"content1"
maxframe/io/odpsio/volumeio.py (new file)

@@ -0,0 +1,57 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Iterator, List, Union
+
+from odps import ODPS
+
+
+class ODPSVolumeReader:
+    def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
+        self._odps_entry = odps_entry
+        self._volume = odps_entry.get_volume(volume_name)
+        self._volume_dir = volume_dir
+
+    def list_files(self) -> List[str]:
+        def _get_file_name(vol_file):
+            if hasattr(vol_file, "name"):
+                return vol_file.name
+            return vol_file.path.rsplit("/", 1)[-1]
+
+        return [
+            _get_file_name(f)
+            for f in self._odps_entry.list_volume_files(
+                f"/{self._volume.name}/{self._volume_dir}"
+            )
+        ]
+
+    def read_file(self, file_name: str) -> bytes:
+        with self._volume.open_reader(self._volume_dir + "/" + file_name) as reader:
+            return reader.read()
+
+
+class ODPSVolumeWriter:
+    def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
+        self._odps_entry = odps_entry
+        self._volume = odps_entry.get_volume(volume_name)
+        self._volume_dir = volume_dir
+
+    def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
+        with self._volume.open_writer(self._volume_dir + "/" + file_name) as writer:
+            if not inspect.isgenerator(data):
+                writer.write(data)
+            else:
+                for chunk in data:
+                    writer.write(chunk)
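
A hedged usage sketch of the new writer/reader pair; the volume and directory names are placeholders and, as in the tests above, ODPS credentials come from the environment. Note that `write_file` dispatches on `inspect.isgenerator`, so chunked input must be a real generator, not just any iterable:

    from odps import ODPS

    from maxframe.io.odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter

    o = ODPS.from_environments()

    writer = ODPSVolumeWriter(o, "my_volume", "my_dir")  # placeholder names
    writer.write_file("file1", b"content1")
    writer.write_file("file2", (chunk for chunk in (b"chu", b"nk2")))  # generator

    reader = ODPSVolumeReader(o, "my_volume", "my_dir")
    assert reader.read_file("file1") == b"content1"
    print(reader.list_files())

Compared with the 1.0.0rc1 implementation, the write-session round trip (`create_write_session()`, `commit()`) is gone, which is exactly what the test simplification above reflects.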
maxframe/learn/contrib/xgboost/classifier.py

@@ -14,7 +14,7 @@
 
 import numpy as np
 
-from ....tensor import argmax
+from ....tensor import argmax, transpose, vstack
 from ..utils import make_import_error_func
 from .core import XGBScikitLearnBase, xgboost
 
@@ -42,7 +42,10 @@ else:
            sample_weight_eval_set=None,
            base_margin_eval_set=None,
            num_class=None,
+           **kw,
        ):
+           session = kw.pop("session", None)
+           run_kwargs = kw.pop("run_kwargs", dict())
            dtrain, evals = wrap_evaluation_matrices(
                None,
                X,
@@ -68,6 +71,8 @@ else:
                evals=evals,
                evals_result=self.evals_result_,
                num_class=num_class,
+               session=session,
+               run_kwargs=run_kwargs,
            )
            self._Booster = result
            return self
@@ -83,4 +88,23 @@ else:
        def predict_proba(self, data, ntree_limit=None, flag=False, **kw):
            if ntree_limit is not None:
                raise NotImplementedError("ntree_limit is not currently supported")
-           return predict(self.get_booster(), data, flag=flag, **kw)
+           prediction = predict(self.get_booster(), data, flag=flag, **kw)
+
+           if len(prediction.shape) == 2 and prediction.shape[1] == self.n_classes_:
+               # multi-class
+               return prediction
+           if (
+               len(prediction.shape) == 2
+               and self.n_classes_ == 2
+               and prediction.shape[1] >= self.n_classes_
+           ):
+               # multi-label
+               return prediction
+           # binary logistic function
+           classone_probs = prediction
+           classzero_probs = 1.0 - classone_probs
+           return transpose(vstack((classzero_probs, classone_probs)))
+
+       @property
+       def classes_(self) -> np.ndarray:
+           return np.arange(self.n_classes_)
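
The binary branch at the end of `predict_proba` assembles scikit-learn's `(n_samples, 2)` probability layout from the single positive-class column that XGBoost emits for binary logistic objectives. The shape arithmetic, verified in plain NumPy (maxframe's `vstack`/`transpose` follow NumPy semantics):

    import numpy as np

    classone_probs = np.array([0.9, 0.2, 0.6])  # P(class 1) per sample
    classzero_probs = 1.0 - classone_probs      # P(class 0) per sample
    proba = np.transpose(np.vstack((classzero_probs, classone_probs)))
    assert proba.shape == (3, 2)                # (n_samples, n_classes)
    assert np.allclose(proba.sum(axis=1), 1.0)  # each row is a distribution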
maxframe/learn/contrib/xgboost/core.py

@@ -12,15 +12,67 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Callable, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 try:
     import xgboost
 except ImportError:
     xgboost = None
 
+from ...core import Model, ModelData
 from .dmatrix import DMatrix
 
+
+class BoosterData(ModelData):
+    __slots__ = ("_evals_result",)
+
+    _evals_result: Dict
+
+    def __init__(self, *args, evals_result=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._evals_result = evals_result if evals_result is not None else dict()
+
+    def execute(self, session=None, **kw):
+        # The evals_result should be fetched when BoosterData.execute() is called.
+        result = super().execute(session=session, **kw)
+        if self.op.has_evals_result and self.key == self.op.outputs[0].key:
+            self._evals_result.update(self.op.outputs[1].fetch(session=session))
+        return result
+
+    def predict(
+        self,
+        data,
+        output_margin=False,
+        pred_leaf=False,
+        pred_contribs=False,
+        approx_contribs=False,
+        pred_interactions=False,
+        validate_features=True,
+        training=False,
+        iteration_range=None,
+        strict_shape=False,
+    ):
+        from .predict import predict
+
+        return predict(
+            self,
+            data,
+            output_margin=output_margin,
+            pred_leaf=pred_leaf,
+            pred_contribs=pred_contribs,
+            approx_contribs=approx_contribs,
+            pred_interactions=pred_interactions,
+            validate_features=validate_features,
+            training=training,
+            iteration_range=iteration_range,
+            strict_shape=strict_shape,
+        )
+
+
+class Booster(Model):
+    pass
+
+
 if not xgboost:
     XGBScikitLearnBase = None
 else:
@@ -40,7 +92,9 @@ else:
            **kw,
        ):
            """
-           Fit the regressor.
+           Fit the regressor. Note that fit() is an eager-execution
+           API. The call will be blocked until training finished.
+
            Parameters
            ----------
            X : array_like
@@ -72,6 +126,37 @@ else:
            """
            raise NotImplementedError
 
+       def evals_result(self, **kw) -> Dict:
+           """Return the evaluation results.
+
+           If **eval_set** is passed to the :py:meth:`fit` function, you can call
+           ``evals_result()`` to get evaluation results for all passed **eval_sets**. When
+           **eval_metric** is also passed to the :py:meth:`fit` function, the
+           **evals_result** will contain the **eval_metrics** passed to the :py:meth:`fit`
+           function.
+
+           The returned evaluation result is a dictionary:
+
+           .. code-block:: python
+
+               {'validation_0': {'logloss': ['0.604835', '0.531479']},
+                'validation_1': {'logloss': ['0.41965', '0.17686']}}
+
+           Note that evals_result() will be blocked until the train is finished.
+
+           Returns
+           -------
+           evals_result
+
+           """
+           result = super().evals_result()
+           if not self._Booster.op.has_evals_result or len(result) != 0:
+               return result
+           session = kw.pop("session", None)
+           run_kwargs = kw.pop("run_kwargs", dict())
+           self._Booster.execute(session=session, **run_kwargs)
+           return super().evals_result()
+
    def wrap_evaluation_matrices(
        missing: float,
        X: Any,
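
Taken together with the classifier changes, training stays eager while evaluation metrics are fetched on demand: `evals_result()` triggers one `Booster.execute()` if results exist but have not been fetched yet. A hedged usage sketch; the import path and `XGBClassifier` name are inferred from the file layout (not confirmed by this diff), a configured MaxCompute session is assumed, and the train/validation splits are prepared elsewhere:

    from maxframe.learn.contrib.xgboost import XGBClassifier  # assumed path

    clf = XGBClassifier(n_estimators=10)
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])  # blocks until trained
    print(clf.evals_result())  # e.g. {"validation_0": {"logloss": [...]}}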
maxframe/learn/contrib/xgboost/dmatrix.py

@@ -13,7 +13,7 @@
 # limitations under the License.
 
 
-from .... import opcodes as OperandDef
+from .... import opcodes
 from ....core.entity.output_types import get_output_types
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
@@ -27,7 +27,7 @@ from ...utils import convert_to_tensor_or_dataframe
 
 
 class ToDMatrix(Operator, TileableOperatorMixin):
-    _op_type_ = OperandDef.TO_DMATRIX
+    _op_type_ = opcodes.TO_DMATRIX
 
     data = KeyField("data", default=None)
     label = KeyField("label", default=None)
@@ -99,10 +99,7 @@ def check_array_like(y: TileableType, name: str) -> TileableType:
     y = convert_to_tensor_or_dataframe(y)
     if isinstance(y, DATAFRAME_TYPE):
         y = y.iloc[:, 0]
-    y = astensor(y)
-    if y.ndim != 1:
-        raise ValueError(f"Expecting 1-d {name}, got: {y.ndim}-d")
-    return y
+    return astensor(y)
 
 
 def to_dmatrix(
maxframe/learn/contrib/xgboost/predict.py

@@ -12,29 +12,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pickle
 
 import numpy as np
 import pandas as pd
 
-from .... import opcodes as OperandDef
+from .... import opcodes
 from ....core.entity.output_types import OutputType
 from ....core.operator.base import Operator
 from ....core.operator.core import TileableOperatorMixin
 from ....dataframe.utils import parse_index
-from ....serialization.serializables import BoolField, BytesField, KeyField, TupleField
+from ....serialization.serializables import (
+    BoolField,
+    KeyField,
+    ReferenceField,
+    TupleField,
+)
 from ....tensor.core import TENSOR_TYPE, TensorOrder
+from .core import BoosterData
 from .dmatrix import check_data
 
 
 class XGBPredict(Operator, TileableOperatorMixin):
-    _op_type_ = OperandDef.XGBOOST_PREDICT
+    _op_type_ = opcodes.XGBOOST_PREDICT
     output_dtype = np.dtype(np.float32)
 
     data = KeyField("data", default=None)
-    model = BytesField(
-        "model", on_serialize=pickle.dumps, on_deserialize=pickle.loads, default=None
-    )
+    model = ReferenceField("model", reference_type=BoosterData, default=None)
     pred_leaf = BoolField("pred_leaf", default=False)
     pred_contribs = BoolField("pred_contribs", default=False)
     approx_contribs = BoolField("approx_contribs", default=False)
@@ -107,6 +110,17 @@ def predict(
     strict_shape=False,
     flag=False,
 ):
+    """
+    Using MaxFrame XGBoost model to predict data.
+
+    Parameters
+    ----------
+    Parameters are the same as `xgboost.train`. The predict() is lazy-execution mode.
+
+    Returns
+    -------
+    results: Booster
+    """
     data = check_data(data)
     # TODO: check model datatype
maxframe/learn/contrib/xgboost/regressor.py

@@ -41,11 +41,6 @@ else:
        ):
            session = kw.pop("session", None)
            run_kwargs = kw.pop("run_kwargs", dict())
-           if kw:
-               raise TypeError(
-                   f"fit got an unexpected keyword argument '{next(iter(kw))}'"
-               )
-
            dtrain, evals = wrap_evaluation_matrices(
                None,
                X,
@@ -57,6 +52,8 @@ else:
                base_margin_eval_set,
            )
            params = self.get_xgb_params()
+           if not params.get("objective"):
+               params["objective"] = "reg:squarederror"
            self.evals_result_ = dict()
            result = train(
                params,
@@ -71,8 +68,4 @@ else:
            return self
 
        def predict(self, data, **kw):
-           session = kw.pop("session", None)
-           run_kwargs = kw.pop("run_kwargs", None)
-           return predict(
-               self.get_booster(), data, session=session, run_kwargs=run_kwargs, **kw
-           )
+           return predict(self.get_booster(), data, **kw)
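
Two behavior changes here: `fit()` no longer raises `TypeError` on unexpected keyword arguments, and `predict()` now forwards all of its keyword arguments (including `session` and `run_kwargs`) straight to the module-level `predict()`. The regressor also fills in XGBoost's regression default when no objective is set; that guard in isolation, runnable anywhere:

    def apply_default_objective(params: dict) -> dict:
        # mirrors the check added in fit(): fill only when missing or empty
        if not params.get("objective"):
            params["objective"] = "reg:squarederror"
        return params

    assert apply_default_objective({})["objective"] == "reg:squarederror"
    assert apply_default_objective({"objective": "reg:logistic"})["objective"] == "reg:logistic"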