maxframe-1.0.0rc1-cp39-cp39-win32.whl → maxframe-1.0.0rc3-cp39-cp39-win32.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release.

This version of maxframe might be problematic.

Files changed (138)
  1. maxframe/_utils.cp39-win32.pyd +0 -0
  2. maxframe/codegen.py +3 -6
  3. maxframe/config/config.py +49 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +15 -2
  6. maxframe/core/__init__.py +2 -13
  7. maxframe/core/entity/__init__.py +0 -4
  8. maxframe/core/entity/objects.py +46 -3
  9. maxframe/core/entity/output_types.py +0 -3
  10. maxframe/core/entity/tests/test_objects.py +43 -0
  11. maxframe/core/entity/tileables.py +5 -78
  12. maxframe/core/graph/__init__.py +2 -2
  13. maxframe/core/graph/builder/__init__.py +0 -1
  14. maxframe/core/graph/builder/base.py +5 -4
  15. maxframe/core/graph/builder/tileable.py +4 -4
  16. maxframe/core/graph/builder/utils.py +4 -8
  17. maxframe/core/graph/core.cp39-win32.pyd +0 -0
  18. maxframe/core/graph/entity.py +9 -33
  19. maxframe/core/operator/__init__.py +2 -9
  20. maxframe/core/operator/base.py +3 -5
  21. maxframe/core/operator/objects.py +0 -9
  22. maxframe/core/operator/utils.py +55 -0
  23. maxframe/dataframe/__init__.py +1 -1
  24. maxframe/dataframe/arithmetic/around.py +5 -17
  25. maxframe/dataframe/arithmetic/core.py +15 -7
  26. maxframe/dataframe/arithmetic/docstring.py +5 -55
  27. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
  28. maxframe/dataframe/core.py +5 -5
  29. maxframe/dataframe/datasource/date_range.py +2 -2
  30. maxframe/dataframe/datasource/read_odps_query.py +7 -1
  31. maxframe/dataframe/datasource/read_odps_table.py +3 -2
  32. maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
  33. maxframe/dataframe/datastore/to_odps.py +1 -1
  34. maxframe/dataframe/groupby/cum.py +0 -1
  35. maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
  36. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  37. maxframe/dataframe/indexing/rename.py +3 -37
  38. maxframe/dataframe/indexing/sample.py +0 -1
  39. maxframe/dataframe/indexing/set_index.py +68 -1
  40. maxframe/dataframe/merge/merge.py +236 -2
  41. maxframe/dataframe/merge/tests/test_merge.py +123 -0
  42. maxframe/dataframe/misc/apply.py +3 -10
  43. maxframe/dataframe/misc/case_when.py +1 -1
  44. maxframe/dataframe/misc/describe.py +2 -2
  45. maxframe/dataframe/misc/drop_duplicates.py +4 -25
  46. maxframe/dataframe/misc/eval.py +4 -0
  47. maxframe/dataframe/misc/pct_change.py +1 -83
  48. maxframe/dataframe/misc/transform.py +1 -30
  49. maxframe/dataframe/misc/value_counts.py +4 -17
  50. maxframe/dataframe/missing/dropna.py +1 -1
  51. maxframe/dataframe/missing/fillna.py +5 -5
  52. maxframe/dataframe/operators.py +1 -17
  53. maxframe/dataframe/reduction/core.py +2 -2
  54. maxframe/dataframe/sort/sort_values.py +1 -11
  55. maxframe/dataframe/statistics/quantile.py +5 -17
  56. maxframe/dataframe/utils.py +4 -7
  57. maxframe/io/objects/__init__.py +24 -0
  58. maxframe/io/objects/core.py +140 -0
  59. maxframe/io/objects/tensor.py +76 -0
  60. maxframe/io/objects/tests/__init__.py +13 -0
  61. maxframe/io/objects/tests/test_object_io.py +97 -0
  62. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  63. maxframe/{odpsio → io/odpsio}/arrow.py +12 -8
  64. maxframe/{odpsio → io/odpsio}/schema.py +15 -12
  65. maxframe/io/odpsio/tableio.py +702 -0
  66. maxframe/io/odpsio/tests/__init__.py +13 -0
  67. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +19 -18
  68. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  69. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  70. maxframe/io/odpsio/volumeio.py +57 -0
  71. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  72. maxframe/learn/contrib/xgboost/core.py +87 -2
  73. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  74. maxframe/learn/contrib/xgboost/predict.py +21 -7
  75. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  76. maxframe/learn/contrib/xgboost/train.py +27 -17
  77. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  78. maxframe/lib/mmh3.cp39-win32.pyd +0 -0
  79. maxframe/protocol.py +41 -17
  80. maxframe/remote/core.py +4 -8
  81. maxframe/serialization/__init__.py +1 -0
  82. maxframe/serialization/core.cp39-win32.pyd +0 -0
  83. maxframe/serialization/serializables/core.py +48 -9
  84. maxframe/tensor/__init__.py +69 -2
  85. maxframe/tensor/arithmetic/isclose.py +1 -0
  86. maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
  87. maxframe/tensor/core.py +5 -136
  88. maxframe/tensor/datasource/array.py +3 -0
  89. maxframe/tensor/datasource/full.py +1 -1
  90. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  91. maxframe/tensor/indexing/flatnonzero.py +1 -1
  92. maxframe/tensor/merge/__init__.py +2 -0
  93. maxframe/tensor/merge/concatenate.py +98 -0
  94. maxframe/tensor/merge/tests/test_merge.py +30 -1
  95. maxframe/tensor/merge/vstack.py +70 -0
  96. maxframe/tensor/{base → misc}/__init__.py +2 -0
  97. maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
  98. maxframe/tensor/misc/atleast_2d.py +70 -0
  99. maxframe/tensor/misc/atleast_3d.py +85 -0
  100. maxframe/tensor/misc/tests/__init__.py +13 -0
  101. maxframe/tensor/{base → misc}/transpose.py +22 -18
  102. maxframe/tensor/{base → misc}/unique.py +2 -2
  103. maxframe/tensor/operators.py +1 -7
  104. maxframe/tensor/random/core.py +1 -1
  105. maxframe/tensor/reduction/count_nonzero.py +1 -0
  106. maxframe/tensor/reduction/mean.py +1 -0
  107. maxframe/tensor/reduction/nanmean.py +1 -0
  108. maxframe/tensor/reduction/nanvar.py +2 -0
  109. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  110. maxframe/tensor/reduction/var.py +2 -0
  111. maxframe/tensor/statistics/quantile.py +2 -2
  112. maxframe/tensor/utils.py +2 -22
  113. maxframe/tests/utils.py +11 -2
  114. maxframe/typing_.py +4 -1
  115. maxframe/udf.py +8 -9
  116. maxframe/utils.py +32 -70
  117. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/METADATA +25 -25
  118. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +133 -123
  119. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +1 -1
  120. maxframe_client/fetcher.py +60 -68
  121. maxframe_client/session/graph.py +8 -2
  122. maxframe_client/session/odps.py +58 -22
  123. maxframe_client/tests/test_fetcher.py +21 -3
  124. maxframe_client/tests/test_session.py +27 -4
  125. maxframe/core/entity/chunks.py +0 -68
  126. maxframe/core/entity/fuse.py +0 -73
  127. maxframe/core/graph/builder/chunk.py +0 -430
  128. maxframe/odpsio/tableio.py +0 -322
  129. maxframe/odpsio/volumeio.py +0 -95
  130. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  131. /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
  132. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  133. /maxframe/tensor/{base → misc}/astype.py +0 -0
  134. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  135. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  136. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  137. /maxframe/tensor/{base → misc}/where.py +0 -0
  138. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0
@@ -1,322 +0,0 @@
1
- # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import os
16
- import time
17
- from abc import ABC, abstractmethod
18
- from contextlib import contextmanager
19
- from typing import List, Optional, Union
20
-
21
- import pyarrow as pa
22
- from odps import ODPS
23
- from odps.apis.storage_api import (
24
- StorageApiArrowClient,
25
- TableBatchScanResponse,
26
- TableBatchWriteResponse,
27
- )
28
- from odps.types import PartitionSpec
29
-
30
- from ..env import ODPS_STORAGE_API_ENDPOINT
31
- from .schema import odps_schema_to_arrow_schema
32
-
33
- PartitionsType = Union[List[str], str, None]
34
-
35
- _DEFAULT_ROW_BATCH_SIZE = 4096
36
-
37
-
38
- class MCTableIO(ABC):
39
- def __init__(self, odps: ODPS):
40
- self._odps = odps
41
-
42
- @abstractmethod
43
- def open_reader(
44
- self,
45
- full_table_name: str,
46
- partitions: PartitionsType = None,
47
- columns: Optional[List[str]] = None,
48
- start: Optional[int] = None,
49
- count: Optional[int] = None,
50
- ):
51
- raise NotImplementedError
52
-
53
- @abstractmethod
54
- def open_writer(
55
- self,
56
- full_table_name: str,
57
- partition: Optional[str] = None,
58
- overwrite: bool = True,
59
- ):
60
- raise NotImplementedError
61
-
62
-
63
- class HaloTableArrowReader:
64
- def __init__(
65
- self,
66
- client: StorageApiArrowClient,
67
- scan_info: TableBatchScanResponse,
68
- schema: pa.Schema,
69
- start: Optional[int] = None,
70
- count: Optional[int] = None,
71
- row_batch_size: Optional[int] = None,
72
- ):
73
- self._client = client
74
- self._scan_info = scan_info
75
-
76
- self._cur_split_id = -1
77
- self._cur_reader = None
78
-
79
- self._schema = schema
80
-
81
- self._start = start
82
- self._count = count
83
- self._cursor = 0
84
- self._row_batch_size = row_batch_size
85
-
86
- @property
87
- def count(self) -> int:
88
- return self._count
89
-
90
- def _open_next_reader(self):
91
- from odps.apis.storage_api import ReadRowsRequest
92
-
93
- if 0 <= self._scan_info.split_count <= self._cur_split_id + 1:
94
- # scan by split
95
- self._cur_reader = None
96
- return
97
- elif self._count is not None and self._cursor >= self._count:
98
- # scan by range
99
- self._cur_reader = None
100
- return
101
-
102
- read_rows_kw = {}
103
- if self._start is not None:
104
- read_rows_kw["row_index"] = self._start + self._cursor
105
- read_rows_kw["row_count"] = min(
106
- self._row_batch_size, self._count - self._cursor
107
- )
108
- self._cursor = min(self._count, self._cursor + self._row_batch_size)
109
-
110
- req = ReadRowsRequest(
111
- session_id=self._scan_info.session_id,
112
- split_index=self._cur_split_id + 1,
113
- **read_rows_kw,
114
- )
115
- self._cur_reader = self._client.read_rows_arrow(req)
116
- self._cur_split_id += 1
117
-
118
- def read(self):
119
- if self._cur_reader is None:
120
- self._open_next_reader()
121
- if self._cur_reader is None:
122
- return None
123
- while self._cur_reader is not None:
124
- batch = self._cur_reader.read()
125
- if batch is not None:
126
- return batch
127
- self._open_next_reader()
128
- return None
129
-
130
- def read_all(self) -> pa.Table:
131
- batches = []
132
- while True:
133
- batch = self.read()
134
- if batch is None:
135
- break
136
- batches.append(batch)
137
- if not batches:
138
- return self._schema.empty_table()
139
- return pa.Table.from_batches(batches)
140
-
141
-
142
- class HaloTableArrowWriter:
143
- def __init__(
144
- self, client: StorageApiArrowClient, write_info: TableBatchWriteResponse
145
- ):
146
- self._client = client
147
- self._write_info = write_info
148
-
149
- self._writer = None
150
-
151
- def open(self):
152
- from odps.apis.storage_api import WriteRowsRequest
153
-
154
- self._writer = self._client.write_rows_arrow(
155
- WriteRowsRequest(self._write_info.session_id)
156
- )
157
-
158
- def write(self, batch):
159
- if isinstance(batch, pa.Table):
160
- for b in batch.to_batches():
161
- self._writer.write(b)
162
- else:
163
- self._writer.write(batch)
164
-
165
- def close(self):
166
- commit_msg, is_success = self._writer.finish()
167
- if not is_success:
168
- raise IOError(commit_msg)
169
- return commit_msg
170
-
171
-
172
- class HaloTableIO(MCTableIO):
173
- _storage_api_endpoint = os.getenv(ODPS_STORAGE_API_ENDPOINT)
174
-
175
- @staticmethod
176
- def _convert_partitions(partitions: PartitionsType) -> Optional[List[str]]:
177
- if partitions is None:
178
- return []
179
- elif isinstance(partitions, (str, PartitionSpec)):
180
- partitions = [partitions]
181
- return [
182
- "/".join(f"{k}={v}" for k, v in PartitionSpec(pt).items())
183
- for pt in partitions
184
- ]
185
-
186
- def get_table_record_count(
187
- self, full_table_name: str, partitions: PartitionsType = None
188
- ):
189
- from odps.apis.storage_api import SplitOptions, TableBatchScanRequest
190
-
191
- table = self._odps.get_table(full_table_name)
192
- client = StorageApiArrowClient(
193
- self._odps, table, rest_endpoint=self._storage_api_endpoint
194
- )
195
-
196
- split_option = SplitOptions.SplitMode.SIZE
197
-
198
- scan_kw = {
199
- "required_partitions": self._convert_partitions(partitions),
200
- "split_options": SplitOptions.get_default_options(split_option),
201
- }
202
-
203
- # todo add more options for partition column handling
204
- req = TableBatchScanRequest(**scan_kw)
205
- resp = client.create_read_session(req)
206
- return resp.record_count
207
-
208
- @contextmanager
209
- def open_reader(
210
- self,
211
- full_table_name: str,
212
- partitions: PartitionsType = None,
213
- columns: Optional[List[str]] = None,
214
- partition_columns: Union[None, bool, List[str]] = None,
215
- start: Optional[int] = None,
216
- stop: Optional[int] = None,
217
- reverse_range: bool = False,
218
- row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
219
- ):
220
- from odps.apis.storage_api import (
221
- SessionRequest,
222
- SplitOptions,
223
- Status,
224
- TableBatchScanRequest,
225
- )
226
-
227
- table = self._odps.get_table(full_table_name)
228
- client = StorageApiArrowClient(
229
- self._odps, table, rest_endpoint=self._storage_api_endpoint
230
- )
231
-
232
- split_option = SplitOptions.SplitMode.SIZE
233
- if start is not None or stop is not None:
234
- split_option = SplitOptions.SplitMode.ROW_OFFSET
235
-
236
- scan_kw = {
237
- "required_partitions": self._convert_partitions(partitions),
238
- "split_options": SplitOptions.get_default_options(split_option),
239
- }
240
- columns = columns or [c.name for c in table.table_schema.simple_columns]
241
- scan_kw["required_data_columns"] = columns
242
- if partition_columns is True:
243
- scan_kw["required_partition_columns"] = [
244
- c.name for c in table.table_schema.partitions
245
- ]
246
- else:
247
- scan_kw["required_partition_columns"] = partition_columns
248
-
249
- # todo add more options for partition column handling
250
- req = TableBatchScanRequest(**scan_kw)
251
- resp = client.create_read_session(req)
252
-
253
- session_id = resp.session_id
254
- status = resp.status
255
- while status == Status.WAIT:
256
- resp = client.get_read_session(SessionRequest(session_id))
257
- status = resp.status
258
- time.sleep(1.0)
259
-
260
- assert status == Status.OK
261
-
262
- count = None
263
- if start is not None or stop is not None:
264
- if reverse_range:
265
- start = start if start is not None else resp.record_count - 1
266
- stop = stop if stop is not None else -1
267
- else:
268
- start = start if start is not None else 0
269
- stop = stop if stop is not None else resp.record_count
270
- start = start if start >= 0 else resp.record_count + start
271
- stop = stop if stop >= 0 else resp.record_count + stop
272
- if reverse_range:
273
- count = start - stop
274
- start = stop + 1
275
- else:
276
- count = stop - start
277
-
278
- yield HaloTableArrowReader(
279
- client,
280
- resp,
281
- schema=odps_schema_to_arrow_schema(table.table_schema),
282
- start=start,
283
- count=count,
284
- row_batch_size=row_batch_size,
285
- )
286
-
287
- @contextmanager
288
- def open_writer(
289
- self,
290
- full_table_name: str,
291
- partition: Optional[str] = None,
292
- overwrite: bool = True,
293
- ):
294
- from odps.apis.storage_api import (
295
- SessionRequest,
296
- SessionStatus,
297
- TableBatchWriteRequest,
298
- )
299
-
300
- table = self._odps.get_table(full_table_name)
301
- client = StorageApiArrowClient(
302
- self._odps, table, rest_endpoint=self._storage_api_endpoint
303
- )
304
-
305
- part_strs = self._convert_partitions(partition)
306
- part_str = part_strs[0] if part_strs else None
307
- req = TableBatchWriteRequest(partition_spec=part_str, overwrite=overwrite)
308
- resp = client.create_write_session(req)
309
-
310
- session_id = resp.session_id
311
- writer = HaloTableArrowWriter(client, resp)
312
- writer.open()
313
-
314
- yield writer
315
-
316
- commit_msg = writer.close()
317
- resp = client.commit_write_session(
318
- SessionRequest(session_id=session_id), [commit_msg]
319
- )
320
- while resp.session_status == SessionStatus.COMMITTING:
321
- resp = client.get_write_session(SessionRequest(session_id=session_id))
322
- assert resp.session_status == SessionStatus.COMMITTED
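
The module above was moved rather than dropped: the file list pairs this deletion (maxframe/odpsio/tableio.py +0 -322) with a new maxframe/io/odpsio/tableio.py (+702 -0). A minimal sketch of how this reader/writer pair is driven, assuming HaloTableIO keeps the interface shown above at the new import path; the credentials and table names below are placeholders, not values from this diff:

import pyarrow as pa
from odps import ODPS

# Assumed rc3 import path per the file list; in rc1 this was maxframe.odpsio.tableio.
from maxframe.io.odpsio.tableio import HaloTableIO

o = ODPS(
    access_id="<access-id>",            # placeholder credentials
    secret_access_key="<secret-key>",
    project="<project>",
    endpoint="<endpoint>",
)
table_io = HaloTableIO(o)

# Both open_reader and open_writer are @contextmanager generators.
# Read rows [10, 20) of a table as a pyarrow.Table.
with table_io.open_reader("my_table", start=10, stop=20) as reader:
    tbl: pa.Table = reader.read_all()

# Overwrite the same table with the rows just read.
with table_io.open_writer("my_table", overwrite=True) as writer:
    writer.write(tbl)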
@@ -1,95 +0,0 @@
1
- # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from typing import List, Optional
16
-
17
- from odps import ODPS
18
- from odps.models import ExternalVolume, PartedVolume
19
- from odps.tunnel.volumetunnel import VolumeTunnel
20
-
21
-
22
- class ODPSVolumeReader:
23
- def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
24
- self._odps_entry = odps_entry
25
- self._volume = odps_entry.get_volume(volume_name)
26
- self._volume_dir = volume_dir
27
-
28
- def list_files(self) -> List[str]:
29
- def _get_file_name(vol_file):
30
- if hasattr(vol_file, "name"):
31
- return vol_file.name
32
- return vol_file.path.rsplit("/", 1)[-1]
33
-
34
- return [
35
- _get_file_name(f)
36
- for f in self._odps_entry.list_volume_files(
37
- f"/{self._volume.name}/{self._volume_dir}"
38
- )
39
- ]
40
-
41
- def read_file(self, file_name: str) -> bytes:
42
- if isinstance(self._volume, PartedVolume):
43
- vol_tunnel = VolumeTunnel(self._odps_entry)
44
- session = vol_tunnel.create_download_session(
45
- self._volume, self._volume_dir, file_name
46
- )
47
- with session.open() as reader:
48
- return reader.read()
49
- elif isinstance(self._volume, ExternalVolume):
50
- with self._volume.open_reader(self._volume_dir + "/" + file_name) as reader:
51
- return reader.read()
52
-
53
-
54
- class ODPSVolumeWriter:
55
- def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
56
- self._odps_entry = odps_entry
57
- self._volume = odps_entry.get_volume(volume_name)
58
- self._volume_dir = volume_dir
59
- self._session_cache = None
60
-
61
- def create_write_session(self) -> Optional[str]:
62
- if not isinstance(self._volume, PartedVolume):
63
- return None
64
- vol_tunnel = VolumeTunnel(self._odps_entry)
65
- session = self._session_cache = vol_tunnel.create_upload_session(
66
- self._volume, self._volume_dir
67
- )
68
- return session.id
69
-
70
- def _get_existing_upload_session(self, write_session_id: Optional[str]):
71
- if self._session_cache is not None and (
72
- write_session_id is None or write_session_id == self._session_cache.id
73
- ):
74
- return self._session_cache
75
- vol_tunnel = VolumeTunnel(self._odps_entry)
76
- return vol_tunnel.create_upload_session(
77
- self._volume, self._volume_dir, write_session_id
78
- )
79
-
80
- def write_file(
81
- self, file_name: str, data: bytes, write_session_id: Optional[str] = None
82
- ):
83
- if isinstance(self._volume, PartedVolume):
84
- session = self._get_existing_upload_session(write_session_id)
85
- with session.open(file_name) as writer:
86
- writer.write(data)
87
- elif isinstance(self._volume, ExternalVolume):
88
- with self._volume.open_writer(self._volume_dir + "/" + file_name) as writer:
89
- writer.write(data)
90
-
91
- def commit(self, files: List[str], write_session_id: Optional[str] = None):
92
- if not isinstance(self._volume, PartedVolume):
93
- return None
94
- session = self._get_existing_upload_session(write_session_id)
95
- session.commit(files)
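
As with tableio.py, this module reappears under maxframe/io/odpsio (volumeio.py, +57 -0), and the new copy is noticeably smaller, so its interface may have changed. A minimal round-trip sketch against the rc1 classes shown above; the volume, directory, and file names are placeholders:

from odps import ODPS

from maxframe.odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter  # rc1 path

o = ODPS(
    access_id="<access-id>",            # placeholder credentials
    secret_access_key="<secret-key>",
    project="<project>",
    endpoint="<endpoint>",
)

writer = ODPSVolumeWriter(o, "my_volume", "my_dir")   # placeholder names
session_id = writer.create_write_session()            # None for external volumes
writer.write_file("blob.bin", b"payload", session_id)
writer.commit(["blob.bin"], session_id)               # no-op for external volumes

reader = ODPSVolumeReader(o, "my_volume", "my_dir")
assert "blob.bin" in reader.list_files()
data = reader.read_file("blob.bin")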