maxframe 1.0.0rc1__cp311-cp311-win_amd64.whl → 1.0.0rc3__cp311-cp311-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (138) hide show
  1. maxframe/_utils.cp311-win_amd64.pyd +0 -0
  2. maxframe/codegen.py +3 -6
  3. maxframe/config/config.py +49 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +15 -2
  6. maxframe/core/__init__.py +2 -13
  7. maxframe/core/entity/__init__.py +0 -4
  8. maxframe/core/entity/objects.py +46 -3
  9. maxframe/core/entity/output_types.py +0 -3
  10. maxframe/core/entity/tests/test_objects.py +43 -0
  11. maxframe/core/entity/tileables.py +5 -78
  12. maxframe/core/graph/__init__.py +2 -2
  13. maxframe/core/graph/builder/__init__.py +0 -1
  14. maxframe/core/graph/builder/base.py +5 -4
  15. maxframe/core/graph/builder/tileable.py +4 -4
  16. maxframe/core/graph/builder/utils.py +4 -8
  17. maxframe/core/graph/core.cp311-win_amd64.pyd +0 -0
  18. maxframe/core/graph/entity.py +9 -33
  19. maxframe/core/operator/__init__.py +2 -9
  20. maxframe/core/operator/base.py +3 -5
  21. maxframe/core/operator/objects.py +0 -9
  22. maxframe/core/operator/utils.py +55 -0
  23. maxframe/dataframe/__init__.py +1 -1
  24. maxframe/dataframe/arithmetic/around.py +5 -17
  25. maxframe/dataframe/arithmetic/core.py +15 -7
  26. maxframe/dataframe/arithmetic/docstring.py +5 -55
  27. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
  28. maxframe/dataframe/core.py +5 -5
  29. maxframe/dataframe/datasource/date_range.py +2 -2
  30. maxframe/dataframe/datasource/read_odps_query.py +7 -1
  31. maxframe/dataframe/datasource/read_odps_table.py +3 -2
  32. maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
  33. maxframe/dataframe/datastore/to_odps.py +1 -1
  34. maxframe/dataframe/groupby/cum.py +0 -1
  35. maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
  36. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  37. maxframe/dataframe/indexing/rename.py +3 -37
  38. maxframe/dataframe/indexing/sample.py +0 -1
  39. maxframe/dataframe/indexing/set_index.py +68 -1
  40. maxframe/dataframe/merge/merge.py +236 -2
  41. maxframe/dataframe/merge/tests/test_merge.py +123 -0
  42. maxframe/dataframe/misc/apply.py +3 -10
  43. maxframe/dataframe/misc/case_when.py +1 -1
  44. maxframe/dataframe/misc/describe.py +2 -2
  45. maxframe/dataframe/misc/drop_duplicates.py +4 -25
  46. maxframe/dataframe/misc/eval.py +4 -0
  47. maxframe/dataframe/misc/pct_change.py +1 -83
  48. maxframe/dataframe/misc/transform.py +1 -30
  49. maxframe/dataframe/misc/value_counts.py +4 -17
  50. maxframe/dataframe/missing/dropna.py +1 -1
  51. maxframe/dataframe/missing/fillna.py +5 -5
  52. maxframe/dataframe/operators.py +1 -17
  53. maxframe/dataframe/reduction/core.py +2 -2
  54. maxframe/dataframe/sort/sort_values.py +1 -11
  55. maxframe/dataframe/statistics/quantile.py +5 -17
  56. maxframe/dataframe/utils.py +4 -7
  57. maxframe/io/objects/__init__.py +24 -0
  58. maxframe/io/objects/core.py +140 -0
  59. maxframe/io/objects/tensor.py +76 -0
  60. maxframe/io/objects/tests/__init__.py +13 -0
  61. maxframe/io/objects/tests/test_object_io.py +97 -0
  62. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  63. maxframe/{odpsio → io/odpsio}/arrow.py +12 -8
  64. maxframe/{odpsio → io/odpsio}/schema.py +15 -12
  65. maxframe/io/odpsio/tableio.py +702 -0
  66. maxframe/io/odpsio/tests/__init__.py +13 -0
  67. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +19 -18
  68. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  69. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  70. maxframe/io/odpsio/volumeio.py +57 -0
  71. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  72. maxframe/learn/contrib/xgboost/core.py +87 -2
  73. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  74. maxframe/learn/contrib/xgboost/predict.py +21 -7
  75. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  76. maxframe/learn/contrib/xgboost/train.py +27 -17
  77. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  78. maxframe/lib/mmh3.cp311-win_amd64.pyd +0 -0
  79. maxframe/protocol.py +41 -17
  80. maxframe/remote/core.py +4 -8
  81. maxframe/serialization/__init__.py +1 -0
  82. maxframe/serialization/core.cp311-win_amd64.pyd +0 -0
  83. maxframe/serialization/serializables/core.py +48 -9
  84. maxframe/tensor/__init__.py +69 -2
  85. maxframe/tensor/arithmetic/isclose.py +1 -0
  86. maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
  87. maxframe/tensor/core.py +5 -136
  88. maxframe/tensor/datasource/array.py +3 -0
  89. maxframe/tensor/datasource/full.py +1 -1
  90. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  91. maxframe/tensor/indexing/flatnonzero.py +1 -1
  92. maxframe/tensor/merge/__init__.py +2 -0
  93. maxframe/tensor/merge/concatenate.py +98 -0
  94. maxframe/tensor/merge/tests/test_merge.py +30 -1
  95. maxframe/tensor/merge/vstack.py +70 -0
  96. maxframe/tensor/{base → misc}/__init__.py +2 -0
  97. maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
  98. maxframe/tensor/misc/atleast_2d.py +70 -0
  99. maxframe/tensor/misc/atleast_3d.py +85 -0
  100. maxframe/tensor/misc/tests/__init__.py +13 -0
  101. maxframe/tensor/{base → misc}/transpose.py +22 -18
  102. maxframe/tensor/{base → misc}/unique.py +2 -2
  103. maxframe/tensor/operators.py +1 -7
  104. maxframe/tensor/random/core.py +1 -1
  105. maxframe/tensor/reduction/count_nonzero.py +1 -0
  106. maxframe/tensor/reduction/mean.py +1 -0
  107. maxframe/tensor/reduction/nanmean.py +1 -0
  108. maxframe/tensor/reduction/nanvar.py +2 -0
  109. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  110. maxframe/tensor/reduction/var.py +2 -0
  111. maxframe/tensor/statistics/quantile.py +2 -2
  112. maxframe/tensor/utils.py +2 -22
  113. maxframe/tests/utils.py +11 -2
  114. maxframe/typing_.py +4 -1
  115. maxframe/udf.py +8 -9
  116. maxframe/utils.py +32 -70
  117. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/METADATA +25 -25
  118. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +133 -123
  119. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +1 -1
  120. maxframe_client/fetcher.py +60 -68
  121. maxframe_client/session/graph.py +8 -2
  122. maxframe_client/session/odps.py +58 -22
  123. maxframe_client/tests/test_fetcher.py +21 -3
  124. maxframe_client/tests/test_session.py +27 -4
  125. maxframe/core/entity/chunks.py +0 -68
  126. maxframe/core/entity/fuse.py +0 -73
  127. maxframe/core/graph/builder/chunk.py +0 -430
  128. maxframe/odpsio/tableio.py +0 -322
  129. maxframe/odpsio/volumeio.py +0 -95
  130. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  131. /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
  132. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  133. /maxframe/tensor/{base → misc}/astype.py +0 -0
  134. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  135. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  136. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  137. /maxframe/tensor/{base → misc}/where.py +0 -0
  138. {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0
@@ -12,23 +12,25 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import base64
16
- import json
17
15
  from abc import ABC, abstractmethod
18
16
  from numbers import Integral
19
- from typing import Any, Dict, List, Optional, Type, Union
17
+ from typing import Any, Dict, List, Optional, Tuple, Type, Union
20
18
 
21
19
  import pandas as pd
22
20
  import pyarrow as pa
23
21
  from odps import ODPS
24
- from odps.models import ExternalVolume, PartedVolume
22
+ from odps.models import ExternalVolume
25
23
  from odps.tunnel import TableTunnel
26
- from tornado import httpclient
27
24
 
28
25
  from maxframe.core import OBJECT_TYPE
29
26
  from maxframe.dataframe.core import DATAFRAME_TYPE
30
- from maxframe.lib import wrapped_pickle as pickle
31
- from maxframe.odpsio import HaloTableIO, arrow_to_pandas, build_dataframe_table_meta
27
+ from maxframe.io.objects import get_object_io_handler
28
+ from maxframe.io.odpsio import (
29
+ ODPSTableIO,
30
+ ODPSVolumeReader,
31
+ arrow_to_pandas,
32
+ build_dataframe_table_meta,
33
+ )
32
34
  from maxframe.protocol import (
33
35
  DataFrameTableMeta,
34
36
  ODPSTableResultInfo,
@@ -38,7 +40,7 @@ from maxframe.protocol import (
38
40
  )
39
41
  from maxframe.tensor.core import TENSOR_TYPE
40
42
  from maxframe.typing_ import PandasObjectTypes, TileableType
41
- from maxframe.utils import ToThreadMixin, deserialize_serializable
43
+ from maxframe.utils import ToThreadMixin
42
44
 
43
45
  _result_fetchers: Dict[ResultType, Type["ResultFetcher"]] = dict()
44
46
 
@@ -109,17 +111,12 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
109
111
  tileable: TileableType,
110
112
  info: ODPSTableResultInfo,
111
113
  ) -> None:
112
- if isinstance(tileable, DATAFRAME_TYPE) and tileable.dtypes is None:
113
- tb_comment = await self.to_thread(
114
- self._get_table_comment, info.full_table_name
115
- )
116
- if tb_comment: # pragma: no branch
117
- comment_data = json.loads(tb_comment)
118
-
119
- table_meta: DataFrameTableMeta = deserialize_serializable(
120
- base64.b64decode(comment_data["table_meta"])
121
- )
122
- tileable.refresh_from_table_meta(table_meta)
114
+ if (
115
+ isinstance(tileable, DATAFRAME_TYPE)
116
+ and tileable.dtypes is None
117
+ and info.table_meta is not None
118
+ ):
119
+ tileable.refresh_from_table_meta(info.table_meta)
123
120
 
124
121
  if tileable.shape and any(pd.isna(x) for x in tileable.shape):
125
122
  part_specs = [None] if not info.partition_specs else info.partition_specs
@@ -131,16 +128,39 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
131
128
  )
132
129
  total_records += session.count
133
130
  new_shape_list = list(tileable.shape)
134
- new_shape_list[-1] = total_records
131
+ new_shape_list[0] = total_records
135
132
  tileable.params = {"shape": tuple(new_shape_list)}
136
133
 
134
+ @staticmethod
135
+ def _align_selection_with_shape(
136
+ row_sel: slice, shape: Tuple[Optional[int], ...]
137
+ ) -> dict:
138
+ size = shape[0]
139
+ if not row_sel.start and not row_sel.stop:
140
+ return {}
141
+ is_reversed = row_sel.step is not None and row_sel.step < 0
142
+ read_kw = {
143
+ "start": row_sel.start,
144
+ "stop": row_sel.stop,
145
+ "reverse_range": is_reversed,
146
+ }
147
+ if pd.isna(size):
148
+ return read_kw
149
+
150
+ if is_reversed and row_sel.start is not None:
151
+ read_kw["start"] = min(size - 1, row_sel.start)
152
+ if not is_reversed and row_sel.stop is not None:
153
+ read_kw["stop"] = min(size, row_sel.stop)
154
+ return read_kw
155
+
137
156
  def _read_single_source(
138
157
  self,
139
158
  table_meta: DataFrameTableMeta,
140
159
  info: ODPSTableResultInfo,
141
160
  indexes: List[Union[None, Integral, slice]],
161
+ shape: Tuple[Optional[int], ...],
142
162
  ):
143
- table_io = HaloTableIO(self._odps_entry)
163
+ table_io = ODPSTableIO(self._odps_entry)
144
164
  read_kw = {}
145
165
  row_step = None
146
166
  if indexes:
@@ -148,13 +168,8 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
148
168
  indexes += [None]
149
169
  row_sel, col_sel = indexes
150
170
  if isinstance(row_sel, slice):
151
- if row_sel.start or row_sel.stop:
152
- read_kw["start"] = row_sel.start
153
- read_kw["stop"] = row_sel.stop
154
- read_kw["reverse_range"] = (
155
- row_sel.step is not None and row_sel.step < 0
156
- )
157
- row_step = row_sel.step
171
+ row_step = row_sel.step
172
+ read_kw = self._align_selection_with_shape(row_sel, shape)
158
173
  elif isinstance(row_sel, int):
159
174
  read_kw["start"] = row_sel
160
175
  read_kw["stop"] = row_sel + 1
@@ -173,8 +188,8 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
173
188
  with table_io.open_reader(
174
189
  info.full_table_name, info.partition_specs, **read_kw
175
190
  ) as reader:
176
- reader_count = reader.count
177
191
  result = reader.read_all()
192
+ reader_count = result.num_rows
178
193
 
179
194
  if not row_step:
180
195
  return result
@@ -195,7 +210,7 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
195
210
  ) -> PandasObjectTypes:
196
211
  table_meta = build_dataframe_table_meta(tileable)
197
212
  arrow_table: pa.Table = await self.to_thread(
198
- self._read_single_source, table_meta, info, indexes
213
+ self._read_single_source, table_meta, info, indexes, tileable.shape
199
214
  )
200
215
  return arrow_to_pandas(arrow_table, table_meta)
201
216
 
@@ -211,47 +226,24 @@ class ODPSVolumeFetcher(ToThreadMixin, ResultFetcher):
211
226
  ) -> None:
212
227
  return
213
228
 
214
- async def _read_parted_volume_data(
215
- self, volume: PartedVolume, partition: str, file_name: str
216
- ) -> bytes:
217
- def sync_read():
218
- with volume.open_reader(partition, file_name) as reader:
219
- return reader.read()
220
-
221
- return await self.to_thread(sync_read)
222
-
223
- async def _read_external_volume_data(
224
- self, volume: ExternalVolume, path: str, file_name: str
225
- ) -> bytes:
226
- signed_url = await self.to_thread(
227
- volume.get_sign_url, path + "/" + file_name, "GET"
228
- )
229
- http_client = httpclient.AsyncHTTPClient()
230
-
231
- resp = await http_client.fetch(signed_url)
232
- if hasattr(resp, "status_code") and resp.code >= 400:
233
- try:
234
- import oss2.exceptions
235
-
236
- oss_exc = oss2.exceptions.make_exception(resp.body)
237
- raise oss_exc
238
- except ImportError:
239
- raise SystemError(resp.body)
240
- return resp.body
229
+ async def _fetch_object(
230
+ self,
231
+ tileable: TileableType,
232
+ info: ODPSVolumeResultInfo,
233
+ indexes: List[Union[Integral, slice]],
234
+ ) -> Any:
235
+ def volume_fetch_func():
236
+ reader = ODPSVolumeReader(
237
+ self._odps_entry, info.volume_name, info.volume_path
238
+ )
239
+ io_handler = get_object_io_handler(tileable)()
240
+ return io_handler.read_object(reader, tileable, indexes)
241
241
 
242
- async def _fetch_object(self, info: ODPSVolumeResultInfo) -> Any:
243
242
  volume = await self.to_thread(self._odps_entry.get_volume, info.volume_name)
244
- if isinstance(volume, PartedVolume):
245
- byte_data = await self._read_parted_volume_data(
246
- volume, info.volume_path, "data"
247
- )
248
- elif isinstance(volume, ExternalVolume):
249
- byte_data = await self._read_external_volume_data(
250
- volume, info.volume_path, "data"
251
- )
243
+ if isinstance(volume, ExternalVolume):
244
+ return await self.to_thread(volume_fetch_func)
252
245
  else:
253
246
  raise NotImplementedError(f"Volume type {type(volume)} not supported")
254
- return pickle.loads(byte_data)
255
247
 
256
248
  async def fetch(
257
249
  self,
@@ -260,5 +252,5 @@ class ODPSVolumeFetcher(ToThreadMixin, ResultFetcher):
260
252
  indexes: List[Union[Integral, slice]],
261
253
  ) -> Any:
262
254
  if isinstance(tileable, (OBJECT_TYPE, TENSOR_TYPE)):
263
- return await self._fetch_object(info)
255
+ return await self._fetch_object(tileable, info, indexes)
264
256
  raise NotImplementedError(f"Fetching {type(tileable)} not implemented")
@@ -19,10 +19,16 @@ from dataclasses import dataclass
19
19
  from typing import Any, Dict, List, Tuple, Union
20
20
  from weakref import WeakSet
21
21
 
22
- from maxframe.core import ChunkType, TileableGraph, TileableType, enter_mode
22
+ from maxframe.core import (
23
+ ChunkType,
24
+ TileableGraph,
25
+ TileableType,
26
+ build_fetch,
27
+ enter_mode,
28
+ )
23
29
  from maxframe.core.operator import Fetch
24
30
  from maxframe.session import AbstractSession
25
- from maxframe.utils import build_fetch, copy_tileables
31
+ from maxframe.utils import copy_tileables
26
32
 
27
33
  logger = logging.getLogger(__name__)
28
34
 
@@ -26,7 +26,8 @@ import pandas as pd
26
26
  from odps import ODPS
27
27
 
28
28
  from maxframe.config import options
29
- from maxframe.core import Entity, TileableGraph, enter_mode
29
+ from maxframe.core import Entity, TileableGraph, build_fetch, enter_mode
30
+ from maxframe.core.operator import Fetch
30
31
  from maxframe.dataframe import read_odps_table
31
32
  from maxframe.dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
32
33
  from maxframe.dataframe.datasource import PandasDataSourceOperator
@@ -36,11 +37,18 @@ from maxframe.errors import (
36
37
  NoTaskServerResponseError,
37
38
  SessionAlreadyClosedError,
38
39
  )
39
- from maxframe.odpsio import HaloTableIO, pandas_to_arrow, pandas_to_odps_schema
40
+ from maxframe.io.objects import get_object_io_handler
41
+ from maxframe.io.odpsio import (
42
+ ODPSTableIO,
43
+ ODPSVolumeWriter,
44
+ pandas_to_arrow,
45
+ pandas_to_odps_schema,
46
+ )
40
47
  from maxframe.protocol import (
41
48
  DagInfo,
42
49
  DagStatus,
43
50
  ODPSTableResultInfo,
51
+ ODPSVolumeResultInfo,
44
52
  ResultInfo,
45
53
  SessionInfo,
46
54
  )
@@ -51,8 +59,13 @@ from maxframe.session import (
51
59
  Profiling,
52
60
  Progress,
53
61
  )
62
+ from maxframe.tensor.datasource import ArrayDataSource
54
63
  from maxframe.typing_ import TileableType
55
- from maxframe.utils import ToThreadMixin, build_temp_table_name
64
+ from maxframe.utils import (
65
+ ToThreadMixin,
66
+ build_session_volume_name,
67
+ build_temp_table_name,
68
+ )
56
69
 
57
70
  from ..clients.framedriver import FrameDriverClient
58
71
  from ..fetcher import get_fetcher_cls
@@ -139,14 +152,9 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
139
152
  self._session_id = session_info.session_id
140
153
  await self._show_logview_address()
141
154
 
142
- def _upload_and_get_read_tileable(self, t: TileableType) -> Optional[TileableType]:
143
- if (
144
- not isinstance(t.op, PandasDataSourceOperator)
145
- or t.op.get_data() is None
146
- or t.inputs
147
- ):
148
- return None
149
-
155
+ def _upload_and_get_table_read_tileable(
156
+ self, t: TileableType
157
+ ) -> Optional[TileableType]:
150
158
  schema, table_meta = pandas_to_odps_schema(t, unknown_as_string=True)
151
159
  if self._odps_entry.exist_table(table_meta.table_name):
152
160
  self._odps_entry.delete_table(
@@ -164,8 +172,8 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
164
172
  batch_size = options.session.upload_batch_size
165
173
 
166
174
  if len(data):
167
- halo_client = HaloTableIO(self._odps_entry)
168
- with halo_client.open_writer(table_obj.full_table_name) as writer:
175
+ table_client = ODPSTableIO(self._odps_entry)
176
+ with table_client.open_writer(table_obj.full_table_name) as writer:
169
177
  for batch_start in range(0, len(data), batch_size):
170
178
  if isinstance(data, pd.Index):
171
179
  batch = data[batch_start : batch_start + batch_size]
@@ -188,13 +196,34 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
188
196
  read_tileable.name = t.name
189
197
  else: # INDEX_TYPE
190
198
  if list(read_tileable.names) != list(t.names):
191
- read_tileable.names = t.names
199
+ read_tileable.rename(t.names, inplace=True)
192
200
  read_tileable._key = t.key
193
201
  read_tileable.params = t.params
194
202
  return read_tileable.data
195
203
 
204
+ def _upload_and_get_vol_read_tileable(
205
+ self, t: TileableType
206
+ ) -> Optional[TileableType]:
207
+ vol_name = build_session_volume_name(self.session_id)
208
+ writer = ODPSVolumeWriter(self._odps_entry, vol_name, t.key)
209
+ io_handler = get_object_io_handler(t)
210
+ io_handler().write_object(writer, t, t.op.data)
211
+ return build_fetch(t).data
212
+
213
+ def _upload_and_get_read_tileable(self, t: TileableType) -> Optional[TileableType]:
214
+ if (
215
+ not isinstance(t.op, (ArrayDataSource, PandasDataSourceOperator))
216
+ or t.op.get_data() is None
217
+ or t.inputs
218
+ ):
219
+ return None
220
+ if isinstance(t.op, PandasDataSourceOperator):
221
+ return self._upload_and_get_table_read_tileable(t)
222
+ else:
223
+ return self._upload_and_get_vol_read_tileable(t)
224
+
196
225
  @enter_mode(kernel=True, build=True)
197
- def _scan_and_replace_pandas_sources(
226
+ def _scan_and_replace_local_sources(
198
227
  self, graph: TileableGraph
199
228
  ) -> Dict[TileableType, TileableType]:
200
229
  """Replaces Pandas data sources with temp table sources in the graph"""
@@ -223,14 +252,21 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
223
252
  @enter_mode(kernel=True, build=True)
224
253
  def _get_input_infos(self, tileables: List[TileableType]) -> Dict[str, ResultInfo]:
225
254
  """Generate ResultInfo structs from generated temp tables"""
255
+ vol_name = build_session_volume_name(self.session_id)
256
+
226
257
  infos = dict()
227
258
  for t in tileables:
228
259
  key = t.key
229
- if not isinstance(t.op, DataFrameReadODPSTable):
230
- if not isinstance(t.inputs[0].op, DataFrameReadODPSTable):
231
- continue
232
- t = t.inputs[0]
233
- infos[key] = ODPSTableResultInfo(full_table_name=t.op.table_name)
260
+ if isinstance(t.op, DataFrameReadODPSTable):
261
+ infos[key] = ODPSTableResultInfo(full_table_name=t.op.table_name)
262
+ else:
263
+ if isinstance(t.op, Fetch):
264
+ infos[key] = ODPSVolumeResultInfo(
265
+ volume_name=vol_name, volume_path=t.key
266
+ )
267
+ elif t.inputs and isinstance(t.inputs[0].op, DataFrameReadODPSTable):
268
+ t = t.inputs[0]
269
+ infos[key] = ODPSTableResultInfo(full_table_name=t.op.table_name)
234
270
  return infos
235
271
 
236
272
  async def execute(self, *tileables, **kwargs) -> ExecutionInfo:
@@ -242,7 +278,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
242
278
  tileable_graph, to_execute_tileables = gen_submit_tileable_graph(
243
279
  self, tileables, tileable_to_copied
244
280
  )
245
- source_replacements = self._scan_and_replace_pandas_sources(tileable_graph)
281
+ source_replacements = self._scan_and_replace_local_sources(tileable_graph)
246
282
 
247
283
  # we need to manage uploaded data sources with refcounting mechanism
248
284
  # as nodes in tileable_graph are copied, we need to use original nodes
@@ -384,7 +420,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
384
420
  data_tileable, indexes = self._get_data_tileable_and_indexes(tileable)
385
421
  info = self._tileable_to_infos[data_tileable]
386
422
  fetcher = get_fetcher_cls(info.result_type)(self._odps_entry)
387
- results.append(await fetcher.fetch(tileable, info, indexes))
423
+ results.append(await fetcher.fetch(data_tileable, info, indexes))
388
424
  return results
389
425
 
390
426
  async def decref(self, *tileable_keys):
@@ -17,19 +17,32 @@ import uuid
17
17
  import numpy as np
18
18
  import pandas as pd
19
19
  import pyarrow as pa
20
+ import pytest
20
21
  from odps import ODPS
21
22
 
22
23
  import maxframe.dataframe as md
23
- from maxframe.odpsio import HaloTableIO
24
+ from maxframe.config import options
25
+ from maxframe.io.odpsio import ODPSTableIO
24
26
  from maxframe.protocol import ODPSTableResultInfo, ResultType
25
27
  from maxframe.tests.utils import tn
26
28
 
27
29
  from ..fetcher import ODPSTableFetcher
28
30
 
29
31
 
30
- async def test_table_fetcher():
32
+ @pytest.fixture
33
+ def switch_table_io(request):
34
+ old_use_common_table = options.use_common_table
35
+ try:
36
+ options.use_common_table = request.param
37
+ yield
38
+ finally:
39
+ options.use_common_table = old_use_common_table
40
+
41
+
42
+ @pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
43
+ async def test_table_fetcher(switch_table_io):
31
44
  odps_entry = ODPS.from_environments()
32
- halo_table_io = HaloTableIO(odps_entry)
45
+ halo_table_io = ODPSTableIO(odps_entry)
33
46
  fetcher = ODPSTableFetcher(odps_entry)
34
47
 
35
48
  data = pd.DataFrame(
@@ -58,6 +71,11 @@ async def test_table_fetcher():
58
71
  assert len(fetched) == 1000
59
72
  pd.testing.assert_frame_equal(raw_data, fetched)
60
73
 
74
+ result_info = ODPSTableResultInfo(ResultType.ODPS_TABLE, full_table_name=table_name)
75
+ fetched = await fetcher.fetch(tileable, result_info, [slice(None, 2000), None])
76
+ assert len(fetched) == 1000
77
+ pd.testing.assert_frame_equal(raw_data, fetched)
78
+
61
79
  result_info = ODPSTableResultInfo(ResultType.ODPS_TABLE, full_table_name=table_name)
62
80
  fetched = await fetcher.fetch(tileable, result_info, [2, None])
63
81
  assert len(fetched) == 1
@@ -195,7 +195,8 @@ def test_run_dataframe_from_to_odps_table(start_mock_session):
195
195
  assert len(result_df) == 10
196
196
  assert len(result_df.columns) == 6
197
197
 
198
- df = md.read_odps_table(table_obj, index_col="index").head(10).execute().fetch()
198
+ df = md.read_odps_table(table_obj, index_col="index").head(10).execute()
199
+ assert df.shape == (10, 5)
199
200
  assert len(df) == 10
200
201
  assert len(df.columns) == 5
201
202
  finally:
@@ -246,7 +247,19 @@ def test_run_and_fetch_series(start_mock_session):
246
247
  )
247
248
 
248
249
 
249
- def test_run_remote_success(start_mock_session):
250
+ def test_execute_with_tensor(oss_config, start_mock_session):
251
+ pd_df = pd.DataFrame(
252
+ {"angles": [0, 3, 4], "degrees": [360, 180, 360]},
253
+ index=["circle", "triangle", "rectangle"],
254
+ )
255
+ df = md.DataFrame(pd_df)
256
+
257
+ result = (df - [1, 2]).execute().fetch()
258
+ expected = pd_df - [1, 2]
259
+ pd.testing.assert_frame_equal(result, expected)
260
+
261
+
262
+ def test_run_remote_success(oss_config, start_mock_session):
250
263
  def func(a, b):
251
264
  return a + b
252
265
 
@@ -257,7 +270,7 @@ def test_run_remote_success(start_mock_session):
257
270
  assert result == 21
258
271
 
259
272
 
260
- def test_run_remote_error(start_mock_session):
273
+ def test_run_remote_error(oss_config, start_mock_session):
261
274
  def func():
262
275
  raise ValueError
263
276
 
@@ -280,7 +293,7 @@ def test_pivot_dataframe(start_mock_session):
280
293
  df = md.DataFrame(pd_df)
281
294
  pivot = df.pivot_table(values="D", index=["A", "B"], columns=["C"], aggfunc="sum")
282
295
  executed = pivot.execute()
283
- assert pivot.shape == (2, 4)
296
+ assert pivot.shape == (4, 2)
284
297
  pd.testing.assert_index_equal(
285
298
  pivot.dtypes.index, pd.Index(["large", "small"], name="C")
286
299
  )
@@ -289,3 +302,13 @@ def test_pivot_dataframe(start_mock_session):
289
302
  values="D", index=["A", "B"], columns=["C"], aggfunc="sum"
290
303
  )
291
304
  pd.testing.assert_frame_equal(executed.to_pandas(), expected)
305
+
306
+
307
+ def test_index_drop_duplicates(start_mock_session):
308
+ pd_idx = pd.Index(["lame", "cow", "lame", "beetle", "lame", "hippo"])
309
+ idx = md.Index(pd_idx)
310
+ executed = idx.drop_duplicates(keep="first").execute()
311
+ expected = pd_idx.drop_duplicates(keep="first")
312
+ pd.testing.assert_index_equal(
313
+ executed.to_pandas().sort_values(), expected.sort_values()
314
+ )
@@ -1,68 +0,0 @@
1
- # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from ...serialization.serializables import BoolField, FieldTypes, TupleField
16
- from ...utils import tokenize
17
- from .core import Entity, EntityData
18
-
19
-
20
- class ChunkData(EntityData):
21
- __slots__ = ()
22
-
23
- is_broadcaster = BoolField("is_broadcaster", default=False)
24
- # If the operator is a shuffle mapper, this flag indicates whether the current chunk is mapper chunk when
25
- # the operator produce multiple chunks such as TensorUnique.
26
- is_mapper = BoolField("is_mapper", default=None)
27
- # optional fields
28
- _index = TupleField("index", FieldTypes.uint32)
29
-
30
- def __repr__(self):
31
- if self.op.stage is None:
32
- return (
33
- f"{type(self).__name__} <op={type(self.op).__name__}, "
34
- f"key={self.key}>"
35
- )
36
- else:
37
- return (
38
- f"{type(self).__name__} <op={type(self.op).__name__}, "
39
- f"stage={self.op.stage.name}, key={self.key}>"
40
- )
41
-
42
- @property
43
- def index(self):
44
- return getattr(self, "_index", None)
45
-
46
- @property
47
- def device(self):
48
- return self.op.device
49
-
50
- def _update_key(self):
51
- object.__setattr__(
52
- self,
53
- "_key",
54
- tokenize(
55
- type(self).__name__,
56
- *(getattr(self, k, None) for k in self._keys_ if k != "_index"),
57
- ),
58
- )
59
-
60
-
61
- class Chunk(Entity):
62
- _allow_data_type_ = (ChunkData,)
63
-
64
- def __repr__(self):
65
- return f"{type(self).__name__}({self._data.__repr__()})"
66
-
67
-
68
- CHUNK_TYPE = (Chunk, ChunkData)
@@ -1,73 +0,0 @@
1
- # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import numpy as np
16
-
17
- from ...serialization.serializables import ReferenceField
18
- from .chunks import CHUNK_TYPE, Chunk, ChunkData
19
-
20
-
21
- class FuseChunkData(ChunkData):
22
- __slots__ = ("_inited",)
23
-
24
- _chunk = ReferenceField(
25
- "chunk", CHUNK_TYPE, on_serialize=lambda x: x.data if hasattr(x, "data") else x
26
- )
27
-
28
- def __init__(self, *args, **kwargs):
29
- self._inited = False
30
- super().__init__(*args, **kwargs)
31
- self._extra_params = {}
32
- self._inited = True
33
-
34
- @property
35
- def chunk(self):
36
- return self._chunk
37
-
38
- @property
39
- def composed(self):
40
- # for compatibility, just return the topological ordering,
41
- # once we apply optimization on the subgraph,
42
- # `composed` is not needed any more and should be removed then.
43
- assert getattr(self._op, "fuse_graph", None) is not None
44
- fuse_graph = self._op.fuse_graph
45
- return list(fuse_graph.topological_iter())
46
-
47
- def __getattr__(self, attr):
48
- if not self._inited:
49
- return object.__getattribute__(self, attr)
50
- if attr in self._extra_params:
51
- return self._extra_params[attr]
52
- try:
53
- return getattr(self._chunk, attr)
54
- except AttributeError:
55
- return object.__getattribute__(self, attr)
56
-
57
- def __setattr__(self, attr, value):
58
- if attr == "params":
59
- self._chunk.params = value
60
- else:
61
- super().__setattr__(attr, value)
62
-
63
- @property
64
- def nbytes(self):
65
- return np.prod(self.shape) * self.dtype.itemsize
66
-
67
-
68
- class FuseChunk(Chunk):
69
- __slots__ = ()
70
- _allow_data_type_ = (FuseChunkData,)
71
-
72
-
73
- FUSE_CHUNK_TYPE = (FuseChunkData, FuseChunk)