maxframe 1.0.0rc1-cp38-cp38-macosx_10_9_universal2.whl → 1.0.0rc3-cp38-cp38-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of maxframe might be problematic.
- maxframe/_utils.cpython-38-darwin.so +0 -0
- maxframe/codegen.py +3 -6
- maxframe/config/config.py +49 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +15 -2
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/objects.py +46 -3
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cpython-38-darwin.so +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +5 -5
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +7 -1
- maxframe/dataframe/datasource/read_odps_table.py +3 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/datastore/to_odps.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/apply.py +3 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +4 -7
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +12 -8
- maxframe/{odpsio → io/odpsio}/schema.py +15 -12
- maxframe/io/odpsio/tableio.py +702 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +19 -18
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +57 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
- maxframe/learn/contrib/xgboost/predict.py +21 -7
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +27 -17
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
- maxframe/protocol.py +41 -17
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cpython-38-darwin.so +0 -0
- maxframe/serialization/serializables/core.py +48 -9
- maxframe/tensor/__init__.py +69 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +98 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +70 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/{base → misc}/unique.py +2 -2
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/utils.py +11 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +32 -70
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/METADATA +25 -25
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +133 -123
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +60 -68
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +58 -22
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +27 -4
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -322
- maxframe/odpsio/volumeio.py +0 -95
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0
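Two package-level moves stand out in the listing above: maxframe/odpsio is relocated to maxframe/io/odpsio, and maxframe/tensor/base is renamed to maxframe/tensor/misc. A minimal compatibility sketch, assuming the symbols that appear in the diffs and file list below (ODPSTableIO, arrow_to_pandas, atleast_1d) are importable from the new package locations just as they were from the old ones:

# Hedged sketch: prefer the 1.0.0rc3 layout, fall back to the 1.0.0rc1 one.
try:
    from maxframe.io.odpsio import ODPSTableIO, arrow_to_pandas
    from maxframe.tensor.misc import atleast_1d
except ImportError:  # pre-rc3 layout
    from maxframe.odpsio import ODPSTableIO, arrow_to_pandas
    from maxframe.tensor.base import atleast_1d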
maxframe_client/fetcher.py
CHANGED
@@ -12,23 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import base64
-import json
 from abc import ABC, abstractmethod
 from numbers import Integral
-from typing import Any, Dict, List, Optional, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union

 import pandas as pd
 import pyarrow as pa
 from odps import ODPS
-from odps.models import ExternalVolume
+from odps.models import ExternalVolume
 from odps.tunnel import TableTunnel
-from tornado import httpclient

 from maxframe.core import OBJECT_TYPE
 from maxframe.dataframe.core import DATAFRAME_TYPE
-from maxframe.
-from maxframe.odpsio import
+from maxframe.io.objects import get_object_io_handler
+from maxframe.io.odpsio import (
+    ODPSTableIO,
+    ODPSVolumeReader,
+    arrow_to_pandas,
+    build_dataframe_table_meta,
+)
 from maxframe.protocol import (
     DataFrameTableMeta,
     ODPSTableResultInfo,
@@ -38,7 +40,7 @@ from maxframe.protocol import (
 )
 from maxframe.tensor.core import TENSOR_TYPE
 from maxframe.typing_ import PandasObjectTypes, TileableType
-from maxframe.utils import ToThreadMixin
+from maxframe.utils import ToThreadMixin

 _result_fetchers: Dict[ResultType, Type["ResultFetcher"]] = dict()

@@ -109,17 +111,12 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
         tileable: TileableType,
         info: ODPSTableResultInfo,
     ) -> None:
-        if
-
-
-
-
-
-
-        table_meta: DataFrameTableMeta = deserialize_serializable(
-            base64.b64decode(comment_data["table_meta"])
-        )
-        tileable.refresh_from_table_meta(table_meta)
+        if (
+            isinstance(tileable, DATAFRAME_TYPE)
+            and tileable.dtypes is None
+            and info.table_meta is not None
+        ):
+            tileable.refresh_from_table_meta(info.table_meta)

         if tileable.shape and any(pd.isna(x) for x in tileable.shape):
             part_specs = [None] if not info.partition_specs else info.partition_specs
@@ -131,16 +128,39 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
             )
             total_records += session.count
         new_shape_list = list(tileable.shape)
-        new_shape_list[
+        new_shape_list[0] = total_records
         tileable.params = {"shape": tuple(new_shape_list)}

+    @staticmethod
+    def _align_selection_with_shape(
+        row_sel: slice, shape: Tuple[Optional[int], ...]
+    ) -> dict:
+        size = shape[0]
+        if not row_sel.start and not row_sel.stop:
+            return {}
+        is_reversed = row_sel.step is not None and row_sel.step < 0
+        read_kw = {
+            "start": row_sel.start,
+            "stop": row_sel.stop,
+            "reverse_range": is_reversed,
+        }
+        if pd.isna(size):
+            return read_kw
+
+        if is_reversed and row_sel.start is not None:
+            read_kw["start"] = min(size - 1, row_sel.start)
+        if not is_reversed and row_sel.stop is not None:
+            read_kw["stop"] = min(size, row_sel.stop)
+        return read_kw
+
     def _read_single_source(
         self,
         table_meta: DataFrameTableMeta,
         info: ODPSTableResultInfo,
         indexes: List[Union[None, Integral, slice]],
+        shape: Tuple[Optional[int], ...],
     ):
-        table_io =
+        table_io = ODPSTableIO(self._odps_entry)
         read_kw = {}
         row_step = None
         if indexes:
@@ -148,13 +168,8 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
             indexes += [None]
             row_sel, col_sel = indexes
             if isinstance(row_sel, slice):
-
-
-                read_kw["stop"] = row_sel.stop
-                read_kw["reverse_range"] = (
-                    row_sel.step is not None and row_sel.step < 0
-                )
-                row_step = row_sel.step
+                row_step = row_sel.step
+                read_kw = self._align_selection_with_shape(row_sel, shape)
             elif isinstance(row_sel, int):
                 read_kw["start"] = row_sel
                 read_kw["stop"] = row_sel + 1
@@ -173,8 +188,8 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
         with table_io.open_reader(
             info.full_table_name, info.partition_specs, **read_kw
         ) as reader:
-            reader_count = reader.count
             result = reader.read_all()
+            reader_count = result.num_rows

         if not row_step:
             return result
@@ -195,7 +210,7 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
     ) -> PandasObjectTypes:
         table_meta = build_dataframe_table_meta(tileable)
         arrow_table: pa.Table = await self.to_thread(
-            self._read_single_source, table_meta, info, indexes
+            self._read_single_source, table_meta, info, indexes, tileable.shape
         )
         return arrow_to_pandas(arrow_table, table_meta)

@@ -211,47 +226,24 @@ class ODPSVolumeFetcher(ToThreadMixin, ResultFetcher):
     ) -> None:
         return

-    async def
-        self,
-
-
-
-
-
-
-
-
-
-
-        signed_url = await self.to_thread(
-            volume.get_sign_url, path + "/" + file_name, "GET"
-        )
-        http_client = httpclient.AsyncHTTPClient()
-
-        resp = await http_client.fetch(signed_url)
-        if hasattr(resp, "status_code") and resp.code >= 400:
-            try:
-                import oss2.exceptions
-
-                oss_exc = oss2.exceptions.make_exception(resp.body)
-                raise oss_exc
-            except ImportError:
-                raise SystemError(resp.body)
-        return resp.body
+    async def _fetch_object(
+        self,
+        tileable: TileableType,
+        info: ODPSVolumeResultInfo,
+        indexes: List[Union[Integral, slice]],
+    ) -> Any:
+        def volume_fetch_func():
+            reader = ODPSVolumeReader(
+                self._odps_entry, info.volume_name, info.volume_path
+            )
+            io_handler = get_object_io_handler(tileable)()
+            return io_handler.read_object(reader, tileable, indexes)

-    async def _fetch_object(self, info: ODPSVolumeResultInfo) -> Any:
         volume = await self.to_thread(self._odps_entry.get_volume, info.volume_name)
-        if isinstance(volume,
-
-                volume, info.volume_path, "data"
-            )
-        elif isinstance(volume, ExternalVolume):
-            byte_data = await self._read_external_volume_data(
-                volume, info.volume_path, "data"
-            )
+        if isinstance(volume, ExternalVolume):
+            return await self.to_thread(volume_fetch_func)
         else:
             raise NotImplementedError(f"Volume type {type(volume)} not supported")
-        return pickle.loads(byte_data)

     async def fetch(
         self,
@@ -260,5 +252,5 @@ class ODPSVolumeFetcher(ToThreadMixin, ResultFetcher):
         indexes: List[Union[Integral, slice]],
     ) -> Any:
         if isinstance(tileable, (OBJECT_TYPE, TENSOR_TYPE)):
-            return await self._fetch_object(info)
+            return await self._fetch_object(tileable, info, indexes)
         raise NotImplementedError(f"Fetching {type(tileable)} not implemented")
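The functional core of the fetcher change is the new _align_selection_with_shape helper: a requested row slice is clamped to the tileable's known row count before the table reader is opened, and the row count is now taken from the Arrow result (result.num_rows) rather than from the reader. A standalone sketch of the clamping rule, using illustrative names rather than maxframe's API:

from typing import Optional, Tuple

import pandas as pd

def align_selection_with_shape(row_sel: slice, shape: Tuple[Optional[int], ...]) -> dict:
    # Mirrors the rule in the diff above: clamp start/stop to the known size.
    size = shape[0]
    if not row_sel.start and not row_sel.stop:
        return {}
    is_reversed = row_sel.step is not None and row_sel.step < 0
    read_kw = {"start": row_sel.start, "stop": row_sel.stop, "reverse_range": is_reversed}
    if pd.isna(size):  # size unknown: pass the selection through unchanged
        return read_kw
    if is_reversed and row_sel.start is not None:
        read_kw["start"] = min(size - 1, row_sel.start)
    if not is_reversed and row_sel.stop is not None:
        read_kw["stop"] = min(size, row_sel.stop)
    return read_kw

# e.g. slice(None, 2000) against a 1000-row table is clamped to stop=1000,
# which is what the new test case in test_fetcher.py below exercises.
assert align_selection_with_shape(slice(None, 2000), (1000,))["stop"] == 1000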
maxframe_client/session/graph.py
CHANGED
@@ -19,10 +19,16 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Tuple, Union
 from weakref import WeakSet

-from maxframe.core import
+from maxframe.core import (
+    ChunkType,
+    TileableGraph,
+    TileableType,
+    build_fetch,
+    enter_mode,
+)
 from maxframe.core.operator import Fetch
 from maxframe.session import AbstractSession
-from maxframe.utils import
+from maxframe.utils import copy_tileables

 logger = logging.getLogger(__name__)

maxframe_client/session/odps.py
CHANGED
@@ -26,7 +26,8 @@ import pandas as pd
 from odps import ODPS

 from maxframe.config import options
-from maxframe.core import Entity, TileableGraph, enter_mode
+from maxframe.core import Entity, TileableGraph, build_fetch, enter_mode
+from maxframe.core.operator import Fetch
 from maxframe.dataframe import read_odps_table
 from maxframe.dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
 from maxframe.dataframe.datasource import PandasDataSourceOperator
@@ -36,11 +37,18 @@ from maxframe.errors import (
     NoTaskServerResponseError,
     SessionAlreadyClosedError,
 )
-from maxframe.
+from maxframe.io.objects import get_object_io_handler
+from maxframe.io.odpsio import (
+    ODPSTableIO,
+    ODPSVolumeWriter,
+    pandas_to_arrow,
+    pandas_to_odps_schema,
+)
 from maxframe.protocol import (
     DagInfo,
     DagStatus,
     ODPSTableResultInfo,
+    ODPSVolumeResultInfo,
     ResultInfo,
     SessionInfo,
 )
@@ -51,8 +59,13 @@ from maxframe.session import (
     Profiling,
     Progress,
 )
+from maxframe.tensor.datasource import ArrayDataSource
 from maxframe.typing_ import TileableType
-from maxframe.utils import
+from maxframe.utils import (
+    ToThreadMixin,
+    build_session_volume_name,
+    build_temp_table_name,
+)

 from ..clients.framedriver import FrameDriverClient
 from ..fetcher import get_fetcher_cls
@@ -139,14 +152,9 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
         self._session_id = session_info.session_id
         await self._show_logview_address()

-    def
-
-
-            or t.op.get_data() is None
-            or t.inputs
-        ):
-            return None
-
+    def _upload_and_get_table_read_tileable(
+        self, t: TileableType
+    ) -> Optional[TileableType]:
         schema, table_meta = pandas_to_odps_schema(t, unknown_as_string=True)
         if self._odps_entry.exist_table(table_meta.table_name):
             self._odps_entry.delete_table(
@@ -164,8 +172,8 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
         batch_size = options.session.upload_batch_size

         if len(data):
-
-            with
+            table_client = ODPSTableIO(self._odps_entry)
+            with table_client.open_writer(table_obj.full_table_name) as writer:
                 for batch_start in range(0, len(data), batch_size):
                     if isinstance(data, pd.Index):
                         batch = data[batch_start : batch_start + batch_size]
@@ -188,13 +196,34 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
             read_tileable.name = t.name
         else:  # INDEX_TYPE
             if list(read_tileable.names) != list(t.names):
-                read_tileable.names =
+                read_tileable.rename(t.names, inplace=True)
         read_tileable._key = t.key
         read_tileable.params = t.params
         return read_tileable.data

+    def _upload_and_get_vol_read_tileable(
+        self, t: TileableType
+    ) -> Optional[TileableType]:
+        vol_name = build_session_volume_name(self.session_id)
+        writer = ODPSVolumeWriter(self._odps_entry, vol_name, t.key)
+        io_handler = get_object_io_handler(t)
+        io_handler().write_object(writer, t, t.op.data)
+        return build_fetch(t).data
+
+    def _upload_and_get_read_tileable(self, t: TileableType) -> Optional[TileableType]:
+        if (
+            not isinstance(t.op, (ArrayDataSource, PandasDataSourceOperator))
+            or t.op.get_data() is None
+            or t.inputs
+        ):
+            return None
+        if isinstance(t.op, PandasDataSourceOperator):
+            return self._upload_and_get_table_read_tileable(t)
+        else:
+            return self._upload_and_get_vol_read_tileable(t)
+
     @enter_mode(kernel=True, build=True)
-    def
+    def _scan_and_replace_local_sources(
         self, graph: TileableGraph
     ) -> Dict[TileableType, TileableType]:
         """Replaces Pandas data sources with temp table sources in the graph"""
@@ -223,14 +252,21 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
     @enter_mode(kernel=True, build=True)
     def _get_input_infos(self, tileables: List[TileableType]) -> Dict[str, ResultInfo]:
         """Generate ResultInfo structs from generated temp tables"""
+        vol_name = build_session_volume_name(self.session_id)
+
         infos = dict()
         for t in tileables:
             key = t.key
-            if
-
-
-
-
+            if isinstance(t.op, DataFrameReadODPSTable):
+                infos[key] = ODPSTableResultInfo(full_table_name=t.op.table_name)
+            else:
+                if isinstance(t.op, Fetch):
+                    infos[key] = ODPSVolumeResultInfo(
+                        volume_name=vol_name, volume_path=t.key
+                    )
+                elif t.inputs and isinstance(t.inputs[0].op, DataFrameReadODPSTable):
+                    t = t.inputs[0]
+                    infos[key] = ODPSTableResultInfo(full_table_name=t.op.table_name)
         return infos

     async def execute(self, *tileables, **kwargs) -> ExecutionInfo:
@@ -242,7 +278,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
         tileable_graph, to_execute_tileables = gen_submit_tileable_graph(
             self, tileables, tileable_to_copied
         )
-        source_replacements = self.
+        source_replacements = self._scan_and_replace_local_sources(tileable_graph)

         # we need to manage uploaded data sources with refcounting mechanism
         # as nodes in tileable_graph are copied, we need to use original nodes
@@ -384,7 +420,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
             data_tileable, indexes = self._get_data_tileable_and_indexes(tileable)
             info = self._tileable_to_infos[data_tileable]
             fetcher = get_fetcher_cls(info.result_type)(self._odps_entry)
-            results.append(await fetcher.fetch(
+            results.append(await fetcher.fetch(data_tileable, info, indexes))
         return results

     async def decref(self, *tileable_keys):
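The session-side counterpart is _upload_and_get_read_tileable: local pandas sources are still uploaded to a temporary MaxCompute table, while local tensor data (ArrayDataSource) is now written to a session volume through an object IO handler and replaced by a fetch node. A minimal sketch of that dispatch, with hypothetical callables standing in for the two upload paths shown in the diff:

from typing import Callable, Optional

def replace_local_source(
    tileable,
    is_pandas_source: Callable[[object], bool],
    upload_to_table: Callable[[object], object],
    upload_to_volume: Callable[[object], object],
) -> Optional[object]:
    # Only leaf nodes that actually hold in-memory data are uploaded.
    if tileable.op.get_data() is None or tileable.inputs:
        return None
    if is_pandas_source(tileable):
        return upload_to_table(tileable)  # pandas data -> temporary table
    return upload_to_volume(tileable)  # tensor/object data -> session volume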
maxframe_client/tests/test_fetcher.py
CHANGED
@@ -17,19 +17,32 @@ import uuid
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pytest
 from odps import ODPS

 import maxframe.dataframe as md
-from maxframe.
+from maxframe.config import options
+from maxframe.io.odpsio import ODPSTableIO
 from maxframe.protocol import ODPSTableResultInfo, ResultType
 from maxframe.tests.utils import tn

 from ..fetcher import ODPSTableFetcher


-
+@pytest.fixture
+def switch_table_io(request):
+    old_use_common_table = options.use_common_table
+    try:
+        options.use_common_table = request.param
+        yield
+    finally:
+        options.use_common_table = old_use_common_table
+
+
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+async def test_table_fetcher(switch_table_io):
     odps_entry = ODPS.from_environments()
-    halo_table_io =
+    halo_table_io = ODPSTableIO(odps_entry)
     fetcher = ODPSTableFetcher(odps_entry)

     data = pd.DataFrame(
@@ -58,6 +71,11 @@ async def test_table_fetcher():
     assert len(fetched) == 1000
     pd.testing.assert_frame_equal(raw_data, fetched)

+    result_info = ODPSTableResultInfo(ResultType.ODPS_TABLE, full_table_name=table_name)
+    fetched = await fetcher.fetch(tileable, result_info, [slice(None, 2000), None])
+    assert len(fetched) == 1000
+    pd.testing.assert_frame_equal(raw_data, fetched)
+
     result_info = ODPSTableResultInfo(ResultType.ODPS_TABLE, full_table_name=table_name)
     fetched = await fetcher.fetch(tileable, result_info, [2, None])
     assert len(fetched) == 1
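The new switch_table_io fixture runs test_table_fetcher twice, once with options.use_common_table disabled and once enabled, by routing the parametrize value through pytest's indirect parametrization. A minimal generic sketch of that pattern (names here are illustrative, not maxframe's):

import pytest

@pytest.fixture
def feature_flag(request):
    # With indirect=True the parametrized value arrives via request.param.
    yield request.param

@pytest.mark.parametrize("feature_flag", [False, True], indirect=True)
def test_with_flag(feature_flag):
    assert feature_flag in (False, True)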
maxframe_client/tests/test_session.py
CHANGED
@@ -195,7 +195,8 @@ def test_run_dataframe_from_to_odps_table(start_mock_session):
         assert len(result_df) == 10
         assert len(result_df.columns) == 6

-        df = md.read_odps_table(table_obj, index_col="index").head(10).execute()
+        df = md.read_odps_table(table_obj, index_col="index").head(10).execute()
+        assert df.shape == (10, 5)
         assert len(df) == 10
         assert len(df.columns) == 5
     finally:
@@ -246,7 +247,19 @@ def test_run_and_fetch_series(start_mock_session):
     )


-def
+def test_execute_with_tensor(oss_config, start_mock_session):
+    pd_df = pd.DataFrame(
+        {"angles": [0, 3, 4], "degrees": [360, 180, 360]},
+        index=["circle", "triangle", "rectangle"],
+    )
+    df = md.DataFrame(pd_df)
+
+    result = (df - [1, 2]).execute().fetch()
+    expected = pd_df - [1, 2]
+    pd.testing.assert_frame_equal(result, expected)
+
+
+def test_run_remote_success(oss_config, start_mock_session):
     def func(a, b):
         return a + b

@@ -257,7 +270,7 @@ def test_run_remote_success(start_mock_session):
     assert result == 21


-def test_run_remote_error(start_mock_session):
+def test_run_remote_error(oss_config, start_mock_session):
     def func():
         raise ValueError

@@ -280,7 +293,7 @@ def test_pivot_dataframe(start_mock_session):
     df = md.DataFrame(pd_df)
     pivot = df.pivot_table(values="D", index=["A", "B"], columns=["C"], aggfunc="sum")
     executed = pivot.execute()
-    assert pivot.shape == (
+    assert pivot.shape == (4, 2)
     pd.testing.assert_index_equal(
         pivot.dtypes.index, pd.Index(["large", "small"], name="C")
     )
@@ -289,3 +302,13 @@
         values="D", index=["A", "B"], columns=["C"], aggfunc="sum"
     )
     pd.testing.assert_frame_equal(executed.to_pandas(), expected)
+
+
+def test_index_drop_duplicates(start_mock_session):
+    pd_idx = pd.Index(["lame", "cow", "lame", "beetle", "lame", "hippo"])
+    idx = md.Index(pd_idx)
+    executed = idx.drop_duplicates(keep="first").execute()
+    expected = pd_idx.drop_duplicates(keep="first")
+    pd.testing.assert_index_equal(
+        executed.to_pandas().sort_values(), expected.sort_values()
+    )
maxframe/core/entity/chunks.py
DELETED
@@ -1,68 +0,0 @@
-# Copyright 1999-2024 Alibaba Group Holding Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ...serialization.serializables import BoolField, FieldTypes, TupleField
-from ...utils import tokenize
-from .core import Entity, EntityData
-
-
-class ChunkData(EntityData):
-    __slots__ = ()
-
-    is_broadcaster = BoolField("is_broadcaster", default=False)
-    # If the operator is a shuffle mapper, this flag indicates whether the current chunk is mapper chunk when
-    # the operator produce multiple chunks such as TensorUnique.
-    is_mapper = BoolField("is_mapper", default=None)
-    # optional fields
-    _index = TupleField("index", FieldTypes.uint32)
-
-    def __repr__(self):
-        if self.op.stage is None:
-            return (
-                f"{type(self).__name__} <op={type(self.op).__name__}, "
-                f"key={self.key}>"
-            )
-        else:
-            return (
-                f"{type(self).__name__} <op={type(self.op).__name__}, "
-                f"stage={self.op.stage.name}, key={self.key}>"
-            )
-
-    @property
-    def index(self):
-        return getattr(self, "_index", None)
-
-    @property
-    def device(self):
-        return self.op.device
-
-    def _update_key(self):
-        object.__setattr__(
-            self,
-            "_key",
-            tokenize(
-                type(self).__name__,
-                *(getattr(self, k, None) for k in self._keys_ if k != "_index"),
-            ),
-        )
-
-
-class Chunk(Entity):
-    _allow_data_type_ = (ChunkData,)
-
-    def __repr__(self):
-        return f"{type(self).__name__}({self._data.__repr__()})"
-
-
-CHUNK_TYPE = (Chunk, ChunkData)
maxframe/core/entity/fuse.py
DELETED
@@ -1,73 +0,0 @@
-# Copyright 1999-2024 Alibaba Group Holding Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-
-from ...serialization.serializables import ReferenceField
-from .chunks import CHUNK_TYPE, Chunk, ChunkData
-
-
-class FuseChunkData(ChunkData):
-    __slots__ = ("_inited",)
-
-    _chunk = ReferenceField(
-        "chunk", CHUNK_TYPE, on_serialize=lambda x: x.data if hasattr(x, "data") else x
-    )
-
-    def __init__(self, *args, **kwargs):
-        self._inited = False
-        super().__init__(*args, **kwargs)
-        self._extra_params = {}
-        self._inited = True
-
-    @property
-    def chunk(self):
-        return self._chunk
-
-    @property
-    def composed(self):
-        # for compatibility, just return the topological ordering,
-        # once we apply optimization on the subgraph,
-        # `composed` is not needed any more and should be removed then.
-        assert getattr(self._op, "fuse_graph", None) is not None
-        fuse_graph = self._op.fuse_graph
-        return list(fuse_graph.topological_iter())
-
-    def __getattr__(self, attr):
-        if not self._inited:
-            return object.__getattribute__(self, attr)
-        if attr in self._extra_params:
-            return self._extra_params[attr]
-        try:
-            return getattr(self._chunk, attr)
-        except AttributeError:
-            return object.__getattribute__(self, attr)
-
-    def __setattr__(self, attr, value):
-        if attr == "params":
-            self._chunk.params = value
-        else:
-            super().__setattr__(attr, value)
-
-    @property
-    def nbytes(self):
-        return np.prod(self.shape) * self.dtype.itemsize
-
-
-class FuseChunk(Chunk):
-    __slots__ = ()
-    _allow_data_type_ = (FuseChunkData,)
-
-
-FUSE_CHUNK_TYPE = (FuseChunkData, FuseChunk)