maxframe 1.0.0rc2__cp38-cp38-macosx_10_9_universal2.whl → 1.0.0rc4__cp38-cp38-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cpython-38-darwin.so +0 -0
- maxframe/codegen.py +4 -2
- maxframe/config/config.py +28 -9
- maxframe/config/validators.py +42 -12
- maxframe/conftest.py +56 -14
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +45 -2
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cpython-38-darwin.so +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/core.py +2 -0
- maxframe/dataframe/datasource/read_odps_query.py +67 -8
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
- maxframe/dataframe/datastore/to_odps.py +8 -1
- maxframe/dataframe/extensions/__init__.py +3 -0
- maxframe/dataframe/extensions/flatmap.py +326 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/misc/drop_duplicates.py +18 -1
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
- maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
- maxframe/{odpsio → io/odpsio}/schema.py +10 -8
- maxframe/{odpsio → io/odpsio}/tableio.py +50 -38
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -7
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +2 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
- maxframe/learn/contrib/xgboost/predict.py +27 -44
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +27 -16
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cpython-38-darwin.so +0 -0
- maxframe/opcodes.py +3 -0
- maxframe/protocol.py +7 -16
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cpython-38-darwin.so +0 -0
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +10 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/utils.py +2 -22
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +49 -73
- maxframe-1.0.0rc4.dist-info/METADATA +104 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +129 -114
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +33 -50
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +134 -27
- maxframe_client/session/task.py +58 -20
- maxframe_client/tests/test_fetcher.py +1 -1
- maxframe_client/tests/test_session.py +27 -3
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/volumeio.py +0 -95
- maxframe-1.0.0rc2.dist-info/METADATA +0 -177
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/unique.py +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0
maxframe_client/fetcher.py
CHANGED
|
@@ -19,14 +19,18 @@ from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
|
|
19
19
|
import pandas as pd
|
|
20
20
|
import pyarrow as pa
|
|
21
21
|
from odps import ODPS
|
|
22
|
-
from odps.models import ExternalVolume
|
|
22
|
+
from odps.models import ExternalVolume
|
|
23
23
|
from odps.tunnel import TableTunnel
|
|
24
|
-
from tornado import httpclient
|
|
25
24
|
|
|
26
25
|
from maxframe.core import OBJECT_TYPE
|
|
27
26
|
from maxframe.dataframe.core import DATAFRAME_TYPE
|
|
28
|
-
from maxframe.
|
|
29
|
-
from maxframe.odpsio import
|
|
27
|
+
from maxframe.io.objects import get_object_io_handler
|
|
28
|
+
from maxframe.io.odpsio import (
|
|
29
|
+
ODPSTableIO,
|
|
30
|
+
ODPSVolumeReader,
|
|
31
|
+
arrow_to_pandas,
|
|
32
|
+
build_dataframe_table_meta,
|
|
33
|
+
)
|
|
30
34
|
from maxframe.protocol import (
|
|
31
35
|
DataFrameTableMeta,
|
|
32
36
|
ODPSTableResultInfo,
|
|
@@ -36,7 +40,7 @@ from maxframe.protocol import (
|
|
|
36
40
|
)
|
|
37
41
|
from maxframe.tensor.core import TENSOR_TYPE
|
|
38
42
|
from maxframe.typing_ import PandasObjectTypes, TileableType
|
|
39
|
-
from maxframe.utils import ToThreadMixin
|
|
43
|
+
from maxframe.utils import ToThreadMixin, sync_pyodps_options
|
|
40
44
|
|
|
41
45
|
_result_fetchers: Dict[ResultType, Type["ResultFetcher"]] = dict()
|
|
42
46
|
|
|
@@ -116,13 +120,15 @@ class ODPSTableFetcher(ToThreadMixin, ResultFetcher):
|
|
|
116
120
|
|
|
117
121
|
if tileable.shape and any(pd.isna(x) for x in tileable.shape):
|
|
118
122
|
part_specs = [None] if not info.partition_specs else info.partition_specs
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
123
|
+
|
|
124
|
+
with sync_pyodps_options():
|
|
125
|
+
table = self._odps_entry.get_table(info.full_table_name)
|
|
126
|
+
tunnel = TableTunnel(self._odps_entry)
|
|
127
|
+
total_records = 0
|
|
128
|
+
for part_spec in part_specs:
|
|
129
|
+
session = tunnel.create_download_session(table, part_spec)
|
|
130
|
+
total_records += session.count
|
|
131
|
+
|
|
126
132
|
new_shape_list = list(tileable.shape)
|
|
127
133
|
new_shape_list[0] = total_records
|
|
128
134
|
tileable.params = {"shape": tuple(new_shape_list)}
|
|
@@ -222,47 +228,24 @@ class ODPSVolumeFetcher(ToThreadMixin, ResultFetcher):
|
|
|
222
228
|
) -> None:
|
|
223
229
|
return
|
|
224
230
|
|
|
225
|
-
async def
|
|
226
|
-
self,
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
signed_url = await self.to_thread(
|
|
238
|
-
volume.get_sign_url, path + "/" + file_name, "GET"
|
|
239
|
-
)
|
|
240
|
-
http_client = httpclient.AsyncHTTPClient()
|
|
241
|
-
|
|
242
|
-
resp = await http_client.fetch(signed_url)
|
|
243
|
-
if hasattr(resp, "status_code") and resp.code >= 400:
|
|
244
|
-
try:
|
|
245
|
-
import oss2.exceptions
|
|
246
|
-
|
|
247
|
-
oss_exc = oss2.exceptions.make_exception(resp.body)
|
|
248
|
-
raise oss_exc
|
|
249
|
-
except ImportError:
|
|
250
|
-
raise SystemError(resp.body)
|
|
251
|
-
return resp.body
|
|
231
|
+
async def _fetch_object(
|
|
232
|
+
self,
|
|
233
|
+
tileable: TileableType,
|
|
234
|
+
info: ODPSVolumeResultInfo,
|
|
235
|
+
indexes: List[Union[Integral, slice]],
|
|
236
|
+
) -> Any:
|
|
237
|
+
def volume_fetch_func():
|
|
238
|
+
reader = ODPSVolumeReader(
|
|
239
|
+
self._odps_entry, info.volume_name, info.volume_path
|
|
240
|
+
)
|
|
241
|
+
io_handler = get_object_io_handler(tileable)()
|
|
242
|
+
return io_handler.read_object(reader, tileable, indexes)
|
|
252
243
|
|
|
253
|
-
async def _fetch_object(self, info: ODPSVolumeResultInfo) -> Any:
|
|
254
244
|
volume = await self.to_thread(self._odps_entry.get_volume, info.volume_name)
|
|
255
|
-
if isinstance(volume,
|
|
256
|
-
|
|
257
|
-
volume, info.volume_path, "data"
|
|
258
|
-
)
|
|
259
|
-
elif isinstance(volume, ExternalVolume):
|
|
260
|
-
byte_data = await self._read_external_volume_data(
|
|
261
|
-
volume, info.volume_path, "data"
|
|
262
|
-
)
|
|
245
|
+
if isinstance(volume, ExternalVolume):
|
|
246
|
+
return await self.to_thread(volume_fetch_func)
|
|
263
247
|
else:
|
|
264
248
|
raise NotImplementedError(f"Volume type {type(volume)} not supported")
|
|
265
|
-
return pickle.loads(byte_data)
|
|
266
249
|
|
|
267
250
|
async def fetch(
|
|
268
251
|
self,
|
|
@@ -271,5 +254,5 @@ class ODPSVolumeFetcher(ToThreadMixin, ResultFetcher):
|
|
|
271
254
|
indexes: List[Union[Integral, slice]],
|
|
272
255
|
) -> Any:
|
|
273
256
|
if isinstance(tileable, (OBJECT_TYPE, TENSOR_TYPE)):
|
|
274
|
-
return await self._fetch_object(info)
|
|
257
|
+
return await self._fetch_object(tileable, info, indexes)
|
|
275
258
|
raise NotImplementedError(f"Fetching {type(tileable)} not implemented")
|
|
@@ -12,6 +12,9 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
# retry consts
|
|
16
|
+
EMPTY_RESPONSE_RETRY_COUNT = 5
|
|
17
|
+
|
|
15
18
|
# Restful Service
|
|
16
19
|
RESTFUL_SESSION_INSECURE_SCHEME = "mf"
|
|
17
20
|
RESTFUL_SESSION_SECURE_SCHEME = "mfs"
|
maxframe_client/session/graph.py
CHANGED
|
@@ -19,10 +19,16 @@ from dataclasses import dataclass
|
|
|
19
19
|
from typing import Any, Dict, List, Tuple, Union
|
|
20
20
|
from weakref import WeakSet
|
|
21
21
|
|
|
22
|
-
from maxframe.core import
|
|
22
|
+
from maxframe.core import (
|
|
23
|
+
ChunkType,
|
|
24
|
+
TileableGraph,
|
|
25
|
+
TileableType,
|
|
26
|
+
build_fetch,
|
|
27
|
+
enter_mode,
|
|
28
|
+
)
|
|
23
29
|
from maxframe.core.operator import Fetch
|
|
24
30
|
from maxframe.session import AbstractSession
|
|
25
|
-
from maxframe.utils import
|
|
31
|
+
from maxframe.utils import copy_tileables
|
|
26
32
|
|
|
27
33
|
logger = logging.getLogger(__name__)
|
|
28
34
|
|
maxframe_client/session/odps.py
CHANGED
|
@@ -18,15 +18,17 @@ import logging
|
|
|
18
18
|
import time
|
|
19
19
|
import weakref
|
|
20
20
|
from numbers import Integral
|
|
21
|
-
from typing import Dict, List, Mapping, Optional, Tuple, Union
|
|
21
|
+
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union
|
|
22
22
|
from urllib.parse import urlparse
|
|
23
23
|
|
|
24
24
|
import numpy as np
|
|
25
25
|
import pandas as pd
|
|
26
26
|
from odps import ODPS
|
|
27
|
+
from odps import options as odps_options
|
|
27
28
|
|
|
28
29
|
from maxframe.config import options
|
|
29
|
-
from maxframe.core import Entity, TileableGraph, enter_mode
|
|
30
|
+
from maxframe.core import Entity, TileableGraph, build_fetch, enter_mode
|
|
31
|
+
from maxframe.core.operator import Fetch
|
|
30
32
|
from maxframe.dataframe import read_odps_table
|
|
31
33
|
from maxframe.dataframe.core import DATAFRAME_TYPE, SERIES_TYPE
|
|
32
34
|
from maxframe.dataframe.datasource import PandasDataSourceOperator
|
|
@@ -36,11 +38,18 @@ from maxframe.errors import (
|
|
|
36
38
|
NoTaskServerResponseError,
|
|
37
39
|
SessionAlreadyClosedError,
|
|
38
40
|
)
|
|
39
|
-
from maxframe.
|
|
41
|
+
from maxframe.io.objects import get_object_io_handler
|
|
42
|
+
from maxframe.io.odpsio import (
|
|
43
|
+
ODPSTableIO,
|
|
44
|
+
ODPSVolumeWriter,
|
|
45
|
+
pandas_to_arrow,
|
|
46
|
+
pandas_to_odps_schema,
|
|
47
|
+
)
|
|
40
48
|
from maxframe.protocol import (
|
|
41
49
|
DagInfo,
|
|
42
50
|
DagStatus,
|
|
43
51
|
ODPSTableResultInfo,
|
|
52
|
+
ODPSVolumeResultInfo,
|
|
44
53
|
ResultInfo,
|
|
45
54
|
SessionInfo,
|
|
46
55
|
)
|
|
@@ -51,8 +60,15 @@ from maxframe.session import (
|
|
|
51
60
|
Profiling,
|
|
52
61
|
Progress,
|
|
53
62
|
)
|
|
63
|
+
from maxframe.tensor.datasource import ArrayDataSource
|
|
54
64
|
from maxframe.typing_ import TileableType
|
|
55
|
-
from maxframe.utils import
|
|
65
|
+
from maxframe.utils import (
|
|
66
|
+
ToThreadMixin,
|
|
67
|
+
build_session_volume_name,
|
|
68
|
+
build_temp_table_name,
|
|
69
|
+
str_to_bool,
|
|
70
|
+
sync_pyodps_options,
|
|
71
|
+
)
|
|
56
72
|
|
|
57
73
|
from ..clients.framedriver import FrameDriverClient
|
|
58
74
|
from ..fetcher import get_fetcher_cls
|
|
@@ -63,6 +79,43 @@ logger = logging.getLogger(__name__)
|
|
|
63
79
|
|
|
64
80
|
|
|
65
81
|
class MaxFrameServiceCaller(metaclass=abc.ABCMeta):
|
|
82
|
+
def get_settings_to_upload(self) -> Dict[str, Any]:
|
|
83
|
+
sql_settings = (odps_options.sql.settings or {}).copy()
|
|
84
|
+
sql_settings.update(options.sql.settings or {})
|
|
85
|
+
|
|
86
|
+
quota_name = options.session.quota_name or getattr(
|
|
87
|
+
odps_options, "quota_name", None
|
|
88
|
+
)
|
|
89
|
+
lifecycle = options.session.table_lifecycle or odps_options.lifecycle
|
|
90
|
+
temp_lifecycle = (
|
|
91
|
+
options.session.temp_table_lifecycle or odps_options.temp_lifecycle
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
enable_schema = options.session.enable_schema
|
|
95
|
+
default_schema = options.session.default_schema
|
|
96
|
+
if hasattr(self, "_odps_entry"):
|
|
97
|
+
default_schema = default_schema or self._odps_entry.schema
|
|
98
|
+
|
|
99
|
+
# use flags in sql settings
|
|
100
|
+
if sql_settings.get("odps.default.schema"):
|
|
101
|
+
default_schema = sql_settings["odps.default.schema"]
|
|
102
|
+
if str_to_bool(
|
|
103
|
+
sql_settings.get("odps.namespace.schema") or "false"
|
|
104
|
+
) or str_to_bool(
|
|
105
|
+
sql_settings.get("odps.sql.allow.namespace.schema") or "false"
|
|
106
|
+
):
|
|
107
|
+
enable_schema = True
|
|
108
|
+
|
|
109
|
+
mf_settings = dict(options.to_dict(remote_only=True).items())
|
|
110
|
+
mf_settings["sql.settings"] = sql_settings
|
|
111
|
+
mf_settings["session.table_lifecycle"] = lifecycle
|
|
112
|
+
mf_settings["session.temp_table_lifecycle"] = temp_lifecycle
|
|
113
|
+
mf_settings["session.quota_name"] = quota_name
|
|
114
|
+
if enable_schema is not None:
|
|
115
|
+
mf_settings["session.enable_schema"] = enable_schema
|
|
116
|
+
mf_settings["session.default_schema"] = default_schema or "default"
|
|
117
|
+
return mf_settings
|
|
118
|
+
|
|
66
119
|
@abc.abstractmethod
|
|
67
120
|
def create_session(self) -> SessionInfo:
|
|
68
121
|
raise NotImplementedError
|
|
@@ -73,7 +126,10 @@ class MaxFrameServiceCaller(metaclass=abc.ABCMeta):
|
|
|
73
126
|
|
|
74
127
|
@abc.abstractmethod
|
|
75
128
|
def submit_dag(
|
|
76
|
-
self,
|
|
129
|
+
self,
|
|
130
|
+
dag: TileableGraph,
|
|
131
|
+
managed_input_infos: Dict[str, ResultInfo],
|
|
132
|
+
new_settings: Dict[str, Any] = None,
|
|
77
133
|
) -> DagInfo:
|
|
78
134
|
raise NotImplementedError
|
|
79
135
|
|
|
@@ -127,6 +183,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
|
|
|
127
183
|
self._tileable_to_infos = weakref.WeakKeyDictionary()
|
|
128
184
|
|
|
129
185
|
self._caller = self._create_caller(odps_entry, address, **kwargs)
|
|
186
|
+
self._last_settings = None
|
|
130
187
|
|
|
131
188
|
@classmethod
|
|
132
189
|
def _create_caller(
|
|
@@ -136,18 +193,14 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
|
|
|
136
193
|
|
|
137
194
|
async def _init(self, _address: str):
|
|
138
195
|
session_info = await self.ensure_async_call(self._caller.create_session)
|
|
196
|
+
self._last_settings = self._caller.get_settings_to_upload()
|
|
139
197
|
self._session_id = session_info.session_id
|
|
140
198
|
await self._show_logview_address()
|
|
141
199
|
|
|
142
|
-
def
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
or t.inputs
|
|
147
|
-
):
|
|
148
|
-
return None
|
|
149
|
-
|
|
150
|
-
schema, table_meta = pandas_to_odps_schema(t, unknown_as_string=True)
|
|
200
|
+
def _upload_and_get_table_read_tileable(
|
|
201
|
+
self, t: TileableType
|
|
202
|
+
) -> Optional[TileableType]:
|
|
203
|
+
table_schema, table_meta = pandas_to_odps_schema(t, unknown_as_string=True)
|
|
151
204
|
if self._odps_entry.exist_table(table_meta.table_name):
|
|
152
205
|
self._odps_entry.delete_table(
|
|
153
206
|
table_meta.table_name, hints=options.sql.settings
|
|
@@ -155,7 +208,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
|
|
|
155
208
|
table_name = build_temp_table_name(self.session_id, t.key)
|
|
156
209
|
table_obj = self._odps_entry.create_table(
|
|
157
210
|
table_name,
|
|
158
|
-
|
|
211
|
+
table_schema,
|
|
159
212
|
lifecycle=options.session.temp_table_lifecycle,
|
|
160
213
|
hints=options.sql.settings,
|
|
161
214
|
)
|
|
@@ -193,8 +246,30 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
|
|
|
193
246
|
read_tileable.params = t.params
|
|
194
247
|
return read_tileable.data
|
|
195
248
|
|
|
249
|
+
def _upload_and_get_vol_read_tileable(
|
|
250
|
+
self, t: TileableType
|
|
251
|
+
) -> Optional[TileableType]:
|
|
252
|
+
vol_name = build_session_volume_name(self.session_id)
|
|
253
|
+
writer = ODPSVolumeWriter(self._odps_entry, vol_name, t.key)
|
|
254
|
+
io_handler = get_object_io_handler(t)
|
|
255
|
+
io_handler().write_object(writer, t, t.op.data)
|
|
256
|
+
return build_fetch(t).data
|
|
257
|
+
|
|
258
|
+
def _upload_and_get_read_tileable(self, t: TileableType) -> Optional[TileableType]:
|
|
259
|
+
if (
|
|
260
|
+
not isinstance(t.op, (ArrayDataSource, PandasDataSourceOperator))
|
|
261
|
+
or t.op.get_data() is None
|
|
262
|
+
or t.inputs
|
|
263
|
+
):
|
|
264
|
+
return None
|
|
265
|
+
with sync_pyodps_options():
|
|
266
|
+
if isinstance(t.op, PandasDataSourceOperator):
|
|
267
|
+
return self._upload_and_get_table_read_tileable(t)
|
|
268
|
+
else:
|
|
269
|
+
return self._upload_and_get_vol_read_tileable(t)
|
|
270
|
+
|
|
196
271
|
@enter_mode(kernel=True, build=True)
|
|
197
|
-
def
|
|
272
|
+
def _scan_and_replace_local_sources(
|
|
198
273
|
self, graph: TileableGraph
|
|
199
274
|
) -> Dict[TileableType, TileableType]:
|
|
200
275
|
"""Replaces Pandas data sources with temp table sources in the graph"""
|
|
@@ -215,7 +290,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
|
|
|
215
290
|
|
|
216
291
|
for succ in successors:
|
|
217
292
|
graph.add_edge(replaced, succ)
|
|
218
|
-
succ.
|
|
293
|
+
succ.op._set_inputs([replacements.get(t, t) for t in succ.inputs])
|
|
219
294
|
|
|
220
295
|
graph.results = [replacements.get(t, t) for t in graph.results]
|
|
221
296
|
return replacements
|
|
@@ -223,16 +298,41 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
|
|
|
223
298
|
@enter_mode(kernel=True, build=True)
|
|
224
299
|
def _get_input_infos(self, tileables: List[TileableType]) -> Dict[str, ResultInfo]:
|
|
225
300
|
"""Generate ResultInfo structs from generated temp tables"""
|
|
301
|
+
vol_name = build_session_volume_name(self.session_id)
|
|
302
|
+
|
|
226
303
|
infos = dict()
|
|
227
304
|
for t in tileables:
|
|
228
305
|
key = t.key
|
|
229
|
-
if
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
306
|
+
if isinstance(t.op, DataFrameReadODPSTable):
|
|
307
|
+
infos[key] = ODPSTableResultInfo(full_table_name=t.op.table_name)
|
|
308
|
+
else:
|
|
309
|
+
if isinstance(t.op, Fetch):
|
|
310
|
+
infos[key] = ODPSVolumeResultInfo(
|
|
311
|
+
volume_name=vol_name, volume_path=t.key
|
|
312
|
+
)
|
|
313
|
+
elif t.inputs and isinstance(t.inputs[0].op, DataFrameReadODPSTable):
|
|
314
|
+
t = t.inputs[0]
|
|
315
|
+
infos[key] = ODPSTableResultInfo(full_table_name=t.op.table_name)
|
|
234
316
|
return infos
|
|
235
317
|
|
|
318
|
+
def _get_diff_settings(self) -> Dict[str, Any]:
|
|
319
|
+
new_settings = self._caller.get_settings_to_upload()
|
|
320
|
+
if not self._last_settings: # pragma: no cover
|
|
321
|
+
self._last_settings = new_settings
|
|
322
|
+
return new_settings
|
|
323
|
+
|
|
324
|
+
update = dict()
|
|
325
|
+
for k in new_settings.keys():
|
|
326
|
+
old_item = self._last_settings.get(k)
|
|
327
|
+
new_item = new_settings.get(k)
|
|
328
|
+
try:
|
|
329
|
+
if old_item != new_item:
|
|
330
|
+
update[k] = new_item
|
|
331
|
+
except: # noqa: E722 # nosec # pylint: disable=bare-except
|
|
332
|
+
update[k] = new_item
|
|
333
|
+
self._last_settings = new_settings
|
|
334
|
+
return update
|
|
335
|
+
|
|
236
336
|
async def execute(self, *tileables, **kwargs) -> ExecutionInfo:
|
|
237
337
|
tileables = [
|
|
238
338
|
tileable.data if isinstance(tileable, Entity) else tileable
|
|
@@ -242,7 +342,7 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
|
|
|
242
342
|
tileable_graph, to_execute_tileables = gen_submit_tileable_graph(
|
|
243
343
|
self, tileables, tileable_to_copied
|
|
244
344
|
)
|
|
245
|
-
source_replacements = self.
|
|
345
|
+
source_replacements = self._scan_and_replace_local_sources(tileable_graph)
|
|
246
346
|
|
|
247
347
|
# we need to manage uploaded data sources with refcounting mechanism
|
|
248
348
|
# as nodes in tileable_graph are copied, we need to use original nodes
|
|
@@ -252,7 +352,10 @@ class MaxFrameSession(ToThreadMixin, IsolatedAsyncSession):
|
|
|
252
352
|
|
|
253
353
|
replaced_infos = self._get_input_infos(list(source_replacements.values()))
|
|
254
354
|
dag_info = await self.ensure_async_call(
|
|
255
|
-
self._caller.submit_dag,
|
|
355
|
+
self._caller.submit_dag,
|
|
356
|
+
tileable_graph,
|
|
357
|
+
replaced_infos,
|
|
358
|
+
self._get_diff_settings(),
|
|
256
359
|
)
|
|
257
360
|
|
|
258
361
|
await self._show_logview_address(dag_info.dag_id)
|
|
@@ -462,7 +565,8 @@ class MaxFrameRestCaller(MaxFrameServiceCaller):
|
|
|
462
565
|
_client: FrameDriverClient
|
|
463
566
|
_session_id: Optional[str]
|
|
464
567
|
|
|
465
|
-
def __init__(self, client: FrameDriverClient):
|
|
568
|
+
def __init__(self, odps_entry: ODPS, client: FrameDriverClient):
|
|
569
|
+
self._odps_entry = odps_entry
|
|
466
570
|
self._client = client
|
|
467
571
|
self._session_id = None
|
|
468
572
|
|
|
@@ -475,7 +579,10 @@ class MaxFrameRestCaller(MaxFrameServiceCaller):
|
|
|
475
579
|
await self._client.delete_session(self._session_id)
|
|
476
580
|
|
|
477
581
|
async def submit_dag(
|
|
478
|
-
self,
|
|
582
|
+
self,
|
|
583
|
+
dag: TileableGraph,
|
|
584
|
+
managed_input_infos: Dict[str, ResultInfo] = None,
|
|
585
|
+
new_settings: Dict[str, Any] = None,
|
|
479
586
|
) -> DagInfo:
|
|
480
587
|
return await self._client.submit_dag(self._session_id, dag, managed_input_infos)
|
|
481
588
|
|
|
@@ -515,7 +622,7 @@ class MaxFrameRestSession(MaxFrameSession):
|
|
|
515
622
|
|
|
516
623
|
@classmethod
|
|
517
624
|
def _create_caller(cls, odps_entry: ODPS, address: str, **kwargs):
|
|
518
|
-
return MaxFrameRestCaller(FrameDriverClient(address))
|
|
625
|
+
return MaxFrameRestCaller(odps_entry, FrameDriverClient(address))
|
|
519
626
|
|
|
520
627
|
|
|
521
628
|
def register_session_schemes(overwrite: bool = False):
|
maxframe_client/session/task.py
CHANGED
|
@@ -16,7 +16,7 @@ import base64
|
|
|
16
16
|
import json
|
|
17
17
|
import logging
|
|
18
18
|
import time
|
|
19
|
-
from typing import Dict, List, Optional, Type, Union
|
|
19
|
+
from typing import Any, Dict, List, Optional, Type, Union
|
|
20
20
|
|
|
21
21
|
import msgpack
|
|
22
22
|
from odps import ODPS
|
|
@@ -24,6 +24,12 @@ from odps import options as odps_options
|
|
|
24
24
|
from odps.errors import parse_instance_error
|
|
25
25
|
from odps.models import Instance, MaxFrameTask
|
|
26
26
|
|
|
27
|
+
try:
|
|
28
|
+
from odps.errors import EmptyTaskInfoError
|
|
29
|
+
except ImportError: # pragma: no cover
|
|
30
|
+
# todo remove when pyodps>=0.12.0 is enforced
|
|
31
|
+
EmptyTaskInfoError = type("EmptyTaskInfoError", (Exception,), {})
|
|
32
|
+
|
|
27
33
|
from maxframe.config import options
|
|
28
34
|
from maxframe.core import TileableGraph
|
|
29
35
|
from maxframe.errors import NoTaskServerResponseError, SessionAlreadyClosedError
|
|
@@ -36,6 +42,7 @@ except ImportError:
|
|
|
36
42
|
mf_version = None
|
|
37
43
|
|
|
38
44
|
from .consts import (
|
|
45
|
+
EMPTY_RESPONSE_RETRY_COUNT,
|
|
39
46
|
MAXFRAME_DEFAULT_PROTOCOL,
|
|
40
47
|
MAXFRAME_OUTPUT_JSON_FORMAT,
|
|
41
48
|
MAXFRAME_OUTPUT_MAXFRAME_FORMAT,
|
|
@@ -92,6 +99,10 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
|
|
|
92
99
|
self._nested = True
|
|
93
100
|
self._instance = odps_entry.get_instance(nested_instance_id)
|
|
94
101
|
|
|
102
|
+
@property
|
|
103
|
+
def instance(self):
|
|
104
|
+
return self._instance
|
|
105
|
+
|
|
95
106
|
def _deserial_task_info_result(
|
|
96
107
|
self, content: Union[bytes, str, dict], target_cls: Type[JsonSerializable]
|
|
97
108
|
):
|
|
@@ -125,16 +136,8 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
|
|
|
125
136
|
major_version=self._major_version,
|
|
126
137
|
service_endpoint=self._odps_entry.endpoint,
|
|
127
138
|
)
|
|
128
|
-
|
|
129
|
-
# merge sql options
|
|
130
|
-
sql_settings = (odps_options.sql.settings or {}).copy()
|
|
131
|
-
sql_settings.update(options.sql.settings or {})
|
|
132
|
-
|
|
133
|
-
mf_settings = dict(options.to_dict(remote_only=True).items())
|
|
134
|
-
mf_settings["sql.settings"] = sql_settings
|
|
135
|
-
|
|
136
139
|
mf_opts = {
|
|
137
|
-
"odps.maxframe.settings": json.dumps(
|
|
140
|
+
"odps.maxframe.settings": json.dumps(self.get_settings_to_upload()),
|
|
138
141
|
"odps.maxframe.output_format": self._output_format,
|
|
139
142
|
}
|
|
140
143
|
if mf_version:
|
|
@@ -189,18 +192,39 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
|
|
|
189
192
|
interval = min(max_interval, interval * 2)
|
|
190
193
|
|
|
191
194
|
def _put_task_info(self, method_name: str, json_data: dict):
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
195
|
+
for trial in range(EMPTY_RESPONSE_RETRY_COUNT):
|
|
196
|
+
try:
|
|
197
|
+
return self._instance.put_task_info(
|
|
198
|
+
self._task_name,
|
|
199
|
+
method_name,
|
|
200
|
+
json.dumps(json_data),
|
|
201
|
+
raise_empty=True,
|
|
202
|
+
)
|
|
203
|
+
except TypeError: # pragma: no cover
|
|
204
|
+
# todo remove when pyodps>=0.12.0 is enforced
|
|
205
|
+
resp_data = self._instance.put_task_info(
|
|
206
|
+
self._task_name, method_name, json.dumps(json_data)
|
|
207
|
+
)
|
|
208
|
+
if resp_data:
|
|
209
|
+
return resp_data
|
|
210
|
+
else:
|
|
211
|
+
raise NoTaskServerResponseError(
|
|
212
|
+
f"No response for request {method_name}. "
|
|
213
|
+
f"Instance ID: {self._instance.id}"
|
|
214
|
+
)
|
|
215
|
+
except EmptyTaskInfoError as ex:
|
|
216
|
+
# retry when server returns HTTP 204, which is designed for retry
|
|
217
|
+
if ex.code != 204 or trial >= EMPTY_RESPONSE_RETRY_COUNT - 1:
|
|
218
|
+
raise NoTaskServerResponseError(
|
|
219
|
+
f"No response for request {method_name}. "
|
|
220
|
+
f"Instance ID: {self._instance.id}. "
|
|
221
|
+
f"Request ID: {ex.request_id}"
|
|
222
|
+
) from None
|
|
223
|
+
time.sleep(0.5)
|
|
198
224
|
|
|
199
225
|
def get_session(self) -> SessionInfo:
|
|
200
226
|
req_data = {"output_format": self._output_format}
|
|
201
|
-
serialized = self.
|
|
202
|
-
self._task_name, MAXFRAME_TASK_GET_SESSION_METHOD, json.dumps(req_data)
|
|
203
|
-
)
|
|
227
|
+
serialized = self._put_task_info(MAXFRAME_TASK_GET_SESSION_METHOD, req_data)
|
|
204
228
|
info: SessionInfo = self._deserial_task_info_result(serialized, SessionInfo)
|
|
205
229
|
info.session_id = self._instance.id
|
|
206
230
|
return info
|
|
@@ -217,13 +241,18 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
|
|
|
217
241
|
self,
|
|
218
242
|
dag: TileableGraph,
|
|
219
243
|
managed_input_infos: Optional[Dict[str, ResultInfo]] = None,
|
|
244
|
+
new_settings: Dict[str, Any] = None,
|
|
220
245
|
) -> DagInfo:
|
|
246
|
+
new_settings_value = {
|
|
247
|
+
"odps.maxframe.settings": json.dumps(new_settings),
|
|
248
|
+
}
|
|
221
249
|
req_data = {
|
|
222
250
|
"protocol": MAXFRAME_DEFAULT_PROTOCOL,
|
|
223
251
|
"dag": base64.b64encode(serialize_serializable(dag)).decode(),
|
|
224
252
|
"managed_input_infos": base64.b64encode(
|
|
225
253
|
serialize_serializable(managed_input_infos)
|
|
226
254
|
).decode(),
|
|
255
|
+
"new_settings": json.dumps(new_settings_value),
|
|
227
256
|
"output_format": self._output_format,
|
|
228
257
|
}
|
|
229
258
|
res = self._put_task_info(MAXFRAME_TASK_SUBMIT_DAG_METHOD, req_data)
|
|
@@ -276,7 +305,7 @@ class MaxFrameInstanceCaller(MaxFrameServiceCaller):
|
|
|
276
305
|
class MaxFrameTaskSession(MaxFrameSession):
|
|
277
306
|
schemes = [ODPS_SESSION_INSECURE_SCHEME, ODPS_SESSION_SECURE_SCHEME]
|
|
278
307
|
|
|
279
|
-
|
|
308
|
+
_caller: MaxFrameInstanceCaller
|
|
280
309
|
|
|
281
310
|
@classmethod
|
|
282
311
|
def _create_caller(
|
|
@@ -296,6 +325,15 @@ class MaxFrameTaskSession(MaxFrameSession):
|
|
|
296
325
|
**kwargs,
|
|
297
326
|
)
|
|
298
327
|
|
|
328
|
+
@property
|
|
329
|
+
def closed(self) -> bool:
|
|
330
|
+
if super().closed:
|
|
331
|
+
return True
|
|
332
|
+
if not self._caller or not self._caller.instance:
|
|
333
|
+
# session not initialized yet
|
|
334
|
+
return False
|
|
335
|
+
return self._caller.instance.is_terminated()
|
|
336
|
+
|
|
299
337
|
|
|
300
338
|
def register_session_schemes(overwrite: bool = False):
|
|
301
339
|
MaxFrameTaskSession.register_schemes(overwrite=overwrite)
|
|
@@ -22,7 +22,7 @@ from odps import ODPS
|
|
|
22
22
|
|
|
23
23
|
import maxframe.dataframe as md
|
|
24
24
|
from maxframe.config import options
|
|
25
|
-
from maxframe.odpsio import ODPSTableIO
|
|
25
|
+
from maxframe.io.odpsio import ODPSTableIO
|
|
26
26
|
from maxframe.protocol import ODPSTableResultInfo, ResultType
|
|
27
27
|
from maxframe.tests.utils import tn
|
|
28
28
|
|
|
@@ -137,6 +137,15 @@ def test_simple_run_dataframe(start_mock_session):
|
|
|
137
137
|
assert not odps_entry.exist_table(build_temp_table_name(start_mock_session, key))
|
|
138
138
|
|
|
139
139
|
|
|
140
|
+
def test_run_and_fetch_slice(start_mock_session):
|
|
141
|
+
pd_df = pd.DataFrame(np.random.rand(1000, 5), columns=list("ABCDE"))
|
|
142
|
+
df = md.DataFrame(pd_df)
|
|
143
|
+
result = df.execute()
|
|
144
|
+
|
|
145
|
+
sliced = result.head(10).fetch()
|
|
146
|
+
assert len(sliced) == 10
|
|
147
|
+
|
|
148
|
+
|
|
140
149
|
def test_run_empty_table(start_mock_session):
|
|
141
150
|
odps_entry = ODPS.from_environments()
|
|
142
151
|
|
|
@@ -189,7 +198,7 @@ def test_run_dataframe_from_to_odps_table(start_mock_session):
|
|
|
189
198
|
table_name = build_temp_table_name(start_mock_session, "tmp_save")
|
|
190
199
|
table_obj = odps_entry.get_table(table_name)
|
|
191
200
|
try:
|
|
192
|
-
md.to_odps_table(md.DataFrame(pd_df), table_obj).execute().fetch()
|
|
201
|
+
md.to_odps_table(md.DataFrame(pd_df), table_obj, lifecycle=1).execute().fetch()
|
|
193
202
|
with table_obj.open_reader() as reader:
|
|
194
203
|
result_df = reader.to_pandas()
|
|
195
204
|
assert len(result_df) == 10
|
|
@@ -247,7 +256,22 @@ def test_run_and_fetch_series(start_mock_session):
|
|
|
247
256
|
)
|
|
248
257
|
|
|
249
258
|
|
|
250
|
-
def
|
|
259
|
+
def test_execute_with_tensor(oss_config, start_mock_session):
|
|
260
|
+
pd_df = pd.DataFrame(
|
|
261
|
+
{"angles": [0, 3, 4], "degrees": [360, 180, 360]},
|
|
262
|
+
index=["circle", "triangle", "rectangle"],
|
|
263
|
+
)
|
|
264
|
+
df = md.DataFrame(pd_df)
|
|
265
|
+
|
|
266
|
+
result = (df - [1, 2]).execute().fetch()
|
|
267
|
+
expected = pd_df - [1, 2]
|
|
268
|
+
# TODO: currently the record order in tensor reading from table is the index
|
|
269
|
+
# sorting order
|
|
270
|
+
expected.sort_index(axis=0, inplace=True)
|
|
271
|
+
pd.testing.assert_frame_equal(result, expected, check_like=True)
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def test_run_remote_success(oss_config, start_mock_session):
|
|
251
275
|
def func(a, b):
|
|
252
276
|
return a + b
|
|
253
277
|
|
|
@@ -258,7 +282,7 @@ def test_run_remote_success(start_mock_session):
|
|
|
258
282
|
assert result == 21
|
|
259
283
|
|
|
260
284
|
|
|
261
|
-
def test_run_remote_error(start_mock_session):
|
|
285
|
+
def test_run_remote_error(oss_config, start_mock_session):
|
|
262
286
|
def func():
|
|
263
287
|
raise ValueError
|
|
264
288
|
|