maxframe 2.0.0b1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl → 2.0.0b2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
- maxframe/dataframe/datasource/read_odps_query.py +76 -16
- maxframe/dataframe/datasource/tests/test_datasource.py +47 -1
- maxframe/io/objects/tests/test_object_io.py +4 -2
- maxframe/io/odpsio/tests/test_volumeio.py +4 -15
- maxframe/io/odpsio/volumeio.py +23 -8
- maxframe/learn/contrib/xgboost/core.py +1 -0
- maxframe/tests/utils.py +5 -13
- {maxframe-2.0.0b1.dist-info → maxframe-2.0.0b2.dist-info}/METADATA +1 -1
- {maxframe-2.0.0b1.dist-info → maxframe-2.0.0b2.dist-info}/RECORD +882 -882
- {maxframe-2.0.0b1.dist-info → maxframe-2.0.0b2.dist-info}/WHEEL +0 -0
- {maxframe-2.0.0b1.dist-info → maxframe-2.0.0b2.dist-info}/top_level.txt +0 -0
maxframe/dataframe/datasource/read_odps_query.py
CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import dataclasses
+import functools
 import io
 import logging
 import re
@@ -22,6 +23,8 @@ from typing import Dict, List, MutableMapping, Optional, Tuple, Union
 import numpy as np
 import pandas as pd
 from odps import ODPS
+from odps.errors import ODPSError
+from odps.models import TableSchema
 from odps.types import Column, OdpsSchema, validate_data_type
 from odps.utils import split_sql_by_semicolon
 
@@ -245,13 +248,18 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
     return _parse_full_explain(explain_string)
 
 
-def _build_explain_sql(sql_stmt: str, no_split: bool = False) -> str:
+def _build_explain_sql(
+    sql_stmt: str, no_split: bool = False, use_output: bool = False
+) -> str:
+    clause = "EXPLAIN "
+    if use_output:
+        clause += "OUTPUT "
     if no_split:
-        return "EXPLAIN " + sql_stmt
+        return clause + sql_stmt
     sql_parts = split_sql_by_semicolon(sql_stmt)
     if not sql_parts:
         raise ValueError(f"Cannot explain SQL statement {sql_stmt}")
-    sql_parts[-1] = "EXPLAIN " + sql_parts[-1]
+    sql_parts[-1] = clause + sql_parts[-1]
     return "\n".join(sql_parts)
 
 
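Taken together, _build_explain_sql now prepends either "EXPLAIN " or "EXPLAIN OUTPUT "
to the last statement of the script. A minimal sketch of the rewriting (illustrative,
not part of the patch):

    _build_explain_sql("SELECT a FROM src")                   # -> "EXPLAIN SELECT a FROM src"
    _build_explain_sql("SELECT a FROM src", use_output=True)  # -> "EXPLAIN OUTPUT SELECT a FROM src"
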
@@ -332,6 +340,62 @@ def _check_token_in_sql(token: str, sql: str) -> bool:
     return False
 
 
+def _resolve_schema_by_explain(
+    odps_entry: ODPS,
+    query: str,
+    no_split_sql: bool = False,
+    hints: Dict[str, str] = None,
+    use_explain_output: bool = True,
+) -> OdpsSchema:
+    hints = (hints or dict()).copy()
+    hints["odps.sql.select.output.format"] = "json"
+    explain_stmt = _build_explain_sql(
+        query, no_split=no_split_sql, use_output=use_explain_output
+    )
+    inst = odps_entry.execute_sql(explain_stmt, hints=hints)
+    logger.debug("Explain output instance ID: %s", inst.id)
+    explain_str = list(inst.get_task_results().values())[0]
+    if use_explain_output:
+        if not explain_str or "nothing to explain" in explain_str:
+            raise ValueError("The SQL statement should be an instant query")
+        return TableSchema.parse(None, explain_str)
+    else:
+        return _parse_explained_schema(explain_str)
+
+
+def _resolve_query_schema(
+    odps_entry: ODPS,
+    query: str,
+    no_split_sql: bool = False,
+    hints: Dict[str, str] = None,
+    use_explain_output: Optional[bool] = None,
+) -> OdpsSchema:
+    methods = []
+    if use_explain_output is not False:
+        # None or True
+        methods.append(_resolve_schema_by_explain)
+    if use_explain_output is not True:
+        # None or False
+        methods.append(
+            functools.partial(_resolve_schema_by_explain, use_explain_output=False)
+        )
+    for idx, resolve_method in enumerate(methods):
+        try:
+            return resolve_method(
+                odps_entry, query, no_split_sql=no_split_sql, hints=hints
+            )
+        except ODPSError as ex:
+            msg = (
+                f"Failed to obtain schema from SQL explain: {ex!r}\n"
+                f"Explain instance ID: {ex.instance_id}"
+            )
+            if idx + 1 == len(methods) or "ODPS-0130161" not in str(ex):
+                exc = ValueError(msg)
+                raise exc.with_traceback(ex.__traceback__) from None
+    # will this happen?
+    raise ValueError("Failed to obtain schema from SQL explain")  # pragma: no cover
+
+
 def read_odps_query(
     query: str,
     odps_entry: ODPS = None,
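The new resolver tries EXPLAIN OUTPUT first and retries with a plain EXPLAIN only when
the service rejects the statement with ODPS-0130161 (a SQL parse error); any other
failure, or a failure of the last method, is re-raised as a ValueError carrying the
explain instance ID. A hedged usage sketch, assuming o is a configured ODPS entry:

    schema = _resolve_query_schema(o, "SELECT 1 AS a")                            # try both in order
    schema = _resolve_query_schema(o, "SELECT 1 AS a", use_explain_output=True)   # EXPLAIN OUTPUT only
    schema = _resolve_query_schema(o, "SELECT 1 AS a", use_explain_output=False)  # plain EXPLAIN only
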
@@ -371,6 +435,8 @@ def read_odps_query(
     DataFrame read from MaxCompute (ODPS) table
     """
     no_split_sql = kw.pop("no_split_sql", False)
+    # if use_explain_output is None, will try two methods.
+    use_explain_output = kw.pop("use_explain_output", None)
 
     hints = options.sql.settings.copy() or {}
     if sql_hints:
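The flag travels through **kw, so callers opt in without a signature change. A usage
sketch, assuming credentials come from environment variables and that read_odps_query
is reachable through the usual maxframe.dataframe alias:

    import maxframe.dataframe as md
    from odps import ODPS

    o = ODPS.from_environments()
    df = md.read_odps_query(
        "SELECT a, b FROM src_table",
        odps_entry=o,
        use_explain_output=None,  # default: EXPLAIN OUTPUT, then fall back to EXPLAIN
    )
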
@@ -395,19 +461,13 @@
 
     col_renames = {}
     if not skip_schema:
-
-
-
-
-
-
-
-        except BaseException as ex:
-            exc = ValueError(
-                f"Failed to obtain schema from SQL explain: {ex!r}"
-                f"\nExplain instance ID: {inst.id}"
-            )
-            raise exc.with_traceback(ex.__traceback__) from None
+        odps_schema = _resolve_query_schema(
+            odps_entry,
+            query,
+            no_split_sql=no_split_sql,
+            hints=hints,
+            use_explain_output=use_explain_output,
+        )
 
     new_columns = []
     for col in odps_schema.columns:
maxframe/dataframe/datasource/tests/test_datasource.py
CHANGED

@@ -17,11 +17,13 @@ import uuid
 from collections import OrderedDict
 from math import isinf
 
+import mock
 import numpy as np
 import pandas as pd
 import pytest
 from odps import ODPS
 from odps import types as odps_types
+from odps.errors import ODPSError
 
 from .... import tensor as mt
 from ....core import OutputType
@@ -50,6 +52,7 @@ from ..read_odps_query import (
     ColumnSchema,
     _parse_full_explain,
     _parse_simple_explain,
+    _resolve_query_schema,
     _resolve_task_sector,
 )
 from ..series import from_pandas as from_pandas_series
@@ -360,7 +363,7 @@ def test_from_odps_query():
 
     with pytest.raises(ValueError) as err_info:
         read_odps_query(
-            f"CREATE TABLE dummy_table_{uuid.uuid4().hex} "
+            f"CREATE TABLE dummy_table_{uuid.uuid4().hex} LIFECYCLE 1 "
             f"AS SELECT * FROM {table1_name}"
         )
     assert "instant query" in err_info.value.args[0]
@@ -578,3 +581,46 @@ def test_resolve_break_lines():
     for col, (exp_nm, exp_tp) in zip(schema.columns, expected_col_types.items()):
         assert col.name == exp_nm
         assert col.type == odps_types.validate_data_type(exp_tp)
+
+
+@pytest.mark.parametrize("use_explain_output", [None, False, True])
+def test_explain_use_explain_output(use_explain_output):
+    class MockInstance:
+        @property
+        def id(self):
+            return "mock_id"
+
+        def get_task_results(self):
+            return {"pot": """{"columns":[{"name":"a_bigint","type":"BIGINT"}]}"""}
+
+    old_execute_sql = ODPS.execute_sql
+    exec_count = 0
+
+    def new_execute_sql(self, sql, *args, **kw):
+        nonlocal exec_count
+        exec_count += 1
+
+        if use_explain_output and sql.lower().startswith("explain output select"):
+            return MockInstance()
+        elif use_explain_output is None and sql.lower().startswith("explain output"):
+            raise ODPSError("ODPS-0130161: mock error")
+        return old_execute_sql(self, sql, *args, **kw)
+
+    odps_entry = ODPS.from_environments()
+
+    with mock.patch("odps.core.ODPS.execute_sql", new=new_execute_sql):
+        with pytest.raises(ValueError):
+            _resolve_query_schema(
+                odps_entry, "not_a_sql", use_explain_output=use_explain_output
+            )
+        assert exec_count == (2 if use_explain_output is None else 1)
+
+        exec_count = 0
+        schema = _resolve_query_schema(
+            odps_entry,
+            "select cast(1 as bigint) as a_bigint",
+            use_explain_output=use_explain_output,
+        )
+        assert schema.columns[0].name == "a_bigint"
+        assert schema.columns[0].type == odps_types.bigint
+        assert exec_count == (2 if use_explain_output is None else 1)
maxframe/io/objects/tests/test_object_io.py
CHANGED

@@ -19,7 +19,7 @@ from odps import ODPS
 from ....core import OutputType
 from ....core.operator import ObjectOperatorMixin, Operator
 from ....tensor.datasource import ArrayDataSource
-from ....tests.utils import create_test_volume, tn
+from ....tests.utils import create_test_volume, get_test_unique_name, tn
 from ...odpsio import ODPSVolumeReader, ODPSVolumeWriter
 from ..core import get_object_io_handler
 
@@ -32,7 +32,9 @@ class TestObjectOp(Operator, ObjectOperatorMixin):
 
 @pytest.fixture(scope="module")
 def create_volume(oss_config):
-    with create_test_volume(
+    with create_test_volume(
+        tn("test_object_io_vol_" + get_test_unique_name(5)), oss_config
+    ) as test_vol_name:
         yield test_vol_name
 
 
maxframe/io/odpsio/tests/test_volumeio.py
CHANGED

@@ -17,13 +17,13 @@ import contextlib
 import pytest
 from odps import ODPS
 
-from ....tests.utils import create_test_volume, tn
+from ....tests.utils import create_test_volume, get_test_unique_name, tn
 from ..volumeio import ODPSVolumeReader, ODPSVolumeWriter
 
 
 @pytest.fixture
 def create_volume(request, oss_config):
-    test_vol_name = tn("test_vol_name_" + request.param)
+    test_vol_name = tn(f"test_vol_name_{get_test_unique_name(5)}_" + request.param)
     odps_entry = ODPS.from_environments()
 
     @contextlib.contextmanager
@@ -41,24 +41,13 @@ def create_volume(request, oss_config):
     except BaseException:
         pass
 
-    oss_test_dir_name = None
     if request.param == "parted":
         ctx = create_parted_volume()
     else:
         ctx = create_test_volume(test_vol_name, oss_config)
 
-
-
-        yield test_vol_name
-    finally:
-        if oss_test_dir_name is not None:
-            import oss2
-
-            keys = [
-                obj.key
-                for obj in oss2.ObjectIterator(oss_config.oss_bucket, oss_test_dir_name)
-            ]
-            oss_config.oss_bucket.batch_delete_objects(keys)
+    with ctx:
+        yield test_vol_name
 
 
 @pytest.mark.parametrize("create_volume", ["external"], indirect=True)
maxframe/io/odpsio/volumeio.py
CHANGED
@@ -14,7 +14,9 @@
 
 import inspect
 from typing import Iterator, List, Optional, Union
+from urllib.parse import urlparse
 
+import requests
 from odps import ODPS
 from odps import __version__ as pyodps_version
 
@@ -74,14 +76,27 @@ class ODPSVolumeWriter:
         self._replace_internal_host = replace_internal_host
 
     def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
-
-
-
-
-
-
+        sign_url = self._volume.get_sign_url(
+            self._volume_dir + "/" + file_name,
+            method="PUT",
+            seconds=3600,
+        )
+        if self._replace_internal_host:
+            parsed_url = urlparse(sign_url)
+            if "-internal." in parsed_url.netloc:
+                new_netloc = parsed_url.netloc.replace("-internal.", ".")
+                sign_url = sign_url.replace(parsed_url.netloc, new_netloc)
+
+        def _to_bytes(d):
+            if not isinstance(d, (bytes, bytearray)):
+                return bytes(d)
+            return d
+
+        def data_func():
             if not inspect.isgenerator(data):
-
+                yield _to_bytes(data)
             else:
                 for chunk in data:
-
+                    yield _to_bytes(chunk)
+
+        requests.put(sign_url, data=data_func())
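write_file now uploads through a pre-signed PUT URL instead of driving the volume API
directly: it signs the target path, optionally rewrites "-internal." endpoints to their
public form, normalizes the payload to bytes, and streams it with requests. A standalone
sketch of the same pattern (the URL and chunks are placeholders; the raise_for_status
check is an addition of the sketch, not of the patch):

    import requests

    def put_via_sign_url(sign_url: str, chunks) -> None:
        # requests streams an iterable body using chunked transfer encoding
        resp = requests.put(sign_url, data=iter(chunks))
        resp.raise_for_status()
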
maxframe/tests/utils.py
CHANGED
@@ -16,6 +16,7 @@ import asyncio
 import contextlib
 import functools
 import hashlib
+import logging
 import os
 import queue
 import socket
@@ -191,14 +192,8 @@ def assert_mf_index_dtype(idx_obj, dtype):
 
 @contextlib.contextmanager
 def create_test_volume(vol_name, oss_config):
-    test_vol_name = vol_name
     odps_entry = ODPS.from_environments()
 
-    try:
-        odps_entry.delete_volume(test_vol_name, auto_remove_dir=True, recursive=True)
-    except:
-        pass
-
     oss_test_dir_name = "test_dir_" + vol_name
     if oss_config is None:
         pytest.skip("Need oss and its config to run this test")
@@ -232,17 +227,14 @@
     rolearn = oss_config.oss_rolearn
 
     oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
-    odps_entry.create_external_volume(
-        test_vol_name, location=test_location, rolearn=rolearn
-    )
+    odps_entry.create_external_volume(vol_name, location=test_location, rolearn=rolearn)
 
     try:
-        yield
+        yield vol_name
     finally:
         try:
-            odps_entry.delete_volume(
-                test_vol_name, auto_remove_dir=True, recursive=True
-            )
+            logging.warning("Deleting test volume %s", vol_name)
+            odps_entry.delete_volume(vol_name, auto_remove_dir=True, recursive=True)
         except:
             pass
 
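With this refactor, create_test_volume owns both creation and deletion and yields the
volume name, so test fixtures reduce to a single with-block. A minimal usage sketch
(the fixture wiring and the name prefix are assumptions):

    from maxframe.tests.utils import create_test_volume, get_test_unique_name, tn

    def my_volume_fixture(oss_config):
        with create_test_volume(tn("test_vol_" + get_test_unique_name(5)), oss_config) as vol_name:
            yield vol_name  # the volume is deleted on exit, even if the test fails
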