maxframe-1.0.0rc3-cp310-cp310-win_amd64.whl → maxframe-1.0.0rc4-cp310-cp310-win_amd64.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- maxframe/_utils.cp310-win_amd64.pyd +0 -0
- maxframe/codegen.py +1 -0
- maxframe/config/config.py +13 -1
- maxframe/conftest.py +43 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/graph/core.cp310-win_amd64.pyd +0 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/core.py +2 -0
- maxframe/dataframe/datasource/read_odps_query.py +66 -7
- maxframe/dataframe/datasource/read_odps_table.py +3 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
- maxframe/dataframe/datastore/to_odps.py +7 -0
- maxframe/dataframe/extensions/__init__.py +3 -0
- maxframe/dataframe/extensions/flatmap.py +326 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/misc/drop_duplicates.py +18 -1
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/io/odpsio/schema.py +5 -3
- maxframe/io/odpsio/tableio.py +44 -38
- maxframe/io/odpsio/tests/test_schema.py +0 -4
- maxframe/io/odpsio/volumeio.py +9 -3
- maxframe/learn/contrib/__init__.py +2 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/xgboost/classifier.py +3 -3
- maxframe/learn/contrib/xgboost/predict.py +8 -39
- maxframe/learn/contrib/xgboost/train.py +4 -3
- maxframe/lib/mmh3.cp310-win_amd64.pyd +0 -0
- maxframe/opcodes.py +3 -0
- maxframe/protocol.py +6 -1
- maxframe/serialization/core.cp310-win_amd64.pyd +0 -0
- maxframe/session.py +9 -2
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/concatenate.py +23 -20
- maxframe/tensor/merge/vstack.py +5 -1
- maxframe/tensor/misc/transpose.py +1 -1
- maxframe/utils.py +34 -12
- {maxframe-1.0.0rc3.dist-info → maxframe-1.0.0rc4.dist-info}/METADATA +1 -1
- {maxframe-1.0.0rc3.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +57 -52
- {maxframe-1.0.0rc3.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +10 -8
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/odps.py +84 -13
- maxframe_client/session/task.py +58 -20
- maxframe_client/tests/test_session.py +14 -2
- {maxframe-1.0.0rc3.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0
maxframe/_utils.cp310-win_amd64.pyd
Binary file
maxframe/codegen.py
CHANGED

@@ -347,6 +347,7 @@ BUILTIN_ENGINE_SPE = "SPE"
 BUILTIN_ENGINE_MCSQL = "MCSQL"

 FAST_RANGE_INDEX_ENABLED = "codegen.fast_range_index_enabled"
+ROW_NUMBER_WINDOW_INDEX_ENABLED = "codegen.row_number_window_index_enabled"


 class BigDagCodeGenerator(metaclass=abc.ABCMeta):
maxframe/config/config.py
CHANGED

@@ -343,6 +343,9 @@ default_options.register_option("sql.enable_mcqa", True, validator=is_bool, remo
 default_options.register_option(
     "sql.generate_comments", True, validator=is_bool, remote=True
 )
+default_options.register_option(
+    "sql.auto_use_common_image", True, validator=is_bool, remote=True
+)
 default_options.register_option("sql.settings", {}, validator=is_dict, remote=True)

 default_options.register_option("is_production", False, validator=is_bool, remote=True)
@@ -371,13 +374,22 @@ default_options.register_option(
     validator=is_numeric,
     remote=True,
 )
+default_options.register_option(
+    "session.quota_name", None, validator=is_null | is_string, remote=True
+)
+default_options.register_option(
+    "session.enable_schema", None, validator=is_null | is_bool, remote=True
+)
+default_options.register_option(
+    "session.default_schema", None, validator=is_null | is_string, remote=True
+)
 default_options.register_option(
     "session.upload_batch_size",
     _DEFAULT_UPLOAD_BATCH_SIZE,
     validator=is_integer,
 )
 default_options.register_option(
-    "session.table_lifecycle", None, validator=is_null | is_integer
+    "session.table_lifecycle", None, validator=is_null | is_integer, remote=True
 )
 default_options.register_option(
     "session.temp_table_lifecycle",
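The new session options are plain client-side settings, so they can be exercised directly. A minimal sketch, assuming the public maxframe.options entry point; all values below are illustrative, not defaults:

from maxframe import options

options.session.quota_name = "my_quota"        # run jobs under a named quota
options.session.enable_schema = True           # opt in to three-tier schema support
options.session.default_schema = "dev_schema"  # schema for unqualified table names
# table_lifecycle is now registered with remote=True, so the setting is
# forwarded to the MaxFrame service along with the session:
options.session.table_lifecycle = 30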
maxframe/conftest.py
CHANGED

@@ -14,10 +14,11 @@

 import faulthandler
 import os
-from configparser import ConfigParser, NoOptionError
+from configparser import ConfigParser, NoOptionError, NoSectionError

 import pytest
 from odps import ODPS
+from odps.accounts import BearerTokenAccount

 from .config import options

@@ -34,12 +35,23 @@ def test_config():
     return config


-
-
-
-
-
-
+def _get_odps_env(test_config: ConfigParser, section_name: str) -> ODPS:
+    try:
+        access_id = test_config.get(section_name, "access_id")
+    except NoOptionError:
+        access_id = test_config.get("odps", "access_id")
+    try:
+        secret_access_key = test_config.get(section_name, "secret_access_key")
+    except NoOptionError:
+        secret_access_key = test_config.get("odps", "secret_access_key")
+    try:
+        project = test_config.get(section_name, "project")
+    except NoOptionError:
+        project = test_config.get("odps", "project")
+    try:
+        endpoint = test_config.get(section_name, "endpoint")
+    except NoOptionError:
+        endpoint = test_config.get("odps", "endpoint")
     try:
         tunnel_endpoint = test_config.get("odps", "tunnel_endpoint")
     except NoOptionError:
@@ -55,12 +67,31 @@ def odps_envs(test_config):
         ],
     }
     token = entry.get_project().generate_auth_token(policy, "bearer", 5)
+    return ODPS(
+        account=BearerTokenAccount(token, 5),
+        project=project,
+        endpoint=endpoint,
+        tunnel_endpoint=tunnel_endpoint,
+    )
+
+
+@pytest.fixture(scope="session")
+def odps_with_schema(test_config):
+    try:
+        return _get_odps_env(test_config, "odps_with_schema")
+    except NoSectionError:
+        pytest.skip("Need to specify odps_with_schema section in test.conf")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def odps_envs(test_config):
+    entry = _get_odps_env(test_config, "odps")

-    os.environ["ODPS_BEARER_TOKEN"] = token
-    os.environ["ODPS_PROJECT_NAME"] = project
-    os.environ["ODPS_ENDPOINT"] = endpoint
-    if tunnel_endpoint:
-        os.environ["ODPS_TUNNEL_ENDPOINT"] = tunnel_endpoint
+    os.environ["ODPS_BEARER_TOKEN"] = entry.account.token
+    os.environ["ODPS_PROJECT_NAME"] = entry.project
+    os.environ["ODPS_ENDPOINT"] = entry.endpoint
+    if entry.tunnel_endpoint:
+        os.environ["ODPS_TUNNEL_ENDPOINT"] = entry.tunnel_endpoint

     try:
         yield
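The per-section lookup with fallback to the base [odps] section can be illustrated standalone. A sketch with a hypothetical test.conf layout; it mirrors the try/except NoOptionError chain in _get_odps_env rather than calling it:

from configparser import ConfigParser, NoOptionError

config = ConfigParser()
config.read_string("""
[odps]
access_id = base-access-id
secret_access_key = base-secret
project = base_project
endpoint = https://service.example.com/api
[odps_with_schema]
project = schema_project
""")

def get_with_fallback(section: str, key: str) -> str:
    # keys resolve from the named section first, then from [odps]
    try:
        return config.get(section, key)
    except NoOptionError:
        return config.get("odps", key)

assert get_with_fallback("odps_with_schema", "project") == "schema_project"
assert get_with_fallback("odps_with_schema", "endpoint") == "https://service.example.com/api"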
maxframe/core/graph/core.cp310-win_amd64.pyd
Binary file
maxframe/dataframe/arithmetic/docstring.py
CHANGED

@@ -185,7 +185,6 @@ e NaN
 dtype: float64
 """

-# FIXME: https://github.com/aliyun/alibabacloud-odps-maxframe-client/issues/48
 _flex_comp_doc_FRAME = """
 Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
 Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
@@ -291,7 +290,7 @@ C True False

 Compare to a DataFrame of different shape.

->>> other =
+>>> other = md.DataFrame({{'revenue': [300, 250, 100, 150]}},
 ...                      index=['A', 'B', 'C', 'D'])
 >>> other.execute()
    revenue
@@ -306,6 +305,31 @@ A False False
 B False False
 C False True
 D False False
+
+Compare to a MultiIndex by level.
+
+>>> df_multindex = md.DataFrame({{'cost': [250, 150, 100, 150, 300, 220],
+...                               'revenue': [100, 250, 300, 200, 175, 225]}},
+...                             index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
+...                                    ['A', 'B', 'C', 'A', 'B', 'C']])
+>>> df_multindex.execute()
+       cost  revenue
+Q1 A    250      100
+   B    150      250
+   C    100      300
+Q2 A    150      200
+   B    300      175
+   C    220      225
+
+>>> df.le(df_multindex, level=1).execute()
+       cost  revenue
+Q1 A   True     True
+   B   True     True
+   C   True     True
+Q2 A  False     True
+   B   True    False
+   C   True    False
+
 """
maxframe/dataframe/arithmetic/equal.py
CHANGED

@@ -51,6 +51,8 @@ dtype: bool


 @bin_compare_doc("Equal to", equiv="==", series_example=_eq_example)
-def eq(df, other, axis="columns", level=None):
-    op = DataFrameEqual(axis=axis, level=level, lhs=df, rhs=other)
+def eq(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameEqual(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)
maxframe/dataframe/arithmetic/greater.py
CHANGED

@@ -52,6 +52,8 @@ dtype: bool


 @bin_compare_doc("Greater than", equiv=">", series_example=_gt_example)
-def gt(df, other, axis="columns", level=None):
-    op = DataFrameGreater(axis=axis, level=level, lhs=df, rhs=other)
+def gt(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameGreater(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)
maxframe/dataframe/arithmetic/greater_equal.py
CHANGED

@@ -52,6 +52,8 @@ dtype: bool


 @bin_compare_doc("Greater than or equal to", equiv=">=", series_example=_ge_example)
-def ge(df, other, axis="columns", level=None):
-    op = DataFrameGreaterEqual(axis=axis, level=level, lhs=df, rhs=other)
+def ge(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameGreaterEqual(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)
maxframe/dataframe/arithmetic/less.py
CHANGED

@@ -52,6 +52,6 @@ dtype: bool


 @bin_compare_doc("Less than", equiv="<", series_example=_lt_example)
-def lt(df, other, axis="columns", level=None):
-    op = DataFrameLess(axis=axis, level=level, lhs=df, rhs=other)
+def lt(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameLess(axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value)
     return op(df, other)
maxframe/dataframe/arithmetic/less_equal.py
CHANGED

@@ -52,6 +52,8 @@ dtype: bool


 @bin_compare_doc("Less than or equal to", equiv="<=", series_example=_le_example)
-def le(df, other, axis="columns", level=None):
-    op = DataFrameLessEqual(axis=axis, level=level, lhs=df, rhs=other)
+def le(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameLessEqual(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)
maxframe/dataframe/arithmetic/not_equal.py
CHANGED

@@ -51,6 +51,8 @@ dtype: bool


 @bin_compare_doc("Not equal to", equiv="!=", series_example=_ne_example)
-def ne(df, other, axis="columns", level=None):
-    op = DataFrameNotEqual(axis=axis, level=level, lhs=df, rhs=other)
+def ne(df, other, axis="columns", level=None, fill_value=None):
+    op = DataFrameNotEqual(
+        axis=axis, level=level, lhs=df, rhs=other, fill_value=fill_value
+    )
     return op(df, other)
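All six comparison wrappers above gain the same fill_value parameter. A hedged sketch of the intended pandas-style semantics, assuming the usual maxframe.dataframe alias; the data is illustrative:

import numpy as np
import maxframe.dataframe as md

df = md.DataFrame({"a": [1.0, np.nan, 3.0]})
other = md.DataFrame({"a": [1.0, 2.0, np.nan]})

# Following pandas, fill_value substitutes missing values on either side
# before comparing, so NaN slots no longer compare as False across the board.
print(df.eq(other, fill_value=2.0).execute())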
maxframe/dataframe/core.py
CHANGED

@@ -1666,6 +1666,8 @@ class DataFrameData(_BatchedFetcher, BaseDataFrameData):
             raise NotImplementedError

         corner_data = fetch_corner_data(self, session=self._executed_sessions[-1])
+        if corner_data is None:
+            return

         buf = StringIO()
         max_rows = pd.get_option("display.max_rows")
maxframe/dataframe/datasource/read_odps_query.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.

 import dataclasses
+import logging
 import re
 from typing import Dict, List, Optional, Tuple, Union

@@ -22,12 +23,14 @@ from odps import ODPS
 from odps.types import Column, OdpsSchema, validate_data_type

 from ... import opcodes
+from ...config import options
 from ...core import OutputType
 from ...core.graph import DAG
 from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
     AnyField,
     BoolField,
+    DictField,
     FieldTypes,
     Int64Field,
     ListField,
@@ -37,6 +40,10 @@ from ...serialization.serializables import (
 from ..utils import parse_index
 from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource

+logger = logging.getLogger(__name__)
+
+_DEFAULT_ANONYMOUS_COL_PREFIX = "_anon_col_"
+
 _EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
 _EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
 _EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
@@ -46,8 +53,11 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
     r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
     re.MULTILINE,
 )
-_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([
-_ANONYMOUS_COL_REGEX = re.compile(r"^_c\d
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^\n]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+_ANONYMOUS_COL_REGEX = re.compile(r"^_c(\d+)$")
+
+_SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
+_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^\.]+):([^, ]+)")


 @dataclasses.dataclass
@@ -152,7 +162,7 @@ def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
     return TaskSector(job_name, task_name, out_target, schemas)


-def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+def _parse_full_explain(explain_string: str) -> OdpsSchema:
     sectors = _split_explain_string(explain_string)
     jobs_sector = tasks_sector = None

@@ -191,6 +201,25 @@ def _parse_explained_schema(explain_string: str) -> OdpsSchema:
     return OdpsSchema(cols)


+def _parse_simple_explain(explain_string: str) -> OdpsSchema:
+    fields_match = _SIMPLE_SCHEMA_COLS_REGEX.search(explain_string)
+    if not fields_match:
+        raise ValueError("Cannot detect output table schema")
+
+    fields_str = fields_match.group(1)
+    cols = []
+    for field, type_name in _SIMPLE_SCHEMA_COL_REGEX.findall(fields_str):
+        cols.append(Column(field, validate_data_type(type_name)))
+    return OdpsSchema(cols)
+
+
+def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+    if explain_string.startswith("AdhocSink"):
+        return _parse_simple_explain(explain_string)
+    else:
+        return _parse_full_explain(explain_string)
+
+
 class DataFrameReadODPSQuery(
     IncrementalIndexDatasource,
     ColumnPruneSupportedDataSourceMixin,
@@ -205,6 +234,7 @@ class DataFrameReadODPSQuery(
     string_as_binary = BoolField("string_as_binary", default=None)
     index_columns = ListField("index_columns", FieldTypes.string, default=None)
     index_dtypes = SeriesField("index_dtypes", default=None)
+    column_renames = DictField("column_renames", default=None)

     def get_columns(self):
         return self.columns
@@ -246,6 +276,8 @@ def read_odps_query(
     odps_entry: ODPS = None,
     index_col: Union[None, str, List[str]] = None,
     string_as_binary: bool = None,
+    sql_hints: Dict[str, str] = None,
+    anonymous_col_prefix: str = _DEFAULT_ANONYMOUS_COL_PREFIX,
     **kw,
 ):
     """
@@ -260,25 +292,51 @@ def read_odps_query(
         MaxCompute SQL statement.
     index_col: Union[None, str, List[str]]
         Columns to be specified as indexes.
+    string_as_binary: bool, optional
+        Whether to convert string columns to binary.
+    sql_hints: Dict[str, str], optional
+        User specified SQL hints.
+    anonymous_col_prefix: str, optional
+        Prefix for anonymous columns, '_anon_col_' by default.

     Returns
     -------
     result: DataFrame
         DataFrame read from MaxCompute (ODPS) table
     """
+    hints = options.sql.settings.copy() or {}
+    if sql_hints:
+        hints.update(sql_hints)
+
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+
+    if options.session.enable_schema or odps_entry.is_schema_namespace_enabled():
+        hints["odps.namespace.schema"] = "true"
+        hints["odps.sql.allow.namespace.schema"] = "true"
+
+    # fixme workaround for multi-stage split process
+    hints["odps.sql.object.table.split.by.object.size.enabled"] = "false"
+
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
-    inst = odps_entry.execute_sql(f"EXPLAIN {query}")
+    inst = odps_entry.execute_sql(f"EXPLAIN {query}", hints=hints)
+    logger.debug("Explain instance ID: %s", inst.id)
     explain_str = list(inst.get_task_results().values())[0]

     odps_schema = _parse_explained_schema(explain_str)

+    new_columns = []
+    col_renames = {}
     for col in odps_schema.columns:
-
-
+        anon_match = _ANONYMOUS_COL_REGEX.match(col.name)
+        if anon_match and col.name not in query:
+            new_name = anonymous_col_prefix + anon_match.group(1)
+            col_renames[col.name] = new_name
+            new_columns.append(Column(new_name, col.type))
+        else:
+            new_columns.append(col)

-    dtypes = odps_schema_to_pandas_dtypes(
+    dtypes = odps_schema_to_pandas_dtypes(OdpsSchema(new_columns))

     if not index_col:
         index_dtypes = None
@@ -301,5 +359,6 @@ def read_odps_query(
         string_as_binary=string_as_binary,
         index_columns=index_col,
         index_dtypes=index_dtypes,
+        column_renames=col_renames,
     )
     return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
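Taken together, these changes let callers pass per-query hints and control how unnamed expression columns are surfaced. A usage sketch; the table name and hint value are placeholders:

import maxframe.dataframe as md

df = md.read_odps_query(
    "SELECT col1, col2 + col3 FROM my_project.my_table",
    sql_hints={"odps.sql.type.system.odps2": "true"},
    anonymous_col_prefix="_expr_",
)
# EXPLAIN reports unnamed expressions like `col2 + col3` with anonymous names
# such as `_c1`; since that name does not occur in the query text, the rename
# logic above turns it into `_expr_1` and records it in column_renames.
print(df.dtypes)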
maxframe/dataframe/datasource/read_odps_table.py
CHANGED

@@ -22,6 +22,7 @@ from odps.models import Table
 from odps.utils import to_timestamp

 from ... import opcodes
+from ...config import options
 from ...core import OutputType
 from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
@@ -167,12 +168,13 @@ def read_odps_table(
         DataFrame read from MaxCompute (ODPS) table
     """
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    schema = options.session.default_schema or odps_entry.schema
     if odps_entry is None:
         raise ValueError("Missing odps_entry parameter")
     if isinstance(table_name, Table):
         table = table_name
     else:
-        table = odps_entry.get_table(table_name)
+        table = odps_entry.get_table(table_name, schema=schema)

     if not table.table_schema.partitions and (
         partitions is not None or append_partitions
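With schema support enabled, read_odps_table now resolves bare table names against the configured default schema. A sketch with placeholder names:

from maxframe import options
import maxframe.dataframe as md

options.session.default_schema = "my_schema"
df = md.read_odps_table("my_table")  # looked up as my_schema.my_table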
maxframe/dataframe/datasource/tests/test_datasource.py
CHANGED

@@ -19,6 +19,7 @@ import numpy as np
 import pandas as pd
 import pytest
 from odps import ODPS
+from odps import types as odps_types

 from .... import tensor as mt
 from ....core import OutputType
@@ -35,7 +36,7 @@ from ..from_tensor import (
 )
 from ..index import from_pandas as from_pandas_index
 from ..index import from_tileable
-from ..read_odps_query import ColumnSchema, _resolve_task_sector
+from ..read_odps_query import ColumnSchema, _parse_simple_explain, _resolve_task_sector
 from ..series import from_pandas as from_pandas_series

 ray = lazy_import("ray")
@@ -329,10 +330,6 @@ def test_from_odps_query():
         read_odps_query(f"CREATE TABLE dummy_table AS SELECT * FROM {table1_name}")
     assert "instant query" in err_info.value.args[0]

-    with pytest.raises(ValueError) as err_info:
-        read_odps_query(f"SELECT col1, col2 + col3 FROM {table1_name}")
-    assert "names" in err_info.value.args[0]
-
     query1 = f"SELECT * FROM {table1_name} WHERE col1 > 10"
     df = read_odps_query(query1)
     assert df.op.query == query1
@@ -401,7 +398,9 @@ def test_date_range():


 def test_resolve_task_sector():
-    input_path = os.path.join(
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-full.txt"
+    )
     with open(input_path, "r") as f:
         sector = f.read()
     actual_sector = _resolve_task_sector("job0", sector)
@@ -413,3 +412,33 @@ def test_resolve_task_sector():
     assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
     assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
     assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
+
+
+def test_resolve_task_odps2():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-odps2.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+    actual_sector = _resolve_task_sector("job0", sector)
+
+    assert actual_sector.job_name == "job0"
+    assert actual_sector.task_name == "M1"
+    assert actual_sector.output_target == "Screen"
+    assert len(actual_sector.schema) == 2
+    assert actual_sector.schema[0] == ColumnSchema("key", "varchar(2048)", "")
+    assert actual_sector.schema[1] == ColumnSchema("data", "binary", "")
+
+
+def test_resolve_simple_explain():
+    input_path = os.path.join(
+        os.path.dirname(__file__), "test-data", "task-input-simple.txt"
+    )
+    with open(input_path, "r") as f:
+        sector = f.read()
+
+    schema = _parse_simple_explain(sector)
+    assert schema.columns[0].name == "memberid"
+    assert schema.columns[0].type == odps_types.string
+    assert schema.columns[1].name == "createdate"
+    assert schema.columns[1].type == odps_types.bigint
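The shape of input the new simple-explain path expects can be shown against the two regexes added to read_odps_query.py. The explain text below is fabricated for illustration; only the column-list format matters:

import re

_SIMPLE_SCHEMA_COLS_REGEX = re.compile(r"SELECT (([^:]+:[^, ]+[, ]*)+)FROM")
_SIMPLE_SCHEMA_COL_REGEX = re.compile(r"([^\.]+):([^, ]+)")

explain = "AdhocSink: SELECT t.memberid:string, t.createdate:bigint FROM ..."
cols_match = _SIMPLE_SCHEMA_COLS_REGEX.search(explain)
assert cols_match is not None
print(_SIMPLE_SCHEMA_COL_REGEX.findall(cols_match.group(1)))
# -> [('memberid', 'string'), ('createdate', 'bigint')]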
maxframe/dataframe/datastore/to_odps.py
CHANGED

@@ -17,6 +17,7 @@
 import logging
 from typing import List, Optional, Union

+from odps import ODPS
 from odps.models import Table as ODPSTable
 from odps.types import PartitionSpec

@@ -136,8 +137,14 @@ def to_odps_table(
     --------

     """
+    odps_entry = ODPS.from_global() or ODPS.from_environments()
     if isinstance(table, ODPSTable):
         table = table.full_table_name
+    elif options.session.enable_schema and "." not in table:
+        default_schema = (
+            options.session.default_schema or odps_entry.schema or "default"
+        )
+        table = default_schema + "." + table

     if isinstance(index_label, str):
         index_label = [index_label]
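The schema-qualification rule added to to_odps_table is small enough to restate standalone. A sketch reimplementing just that branch for illustration:

from typing import Optional

def qualify_table_name(table: str, enable_schema: bool, default_schema: Optional[str]) -> str:
    # mirrors the new elif branch: only bare names get a schema prefix
    if enable_schema and "." not in table:
        return (default_schema or "default") + "." + table
    return table

assert qualify_table_name("t1", True, None) == "default.t1"
assert qualify_table_name("proj.t1", True, "dev") == "proj.t1"
assert qualify_table_name("t1", False, "dev") == "t1"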
maxframe/dataframe/extensions/__init__.py
CHANGED

@@ -18,6 +18,7 @@ from .accessor import (
     IndexMaxFrameAccessor,
     SeriesMaxFrameAccessor,
 )
+from .flatmap import df_flatmap, series_flatmap
 from .reshuffle import DataFrameReshuffle, df_reshuffle


@@ -25,6 +26,8 @@ def _install():
     from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE

     DataFrameMaxFrameAccessor._register("reshuffle", df_reshuffle)
+    DataFrameMaxFrameAccessor._register("flatmap", df_flatmap)
+    SeriesMaxFrameAccessor._register("flatmap", series_flatmap)

     if DataFrameMaxFrameAccessor._api_count:
         for t in DATAFRAME_TYPE: