maxframe-0.1.0b1-cp39-cp39-macosx_10_9_x86_64.whl → maxframe-0.1.0b3-cp39-cp39-macosx_10_9_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maxframe/_utils.cpython-39-darwin.so +0 -0
- maxframe/codegen.py +88 -19
- maxframe/config/config.py +10 -0
- maxframe/core/entity/executable.py +1 -0
- maxframe/core/entity/objects.py +3 -2
- maxframe/core/graph/core.cpython-39-darwin.so +0 -0
- maxframe/core/graph/core.pyx +2 -2
- maxframe/core/operator/base.py +14 -0
- maxframe/dataframe/__init__.py +3 -1
- maxframe/dataframe/datasource/from_records.py +4 -0
- maxframe/dataframe/datasource/read_odps_query.py +295 -0
- maxframe/dataframe/datasource/read_odps_table.py +1 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +84 -1
- maxframe/dataframe/groupby/__init__.py +4 -0
- maxframe/dataframe/groupby/core.py +5 -0
- maxframe/dataframe/misc/to_numeric.py +4 -0
- maxframe/dataframe/window/aggregation.py +1 -24
- maxframe/dataframe/window/ewm.py +0 -7
- maxframe/dataframe/window/tests/test_ewm.py +0 -6
- maxframe/errors.py +21 -0
- maxframe/lib/aio/isolation.py +6 -1
- maxframe/lib/mmh3.cpython-39-darwin.so +0 -0
- maxframe/opcodes.py +1 -0
- maxframe/protocol.py +25 -5
- maxframe/serialization/core.cpython-39-darwin.so +0 -0
- maxframe/serialization/exception.py +2 -1
- maxframe/serialization/serializables/core.py +6 -1
- maxframe/serialization/serializables/field.py +2 -0
- maxframe/tensor/core.py +3 -3
- maxframe/tests/test_codegen.py +69 -0
- maxframe/tests/test_protocol.py +16 -8
- maxframe/tests/utils.py +1 -0
- maxframe/udf.py +15 -16
- maxframe/utils.py +21 -1
- {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/METADATA +1 -74
- {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/RECORD +42 -39
- {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +7 -7
- maxframe_client/session/task.py +31 -3
- maxframe_client/session/tests/test_task.py +29 -11
- maxframe_client/tests/test_session.py +2 -0
- {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/top_level.txt +0 -0
maxframe/_utils.cpython-39-darwin.so
CHANGED
Binary file
maxframe/codegen.py
CHANGED

@@ -17,7 +17,7 @@ import base64
 import dataclasses
 import logging
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
 
 from odps.types import OdpsSchema
 from odps.utils import camel_to_underline
@@ -30,6 +30,7 @@ from .odpsio import build_dataframe_table_meta
 from .odpsio.schema import pandas_to_odps_schema
 from .protocol import DataFrameTableMeta, ResultInfo
 from .serialization import PickleContainer
+from .serialization.serializables import Serializable, StringField
 from .typing_ import PandasObjectTypes
 from .udf import MarkedFunction
 
@@ -48,8 +49,11 @@ class CodeGenResult:
     constants: Dict[str, Any]
 
 
-class AbstractUDF(abc.ABC):
-    _session_id: str
+class AbstractUDF(Serializable):
+    _session_id: str = StringField("session_id")
+
+    def __init__(self, session_id: Optional[str] = None, **kw):
+        super().__init__(_session_id=session_id, **kw)
 
     @property
     def name(self) -> str:
@@ -74,7 +78,66 @@ class AbstractUDF(abc.ABC):
 
 class UserCodeMixin:
     @classmethod
-    def
+    def obj_to_python_expr(cls, obj: Any = None) -> str:
+        """
+        Parameters
+        ----------
+        obj
+            The object to convert to python expr.
+        Returns
+        -------
+        str :
+            The str type content equals to the object when use in the python code directly.
+        """
+        if obj is None:
+            return "None"
+
+        if isinstance(obj, (int, float)):
+            return repr(obj)
+
+        if isinstance(obj, bool):
+            return "True" if obj else "False"
+
+        if isinstance(obj, bytes):
+            base64_bytes = base64.b64encode(obj)
+            return f"base64.b64decode({base64_bytes})"
+
+        if isinstance(obj, str):
+            return repr(obj)
+
+        if isinstance(obj, list):
+            return (
+                f"[{', '.join([cls.obj_to_python_expr(element) for element in obj])}]"
+            )
+
+        if isinstance(obj, dict):
+            items = (
+                f"{repr(key)}: {cls.obj_to_python_expr(value)}"
+                for key, value in obj.items()
+            )
+            return f"{{{', '.join(items)}}}"
+
+        if isinstance(obj, tuple):
+            return f"({', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}{',' if len(obj) == 1 else ''})"
+
+        if isinstance(obj, set):
+            return (
+                f"{{{', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}}}"
+                if obj
+                else "set()"
+            )
+
+        if isinstance(obj, PickleContainer):
+            return UserCodeMixin.generate_pickled_codes(obj, None)
+
+        raise ValueError(f"not support arg type {type(obj)}")
+
+    @classmethod
+    def generate_pickled_codes(
+        cls,
+        code_to_pickle: Any,
+        unpicked_data_var_name: Union[str, None] = "pickled_data",
+    ) -> str:
         """
         Generate pickled codes. The final pickled variable is called 'pickled_data'.
 
@@ -82,20 +145,20 @@ class UserCodeMixin:
         ----------
         code_to_pickle: Any
            The code to be pickled.
+        unpicked_data_var_name: str
+            The variables in code used to hold the loads object from the cloudpickle
 
         Returns
         -------
-
-        The code snippets of pickling, the final variable is called 'pickled_data'.
+        str :
+            The code snippets of pickling, the final variable is called 'pickled_data' by default.
         """
         pickled, buffers = cls.dump_pickled_data(code_to_pickle)
-
-
-
-
-
-            f"pickled_data = cloudpickle.loads(base64_data, buffers=[{buffers_str}])",
-        ]
+        pickle_loads_expr = f"cloudpickle.loads({cls.obj_to_python_expr(pickled)}, buffers={cls.obj_to_python_expr(buffers)})"
+        if unpicked_data_var_name:
+            return f"{unpicked_data_var_name} = {pickle_loads_expr}"
+
+        return pickle_loads_expr
 
     @staticmethod
     def dump_pickled_data(
@@ -114,8 +177,9 @@ class UserCodeMixin:
 
 
 class BigDagCodeContext(metaclass=abc.ABCMeta):
-    def __init__(self, session_id: str = None):
+    def __init__(self, session_id: str = None, subdag_id: str = None):
         self._session_id = session_id
+        self._subdag_id = subdag_id
         self._tileable_key_to_variables = dict()
         self.constants = dict()
         self._data_table_meta_cache = dict()
@@ -142,10 +206,14 @@ class BigDagCodeContext(metaclass=abc.ABCMeta):
         except KeyError:
             var_name = self._tileable_key_to_variables[
                 tileable.key
-            ] =
-            self._next_var_id += 1
+            ] = self.next_var_name()
         return var_name
 
+    def next_var_name(self) -> str:
+        var_name = f"var_{self._next_var_id}"
+        self._next_var_id += 1
+        return var_name
+
     def get_odps_schema(
         self, data: PandasObjectTypes, unknown_as_string: bool = False
     ) -> OdpsSchema:
@@ -275,9 +343,10 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
     engine_priority: int = 0
     _extension_loaded = False
 
-    def __init__(self, session_id: str):
+    def __init__(self, session_id: str, subdag_id: str = None):
         self._session_id = session_id
-        self.
+        self._subdag_id = subdag_id
+        self._context = self._init_context(session_id, subdag_id)
 
     @classmethod
     def _load_engine_extensions(cls):
@@ -307,7 +376,7 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
         raise NotImplementedError
 
     @abc.abstractmethod
-    def _init_context(self, session_id: str) -> BigDagCodeContext:
+    def _init_context(self, session_id: str, subdag_id: str) -> BigDagCodeContext:
         raise NotImplementedError
 
     def _generate_comments(
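The new UserCodeMixin.obj_to_python_expr turns plain Python values into source-code literals that can be embedded in generated subdag code, which is also how generate_pickled_codes now inlines pickled payloads as expressions. A minimal standalone sketch of the same idea (an assumed simplification, not the packaged implementation):

    # Minimal sketch: render a Python value as a source-code literal.
    def to_python_expr(obj):
        if obj is None:
            return "None"
        if isinstance(obj, bool):  # check bool before int/float: bool is an int subclass
            return "True" if obj else "False"
        if isinstance(obj, (int, float, str)):
            return repr(obj)
        if isinstance(obj, list):
            return "[" + ", ".join(to_python_expr(v) for v in obj) + "]"
        if isinstance(obj, dict):
            return "{" + ", ".join(f"{k!r}: {to_python_expr(v)}" for k, v in obj.items()) + "}"
        raise ValueError(f"unsupported type {type(obj)}")

    assert to_python_expr({"a": [1, 2.5, "x"], "b": None}) == "{'a': [1, 2.5, 'x'], 'b': None}"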
maxframe/config/config.py
CHANGED

@@ -19,6 +19,7 @@ import warnings
 from copy import deepcopy
 from typing import Any, Dict, Optional, Union
 
+from ..utils import get_python_tag
 from .validators import (
     ValidatorType,
     all_validator,
@@ -299,6 +300,9 @@ default_options = Config()
 default_options.register_option(
     "execution_mode", "trigger", validator=is_in(["trigger", "eager"])
 )
+default_options.register_option(
+    "python_tag", get_python_tag(), validator=is_string, remote=True
+)
 default_options.register_option(
     "client.task_start_timeout", _DEFAULT_TASK_START_TIMEOUT, validator=is_integer
 )
@@ -336,6 +340,12 @@ default_options.register_option(
     validator=is_integer,
     remote=True,
 )
+default_options.register_option(
+    "session.subinstance_priority",
+    None,
+    validator=any_validator(is_null, is_integer),
+    remote=True,
+)
 
 default_options.register_option("warn_duplicated_execution", False, validator=is_bool)
 default_options.register_option("dataframe.use_arrow_dtype", True, validator=is_bool)
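The new remote option "python_tag" is seeded from get_python_tag() so the service side can see which client interpreter produced the session. A rough sketch of how such a tag is typically derived (assumed behavior, not the packaged get_python_tag):

    import sys

    # Assumed behavior: build a CPython wheel-style tag such as "cp39"
    # from the running interpreter version.
    def get_python_tag() -> str:
        return f"cp{sys.version_info.major}{sys.version_info.minor}"

    print(get_python_tag())  # e.g. "cp39" on Python 3.9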
maxframe/core/entity/objects.py
CHANGED

@@ -15,6 +15,7 @@
 from typing import Any, Dict
 
 from ...serialization.serializables import FieldTypes, ListField
+from ...utils import skip_na_call
 from .chunks import Chunk, ChunkData
 from .core import Entity
 from .executable import _ToObjectMixin
@@ -62,8 +63,8 @@ class ObjectData(TileableData, _ToObjectMixin):
     _chunks = ListField(
         "chunks",
         FieldTypes.reference(ObjectChunkData),
-        on_serialize=lambda x: [it.data for it in x],
-        on_deserialize=lambda x: [ObjectChunk(it) for it in x],
+        on_serialize=skip_na_call(lambda x: [it.data for it in x]),
+        on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
     )
 
     def __init__(self, op=None, nsplits=None, **kw):
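The on_serialize/on_deserialize callbacks are now wrapped with skip_na_call, so a missing chunk list no longer reaches the converters. A sketch of the assumed semantics (a hypothetical stand-in, not the packaged helper in maxframe/utils.py):

    # Hypothetical stand-in: call the wrapped function only when a value is
    # present, and pass None through untouched.
    def skip_na_call(func):
        def wrapped(value):
            return None if value is None else func(value)
        return wrapped

    to_list = skip_na_call(lambda x: [v + 1 for v in x])
    assert to_list(None) is None
    assert to_list([1, 2]) == [2, 3]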
maxframe/core/graph/core.cpython-39-darwin.so
CHANGED
Binary file
maxframe/core/graph/core.pyx
CHANGED

@@ -404,10 +404,10 @@ cdef class DirectedGraph:
 
         Fusion(self).decompose(nodes=nodes)
 
-    def view(self, filename='default', graph_attrs=None, node_attrs=None, result_chunk_keys=None, show_columns=False): # pragma: no cover
+    def view(self, filename='default', graph_attrs=None, trunc_key=5, node_attrs=None, result_chunk_keys=None, show_columns=False): # pragma: no cover
         from graphviz import Source
 
-        g = Source(self.to_dot(graph_attrs, node_attrs, result_chunk_keys=result_chunk_keys, show_columns=show_columns))
+        g = Source(self.to_dot(graph_attrs, node_attrs, trunc_key=trunc_key, result_chunk_keys=result_chunk_keys, show_columns=show_columns))
         g.view(filename=filename, cleanup=True)
 
     def to_dag(self):
maxframe/core/operator/base.py
CHANGED

@@ -287,6 +287,20 @@ class Operator(Base, OperatorLogicKeyGeneratorMixin, metaclass=OperatorMetaclass
         self.check_inputs(inputs)
         setattr(self, "_inputs", inputs)
 
+    def replace_input(self, index: int, replaced_input: ENTITY_TYPE):
+        """
+        Replace the input[index] with replaced_input.
+
+        Parameters
+        ----------
+        index : int
+            The input to be replaced index.
+        replaced_input : ENTITY_TYPE
+            The replaced input object.
+        """
+        self.inputs[index] = replaced_input
+        self._set_inputs(self.inputs)
+
     @property
     def inputs(self) -> List[Union[ENTITY_TYPE]]:
         inputs = self._inputs
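The new Operator.replace_input swaps one input entity while keeping the operator's input bookkeeping consistent via _set_inputs. A self-contained toy illustration of that contract (a hypothetical stand-in class, not the MaxFrame Operator):

    # Toy stand-in mirroring the replace_input contract shown above.
    class ToyOperator:
        def __init__(self, inputs):
            self._inputs = list(inputs)

        @property
        def inputs(self):
            return self._inputs

        def _set_inputs(self, inputs):
            self._inputs = list(inputs)

        def replace_input(self, index, replaced_input):
            self.inputs[index] = replaced_input
            self._set_inputs(self.inputs)

    op = ToyOperator(["tileable_a", "tileable_b"])
    op.replace_input(1, "tileable_c")
    assert op.inputs == ["tileable_a", "tileable_c"]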
maxframe/dataframe/__init__.py
CHANGED

@@ -35,9 +35,11 @@ from .datasource.from_index import series_from_index
 from .datasource.from_records import from_records
 from .datasource.from_tensor import dataframe_from_tensor, series_from_tensor
 from .datasource.read_csv import read_csv
+from .datasource.read_odps_query import read_odps_query
 from .datasource.read_odps_table import read_odps_table
 from .datasource.read_parquet import read_parquet
 from .datastore.to_odps import to_odps_table
+from .groupby import NamedAgg
 from .initializer import DataFrame, Index, Series, read_pandas
 from .merge import concat, merge
 from .misc.cut import cut
@@ -51,7 +53,7 @@ from .reduction import CustomReduction, unique
 from .tseries.to_datetime import to_datetime
 
 try:
-    from pandas import NA,
+    from pandas import NA, Timestamp
 except ImportError: # pragma: no cover
     pass
 
maxframe/dataframe/datasource/from_records.py
CHANGED

@@ -38,6 +38,10 @@ class DataFrameFromRecords(DataFrameOperator, DataFrameOperatorMixin):
             raise NotImplementedError("Specifying index value is not supported for now")
         super().__init__(columns=columns, _output_types=[OutputType.dataframe], **kw)
 
+    @property
+    def input(self):
+        return self._inputs[0]
+
     def __call__(self, data):
         if self.nrows is None:
             nrows = data.shape[0]
maxframe/dataframe/datasource/read_odps_query.py
ADDED

@@ -0,0 +1,295 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dataclasses
+import re
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
+from odps import ODPS
+from odps.types import Column, OdpsSchema, validate_data_type
+
+from ... import opcodes
+from ...core import OutputType
+from ...core.graph import DAG
+from ...odpsio import odps_schema_to_pandas_dtypes
+from ...serialization.serializables import (
+    AnyField,
+    BoolField,
+    FieldTypes,
+    Int64Field,
+    ListField,
+    SeriesField,
+    StringField,
+)
+from ..utils import parse_index
+from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
+
+_EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
+_EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
+_EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
+_EXPLAIN_ROOT_TASKS_REGEX = re.compile(r"root Tasks: (.+)")
+_EXPLAIN_TASK_REGEX = re.compile(r"In Task ([^:]+)")
+_EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
+    r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
+    re.MULTILINE,
+)
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+
+
+@dataclasses.dataclass
+class DependencySector:
+    roots: List[str]
+    dependencies: List[Tuple[str, str]]
+
+    def build_dag(self) -> DAG:
+        dag = DAG()
+        for r in self.roots:
+            dag.add_node(r)
+        for v_from, v_to in self.dependencies:
+            dag.add_node(v_from)
+            dag.add_node(v_to)
+            dag.add_edge(v_from, v_to)
+        return dag
+
+
+@dataclasses.dataclass
+class JobsSector(DependencySector):
+    jobs: Dict[str, "TasksSector"] = dataclasses.field(default_factory=dict)
+
+
+@dataclasses.dataclass
+class TasksSector(DependencySector):
+    job_name: str
+    tasks: Dict[str, "TaskSector"] = dataclasses.field(default_factory=dict)
+
+
+@dataclasses.dataclass
+class ColumnSchema:
+    column_name: str
+    column_type: str
+    column_alias: Optional[str]
+
+
+@dataclasses.dataclass
+class TaskSector:
+    job_name: str
+    task_name: str
+    output_target: Optional[str]
+    schema: List[ColumnSchema]
+
+
+def _split_explain_string(explain_string: str) -> List[str]:
+    parts = explain_string.split("\n\n")
+    final_parts = []
+    grouped = []
+    for part in parts:
+        part = part.strip("\n")
+        if grouped and not part.startswith(" "):
+            final_parts.append("\n\n".join(grouped).strip())
+            grouped = []
+        grouped.append(part)
+    if grouped:
+        final_parts.append("\n\n".join(grouped).strip())
+    return final_parts
+
+
+def _find_all_deps(sector: str) -> List[Tuple[str, str]]:
+    deps = []
+    for match in _EXPLAIN_DEPENDS_REGEX.findall(sector):
+        descendant = match[0]
+        for r in match[1].split(","):
+            deps.append((r.strip(), descendant))
+    return deps
+
+
+def _resolve_jobs_sector(sector: str) -> JobsSector:
+    match = _EXPLAIN_JOB_REGEX.search(sector)
+    roots = [r.strip() for r in match.group(1).split(",")]
+    deps = _find_all_deps(sector)
+    return JobsSector(roots, deps)
+
+
+def _resolve_tasks_sector(sector: str) -> TasksSector:
+    match = _EXPLAIN_ROOT_TASKS_REGEX.search(sector)
+    roots = [r.strip() for r in match.group(1).split(",")]
+
+    match = _EXPLAIN_TASKS_HEADER_REGEX.search(sector)
+    job_name = match.group(1)
+
+    deps = _find_all_deps(sector)
+    return TasksSector(roots, deps, job_name)
+
+
+def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
+    match = _EXPLAIN_TASK_REGEX.match(sector)
+    task_name = match.group(1)
+
+    match = _EXPLAIN_TASK_SCHEMA_REGEX.match(sector)
+    if match is None:
+        return TaskSector(job_name, task_name, None, [])
+
+    out_target = match.group(2)
+    out_schema = match.group(3)
+
+    schemas = []
+    for match in _EXPLAIN_COLUMN_REGEX.findall(out_schema):
+        col_name, data_type, alias = match
+        schemas.append(ColumnSchema(col_name.strip(), data_type.strip(), alias.strip()))
+    return TaskSector(job_name, task_name, out_target, schemas)
+
+
+def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+    sectors = _split_explain_string(explain_string)
+    jobs_sector = tasks_sector = None
+
+    for sector in sectors:
+        if _EXPLAIN_JOB_REGEX.search(sector):
+            jobs_sector = _resolve_jobs_sector(sector)
+        elif _EXPLAIN_TASKS_HEADER_REGEX.search(sector):
+            tasks_sector = _resolve_tasks_sector(sector)
+            assert jobs_sector is not None
+            jobs_sector.jobs[tasks_sector.job_name] = tasks_sector
+        elif _EXPLAIN_TASK_REGEX.search(sector):
+            assert tasks_sector is not None
+            task_sector = _resolve_task_sector(tasks_sector.job_name, sector)
+            tasks_sector.tasks[task_sector.task_name] = task_sector
+
+    job_dag = jobs_sector.build_dag()
+    indep_job_names = list(job_dag.iter_indep(reverse=True))
+    if len(indep_job_names) > 1: # pragma: no cover
+        raise ValueError("Only one final job is allowed in SQL statement")
+
+    tasks_sector = jobs_sector.jobs[indep_job_names[0]]
+    task_dag = tasks_sector.build_dag()
+    indep_task_names = list(task_dag.iter_indep(reverse=True))
+    if len(indep_task_names) > 1: # pragma: no cover
+        raise ValueError("Only one final task is allowed in SQL statement")
+
+    task_sector = tasks_sector.tasks[indep_task_names[0]]
+    if not task_sector.schema: # pragma: no cover
+        raise ValueError("Cannot detect output schema")
+    if task_sector.output_target != "Screen":
+        raise ValueError("The SQL statement should be an instant query")
+    cols = [
+        Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
+        for c in task_sector.schema
+    ]
+    return OdpsSchema(cols)
+
+
+class DataFrameReadODPSQuery(
+    IncrementalIndexDatasource,
+    ColumnPruneSupportedDataSourceMixin,
+):
+    _op_type_ = opcodes.READ_ODPS_QUERY
+
+    query = StringField("query")
+    dtypes = SeriesField("dtypes", default=None)
+    columns = AnyField("columns", default=None)
+    nrows = Int64Field("nrows", default=None)
+    use_arrow_dtype = BoolField("use_arrow_dtype", default=None)
+    string_as_binary = BoolField("string_as_binary", default=None)
+    index_columns = ListField("index_columns", FieldTypes.string, default=None)
+    index_dtypes = SeriesField("index_dtypes", default=None)
+
+    def get_columns(self):
+        return self.columns
+
+    def set_pruned_columns(self, columns, *, keep_order=None): # pragma: no cover
+        self.columns = columns
+
+    def __call__(self, chunk_bytes=None, chunk_size=None):
+        if not self.index_columns:
+            index_value = parse_index(pd.RangeIndex(0))
+        elif len(self.index_columns) == 1:
+            index_value = parse_index(
+                pd.Index([], name=self.index_columns[0]).astype(self.index_dtypes[0])
+            )
+        else:
+            idx = pd.MultiIndex.from_frame(
+                pd.DataFrame([], columns=self.index_columns).astype(self.index_dtypes)
+            )
+            index_value = parse_index(idx)
+
+        columns_value = parse_index(self.dtypes.index, store_data=True)
+        self.output_types = [OutputType.dataframe]
+        return self.new_tileable(
+            [],
+            None,
+            shape=(len(self.dtypes), np.nan),
+            dtypes=self.dtypes,
+            index_value=index_value,
+            columns_value=columns_value,
+            chunk_bytes=chunk_bytes,
+            chunk_size=chunk_size,
+        )
+
+
+def read_odps_query(
+    query: str,
+    odps_entry: ODPS = None,
+    index_col: Union[None, str, List[str]] = None,
+    string_as_binary: bool = None,
+    **kw,
+):
+    """
+    Read data from a MaxCompute (ODPS) query into DataFrame.
+
+    Supports specifying some columns as indexes. If not specified, RangeIndex
+    will be generated.
+
+    Parameters
+    ----------
+    query: str
+        MaxCompute SQL statement.
+    index_col: Union[None, str, List[str]]
+        Columns to be specified as indexes.
+
+    Returns
+    -------
+    result: DataFrame
+        DataFrame read from MaxCompute (ODPS) table
+    """
+    odps_entry = odps_entry or ODPS.from_environments()
+    inst = odps_entry.execute_sql(f"EXPLAIN {query}")
+    explain_str = list(inst.get_task_results().values())[0]
+
+    odps_schema = _parse_explained_schema(explain_str)
+    dtypes = odps_schema_to_pandas_dtypes(odps_schema)
+
+    if not index_col:
+        index_dtypes = None
+    else:
+        if isinstance(index_col, str):
+            index_col = [index_col]
+        index_col_set = set(index_col)
+        data_cols = [c for c in dtypes.index if c not in index_col_set]
+        idx_dtype_vals = [dtypes[c] for c in index_col]
+        col_dtype_vals = [dtypes[c] for c in data_cols]
+        index_dtypes = pd.Series(idx_dtype_vals, index=index_col)
+        dtypes = pd.Series(col_dtype_vals, index=data_cols)
+
+    chunk_bytes = kw.pop("chunk_bytes", None)
+    chunk_size = kw.pop("chunk_size", None)
+    op = DataFrameReadODPSQuery(
+        query=query,
+        dtypes=dtypes,
+        use_arrow_dtype=kw.pop("use_arrow_dtype", True),
+        string_as_binary=string_as_binary,
+        index_columns=index_col,
+        index_dtypes=index_dtypes,
+    )
+    return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
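A hypothetical usage sketch of the new entry point (the table name and SQL are made up, and a configured ODPS account plus an attached MaxFrame session are assumed):

    import maxframe.dataframe as md

    # Schema is inferred by running EXPLAIN on the query, then a lazy DataFrame is built.
    df = md.read_odps_query(
        "SELECT category, COUNT(*) AS cnt FROM sales GROUP BY category",
        index_col="category",
    )
    print(df.dtypes)       # column dtypes derived from the parsed EXPLAIN output
    result = df.execute()  # assumed to run once a MaxFrame session is attached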
maxframe/dataframe/datasource/read_odps_table.py
CHANGED

@@ -69,7 +69,7 @@ class DataFrameReadODPSTable(
         return getattr(self, "partition_spec", None)
 
     def get_columns(self):
-        return self.columns
+        return self.columns or list(self.dtypes.index)
 
     def set_pruned_columns(self, columns, *, keep_order=None): # pragma: no cover
         self.columns = columns