maxframe 0.1.0b1__cp37-cp37m-macosx_10_9_x86_64.whl → 0.1.0b3__cp37-cp37m-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of maxframe has been flagged as potentially problematic; see the registry listing for details.

Files changed (42)
  1. maxframe/_utils.cpython-37m-darwin.so +0 -0
  2. maxframe/codegen.py +88 -19
  3. maxframe/config/config.py +10 -0
  4. maxframe/core/entity/executable.py +1 -0
  5. maxframe/core/entity/objects.py +3 -2
  6. maxframe/core/graph/core.cpython-37m-darwin.so +0 -0
  7. maxframe/core/graph/core.pyx +2 -2
  8. maxframe/core/operator/base.py +14 -0
  9. maxframe/dataframe/__init__.py +3 -1
  10. maxframe/dataframe/datasource/from_records.py +4 -0
  11. maxframe/dataframe/datasource/read_odps_query.py +295 -0
  12. maxframe/dataframe/datasource/read_odps_table.py +1 -1
  13. maxframe/dataframe/datasource/tests/test_datasource.py +84 -1
  14. maxframe/dataframe/groupby/__init__.py +4 -0
  15. maxframe/dataframe/groupby/core.py +5 -0
  16. maxframe/dataframe/misc/to_numeric.py +4 -0
  17. maxframe/dataframe/window/aggregation.py +1 -24
  18. maxframe/dataframe/window/ewm.py +0 -7
  19. maxframe/dataframe/window/tests/test_ewm.py +0 -6
  20. maxframe/errors.py +21 -0
  21. maxframe/lib/aio/isolation.py +6 -1
  22. maxframe/lib/mmh3.cpython-37m-darwin.so +0 -0
  23. maxframe/opcodes.py +1 -0
  24. maxframe/protocol.py +25 -5
  25. maxframe/serialization/core.cpython-37m-darwin.so +0 -0
  26. maxframe/serialization/exception.py +2 -1
  27. maxframe/serialization/serializables/core.py +6 -1
  28. maxframe/serialization/serializables/field.py +2 -0
  29. maxframe/tensor/core.py +3 -3
  30. maxframe/tests/test_codegen.py +69 -0
  31. maxframe/tests/test_protocol.py +16 -8
  32. maxframe/tests/utils.py +1 -0
  33. maxframe/udf.py +15 -16
  34. maxframe/utils.py +21 -1
  35. {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/METADATA +1 -74
  36. {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/RECORD +42 -39
  37. maxframe_client/clients/framedriver.py +7 -7
  38. maxframe_client/session/task.py +31 -3
  39. maxframe_client/session/tests/test_task.py +29 -11
  40. maxframe_client/tests/test_session.py +2 -0
  41. {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/WHEEL +0 -0
  42. {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/top_level.txt +0 -0
maxframe/_utils.cpython-37m-darwin.so CHANGED
Binary file
maxframe/codegen.py CHANGED
@@ -17,7 +17,7 @@ import base64
 import dataclasses
 import logging
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
 
 from odps.types import OdpsSchema
 from odps.utils import camel_to_underline
@@ -30,6 +30,7 @@ from .odpsio import build_dataframe_table_meta
 from .odpsio.schema import pandas_to_odps_schema
 from .protocol import DataFrameTableMeta, ResultInfo
 from .serialization import PickleContainer
+from .serialization.serializables import Serializable, StringField
 from .typing_ import PandasObjectTypes
 from .udf import MarkedFunction
 
@@ -48,8 +49,11 @@ class CodeGenResult:
     constants: Dict[str, Any]
 
 
-class AbstractUDF(abc.ABC):
-    _session_id: str
+class AbstractUDF(Serializable):
+    _session_id: str = StringField("session_id")
+
+    def __init__(self, session_id: Optional[str] = None, **kw):
+        super().__init__(_session_id=session_id, **kw)
 
     @property
     def name(self) -> str:
@@ -74,7 +78,66 @@ class AbstractUDF(abc.ABC):
 
 class UserCodeMixin:
     @classmethod
-    def generate_pickled_codes(cls, code_to_pickle: Any) -> List[str]:
+    def obj_to_python_expr(cls, obj: Any = None) -> str:
+        """
+        Parameters
+        ----------
+        obj
+            The object to convert to a Python expression.
+        Returns
+        -------
+        str :
+            A string that evaluates to the object when used in Python code directly.
+        """
+        if obj is None:
+            return "None"
+
+        if isinstance(obj, (int, float)):
+            return repr(obj)
+
+        if isinstance(obj, bool):
+            return "True" if obj else "False"
+
+        if isinstance(obj, bytes):
+            base64_bytes = base64.b64encode(obj)
+            return f"base64.b64decode({base64_bytes})"
+
+        if isinstance(obj, str):
+            return repr(obj)
+
+        if isinstance(obj, list):
+            return (
+                f"[{', '.join([cls.obj_to_python_expr(element) for element in obj])}]"
+            )
+
+        if isinstance(obj, dict):
+            items = (
+                f"{repr(key)}: {cls.obj_to_python_expr(value)}"
+                for key, value in obj.items()
+            )
+            return f"{{{', '.join(items)}}}"
+
+        if isinstance(obj, tuple):
+            return f"({', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}{',' if len(obj) == 1 else ''})"
+
+        if isinstance(obj, set):
+            return (
+                f"{{{', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}}}"
+                if obj
+                else "set()"
+            )
+
+        if isinstance(obj, PickleContainer):
+            return UserCodeMixin.generate_pickled_codes(obj, None)
+
+        raise ValueError(f"not support arg type {type(obj)}")
+
+    @classmethod
+    def generate_pickled_codes(
+        cls,
+        code_to_pickle: Any,
+        unpicked_data_var_name: Union[str, None] = "pickled_data",
+    ) -> str:
         """
         Generate pickled codes. The final pickled variable is called 'pickled_data'.
 
@@ -82,20 +145,20 @@ class UserCodeMixin:
         ----------
         code_to_pickle: Any
             The code to be pickled.
+        unpicked_data_var_name: str
+            The variable in the generated code that holds the object loaded by cloudpickle.
 
         Returns
         -------
-        List[str] :
-            The code snippets of pickling, the final variable is called 'pickled_data'.
+        str :
+            The code snippet of pickling; the final variable is called 'pickled_data' by default.
         """
         pickled, buffers = cls.dump_pickled_data(code_to_pickle)
-        pickled = base64.b64encode(pickled)
-        buffers = [base64.b64encode(b) for b in buffers]
-        buffers_str = ", ".join(f"base64.b64decode(b'{b.decode()}')" for b in buffers)
-        return [
-            f"base64_data = base64.b64decode(b'{pickled.decode()}')",
-            f"pickled_data = cloudpickle.loads(base64_data, buffers=[{buffers_str}])",
-        ]
+        pickle_loads_expr = f"cloudpickle.loads({cls.obj_to_python_expr(pickled)}, buffers={cls.obj_to_python_expr(buffers)})"
+        if unpicked_data_var_name:
+            return f"{unpicked_data_var_name} = {pickle_loads_expr}"
+
+        return pickle_loads_expr
 
     @staticmethod
     def dump_pickled_data(
@@ -114,8 +177,9 @@ class UserCodeMixin:
 
 
 class BigDagCodeContext(metaclass=abc.ABCMeta):
-    def __init__(self, session_id: str = None):
+    def __init__(self, session_id: str = None, subdag_id: str = None):
         self._session_id = session_id
+        self._subdag_id = subdag_id
         self._tileable_key_to_variables = dict()
         self.constants = dict()
         self._data_table_meta_cache = dict()
@@ -142,10 +206,14 @@ class BigDagCodeContext(metaclass=abc.ABCMeta):
         except KeyError:
            var_name = self._tileable_key_to_variables[
                tileable.key
-            ] = f"var_{self._next_var_id}"
-            self._next_var_id += 1
+            ] = self.next_var_name()
        return var_name
 
+    def next_var_name(self) -> str:
+        var_name = f"var_{self._next_var_id}"
+        self._next_var_id += 1
+        return var_name
+
     def get_odps_schema(
         self, data: PandasObjectTypes, unknown_as_string: bool = False
     ) -> OdpsSchema:
@@ -275,9 +343,10 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
     engine_priority: int = 0
     _extension_loaded = False
 
-    def __init__(self, session_id: str):
+    def __init__(self, session_id: str, subdag_id: str = None):
         self._session_id = session_id
-        self._context = self._init_context(session_id)
+        self._subdag_id = subdag_id
+        self._context = self._init_context(session_id, subdag_id)
 
     @classmethod
     def _load_engine_extensions(cls):
@@ -307,7 +376,7 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
         raise NotImplementedError
 
     @abc.abstractmethod
-    def _init_context(self, session_id: str) -> BigDagCodeContext:
+    def _init_context(self, session_id: str, subdag_id: str) -> BigDagCodeContext:
         raise NotImplementedError
 
     def _generate_comments(
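
For context, the new obj_to_python_expr helper renders a Python value as source text that evaluates back to an equal value, which is what lets generate_pickled_codes embed pickled payloads inline. Below is a minimal standalone sketch of that round trip for simple literals; the function name is illustrative and not part of the maxframe API.

import base64

def literal_to_expr(obj):
    # Sketch of the idea behind UserCodeMixin.obj_to_python_expr: render a value
    # as Python source that evaluates back to an equal value.
    if obj is None:
        return "None"
    if isinstance(obj, (bool, int, float, str)):
        return repr(obj)
    if isinstance(obj, bytes):
        encoded = base64.b64encode(obj)
        return f"base64.b64decode({encoded!r})"
    if isinstance(obj, list):
        return "[" + ", ".join(literal_to_expr(v) for v in obj) + "]"
    if isinstance(obj, dict):
        items = (f"{key!r}: {literal_to_expr(value)}" for key, value in obj.items())
        return "{" + ", ".join(items) + "}"
    raise TypeError(f"unsupported type {type(obj)}")

sample = {"args": [1, 2.5, "text", b"raw"], "flag": True}
expr = literal_to_expr(sample)
# Evaluating the generated expression reproduces the original value.
assert eval(expr, {"base64": base64}) == sample
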
maxframe/config/config.py CHANGED
@@ -19,6 +19,7 @@ import warnings
 from copy import deepcopy
 from typing import Any, Dict, Optional, Union
 
+from ..utils import get_python_tag
 from .validators import (
     ValidatorType,
     all_validator,
@@ -299,6 +300,9 @@ default_options = Config()
 default_options.register_option(
     "execution_mode", "trigger", validator=is_in(["trigger", "eager"])
 )
+default_options.register_option(
+    "python_tag", get_python_tag(), validator=is_string, remote=True
+)
 default_options.register_option(
     "client.task_start_timeout", _DEFAULT_TASK_START_TIMEOUT, validator=is_integer
 )
@@ -336,6 +340,12 @@ default_options.register_option(
     validator=is_integer,
     remote=True,
 )
+default_options.register_option(
+    "session.subinstance_priority",
+    None,
+    validator=any_validator(is_null, is_integer),
+    remote=True,
+)
 
 default_options.register_option("warn_duplicated_execution", False, validator=is_bool)
 default_options.register_option("dataframe.use_arrow_dtype", True, validator=is_bool)
maxframe/core/entity/executable.py CHANGED
@@ -66,6 +66,7 @@ class DecrefRunner:
         if self._decref_thread:  # pragma: no branch
             self._queue.put_nowait((None, None, None))
             self._decref_thread.join(1)
+            self._decref_thread = None
 
     def put(self, key: str, session_ref: ref):
         if self._decref_thread is None:
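
Resetting _decref_thread to None after joining lets the next put() call start a fresh worker thread, since put() only spawns one when the handle is None. A self-contained sketch of that stop-and-restart pattern is below; the class and names are illustrative, not maxframe code.

import queue
import threading

class LazyRunner:
    # Illustrative stop/restart pattern: clearing the thread handle on stop
    # lets put() lazily start a new worker afterwards.
    def __init__(self):
        self._queue = queue.Queue()
        self._thread = None

    def _loop(self):
        while True:
            item = self._queue.get()
            if item is None:  # sentinel: stop the worker
                break

    def stop(self):
        if self._thread:
            self._queue.put_nowait(None)
            self._thread.join(1)
            self._thread = None  # mirrors the change above: allow a later restart

    def put(self, item):
        if self._thread is None:  # lazily (re)start the worker
            self._thread = threading.Thread(target=self._loop, daemon=True)
            self._thread.start()
        self._queue.put_nowait(item)
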
maxframe/core/entity/objects.py CHANGED
@@ -15,6 +15,7 @@
 from typing import Any, Dict
 
 from ...serialization.serializables import FieldTypes, ListField
+from ...utils import skip_na_call
 from .chunks import Chunk, ChunkData
 from .core import Entity
 from .executable import _ToObjectMixin
@@ -62,8 +63,8 @@ class ObjectData(TileableData, _ToObjectMixin):
     _chunks = ListField(
         "chunks",
         FieldTypes.reference(ObjectChunkData),
-        on_serialize=lambda x: [it.data for it in x] if x is not None else x,
-        on_deserialize=lambda x: [ObjectChunk(it) for it in x] if x is not None else x,
+        on_serialize=skip_na_call(lambda x: [it.data for it in x]),
+        on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
     )
 
     def __init__(self, op=None, nsplits=None, **kw):
maxframe/core/graph/core.pyx CHANGED
@@ -404,10 +404,10 @@ cdef class DirectedGraph:
 
         Fusion(self).decompose(nodes=nodes)
 
-    def view(self, filename='default', graph_attrs=None, node_attrs=None, result_chunk_keys=None, show_columns=False):  # pragma: no cover
+    def view(self, filename='default', graph_attrs=None, trunc_key=5, node_attrs=None, result_chunk_keys=None, show_columns=False):  # pragma: no cover
         from graphviz import Source
 
-        g = Source(self.to_dot(graph_attrs, node_attrs, result_chunk_keys=result_chunk_keys, show_columns=show_columns))
+        g = Source(self.to_dot(graph_attrs, node_attrs, trunc_key=trunc_key, result_chunk_keys=result_chunk_keys, show_columns=show_columns))
         g.view(filename=filename, cleanup=True)
 
     def to_dag(self):
maxframe/core/operator/base.py CHANGED
@@ -287,6 +287,20 @@ class Operator(Base, OperatorLogicKeyGeneratorMixin, metaclass=OperatorMetaclass
         self.check_inputs(inputs)
         setattr(self, "_inputs", inputs)
 
+    def replace_input(self, index: int, replaced_input: ENTITY_TYPE):
+        """
+        Replace the input at ``index`` with ``replaced_input``.
+
+        Parameters
+        ----------
+        index : int
+            Index of the input to be replaced.
+        replaced_input : ENTITY_TYPE
+            The new input object.
+        """
+        self.inputs[index] = replaced_input
+        self._set_inputs(self.inputs)
+
     @property
     def inputs(self) -> List[Union[ENTITY_TYPE]]:
         inputs = self._inputs
maxframe/dataframe/__init__.py CHANGED
@@ -35,9 +35,11 @@ from .datasource.from_index import series_from_index
 from .datasource.from_records import from_records
 from .datasource.from_tensor import dataframe_from_tensor, series_from_tensor
 from .datasource.read_csv import read_csv
+from .datasource.read_odps_query import read_odps_query
 from .datasource.read_odps_table import read_odps_table
 from .datasource.read_parquet import read_parquet
 from .datastore.to_odps import to_odps_table
+from .groupby import NamedAgg
 from .initializer import DataFrame, Index, Series, read_pandas
 from .merge import concat, merge
 from .misc.cut import cut
@@ -51,7 +53,7 @@ from .reduction import CustomReduction, unique
 from .tseries.to_datetime import to_datetime
 
 try:
-    from pandas import NA, NamedAgg, Timestamp
+    from pandas import NA, Timestamp
 except ImportError:  # pragma: no cover
     pass
 
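
With this change, NamedAgg is exposed from maxframe's own groupby module instead of being re-exported from pandas. A hedged usage sketch follows; the data is made up and it assumes the pandas-style named-aggregation calling convention plus a configured session for execute().

import maxframe.dataframe as md

df = md.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})
agg = df.groupby("group").agg(
    total=md.NamedAgg(column="value", aggfunc="sum"),
    largest=md.NamedAgg(column="value", aggfunc="max"),
)
print(agg.execute())  # assumes a MaxFrame session has been created
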
maxframe/dataframe/datasource/from_records.py CHANGED
@@ -38,6 +38,10 @@ class DataFrameFromRecords(DataFrameOperator, DataFrameOperatorMixin):
             raise NotImplementedError("Specifying index value is not supported for now")
         super().__init__(columns=columns, _output_types=[OutputType.dataframe], **kw)
 
+    @property
+    def input(self):
+        return self._inputs[0]
+
     def __call__(self, data):
         if self.nrows is None:
             nrows = data.shape[0]
maxframe/dataframe/datasource/read_odps_query.py ADDED
@@ -0,0 +1,295 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dataclasses
+import re
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
+from odps import ODPS
+from odps.types import Column, OdpsSchema, validate_data_type
+
+from ... import opcodes
+from ...core import OutputType
+from ...core.graph import DAG
+from ...odpsio import odps_schema_to_pandas_dtypes
+from ...serialization.serializables import (
+    AnyField,
+    BoolField,
+    FieldTypes,
+    Int64Field,
+    ListField,
+    SeriesField,
+    StringField,
+)
+from ..utils import parse_index
+from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
+
+_EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
+_EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
+_EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
+_EXPLAIN_ROOT_TASKS_REGEX = re.compile(r"root Tasks: (.+)")
+_EXPLAIN_TASK_REGEX = re.compile(r"In Task ([^:]+)")
+_EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
+    r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
+    re.MULTILINE,
+)
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+
+
+@dataclasses.dataclass
+class DependencySector:
+    roots: List[str]
+    dependencies: List[Tuple[str, str]]
+
+    def build_dag(self) -> DAG:
+        dag = DAG()
+        for r in self.roots:
+            dag.add_node(r)
+        for v_from, v_to in self.dependencies:
+            dag.add_node(v_from)
+            dag.add_node(v_to)
+            dag.add_edge(v_from, v_to)
+        return dag
+
+
+@dataclasses.dataclass
+class JobsSector(DependencySector):
+    jobs: Dict[str, "TasksSector"] = dataclasses.field(default_factory=dict)
+
+
+@dataclasses.dataclass
+class TasksSector(DependencySector):
+    job_name: str
+    tasks: Dict[str, "TaskSector"] = dataclasses.field(default_factory=dict)
+
+
+@dataclasses.dataclass
+class ColumnSchema:
+    column_name: str
+    column_type: str
+    column_alias: Optional[str]
+
+
+@dataclasses.dataclass
+class TaskSector:
+    job_name: str
+    task_name: str
+    output_target: Optional[str]
+    schema: List[ColumnSchema]
+
+
+def _split_explain_string(explain_string: str) -> List[str]:
+    parts = explain_string.split("\n\n")
+    final_parts = []
+    grouped = []
+    for part in parts:
+        part = part.strip("\n")
+        if grouped and not part.startswith(" "):
+            final_parts.append("\n\n".join(grouped).strip())
+            grouped = []
+        grouped.append(part)
+    if grouped:
+        final_parts.append("\n\n".join(grouped).strip())
+    return final_parts
+
+
+def _find_all_deps(sector: str) -> List[Tuple[str, str]]:
+    deps = []
+    for match in _EXPLAIN_DEPENDS_REGEX.findall(sector):
+        descendant = match[0]
+        for r in match[1].split(","):
+            deps.append((r.strip(), descendant))
+    return deps
+
+
+def _resolve_jobs_sector(sector: str) -> JobsSector:
+    match = _EXPLAIN_JOB_REGEX.search(sector)
+    roots = [r.strip() for r in match.group(1).split(",")]
+    deps = _find_all_deps(sector)
+    return JobsSector(roots, deps)
+
+
+def _resolve_tasks_sector(sector: str) -> TasksSector:
+    match = _EXPLAIN_ROOT_TASKS_REGEX.search(sector)
+    roots = [r.strip() for r in match.group(1).split(",")]
+
+    match = _EXPLAIN_TASKS_HEADER_REGEX.search(sector)
+    job_name = match.group(1)
+
+    deps = _find_all_deps(sector)
+    return TasksSector(roots, deps, job_name)
+
+
+def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
+    match = _EXPLAIN_TASK_REGEX.match(sector)
+    task_name = match.group(1)
+
+    match = _EXPLAIN_TASK_SCHEMA_REGEX.match(sector)
+    if match is None:
+        return TaskSector(job_name, task_name, None, [])
+
+    out_target = match.group(2)
+    out_schema = match.group(3)
+
+    schemas = []
+    for match in _EXPLAIN_COLUMN_REGEX.findall(out_schema):
+        col_name, data_type, alias = match
+        schemas.append(ColumnSchema(col_name.strip(), data_type.strip(), alias.strip()))
+    return TaskSector(job_name, task_name, out_target, schemas)
+
+
+def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+    sectors = _split_explain_string(explain_string)
+    jobs_sector = tasks_sector = None
+
+    for sector in sectors:
+        if _EXPLAIN_JOB_REGEX.search(sector):
+            jobs_sector = _resolve_jobs_sector(sector)
+        elif _EXPLAIN_TASKS_HEADER_REGEX.search(sector):
+            tasks_sector = _resolve_tasks_sector(sector)
+            assert jobs_sector is not None
+            jobs_sector.jobs[tasks_sector.job_name] = tasks_sector
+        elif _EXPLAIN_TASK_REGEX.search(sector):
+            assert tasks_sector is not None
+            task_sector = _resolve_task_sector(tasks_sector.job_name, sector)
+            tasks_sector.tasks[task_sector.task_name] = task_sector
+
+    job_dag = jobs_sector.build_dag()
+    indep_job_names = list(job_dag.iter_indep(reverse=True))
+    if len(indep_job_names) > 1:  # pragma: no cover
+        raise ValueError("Only one final job is allowed in SQL statement")
+
+    tasks_sector = jobs_sector.jobs[indep_job_names[0]]
+    task_dag = tasks_sector.build_dag()
+    indep_task_names = list(task_dag.iter_indep(reverse=True))
+    if len(indep_task_names) > 1:  # pragma: no cover
+        raise ValueError("Only one final task is allowed in SQL statement")
+
+    task_sector = tasks_sector.tasks[indep_task_names[0]]
+    if not task_sector.schema:  # pragma: no cover
+        raise ValueError("Cannot detect output schema")
+    if task_sector.output_target != "Screen":
+        raise ValueError("The SQL statement should be an instant query")
+    cols = [
+        Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
+        for c in task_sector.schema
+    ]
+    return OdpsSchema(cols)
+
+
+class DataFrameReadODPSQuery(
+    IncrementalIndexDatasource,
+    ColumnPruneSupportedDataSourceMixin,
+):
+    _op_type_ = opcodes.READ_ODPS_QUERY
+
+    query = StringField("query")
+    dtypes = SeriesField("dtypes", default=None)
+    columns = AnyField("columns", default=None)
+    nrows = Int64Field("nrows", default=None)
+    use_arrow_dtype = BoolField("use_arrow_dtype", default=None)
+    string_as_binary = BoolField("string_as_binary", default=None)
+    index_columns = ListField("index_columns", FieldTypes.string, default=None)
+    index_dtypes = SeriesField("index_dtypes", default=None)
+
+    def get_columns(self):
+        return self.columns
+
+    def set_pruned_columns(self, columns, *, keep_order=None):  # pragma: no cover
+        self.columns = columns
+
+    def __call__(self, chunk_bytes=None, chunk_size=None):
+        if not self.index_columns:
+            index_value = parse_index(pd.RangeIndex(0))
+        elif len(self.index_columns) == 1:
+            index_value = parse_index(
+                pd.Index([], name=self.index_columns[0]).astype(self.index_dtypes[0])
+            )
+        else:
+            idx = pd.MultiIndex.from_frame(
+                pd.DataFrame([], columns=self.index_columns).astype(self.index_dtypes)
+            )
+            index_value = parse_index(idx)
+
+        columns_value = parse_index(self.dtypes.index, store_data=True)
+        self.output_types = [OutputType.dataframe]
+        return self.new_tileable(
+            [],
+            None,
+            shape=(len(self.dtypes), np.nan),
+            dtypes=self.dtypes,
+            index_value=index_value,
+            columns_value=columns_value,
+            chunk_bytes=chunk_bytes,
+            chunk_size=chunk_size,
+        )
+
+
+def read_odps_query(
+    query: str,
+    odps_entry: ODPS = None,
+    index_col: Union[None, str, List[str]] = None,
+    string_as_binary: bool = None,
+    **kw,
+):
+    """
+    Read data from a MaxCompute (ODPS) query into a DataFrame.
+
+    Supports specifying some columns as indexes. If not specified, a RangeIndex
+    will be generated.
+
+    Parameters
+    ----------
+    query: str
+        MaxCompute SQL statement.
+    index_col: Union[None, str, List[str]]
+        Columns to be specified as indexes.
+
+    Returns
+    -------
+    result: DataFrame
+        DataFrame read from the MaxCompute (ODPS) query.
+    """
+    odps_entry = odps_entry or ODPS.from_environments()
+    inst = odps_entry.execute_sql(f"EXPLAIN {query}")
+    explain_str = list(inst.get_task_results().values())[0]
+
+    odps_schema = _parse_explained_schema(explain_str)
+    dtypes = odps_schema_to_pandas_dtypes(odps_schema)
+
+    if not index_col:
+        index_dtypes = None
+    else:
+        if isinstance(index_col, str):
+            index_col = [index_col]
+        index_col_set = set(index_col)
+        data_cols = [c for c in dtypes.index if c not in index_col_set]
+        idx_dtype_vals = [dtypes[c] for c in index_col]
+        col_dtype_vals = [dtypes[c] for c in data_cols]
+        index_dtypes = pd.Series(idx_dtype_vals, index=index_col)
+        dtypes = pd.Series(col_dtype_vals, index=data_cols)
+
+    chunk_bytes = kw.pop("chunk_bytes", None)
+    chunk_size = kw.pop("chunk_size", None)
+    op = DataFrameReadODPSQuery(
+        query=query,
+        dtypes=dtypes,
+        use_arrow_dtype=kw.pop("use_arrow_dtype", True),
+        string_as_binary=string_as_binary,
+        index_columns=index_col,
+        index_dtypes=index_dtypes,
+    )
+    return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
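
Based on the docstring above, a typical call looks like the sketch below. The table and column names are made up, and an ODPS account must be reachable from environment variables or passed explicitly through odps_entry.

import maxframe.dataframe as md

# The output schema is detected up front by running EXPLAIN on the statement;
# the query must be an instant query whose final task writes to Screen.
df = md.read_odps_query(
    "SELECT id, category, SUM(price) AS total FROM sales GROUP BY id, category",
    index_col="id",
)
print(df.dtypes)
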
maxframe/dataframe/datasource/read_odps_table.py CHANGED
@@ -69,7 +69,7 @@ class DataFrameReadODPSTable(
         return getattr(self, "partition_spec", None)
 
     def get_columns(self):
-        return self.columns
+        return self.columns or list(self.dtypes.index)
 
     def set_pruned_columns(self, columns, *, keep_order=None):  # pragma: no cover
         self.columns = columns