maxframe 0.1.0b1__cp37-cp37m-macosx_10_9_x86_64.whl → 0.1.0b3__cp37-cp37m-macosx_10_9_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of maxframe has been flagged as potentially problematic; see the registry listing for details.

Files changed (42)
  1. maxframe/_utils.cpython-37m-darwin.so +0 -0
  2. maxframe/codegen.py +88 -19
  3. maxframe/config/config.py +10 -0
  4. maxframe/core/entity/executable.py +1 -0
  5. maxframe/core/entity/objects.py +3 -2
  6. maxframe/core/graph/core.cpython-37m-darwin.so +0 -0
  7. maxframe/core/graph/core.pyx +2 -2
  8. maxframe/core/operator/base.py +14 -0
  9. maxframe/dataframe/__init__.py +3 -1
  10. maxframe/dataframe/datasource/from_records.py +4 -0
  11. maxframe/dataframe/datasource/read_odps_query.py +295 -0
  12. maxframe/dataframe/datasource/read_odps_table.py +1 -1
  13. maxframe/dataframe/datasource/tests/test_datasource.py +84 -1
  14. maxframe/dataframe/groupby/__init__.py +4 -0
  15. maxframe/dataframe/groupby/core.py +5 -0
  16. maxframe/dataframe/misc/to_numeric.py +4 -0
  17. maxframe/dataframe/window/aggregation.py +1 -24
  18. maxframe/dataframe/window/ewm.py +0 -7
  19. maxframe/dataframe/window/tests/test_ewm.py +0 -6
  20. maxframe/errors.py +21 -0
  21. maxframe/lib/aio/isolation.py +6 -1
  22. maxframe/lib/mmh3.cpython-37m-darwin.so +0 -0
  23. maxframe/opcodes.py +1 -0
  24. maxframe/protocol.py +25 -5
  25. maxframe/serialization/core.cpython-37m-darwin.so +0 -0
  26. maxframe/serialization/exception.py +2 -1
  27. maxframe/serialization/serializables/core.py +6 -1
  28. maxframe/serialization/serializables/field.py +2 -0
  29. maxframe/tensor/core.py +3 -3
  30. maxframe/tests/test_codegen.py +69 -0
  31. maxframe/tests/test_protocol.py +16 -8
  32. maxframe/tests/utils.py +1 -0
  33. maxframe/udf.py +15 -16
  34. maxframe/utils.py +21 -1
  35. {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/METADATA +1 -74
  36. {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/RECORD +42 -39
  37. maxframe_client/clients/framedriver.py +7 -7
  38. maxframe_client/session/task.py +31 -3
  39. maxframe_client/session/tests/test_task.py +29 -11
  40. maxframe_client/tests/test_session.py +2 -0
  41. {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/WHEEL +0 -0
  42. {maxframe-0.1.0b1.dist-info → maxframe-0.1.0b3.dist-info}/top_level.txt +0 -0
maxframe/_utils.cpython-37m-darwin.so CHANGED
Binary file
maxframe/codegen.py CHANGED
@@ -17,7 +17,7 @@ import base64
 import dataclasses
 import logging
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
 
 from odps.types import OdpsSchema
 from odps.utils import camel_to_underline
@@ -30,6 +30,7 @@ from .odpsio import build_dataframe_table_meta
 from .odpsio.schema import pandas_to_odps_schema
 from .protocol import DataFrameTableMeta, ResultInfo
 from .serialization import PickleContainer
+from .serialization.serializables import Serializable, StringField
 from .typing_ import PandasObjectTypes
 from .udf import MarkedFunction
 
@@ -48,8 +49,11 @@ class CodeGenResult:
     constants: Dict[str, Any]
 
 
-class AbstractUDF(abc.ABC):
-    _session_id: str
+class AbstractUDF(Serializable):
+    _session_id: str = StringField("session_id")
+
+    def __init__(self, session_id: Optional[str] = None, **kw):
+        super().__init__(_session_id=session_id, **kw)
 
     @property
     def name(self) -> str:
@@ -74,7 +78,66 @@ class AbstractUDF(abc.ABC):
 
 class UserCodeMixin:
     @classmethod
-    def generate_pickled_codes(cls, code_to_pickle: Any) -> List[str]:
+    def obj_to_python_expr(cls, obj: Any = None) -> str:
+        """
+        Parameters
+        ----------
+        obj
+            The object to convert to a Python expression.
+        Returns
+        -------
+        str :
+            A string that evaluates to the object when used in Python code directly.
+        """
+        if obj is None:
+            return "None"
+
+        if isinstance(obj, (int, float)):
+            return repr(obj)
+
+        if isinstance(obj, bool):
+            return "True" if obj else "False"
+
+        if isinstance(obj, bytes):
+            base64_bytes = base64.b64encode(obj)
+            return f"base64.b64decode({base64_bytes})"
+
+        if isinstance(obj, str):
+            return repr(obj)
+
+        if isinstance(obj, list):
+            return (
+                f"[{', '.join([cls.obj_to_python_expr(element) for element in obj])}]"
+            )
+
+        if isinstance(obj, dict):
+            items = (
+                f"{repr(key)}: {cls.obj_to_python_expr(value)}"
+                for key, value in obj.items()
+            )
+            return f"{{{', '.join(items)}}}"
+
+        if isinstance(obj, tuple):
+            return f"({', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}{',' if len(obj) == 1 else ''})"
+
+        if isinstance(obj, set):
+            return (
+                f"{{{', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}}}"
+                if obj
+                else "set()"
+            )
+
+        if isinstance(obj, PickleContainer):
+            return UserCodeMixin.generate_pickled_codes(obj, None)
+
+        raise ValueError(f"not support arg type {type(obj)}")
+
+    @classmethod
+    def generate_pickled_codes(
+        cls,
+        code_to_pickle: Any,
+        unpicked_data_var_name: Union[str, None] = "pickled_data",
+    ) -> str:
         """
         Generate pickled codes. The final pickled variable is called 'pickled_data'.
 
@@ -82,20 +145,20 @@ class UserCodeMixin:
         ----------
         code_to_pickle: Any
             The code to be pickled.
+        unpicked_data_var_name: str
+            The variable in the generated code that holds the object loaded by cloudpickle.
 
         Returns
         -------
-        List[str] :
-            The code snippets of pickling, the final variable is called 'pickled_data'.
+        str :
+            The code snippet of pickling; the final variable is called 'pickled_data' by default.
         """
         pickled, buffers = cls.dump_pickled_data(code_to_pickle)
-        pickled = base64.b64encode(pickled)
-        buffers = [base64.b64encode(b) for b in buffers]
-        buffers_str = ", ".join(f"base64.b64decode(b'{b.decode()}')" for b in buffers)
-        return [
-            f"base64_data = base64.b64decode(b'{pickled.decode()}')",
-            f"pickled_data = cloudpickle.loads(base64_data, buffers=[{buffers_str}])",
-        ]
+        pickle_loads_expr = f"cloudpickle.loads({cls.obj_to_python_expr(pickled)}, buffers={cls.obj_to_python_expr(buffers)})"
+        if unpicked_data_var_name:
+            return f"{unpicked_data_var_name} = {pickle_loads_expr}"
+
+        return pickle_loads_expr
 
     @staticmethod
     def dump_pickled_data(
@@ -114,8 +177,9 @@ class UserCodeMixin:
 
 
 class BigDagCodeContext(metaclass=abc.ABCMeta):
-    def __init__(self, session_id: str = None):
+    def __init__(self, session_id: str = None, subdag_id: str = None):
         self._session_id = session_id
+        self._subdag_id = subdag_id
         self._tileable_key_to_variables = dict()
         self.constants = dict()
         self._data_table_meta_cache = dict()
@@ -142,10 +206,14 @@ class BigDagCodeContext(metaclass=abc.ABCMeta):
         except KeyError:
            var_name = self._tileable_key_to_variables[
                tileable.key
-            ] = f"var_{self._next_var_id}"
-            self._next_var_id += 1
+            ] = self.next_var_name()
        return var_name
 
+    def next_var_name(self) -> str:
+        var_name = f"var_{self._next_var_id}"
+        self._next_var_id += 1
+        return var_name
+
     def get_odps_schema(
         self, data: PandasObjectTypes, unknown_as_string: bool = False
     ) -> OdpsSchema:
@@ -275,9 +343,10 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
     engine_priority: int = 0
     _extension_loaded = False
 
-    def __init__(self, session_id: str):
+    def __init__(self, session_id: str, subdag_id: str = None):
         self._session_id = session_id
-        self._context = self._init_context(session_id)
+        self._subdag_id = subdag_id
+        self._context = self._init_context(session_id, subdag_id)
 
     @classmethod
     def _load_engine_extensions(cls):
@@ -307,7 +376,7 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
         raise NotImplementedError
 
     @abc.abstractmethod
-    def _init_context(self, session_id: str) -> BigDagCodeContext:
+    def _init_context(self, session_id: str, subdag_id: str) -> BigDagCodeContext:
         raise NotImplementedError
 
     def _generate_comments(
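
For context, the new obj_to_python_expr helper renders a Python value as source text that evaluates back to an equal value, which is what lets generate_pickled_codes embed pickled payloads inline. Below is a minimal standalone sketch of that round trip for simple literals; the function name is illustrative and not part of the maxframe API.

import base64

def literal_to_expr(obj):
    # Sketch of the idea behind UserCodeMixin.obj_to_python_expr: render a value
    # as Python source that evaluates back to an equal value.
    if obj is None:
        return "None"
    if isinstance(obj, (bool, int, float, str)):
        return repr(obj)
    if isinstance(obj, bytes):
        encoded = base64.b64encode(obj)
        return f"base64.b64decode({encoded!r})"
    if isinstance(obj, list):
        return "[" + ", ".join(literal_to_expr(v) for v in obj) + "]"
    if isinstance(obj, dict):
        items = (f"{key!r}: {literal_to_expr(value)}" for key, value in obj.items())
        return "{" + ", ".join(items) + "}"
    raise TypeError(f"unsupported type {type(obj)}")

sample = {"args": [1, 2.5, "text", b"raw"], "flag": True}
expr = literal_to_expr(sample)
# Evaluating the generated expression reproduces the original value.
assert eval(expr, {"base64": base64}) == sample
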
maxframe/config/config.py CHANGED
@@ -19,6 +19,7 @@ import warnings
 from copy import deepcopy
 from typing import Any, Dict, Optional, Union
 
+from ..utils import get_python_tag
 from .validators import (
     ValidatorType,
     all_validator,
@@ -299,6 +300,9 @@ default_options = Config()
 default_options.register_option(
     "execution_mode", "trigger", validator=is_in(["trigger", "eager"])
 )
+default_options.register_option(
+    "python_tag", get_python_tag(), validator=is_string, remote=True
+)
 default_options.register_option(
     "client.task_start_timeout", _DEFAULT_TASK_START_TIMEOUT, validator=is_integer
 )
@@ -336,6 +340,12 @@ default_options.register_option(
     validator=is_integer,
     remote=True,
 )
+default_options.register_option(
+    "session.subinstance_priority",
+    None,
+    validator=any_validator(is_null, is_integer),
+    remote=True,
+)
 
 default_options.register_option("warn_duplicated_execution", False, validator=is_bool)
 default_options.register_option("dataframe.use_arrow_dtype", True, validator=is_bool)
maxframe/core/entity/executable.py CHANGED
@@ -66,6 +66,7 @@ class DecrefRunner:
         if self._decref_thread:  # pragma: no branch
             self._queue.put_nowait((None, None, None))
             self._decref_thread.join(1)
+            self._decref_thread = None
 
     def put(self, key: str, session_ref: ref):
         if self._decref_thread is None:
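
Resetting _decref_thread to None after joining lets the next put() call start a fresh worker thread, since put() only spawns one when the handle is None. A self-contained sketch of that stop-and-restart pattern is below; the class and names are illustrative, not maxframe code.

import queue
import threading

class LazyRunner:
    # Illustrative stop/restart pattern: clearing the thread handle on stop
    # lets put() lazily start a new worker afterwards.
    def __init__(self):
        self._queue = queue.Queue()
        self._thread = None

    def _loop(self):
        while True:
            item = self._queue.get()
            if item is None:  # sentinel: stop the worker
                break

    def stop(self):
        if self._thread:
            self._queue.put_nowait(None)
            self._thread.join(1)
            self._thread = None  # mirrors the change above: allow a later restart

    def put(self, item):
        if self._thread is None:  # lazily (re)start the worker
            self._thread = threading.Thread(target=self._loop, daemon=True)
            self._thread.start()
        self._queue.put_nowait(item)
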
maxframe/core/entity/objects.py CHANGED
@@ -15,6 +15,7 @@
 from typing import Any, Dict
 
 from ...serialization.serializables import FieldTypes, ListField
+from ...utils import skip_na_call
 from .chunks import Chunk, ChunkData
 from .core import Entity
 from .executable import _ToObjectMixin
@@ -62,8 +63,8 @@ class ObjectData(TileableData, _ToObjectMixin):
     _chunks = ListField(
         "chunks",
         FieldTypes.reference(ObjectChunkData),
-        on_serialize=lambda x: [it.data for it in x] if x is not None else x,
-        on_deserialize=lambda x: [ObjectChunk(it) for it in x] if x is not None else x,
+        on_serialize=skip_na_call(lambda x: [it.data for it in x]),
+        on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
     )
 
     def __init__(self, op=None, nsplits=None, **kw):
maxframe/core/graph/core.pyx CHANGED
@@ -404,10 +404,10 @@ cdef class DirectedGraph:
 
         Fusion(self).decompose(nodes=nodes)
 
-    def view(self, filename='default', graph_attrs=None, node_attrs=None, result_chunk_keys=None, show_columns=False):  # pragma: no cover
+    def view(self, filename='default', graph_attrs=None, trunc_key=5, node_attrs=None, result_chunk_keys=None, show_columns=False):  # pragma: no cover
         from graphviz import Source
 
-        g = Source(self.to_dot(graph_attrs, node_attrs, result_chunk_keys=result_chunk_keys, show_columns=show_columns))
+        g = Source(self.to_dot(graph_attrs, node_attrs, trunc_key=trunc_key, result_chunk_keys=result_chunk_keys, show_columns=show_columns))
         g.view(filename=filename, cleanup=True)
 
     def to_dag(self):
maxframe/core/operator/base.py CHANGED
@@ -287,6 +287,20 @@ class Operator(Base, OperatorLogicKeyGeneratorMixin, metaclass=OperatorMetaclass
         self.check_inputs(inputs)
         setattr(self, "_inputs", inputs)
 
+    def replace_input(self, index: int, replaced_input: ENTITY_TYPE):
+        """
+        Replace the input at ``index`` with ``replaced_input``.
+
+        Parameters
+        ----------
+        index : int
+            Index of the input to be replaced.
+        replaced_input : ENTITY_TYPE
+            The new input object.
+        """
+        self.inputs[index] = replaced_input
+        self._set_inputs(self.inputs)
+
     @property
     def inputs(self) -> List[Union[ENTITY_TYPE]]:
         inputs = self._inputs
maxframe/dataframe/__init__.py CHANGED
@@ -35,9 +35,11 @@ from .datasource.from_index import series_from_index
 from .datasource.from_records import from_records
 from .datasource.from_tensor import dataframe_from_tensor, series_from_tensor
 from .datasource.read_csv import read_csv
+from .datasource.read_odps_query import read_odps_query
 from .datasource.read_odps_table import read_odps_table
 from .datasource.read_parquet import read_parquet
 from .datastore.to_odps import to_odps_table
+from .groupby import NamedAgg
 from .initializer import DataFrame, Index, Series, read_pandas
 from .merge import concat, merge
 from .misc.cut import cut
@@ -51,7 +53,7 @@ from .reduction import CustomReduction, unique
 from .tseries.to_datetime import to_datetime
 
 try:
-    from pandas import NA, NamedAgg, Timestamp
+    from pandas import NA, Timestamp
 except ImportError:  # pragma: no cover
     pass
 
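
With this change, NamedAgg is exposed from maxframe's own groupby module instead of being re-exported from pandas. A hedged usage sketch follows; the data is made up and it assumes the pandas-style named-aggregation calling convention plus a configured session for execute().

import maxframe.dataframe as md

df = md.DataFrame({"group": ["a", "a", "b"], "value": [1, 2, 3]})
agg = df.groupby("group").agg(
    total=md.NamedAgg(column="value", aggfunc="sum"),
    largest=md.NamedAgg(column="value", aggfunc="max"),
)
print(agg.execute())  # assumes a MaxFrame session has been created
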
maxframe/dataframe/datasource/from_records.py CHANGED
@@ -38,6 +38,10 @@ class DataFrameFromRecords(DataFrameOperator, DataFrameOperatorMixin):
             raise NotImplementedError("Specifying index value is not supported for now")
         super().__init__(columns=columns, _output_types=[OutputType.dataframe], **kw)
 
+    @property
+    def input(self):
+        return self._inputs[0]
+
     def __call__(self, data):
         if self.nrows is None:
             nrows = data.shape[0]
maxframe/dataframe/datasource/read_odps_query.py ADDED
@@ -0,0 +1,295 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import dataclasses
+import re
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
+from odps import ODPS
+from odps.types import Column, OdpsSchema, validate_data_type
+
+from ... import opcodes
+from ...core import OutputType
+from ...core.graph import DAG
+from ...odpsio import odps_schema_to_pandas_dtypes
+from ...serialization.serializables import (
+    AnyField,
+    BoolField,
+    FieldTypes,
+    Int64Field,
+    ListField,
+    SeriesField,
+    StringField,
+)
+from ..utils import parse_index
+from .core import ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource
+
+_EXPLAIN_DEPENDS_REGEX = re.compile(r"([^\s]+) depends on: ([^\n]+)")
+_EXPLAIN_JOB_REGEX = re.compile(r"(\S+) is root job")
+_EXPLAIN_TASKS_HEADER_REGEX = re.compile(r"In Job ([^:]+):")
+_EXPLAIN_ROOT_TASKS_REGEX = re.compile(r"root Tasks: (.+)")
+_EXPLAIN_TASK_REGEX = re.compile(r"In Task ([^:]+)")
+_EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
+    r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
+    re.MULTILINE,
+)
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
+
+
+@dataclasses.dataclass
+class DependencySector:
+    roots: List[str]
+    dependencies: List[Tuple[str, str]]
+
+    def build_dag(self) -> DAG:
+        dag = DAG()
+        for r in self.roots:
+            dag.add_node(r)
+        for v_from, v_to in self.dependencies:
+            dag.add_node(v_from)
+            dag.add_node(v_to)
+            dag.add_edge(v_from, v_to)
+        return dag
+
+
+@dataclasses.dataclass
+class JobsSector(DependencySector):
+    jobs: Dict[str, "TasksSector"] = dataclasses.field(default_factory=dict)
+
+
+@dataclasses.dataclass
+class TasksSector(DependencySector):
+    job_name: str
+    tasks: Dict[str, "TaskSector"] = dataclasses.field(default_factory=dict)
+
+
+@dataclasses.dataclass
+class ColumnSchema:
+    column_name: str
+    column_type: str
+    column_alias: Optional[str]
+
+
+@dataclasses.dataclass
+class TaskSector:
+    job_name: str
+    task_name: str
+    output_target: Optional[str]
+    schema: List[ColumnSchema]
+
+
+def _split_explain_string(explain_string: str) -> List[str]:
+    parts = explain_string.split("\n\n")
+    final_parts = []
+    grouped = []
+    for part in parts:
+        part = part.strip("\n")
+        if grouped and not part.startswith(" "):
+            final_parts.append("\n\n".join(grouped).strip())
+            grouped = []
+        grouped.append(part)
+    if grouped:
+        final_parts.append("\n\n".join(grouped).strip())
+    return final_parts
+
+
+def _find_all_deps(sector: str) -> List[Tuple[str, str]]:
+    deps = []
+    for match in _EXPLAIN_DEPENDS_REGEX.findall(sector):
+        descendant = match[0]
+        for r in match[1].split(","):
+            deps.append((r.strip(), descendant))
+    return deps
+
+
+def _resolve_jobs_sector(sector: str) -> JobsSector:
+    match = _EXPLAIN_JOB_REGEX.search(sector)
+    roots = [r.strip() for r in match.group(1).split(",")]
+    deps = _find_all_deps(sector)
+    return JobsSector(roots, deps)
+
+
+def _resolve_tasks_sector(sector: str) -> TasksSector:
+    match = _EXPLAIN_ROOT_TASKS_REGEX.search(sector)
+    roots = [r.strip() for r in match.group(1).split(",")]
+
+    match = _EXPLAIN_TASKS_HEADER_REGEX.search(sector)
+    job_name = match.group(1)
+
+    deps = _find_all_deps(sector)
+    return TasksSector(roots, deps, job_name)
+
+
+def _resolve_task_sector(job_name: str, sector: str) -> TaskSector:
+    match = _EXPLAIN_TASK_REGEX.match(sector)
+    task_name = match.group(1)
+
+    match = _EXPLAIN_TASK_SCHEMA_REGEX.match(sector)
+    if match is None:
+        return TaskSector(job_name, task_name, None, [])
+
+    out_target = match.group(2)
+    out_schema = match.group(3)
+
+    schemas = []
+    for match in _EXPLAIN_COLUMN_REGEX.findall(out_schema):
+        col_name, data_type, alias = match
+        schemas.append(ColumnSchema(col_name.strip(), data_type.strip(), alias.strip()))
+    return TaskSector(job_name, task_name, out_target, schemas)
+
+
+def _parse_explained_schema(explain_string: str) -> OdpsSchema:
+    sectors = _split_explain_string(explain_string)
+    jobs_sector = tasks_sector = None
+
+    for sector in sectors:
+        if _EXPLAIN_JOB_REGEX.search(sector):
+            jobs_sector = _resolve_jobs_sector(sector)
+        elif _EXPLAIN_TASKS_HEADER_REGEX.search(sector):
+            tasks_sector = _resolve_tasks_sector(sector)
+            assert jobs_sector is not None
+            jobs_sector.jobs[tasks_sector.job_name] = tasks_sector
+        elif _EXPLAIN_TASK_REGEX.search(sector):
+            assert tasks_sector is not None
+            task_sector = _resolve_task_sector(tasks_sector.job_name, sector)
+            tasks_sector.tasks[task_sector.task_name] = task_sector
+
+    job_dag = jobs_sector.build_dag()
+    indep_job_names = list(job_dag.iter_indep(reverse=True))
+    if len(indep_job_names) > 1:  # pragma: no cover
+        raise ValueError("Only one final job is allowed in SQL statement")
+
+    tasks_sector = jobs_sector.jobs[indep_job_names[0]]
+    task_dag = tasks_sector.build_dag()
+    indep_task_names = list(task_dag.iter_indep(reverse=True))
+    if len(indep_task_names) > 1:  # pragma: no cover
+        raise ValueError("Only one final task is allowed in SQL statement")
+
+    task_sector = tasks_sector.tasks[indep_task_names[0]]
+    if not task_sector.schema:  # pragma: no cover
+        raise ValueError("Cannot detect output schema")
+    if task_sector.output_target != "Screen":
+        raise ValueError("The SQL statement should be an instant query")
+    cols = [
+        Column(c.column_alias or c.column_name, validate_data_type(c.column_type))
+        for c in task_sector.schema
+    ]
+    return OdpsSchema(cols)
+
+
+class DataFrameReadODPSQuery(
+    IncrementalIndexDatasource,
+    ColumnPruneSupportedDataSourceMixin,
+):
+    _op_type_ = opcodes.READ_ODPS_QUERY
+
+    query = StringField("query")
+    dtypes = SeriesField("dtypes", default=None)
+    columns = AnyField("columns", default=None)
+    nrows = Int64Field("nrows", default=None)
+    use_arrow_dtype = BoolField("use_arrow_dtype", default=None)
+    string_as_binary = BoolField("string_as_binary", default=None)
+    index_columns = ListField("index_columns", FieldTypes.string, default=None)
+    index_dtypes = SeriesField("index_dtypes", default=None)
+
+    def get_columns(self):
+        return self.columns
+
+    def set_pruned_columns(self, columns, *, keep_order=None):  # pragma: no cover
+        self.columns = columns
+
+    def __call__(self, chunk_bytes=None, chunk_size=None):
+        if not self.index_columns:
+            index_value = parse_index(pd.RangeIndex(0))
+        elif len(self.index_columns) == 1:
+            index_value = parse_index(
+                pd.Index([], name=self.index_columns[0]).astype(self.index_dtypes[0])
+            )
+        else:
+            idx = pd.MultiIndex.from_frame(
+                pd.DataFrame([], columns=self.index_columns).astype(self.index_dtypes)
+            )
+            index_value = parse_index(idx)
+
+        columns_value = parse_index(self.dtypes.index, store_data=True)
+        self.output_types = [OutputType.dataframe]
+        return self.new_tileable(
+            [],
+            None,
+            shape=(len(self.dtypes), np.nan),
+            dtypes=self.dtypes,
+            index_value=index_value,
+            columns_value=columns_value,
+            chunk_bytes=chunk_bytes,
+            chunk_size=chunk_size,
+        )
+
+
+def read_odps_query(
+    query: str,
+    odps_entry: ODPS = None,
+    index_col: Union[None, str, List[str]] = None,
+    string_as_binary: bool = None,
+    **kw,
+):
+    """
+    Read data from a MaxCompute (ODPS) query into a DataFrame.
+
+    Supports specifying some columns as indexes. If not specified, a RangeIndex
+    will be generated.
+
+    Parameters
+    ----------
+    query: str
+        MaxCompute SQL statement.
+    index_col: Union[None, str, List[str]]
+        Columns to be specified as indexes.
+
+    Returns
+    -------
+    result: DataFrame
+        DataFrame read from the MaxCompute (ODPS) query.
+    """
+    odps_entry = odps_entry or ODPS.from_environments()
+    inst = odps_entry.execute_sql(f"EXPLAIN {query}")
+    explain_str = list(inst.get_task_results().values())[0]
+
+    odps_schema = _parse_explained_schema(explain_str)
+    dtypes = odps_schema_to_pandas_dtypes(odps_schema)
+
+    if not index_col:
+        index_dtypes = None
+    else:
+        if isinstance(index_col, str):
+            index_col = [index_col]
+        index_col_set = set(index_col)
+        data_cols = [c for c in dtypes.index if c not in index_col_set]
+        idx_dtype_vals = [dtypes[c] for c in index_col]
+        col_dtype_vals = [dtypes[c] for c in data_cols]
+        index_dtypes = pd.Series(idx_dtype_vals, index=index_col)
+        dtypes = pd.Series(col_dtype_vals, index=data_cols)
+
+    chunk_bytes = kw.pop("chunk_bytes", None)
+    chunk_size = kw.pop("chunk_size", None)
+    op = DataFrameReadODPSQuery(
+        query=query,
+        dtypes=dtypes,
+        use_arrow_dtype=kw.pop("use_arrow_dtype", True),
+        string_as_binary=string_as_binary,
+        index_columns=index_col,
+        index_dtypes=index_dtypes,
+    )
+    return op(chunk_bytes=chunk_bytes, chunk_size=chunk_size)
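
Based on the docstring above, a typical call looks like the sketch below. The table and column names are made up, and an ODPS account must be reachable from environment variables or passed explicitly through odps_entry.

import maxframe.dataframe as md

# The output schema is detected up front by running EXPLAIN on the statement;
# the query must be an instant query whose final task writes to Screen.
df = md.read_odps_query(
    "SELECT id, category, SUM(price) AS total FROM sales GROUP BY id, category",
    index_col="id",
)
print(df.dtypes)
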
maxframe/dataframe/datasource/read_odps_table.py CHANGED
@@ -69,7 +69,7 @@ class DataFrameReadODPSTable(
         return getattr(self, "partition_spec", None)
 
     def get_columns(self):
-        return self.columns
+        return self.columns or list(self.dtypes.index)
 
     def set_pruned_columns(self, columns, *, keep_order=None):  # pragma: no cover
         self.columns = columns