maxframe-0.1.0b2-cp310-cp310-win_amd64.whl → maxframe-0.1.0b4-cp310-cp310-win_amd64.whl
- maxframe/_utils.cp310-win_amd64.pyd +0 -0
- maxframe/codegen.py +88 -19
- maxframe/config/config.py +9 -0
- maxframe/core/entity/executable.py +1 -0
- maxframe/core/entity/objects.py +3 -2
- maxframe/core/graph/core.cp310-win_amd64.pyd +0 -0
- maxframe/dataframe/__init__.py +7 -1
- maxframe/dataframe/core.py +4 -2
- maxframe/dataframe/datasource/read_odps_query.py +4 -2
- maxframe/dataframe/datasource/read_odps_table.py +3 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +22 -0
- maxframe/dataframe/datastore/core.py +19 -0
- maxframe/dataframe/datastore/to_csv.py +2 -2
- maxframe/dataframe/datastore/to_odps.py +2 -2
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/core.py +5 -0
- maxframe/dataframe/indexing/reset_index.py +1 -17
- maxframe/lib/aio/isolation.py +6 -1
- maxframe/lib/mmh3.cp310-win_amd64.pyd +0 -0
- maxframe/odpsio/arrow.py +8 -3
- maxframe/odpsio/schema.py +18 -5
- maxframe/odpsio/tests/test_schema.py +25 -0
- maxframe/opcodes.py +5 -0
- maxframe/protocol.py +7 -0
- maxframe/serialization/core.cp310-win_amd64.pyd +0 -0
- maxframe/serialization/serializables/core.py +6 -1
- maxframe/serialization/serializables/field.py +2 -0
- maxframe/session.py +4 -2
- maxframe/tensor/core.py +3 -3
- maxframe/tests/test_codegen.py +69 -0
- maxframe/tests/test_protocol.py +16 -8
- maxframe/tests/utils.py +1 -0
- maxframe/utils.py +20 -1
- {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/METADATA +1 -1
- {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/RECORD +42 -40
- maxframe_client/clients/framedriver.py +7 -7
- maxframe_client/session/odps.py +11 -10
- maxframe_client/session/task.py +8 -1
- maxframe_client/session/tests/test_task.py +29 -11
- maxframe_client/tests/test_session.py +23 -0
- {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/WHEEL +0 -0
- {maxframe-0.1.0b2.dist-info → maxframe-0.1.0b4.dist-info}/top_level.txt +0 -0
maxframe/_utils.cp310-win_amd64.pyd
CHANGED
Binary file
maxframe/codegen.py
CHANGED
@@ -17,7 +17,7 @@ import base64
 import dataclasses
 import logging
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union

 from odps.types import OdpsSchema
 from odps.utils import camel_to_underline
@@ -30,6 +30,7 @@ from .odpsio import build_dataframe_table_meta
 from .odpsio.schema import pandas_to_odps_schema
 from .protocol import DataFrameTableMeta, ResultInfo
 from .serialization import PickleContainer
+from .serialization.serializables import Serializable, StringField
 from .typing_ import PandasObjectTypes
 from .udf import MarkedFunction

@@ -48,8 +49,11 @@ class CodeGenResult:
     constants: Dict[str, Any]


-class AbstractUDF(abc.ABC):
-    _session_id: str
+class AbstractUDF(Serializable):
+    _session_id: str = StringField("session_id")
+
+    def __init__(self, session_id: Optional[str] = None, **kw):
+        super().__init__(_session_id=session_id, **kw)

     @property
     def name(self) -> str:
@@ -74,7 +78,66 @@ class AbstractUDF(abc.ABC):

 class UserCodeMixin:
     @classmethod
-    def
+    def obj_to_python_expr(cls, obj: Any = None) -> str:
+        """
+        Parameters
+        ----------
+        obj
+            The object to convert to python expr.
+        Returns
+        -------
+        str :
+            The str type content equals to the object when use in the python code directly.
+        """
+        if obj is None:
+            return "None"
+
+        if isinstance(obj, (int, float)):
+            return repr(obj)
+
+        if isinstance(obj, bool):
+            return "True" if obj else "False"
+
+        if isinstance(obj, bytes):
+            base64_bytes = base64.b64encode(obj)
+            return f"base64.b64decode({base64_bytes})"
+
+        if isinstance(obj, str):
+            return repr(obj)
+
+        if isinstance(obj, list):
+            return (
+                f"[{', '.join([cls.obj_to_python_expr(element) for element in obj])}]"
+            )
+
+        if isinstance(obj, dict):
+            items = (
+                f"{repr(key)}: {cls.obj_to_python_expr(value)}"
+                for key, value in obj.items()
+            )
+            return f"{{{', '.join(items)}}}"
+
+        if isinstance(obj, tuple):
+            return f"({', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}{',' if len(obj) == 1 else ''})"
+
+        if isinstance(obj, set):
+            return (
+                f"{{{', '.join([cls.obj_to_python_expr(sub_obj) for sub_obj in obj])}}}"
+                if obj
+                else "set()"
+            )
+
+        if isinstance(obj, PickleContainer):
+            return UserCodeMixin.generate_pickled_codes(obj, None)
+
+        raise ValueError(f"not support arg type {type(obj)}")
+
+    @classmethod
+    def generate_pickled_codes(
+        cls,
+        code_to_pickle: Any,
+        unpicked_data_var_name: Union[str, None] = "pickled_data",
+    ) -> str:
         """
         Generate pickled codes. The final pickled variable is called 'pickled_data'.

@@ -82,20 +145,20 @@ class UserCodeMixin:
         ----------
         code_to_pickle: Any
             The code to be pickled.
+        unpicked_data_var_name: str
+            The variables in code used to hold the loads object from the cloudpickle

         Returns
         -------
-
-        The code snippets of pickling, the final variable is called 'pickled_data'.
+        str :
+            The code snippets of pickling, the final variable is called 'pickled_data' by default.
         """
         pickled, buffers = cls.dump_pickled_data(code_to_pickle)
-
-
-
-
-
-            f"pickled_data = cloudpickle.loads(base64_data, buffers=[{buffers_str}])",
-        ]
+        pickle_loads_expr = f"cloudpickle.loads({cls.obj_to_python_expr(pickled)}, buffers={cls.obj_to_python_expr(buffers)})"
+        if unpicked_data_var_name:
+            return f"{unpicked_data_var_name} = {pickle_loads_expr}"
+
+        return pickle_loads_expr

     @staticmethod
     def dump_pickled_data(
@@ -114,8 +177,9 @@ class UserCodeMixin:


 class BigDagCodeContext(metaclass=abc.ABCMeta):
-    def __init__(self, session_id: str = None):
+    def __init__(self, session_id: str = None, subdag_id: str = None):
         self._session_id = session_id
+        self._subdag_id = subdag_id
         self._tileable_key_to_variables = dict()
         self.constants = dict()
         self._data_table_meta_cache = dict()
@@ -142,10 +206,14 @@ class BigDagCodeContext(metaclass=abc.ABCMeta):
         except KeyError:
             var_name = self._tileable_key_to_variables[
                 tileable.key
-            ] =
-            self._next_var_id += 1
+            ] = self.next_var_name()
         return var_name

+    def next_var_name(self) -> str:
+        var_name = f"var_{self._next_var_id}"
+        self._next_var_id += 1
+        return var_name
+
     def get_odps_schema(
         self, data: PandasObjectTypes, unknown_as_string: bool = False
     ) -> OdpsSchema:
@@ -275,9 +343,10 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
     engine_priority: int = 0
     _extension_loaded = False

-    def __init__(self, session_id: str):
+    def __init__(self, session_id: str, subdag_id: str = None):
         self._session_id = session_id
-        self.
+        self._subdag_id = subdag_id
+        self._context = self._init_context(session_id, subdag_id)

     @classmethod
     def _load_engine_extensions(cls):
@@ -307,7 +376,7 @@ class BigDagCodeGenerator(metaclass=abc.ABCMeta):
         raise NotImplementedError

     @abc.abstractmethod
-    def _init_context(self, session_id: str) -> BigDagCodeContext:
+    def _init_context(self, session_id: str, subdag_id: str) -> BigDagCodeContext:
         raise NotImplementedError

     def _generate_comments(
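
Note: the new obj_to_python_expr classmethod renders plain Python values as source-code literals for embedding in generated scripts. One quirk worth flagging: because bool is a subclass of int, True and False are caught by the (int, float) branch first, so the dedicated bool branch is effectively unreachable; repr() happens to produce the same result either way. A sketch of the expected behavior, inferred from the added code above rather than tested against the released wheel:

    UserCodeMixin.obj_to_python_expr(None)        # -> "None"
    UserCodeMixin.obj_to_python_expr(42)          # -> "42"
    UserCodeMixin.obj_to_python_expr("text")      # -> "'text'" (repr-quoted)
    UserCodeMixin.obj_to_python_expr((1,))        # -> "(1,)"  (1-tuple keeps its comma)
    UserCodeMixin.obj_to_python_expr({"k": {1}})  # -> "{'k': {1}}"
    UserCodeMixin.obj_to_python_expr(set())       # -> "set()"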
maxframe/config/config.py
CHANGED
@@ -340,6 +340,12 @@ default_options.register_option(
     validator=is_integer,
     remote=True,
 )
+default_options.register_option(
+    "session.subinstance_priority",
+    None,
+    validator=any_validator(is_null, is_integer),
+    remote=True,
+)

 default_options.register_option("warn_duplicated_execution", False, validator=is_bool)
 default_options.register_option("dataframe.use_arrow_dtype", True, validator=is_bool)
@@ -352,6 +358,9 @@ default_options.register_option(
 default_options.register_option(
     "show_progress", "auto", validator=any_validator(is_bool, is_string)
 )
+default_options.register_option(
+    "dag.settings", value=dict(), validator=is_dict, remote=True
+)

 ################
 # SPE Settings #
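
Note: both new options are registered with remote=True, so their values are forwarded to the service side. A hedged usage sketch (the options accessor is assumed from the module's existing mars-style config pattern; the settings key is purely illustrative):

    from maxframe.config import options  # accessor path assumed

    options.session.subinstance_priority = 5       # None or an integer, per the validator
    options.dag.settings = {"some.engine.flag": "true"}  # free-form dict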
maxframe/core/entity/objects.py
CHANGED
@@ -15,6 +15,7 @@
 from typing import Any, Dict

 from ...serialization.serializables import FieldTypes, ListField
+from ...utils import skip_na_call
 from .chunks import Chunk, ChunkData
 from .core import Entity
 from .executable import _ToObjectMixin
@@ -62,8 +63,8 @@ class ObjectData(TileableData, _ToObjectMixin):
     _chunks = ListField(
         "chunks",
         FieldTypes.reference(ObjectChunkData),
-        on_serialize=lambda x: [it.data for it in x],
-        on_deserialize=lambda x: [ObjectChunk(it) for it in x],
+        on_serialize=skip_na_call(lambda x: [it.data for it in x]),
+        on_deserialize=skip_na_call(lambda x: [ObjectChunk(it) for it in x]),
     )

     def __init__(self, op=None, nsplits=None, **kw):
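
Note: skip_na_call comes from the maxframe/utils.py change listed above (+20 -1) but not shown in this section. From its use here it plausibly guards the serialize/deserialize hooks against a None chunk list. A minimal sketch of such a wrapper, assuming that semantics:

    import functools

    def skip_na_call(func):
        # Hypothetical equivalent: pass None through instead of calling func on it.
        @functools.wraps(func)
        def wrapper(value):
            return None if value is None else func(value)

        return wrapper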
maxframe/core/graph/core.cp310-win_amd64.pyd
CHANGED
Binary file
maxframe/dataframe/__init__.py
CHANGED
@@ -39,6 +39,7 @@ from .datasource.read_odps_query import read_odps_query
 from .datasource.read_odps_table import read_odps_table
 from .datasource.read_parquet import read_parquet
 from .datastore.to_odps import to_odps_table
+from .groupby import NamedAgg
 from .initializer import DataFrame, Index, Series, read_pandas
 from .merge import concat, merge
 from .misc.cut import cut
@@ -52,7 +53,12 @@ from .reduction import CustomReduction, unique
 from .tseries.to_datetime import to_datetime

 try:
-    from pandas import NA,
+    from pandas import NA, Timestamp
+except ImportError:  # pragma: no cover
+    pass
+
+try:
+    from . import _internal
 except ImportError:  # pragma: no cover
     pass

maxframe/dataframe/core.py
CHANGED
@@ -960,7 +960,9 @@ class BaseSeriesData(HasShapeTileableData, _ToPandasMixin):
         buf = StringIO()
         max_rows = pd.get_option("display.max_rows")
         corner_max_rows = (
-            max_rows
+            max_rows
+            if self.shape[0] <= max_rows or corner_data.shape[0] == 0
+            else corner_data.shape[0] - 1
         )  # make sure max_rows < corner_data

         with pd.option_context("display.max_rows", corner_max_rows):
@@ -1605,7 +1607,7 @@ class DataFrameData(_BatchedFetcher, BaseDataFrameData):
         buf = StringIO()
         max_rows = pd.get_option("display.max_rows")

-        if self.shape[0] <= max_rows:
+        if self.shape[0] <= max_rows or corner_data.shape[0] == 0:
             buf.write(repr(corner_data) if representation else str(corner_data))
         else:
             # remember we cannot directly call repr(df),
maxframe/dataframe/datasource/read_odps_query.py
CHANGED
@@ -46,7 +46,7 @@ _EXPLAIN_TASK_SCHEMA_REGEX = re.compile(
     r"In Task ([^:]+)[\S\s]+FS: output: ([^\n #]+)[\s\S]+schema:\s+([\S\s]+)$",
     re.MULTILINE,
 )
-_EXPLAIN_COLUMN_REGEX = re.compile(r"([
+_EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")


 @dataclasses.dataclass
@@ -263,7 +263,9 @@ def read_odps_query(
     result: DataFrame
         DataFrame read from MaxCompute (ODPS) table
     """
-    odps_entry = odps_entry or ODPS.from_environments()
+    odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    if odps_entry is None:
+        raise ValueError("Missing odps_entry parameter")
     inst = odps_entry.execute_sql(f"EXPLAIN {query}")
     explain_str = list(inst.get_task_results().values())[0]

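
Note: the rebuilt _EXPLAIN_COLUMN_REGEX now captures an optional " AS alias" suffix in EXPLAIN output, matching the ColumnSchema(name, type, alias) triples asserted by the new test below. Illustrative matches (regex copied verbatim from the diff):

    import re

    _EXPLAIN_COLUMN_REGEX = re.compile(r"([^\(]+) \(([^)]+)\)(?:| AS ([^ ]+))(?:\n|$)")
    _EXPLAIN_COLUMN_REGEX.match("id (bigint) AS id_alias").groups()
    # -> ('id', 'bigint', 'id_alias')
    _EXPLAIN_COLUMN_REGEX.match("listing_url (string)").groups()
    # -> ('listing_url', 'string', None)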
maxframe/dataframe/datasource/read_odps_table.py
CHANGED
@@ -69,7 +69,7 @@ class DataFrameReadODPSTable(
         return getattr(self, "partition_spec", None)

     def get_columns(self):
-        return self.columns
+        return self.columns or list(self.dtypes.index)

     def set_pruned_columns(self, columns, *, keep_order=None):  # pragma: no cover
         self.columns = columns
@@ -164,6 +164,8 @@ def read_odps_table(
     DataFrame read from MaxCompute (ODPS) table
     """
     odps_entry = odps_entry or ODPS.from_global() or ODPS.from_environments()
+    if odps_entry is None:
+        raise ValueError("Missing odps_entry parameter")
     if isinstance(table_name, Table):
         table = table_name
     else:
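
Note: get_columns previously returned self.columns as-is, which is None until column pruning runs; it now falls back to the full dtypes index. Illustrative effect (table and column names hypothetical):

    df = read_odps_table("test_table")   # no column pruning applied yet
    df.op.get_columns()                  # -> ["col1", "col2", "col3"] instead of None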
maxframe/dataframe/datasource/tests/test_datasource.py
CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
 from collections import OrderedDict

 import numpy as np
@@ -33,6 +34,7 @@ from ..from_tensor import (
 )
 from ..index import from_pandas as from_pandas_index
 from ..index import from_tileable
+from ..read_odps_query import ColumnSchema, _resolve_task_sector
 from ..series import from_pandas as from_pandas_series

 ray = lazy_import("ray")
@@ -228,6 +230,7 @@ def test_from_odps_table():
     assert df.op.table_name == test_table.full_table_name
     assert df.index_value.name is None
     assert isinstance(df.index_value.value, IndexValue.RangeIndex)
+    assert df.op.get_columns() == ["col1", "col2", "col3"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series(
@@ -247,6 +250,7 @@ def test_from_odps_table():
     assert df.op.table_name == test_table.full_table_name
     assert df.index_value.name is None
     assert isinstance(df.index_value.value, IndexValue.RangeIndex)
+    assert df.op.get_columns() == ["col1", "col2"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series([np.dtype("O"), np.dtype("int64")], index=["col1", "col2"]),
@@ -257,6 +261,7 @@ def test_from_odps_table():
     assert df.index_value.name == "col1"
     assert isinstance(df.index_value.value, IndexValue.Index)
     assert df.index.dtype == np.dtype("O")
+    assert df.op.get_columns() == ["col2", "col3"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series([np.dtype("int64"), np.dtype("float64")], index=["col2", "col3"]),
@@ -267,6 +272,7 @@ def test_from_odps_table():

     df = read_odps_table(test_parted_table, append_partitions=True)
     assert df.op.append_partitions is True
+    assert df.op.get_columns() == ["col1", "col2", "col3", "pt"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series(
@@ -280,6 +286,7 @@ def test_from_odps_table():
     )
     assert df.op.append_partitions is True
     assert df.op.partitions == ["pt=20240103"]
+    assert df.op.get_columns() == ["col1", "col2", "pt"]
     pd.testing.assert_series_equal(
         df.dtypes,
         pd.Series(
@@ -377,3 +384,18 @@ def test_date_range():
     assert dr.index_value.is_unique == expected.is_unique
     assert dr.index_value.is_monotonic_increasing == expected.is_monotonic_increasing
     assert dr.name == expected.name
+
+
+def test_resolve_task_sector():
+    input_path = os.path.join(os.path.dirname(__file__), "test-data", "task-input.txt")
+    with open(input_path, "r") as f:
+        sector = f.read()
+    actual_sector = _resolve_task_sector("job0", sector)
+
+    assert actual_sector.job_name == "job0"
+    assert actual_sector.task_name == "M1"
+    assert actual_sector.output_target == "Screen"
+    assert len(actual_sector.schema) == 78
+    assert actual_sector.schema[0] == ColumnSchema("unnamed: 0", "bigint", "")
+    assert actual_sector.schema[1] == ColumnSchema("id", "bigint", "id_alias")
+    assert actual_sector.schema[2] == ColumnSchema("listing_url", "string", "")
maxframe/dataframe/datastore/core.py
ADDED
@@ -0,0 +1,19 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..operators import DataFrameOperator, DataFrameOperatorMixin
+
+
+class DataFrameDataStore(DataFrameOperator, DataFrameOperatorMixin):
+    pass
maxframe/dataframe/datastore/to_csv.py
CHANGED
@@ -23,11 +23,11 @@ from ...serialization.serializables import (
     ListField,
     StringField,
 )
-from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
+from .core import DataFrameDataStore


-class DataFrameToCSV(
+class DataFrameToCSV(DataFrameDataStore):
     _op_type_ = opcodes.TO_CSV

     input = KeyField("input")
maxframe/dataframe/datastore/to_odps.py
CHANGED
@@ -32,13 +32,13 @@ from ...serialization.serializables import (
 )
 from ...typing_ import TileableType
 from ..core import DataFrame  # noqa: F401
-from ..operators import DataFrameOperator, DataFrameOperatorMixin
 from ..utils import parse_index
+from .core import DataFrameDataStore

 logger = logging.getLogger(__name__)


-class DataFrameToODPSTable(
+class DataFrameToODPSTable(DataFrameDataStore):
     _op_type_ = opcodes.TO_ODPS_TABLE

     dtypes = SeriesField("dtypes")
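
Note: DataFrameToCSV and DataFrameToODPSTable now share the DataFrameDataStore base added above, so datastore operators can be recognized generically. A plausible use, not taken from this diff:

    from maxframe.dataframe.datastore.core import DataFrameDataStore

    def is_store_op(op) -> bool:
        # Hypothetical helper: True for to_csv / to_odps_table terminal operators.
        return isinstance(op, DataFrameDataStore)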
maxframe/dataframe/groupby/core.py
CHANGED
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from collections import namedtuple
+
 import pandas as pd

 from ... import opcodes
@@ -30,6 +32,9 @@ _GROUP_KEYS_NO_DEFAULT = pd_release_version >= (1, 5, 0)
 _default_group_keys = no_default if _GROUP_KEYS_NO_DEFAULT else True


+NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
+
+
 class DataFrameGroupByOperator(MapReduceOperator, DataFrameOperatorMixin):
     _op_type_ = opcodes.GROUPBY

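
Note: NamedAgg mirrors pandas.NamedAgg (likewise a column/aggfunc named tuple) and is re-exported from maxframe.dataframe above. A usage sketch following pandas named-aggregation semantics, assuming maxframe's agg accepts the same keyword form (data is illustrative):

    import maxframe.dataframe as md

    df = md.DataFrame({"key": ["a", "a", "b"], "value": [1, 2, 3]})
    df.groupby("key").agg(
        total=md.NamedAgg(column="value", aggfunc="sum"),
        largest=md.NamedAgg(column="value", aggfunc="max"),
    )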
maxframe/dataframe/indexing/reset_index.py
CHANGED
@@ -107,7 +107,6 @@ def df_reset_index(
     inplace=False,
     col_level=0,
     col_fill="",
-    incremental_index=False,
 ):
     """
     Reset the index, or a level of it.
@@ -133,12 +132,6 @@ def df_reset_index(
     col_fill : object, default ''
         If the columns have multiple levels, determines how the other
         levels are named. If None then the index name is repeated.
-    incremental_index: bool, default False
-        Ensure RangeIndex incremental, when output DataFrame has multiple chunks,
-        ensuring index incremental costs more computation,
-        so by default, each chunk will have index which starts from 0,
-        setting incremental_index=True,reset_index will guarantee that
-        output DataFrame's index is from 0 to n - 1.

     Returns
     -------
@@ -264,7 +257,6 @@ def df_reset_index(
         drop=drop,
         col_level=col_level,
         col_fill=col_fill,
-        incremental_index=incremental_index,
         output_types=[OutputType.dataframe],
     )
     ret = op(df)
@@ -280,7 +272,6 @@ def series_reset_index(
     drop=False,
     name=no_default,
     inplace=False,
-    incremental_index=False,
 ):
     """
     Generate a new DataFrame or Series with the index reset.
@@ -303,12 +294,6 @@ def series_reset_index(
         when `drop` is True.
     inplace : bool, default False
         Modify the Series in place (do not create a new object).
-    incremental_index: bool, default False
-        Ensure RangeIndex incremental, when output Series has multiple chunks,
-        ensuring index incremental costs more computation,
-        so by default, each chunk will have index which starts from 0,
-        setting incremental_index=True,reset_index will guarantee that
-        output Series's index is from 0 to n - 1.

     Returns
     -------
@@ -406,8 +391,7 @@ def series_reset_index(
         level=level,
         drop=drop,
         name=name,
-
-        output_types=[OutputType.series],
+        output_types=[OutputType.series if drop else OutputType.dataframe],
    )
     ret = op(series)
     if not inplace:
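
Note: besides dropping the unsupported incremental_index argument, series_reset_index now declares its true output type: Series.reset_index() materializes a DataFrame unless drop=True. In plain pandas terms:

    import pandas as pd

    s = pd.Series([1, 2], name="v")
    s.reset_index()            # DataFrame with columns ["index", "v"]
    s.reset_index(drop=True)   # still a Series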
maxframe/lib/aio/isolation.py
CHANGED
@@ -14,11 +14,14 @@

 import asyncio
 import atexit
+import itertools
 import threading
 from typing import Dict, Optional


 class Isolation:
+    _counter = itertools.count().__next__
+
     loop: asyncio.AbstractEventLoop
     _stopped: Optional[asyncio.Event]
     _thread: Optional[threading.Thread]
@@ -38,7 +41,9 @@ class Isolation:

     def start(self):
         if self._threaded:
-            self._thread = thread = threading.Thread(
+            self._thread = thread = threading.Thread(
+                name=f"IsolationThread-{self._counter()}", target=self._run
+            )
             thread.daemon = True
             thread.start()
             self._thread_ident = thread.ident
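
Note: giving each isolation thread a distinct, monotonically numbered name (IsolationThread-0, IsolationThread-1, ...) makes them identifiable in thread dumps and debuggers:

    import threading

    [t.name for t in threading.enumerate()]
    # e.g. ['MainThread', 'IsolationThread-0']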
maxframe/lib/mmh3.cp310-win_amd64.pyd
CHANGED
Binary file
maxframe/odpsio/arrow.py
CHANGED
@@ -65,14 +65,19 @@ def arrow_to_pandas(
         raise ValueError(f"Does not support meta type {table_meta.type!r}")


-def pandas_to_arrow(
-
+def pandas_to_arrow(
+    df: Any, nthreads=1, ignore_index=False
+) -> Tuple[ArrowTableType, DataFrameTableMeta]:
+    table_meta = build_dataframe_table_meta(df, ignore_index)
     df = df.copy() if callable(getattr(df, "copy", None)) else df
     if table_meta.type in (OutputType.dataframe, OutputType.series):
         if table_meta.type == OutputType.series:
             df = df.to_frame("_data" if df.name is None else df.name)
         df.columns = pd.Index(table_meta.table_column_names)
-
+        if not ignore_index:
+            df = df.rename_axis(table_meta.table_index_column_names).reset_index()
+    elif ignore_index:
+        df = pd.DataFrame([], columns=[])
     elif table_meta.type == OutputType.index:
         names = [f"_idx_{idx}" for idx in range(len(df.names))]
         df = df.to_frame(name=names[0] if len(names) == 1 else names)
maxframe/odpsio/schema.py
CHANGED
@@ -175,7 +175,9 @@ def _scalar_as_index(df_obj: Any) -> pd.Index:


 def pandas_to_odps_schema(
-    df_obj: Any,
+    df_obj: Any,
+    unknown_as_string: bool = False,
+    ignore_index=False,
 ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
     from .. import dataframe as md
     from .arrow import pandas_to_arrow
@@ -209,7 +211,7 @@ def pandas_to_odps_schema(
     else:
         empty_df_obj = df_obj

-    arrow_data, table_meta = pandas_to_arrow(empty_df_obj)
+    arrow_data, table_meta = pandas_to_arrow(empty_df_obj, ignore_index=ignore_index)
     return (
         arrow_schema_to_odps_schema(
             arrow_data.schema, unknown_as_string=unknown_as_string
@@ -268,7 +270,9 @@ def build_table_column_name(
     return col_name


-def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
+def build_dataframe_table_meta(
+    df_obj: Any, ignore_index: bool = False
+) -> DataFrameTableMeta:
     from .. import dataframe as md

     col_to_count = defaultdict(lambda: 0)
@@ -285,6 +289,8 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")

+    assert not ignore_index or obj_type in (OutputType.dataframe, OutputType.series)
+
     if obj_type == OutputType.scalar:
         pd_dtypes = pd.Series([])
         column_index_names = []
@@ -340,12 +346,19 @@ def build_dataframe_table_meta(df_obj: Any) -> DataFrameTableMeta:
     else:
         index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)

+    if ignore_index:
+        table_index_column_names = []
+        pd_index_dtypes = pd.Series([], index=[])
+    else:
+        table_index_column_names = [f"_idx_{i}" for i in range(len(index_obj.names))]
+        pd_index_dtypes = index_dtypes
+
     return DataFrameTableMeta(
         table_name=table_name,
         type=obj_type,
         table_column_names=final_sql_columns,
-        table_index_column_names=
+        table_index_column_names=table_index_column_names,
         pd_column_dtypes=pd_dtypes,
         pd_column_level_names=column_index_names,
-        pd_index_dtypes=
+        pd_index_dtypes=pd_index_dtypes,
     )
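
Note: the ignore_index flag threads from pandas_to_odps_schema through pandas_to_arrow into build_dataframe_table_meta, and (per the new assert) is only legal for DataFrame and Series inputs. When set, the derived table schema omits the _idx_* index columns. A hedged sketch of the expected effect, not run against the released wheel:

    import pandas as pd
    from maxframe.odpsio.schema import pandas_to_odps_schema

    df = pd.DataFrame({"a": [1], "b": ["x"]})
    schema, meta = pandas_to_odps_schema(df, unknown_as_string=True, ignore_index=True)
    meta.table_index_column_names   # -> [] (no _idx_0 column in the ODPS schema)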