maxframe-1.0.0rc3-cp311-cp311-macosx_10_9_universal2.whl → maxframe-1.1.0-cp311-cp311-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maxframe/_utils.cpython-311-darwin.so +0 -0
- maxframe/codegen.py +1 -0
- maxframe/config/config.py +16 -1
- maxframe/conftest.py +52 -14
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/graph/core.cpython-311-darwin.so +0 -0
- maxframe/core/operator/base.py +2 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +17 -16
- maxframe/dataframe/core.py +26 -2
- maxframe/dataframe/datasource/read_odps_query.py +116 -28
- maxframe/dataframe/datasource/read_odps_table.py +3 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +93 -12
- maxframe/dataframe/datastore/to_odps.py +7 -0
- maxframe/dataframe/extensions/__init__.py +8 -0
- maxframe/dataframe/extensions/apply_chunk.py +649 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +314 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_apply_chunk.py +186 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/__init__.py +1 -0
- maxframe/dataframe/groupby/aggregation.py +1 -0
- maxframe/dataframe/groupby/apply.py +9 -1
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +1 -1
- maxframe/dataframe/groupby/transform.py +8 -2
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +1 -1
- maxframe/dataframe/merge/tests/test_merge.py +3 -1
- maxframe/dataframe/misc/apply.py +3 -0
- maxframe/dataframe/misc/drop_duplicates.py +23 -2
- maxframe/dataframe/misc/map.py +3 -1
- maxframe/dataframe/misc/tests/test_misc.py +24 -2
- maxframe/dataframe/misc/transform.py +22 -13
- maxframe/dataframe/reduction/__init__.py +3 -0
- maxframe/dataframe/reduction/aggregation.py +1 -0
- maxframe/dataframe/reduction/median.py +56 -0
- maxframe/dataframe/reduction/tests/test_reduction.py +17 -7
- maxframe/dataframe/statistics/quantile.py +8 -2
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/tests/test_utils.py +60 -0
- maxframe/dataframe/utils.py +110 -7
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/io/objects/tests/test_object_io.py +39 -12
- maxframe/io/odpsio/arrow.py +30 -2
- maxframe/io/odpsio/schema.py +28 -8
- maxframe/io/odpsio/tableio.py +55 -133
- maxframe/io/odpsio/tests/test_schema.py +40 -4
- maxframe/io/odpsio/tests/test_tableio.py +5 -5
- maxframe/io/odpsio/tests/test_volumeio.py +35 -11
- maxframe/io/odpsio/volumeio.py +36 -6
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/xgboost/classifier.py +3 -3
- maxframe/learn/contrib/xgboost/predict.py +8 -39
- maxframe/learn/contrib/xgboost/train.py +4 -3
- maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/opcodes.py +10 -1
- maxframe/protocol.py +6 -1
- maxframe/serialization/core.cpython-311-darwin.so +0 -0
- maxframe/serialization/core.pyx +13 -1
- maxframe/serialization/pandas.py +50 -20
- maxframe/serialization/serializables/core.py +24 -5
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +8 -1
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +19 -7
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/concatenate.py +23 -20
- maxframe/tensor/merge/vstack.py +5 -1
- maxframe/tensor/misc/transpose.py +1 -1
- maxframe/tests/utils.py +16 -0
- maxframe/udf.py +27 -0
- maxframe/utils.py +64 -14
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/METADATA +2 -2
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/RECORD +112 -96
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/WHEEL +1 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +28 -10
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/odps.py +104 -20
- maxframe_client/session/task.py +42 -26
- maxframe_client/session/tests/test_task.py +0 -4
- maxframe_client/tests/test_session.py +44 -12
- {maxframe-1.0.0rc3.dist-info → maxframe-1.1.0.dist-info}/top_level.txt +0 -0
maxframe/dataframe/window/expanding.py CHANGED

@@ -28,6 +28,7 @@ from .aggregation import BaseDataFrameExpandingAgg
 from .core import Window
 
 _window_has_method = pd_release_version >= (1, 3, 0)
+_window_has_center = pd_release_version < (2, 0, 0)
 
 
 class DataFrameExpandingAgg(BaseDataFrameExpandingAgg):
@@ -49,10 +50,11 @@ class Expanding(Window):
     def params(self):
         p = OrderedDict()
 
+        args = ["min_periods", "center", "axis", "method"]
         if not _window_has_method:  # pragma: no cover
-            args = [
-
-            args = [
+            args = [a for a in args if a != "method"]
+        if not _window_has_center:
+            args = [a for a in args if a != "center"]
 
         for k in args:
             p[k] = getattr(self, k)
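The two module-level flags gate which expanding-window arguments are exposed on a given pandas release. A minimal standalone sketch of the same filtering logic (the function name and the simplified version parsing here are illustrative, not part of maxframe):

import pandas as pd
from collections import OrderedDict

# Parse only major.minor; mirrors the gates in expanding.py above.
pd_release_version = tuple(int(p) for p in pd.__version__.split(".")[:2])
_window_has_method = pd_release_version >= (1, 3)
_window_has_center = pd_release_version < (2, 0)

def collect_expanding_params(obj):
    # Start from the full argument list and drop what the installed pandas lacks.
    args = ["min_periods", "center", "axis", "method"]
    if not _window_has_method:
        args = [a for a in args if a != "method"]
    if not _window_has_center:
        args = [a for a in args if a != "center"]
    p = OrderedDict()
    for k in args:
        p[k] = getattr(obj, k, None)
    return p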
maxframe/dataframe/window/tests/test_expanding.py CHANGED

@@ -29,8 +29,8 @@ def test_expanding():
     with pytest.raises(NotImplementedError):
         _ = df2.expanding(3, axis=1)
 
-    r = df2.expanding(3
-    expected = df.expanding(3
+    r = df2.expanding(3)
+    expected = df.expanding(3)
     assert repr(r) == repr(expected)
 
     assert "b" in dir(r)
maxframe/io/objects/tests/test_object_io.py CHANGED

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import numpy as np
 import pytest
 from odps import ODPS
@@ -48,15 +49,33 @@ def create_volume(request, oss_config):
         oss_bucket_name,
         oss_endpoint,
     ) = oss_config.oss_config
-
-
-
-
-
-
-
+
+    if "test" in oss_endpoint:
+        # offline config
+        test_location = "oss://%s:%s@%s/%s/%s" % (
+            oss_access_id,
+            oss_secret_access_key,
+            oss_endpoint,
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = None
+    else:
+        # online config
+        endpoint_parts = oss_endpoint.split(".", 1)
+        if "-internal" not in endpoint_parts[0]:
+            endpoint_parts[0] += "-internal"
+        test_location = "oss://%s/%s/%s" % (
+            ".".join(endpoint_parts),
+            oss_bucket_name,
+            oss_test_dir_name,
+        )
+        rolearn = oss_config.oss_rolearn
+
     oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
-    odps_entry.create_external_volume(
+    odps_entry.create_external_volume(
+        test_vol_name, location=test_location, rolearn=rolearn
+    )
 
     try:
         yield test_vol_name
@@ -75,8 +94,12 @@ def test_simple_object_io(create_volume):
 
     odps_entry = ODPS.from_environments()
 
-    reader = ODPSVolumeReader(
-
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
 
     handler = get_object_io_handler(obj)()
     handler.write_object(writer, obj, data)
@@ -89,8 +112,12 @@ def test_tensor_object_io(create_volume):
 
     odps_entry = ODPS.from_environments()
 
-    reader = ODPSVolumeReader(
-
+    reader = ODPSVolumeReader(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
+    writer = ODPSVolumeWriter(
+        odps_entry, create_volume, obj.key, replace_internal_host=True
+    )
 
     handler = get_object_io_handler(obj)()
     handler.write_object(writer, obj, data)
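The online branch of the fixture rewrites the OSS endpoint to its intranet variant before composing the external-volume location. The same string handling can be exercised on its own (a minimal sketch; endpoint, bucket and directory values are placeholders):

def build_online_oss_location(oss_endpoint, bucket_name, dir_name):
    # Switch the endpoint host to its "-internal" form if it is not already.
    endpoint_parts = oss_endpoint.split(".", 1)
    if "-internal" not in endpoint_parts[0]:
        endpoint_parts[0] += "-internal"
    return "oss://%s/%s/%s" % (".".join(endpoint_parts), bucket_name, dir_name)

print(build_online_oss_location("oss-cn-hangzhou.aliyuncs.com", "my-bucket", "maxframe-test"))
# -> oss://oss-cn-hangzhou-internal.aliyuncs.com/my-bucket/maxframe-test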
maxframe/io/odpsio/arrow.py CHANGED

@@ -69,13 +69,24 @@ def arrow_to_pandas(
 
 
 def pandas_to_arrow(
-    df: Any, nthreads=1, ignore_index=False
+    df: Any, nthreads=1, ignore_index=False, ms_cols=None
 ) -> Tuple[ArrowTableType, DataFrameTableMeta]:
     table_meta = build_dataframe_table_meta(df, ignore_index)
     df = df.copy() if callable(getattr(df, "copy", None)) else df
+    table_datetime_cols = None
     if table_meta.type in (OutputType.dataframe, OutputType.series):
         if table_meta.type == OutputType.series:
             df = df.to_frame("_data" if df.name is None else df.name)
+            if ms_cols:
+                table_datetime_cols = {"_data"}
+        elif ms_cols:
+            ms_col_set = set(ms_cols)
+            table_datetime_cols = set()
+            for pd_col, table_col in zip(
+                table_meta.pd_column_dtypes.keys(), table_meta.table_column_names
+            ):
+                if pd_col in ms_col_set:
+                    table_datetime_cols.add(table_col)
         df.columns = pd.Index(table_meta.table_column_names)
         if not ignore_index:
             df = df.rename_axis(table_meta.table_index_column_names).reset_index()
@@ -83,6 +94,12 @@ def pandas_to_arrow(
             df = pd.DataFrame([], columns=[])
     elif table_meta.type == OutputType.index:
         names = [f"_idx_{idx}" for idx in range(len(df.names))]
+        table_datetime_cols = set()
+        if ms_cols:
+            if isinstance(df, pd.MultiIndex):
+                table_datetime_cols = {f"_idx_{idx}" for idx in ms_cols}
+            else:
+                table_datetime_cols = {"_idx_0"}
         df = df.to_frame(name=names[0] if len(names) == 1 else names)
     elif table_meta.type == OutputType.scalar:
         names = ["_idx_0"]
@@ -92,4 +109,15 @@ def pandas_to_arrow(
         df = pd.DataFrame([[df]], columns=names)
     else:  # this could never happen  # pragma: no cover
         raise ValueError(f"Does not support meta type {table_meta.type!r}")
-
+    pa_table = pa.Table.from_pandas(df, nthreads=nthreads, preserve_index=False)
+    if table_datetime_cols:
+        col_names = pa_table.schema.names
+        col_datas = []
+        for idx, col_name in enumerate(pa_table.schema.names):
+            if col_name not in table_datetime_cols:
+                col_datas.append(pa_table.column(idx))
+                continue
+            col_data = pa_table.column(idx).cast(pa.timestamp("ms"))
+            col_datas.append(col_data)
+        pa_table = pa.Table.from_arrays(col_datas, names=col_names)
+    return pa_table, table_meta
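The tail of pandas_to_arrow now rebuilds the Arrow table so that columns collected in table_datetime_cols carry millisecond-precision timestamps. The cast can be reproduced with plain pandas/pyarrow (a minimal sketch; the column names are placeholders):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": [1, 2], "ts": pd.to_datetime(["2024-10-01", "2024-10-02"])})
pa_table = pa.Table.from_pandas(df, preserve_index=False)

table_datetime_cols = {"ts"}  # columns that should become timestamp[ms]
col_names = pa_table.schema.names
col_datas = []
for idx, col_name in enumerate(col_names):
    col = pa_table.column(idx)
    # Cast flagged columns to millisecond timestamps, keep the rest unchanged.
    col_datas.append(col.cast(pa.timestamp("ms")) if col_name in table_datetime_cols else col)
pa_table = pa.Table.from_arrays(col_datas, names=col_names)
print(pa_table.schema)  # "ts" is now timestamp[ms]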
maxframe/io/odpsio/schema.py CHANGED

@@ -16,6 +16,7 @@ import string
 from collections import defaultdict
 from typing import Any, Dict, Tuple
 
+import numpy as np
 import pandas as pd
 import pyarrow as pa
 from odps import types as odps_types
@@ -39,6 +40,7 @@ _arrow_to_odps_types = {
     pa.float64(): odps_types.double,
     pa.date32(): odps_types.date,
     pa.timestamp("ms"): odps_types.datetime,
+    pa.timestamp("us"): odps_types.timestamp,
     pa.timestamp("ns"): odps_types.timestamp,
 }
 
@@ -54,7 +56,9 @@ _odps_type_to_arrow = {
     odps_types.double: pa.float64(),
     odps_types.date: pa.date32(),
     odps_types.datetime: pa.timestamp("ms"),
+    odps_types.json: pa.string(),
     odps_types.timestamp: pa.timestamp("ns"),
+    odps_types.timestamp_ntz: pa.timestamp("ns"),
 }
 
 
@@ -166,7 +170,7 @@ def odps_schema_to_pandas_dtypes(
     return arrow_schema.empty_table().to_pandas().dtypes
 
 
-def
+def is_scalar_object(df_obj: Any) -> bool:
     return (
         isinstance(df_obj, TENSOR_TYPE) and df_obj.shape == ()
     ) or pd_types.is_scalar(df_obj)
@@ -187,7 +191,7 @@ def pandas_to_odps_schema(
     from ... import dataframe as md
     from .arrow import pandas_to_arrow
 
-    if
+    if is_scalar_object(df_obj):
         empty_index = None
     elif hasattr(df_obj, "index_value"):
         empty_index = df_obj.index_value.to_pandas()[:0]
@@ -203,20 +207,35 @@ def pandas_to_odps_schema(
     else:
         empty_columns = None
 
+    ms_cols = None
     if isinstance(df_obj, (md.DataFrame, pd.DataFrame)):
         empty_df_obj = pd.DataFrame(
             [], columns=empty_columns, index=empty_index
         ).astype(df_obj.dtypes)
+        ms_cols = [
+            col for col, dt in df_obj.dtypes.items() if dt == np.dtype("datetime64[ms]")
+        ]
     elif isinstance(df_obj, (md.Series, pd.Series)):
         empty_df_obj = pd.Series([], name=df_obj.name, index=empty_index).astype(
             df_obj.dtype
         )
+        ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
    elif isinstance(df_obj, (md.Index, pd.Index)):
         empty_df_obj = empty_index
+        if isinstance(empty_index, pd.MultiIndex):
+            ms_cols = [
+                idx
+                for idx, dt in enumerate(empty_index.dtypes.values)
+                if dt == np.dtype("datetime64[ms]")
+            ]
+        else:
+            ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
     else:
         empty_df_obj = df_obj
 
-    arrow_data, table_meta = pandas_to_arrow(
+    arrow_data, table_meta = pandas_to_arrow(
+        empty_df_obj, ignore_index=ignore_index, ms_cols=ms_cols
+    )
     return (
         arrow_schema_to_odps_schema(
             arrow_data.schema, unknown_as_string=unknown_as_string
@@ -289,7 +308,7 @@ def build_dataframe_table_meta(
         obj_type = OutputType.series
     elif isinstance(df_obj, (md.Index, pd.Index)):
         obj_type = OutputType.index
-    elif
+    elif is_scalar_object(df_obj):
         obj_type = OutputType.scalar
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")
@@ -344,10 +363,11 @@ def build_dataframe_table_meta(
     else:
         pd_index_val = index_obj
 
-
-
-
-
+    level_dtypes = [
+        pd_index_val.get_level_values(level).dtype
+        for level in range(pd_index_val.nlevels)
+    ]
+    index_dtypes = pd.Series(level_dtypes, index=pd_index_val.names)
 
     if ignore_index and obj_type != OutputType.index:
         table_index_column_names = []
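pandas_to_odps_schema now records which input columns already use millisecond datetimes by comparing their dtypes against numpy's datetime64[ms]; those columns end up mapped to ODPS datetime rather than timestamp (see the pa.timestamp("ms") entry in _arrow_to_odps_types above). A minimal sketch of the detection step, assuming pandas 2.0+ since only it preserves non-nanosecond datetime dtypes:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {"col1": [1, 3], "col3": ["2024-10-01 11:23:12", "2024-10-02 22:55:13"]}
).astype({"col3": "datetime64[ms]"})

# Collect the columns whose dtype is already millisecond-precision datetime.
ms_cols = [col for col, dt in df.dtypes.items() if dt == np.dtype("datetime64[ms]")]
print(ms_cols)  # ['col3']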
maxframe/io/odpsio/tableio.py CHANGED

@@ -20,15 +20,14 @@ from typing import Dict, List, Optional, Union
 
 import pyarrow as pa
 from odps import ODPS
-from odps import __version__ as pyodps_version
 from odps.apis.storage_api import (
     StorageApiArrowClient,
     TableBatchScanResponse,
     TableBatchWriteResponse,
 )
-from odps.config import option_context as pyodps_option_context
 from odps.tunnel import TableTunnel
 from odps.types import OdpsSchema, PartitionSpec, timestamp_ntz
+from odps.utils import call_with_retry
 
 try:
     import pyarrow.compute as pac
@@ -37,26 +36,18 @@ except ImportError:
 
 from ...config import options
 from ...env import ODPS_STORAGE_API_ENDPOINT
-from ...
+from ...utils import sync_pyodps_options
 from .schema import odps_schema_to_arrow_schema
 
 PartitionsType = Union[List[str], str, None]
 
 _DEFAULT_ROW_BATCH_SIZE = 4096
-_need_convert_timezone = Version(pyodps_version) < Version("0.11.7")
-
-
-@contextmanager
-def _sync_pyodps_timezone():
-    with pyodps_option_context() as cfg:
-        cfg.local_timezone = options.local_timezone
-        yield
 
 
 class ODPSTableIO(ABC):
     def __new__(cls, odps: ODPS):
         if cls is ODPSTableIO:
-            if options.use_common_table:
+            if options.use_common_table or ODPS_STORAGE_API_ENDPOINT in os.environ:
                 return HaloTableIO(odps)
             else:
                 return TunnelTableIO(odps)
@@ -138,7 +129,12 @@ class TunnelMultiPartitionReader:
         self._cur_partition_id = -1
         self._reader_start_pos = 0
 
-        if partitions is None
+        if partitions is None:
+            if not self._table.table_schema.partitions:
+                self._partitions = [None]
+            else:
+                self._partitions = [str(pt) for pt in self._table.partitions]
+        elif isinstance(partitions, str):
             self._partitions = [partitions]
         else:
             self._partitions = partitions
@@ -166,12 +162,14 @@ class TunnelMultiPartitionReader:
             self._cur_partition_id += 1
 
             part_str = self._partitions[self._cur_partition_id]
-
+            req_columns = self._schema.names
+            with sync_pyodps_options():
                 self._cur_reader = self._table.open_reader(
                     part_str,
-                    columns=
+                    columns=req_columns,
                     arrow=True,
                     download_id=self._partition_to_download_ids.get(part_str),
+                    append_partitions=True,
                 )
             if self._cur_reader.count + self._reader_start_pos > self._start:
                 start = self._start - self._reader_start_pos
@@ -180,43 +178,15 @@ class TunnelMultiPartitionReader:
                 else:
                     count = min(self._count, self._cur_reader.count - start)
 
-                with
+                with sync_pyodps_options():
                     self._reader_iter = self._cur_reader.read(start, count)
                 break
            self._reader_start_pos += self._cur_reader.count
         else:
             self._cur_reader = None
 
-    def _fill_batch_partition(self, batch: pa.RecordBatch) -> pa.RecordBatch:
-        pt_spec = PartitionSpec(self._partitions[self._cur_partition_id])
-
-        names = list(batch.schema.names)
-        arrays = []
-        for idx in range(batch.num_columns):
-            col = batch.column(idx)
-            if _need_convert_timezone and isinstance(col.type, pa.TimestampType):
-                if col.type.tz is not None:
-                    target_type = pa.timestamp(
-                        self._schema.types[idx].unit, col.type.tz
-                    )
-                    arrays.append(col.cast(target_type))
-                else:
-                    target_type = pa.timestamp(
-                        self._schema.types[idx].unit, options.local_timezone
-                    )
-                    pd_col = col.to_pandas().dt.tz_localize(options.local_timezone)
-                    arrays.append(pa.Array.from_pandas(pd_col).cast(target_type))
-            else:
-                arrays.append(batch.column(idx))
-
-        for part_col in self._partition_cols or []:
-            names.append(part_col)
-            col_type = self._schema.field_by_name(part_col).type
-            arrays.append(pa.array([pt_spec[part_col]] * batch.num_rows).cast(col_type))
-        return pa.RecordBatch.from_arrays(arrays, names)
-
     def read(self):
-        with
+        with sync_pyodps_options():
             if self._cur_reader is None:
                 self._open_next_reader()
                 if self._cur_reader is None:
@@ -227,7 +197,7 @@ class TunnelMultiPartitionReader:
                 if batch is not None:
                     if self._row_left is not None:
                         self._row_left -= batch.num_rows
-                    return
+                    return batch
             except StopIteration:
                 self._open_next_reader()
         return None
@@ -244,34 +214,6 @@ class TunnelMultiPartitionReader:
         return pa.Table.from_batches(batches)
 
 
-class TunnelWrappedWriter:
-    def __init__(self, nested_writer):
-        self._writer = nested_writer
-
-    def write(self, data: Union[pa.RecordBatch, pa.Table]):
-        if not any(isinstance(tp, pa.TimestampType) for tp in data.schema.types):
-            self._writer.write(data)
-            return
-        pa_type = type(data)
-        arrays = []
-        for idx in range(data.num_columns):
-            name = data.schema.names[idx]
-            col = data.column(idx)
-            if not isinstance(col.type, pa.TimestampType):
-                arrays.append(col)
-                continue
-            if self._writer.schema[name].type == timestamp_ntz:
-                col = HaloTableArrowWriter._localize_timezone(col, "UTC")
-            else:
-                col = HaloTableArrowWriter._localize_timezone(col)
-            arrays.append(col)
-        data = pa_type.from_arrays(arrays, names=data.schema.names)
-        self._writer.write(data)
-
-    def __getattr__(self, item):
-        return getattr(self._writer, item)
-
-
 class TunnelTableIO(ODPSTableIO):
     @contextmanager
     def open_reader(
@@ -285,7 +227,9 @@ class TunnelTableIO(ODPSTableIO):
         reverse_range: bool = False,
         row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
     ):
-
+        with sync_pyodps_options():
+            table = self._odps.get_table(full_table_name)
+
         if partition_columns is True:
             partition_columns = [c.name for c in table.table_schema.partitions]
 
@@ -296,21 +240,22 @@ class TunnelTableIO(ODPSTableIO):
            or (stop is not None and stop < 0)
            or (reverse_range and start is None)
        ):
-
-
-
-
-
-
-
-            part_to_down_id = dict()
-            total_records = 0
-            for part in parts:
-                down_session = tunnel.create_download_session(
-                    table, async_mode=True, partition_spec=part
+            with sync_pyodps_options():
+                table = self._odps.get_table(full_table_name)
+                tunnel = TableTunnel(self._odps)
+                parts = (
+                    [partitions]
+                    if partitions is None or isinstance(partitions, str)
+                    else partitions
                 )
-            part_to_down_id
-            total_records
+                part_to_down_id = dict()
+                total_records = 0
+                for part in parts:
+                    down_session = tunnel.create_download_session(
+                        table, async_mode=True, partition_spec=part
+                    )
+                    part_to_down_id[part] = down_session.id
+                    total_records += down_session.count
 
         count = None
         if start is not None or stop is not None:
@@ -347,20 +292,14 @@ class TunnelTableIO(ODPSTableIO):
         overwrite: bool = True,
     ):
         table = self._odps.get_table(full_table_name)
-        with
+        with sync_pyodps_options():
             with table.open_writer(
                 partition=partition,
                 arrow=True,
                 create_partition=partition is not None,
                 overwrite=overwrite,
             ) as writer:
-
-                # related arrow timestamp bug when provided schema and
-                # table schema is identical.
-                if _need_convert_timezone:
-                    yield TunnelWrappedWriter(writer)
-                else:
-                    yield writer
+                yield writer
 
 
 class HaloTableArrowReader:
@@ -416,7 +355,7 @@ class HaloTableArrowReader:
             split_index=self._cur_split_id + 1,
             **read_rows_kw,
         )
-        self._cur_reader = self._client.read_rows_arrow
+        self._cur_reader = call_with_retry(self._client.read_rows_arrow, req)
         self._cur_split_id += 1
 
     def _convert_timezone(self, batch: pa.RecordBatch) -> pa.RecordBatch:
@@ -488,8 +427,9 @@ class HaloTableArrowWriter:
     def open(self):
         from odps.apis.storage_api import WriteRowsRequest
 
-        self._writer =
-
+        self._writer = call_with_retry(
+            self._client.write_rows_arrow,
+            WriteRowsRequest(self._write_info.session_id),
        )
 
    @classmethod
@@ -560,28 +500,6 @@ class HaloTableIO(ODPSTableIO):
             for pt in partitions
         ]
 
-    def get_table_record_count(
-        self, full_table_name: str, partitions: PartitionsType = None
-    ):
-        from odps.apis.storage_api import SplitOptions, TableBatchScanRequest
-
-        table = self._odps.get_table(full_table_name)
-        client = StorageApiArrowClient(
-            self._odps, table, rest_endpoint=self._storage_api_endpoint
-        )
-
-        split_option = SplitOptions.SplitMode.SIZE
-
-        scan_kw = {
-            "required_partitions": self._convert_partitions(partitions),
-            "split_options": SplitOptions.get_default_options(split_option),
-        }
-
-        # todo add more options for partition column handling
-        req = TableBatchScanRequest(**scan_kw)
-        resp = client.create_read_session(req)
-        return resp.record_count
-
     @contextmanager
     def open_reader(
         self,
@@ -596,8 +514,8 @@ class HaloTableIO(ODPSTableIO):
     ):
         from odps.apis.storage_api import (
             SessionRequest,
+            SessionStatus,
             SplitOptions,
-            Status,
             TableBatchScanRequest,
         )
 
@@ -625,16 +543,16 @@ class HaloTableIO(ODPSTableIO):
 
         # todo add more options for partition column handling
         req = TableBatchScanRequest(**scan_kw)
-        resp = client.create_read_session
+        resp = call_with_retry(client.create_read_session, req)
 
         session_id = resp.session_id
-        status = resp.
-        while status ==
-            resp = client.get_read_session
-            status = resp.
+        status = resp.session_status
+        while status == SessionStatus.INIT:
+            resp = call_with_retry(client.get_read_session, SessionRequest(session_id))
+            status = resp.session_status
             time.sleep(1.0)
 
-        assert status ==
+        assert status == SessionStatus.NORMAL
 
         count = None
         if start is not None or stop is not None:
@@ -685,7 +603,7 @@ class HaloTableIO(ODPSTableIO):
         part_strs = self._convert_partitions(partition)
         part_str = part_strs[0] if part_strs else None
         req = TableBatchWriteRequest(partition_spec=part_str, overwrite=overwrite)
-        resp = client.create_write_session
+        resp = call_with_retry(client.create_write_session, req)
 
         session_id = resp.session_id
         writer = HaloTableArrowWriter(client, resp, table.table_schema)
@@ -694,9 +612,13 @@ class HaloTableIO(ODPSTableIO):
         yield writer
 
         commit_msg = writer.close()
-        resp =
-
+        resp = call_with_retry(
+            client.commit_write_session,
+            SessionRequest(session_id=session_id),
+            [commit_msg],
        )
        while resp.session_status == SessionStatus.COMMITTING:
-            resp =
+            resp = call_with_retry(
+                client.get_write_session, SessionRequest(session_id=session_id)
+            )
        assert resp.session_status == SessionStatus.COMMITTED
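Storage API and tunnel calls in tableio.py are now routed through pyodps' call_with_retry helper instead of being invoked directly. A rough stand-in for that calling pattern, for illustration only (this sketch is not pyodps' implementation; the retry count and delay are arbitrary):

import time

def retry_call(func, *args, retries=3, delay=1.0, **kwargs):
    # Call func(*args, **kwargs), retrying a few times before giving up.
    for attempt in range(retries):
        try:
            return func(*args, **kwargs)
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

# Usage mirrors the calls above, e.g.:
#   resp = retry_call(client.create_read_session, req)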
maxframe/io/odpsio/tests/test_schema.py CHANGED

@@ -21,6 +21,7 @@ from odps import types as odps_types
 from .... import dataframe as md
 from .... import tensor as mt
 from ....core import OutputType
+from ....utils import pd_release_version
 from ..schema import (
     arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
@@ -270,10 +271,6 @@ def test_odps_arrow_schema_conversion():
 
     with pytest.raises(TypeError):
         arrow_schema_to_odps_schema(pa.schema([("col1", pa.float16())]))
-    with pytest.raises(TypeError):
-        odps_schema_to_arrow_schema(
-            odps_types.OdpsSchema([odps_types.Column("col1", "json")])
-        )
 
 
 def test_build_column_name():
@@ -296,3 +293,42 @@ def test_build_table_meta(wrap_obj):
     table_meta = build_dataframe_table_meta(test_df)
     expected_cols = ["a_2", "a_3", "a_0", "a_1_0", "a_1_1", "b", "c"]
     assert table_meta.table_column_names == expected_cols
+
+
+@pytest.mark.skipif(
+    pd_release_version[0] < 2, reason="only run under pandas 2.0 or greater"
+)
+def test_table_meta_with_datetime():
+    raw_df = pd.DataFrame(
+        [
+            [1, "abc", "2024-10-01 11:23:12"],
+            [3, "uvw", "2024-10-02 22:55:13"],
+        ],
+        columns=["col1", "col2", "col3"],
+    )
+    df = md.DataFrame(raw_df).astype({"col3": "datetime64[ms]"})
+    schema, _ = pandas_to_odps_schema(df, unknown_as_string=True)
+    assert schema.columns[3].type == odps_types.datetime
+
+    raw_series = pd.Series(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    s = md.Series(raw_series)
+    schema, _ = pandas_to_odps_schema(s, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
+
+    raw_index = pd.Index(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    idx = md.Index(raw_index)
+    schema, _ = pandas_to_odps_schema(idx, unknown_as_string=True)
+    assert schema.columns[0].type == odps_types.datetime
+
+    src_df = pd.DataFrame(
+        [[1, "2024-10-01 11:23:12"], [3, "2024-10-02 22:55:13"]],
+        columns=["A", "B"],
+    ).astype({"B": "datetime64[ms]"})
+    raw_multiindex = pd.MultiIndex.from_frame(src_df)
+    multiidx = md.Index(raw_multiindex)
+    schema, _ = pandas_to_odps_schema(multiidx, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime