maxframe-1.0.0rc2-cp38-cp38-win32.whl → maxframe-1.0.0rc4-cp38-cp38-win32.whl
This diff shows the content of publicly available package versions as published to their respective public registries, and is provided for informational purposes only.
- maxframe/_utils.cp38-win32.pyd +0 -0
- maxframe/codegen.py +4 -2
- maxframe/config/config.py +28 -9
- maxframe/config/validators.py +42 -12
- maxframe/conftest.py +56 -14
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +45 -2
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp38-win32.pyd +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/core.py +2 -0
- maxframe/dataframe/datasource/read_odps_query.py +67 -8
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
- maxframe/dataframe/datastore/to_odps.py +8 -1
- maxframe/dataframe/extensions/__init__.py +3 -0
- maxframe/dataframe/extensions/flatmap.py +326 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/misc/drop_duplicates.py +18 -1
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
- maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
- maxframe/{odpsio → io/odpsio}/schema.py +10 -8
- maxframe/{odpsio → io/odpsio}/tableio.py +50 -38
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -7
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +2 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
- maxframe/learn/contrib/xgboost/predict.py +27 -44
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +27 -16
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cp38-win32.pyd +0 -0
- maxframe/opcodes.py +3 -0
- maxframe/protocol.py +7 -16
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp38-win32.pyd +0 -0
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +10 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/utils.py +2 -22
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +49 -73
- maxframe-1.0.0rc4.dist-info/METADATA +104 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +129 -114
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +33 -50
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +134 -27
- maxframe_client/session/task.py +58 -20
- maxframe_client/tests/test_fetcher.py +1 -1
- maxframe_client/tests/test_session.py +27 -3
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/volumeio.py +0 -95
- maxframe-1.0.0rc2.dist-info/METADATA +0 -177
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/unique.py +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0
maxframe/dataframe/tests/test_initializer.py

@@ -13,12 +13,13 @@
 # limitations under the License.
 
 import pandas as pd
+import pytest
 
 from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
-from ..initializer import read_pandas
+from ..initializer import DataFrame, Series, read_pandas
 
 
-def test_from_pandas():
+def test_read_pandas():
     df_data = pd.DataFrame([["a", 1], ["b", 2]], columns=["a", "b"])
     assert isinstance(read_pandas(df_data), DATAFRAME_TYPE)
 
@@ -27,3 +28,33 @@ def test_from_pandas():
 
     idx_data = pd.Index(["a", "b"])
     assert isinstance(read_pandas(idx_data), INDEX_TYPE)
+
+
+def test_init_dataframe_from_maxframe_series():
+    s = Series([1, 2, 3, 4], index=[1, 2, 3, 4])
+
+    df = DataFrame(s, index=s.index, columns=["col1"])
+
+    assert isinstance(df, DATAFRAME_TYPE)
+    assert df.dtypes.index == ["col1"]
+
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns=[])
+
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns="col1")
+
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns="col2")
+
+
+def test_init_dataframe_from_maxframe_dataframe():
+    df1 = DataFrame({"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, index=[1, 2, 3, 4])
+
+    df2 = DataFrame(df1, index=df1.index, columns=["col1", "col2"])
+
+    assert isinstance(df2, DATAFRAME_TYPE)
+    assert list(df2.dtypes.index) == ["col1", "col2"]
+
+    with pytest.raises(ValueError):
+        DataFrame(df1, index=df1.index, columns=["col1", "col2", "col3"])
maxframe/io/objects/__init__.py

@@ -0,0 +1,24 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import (
+    AbstractObjectIOHandler,
+    get_object_io_handler,
+    register_object_io_handler,
+)
+
+# isort: off
+from . import tensor
+
+del tensor
maxframe/io/objects/core.py

@@ -0,0 +1,140 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABCMeta, abstractmethod
+from typing import Any, Dict, Type, Union
+
+import msgpack
+
+from ...core import Entity, EntityData
+from ...core.entity import ObjectData, TileableData
+from ...lib import wrapped_pickle as pickle
+from ...typing_ import SlicesType, TileableType
+from ...utils import TypeDispatcher
+from ..odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter
+
+_MetaType = Dict[str, Any]
+
+_META_FILE_NAME = ".meta"
+_META_PICKLED_KEYS_KEY = ".pickled_keys"
+
+
+_io_handler_dispatcher = TypeDispatcher()
+
+
+def register_object_io_handler(tileable_data_type: Type[TileableData]):
+    def wrapper(handler_cls):
+        _io_handler_dispatcher.register(tileable_data_type, handler_cls)
+        return handler_cls
+
+    return wrapper
+
+
+def get_object_io_handler(
+    tileable_data_type: Union[Entity, EntityData, Type[EntityData]]
+) -> Type["AbstractObjectIOHandler"]:
+    if not isinstance(tileable_data_type, type):
+        if isinstance(tileable_data_type, Entity):
+            tileable_data_type = tileable_data_type.data
+        tileable_data_type = type(tileable_data_type)
+    return _io_handler_dispatcher.get_handler(tileable_data_type)
+
+
+class AbstractObjectIOHandler(metaclass=ABCMeta):
+    def _prepare_meta_for_serial(
+        self, tileable: TileableType, meta: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        to_pack = meta.copy()
+        pickled_keys = []
+        for k, v in meta.items():
+            if not isinstance(v, (str, bytes, int, float, bool)):
+                to_pack[k] = pickle.dumps(v)
+                pickled_keys.append(k)
+        to_pack[".pickled_keys"] = pickled_keys
+        return to_pack
+
+    def _prepare_meta_for_deserial(
+        self, tileable: TileableType, meta: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        pickled_keys = meta.pop(".pickled_keys", None) or []
+        for k in pickled_keys:
+            meta[k] = pickle.loads(meta[k])
+        return meta
+
+    def read_object_meta(
+        self, reader: ODPSVolumeReader, tileable: TileableType
+    ) -> Dict[str, Any]:
+        meta_obj = msgpack.loads(reader.read_file(_META_FILE_NAME))
+        return self._prepare_meta_for_deserial(tileable, meta_obj)
+
+    @abstractmethod
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        raise NotImplementedError
+
+    def read_object(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        slices: SlicesType = None,
+    ) -> Any:
+        meta = self.read_object_meta(reader, tileable)
+        return self._read_object_body(reader, tileable, meta, slices)
+
+    @abstractmethod
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        raise NotImplementedError
+
+    def write_object_meta(
+        self,
+        writer: ODPSVolumeWriter,
+        tileable: TileableType,
+        extra_meta: Dict[str, Any] = None,
+    ):
+        meta_obj = tileable.params.copy()
+        if extra_meta:
+            meta_obj.update(extra_meta)
+        meta_obj = self._prepare_meta_for_serial(tileable, meta_obj)
+        packed = msgpack.dumps(meta_obj)
+        writer.write_file(_META_FILE_NAME, packed)
+
+    def write_object(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        self.write_object_meta(writer, tileable)
+        self._write_object_body(writer, tileable, value)
+
+
+@register_object_io_handler(ObjectData)
+class ObjectIOHandler(AbstractObjectIOHandler):
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        return pickle.loads(reader.read_file("data"))
+
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        writer.write_file("data", pickle.dumps(value))
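The ".pickled_keys" scheme above keeps volume metadata msgpack-friendly: any value msgpack cannot encode natively is pickled, and its key is recorded so the reader knows which entries to unpickle. A minimal standalone sketch of that round trip, using stdlib pickle in place of maxframe's wrapped_pickle (an illustrative stand-in, not the library's own code):

import pickle  # stand-in for maxframe.lib.wrapped_pickle

import msgpack


def pack_meta(meta: dict) -> bytes:
    # Mirrors _prepare_meta_for_serial: pickle non-primitive values
    # and remember their keys under ".pickled_keys".
    packed = meta.copy()
    pickled_keys = []
    for k, v in meta.items():
        if not isinstance(v, (str, bytes, int, float, bool)):
            packed[k] = pickle.dumps(v)
            pickled_keys.append(k)
    packed[".pickled_keys"] = pickled_keys
    return msgpack.dumps(packed)


def unpack_meta(raw: bytes) -> dict:
    # Mirrors _prepare_meta_for_deserial: unpickle the recorded keys.
    meta = msgpack.loads(raw)
    for k in meta.pop(".pickled_keys", None) or []:
        meta[k] = pickle.loads(meta[k])
    return meta


meta = {"shape": (3, 3), "order": "C", "nsplits": ((3,),)}
assert unpack_meta(pack_meta(meta)) == meta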
maxframe/io/objects/tensor.py

@@ -0,0 +1,76 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import struct
+from io import BytesIO
+from typing import Any, Dict
+
+import msgpack
+import numpy as np
+
+from ...lib import wrapped_pickle as pickle
+from ...tensor.core import TensorData
+from ...typing_ import SlicesType, TileableType
+from ..odpsio import ODPSVolumeReader, ODPSVolumeWriter
+from .core import AbstractObjectIOHandler, register_object_io_handler
+
+
+@register_object_io_handler(TensorData)
+class TensorIOHandler(AbstractObjectIOHandler):
+    def write_object_meta(
+        self,
+        writer: ODPSVolumeWriter,
+        tileable: TileableType,
+        extra_meta: Dict[str, Any] = None,
+    ):
+        # fixme upload in real slices when tensors are supported in DPE
+        extra_meta = extra_meta or dict()
+        extra_meta["nsplits"] = ((np.nan,),)
+
+        super().write_object_meta(writer, tileable, extra_meta=extra_meta)
+
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        # fixme read data with slices when tensors are supported in DPE
+        body = reader.read_file("0,0.dat")
+        bio = BytesIO(body)
+        (header_len,) = struct.unpack("<I", bio.read(4))
+        header_data = msgpack.loads(bio.read(header_len))
+
+        pickled = bio.read(header_data[0])
+        bufs = [bio.read(size) for size in header_data[1:]]
+        return pickle.loads(pickled, buffers=bufs)
+
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        # fixme upload in real slices when tensors are supported in DPE
+        def data_gen():
+            bufs = []
+            pickled = pickle.dumps(value, buffer_callback=bufs.append)
+            header_data = msgpack.dumps(
+                [len(pickled)] + [len(buf.raw()) for buf in bufs]
+            )
+            yield struct.pack("<I", len(header_data))
+            yield header_data
+            yield pickled
+            for buf in bufs:
+                yield buf
+
+        writer.write_file("0,0.dat", data_gen())
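TensorIOHandler stores the tensor body in a single "0,0.dat" blob: a 4-byte little-endian header length, a msgpack list of segment sizes ([len(pickled), len(buf0), ...]), the pickle-5 payload, then the raw out-of-band buffers. A standalone round-trip sketch of that framing, again with stdlib pickle standing in for wrapped_pickle:

import pickle
import struct
from io import BytesIO

import msgpack
import numpy as np


def frame_tensor(value) -> bytes:
    # Same layout _write_object_body streams out: size header first,
    # then the pickled object, then any out-of-band pickle-5 buffers.
    bufs = []
    pickled = pickle.dumps(value, protocol=5, buffer_callback=bufs.append)
    raw = [bytes(b.raw()) for b in bufs]
    header = msgpack.dumps([len(pickled)] + [len(b) for b in raw])
    return b"".join([struct.pack("<I", len(header)), header, pickled] + raw)


def unframe_tensor(body: bytes):
    # Same steps as _read_object_body above.
    bio = BytesIO(body)
    (header_len,) = struct.unpack("<I", bio.read(4))
    sizes = msgpack.loads(bio.read(header_len))
    pickled = bio.read(sizes[0])
    bufs = [bio.read(size) for size in sizes[1:]]
    return pickle.loads(pickled, buffers=bufs)


data = np.arange(9).reshape(3, 3)
np.testing.assert_equal(data, unframe_tensor(frame_tensor(data)))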
maxframe/io/objects/tests/__init__.py

@@ -0,0 +1,13 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
maxframe/io/objects/tests/test_object_io.py

@@ -0,0 +1,97 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pytest
+from odps import ODPS
+
+from ....core import OutputType
+from ....core.operator import ObjectOperatorMixin, Operator
+from ....tensor.datasource import ArrayDataSource
+from ....tests.utils import tn
+from ...odpsio import ODPSVolumeReader, ODPSVolumeWriter
+from ..core import get_object_io_handler
+
+
+class TestObjectOp(Operator, ObjectOperatorMixin):
+    def __call__(self):
+        self._output_types = [OutputType.object]
+        return self.new_tileable([])
+
+
+@pytest.fixture(scope="module")
+def create_volume(request, oss_config):
+    test_vol_name = tn("test_object_io_volume")
+    odps_entry = ODPS.from_environments()
+
+    try:
+        odps_entry.delete_volume(test_vol_name, auto_remove_dir=True, recursive=True)
+    except:
+        pass
+
+    oss_test_dir_name = tn("test_oss_directory")
+    if oss_config is None:
+        pytest.skip("Need oss and its config to run this test")
+    (
+        oss_access_id,
+        oss_secret_access_key,
+        oss_bucket_name,
+        oss_endpoint,
+    ) = oss_config.oss_config
+    test_location = "oss://%s:%s@%s/%s/%s" % (
+        oss_access_id,
+        oss_secret_access_key,
+        oss_endpoint,
+        oss_bucket_name,
+        oss_test_dir_name,
+    )
+    oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
+    odps_entry.create_external_volume(test_vol_name, location=test_location)
+
+    try:
+        yield test_vol_name
+    finally:
+        try:
+            odps_entry.delete_volume(
+                test_vol_name, auto_remove_dir=True, recursive=True
+            )
+        except:
+            pass
+
+
+def test_simple_object_io(create_volume):
+    obj = TestObjectOp()()
+    data = "abcdefg"
+
+    odps_entry = ODPS.from_environments()
+
+    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
+    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+
+    handler = get_object_io_handler(obj)()
+    handler.write_object(writer, obj, data)
+    assert data == handler.read_object(reader, obj)
+
+
+def test_tensor_object_io(create_volume):
+    data = np.array([[4, 9, 2], [3, 5, 7], [8, 1, 6]])
+    obj = ArrayDataSource(data, dtype=data.dtype)(data.shape)
+
+    odps_entry = ODPS.from_environments()
+
+    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
+    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+
+    handler = get_object_io_handler(obj)()
+    handler.write_object(writer, obj, data)
+    np.testing.assert_equal(data, handler.read_object(reader, obj))
maxframe/{odpsio → io/odpsio}/__init__.py

@@ -14,8 +14,10 @@
 
 from .arrow import arrow_to_pandas, pandas_to_arrow
 from .schema import (
+    arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
     odps_schema_to_pandas_dtypes,
     pandas_to_odps_schema,
 )
 from .tableio import HaloTableIO, ODPSTableIO
+from .volumeio import ODPSVolumeReader, ODPSVolumeWriter
maxframe/{odpsio → io/odpsio}/arrow.py

@@ -17,10 +17,10 @@ from typing import Any, Tuple, Union
 import pandas as pd
 import pyarrow as pa
 
-from ..core import OutputType
-from ..protocol import DataFrameTableMeta
-from ..tensor.core import TENSOR_TYPE
-from ..typing_ import ArrowTableType, PandasObjectTypes
+from ...core import OutputType
+from ...protocol import DataFrameTableMeta
+from ...tensor.core import TENSOR_TYPE
+from ...typing_ import ArrowTableType, PandasObjectTypes
 from .schema import build_dataframe_table_meta
 
 
maxframe/{odpsio → io/odpsio}/schema.py

@@ -21,9 +21,9 @@ import pyarrow as pa
 from odps import types as odps_types
 from pandas.api import types as pd_types
 
-from ..core import TILEABLE_TYPE, OutputType
-from ..protocol import DataFrameTableMeta
-from ..tensor.core import TENSOR_TYPE
+from ...core import TILEABLE_TYPE, OutputType
+from ...protocol import DataFrameTableMeta
+from ...tensor.core import TENSOR_TYPE
 
 _TEMP_TABLE_PREFIX = "tmp_mf_"
 
@@ -54,7 +54,9 @@ _odps_type_to_arrow = {
     odps_types.double: pa.float64(),
     odps_types.date: pa.date32(),
     odps_types.datetime: pa.timestamp("ms"),
+    odps_types.json: pa.string(),
     odps_types.timestamp: pa.timestamp("ns"),
+    odps_types.timestamp_ntz: pa.timestamp("ns"),
 }
 
 
@@ -166,7 +168,7 @@ def odps_schema_to_pandas_dtypes(
     return arrow_schema.empty_table().to_pandas().dtypes
 
 
-def
+def is_scalar_object(df_obj: Any) -> bool:
     return (
         isinstance(df_obj, TENSOR_TYPE) and df_obj.shape == ()
     ) or pd_types.is_scalar(df_obj)
@@ -184,10 +186,10 @@ def pandas_to_odps_schema(
     unknown_as_string: bool = False,
     ignore_index=False,
 ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
-    from .. import dataframe as md
+    from ... import dataframe as md
     from .arrow import pandas_to_arrow
 
-    if
+    if is_scalar_object(df_obj):
         empty_index = None
     elif hasattr(df_obj, "index_value"):
         empty_index = df_obj.index_value.to_pandas()[:0]
@@ -278,7 +280,7 @@ def build_table_column_name(
 def build_dataframe_table_meta(
     df_obj: Any, ignore_index: bool = False
 ) -> DataFrameTableMeta:
-    from .. import dataframe as md
+    from ... import dataframe as md
 
     col_to_count = defaultdict(lambda: 0)
     col_to_idx = defaultdict(lambda: 0)
@@ -289,7 +291,7 @@ def build_dataframe_table_meta(
         obj_type = OutputType.series
     elif isinstance(df_obj, (md.Index, pd.Index)):
         obj_type = OutputType.index
-    elif
+    elif is_scalar_object(df_obj):
         obj_type = OutputType.scalar
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")
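The _odps_type_to_arrow map above gains two entries in rc4: ODPS json columns are read as Arrow strings, and timestamp_ntz as nanosecond timestamps. A hedged sketch of how such a map translates an ODPS column list into an Arrow schema; the names below are illustrative, not maxframe's API:

import pyarrow as pa

# Only the entries visible in this diff; the real map covers more types.
ODPS_TO_ARROW = {
    "double": pa.float64(),
    "date": pa.date32(),
    "datetime": pa.timestamp("ms"),
    "json": pa.string(),                  # added in rc4
    "timestamp": pa.timestamp("ns"),
    "timestamp_ntz": pa.timestamp("ns"),  # added in rc4
}


def to_arrow_schema(columns) -> pa.Schema:
    # columns: iterable of (name, odps_type_name) pairs
    return pa.schema([(name, ODPS_TO_ARROW[t]) for name, t in columns])


print(to_arrow_schema([("ts", "timestamp_ntz"), ("payload", "json")]))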
maxframe/{odpsio → io/odpsio}/tableio.py

@@ -18,14 +18,15 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Dict, List, Optional, Union
 
+import numpy as np
 import pyarrow as pa
 from odps import ODPS
+from odps import __version__ as pyodps_version
 from odps.apis.storage_api import (
     StorageApiArrowClient,
     TableBatchScanResponse,
     TableBatchWriteResponse,
 )
-from odps.config import option_context as pyodps_option_context
 from odps.tunnel import TableTunnel
 from odps.types import OdpsSchema, PartitionSpec, timestamp_ntz
 
@@ -34,20 +35,16 @@ try:
 except ImportError:
     pac = None
 
-from ..config import options
-from ..env import ODPS_STORAGE_API_ENDPOINT
+from ...config import options
+from ...env import ODPS_STORAGE_API_ENDPOINT
+from ...lib.version import Version
+from ...utils import sync_pyodps_options
 from .schema import odps_schema_to_arrow_schema
 
 PartitionsType = Union[List[str], str, None]
 
 _DEFAULT_ROW_BATCH_SIZE = 4096
-
-
-@contextmanager
-def _sync_pyodps_timezone():
-    with pyodps_option_context() as cfg:
-        cfg.local_timezone = options.local_timezone
-        yield
+_need_patch_batch = Version(pyodps_version) < Version("0.12.0")
 
 
 class ODPSTableIO(ABC):
@@ -163,10 +160,15 @@ class TunnelMultiPartitionReader:
             self._cur_partition_id += 1
 
             part_str = self._partitions[self._cur_partition_id]
-            with _sync_pyodps_timezone():
+
+            # todo make this more formal when PyODPS 0.12.0 is released
+            req_columns = self._columns
+            if not _need_patch_batch:
+                req_columns = self._schema.names
+            with sync_pyodps_options():
                 self._cur_reader = self._table.open_reader(
                     part_str,
-                    columns=self._columns,
+                    columns=req_columns,
                     arrow=True,
                     download_id=self._partition_to_download_ids.get(part_str),
                 )
@@ -177,7 +179,7 @@ class TunnelMultiPartitionReader:
         else:
             count = min(self._count, self._cur_reader.count - start)
 
-        with _sync_pyodps_timezone():
+        with sync_pyodps_options():
             self._reader_iter = self._cur_reader.read(start, count)
             break
         self._reader_start_pos += self._cur_reader.count
@@ -209,11 +211,12 @@ class TunnelMultiPartitionReader:
         for part_col in self._partition_cols or []:
             names.append(part_col)
             col_type = self._schema.field_by_name(part_col).type
-
+            pt_col = np.repeat([pt_spec[part_col]], batch.num_rows)
+            arrays.append(pa.array(pt_col).cast(col_type))
         return pa.RecordBatch.from_arrays(arrays, names)
 
     def read(self):
-        with _sync_pyodps_timezone():
+        with sync_pyodps_options():
             if self._cur_reader is None:
                 self._open_next_reader()
                 if self._cur_reader is None:
@@ -224,7 +227,10 @@ class TunnelMultiPartitionReader:
                 if batch is not None:
                     if self._row_left is not None:
                         self._row_left -= batch.num_rows
-                    return self._fill_batch_partition(batch)
+                    if _need_patch_batch:
+                        return self._fill_batch_partition(batch)
+                    else:
+                        return batch
             except StopIteration:
                 self._open_next_reader()
                 return None
@@ -282,7 +288,9 @@ class TunnelTableIO(ODPSTableIO):
         reverse_range: bool = False,
         row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
     ):
-        table = self._odps.get_table(full_table_name)
+        with sync_pyodps_options():
+            table = self._odps.get_table(full_table_name)
+
         if partition_columns is True:
             partition_columns = [c.name for c in table.table_schema.partitions]
 
@@ -293,21 +301,22 @@ class TunnelTableIO(ODPSTableIO):
             or (stop is not None and stop < 0)
             or (reverse_range and start is None)
         ):
-
-
-
-
-
-
-
-            part_to_down_id = dict()
-            total_records = 0
-            for part in parts:
-                down_session = tunnel.create_download_session(
-                    table, async_mode=True, partition_spec=part
+            with sync_pyodps_options():
+                table = self._odps.get_table(full_table_name)
+                tunnel = TableTunnel(self._odps)
+                parts = (
+                    [partitions]
+                    if partitions is None or isinstance(partitions, str)
+                    else partitions
                 )
-                part_to_down_id[part] = down_session.id
-                total_records += down_session.count
+                part_to_down_id = dict()
+                total_records = 0
+                for part in parts:
+                    down_session = tunnel.create_download_session(
+                        table, async_mode=True, partition_spec=part
+                    )
+                    part_to_down_id[part] = down_session.id
+                    total_records += down_session.count
 
         count = None
         if start is not None or stop is not None:
@@ -344,7 +353,7 @@ class TunnelTableIO(ODPSTableIO):
         overwrite: bool = True,
     ):
         table = self._odps.get_table(full_table_name)
-        with _sync_pyodps_timezone():
+        with sync_pyodps_options():
             with table.open_writer(
                 partition=partition,
                 arrow=True,
@@ -354,7 +363,10 @@ class TunnelTableIO(ODPSTableIO):
                 # fixme should yield writer directly once pyodps fixes
                 # related arrow timestamp bug when provided schema and
                 # table schema is identical.
-                yield TunnelWrappedWriter(writer)
+                if _need_patch_batch:
+                    yield TunnelWrappedWriter(writer)
+                else:
+                    yield writer
 
 
 class HaloTableArrowReader:
@@ -590,8 +602,8 @@ class HaloTableIO(ODPSTableIO):
     ):
         from odps.apis.storage_api import (
             SessionRequest,
+            SessionStatus,
             SplitOptions,
-            Status,
             TableBatchScanRequest,
         )
 
@@ -622,13 +634,13 @@ class HaloTableIO(ODPSTableIO):
         resp = client.create_read_session(req)
 
         session_id = resp.session_id
-        status = resp.
-        while status ==
+        status = resp.session_status
+        while status == SessionStatus.INIT:
             resp = client.get_read_session(SessionRequest(session_id))
-            status = resp.
+            status = resp.session_status
             time.sleep(1.0)
 
-        assert status ==
+        assert status == SessionStatus.NORMAL
 
         count = None
         if start is not None or stop is not None: