maxframe-1.0.0rc2-cp38-cp38-win32.whl → maxframe-1.0.0rc4-cp38-cp38-win32.whl
This diff shows the content of publicly available package versions as published to their respective public registries, and is provided for informational purposes only.
- maxframe/_utils.cp38-win32.pyd +0 -0
- maxframe/codegen.py +4 -2
- maxframe/config/config.py +28 -9
- maxframe/config/validators.py +42 -12
- maxframe/conftest.py +56 -14
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +45 -2
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp38-win32.pyd +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/arithmetic/docstring.py +26 -2
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/core.py +2 -0
- maxframe/dataframe/datasource/read_odps_query.py +67 -8
- maxframe/dataframe/datasource/read_odps_table.py +4 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
- maxframe/dataframe/datastore/to_odps.py +8 -1
- maxframe/dataframe/extensions/__init__.py +3 -0
- maxframe/dataframe/extensions/flatmap.py +326 -0
- maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +11 -0
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/misc/drop_duplicates.py +18 -1
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
- maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
- maxframe/{odpsio → io/odpsio}/schema.py +10 -8
- maxframe/{odpsio → io/odpsio}/tableio.py +50 -38
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -7
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +2 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
- maxframe/learn/contrib/xgboost/predict.py +27 -44
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +27 -16
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cp38-win32.pyd +0 -0
- maxframe/opcodes.py +3 -0
- maxframe/protocol.py +7 -16
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp38-win32.pyd +0 -0
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +10 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/utils.py +2 -22
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +49 -73
- maxframe-1.0.0rc4.dist-info/METADATA +104 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +129 -114
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
- maxframe_client/fetcher.py +33 -50
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +134 -27
- maxframe_client/session/task.py +58 -20
- maxframe_client/tests/test_fetcher.py +1 -1
- maxframe_client/tests/test_session.py +27 -3
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/volumeio.py +0 -95
- maxframe-1.0.0rc2.dist-info/METADATA +0 -177
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/unique.py +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0
maxframe/dataframe/tests/test_initializer.py

@@ -13,12 +13,13 @@
 # limitations under the License.
 
 import pandas as pd
+import pytest
 
 from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
-from ..initializer import read_pandas
+from ..initializer import DataFrame, Series, read_pandas
 
 
-def test_from_pandas():
+def test_read_pandas():
     df_data = pd.DataFrame([["a", 1], ["b", 2]], columns=["a", "b"])
     assert isinstance(read_pandas(df_data), DATAFRAME_TYPE)
 
@@ -27,3 +28,33 @@ def test_from_pandas():
 
     idx_data = pd.Index(["a", "b"])
     assert isinstance(read_pandas(idx_data), INDEX_TYPE)
+
+
+def test_init_dataframe_from_maxframe_series():
+    s = Series([1, 2, 3, 4], index=[1, 2, 3, 4])
+
+    df = DataFrame(s, index=s.index, columns=["col1"])
+
+    assert isinstance(df, DATAFRAME_TYPE)
+    assert df.dtypes.index == ["col1"]
+
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns=[])
+
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns="col1")
+
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns="col2")
+
+
+def test_init_dataframe_from_maxframe_dataframe():
+    df1 = DataFrame({"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, index=[1, 2, 3, 4])
+
+    df2 = DataFrame(df1, index=df1.index, columns=["col1", "col2"])
+
+    assert isinstance(df2, DATAFRAME_TYPE)
+    assert list(df2.dtypes.index) == ["col1", "col2"]
+
+    with pytest.raises(ValueError):
+        DataFrame(df1, index=df1.index, columns=["col1", "col2", "col3"])
maxframe/io/objects/__init__.py

@@ -0,0 +1,24 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import (
+    AbstractObjectIOHandler,
+    get_object_io_handler,
+    register_object_io_handler,
+)
+
+# isort: off
+from . import tensor
+
+del tensor
maxframe/io/objects/core.py

@@ -0,0 +1,140 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABCMeta, abstractmethod
+from typing import Any, Dict, Type, Union
+
+import msgpack
+
+from ...core import Entity, EntityData
+from ...core.entity import ObjectData, TileableData
+from ...lib import wrapped_pickle as pickle
+from ...typing_ import SlicesType, TileableType
+from ...utils import TypeDispatcher
+from ..odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter
+
+_MetaType = Dict[str, Any]
+
+_META_FILE_NAME = ".meta"
+_META_PICKLED_KEYS_KEY = ".pickled_keys"
+
+
+_io_handler_dispatcher = TypeDispatcher()
+
+
+def register_object_io_handler(tileable_data_type: Type[TileableData]):
+    def wrapper(handler_cls):
+        _io_handler_dispatcher.register(tileable_data_type, handler_cls)
+        return handler_cls
+
+    return wrapper
+
+
+def get_object_io_handler(
+    tileable_data_type: Union[Entity, EntityData, Type[EntityData]]
+) -> Type["AbstractObjectIOHandler"]:
+    if not isinstance(tileable_data_type, type):
+        if isinstance(tileable_data_type, Entity):
+            tileable_data_type = tileable_data_type.data
+        tileable_data_type = type(tileable_data_type)
+    return _io_handler_dispatcher.get_handler(tileable_data_type)
+
+
+class AbstractObjectIOHandler(metaclass=ABCMeta):
+    def _prepare_meta_for_serial(
+        self, tileable: TileableType, meta: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        to_pack = meta.copy()
+        pickled_keys = []
+        for k, v in meta.items():
+            if not isinstance(v, (str, bytes, int, float, bool)):
+                to_pack[k] = pickle.dumps(v)
+                pickled_keys.append(k)
+        to_pack[".pickled_keys"] = pickled_keys
+        return to_pack
+
+    def _prepare_meta_for_deserial(
+        self, tileable: TileableType, meta: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        pickled_keys = meta.pop(".pickled_keys", None) or []
+        for k in pickled_keys:
+            meta[k] = pickle.loads(meta[k])
+        return meta
+
+    def read_object_meta(
+        self, reader: ODPSVolumeReader, tileable: TileableType
+    ) -> Dict[str, Any]:
+        meta_obj = msgpack.loads(reader.read_file(_META_FILE_NAME))
+        return self._prepare_meta_for_deserial(tileable, meta_obj)
+
+    @abstractmethod
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        raise NotImplementedError
+
+    def read_object(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        slices: SlicesType = None,
+    ) -> Any:
+        meta = self.read_object_meta(reader, tileable)
+        return self._read_object_body(reader, tileable, meta, slices)
+
+    @abstractmethod
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        raise NotImplementedError
+
+    def write_object_meta(
+        self,
+        writer: ODPSVolumeWriter,
+        tileable: TileableType,
+        extra_meta: Dict[str, Any] = None,
+    ):
+        meta_obj = tileable.params.copy()
+        if extra_meta:
+            meta_obj.update(extra_meta)
+        meta_obj = self._prepare_meta_for_serial(tileable, meta_obj)
+        packed = msgpack.dumps(meta_obj)
+        writer.write_file(_META_FILE_NAME, packed)
+
+    def write_object(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        self.write_object_meta(writer, tileable)
+        self._write_object_body(writer, tileable, value)
+
+
+@register_object_io_handler(ObjectData)
+class ObjectIOHandler(AbstractObjectIOHandler):
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        return pickle.loads(reader.read_file("data"))
+
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        writer.write_file("data", pickle.dumps(value))
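The ".pickled_keys" scheme above keeps volume metadata msgpack-friendly: any value msgpack cannot encode natively is pickled, and its key is recorded so the reader knows which entries to unpickle. A minimal standalone sketch of that round trip, using stdlib pickle in place of maxframe's wrapped_pickle (an illustrative stand-in, not the library's own code):

import pickle  # stand-in for maxframe.lib.wrapped_pickle

import msgpack


def pack_meta(meta: dict) -> bytes:
    # Mirrors _prepare_meta_for_serial: pickle non-primitive values
    # and remember their keys under ".pickled_keys".
    packed = meta.copy()
    pickled_keys = []
    for k, v in meta.items():
        if not isinstance(v, (str, bytes, int, float, bool)):
            packed[k] = pickle.dumps(v)
            pickled_keys.append(k)
    packed[".pickled_keys"] = pickled_keys
    return msgpack.dumps(packed)


def unpack_meta(raw: bytes) -> dict:
    # Mirrors _prepare_meta_for_deserial: unpickle the recorded keys.
    meta = msgpack.loads(raw)
    for k in meta.pop(".pickled_keys", None) or []:
        meta[k] = pickle.loads(meta[k])
    return meta


meta = {"shape": (3, 3), "order": "C", "nsplits": ((3,),)}
assert unpack_meta(pack_meta(meta)) == meta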
maxframe/io/objects/tensor.py

@@ -0,0 +1,76 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import struct
+from io import BytesIO
+from typing import Any, Dict
+
+import msgpack
+import numpy as np
+
+from ...lib import wrapped_pickle as pickle
+from ...tensor.core import TensorData
+from ...typing_ import SlicesType, TileableType
+from ..odpsio import ODPSVolumeReader, ODPSVolumeWriter
+from .core import AbstractObjectIOHandler, register_object_io_handler
+
+
+@register_object_io_handler(TensorData)
+class TensorIOHandler(AbstractObjectIOHandler):
+    def write_object_meta(
+        self,
+        writer: ODPSVolumeWriter,
+        tileable: TileableType,
+        extra_meta: Dict[str, Any] = None,
+    ):
+        # fixme upload in real slices when tensors are supported in DPE
+        extra_meta = extra_meta or dict()
+        extra_meta["nsplits"] = ((np.nan,),)
+
+        super().write_object_meta(writer, tileable, extra_meta=extra_meta)
+
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        # fixme read data with slices when tensors are supported in DPE
+        body = reader.read_file("0,0.dat")
+        bio = BytesIO(body)
+        (header_len,) = struct.unpack("<I", bio.read(4))
+        header_data = msgpack.loads(bio.read(header_len))
+
+        pickled = bio.read(header_data[0])
+        bufs = [bio.read(size) for size in header_data[1:]]
+        return pickle.loads(pickled, buffers=bufs)
+
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        # fixme upload in real slices when tensors are supported in DPE
+        def data_gen():
+            bufs = []
+            pickled = pickle.dumps(value, buffer_callback=bufs.append)
+            header_data = msgpack.dumps(
+                [len(pickled)] + [len(buf.raw()) for buf in bufs]
+            )
+            yield struct.pack("<I", len(header_data))
+            yield header_data
+            yield pickled
+            for buf in bufs:
+                yield buf
+
+        writer.write_file("0,0.dat", data_gen())
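TensorIOHandler stores the tensor body in a single "0,0.dat" blob: a 4-byte little-endian header length, a msgpack list of segment sizes ([len(pickled), len(buf0), ...]), the pickle-5 payload, then the raw out-of-band buffers. A standalone round-trip sketch of that framing, again with stdlib pickle standing in for wrapped_pickle:

import pickle
import struct
from io import BytesIO

import msgpack
import numpy as np


def frame_tensor(value) -> bytes:
    # Same layout _write_object_body streams out: size header first,
    # then the pickled object, then any out-of-band pickle-5 buffers.
    bufs = []
    pickled = pickle.dumps(value, protocol=5, buffer_callback=bufs.append)
    raw = [bytes(b.raw()) for b in bufs]
    header = msgpack.dumps([len(pickled)] + [len(b) for b in raw])
    return b"".join([struct.pack("<I", len(header)), header, pickled] + raw)


def unframe_tensor(body: bytes):
    # Same steps as _read_object_body above.
    bio = BytesIO(body)
    (header_len,) = struct.unpack("<I", bio.read(4))
    sizes = msgpack.loads(bio.read(header_len))
    pickled = bio.read(sizes[0])
    bufs = [bio.read(size) for size in sizes[1:]]
    return pickle.loads(pickled, buffers=bufs)


data = np.arange(9).reshape(3, 3)
np.testing.assert_equal(data, unframe_tensor(frame_tensor(data)))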
maxframe/io/objects/tests/__init__.py

@@ -0,0 +1,13 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
maxframe/io/objects/tests/test_object_io.py

@@ -0,0 +1,97 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pytest
+from odps import ODPS
+
+from ....core import OutputType
+from ....core.operator import ObjectOperatorMixin, Operator
+from ....tensor.datasource import ArrayDataSource
+from ....tests.utils import tn
+from ...odpsio import ODPSVolumeReader, ODPSVolumeWriter
+from ..core import get_object_io_handler
+
+
+class TestObjectOp(Operator, ObjectOperatorMixin):
+    def __call__(self):
+        self._output_types = [OutputType.object]
+        return self.new_tileable([])
+
+
+@pytest.fixture(scope="module")
+def create_volume(request, oss_config):
+    test_vol_name = tn("test_object_io_volume")
+    odps_entry = ODPS.from_environments()
+
+    try:
+        odps_entry.delete_volume(test_vol_name, auto_remove_dir=True, recursive=True)
+    except:
+        pass
+
+    oss_test_dir_name = tn("test_oss_directory")
+    if oss_config is None:
+        pytest.skip("Need oss and its config to run this test")
+    (
+        oss_access_id,
+        oss_secret_access_key,
+        oss_bucket_name,
+        oss_endpoint,
+    ) = oss_config.oss_config
+    test_location = "oss://%s:%s@%s/%s/%s" % (
+        oss_access_id,
+        oss_secret_access_key,
+        oss_endpoint,
+        oss_bucket_name,
+        oss_test_dir_name,
+    )
+    oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
+    odps_entry.create_external_volume(test_vol_name, location=test_location)
+
+    try:
+        yield test_vol_name
+    finally:
+        try:
+            odps_entry.delete_volume(
+                test_vol_name, auto_remove_dir=True, recursive=True
+            )
+        except:
+            pass
+
+
+def test_simple_object_io(create_volume):
+    obj = TestObjectOp()()
+    data = "abcdefg"
+
+    odps_entry = ODPS.from_environments()
+
+    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
+    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+
+    handler = get_object_io_handler(obj)()
+    handler.write_object(writer, obj, data)
+    assert data == handler.read_object(reader, obj)
+
+
+def test_tensor_object_io(create_volume):
+    data = np.array([[4, 9, 2], [3, 5, 7], [8, 1, 6]])
+    obj = ArrayDataSource(data, dtype=data.dtype)(data.shape)
+
+    odps_entry = ODPS.from_environments()
+
+    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
+    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+
+    handler = get_object_io_handler(obj)()
+    handler.write_object(writer, obj, data)
+    np.testing.assert_equal(data, handler.read_object(reader, obj))
maxframe/{odpsio → io/odpsio}/__init__.py

@@ -14,8 +14,10 @@
 
 from .arrow import arrow_to_pandas, pandas_to_arrow
 from .schema import (
+    arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
     odps_schema_to_pandas_dtypes,
     pandas_to_odps_schema,
 )
 from .tableio import HaloTableIO, ODPSTableIO
+from .volumeio import ODPSVolumeReader, ODPSVolumeWriter
maxframe/{odpsio → io/odpsio}/arrow.py

@@ -17,10 +17,10 @@ from typing import Any, Tuple, Union
 import pandas as pd
 import pyarrow as pa
 
-from ..core import OutputType
-from ..protocol import DataFrameTableMeta
-from ..tensor.core import TENSOR_TYPE
-from ..typing_ import ArrowTableType, PandasObjectTypes
+from ...core import OutputType
+from ...protocol import DataFrameTableMeta
+from ...tensor.core import TENSOR_TYPE
+from ...typing_ import ArrowTableType, PandasObjectTypes
 from .schema import build_dataframe_table_meta
 
 
maxframe/{odpsio → io/odpsio}/schema.py

@@ -21,9 +21,9 @@ import pyarrow as pa
 from odps import types as odps_types
 from pandas.api import types as pd_types
 
-from ..core import TILEABLE_TYPE, OutputType
-from ..protocol import DataFrameTableMeta
-from ..tensor.core import TENSOR_TYPE
+from ...core import TILEABLE_TYPE, OutputType
+from ...protocol import DataFrameTableMeta
+from ...tensor.core import TENSOR_TYPE
 
 _TEMP_TABLE_PREFIX = "tmp_mf_"
 
@@ -54,7 +54,9 @@ _odps_type_to_arrow = {
     odps_types.double: pa.float64(),
     odps_types.date: pa.date32(),
     odps_types.datetime: pa.timestamp("ms"),
+    odps_types.json: pa.string(),
     odps_types.timestamp: pa.timestamp("ns"),
+    odps_types.timestamp_ntz: pa.timestamp("ns"),
 }
 
 
@@ -166,7 +168,7 @@ def odps_schema_to_pandas_dtypes(
     return arrow_schema.empty_table().to_pandas().dtypes
 
 
-def
+def is_scalar_object(df_obj: Any) -> bool:
     return (
         isinstance(df_obj, TENSOR_TYPE) and df_obj.shape == ()
     ) or pd_types.is_scalar(df_obj)
@@ -184,10 +186,10 @@ def pandas_to_odps_schema(
     unknown_as_string: bool = False,
     ignore_index=False,
 ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
-    from .. import dataframe as md
+    from ... import dataframe as md
     from .arrow import pandas_to_arrow
 
-    if
+    if is_scalar_object(df_obj):
         empty_index = None
     elif hasattr(df_obj, "index_value"):
         empty_index = df_obj.index_value.to_pandas()[:0]
@@ -278,7 +280,7 @@ def build_table_column_name(
 def build_dataframe_table_meta(
     df_obj: Any, ignore_index: bool = False
 ) -> DataFrameTableMeta:
-    from .. import dataframe as md
+    from ... import dataframe as md
 
     col_to_count = defaultdict(lambda: 0)
     col_to_idx = defaultdict(lambda: 0)
@@ -289,7 +291,7 @@ def build_dataframe_table_meta(
         obj_type = OutputType.series
     elif isinstance(df_obj, (md.Index, pd.Index)):
         obj_type = OutputType.index
-    elif
+    elif is_scalar_object(df_obj):
         obj_type = OutputType.scalar
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")
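The _odps_type_to_arrow map above gains two entries in rc4: ODPS json columns are read as Arrow strings, and timestamp_ntz as nanosecond timestamps. A hedged sketch of how such a map translates an ODPS column list into an Arrow schema; the names below are illustrative, not maxframe's API:

import pyarrow as pa

# Only the entries visible in this diff; the real map covers more types.
ODPS_TO_ARROW = {
    "double": pa.float64(),
    "date": pa.date32(),
    "datetime": pa.timestamp("ms"),
    "json": pa.string(),                  # added in rc4
    "timestamp": pa.timestamp("ns"),
    "timestamp_ntz": pa.timestamp("ns"),  # added in rc4
}


def to_arrow_schema(columns) -> pa.Schema:
    # columns: iterable of (name, odps_type_name) pairs
    return pa.schema([(name, ODPS_TO_ARROW[t]) for name, t in columns])


print(to_arrow_schema([("ts", "timestamp_ntz"), ("payload", "json")]))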
maxframe/{odpsio → io/odpsio}/tableio.py

@@ -18,14 +18,15 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Dict, List, Optional, Union
 
+import numpy as np
 import pyarrow as pa
 from odps import ODPS
+from odps import __version__ as pyodps_version
 from odps.apis.storage_api import (
     StorageApiArrowClient,
     TableBatchScanResponse,
     TableBatchWriteResponse,
 )
-from odps.config import option_context as pyodps_option_context
 from odps.tunnel import TableTunnel
 from odps.types import OdpsSchema, PartitionSpec, timestamp_ntz
 
@@ -34,20 +35,16 @@ try:
 except ImportError:
     pac = None
 
-from ..config import options
-from ..env import ODPS_STORAGE_API_ENDPOINT
+from ...config import options
+from ...env import ODPS_STORAGE_API_ENDPOINT
+from ...lib.version import Version
+from ...utils import sync_pyodps_options
 from .schema import odps_schema_to_arrow_schema
 
 PartitionsType = Union[List[str], str, None]
 
 _DEFAULT_ROW_BATCH_SIZE = 4096
-
-
-@contextmanager
-def _sync_pyodps_timezone():
-    with pyodps_option_context() as cfg:
-        cfg.local_timezone = options.local_timezone
-        yield
+_need_patch_batch = Version(pyodps_version) < Version("0.12.0")
 
 
 class ODPSTableIO(ABC):
@@ -163,10 +160,15 @@ class TunnelMultiPartitionReader:
             self._cur_partition_id += 1
 
             part_str = self._partitions[self._cur_partition_id]
-            with _sync_pyodps_timezone():
+
+            # todo make this more formal when PyODPS 0.12.0 is released
+            req_columns = self._columns
+            if not _need_patch_batch:
+                req_columns = self._schema.names
+            with sync_pyodps_options():
                 self._cur_reader = self._table.open_reader(
                     part_str,
-                    columns=self._columns,
+                    columns=req_columns,
                     arrow=True,
                     download_id=self._partition_to_download_ids.get(part_str),
                 )
@@ -177,7 +179,7 @@ class TunnelMultiPartitionReader:
         else:
             count = min(self._count, self._cur_reader.count - start)
 
-        with _sync_pyodps_timezone():
+        with sync_pyodps_options():
             self._reader_iter = self._cur_reader.read(start, count)
             break
         self._reader_start_pos += self._cur_reader.count
@@ -209,11 +211,12 @@ class TunnelMultiPartitionReader:
         for part_col in self._partition_cols or []:
             names.append(part_col)
             col_type = self._schema.field_by_name(part_col).type
-
+            pt_col = np.repeat([pt_spec[part_col]], batch.num_rows)
+            arrays.append(pa.array(pt_col).cast(col_type))
         return pa.RecordBatch.from_arrays(arrays, names)
 
     def read(self):
-        with _sync_pyodps_timezone():
+        with sync_pyodps_options():
             if self._cur_reader is None:
                 self._open_next_reader()
                 if self._cur_reader is None:
@@ -224,7 +227,10 @@ class TunnelMultiPartitionReader:
                 if batch is not None:
                     if self._row_left is not None:
                         self._row_left -= batch.num_rows
-                    return self._fill_batch_partition(batch)
+                    if _need_patch_batch:
+                        return self._fill_batch_partition(batch)
+                    else:
+                        return batch
             except StopIteration:
                 self._open_next_reader()
                 return None
@@ -282,7 +288,9 @@ class TunnelTableIO(ODPSTableIO):
         reverse_range: bool = False,
         row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
     ):
-        table = self._odps.get_table(full_table_name)
+        with sync_pyodps_options():
+            table = self._odps.get_table(full_table_name)
+
         if partition_columns is True:
             partition_columns = [c.name for c in table.table_schema.partitions]
 
@@ -293,21 +301,22 @@ class TunnelTableIO(ODPSTableIO):
             or (stop is not None and stop < 0)
             or (reverse_range and start is None)
         ):
-
-
-
-
-
-
-
-            part_to_down_id = dict()
-            total_records = 0
-            for part in parts:
-                down_session = tunnel.create_download_session(
-                    table, async_mode=True, partition_spec=part
+            with sync_pyodps_options():
+                table = self._odps.get_table(full_table_name)
+                tunnel = TableTunnel(self._odps)
+                parts = (
+                    [partitions]
+                    if partitions is None or isinstance(partitions, str)
+                    else partitions
                 )
-                part_to_down_id[part] = down_session.id
-                total_records += down_session.count
+                part_to_down_id = dict()
+                total_records = 0
+                for part in parts:
+                    down_session = tunnel.create_download_session(
+                        table, async_mode=True, partition_spec=part
+                    )
+                    part_to_down_id[part] = down_session.id
+                    total_records += down_session.count
 
         count = None
         if start is not None or stop is not None:
@@ -344,7 +353,7 @@ class TunnelTableIO(ODPSTableIO):
         overwrite: bool = True,
     ):
         table = self._odps.get_table(full_table_name)
-        with _sync_pyodps_timezone():
+        with sync_pyodps_options():
             with table.open_writer(
                 partition=partition,
                 arrow=True,
@@ -354,7 +363,10 @@ class TunnelTableIO(ODPSTableIO):
                 # fixme should yield writer directly once pyodps fixes
                 # related arrow timestamp bug when provided schema and
                 # table schema is identical.
-                yield TunnelWrappedWriter(writer)
+                if _need_patch_batch:
+                    yield TunnelWrappedWriter(writer)
+                else:
+                    yield writer
 
 
 class HaloTableArrowReader:
@@ -590,8 +602,8 @@ class HaloTableIO(ODPSTableIO):
     ):
         from odps.apis.storage_api import (
             SessionRequest,
+            SessionStatus,
             SplitOptions,
-            Status,
             TableBatchScanRequest,
         )
 
@@ -622,13 +634,13 @@ class HaloTableIO(ODPSTableIO):
         resp = client.create_read_session(req)
 
         session_id = resp.session_id
-        status = resp.
-        while status ==
+        status = resp.session_status
+        while status == SessionStatus.INIT:
             resp = client.get_read_session(SessionRequest(session_id))
-            status = resp.
+            status = resp.session_status
             time.sleep(1.0)
 
-        assert status ==
+        assert status == SessionStatus.NORMAL
 
         count = None
         if start is not None or stop is not None: