maxframe 1.0.0rc1__cp37-cp37m-win32.whl → 1.0.0rc3__cp37-cp37m-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp37-win32.pyd +0 -0
- maxframe/codegen.py +3 -6
- maxframe/config/config.py +49 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +15 -2
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/objects.py +46 -3
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp37-win32.pyd +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +5 -5
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +7 -1
- maxframe/dataframe/datasource/read_odps_table.py +3 -2
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/datastore/to_odps.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/apply.py +3 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +4 -7
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +12 -8
- maxframe/{odpsio → io/odpsio}/schema.py +15 -12
- maxframe/io/odpsio/tableio.py +702 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +19 -18
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +57 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
- maxframe/learn/contrib/xgboost/predict.py +21 -7
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +27 -17
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cp37-win32.pyd +0 -0
- maxframe/protocol.py +41 -17
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp37-win32.pyd +0 -0
- maxframe/serialization/serializables/core.py +48 -9
- maxframe/tensor/__init__.py +69 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +98 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +70 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/{base → misc}/unique.py +2 -2
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/utils.py +11 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +32 -70
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/METADATA +2 -2
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +133 -123
- maxframe_client/fetcher.py +60 -68
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +58 -22
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +27 -4
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -322
- maxframe/odpsio/volumeio.py +0 -95
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +0 -0
- {maxframe-1.0.0rc1.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from abc import ABCMeta, abstractmethod
|
|
16
|
+
from typing import Any, Dict, Type, Union
|
|
17
|
+
|
|
18
|
+
import msgpack
|
|
19
|
+
|
|
20
|
+
from ...core import Entity, EntityData
|
|
21
|
+
from ...core.entity import ObjectData, TileableData
|
|
22
|
+
from ...lib import wrapped_pickle as pickle
|
|
23
|
+
from ...typing_ import SlicesType, TileableType
|
|
24
|
+
from ...utils import TypeDispatcher
|
|
25
|
+
from ..odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter
|
|
26
|
+
|
|
27
|
+
_MetaType = Dict[str, Any]
|
|
28
|
+
|
|
29
|
+
_META_FILE_NAME = ".meta"
|
|
30
|
+
_META_PICKLED_KEYS_KEY = ".pickled_keys"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
_io_handler_dispatcher = TypeDispatcher()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def register_object_io_handler(tileable_data_type: Type[TileableData]):
    """Build a class decorator that registers an IO handler class.

    The decorated class is recorded in the module-level dispatcher under
    ``tileable_data_type`` and is returned unchanged, so the decorator can be
    applied directly on a class definition.
    """

    def _register(io_handler_cls):
        _io_handler_dispatcher.register(tileable_data_type, io_handler_cls)
        return io_handler_cls

    return _register
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_object_io_handler(
    tileable_data_type: Union[Entity, EntityData, Type[EntityData]]
) -> Type["AbstractObjectIOHandler"]:
    """Look up the IO handler registered for an entity, its data or a class.

    A class is passed to the dispatcher as is. An ``Entity`` is first
    unwrapped to its ``data`` attribute, and any remaining instance is
    reduced to its class before the dispatcher is queried.
    """
    target = tileable_data_type
    if isinstance(target, type):
        # already a class: nothing to unwrap
        return _io_handler_dispatcher.get_handler(target)
    if isinstance(target, Entity):
        target = target.data
    return _io_handler_dispatcher.get_handler(type(target))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class AbstractObjectIOHandler(metaclass=ABCMeta):
    """Base class of handlers that read and write objects via ODPS volumes.

    An object is stored as a msgpack-encoded meta file (``_META_FILE_NAME``)
    plus a body whose layout is defined by subclasses through
    ``_read_object_body`` and ``_write_object_body``.
    """

    def _prepare_meta_for_serial(
        self, tileable: TileableType, meta: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Return a shallow copy of ``meta`` that only holds packable values.

        Values that are not ``str``, ``bytes``, ``int``, ``float`` or ``bool``
        are replaced with their pickled bytes. The keys of the replaced
        values are listed under ``_META_PICKLED_KEYS_KEY`` so that
        ``_prepare_meta_for_deserial`` can restore them. ``meta`` itself is
        not modified.
        """
        to_pack = meta.copy()
        pickled_keys = []
        for key, value in meta.items():
            if not isinstance(value, (str, bytes, int, float, bool)):
                to_pack[key] = pickle.dumps(value)
                pickled_keys.append(key)
        # use the shared constant so writer and reader cannot drift apart
        to_pack[_META_PICKLED_KEYS_KEY] = pickled_keys
        return to_pack

    def _prepare_meta_for_deserial(
        self, tileable: TileableType, meta: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Undo ``_prepare_meta_for_serial`` on ``meta`` in place.

        The bookkeeping entry ``_META_PICKLED_KEYS_KEY`` is removed and every
        key it lists is unpickled. The same dict object is returned.
        """
        pickled_keys = meta.pop(_META_PICKLED_KEYS_KEY, None) or []
        for key in pickled_keys:
            # NOTE: unpickling can execute code embedded in the payload, so
            # the meta file must only be read from trusted volumes.
            meta[key] = pickle.loads(meta[key])
        return meta

    def read_object_meta(
        self, reader: ODPSVolumeReader, tileable: TileableType
    ) -> Dict[str, Any]:
        """Read the meta file through ``reader`` and return the decoded dict."""
        meta_obj = msgpack.loads(reader.read_file(_META_FILE_NAME))
        return self._prepare_meta_for_deserial(tileable, meta_obj)

    @abstractmethod
    def _read_object_body(
        self,
        reader: ODPSVolumeReader,
        tileable: TileableType,
        meta: Dict[str, Any],
        slices: SlicesType = None,
    ) -> Any:
        """Read and return the object body; implemented by subclasses."""
        raise NotImplementedError

    def read_object(
        self,
        reader: ODPSVolumeReader,
        tileable: TileableType,
        slices: SlicesType = None,
    ) -> Any:
        """Read the meta file, then return the body decoded by the subclass."""
        meta = self.read_object_meta(reader, tileable)
        return self._read_object_body(reader, tileable, meta, slices)

    @abstractmethod
    def _write_object_body(
        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
    ):
        """Write ``value`` as the object body; implemented by subclasses."""
        raise NotImplementedError

    def write_object_meta(
        self,
        writer: ODPSVolumeWriter,
        tileable: TileableType,
        extra_meta: Union[Dict[str, Any], None] = None,
    ):
        """Write the meta file for ``tileable``.

        The meta starts as a copy of ``tileable.params``. Entries of
        ``extra_meta``, when given, override or extend it before the result
        is msgpack-encoded and written as ``_META_FILE_NAME``.
        """
        meta_obj = tileable.params.copy()
        if extra_meta:
            meta_obj.update(extra_meta)
        meta_obj = self._prepare_meta_for_serial(tileable, meta_obj)
        packed = msgpack.dumps(meta_obj)
        writer.write_file(_META_FILE_NAME, packed)

    def write_object(
        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
    ):
        """Write the meta file first, then the body of ``value``."""
        self.write_object_meta(writer, tileable)
        self._write_object_body(writer, tileable, value)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@register_object_io_handler(ObjectData)
class ObjectIOHandler(AbstractObjectIOHandler):
    """IO handler registered for ``ObjectData``.

    The body is a single pickled blob kept in a file named ``data``.
    """

    def _read_object_body(
        self,
        reader: ODPSVolumeReader,
        tileable: TileableType,
        meta: Dict[str, Any],
        slices: SlicesType = None,
    ) -> Any:
        """Load the ``data`` file and return the unpickled value."""
        raw = reader.read_file("data")
        return pickle.loads(raw)

    def _write_object_body(
        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
    ):
        """Pickle ``value`` and store it as the ``data`` file."""
        payload = pickle.dumps(value)
        writer.write_file("data", payload)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import struct
|
|
16
|
+
from io import BytesIO
|
|
17
|
+
from typing import Any, Dict
|
|
18
|
+
|
|
19
|
+
import msgpack
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
from ...lib import wrapped_pickle as pickle
|
|
23
|
+
from ...tensor.core import TensorData
|
|
24
|
+
from ...typing_ import SlicesType, TileableType
|
|
25
|
+
from ..odpsio import ODPSVolumeReader, ODPSVolumeWriter
|
|
26
|
+
from .core import AbstractObjectIOHandler, register_object_io_handler
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@register_object_io_handler(TensorData)
class TensorIOHandler(AbstractObjectIOHandler):
    """IO handler registered for ``TensorData``.

    The whole tensor is stored in one file. Its layout is: a 4-byte
    little-endian unsigned header length, a msgpack-encoded list of sizes
    (pickled payload first, then one size per out-of-band buffer), the
    pickled payload, and finally the buffers in order.
    """

    # single body file used until real slices are supported
    _DATA_FILE_NAME = "0,0.dat"

    def write_object_meta(
        self,
        writer: ODPSVolumeWriter,
        tileable: TileableType,
        extra_meta: Dict[str, Any] = None,
    ):
        """Write the meta file with ``nsplits`` forced to ``((nan,),)``.

        ``extra_meta`` is copied before ``nsplits`` is added, so a dict
        supplied by the caller is never modified.
        """
        # fixme upload in real slices when tensors are supported in DPE
        merged_meta = dict(extra_meta) if extra_meta else {}
        merged_meta["nsplits"] = ((np.nan,),)

        super().write_object_meta(writer, tileable, extra_meta=merged_meta)

    def _read_object_body(
        self,
        reader: ODPSVolumeReader,
        tileable: TileableType,
        meta: Dict[str, Any],
        slices: SlicesType = None,
    ) -> Any:
        """Read the body file and rebuild the value from payload and buffers.

        ``slices`` is accepted for interface compatibility but not used yet.
        """
        # fixme read data with slices when tensors are supported in DPE
        body = reader.read_file(self._DATA_FILE_NAME)
        bio = BytesIO(body)
        (header_len,) = struct.unpack("<I", bio.read(4))
        header_data = msgpack.loads(bio.read(header_len))

        # first size belongs to the pickled payload, the rest to the buffers
        pickled = bio.read(header_data[0])
        bufs = [bio.read(size) for size in header_data[1:]]
        # NOTE: unpickling can execute code embedded in the payload, so the
        # body must only be read from trusted volumes.
        return pickle.loads(pickled, buffers=bufs)

    def _write_object_body(
        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
    ):
        """Write ``value`` as header, pickled payload and raw buffers."""

        # fixme upload in real slices when tensors are supported in DPE
        def data_gen():
            bufs = []
            # NOTE(review): the standard library accepts buffer_callback only
            # with pickle protocol 5; presumably the wrapped pickle module
            # selects it -- TODO confirm.
            pickled = pickle.dumps(value, buffer_callback=bufs.append)
            header_data = msgpack.dumps(
                [len(pickled)] + [len(buf.raw()) for buf in bufs]
            )
            yield struct.pack("<I", len(header_data))
            yield header_data
            yield pickled
            for buf in bufs:
                yield buf

        writer.write_file(self._DATA_FILE_NAME, data_gen())
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pytest
|
|
16
|
+
from odps import ODPS
|
|
17
|
+
|
|
18
|
+
from ....core import OutputType
|
|
19
|
+
from ....core.operator import ObjectOperatorMixin, Operator
|
|
20
|
+
from ....tensor.datasource import ArrayDataSource
|
|
21
|
+
from ....tests.utils import tn
|
|
22
|
+
from ...odpsio import ODPSVolumeReader, ODPSVolumeWriter
|
|
23
|
+
from ..core import get_object_io_handler
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TestObjectOp(Operator, ObjectOperatorMixin):
    """Minimal operator the tests call to obtain an object-typed tileable."""

    def __call__(self):
        # declare a single object output, then create the tileable
        self._output_types = [OutputType.object]
        no_inputs = []
        return self.new_tileable(no_inputs)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.fixture(scope="module")
def create_volume(request, oss_config):
    """Create an external ODPS volume on OSS and yield its name.

    Any volume left over under the same name is removed first. The test is
    skipped when ``oss_config`` is ``None``. The volume is deleted again when
    the fixture is torn down.
    """
    test_vol_name = tn("test_object_io_volume")
    odps_entry = ODPS.from_environments()

    def _drop_volume_quietly():
        # Best-effort cleanup: the volume may not exist. Catch Exception
        # instead of using a bare except so that KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            odps_entry.delete_volume(
                test_vol_name, auto_remove_dir=True, recursive=True
            )
        except Exception:
            pass

    _drop_volume_quietly()

    oss_test_dir_name = tn("test_oss_directory")
    if oss_config is None:
        pytest.skip("Need oss and its config to run this test")
    (
        oss_access_id,
        oss_secret_access_key,
        oss_bucket_name,
        oss_endpoint,
    ) = oss_config.oss_config
    test_location = "oss://%s:%s@%s/%s/%s" % (
        oss_access_id,
        oss_secret_access_key,
        oss_endpoint,
        oss_bucket_name,
        oss_test_dir_name,
    )
    # create the directory placeholder before pointing the volume at it
    oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
    odps_entry.create_external_volume(test_vol_name, location=test_location)

    try:
        yield test_vol_name
    finally:
        _drop_volume_quietly()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_simple_object_io(create_volume):
    """Round-trip a plain string through the handler chosen for an object."""
    obj = TestObjectOp()()
    expected = "abcdefg"

    odps_entry = ODPS.from_environments()

    # reader and writer address the same volume location, keyed by the object
    volume_args = (odps_entry, create_volume, obj.key)
    reader = ODPSVolumeReader(*volume_args)
    writer = ODPSVolumeWriter(*volume_args)

    handler_cls = get_object_io_handler(obj)
    handler = handler_cls()
    handler.write_object(writer, obj, expected)
    assert expected == handler.read_object(reader, obj)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_tensor_object_io(create_volume):
    """Round-trip a 3x3 integer array through the handler chosen for a tensor."""
    expected = np.array([[4, 9, 2], [3, 5, 7], [8, 1, 6]])
    source_op = ArrayDataSource(expected, dtype=expected.dtype)
    obj = source_op(expected.shape)

    odps_entry = ODPS.from_environments()

    # reader and writer address the same volume location, keyed by the tensor
    volume_args = (odps_entry, create_volume, obj.key)
    reader = ODPSVolumeReader(*volume_args)
    writer = ODPSVolumeWriter(*volume_args)

    handler_cls = get_object_io_handler(obj)
    handler = handler_cls()
    handler.write_object(writer, obj, expected)
    np.testing.assert_equal(expected, handler.read_object(reader, obj))
|
|
@@ -14,8 +14,10 @@
|
|
|
14
14
|
|
|
15
15
|
from .arrow import arrow_to_pandas, pandas_to_arrow
|
|
16
16
|
from .schema import (
|
|
17
|
+
arrow_schema_to_odps_schema,
|
|
17
18
|
build_dataframe_table_meta,
|
|
18
19
|
odps_schema_to_pandas_dtypes,
|
|
19
20
|
pandas_to_odps_schema,
|
|
20
21
|
)
|
|
21
|
-
from .tableio import HaloTableIO
|
|
22
|
+
from .tableio import HaloTableIO, ODPSTableIO
|
|
23
|
+
from .volumeio import ODPSVolumeReader, ODPSVolumeWriter
|
|
@@ -17,10 +17,10 @@ from typing import Any, Tuple, Union
|
|
|
17
17
|
import pandas as pd
|
|
18
18
|
import pyarrow as pa
|
|
19
19
|
|
|
20
|
-
from
|
|
21
|
-
from
|
|
22
|
-
from
|
|
23
|
-
from
|
|
20
|
+
from ...core import OutputType
|
|
21
|
+
from ...protocol import DataFrameTableMeta
|
|
22
|
+
from ...tensor.core import TENSOR_TYPE
|
|
23
|
+
from ...typing_ import ArrowTableType, PandasObjectTypes
|
|
24
24
|
from .schema import build_dataframe_table_meta
|
|
25
25
|
|
|
26
26
|
|
|
@@ -45,9 +45,13 @@ def _rebuild_dataframe(
|
|
|
45
45
|
|
|
46
46
|
def _rebuild_index(df: pd.DataFrame, table_meta: DataFrameTableMeta) -> pd.Index:
|
|
47
47
|
if df.shape[1] > 1:
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
48
|
+
idx = pd.MultiIndex.from_frame(df)
|
|
49
|
+
idx.names = table_meta.pd_index_level_names
|
|
50
|
+
else:
|
|
51
|
+
# make sure even if None names are updated properly
|
|
52
|
+
idx = pd.Index(df.iloc[:, 0])
|
|
53
|
+
idx.name = table_meta.pd_index_level_names[0]
|
|
54
|
+
return idx
|
|
51
55
|
|
|
52
56
|
|
|
53
57
|
def arrow_to_pandas(
|
|
@@ -75,7 +79,7 @@ def pandas_to_arrow(
|
|
|
75
79
|
df.columns = pd.Index(table_meta.table_column_names)
|
|
76
80
|
if not ignore_index:
|
|
77
81
|
df = df.rename_axis(table_meta.table_index_column_names).reset_index()
|
|
78
|
-
elif ignore_index:
|
|
82
|
+
elif ignore_index and table_meta.type != OutputType.index:
|
|
79
83
|
df = pd.DataFrame([], columns=[])
|
|
80
84
|
elif table_meta.type == OutputType.index:
|
|
81
85
|
names = [f"_idx_{idx}" for idx in range(len(df.names))]
|
|
@@ -21,9 +21,9 @@ import pyarrow as pa
|
|
|
21
21
|
from odps import types as odps_types
|
|
22
22
|
from pandas.api import types as pd_types
|
|
23
23
|
|
|
24
|
-
from
|
|
25
|
-
from
|
|
26
|
-
from
|
|
24
|
+
from ...core import TILEABLE_TYPE, OutputType
|
|
25
|
+
from ...protocol import DataFrameTableMeta
|
|
26
|
+
from ...tensor.core import TENSOR_TYPE
|
|
27
27
|
|
|
28
28
|
_TEMP_TABLE_PREFIX = "tmp_mf_"
|
|
29
29
|
|
|
@@ -126,10 +126,15 @@ def odps_type_to_arrow_type(
|
|
|
126
126
|
]
|
|
127
127
|
col_type = pa.struct(fields)
|
|
128
128
|
elif isinstance(odps_type, odps_types.Decimal):
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
129
|
+
if odps_type.name == "decimal":
|
|
130
|
+
# legacy decimal data without precision or scale
|
|
131
|
+
# precision data from internal compat mode
|
|
132
|
+
col_type = pa.decimal128(38, 18)
|
|
133
|
+
else:
|
|
134
|
+
col_type = pa.decimal128(
|
|
135
|
+
odps_type.precision or odps_types.Decimal._max_precision,
|
|
136
|
+
odps_type.scale or odps_types.Decimal._max_scale,
|
|
137
|
+
)
|
|
133
138
|
elif isinstance(odps_type, (odps_types.Varchar, odps_types.Char)):
|
|
134
139
|
col_type = pa.string()
|
|
135
140
|
else:
|
|
@@ -179,7 +184,7 @@ def pandas_to_odps_schema(
|
|
|
179
184
|
unknown_as_string: bool = False,
|
|
180
185
|
ignore_index=False,
|
|
181
186
|
) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
|
|
182
|
-
from
|
|
187
|
+
from ... import dataframe as md
|
|
183
188
|
from .arrow import pandas_to_arrow
|
|
184
189
|
|
|
185
190
|
if _is_scalar_object(df_obj):
|
|
@@ -273,7 +278,7 @@ def build_table_column_name(
|
|
|
273
278
|
def build_dataframe_table_meta(
|
|
274
279
|
df_obj: Any, ignore_index: bool = False
|
|
275
280
|
) -> DataFrameTableMeta:
|
|
276
|
-
from
|
|
281
|
+
from ... import dataframe as md
|
|
277
282
|
|
|
278
283
|
col_to_count = defaultdict(lambda: 0)
|
|
279
284
|
col_to_idx = defaultdict(lambda: 0)
|
|
@@ -289,8 +294,6 @@ def build_dataframe_table_meta(
|
|
|
289
294
|
else: # pragma: no cover
|
|
290
295
|
raise TypeError(f"Cannot accept type {type(df_obj)}")
|
|
291
296
|
|
|
292
|
-
assert not ignore_index or obj_type in (OutputType.dataframe, OutputType.series)
|
|
293
|
-
|
|
294
297
|
if obj_type == OutputType.scalar:
|
|
295
298
|
pd_dtypes = pd.Series([])
|
|
296
299
|
column_index_names = []
|
|
@@ -346,7 +349,7 @@ def build_dataframe_table_meta(
|
|
|
346
349
|
else:
|
|
347
350
|
index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)
|
|
348
351
|
|
|
349
|
-
if ignore_index:
|
|
352
|
+
if ignore_index and obj_type != OutputType.index:
|
|
350
353
|
table_index_column_names = []
|
|
351
354
|
pd_index_dtypes = pd.Series([], index=[])
|
|
352
355
|
else:
|