maxframe 1.0.0rc2__cp37-cp37m-win32.whl → 1.0.0rc3__cp37-cp37m-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/_utils.cp37-win32.pyd +0 -0
- maxframe/codegen.py +3 -2
- maxframe/config/config.py +16 -9
- maxframe/config/validators.py +42 -12
- maxframe/conftest.py +13 -2
- maxframe/core/__init__.py +2 -13
- maxframe/core/entity/__init__.py +0 -4
- maxframe/core/entity/objects.py +45 -2
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp37-win32.pyd +0 -0
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/datasource/read_odps_query.py +1 -1
- maxframe/dataframe/datasource/read_odps_table.py +1 -1
- maxframe/dataframe/datastore/to_odps.py +1 -1
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
- maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
- maxframe/{odpsio → io/odpsio}/schema.py +5 -5
- maxframe/{odpsio → io/odpsio}/tableio.py +10 -4
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -3
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +57 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
- maxframe/learn/contrib/xgboost/predict.py +19 -5
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +25 -15
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cp37-win32.pyd +0 -0
- maxframe/protocol.py +1 -15
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp37-win32.pyd +0 -0
- maxframe/tensor/__init__.py +10 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +98 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +70 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +1 -0
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/utils.py +2 -22
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +15 -61
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/METADATA +2 -75
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +101 -91
- maxframe_client/fetcher.py +23 -42
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +54 -18
- maxframe_client/tests/test_fetcher.py +1 -1
- maxframe_client/tests/test_session.py +14 -2
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/volumeio.py +0 -95
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/unique.py +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +0 -0
- {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0
maxframe/core/graph/entity.py
CHANGED
|
@@ -13,9 +13,9 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
15
|
from abc import ABCMeta, abstractmethod
|
|
16
|
-
from typing import Dict, Iterable, List
|
|
16
|
+
from typing import Dict, Iterable, List
|
|
17
17
|
|
|
18
|
-
from ...core import
|
|
18
|
+
from ...core import Tileable
|
|
19
19
|
from ...serialization.core import buffered
|
|
20
20
|
from ...serialization.serializables import BoolField, DictField, ListField, Serializable
|
|
21
21
|
from ...serialization.serializables.core import SerializableSerializer
|
|
@@ -97,26 +97,6 @@ class TileableGraph(EntityGraph, Iterable[Tileable]):
|
|
|
97
97
|
return self._logic_key
|
|
98
98
|
|
|
99
99
|
|
|
100
|
-
class ChunkGraph(EntityGraph, Iterable[Chunk]):
|
|
101
|
-
_result_chunks: List[Chunk]
|
|
102
|
-
|
|
103
|
-
def __init__(self, result_chunks: List[Chunk] = None):
|
|
104
|
-
super().__init__()
|
|
105
|
-
self._result_chunks = result_chunks
|
|
106
|
-
|
|
107
|
-
@property
|
|
108
|
-
def result_chunks(self):
|
|
109
|
-
return self._result_chunks
|
|
110
|
-
|
|
111
|
-
@property
|
|
112
|
-
def results(self):
|
|
113
|
-
return self._result_chunks
|
|
114
|
-
|
|
115
|
-
@results.setter
|
|
116
|
-
def results(self, new_results):
|
|
117
|
-
self._result_chunks = new_results
|
|
118
|
-
|
|
119
|
-
|
|
120
100
|
class SerializableGraph(Serializable):
|
|
121
101
|
_is_chunk = BoolField("is_chunk")
|
|
122
102
|
# TODO(qinxuye): remove this logic when we handle fetch elegantly,
|
|
@@ -132,12 +112,11 @@ class SerializableGraph(Serializable):
|
|
|
132
112
|
_results = ListField("results")
|
|
133
113
|
|
|
134
114
|
@classmethod
|
|
135
|
-
def from_graph(cls, graph:
|
|
115
|
+
def from_graph(cls, graph: EntityGraph) -> "SerializableGraph":
|
|
136
116
|
from ..operator import Fetch
|
|
137
117
|
|
|
138
|
-
is_chunk = isinstance(graph, ChunkGraph)
|
|
139
118
|
return SerializableGraph(
|
|
140
|
-
_is_chunk=
|
|
119
|
+
_is_chunk=False,
|
|
141
120
|
_fetch_nodes=[chunk for chunk in graph if isinstance(chunk.op, Fetch)],
|
|
142
121
|
_nodes=graph._nodes,
|
|
143
122
|
_predecessors=graph._predecessors,
|
|
@@ -145,9 +124,8 @@ class SerializableGraph(Serializable):
|
|
|
145
124
|
_results=graph.results,
|
|
146
125
|
)
|
|
147
126
|
|
|
148
|
-
def to_graph(self) ->
|
|
149
|
-
|
|
150
|
-
graph = graph_cls(self._results)
|
|
127
|
+
def to_graph(self) -> EntityGraph:
|
|
128
|
+
graph = TileableGraph(self._results)
|
|
151
129
|
graph._nodes.update(self._nodes)
|
|
152
130
|
graph._predecessors.update(self._predecessors)
|
|
153
131
|
graph._successors.update(self._successors)
|
|
@@ -156,14 +134,12 @@ class SerializableGraph(Serializable):
|
|
|
156
134
|
|
|
157
135
|
class GraphSerializer(SerializableSerializer):
|
|
158
136
|
@buffered
|
|
159
|
-
def serial(self, obj:
|
|
137
|
+
def serial(self, obj: EntityGraph, context: Dict):
|
|
160
138
|
serializable_graph = SerializableGraph.from_graph(obj)
|
|
161
139
|
return [], [serializable_graph], False
|
|
162
140
|
|
|
163
|
-
def deserial(
|
|
164
|
-
|
|
165
|
-
) -> Union[TileableGraph, ChunkGraph]:
|
|
166
|
-
serializable_graph: SerializableGraph = subs[0]
|
|
141
|
+
def deserial(self, serialized: List, context: Dict, subs: List) -> TileableGraph:
|
|
142
|
+
serializable_graph: EntityGraph = subs[0]
|
|
167
143
|
return serializable_graph.to_graph()
|
|
168
144
|
|
|
169
145
|
|
|
@@ -22,13 +22,6 @@ from .base import (
|
|
|
22
22
|
)
|
|
23
23
|
from .core import TileableOperatorMixin, estimate_size, execute
|
|
24
24
|
from .fetch import Fetch, FetchMixin, FetchShuffle, ShuffleFetchType
|
|
25
|
-
from .
|
|
26
|
-
from .objects import (
|
|
27
|
-
MergeDictOperator,
|
|
28
|
-
ObjectFetch,
|
|
29
|
-
ObjectFuseChunk,
|
|
30
|
-
ObjectFuseChunkMixin,
|
|
31
|
-
ObjectOperator,
|
|
32
|
-
ObjectOperatorMixin,
|
|
33
|
-
)
|
|
25
|
+
from .objects import MergeDictOperator, ObjectFetch, ObjectOperator, ObjectOperatorMixin
|
|
34
26
|
from .shuffle import MapReduceOperator, ShuffleProxy
|
|
27
|
+
from .utils import add_fetch_builder, build_fetch
|
maxframe/core/operator/base.py
CHANGED
|
@@ -12,11 +12,10 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
import functools
|
|
16
15
|
import weakref
|
|
17
16
|
from copy import deepcopy
|
|
18
17
|
from enum import Enum
|
|
19
|
-
from functools import partial
|
|
18
|
+
from functools import lru_cache, partial
|
|
20
19
|
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
|
21
20
|
|
|
22
21
|
from ...serialization.core import Placeholder
|
|
@@ -37,7 +36,6 @@ from ...serialization.serializables.core import SerializableSerializer
|
|
|
37
36
|
from ...typing_ import OperatorType
|
|
38
37
|
from ...utils import AttributeDict, classproperty, get_user_call_point, tokenize
|
|
39
38
|
from ..base import Base
|
|
40
|
-
from ..entity.chunks import Chunk
|
|
41
39
|
from ..entity.core import ENTITY_TYPE, Entity, EntityData
|
|
42
40
|
from ..entity.output_types import OutputType
|
|
43
41
|
from ..entity.tileables import Tileable
|
|
@@ -90,7 +88,7 @@ class SchedulingHint(Serializable):
|
|
|
90
88
|
priority = Int32Field("priority", default=None)
|
|
91
89
|
|
|
92
90
|
@classproperty
|
|
93
|
-
@
|
|
91
|
+
@lru_cache(1)
|
|
94
92
|
def all_hint_names(cls):
|
|
95
93
|
return list(cls._FIELDS)
|
|
96
94
|
|
|
@@ -341,7 +339,7 @@ class Operator(Base, OperatorLogicKeyGeneratorMixin, metaclass=OperatorMetaclass
|
|
|
341
339
|
raise ValueError("Outputs' size exceeds limitation")
|
|
342
340
|
|
|
343
341
|
@property
|
|
344
|
-
def outputs(self) -> List[
|
|
342
|
+
def outputs(self) -> List[Tileable]:
|
|
345
343
|
outputs = self._outputs
|
|
346
344
|
if outputs:
|
|
347
345
|
return [ref() for ref in outputs]
|
|
@@ -17,7 +17,6 @@ from ..entity import OutputType, register_fetch_class
|
|
|
17
17
|
from .base import Operator
|
|
18
18
|
from .core import TileableOperatorMixin
|
|
19
19
|
from .fetch import Fetch, FetchMixin
|
|
20
|
-
from .fuse import Fuse, FuseChunkMixin
|
|
21
20
|
|
|
22
21
|
|
|
23
22
|
class ObjectOperator(Operator):
|
|
@@ -28,14 +27,6 @@ class ObjectOperatorMixin(TileableOperatorMixin):
|
|
|
28
27
|
_output_type_ = OutputType.object
|
|
29
28
|
|
|
30
29
|
|
|
31
|
-
class ObjectFuseChunkMixin(FuseChunkMixin, ObjectOperatorMixin):
|
|
32
|
-
__slots__ = ()
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class ObjectFuseChunk(ObjectFuseChunkMixin, Fuse):
|
|
36
|
-
pass
|
|
37
|
-
|
|
38
|
-
|
|
39
30
|
class ObjectFetch(FetchMixin, ObjectOperatorMixin, Fetch):
|
|
40
31
|
_output_type_ = OutputType.object
|
|
41
32
|
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from ...typing_ import EntityType, TileableType
|
|
16
|
+
from ..entity import TILEABLE_TYPE
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def build_fetch_tileable(tileable: TileableType) -> TileableType:
    """
    Build the fetch counterpart of ``tileable``.

    The result is created by the fetch operator class returned from
    ``tileable.op.get_fetch_op_cls(tileable)`` and reuses the key, id,
    nsplits and params of the input tileable.
    """
    if tileable.is_coarse():
        # no chunk list is built for a coarse tileable
        fetch_chunks = None
    else:
        fetch_chunks = [build_fetch(c, index=c.index) for c in tileable.chunks]

    source_op = tileable.op
    extra_params = tileable.params.copy()

    fetch_op_cls = source_op.get_fetch_op_cls(tileable)
    fetch_op = fetch_op_cls(_id=source_op.id)
    fetch_tileables = fetch_op.new_tileables(
        None,
        chunks=fetch_chunks,
        nsplits=tileable.nsplits,
        _key=tileable.key,
        _id=tileable.id,
        **extra_params,
    )
    return fetch_tileables[0]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# (entity type(s), builder function) pairs; build_fetch scans them in order
# and add_fetch_builder appends new pairs at the end
_type_to_builder = [
    (TILEABLE_TYPE, build_fetch_tileable),
]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def build_fetch(entity: EntityType, **kw) -> EntityType:
    """
    Build a fetch entity with the first registered builder matching ``entity``.

    Extra keyword arguments are forwarded to the builder. Raises TypeError
    when no registered type matches.
    """
    for candidate_types, builder in _type_to_builder:
        if not isinstance(entity, candidate_types):
            continue
        return builder(entity, **kw)
    raise TypeError(f"Type {type(entity)} not supported")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def add_fetch_builder(entity_type, builder_func):
    """Register ``builder_func`` as the fetch builder for ``entity_type``."""
    registration = (entity_type, builder_func)
    _type_to_builder.append(registration)
|
|
@@ -24,7 +24,7 @@ from odps.types import Column, OdpsSchema, validate_data_type
|
|
|
24
24
|
from ... import opcodes
|
|
25
25
|
from ...core import OutputType
|
|
26
26
|
from ...core.graph import DAG
|
|
27
|
-
from ...odpsio import odps_schema_to_pandas_dtypes
|
|
27
|
+
from ...io.odpsio import odps_schema_to_pandas_dtypes
|
|
28
28
|
from ...serialization.serializables import (
|
|
29
29
|
AnyField,
|
|
30
30
|
BoolField,
|
|
@@ -23,7 +23,7 @@ from odps.utils import to_timestamp
|
|
|
23
23
|
|
|
24
24
|
from ... import opcodes
|
|
25
25
|
from ...core import OutputType
|
|
26
|
-
from ...odpsio import odps_schema_to_pandas_dtypes
|
|
26
|
+
from ...io.odpsio import odps_schema_to_pandas_dtypes
|
|
27
27
|
from ...serialization.serializables import (
|
|
28
28
|
AnyField,
|
|
29
29
|
BoolField,
|
|
@@ -23,7 +23,7 @@ from odps.types import PartitionSpec
|
|
|
23
23
|
from ... import opcodes
|
|
24
24
|
from ...config import options
|
|
25
25
|
from ...core import OutputType
|
|
26
|
-
from ...odpsio import build_dataframe_table_meta
|
|
26
|
+
from ...io.odpsio import build_dataframe_table_meta
|
|
27
27
|
from ...serialization.serializables import (
|
|
28
28
|
BoolField,
|
|
29
29
|
FieldTypes,
|
maxframe/dataframe/operators.py
CHANGED
|
@@ -16,13 +16,7 @@ import numpy as np
|
|
|
16
16
|
import pandas as pd
|
|
17
17
|
|
|
18
18
|
from ..core import ENTITY_TYPE, OutputType
|
|
19
|
-
from ..core.operator import
|
|
20
|
-
Fuse,
|
|
21
|
-
FuseChunkMixin,
|
|
22
|
-
Operator,
|
|
23
|
-
ShuffleProxy,
|
|
24
|
-
TileableOperatorMixin,
|
|
25
|
-
)
|
|
19
|
+
from ..core.operator import Operator, ShuffleProxy, TileableOperatorMixin
|
|
26
20
|
from ..tensor.core import TENSOR_TYPE
|
|
27
21
|
from ..tensor.datasource import tensor as astensor
|
|
28
22
|
from .core import DATAFRAME_TYPE, SERIES_TYPE
|
|
@@ -261,13 +255,3 @@ DataFrameOperator = Operator
|
|
|
261
255
|
class DataFrameShuffleProxy(ShuffleProxy, DataFrameOperatorMixin):
|
|
262
256
|
def __init__(self, sparse=None, output_types=None, **kwargs):
|
|
263
257
|
super().__init__(sparse=sparse, _output_types=output_types, **kwargs)
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
class DataFrameFuseChunkMixin(FuseChunkMixin, DataFrameOperatorMixin):
|
|
267
|
-
__slots__ = ()
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
class DataFrameFuseChunk(Fuse, DataFrameFuseChunkMixin):
|
|
271
|
-
@property
|
|
272
|
-
def output_types(self):
|
|
273
|
-
return self.outputs[-1].chunk.op.output_types
|
|
@@ -552,7 +552,7 @@ class ReductionCompiler:
|
|
|
552
552
|
@enter_mode(build=True)
|
|
553
553
|
def _compile_function(self, func, func_name=None, ndim=1) -> ReductionSteps:
|
|
554
554
|
from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
|
|
555
|
-
from ...tensor.
|
|
555
|
+
from ...tensor.misc import TensorWhere
|
|
556
556
|
from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
|
|
557
557
|
from ..datasource.dataframe import DataFrameDataSource
|
|
558
558
|
from ..datasource.series import SeriesDataSource
|
|
@@ -679,8 +679,8 @@ class ReductionCompiler:
|
|
|
679
679
|
]
|
|
680
680
|
"""
|
|
681
681
|
from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
|
|
682
|
-
from ...tensor.base import TensorWhere
|
|
683
682
|
from ...tensor.datasource import Scalar
|
|
683
|
+
from ...tensor.misc import TensorWhere
|
|
684
684
|
from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
|
|
685
685
|
from ..datasource.dataframe import DataFrameDataSource
|
|
686
686
|
from ..datasource.series import SeriesDataSource
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from .core import (
|
|
16
|
+
AbstractObjectIOHandler,
|
|
17
|
+
get_object_io_handler,
|
|
18
|
+
register_object_io_handler,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# isort: off
|
|
22
|
+
from . import tensor
|
|
23
|
+
|
|
24
|
+
del tensor
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from abc import ABCMeta, abstractmethod
|
|
16
|
+
from typing import Any, Dict, Type, Union
|
|
17
|
+
|
|
18
|
+
import msgpack
|
|
19
|
+
|
|
20
|
+
from ...core import Entity, EntityData
|
|
21
|
+
from ...core.entity import ObjectData, TileableData
|
|
22
|
+
from ...lib import wrapped_pickle as pickle
|
|
23
|
+
from ...typing_ import SlicesType, TileableType
|
|
24
|
+
from ...utils import TypeDispatcher
|
|
25
|
+
from ..odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter
|
|
26
|
+
|
|
27
|
+
# alias for the metadata mapping handled by the IO handlers
_MetaType = Dict[str, Any]

# name of the file holding the msgpack-encoded metadata
_META_FILE_NAME = ".meta"
# same value as the metadata key under which the names of pickled entries
# are stored by _prepare_meta_for_serial
_META_PICKLED_KEYS_KEY = ".pickled_keys"


# dispatcher filled by register_object_io_handler and queried by
# get_object_io_handler
_io_handler_dispatcher = TypeDispatcher()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def register_object_io_handler(tileable_data_type: Type[TileableData]):
    """
    Return a class decorator that registers the decorated class as the IO
    handler for ``tileable_data_type`` and hands the class back unchanged.
    """

    def _register(handler_cls):
        _io_handler_dispatcher.register(tileable_data_type, handler_cls)
        return handler_cls

    return _register
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_object_io_handler(
    tileable_data_type: Union[Entity, EntityData, Type[EntityData]]
) -> Type["AbstractObjectIOHandler"]:
    """
    Look up the IO handler registered for a data type.

    Accepts a type, an instance, or an ``Entity`` (in which case the type of
    its ``data`` attribute is used).
    """
    target = tileable_data_type
    if isinstance(target, type):
        return _io_handler_dispatcher.get_handler(target)
    if isinstance(target, Entity):
        # entities are resolved through the data object they hold
        target = target.data
    return _io_handler_dispatcher.get_handler(type(target))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class AbstractObjectIOHandler(metaclass=ABCMeta):
    """
    Base class of handlers reading and writing objects through ODPS volume
    readers / writers.

    Metadata is stored as a msgpack document in the ``.meta`` file; how the
    object body is stored is left to subclasses via ``_read_object_body``
    and ``_write_object_body``.
    """

    def _prepare_meta_for_serial(
        self, tileable: TileableType, meta: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Return a copy of ``meta`` ready to be packed.

        Values that are not str / bytes / int / float / bool are replaced by
        their pickled form, and the names of those entries are recorded
        under ``_META_PICKLED_KEYS_KEY``. ``meta`` itself is not modified.
        """
        to_pack = meta.copy()
        pickled_keys = []
        for k, v in meta.items():
            if not isinstance(v, (str, bytes, int, float, bool)):
                to_pack[k] = pickle.dumps(v)
                pickled_keys.append(k)
        # use the module constant instead of repeating the literal key
        to_pack[_META_PICKLED_KEYS_KEY] = pickled_keys
        return to_pack

    def _prepare_meta_for_deserial(
        self, tileable: TileableType, meta: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Undo ``_prepare_meta_for_serial``.

        Removes the pickled-keys entry from ``meta`` and unpickles the
        listed values in place; the same dict is returned.
        """
        pickled_keys = meta.pop(_META_PICKLED_KEYS_KEY, None) or []
        for k in pickled_keys:
            # NOTE(review): this unpickles data read from the volume, which
            # is only safe when the volume content is trusted
            meta[k] = pickle.loads(meta[k])
        return meta

    def read_object_meta(
        self, reader: ODPSVolumeReader, tileable: TileableType
    ) -> Dict[str, Any]:
        """Read the ``.meta`` file via ``reader`` and return the decoded metadata."""
        meta_obj = msgpack.loads(reader.read_file(_META_FILE_NAME))
        return self._prepare_meta_for_deserial(tileable, meta_obj)

    @abstractmethod
    def _read_object_body(
        self,
        reader: ODPSVolumeReader,
        tileable: TileableType,
        meta: Dict[str, Any],
        slices: SlicesType = None,
    ) -> Any:
        """Read and return the object body; implemented by subclasses."""
        raise NotImplementedError

    def read_object(
        self,
        reader: ODPSVolumeReader,
        tileable: TileableType,
        slices: SlicesType = None,
    ) -> Any:
        """Read the metadata first, then return the object body read with it."""
        meta = self.read_object_meta(reader, tileable)
        return self._read_object_body(reader, tileable, meta, slices)

    @abstractmethod
    def _write_object_body(
        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
    ):
        """Write ``value`` as the object body; implemented by subclasses."""
        raise NotImplementedError

    def write_object_meta(
        self,
        writer: ODPSVolumeWriter,
        tileable: TileableType,
        extra_meta: Dict[str, Any] = None,
    ):
        """
        Write the ``.meta`` file.

        The metadata is a copy of ``tileable.params`` updated with
        ``extra_meta`` (when given), prepared for packing and encoded with
        msgpack.
        """
        meta_obj = tileable.params.copy()
        if extra_meta:
            meta_obj.update(extra_meta)
        meta_obj = self._prepare_meta_for_serial(tileable, meta_obj)
        packed = msgpack.dumps(meta_obj)
        writer.write_file(_META_FILE_NAME, packed)

    def write_object(
        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
    ):
        """Write the metadata of ``tileable`` and then the body holding ``value``."""
        self.write_object_meta(writer, tileable)
        self._write_object_body(writer, tileable, value)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@register_object_io_handler(ObjectData)
class ObjectIOHandler(AbstractObjectIOHandler):
    """IO handler registered for ``ObjectData``: the body is one pickled ``data`` file."""

    def _read_object_body(
        self,
        reader: ODPSVolumeReader,
        tileable: TileableType,
        meta: Dict[str, Any],
        slices: SlicesType = None,
    ) -> Any:
        # meta and slices are not used when reading the pickled body
        raw_body = reader.read_file("data")
        return pickle.loads(raw_body)

    def _write_object_body(
        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
    ):
        payload = pickle.dumps(value)
        writer.write_file("data", payload)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import struct
|
|
16
|
+
from io import BytesIO
|
|
17
|
+
from typing import Any, Dict
|
|
18
|
+
|
|
19
|
+
import msgpack
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
from ...lib import wrapped_pickle as pickle
|
|
23
|
+
from ...tensor.core import TensorData
|
|
24
|
+
from ...typing_ import SlicesType, TileableType
|
|
25
|
+
from ..odpsio import ODPSVolumeReader, ODPSVolumeWriter
|
|
26
|
+
from .core import AbstractObjectIOHandler, register_object_io_handler
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@register_object_io_handler(TensorData)
class TensorIOHandler(AbstractObjectIOHandler):
    """
    IO handler registered for ``TensorData``.

    The whole value is stored in a single ``0,0.dat`` file laid out as:
    a 4-byte little-endian unsigned header length, a msgpack-encoded list
    ``[len(pickled), len(buffer 0), len(buffer 1), ...]``, the pickled
    bytes, and finally the out-of-band buffers in order.
    """

    def write_object_meta(
        self,
        writer: ODPSVolumeWriter,
        tileable: TileableType,
        extra_meta: Dict[str, Any] = None,
    ):
        """Write metadata with ``nsplits`` overridden to ``((nan,),)``."""
        # fixme upload in real slices when tensors are supported in DPE
        # work on a copy so the dict passed in by the caller is left untouched
        extra_meta = dict(extra_meta) if extra_meta else {}
        extra_meta["nsplits"] = ((np.nan,),)

        super().write_object_meta(writer, tileable, extra_meta=extra_meta)

    def _read_object_body(
        self,
        reader: ODPSVolumeReader,
        tileable: TileableType,
        meta: Dict[str, Any],
        slices: SlicesType = None,
    ) -> Any:
        """Read ``0,0.dat`` and rebuild the value from pickle data and buffers."""
        # fixme read data with slices when tensors are supported in DPE
        body = reader.read_file("0,0.dat")
        bio = BytesIO(body)
        (header_len,) = struct.unpack("<I", bio.read(4))
        header_data = msgpack.loads(bio.read(header_len))

        # first header entry is the pickle size, the rest are buffer sizes
        pickled = bio.read(header_data[0])
        bufs = [bio.read(size) for size in header_data[1:]]
        return pickle.loads(pickled, buffers=bufs)

    def _write_object_body(
        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
    ):
        """Write ``value`` into ``0,0.dat`` as header, pickle data and buffers."""
        # fixme upload in real slices when tensors are supported in DPE
        def data_gen():
            bufs = []
            # out-of-band buffers are collected through the callback
            pickled = pickle.dumps(value, buffer_callback=bufs.append)
            header_data = msgpack.dumps(
                [len(pickled)] + [len(buf.raw()) for buf in bufs]
            )
            yield struct.pack("<I", len(header_data))
            yield header_data
            yield pickled
            for buf in bufs:
                yield buf

        writer.write_file("0,0.dat", data_gen())
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pytest
|
|
16
|
+
from odps import ODPS
|
|
17
|
+
|
|
18
|
+
from ....core import OutputType
|
|
19
|
+
from ....core.operator import ObjectOperatorMixin, Operator
|
|
20
|
+
from ....tensor.datasource import ArrayDataSource
|
|
21
|
+
from ....tests.utils import tn
|
|
22
|
+
from ...odpsio import ODPSVolumeReader, ODPSVolumeWriter
|
|
23
|
+
from ..core import get_object_io_handler
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TestObjectOp(Operator, ObjectOperatorMixin):
    """Operator used by the tests below; calling it yields an object-typed tileable."""

    def __call__(self):
        self._output_types = [OutputType.object]
        created = self.new_tileable([])
        return created
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.fixture(scope="module")
def create_volume(request, oss_config):
    """
    Module-scoped fixture yielding the name of an external ODPS volume.

    The volume points at an OSS directory built from ``oss_config``. Any
    volume with the same name left behind is removed first, and the volume
    is removed again on teardown. The test is skipped when ``oss_config``
    is None.
    """
    test_vol_name = tn("test_object_io_volume")
    odps_entry = ODPS.from_environments()

    try:
        odps_entry.delete_volume(test_vol_name, auto_remove_dir=True, recursive=True)
    except Exception:
        # best-effort cleanup: the volume may simply not exist yet
        pass

    oss_test_dir_name = tn("test_oss_directory")
    if oss_config is None:
        pytest.skip("Need oss and its config to run this test")
    (
        oss_access_id,
        oss_secret_access_key,
        oss_bucket_name,
        oss_endpoint,
    ) = oss_config.oss_config
    test_location = "oss://%s:%s@%s/%s/%s" % (
        oss_access_id,
        oss_secret_access_key,
        oss_endpoint,
        oss_bucket_name,
        oss_test_dir_name,
    )
    # create the directory object before pointing the volume at it
    oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
    odps_entry.create_external_volume(test_vol_name, location=test_location)

    try:
        yield test_vol_name
    finally:
        try:
            odps_entry.delete_volume(
                test_vol_name, auto_remove_dir=True, recursive=True
            )
        except Exception:
            # best-effort teardown: do not fail tests on cleanup errors
            pass
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_simple_object_io(create_volume):
    """Round-trip a plain string through the handler resolved for an object tileable."""
    tileable = TestObjectOp()()
    expected = "abcdefg"

    odps_entry = ODPS.from_environments()

    # reader and writer both address the tileable's key inside the volume
    reader = ODPSVolumeReader(odps_entry, create_volume, tileable.key)
    writer = ODPSVolumeWriter(odps_entry, create_volume, tileable.key)

    handler_cls = get_object_io_handler(tileable)
    handler = handler_cls()
    handler.write_object(writer, tileable, expected)
    loaded = handler.read_object(reader, tileable)
    assert expected == loaded
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_tensor_object_io(create_volume):
    """Round-trip a small integer array through the handler resolved for a tensor."""
    expected = np.array([[4, 9, 2], [3, 5, 7], [8, 1, 6]])
    tensor_source = ArrayDataSource(expected, dtype=expected.dtype)
    tileable = tensor_source(expected.shape)

    odps_entry = ODPS.from_environments()

    # reader and writer both address the tileable's key inside the volume
    reader = ODPSVolumeReader(odps_entry, create_volume, tileable.key)
    writer = ODPSVolumeWriter(odps_entry, create_volume, tileable.key)

    handler_cls = get_object_io_handler(tileable)
    handler = handler_cls()
    handler.write_object(writer, tileable, expected)
    loaded = handler.read_object(reader, tileable)
    np.testing.assert_equal(expected, loaded)
|
|
@@ -14,8 +14,10 @@
|
|
|
14
14
|
|
|
15
15
|
from .arrow import arrow_to_pandas, pandas_to_arrow
|
|
16
16
|
from .schema import (
|
|
17
|
+
arrow_schema_to_odps_schema,
|
|
17
18
|
build_dataframe_table_meta,
|
|
18
19
|
odps_schema_to_pandas_dtypes,
|
|
19
20
|
pandas_to_odps_schema,
|
|
20
21
|
)
|
|
21
22
|
from .tableio import HaloTableIO, ODPSTableIO
|
|
23
|
+
from .volumeio import ODPSVolumeReader, ODPSVolumeWriter
|
|
@@ -17,10 +17,10 @@ from typing import Any, Tuple, Union
|
|
|
17
17
|
import pandas as pd
|
|
18
18
|
import pyarrow as pa
|
|
19
19
|
|
|
20
|
-
from
|
|
21
|
-
from
|
|
22
|
-
from
|
|
23
|
-
from
|
|
20
|
+
from ...core import OutputType
|
|
21
|
+
from ...protocol import DataFrameTableMeta
|
|
22
|
+
from ...tensor.core import TENSOR_TYPE
|
|
23
|
+
from ...typing_ import ArrowTableType, PandasObjectTypes
|
|
24
24
|
from .schema import build_dataframe_table_meta
|
|
25
25
|
|
|
26
26
|
|