maxframe-1.0.0rc2-cp310-cp310-win_amd64.whl → maxframe-1.0.0rc3-cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (106)
  1. maxframe/_utils.cp310-win_amd64.pyd +0 -0
  2. maxframe/codegen.py +3 -2
  3. maxframe/config/config.py +16 -9
  4. maxframe/config/validators.py +42 -12
  5. maxframe/conftest.py +13 -2
  6. maxframe/core/__init__.py +2 -13
  7. maxframe/core/entity/__init__.py +0 -4
  8. maxframe/core/entity/objects.py +45 -2
  9. maxframe/core/entity/output_types.py +0 -3
  10. maxframe/core/entity/tests/test_objects.py +43 -0
  11. maxframe/core/entity/tileables.py +5 -78
  12. maxframe/core/graph/__init__.py +2 -2
  13. maxframe/core/graph/builder/__init__.py +0 -1
  14. maxframe/core/graph/builder/base.py +5 -4
  15. maxframe/core/graph/builder/tileable.py +4 -4
  16. maxframe/core/graph/builder/utils.py +4 -8
  17. maxframe/core/graph/core.cp310-win_amd64.pyd +0 -0
  18. maxframe/core/graph/entity.py +9 -33
  19. maxframe/core/operator/__init__.py +2 -9
  20. maxframe/core/operator/base.py +3 -5
  21. maxframe/core/operator/objects.py +0 -9
  22. maxframe/core/operator/utils.py +55 -0
  23. maxframe/dataframe/datasource/read_odps_query.py +1 -1
  24. maxframe/dataframe/datasource/read_odps_table.py +1 -1
  25. maxframe/dataframe/datastore/to_odps.py +1 -1
  26. maxframe/dataframe/operators.py +1 -17
  27. maxframe/dataframe/reduction/core.py +2 -2
  28. maxframe/io/objects/__init__.py +24 -0
  29. maxframe/io/objects/core.py +140 -0
  30. maxframe/io/objects/tensor.py +76 -0
  31. maxframe/io/objects/tests/__init__.py +13 -0
  32. maxframe/io/objects/tests/test_object_io.py +97 -0
  33. maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
  34. maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
  35. maxframe/{odpsio → io/odpsio}/schema.py +5 -5
  36. maxframe/{odpsio → io/odpsio}/tableio.py +10 -4
  37. maxframe/io/odpsio/tests/__init__.py +13 -0
  38. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -3
  39. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
  40. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  41. maxframe/io/odpsio/volumeio.py +57 -0
  42. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  43. maxframe/learn/contrib/xgboost/core.py +87 -2
  44. maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
  45. maxframe/learn/contrib/xgboost/predict.py +19 -5
  46. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  47. maxframe/learn/contrib/xgboost/train.py +25 -15
  48. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  49. maxframe/lib/mmh3.cp310-win_amd64.pyd +0 -0
  50. maxframe/protocol.py +1 -15
  51. maxframe/remote/core.py +4 -8
  52. maxframe/serialization/__init__.py +1 -0
  53. maxframe/serialization/core.cp310-win_amd64.pyd +0 -0
  54. maxframe/tensor/__init__.py +10 -2
  55. maxframe/tensor/arithmetic/isclose.py +1 -0
  56. maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
  57. maxframe/tensor/core.py +5 -136
  58. maxframe/tensor/datasource/array.py +3 -0
  59. maxframe/tensor/datasource/full.py +1 -1
  60. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  61. maxframe/tensor/indexing/flatnonzero.py +1 -1
  62. maxframe/tensor/merge/__init__.py +2 -0
  63. maxframe/tensor/merge/concatenate.py +98 -0
  64. maxframe/tensor/merge/tests/test_merge.py +30 -1
  65. maxframe/tensor/merge/vstack.py +70 -0
  66. maxframe/tensor/{base → misc}/__init__.py +2 -0
  67. maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
  68. maxframe/tensor/misc/atleast_2d.py +70 -0
  69. maxframe/tensor/misc/atleast_3d.py +85 -0
  70. maxframe/tensor/misc/tests/__init__.py +13 -0
  71. maxframe/tensor/{base → misc}/transpose.py +22 -18
  72. maxframe/tensor/operators.py +1 -7
  73. maxframe/tensor/random/core.py +1 -1
  74. maxframe/tensor/reduction/count_nonzero.py +1 -0
  75. maxframe/tensor/reduction/mean.py +1 -0
  76. maxframe/tensor/reduction/nanmean.py +1 -0
  77. maxframe/tensor/reduction/nanvar.py +2 -0
  78. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  79. maxframe/tensor/reduction/var.py +2 -0
  80. maxframe/tensor/utils.py +2 -22
  81. maxframe/typing_.py +4 -1
  82. maxframe/udf.py +8 -9
  83. maxframe/utils.py +15 -61
  84. maxframe-1.0.0rc3.dist-info/METADATA +104 -0
  85. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/RECORD +101 -91
  86. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/WHEEL +1 -1
  87. maxframe_client/fetcher.py +23 -42
  88. maxframe_client/session/graph.py +8 -2
  89. maxframe_client/session/odps.py +54 -18
  90. maxframe_client/tests/test_fetcher.py +1 -1
  91. maxframe_client/tests/test_session.py +14 -2
  92. maxframe/core/entity/chunks.py +0 -68
  93. maxframe/core/entity/fuse.py +0 -73
  94. maxframe/core/graph/builder/chunk.py +0 -430
  95. maxframe/odpsio/volumeio.py +0 -95
  96. maxframe-1.0.0rc2.dist-info/METADATA +0 -177
  97. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  98. /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
  99. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  100. /maxframe/tensor/{base → misc}/astype.py +0 -0
  101. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  102. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  103. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  104. /maxframe/tensor/{base → misc}/unique.py +0 -0
  105. /maxframe/tensor/{base → misc}/where.py +0 -0
  106. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc3.dist-info}/top_level.txt +0 -0
maxframe/core/graph/entity.py

@@ -13,9 +13,9 @@
 # limitations under the License.
 
 from abc import ABCMeta, abstractmethod
-from typing import Dict, Iterable, List, Union
+from typing import Dict, Iterable, List
 
-from ...core import Chunk, Tileable
+from ...core import Tileable
 from ...serialization.core import buffered
 from ...serialization.serializables import BoolField, DictField, ListField, Serializable
 from ...serialization.serializables.core import SerializableSerializer
@@ -97,26 +97,6 @@ class TileableGraph(EntityGraph, Iterable[Tileable]):
         return self._logic_key
 
 
-class ChunkGraph(EntityGraph, Iterable[Chunk]):
-    _result_chunks: List[Chunk]
-
-    def __init__(self, result_chunks: List[Chunk] = None):
-        super().__init__()
-        self._result_chunks = result_chunks
-
-    @property
-    def result_chunks(self):
-        return self._result_chunks
-
-    @property
-    def results(self):
-        return self._result_chunks
-
-    @results.setter
-    def results(self, new_results):
-        self._result_chunks = new_results
-
-
 class SerializableGraph(Serializable):
     _is_chunk = BoolField("is_chunk")
     # TODO(qinxuye): remove this logic when we handle fetch elegantly,
@@ -132,12 +112,11 @@ class SerializableGraph(Serializable):
     _results = ListField("results")
 
     @classmethod
-    def from_graph(cls, graph: Union[TileableGraph, ChunkGraph]) -> "SerializableGraph":
+    def from_graph(cls, graph: EntityGraph) -> "SerializableGraph":
         from ..operator import Fetch
 
-        is_chunk = isinstance(graph, ChunkGraph)
         return SerializableGraph(
-            _is_chunk=is_chunk,
+            _is_chunk=False,
            _fetch_nodes=[chunk for chunk in graph if isinstance(chunk.op, Fetch)],
            _nodes=graph._nodes,
            _predecessors=graph._predecessors,
@@ -145,9 +124,8 @@ class SerializableGraph(Serializable):
             _results=graph.results,
         )
 
-    def to_graph(self) -> Union[TileableGraph, ChunkGraph]:
-        graph_cls = ChunkGraph if self._is_chunk else TileableGraph
-        graph = graph_cls(self._results)
+    def to_graph(self) -> EntityGraph:
+        graph = TileableGraph(self._results)
         graph._nodes.update(self._nodes)
         graph._predecessors.update(self._predecessors)
         graph._successors.update(self._successors)
@@ -156,14 +134,12 @@
 
 class GraphSerializer(SerializableSerializer):
     @buffered
-    def serial(self, obj: Union[TileableGraph, ChunkGraph], context: Dict):
+    def serial(self, obj: EntityGraph, context: Dict):
         serializable_graph = SerializableGraph.from_graph(obj)
         return [], [serializable_graph], False
 
-    def deserial(
-        self, serialized: List, context: Dict, subs: List
-    ) -> Union[TileableGraph, ChunkGraph]:
-        serializable_graph: SerializableGraph = subs[0]
+    def deserial(self, serialized: List, context: Dict, subs: List) -> TileableGraph:
+        serializable_graph: EntityGraph = subs[0]
         return serializable_graph.to_graph()
 
 
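The net effect of these hunks: ChunkGraph is gone and graph (de)serialization always round-trips through TileableGraph. A minimal sketch of the new contract; the import paths and the TileableGraph constructor (taking a result list, as ChunkGraph's did) are inferred from this diff, not confirmed:

from maxframe.core.graph import TileableGraph
from maxframe.core.graph.entity import SerializableGraph

# Real graphs are produced by the graph builder during session execution;
# an empty graph is enough to show the round trip.
graph = TileableGraph([])

# from_graph() now always emits _is_chunk=False ...
serializable = SerializableGraph.from_graph(graph)
# ... and to_graph() always reconstructs a TileableGraph.
assert isinstance(serializable.to_graph(), TileableGraph)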
maxframe/core/operator/__init__.py

@@ -22,13 +22,6 @@ from .base import (
 )
 from .core import TileableOperatorMixin, estimate_size, execute
 from .fetch import Fetch, FetchMixin, FetchShuffle, ShuffleFetchType
-from .fuse import Fuse, FuseChunkMixin
-from .objects import (
-    MergeDictOperator,
-    ObjectFetch,
-    ObjectFuseChunk,
-    ObjectFuseChunkMixin,
-    ObjectOperator,
-    ObjectOperatorMixin,
-)
+from .objects import MergeDictOperator, ObjectFetch, ObjectOperator, ObjectOperatorMixin
 from .shuffle import MapReduceOperator, ShuffleProxy
+from .utils import add_fetch_builder, build_fetch
maxframe/core/operator/base.py

@@ -12,11 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import functools
 import weakref
 from copy import deepcopy
 from enum import Enum
-from functools import partial
+from functools import lru_cache, partial
 from typing import Any, Dict, List, Optional, Tuple, Type, Union
 
 from ...serialization.core import Placeholder
@@ -37,7 +36,6 @@ from ...serialization.serializables.core import SerializableSerializer
 from ...typing_ import OperatorType
 from ...utils import AttributeDict, classproperty, get_user_call_point, tokenize
 from ..base import Base
-from ..entity.chunks import Chunk
 from ..entity.core import ENTITY_TYPE, Entity, EntityData
 from ..entity.output_types import OutputType
 from ..entity.tileables import Tileable
@@ -90,7 +88,7 @@ class SchedulingHint(Serializable):
     priority = Int32Field("priority", default=None)
 
     @classproperty
-    @functools.lru_cache(1)
+    @lru_cache(1)
     def all_hint_names(cls):
         return list(cls._FIELDS)
 
@@ -341,7 +339,7 @@ class Operator(Base, OperatorLogicKeyGeneratorMixin, metaclass=OperatorMetaclass
             raise ValueError("Outputs' size exceeds limitation")
 
     @property
-    def outputs(self) -> List[Union[Chunk, Tileable]]:
+    def outputs(self) -> List[Tileable]:
         outputs = self._outputs
         if outputs:
             return [ref() for ref in outputs]
maxframe/core/operator/objects.py

@@ -17,7 +17,6 @@ from ..entity import OutputType, register_fetch_class
 from .base import Operator
 from .core import TileableOperatorMixin
 from .fetch import Fetch, FetchMixin
-from .fuse import Fuse, FuseChunkMixin
 
 
 class ObjectOperator(Operator):
@@ -28,14 +27,6 @@ class ObjectOperatorMixin(TileableOperatorMixin):
     _output_type_ = OutputType.object
 
 
-class ObjectFuseChunkMixin(FuseChunkMixin, ObjectOperatorMixin):
-    __slots__ = ()
-
-
-class ObjectFuseChunk(ObjectFuseChunkMixin, Fuse):
-    pass
-
-
 class ObjectFetch(FetchMixin, ObjectOperatorMixin, Fetch):
     _output_type_ = OutputType.object
 
maxframe/core/operator/utils.py (new file)

@@ -0,0 +1,55 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...typing_ import EntityType, TileableType
+from ..entity import TILEABLE_TYPE
+
+
+def build_fetch_tileable(tileable: TileableType) -> TileableType:
+    if tileable.is_coarse():
+        chunks = None
+    else:
+        chunks = []
+        for c in tileable.chunks:
+            fetch_chunk = build_fetch(c, index=c.index)
+            chunks.append(fetch_chunk)
+
+    tileable_op = tileable.op
+    params = tileable.params.copy()
+
+    new_op = tileable_op.get_fetch_op_cls(tileable)(_id=tileable_op.id)
+    return new_op.new_tileables(
+        None,
+        chunks=chunks,
+        nsplits=tileable.nsplits,
+        _key=tileable.key,
+        _id=tileable.id,
+        **params,
+    )[0]
+
+
+_type_to_builder = [
+    (TILEABLE_TYPE, build_fetch_tileable),
+]
+
+
+def build_fetch(entity: EntityType, **kw) -> EntityType:
+    for entity_types, func in _type_to_builder:
+        if isinstance(entity, entity_types):
+            return func(entity, **kw)
+    raise TypeError(f"Type {type(entity)} not supported")
+
+
+def add_fetch_builder(entity_type, builder_func):
+    _type_to_builder.append((entity_type, builder_func))
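With the fuse machinery removed, fetch construction now goes through this small extensible registry: build_fetch() dispatches on entity type, and downstream packages can hook in their own types via add_fetch_builder(). A hedged sketch of the extension hook; MyEntityData and its builder are hypothetical, only the two registry functions come from this diff:

from maxframe.core.operator import add_fetch_builder, build_fetch


class MyEntityData:
    """Hypothetical type, standing in for a real EntityData subclass."""


def build_fetch_my_entity(entity, **kw):
    # A real builder mirrors build_fetch_tileable() above: create the
    # matching Fetch operator and rebuild the entity from its params.
    raise NotImplementedError


# Registered builders are consulted by build_fetch() in order; the first
# isinstance() match wins, and unknown types raise TypeError.
add_fetch_builder(MyEntityData, build_fetch_my_entity)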
maxframe/dataframe/datasource/read_odps_query.py

@@ -24,7 +24,7 @@ from odps.types import Column, OdpsSchema, validate_data_type
 from ... import opcodes
 from ...core import OutputType
 from ...core.graph import DAG
-from ...odpsio import odps_schema_to_pandas_dtypes
+from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
     AnyField,
     BoolField,
maxframe/dataframe/datasource/read_odps_table.py

@@ -23,7 +23,7 @@ from odps.utils import to_timestamp
 
 from ... import opcodes
 from ...core import OutputType
-from ...odpsio import odps_schema_to_pandas_dtypes
+from ...io.odpsio import odps_schema_to_pandas_dtypes
 from ...serialization.serializables import (
     AnyField,
     BoolField,
maxframe/dataframe/datastore/to_odps.py

@@ -23,7 +23,7 @@ from odps.types import PartitionSpec
 from ... import opcodes
 from ...config import options
 from ...core import OutputType
-from ...odpsio import build_dataframe_table_meta
+from ...io.odpsio import build_dataframe_table_meta
 from ...serialization.serializables import (
     BoolField,
     FieldTypes,
maxframe/dataframe/operators.py

@@ -16,13 +16,7 @@ import numpy as np
 import pandas as pd
 
 from ..core import ENTITY_TYPE, OutputType
-from ..core.operator import (
-    Fuse,
-    FuseChunkMixin,
-    Operator,
-    ShuffleProxy,
-    TileableOperatorMixin,
-)
+from ..core.operator import Operator, ShuffleProxy, TileableOperatorMixin
 from ..tensor.core import TENSOR_TYPE
 from ..tensor.datasource import tensor as astensor
 from .core import DATAFRAME_TYPE, SERIES_TYPE
@@ -261,13 +255,3 @@ DataFrameOperator = Operator
 class DataFrameShuffleProxy(ShuffleProxy, DataFrameOperatorMixin):
     def __init__(self, sparse=None, output_types=None, **kwargs):
         super().__init__(sparse=sparse, _output_types=output_types, **kwargs)
-
-
-class DataFrameFuseChunkMixin(FuseChunkMixin, DataFrameOperatorMixin):
-    __slots__ = ()
-
-
-class DataFrameFuseChunk(Fuse, DataFrameFuseChunkMixin):
-    @property
-    def output_types(self):
-        return self.outputs[-1].chunk.op.output_types
maxframe/dataframe/reduction/core.py

@@ -552,7 +552,7 @@ class ReductionCompiler:
     @enter_mode(build=True)
     def _compile_function(self, func, func_name=None, ndim=1) -> ReductionSteps:
         from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
-        from ...tensor.base import TensorWhere
+        from ...tensor.misc import TensorWhere
         from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
         from ..datasource.dataframe import DataFrameDataSource
         from ..datasource.series import SeriesDataSource
@@ -679,8 +679,8 @@ class ReductionCompiler:
         ]
         """
         from ...tensor.arithmetic.core import TensorBinOp, TensorUnaryOp
-        from ...tensor.base import TensorWhere
         from ...tensor.datasource import Scalar
+        from ...tensor.misc import TensorWhere
         from ..arithmetic.core import DataFrameBinOp, DataFrameUnaryOp
         from ..datasource.dataframe import DataFrameDataSource
         from ..datasource.series import SeriesDataSource
maxframe/io/objects/__init__.py (new file)

@@ -0,0 +1,24 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import (
+    AbstractObjectIOHandler,
+    get_object_io_handler,
+    register_object_io_handler,
+)
+
+# isort: off
+from . import tensor
+
+del tensor
maxframe/io/objects/core.py (new file)

@@ -0,0 +1,140 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABCMeta, abstractmethod
+from typing import Any, Dict, Type, Union
+
+import msgpack
+
+from ...core import Entity, EntityData
+from ...core.entity import ObjectData, TileableData
+from ...lib import wrapped_pickle as pickle
+from ...typing_ import SlicesType, TileableType
+from ...utils import TypeDispatcher
+from ..odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter
+
+_MetaType = Dict[str, Any]
+
+_META_FILE_NAME = ".meta"
+_META_PICKLED_KEYS_KEY = ".pickled_keys"
+
+
+_io_handler_dispatcher = TypeDispatcher()
+
+
+def register_object_io_handler(tileable_data_type: Type[TileableData]):
+    def wrapper(handler_cls):
+        _io_handler_dispatcher.register(tileable_data_type, handler_cls)
+        return handler_cls
+
+    return wrapper
+
+
+def get_object_io_handler(
+    tileable_data_type: Union[Entity, EntityData, Type[EntityData]]
+) -> Type["AbstractObjectIOHandler"]:
+    if not isinstance(tileable_data_type, type):
+        if isinstance(tileable_data_type, Entity):
+            tileable_data_type = tileable_data_type.data
+        tileable_data_type = type(tileable_data_type)
+    return _io_handler_dispatcher.get_handler(tileable_data_type)
+
+
+class AbstractObjectIOHandler(metaclass=ABCMeta):
+    def _prepare_meta_for_serial(
+        self, tileable: TileableType, meta: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        to_pack = meta.copy()
+        pickled_keys = []
+        for k, v in meta.items():
+            if not isinstance(v, (str, bytes, int, float, bool)):
+                to_pack[k] = pickle.dumps(v)
+                pickled_keys.append(k)
+        to_pack[".pickled_keys"] = pickled_keys
+        return to_pack
+
+    def _prepare_meta_for_deserial(
+        self, tileable: TileableType, meta: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        pickled_keys = meta.pop(".pickled_keys", None) or []
+        for k in pickled_keys:
+            meta[k] = pickle.loads(meta[k])
+        return meta
+
+    def read_object_meta(
+        self, reader: ODPSVolumeReader, tileable: TileableType
+    ) -> Dict[str, Any]:
+        meta_obj = msgpack.loads(reader.read_file(_META_FILE_NAME))
+        return self._prepare_meta_for_deserial(tileable, meta_obj)
+
+    @abstractmethod
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        raise NotImplementedError
+
+    def read_object(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        slices: SlicesType = None,
+    ) -> Any:
+        meta = self.read_object_meta(reader, tileable)
+        return self._read_object_body(reader, tileable, meta, slices)
+
+    @abstractmethod
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        raise NotImplementedError
+
+    def write_object_meta(
+        self,
+        writer: ODPSVolumeWriter,
+        tileable: TileableType,
+        extra_meta: Dict[str, Any] = None,
+    ):
+        meta_obj = tileable.params.copy()
+        if extra_meta:
+            meta_obj.update(extra_meta)
+        meta_obj = self._prepare_meta_for_serial(tileable, meta_obj)
+        packed = msgpack.dumps(meta_obj)
+        writer.write_file(_META_FILE_NAME, packed)
+
+    def write_object(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        self.write_object_meta(writer, tileable)
+        self._write_object_body(writer, tileable, value)
+
+
+@register_object_io_handler(ObjectData)
+class ObjectIOHandler(AbstractObjectIOHandler):
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        return pickle.loads(reader.read_file("data"))
+
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        writer.write_file("data", pickle.dumps(value))
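This module routes object IO through a TypeDispatcher registry keyed by tileable data type: each object is stored as a msgpack-encoded ".meta" file (non-primitive meta values pickled per key) plus a handler-specific body in an ODPS volume. A sketch of how a custom handler could plug in; the DictData type is hypothetical, while ObjectIOHandler above shows the real built-in pattern:

import msgpack

from maxframe.core.entity import ObjectData
from maxframe.io.objects import (
    AbstractObjectIOHandler,
    get_object_io_handler,
    register_object_io_handler,
)


class DictData(ObjectData):
    """Hypothetical tileable data type, for illustration only."""


@register_object_io_handler(DictData)
class DictIOHandler(AbstractObjectIOHandler):
    # Only the body hooks are abstract; meta handling is inherited.
    def _read_object_body(self, reader, tileable, meta, slices=None):
        return msgpack.loads(reader.read_file("data"))

    def _write_object_body(self, writer, tileable, value):
        writer.write_file("data", msgpack.dumps(value))


# get_object_io_handler() accepts an Entity, an EntityData instance or the
# data type itself, and returns the registered handler class.
handler = get_object_io_handler(DictData)()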
maxframe/io/objects/tensor.py (new file)

@@ -0,0 +1,76 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import struct
+from io import BytesIO
+from typing import Any, Dict
+
+import msgpack
+import numpy as np
+
+from ...lib import wrapped_pickle as pickle
+from ...tensor.core import TensorData
+from ...typing_ import SlicesType, TileableType
+from ..odpsio import ODPSVolumeReader, ODPSVolumeWriter
+from .core import AbstractObjectIOHandler, register_object_io_handler
+
+
+@register_object_io_handler(TensorData)
+class TensorIOHandler(AbstractObjectIOHandler):
+    def write_object_meta(
+        self,
+        writer: ODPSVolumeWriter,
+        tileable: TileableType,
+        extra_meta: Dict[str, Any] = None,
+    ):
+        # fixme upload in real slices when tensors are supported in DPE
+        extra_meta = extra_meta or dict()
+        extra_meta["nsplits"] = ((np.nan,),)
+
+        super().write_object_meta(writer, tileable, extra_meta=extra_meta)
+
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        # fixme read data with slices when tensors are supported in DPE
+        body = reader.read_file("0,0.dat")
+        bio = BytesIO(body)
+        (header_len,) = struct.unpack("<I", bio.read(4))
+        header_data = msgpack.loads(bio.read(header_len))
+
+        pickled = bio.read(header_data[0])
+        bufs = [bio.read(size) for size in header_data[1:]]
+        return pickle.loads(pickled, buffers=bufs)
+
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        # fixme upload in real slices when tensors are supported in DPE
+        def data_gen():
+            bufs = []
+            pickled = pickle.dumps(value, buffer_callback=bufs.append)
+            header_data = msgpack.dumps(
+                [len(pickled)] + [len(buf.raw()) for buf in bufs]
+            )
+            yield struct.pack("<I", len(header_data))
+            yield header_data
+            yield pickled
+            for buf in bufs:
+                yield buf
+
+        writer.write_file("0,0.dat", data_gen())
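The tensor body above is a simple length-prefixed frame: a 4-byte little-endian header size, a msgpack list holding the pickle length followed by each out-of-band buffer length, then the pickle bytes and the raw buffers. A standalone sketch of the same framing using stock pickle protocol 5 (maxframe itself goes through its wrapped_pickle):

import pickle
import struct
from io import BytesIO

import msgpack
import numpy as np


def pack(value) -> bytes:
    bufs = []
    # buffer_callback collects PickleBuffer objects for out-of-band data
    pickled = pickle.dumps(value, protocol=5, buffer_callback=bufs.append)
    header = msgpack.dumps([len(pickled)] + [len(b.raw()) for b in bufs])
    return b"".join(
        [struct.pack("<I", len(header)), header, pickled]
        + [bytes(b.raw()) for b in bufs]
    )


def unpack(body: bytes):
    bio = BytesIO(body)
    (header_len,) = struct.unpack("<I", bio.read(4))
    sizes = msgpack.loads(bio.read(header_len))
    pickled = bio.read(sizes[0])
    bufs = [bio.read(size) for size in sizes[1:]]
    return pickle.loads(pickled, buffers=bufs)


data = np.arange(12).reshape(3, 4)
np.testing.assert_equal(data, unpack(pack(data)))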
maxframe/io/objects/tests/__init__.py (new file)

@@ -0,0 +1,13 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
maxframe/io/objects/tests/test_object_io.py (new file)

@@ -0,0 +1,97 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pytest
+from odps import ODPS
+
+from ....core import OutputType
+from ....core.operator import ObjectOperatorMixin, Operator
+from ....tensor.datasource import ArrayDataSource
+from ....tests.utils import tn
+from ...odpsio import ODPSVolumeReader, ODPSVolumeWriter
+from ..core import get_object_io_handler
+
+
+class TestObjectOp(Operator, ObjectOperatorMixin):
+    def __call__(self):
+        self._output_types = [OutputType.object]
+        return self.new_tileable([])
+
+
+@pytest.fixture(scope="module")
+def create_volume(request, oss_config):
+    test_vol_name = tn("test_object_io_volume")
+    odps_entry = ODPS.from_environments()
+
+    try:
+        odps_entry.delete_volume(test_vol_name, auto_remove_dir=True, recursive=True)
+    except:
+        pass
+
+    oss_test_dir_name = tn("test_oss_directory")
+    if oss_config is None:
+        pytest.skip("Need oss and its config to run this test")
+    (
+        oss_access_id,
+        oss_secret_access_key,
+        oss_bucket_name,
+        oss_endpoint,
+    ) = oss_config.oss_config
+    test_location = "oss://%s:%s@%s/%s/%s" % (
+        oss_access_id,
+        oss_secret_access_key,
+        oss_endpoint,
+        oss_bucket_name,
+        oss_test_dir_name,
+    )
+    oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
+    odps_entry.create_external_volume(test_vol_name, location=test_location)
+
+    try:
+        yield test_vol_name
+    finally:
+        try:
+            odps_entry.delete_volume(
+                test_vol_name, auto_remove_dir=True, recursive=True
+            )
+        except:
+            pass
+
+
+def test_simple_object_io(create_volume):
+    obj = TestObjectOp()()
+    data = "abcdefg"
+
+    odps_entry = ODPS.from_environments()
+
+    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
+    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+
+    handler = get_object_io_handler(obj)()
+    handler.write_object(writer, obj, data)
+    assert data == handler.read_object(reader, obj)
+
+
+def test_tensor_object_io(create_volume):
+    data = np.array([[4, 9, 2], [3, 5, 7], [8, 1, 6]])
+    obj = ArrayDataSource(data, dtype=data.dtype)(data.shape)
+
+    odps_entry = ODPS.from_environments()
+
+    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
+    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+
+    handler = get_object_io_handler(obj)()
+    handler.write_object(writer, obj, data)
+    np.testing.assert_equal(data, handler.read_object(reader, obj))
maxframe/{odpsio → io/odpsio}/__init__.py

@@ -14,8 +14,10 @@
 
 from .arrow import arrow_to_pandas, pandas_to_arrow
 from .schema import (
+    arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
     odps_schema_to_pandas_dtypes,
     pandas_to_odps_schema,
 )
 from .tableio import HaloTableIO, ODPSTableIO
+from .volumeio import ODPSVolumeReader, ODPSVolumeWriter
maxframe/{odpsio → io/odpsio}/arrow.py

@@ -17,10 +17,10 @@ from typing import Any, Tuple, Union
 import pandas as pd
 import pyarrow as pa
 
-from ..core import OutputType
-from ..protocol import DataFrameTableMeta
-from ..tensor.core import TENSOR_TYPE
-from ..typing_ import ArrowTableType, PandasObjectTypes
+from ...core import OutputType
+from ...protocol import DataFrameTableMeta
+from ...tensor.core import TENSOR_TYPE
+from ...typing_ import ArrowTableType, PandasObjectTypes
 from .schema import build_dataframe_table_meta
 