maxframe 0.1.0b5__cp38-cp38-win32.whl → 1.0.0__cp38-cp38-win32.whl

This diff shows the changes between two publicly released versions of this package as they appear in their respective public registries. The information is provided for informational purposes only.

Potentially problematic release.


This version of maxframe has been flagged as potentially problematic; see the registry's advisory page for details.

Files changed (203)
  1. maxframe/_utils.cp38-win32.pyd +0 -0
  2. maxframe/codegen.py +10 -4
  3. maxframe/config/config.py +68 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +58 -14
  6. maxframe/core/__init__.py +2 -16
  7. maxframe/core/entity/__init__.py +1 -12
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +46 -45
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cp38-win32.pyd +0 -0
  19. maxframe/core/graph/core.pyx +4 -4
  20. maxframe/core/graph/entity.py +9 -33
  21. maxframe/core/operator/__init__.py +2 -9
  22. maxframe/core/operator/base.py +3 -5
  23. maxframe/core/operator/objects.py +0 -9
  24. maxframe/core/operator/utils.py +55 -0
  25. maxframe/dataframe/__init__.py +1 -1
  26. maxframe/dataframe/arithmetic/around.py +5 -17
  27. maxframe/dataframe/arithmetic/core.py +15 -7
  28. maxframe/dataframe/arithmetic/docstring.py +7 -33
  29. maxframe/dataframe/arithmetic/equal.py +4 -2
  30. maxframe/dataframe/arithmetic/greater.py +4 -2
  31. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  32. maxframe/dataframe/arithmetic/less.py +2 -2
  33. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  34. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  36. maxframe/dataframe/core.py +31 -7
  37. maxframe/dataframe/datasource/date_range.py +2 -2
  38. maxframe/dataframe/datasource/read_odps_query.py +117 -23
  39. maxframe/dataframe/datasource/read_odps_table.py +6 -3
  40. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  41. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  42. maxframe/dataframe/datastore/to_odps.py +28 -0
  43. maxframe/dataframe/extensions/__init__.py +5 -0
  44. maxframe/dataframe/extensions/flatjson.py +131 -0
  45. maxframe/dataframe/extensions/flatmap.py +317 -0
  46. maxframe/dataframe/extensions/reshuffle.py +1 -1
  47. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  48. maxframe/dataframe/groupby/core.py +1 -1
  49. maxframe/dataframe/groupby/cum.py +0 -1
  50. maxframe/dataframe/groupby/fill.py +4 -1
  51. maxframe/dataframe/groupby/getitem.py +6 -0
  52. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  53. maxframe/dataframe/groupby/transform.py +5 -1
  54. maxframe/dataframe/indexing/align.py +1 -1
  55. maxframe/dataframe/indexing/loc.py +6 -4
  56. maxframe/dataframe/indexing/rename.py +5 -28
  57. maxframe/dataframe/indexing/sample.py +0 -1
  58. maxframe/dataframe/indexing/set_index.py +68 -1
  59. maxframe/dataframe/initializer.py +11 -1
  60. maxframe/dataframe/merge/__init__.py +9 -1
  61. maxframe/dataframe/merge/concat.py +41 -31
  62. maxframe/dataframe/merge/merge.py +237 -3
  63. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  64. maxframe/dataframe/misc/apply.py +5 -10
  65. maxframe/dataframe/misc/case_when.py +1 -1
  66. maxframe/dataframe/misc/describe.py +2 -2
  67. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  68. maxframe/dataframe/misc/eval.py +4 -0
  69. maxframe/dataframe/misc/memory_usage.py +2 -2
  70. maxframe/dataframe/misc/pct_change.py +1 -83
  71. maxframe/dataframe/misc/tests/test_misc.py +33 -2
  72. maxframe/dataframe/misc/transform.py +1 -30
  73. maxframe/dataframe/misc/value_counts.py +4 -17
  74. maxframe/dataframe/missing/dropna.py +1 -1
  75. maxframe/dataframe/missing/fillna.py +5 -5
  76. maxframe/dataframe/operators.py +1 -17
  77. maxframe/dataframe/reduction/core.py +2 -2
  78. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  79. maxframe/dataframe/sort/sort_values.py +1 -11
  80. maxframe/dataframe/statistics/corr.py +3 -3
  81. maxframe/dataframe/statistics/quantile.py +13 -19
  82. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  83. maxframe/dataframe/tests/test_initializer.py +33 -2
  84. maxframe/dataframe/utils.py +26 -11
  85. maxframe/dataframe/window/expanding.py +5 -3
  86. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  87. maxframe/errors.py +13 -0
  88. maxframe/extension.py +12 -0
  89. maxframe/io/__init__.py +13 -0
  90. maxframe/io/objects/__init__.py +24 -0
  91. maxframe/io/objects/core.py +140 -0
  92. maxframe/io/objects/tensor.py +76 -0
  93. maxframe/io/objects/tests/__init__.py +13 -0
  94. maxframe/io/objects/tests/test_object_io.py +97 -0
  95. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  96. maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
  97. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  98. maxframe/io/odpsio/tableio.py +719 -0
  99. maxframe/io/odpsio/tests/__init__.py +13 -0
  100. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
  101. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  102. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  103. maxframe/io/odpsio/volumeio.py +63 -0
  104. maxframe/learn/contrib/__init__.py +3 -1
  105. maxframe/learn/contrib/graph/__init__.py +15 -0
  106. maxframe/learn/contrib/graph/connected_components.py +215 -0
  107. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  108. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  109. maxframe/learn/contrib/llm/__init__.py +16 -0
  110. maxframe/learn/contrib/llm/core.py +54 -0
  111. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  112. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  113. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  114. maxframe/learn/contrib/llm/text.py +42 -0
  115. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  116. maxframe/learn/contrib/xgboost/core.py +87 -2
  117. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  118. maxframe/learn/contrib/xgboost/predict.py +29 -46
  119. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  120. maxframe/learn/contrib/xgboost/train.py +29 -18
  121. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  122. maxframe/lib/mmh3.cp38-win32.pyd +0 -0
  123. maxframe/lib/mmh3.pyi +43 -0
  124. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  125. maxframe/lib/wrapped_pickle.py +2 -1
  126. maxframe/opcodes.py +8 -0
  127. maxframe/protocol.py +154 -27
  128. maxframe/remote/core.py +4 -8
  129. maxframe/serialization/__init__.py +1 -0
  130. maxframe/serialization/core.cp38-win32.pyd +0 -0
  131. maxframe/serialization/core.pxd +3 -0
  132. maxframe/serialization/core.pyi +3 -0
  133. maxframe/serialization/core.pyx +67 -26
  134. maxframe/serialization/exception.py +1 -1
  135. maxframe/serialization/pandas.py +52 -17
  136. maxframe/serialization/serializables/core.py +180 -15
  137. maxframe/serialization/serializables/field_type.py +4 -1
  138. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  139. maxframe/serialization/tests/test_serial.py +2 -1
  140. maxframe/session.py +9 -2
  141. maxframe/tensor/__init__.py +81 -2
  142. maxframe/tensor/arithmetic/isclose.py +1 -0
  143. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  144. maxframe/tensor/core.py +5 -136
  145. maxframe/tensor/datasource/array.py +3 -0
  146. maxframe/tensor/datasource/full.py +1 -1
  147. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  148. maxframe/tensor/indexing/flatnonzero.py +1 -1
  149. maxframe/tensor/indexing/getitem.py +2 -0
  150. maxframe/tensor/merge/__init__.py +2 -0
  151. maxframe/tensor/merge/concatenate.py +101 -0
  152. maxframe/tensor/merge/tests/test_merge.py +30 -1
  153. maxframe/tensor/merge/vstack.py +74 -0
  154. maxframe/tensor/{base → misc}/__init__.py +2 -0
  155. maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
  156. maxframe/tensor/misc/atleast_2d.py +70 -0
  157. maxframe/tensor/misc/atleast_3d.py +85 -0
  158. maxframe/tensor/misc/tests/__init__.py +13 -0
  159. maxframe/tensor/{base → misc}/transpose.py +22 -18
  160. maxframe/tensor/{base → misc}/unique.py +3 -3
  161. maxframe/tensor/operators.py +1 -7
  162. maxframe/tensor/random/core.py +1 -1
  163. maxframe/tensor/reduction/count_nonzero.py +2 -1
  164. maxframe/tensor/reduction/mean.py +1 -0
  165. maxframe/tensor/reduction/nanmean.py +1 -0
  166. maxframe/tensor/reduction/nanvar.py +2 -0
  167. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  168. maxframe/tensor/reduction/var.py +2 -0
  169. maxframe/tensor/statistics/quantile.py +2 -2
  170. maxframe/tensor/utils.py +2 -22
  171. maxframe/tests/test_protocol.py +34 -0
  172. maxframe/tests/test_utils.py +0 -12
  173. maxframe/tests/utils.py +17 -2
  174. maxframe/typing_.py +4 -1
  175. maxframe/udf.py +8 -9
  176. maxframe/utils.py +106 -86
  177. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  178. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
  179. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  180. maxframe_client/__init__.py +0 -1
  181. maxframe_client/clients/framedriver.py +4 -1
  182. maxframe_client/fetcher.py +81 -74
  183. maxframe_client/session/consts.py +3 -0
  184. maxframe_client/session/graph.py +8 -2
  185. maxframe_client/session/odps.py +194 -40
  186. maxframe_client/session/task.py +94 -39
  187. maxframe_client/tests/test_fetcher.py +21 -3
  188. maxframe_client/tests/test_session.py +109 -8
  189. maxframe/core/entity/chunks.py +0 -68
  190. maxframe/core/entity/fuse.py +0 -73
  191. maxframe/core/graph/builder/chunk.py +0 -430
  192. maxframe/odpsio/tableio.py +0 -322
  193. maxframe/odpsio/volumeio.py +0 -95
  194. maxframe_client/clients/spe.py +0 -104
  195. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  196. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  197. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  198. /maxframe/tensor/{base → misc}/astype.py +0 -0
  199. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  200. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  201. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  202. /maxframe/tensor/{base → misc}/where.py +0 -0
  203. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
@@ -29,8 +29,8 @@ def test_expanding():
29
29
  with pytest.raises(NotImplementedError):
30
30
  _ = df2.expanding(3, axis=1)
31
31
 
32
- r = df2.expanding(3, center=False)
33
- expected = df.expanding(3, center=False)
32
+ r = df2.expanding(3)
33
+ expected = df.expanding(3)
34
34
  assert repr(r) == repr(expected)
35
35
 
36
36
  assert "b" in dir(r)
maxframe/errors.py CHANGED
@@ -17,5 +17,18 @@ class MaxFrameError(Exception):
17
17
  pass
18
18
 
19
19
 
20
+ class MaxFrameIntentionalError(MaxFrameError):
21
+ pass
22
+
23
+
20
24
  class MaxFrameUserError(MaxFrameError):
21
25
  pass
26
+
27
+
28
+ class NoTaskServerResponseError(MaxFrameError):
29
+ pass
30
+
31
+
32
+ class SessionAlreadyClosedError(MaxFrameError):
33
+ def __init__(self, session_id: str):
34
+ super().__init__(f"Session {session_id} is already closed")
maxframe/extension.py CHANGED
@@ -48,6 +48,18 @@ class MaxFrameExtension(metaclass=abc.ABCMeta):
48
48
  """
49
49
  pass
50
50
 
51
+ @classmethod
52
+ async def reload_session(cls, session_id: str) -> None:
53
+ """
54
+ Reload the session state when the session is recovered from failover.
55
+
56
+ Parameters
57
+ ----------
58
+ session_id : str
59
+ The session id.
60
+ """
61
+ pass
62
+
51
63
  @classmethod
52
64
  def init_service_extension(cls) -> None:
53
65
  """
@@ -0,0 +1,13 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
@@ -0,0 +1,24 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .core import (
16
+ AbstractObjectIOHandler,
17
+ get_object_io_handler,
18
+ register_object_io_handler,
19
+ )
20
+
21
+ # isort: off
22
+ from . import tensor
23
+
24
+ del tensor
@@ -0,0 +1,140 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from abc import ABCMeta, abstractmethod
16
+ from typing import Any, Dict, Type, Union
17
+
18
+ import msgpack
19
+
20
+ from ...core import Entity, EntityData
21
+ from ...core.entity import ObjectData, TileableData
22
+ from ...lib import wrapped_pickle as pickle
23
+ from ...typing_ import SlicesType, TileableType
24
+ from ...utils import TypeDispatcher
25
+ from ..odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter
26
+
27
+ _MetaType = Dict[str, Any]
28
+
29
+ _META_FILE_NAME = ".meta"
30
+ _META_PICKLED_KEYS_KEY = ".pickled_keys"
31
+
32
+
33
+ _io_handler_dispatcher = TypeDispatcher()
34
+
35
+
36
+ def register_object_io_handler(tileable_data_type: Type[TileableData]):
37
+ def wrapper(handler_cls):
38
+ _io_handler_dispatcher.register(tileable_data_type, handler_cls)
39
+ return handler_cls
40
+
41
+ return wrapper
42
+
43
+
44
+ def get_object_io_handler(
45
+ tileable_data_type: Union[Entity, EntityData, Type[EntityData]]
46
+ ) -> Type["AbstractObjectIOHandler"]:
47
+ if not isinstance(tileable_data_type, type):
48
+ if isinstance(tileable_data_type, Entity):
49
+ tileable_data_type = tileable_data_type.data
50
+ tileable_data_type = type(tileable_data_type)
51
+ return _io_handler_dispatcher.get_handler(tileable_data_type)
52
+
53
+
54
+ class AbstractObjectIOHandler(metaclass=ABCMeta):
55
+ def _prepare_meta_for_serial(
56
+ self, tileable: TileableType, meta: Dict[str, Any]
57
+ ) -> Dict[str, Any]:
58
+ to_pack = meta.copy()
59
+ pickled_keys = []
60
+ for k, v in meta.items():
61
+ if not isinstance(v, (str, bytes, int, float, bool)):
62
+ to_pack[k] = pickle.dumps(v)
63
+ pickled_keys.append(k)
64
+ to_pack[".pickled_keys"] = pickled_keys
65
+ return to_pack
66
+
67
+ def _prepare_meta_for_deserial(
68
+ self, tileable: TileableType, meta: Dict[str, Any]
69
+ ) -> Dict[str, Any]:
70
+ pickled_keys = meta.pop(".pickled_keys", None) or []
71
+ for k in pickled_keys:
72
+ meta[k] = pickle.loads(meta[k])
73
+ return meta
74
+
75
+ def read_object_meta(
76
+ self, reader: ODPSVolumeReader, tileable: TileableType
77
+ ) -> Dict[str, Any]:
78
+ meta_obj = msgpack.loads(reader.read_file(_META_FILE_NAME))
79
+ return self._prepare_meta_for_deserial(tileable, meta_obj)
80
+
81
+ @abstractmethod
82
+ def _read_object_body(
83
+ self,
84
+ reader: ODPSVolumeReader,
85
+ tileable: TileableType,
86
+ meta: Dict[str, Any],
87
+ slices: SlicesType = None,
88
+ ) -> Any:
89
+ raise NotImplementedError
90
+
91
+ def read_object(
92
+ self,
93
+ reader: ODPSVolumeReader,
94
+ tileable: TileableType,
95
+ slices: SlicesType = None,
96
+ ) -> Any:
97
+ meta = self.read_object_meta(reader, tileable)
98
+ return self._read_object_body(reader, tileable, meta, slices)
99
+
100
+ @abstractmethod
101
+ def _write_object_body(
102
+ self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
103
+ ):
104
+ raise NotImplementedError
105
+
106
+ def write_object_meta(
107
+ self,
108
+ writer: ODPSVolumeWriter,
109
+ tileable: TileableType,
110
+ extra_meta: Dict[str, Any] = None,
111
+ ):
112
+ meta_obj = tileable.params.copy()
113
+ if extra_meta:
114
+ meta_obj.update(extra_meta)
115
+ meta_obj = self._prepare_meta_for_serial(tileable, meta_obj)
116
+ packed = msgpack.dumps(meta_obj)
117
+ writer.write_file(_META_FILE_NAME, packed)
118
+
119
+ def write_object(
120
+ self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
121
+ ):
122
+ self.write_object_meta(writer, tileable)
123
+ self._write_object_body(writer, tileable, value)
124
+
125
+
126
+ @register_object_io_handler(ObjectData)
127
+ class ObjectIOHandler(AbstractObjectIOHandler):
128
+ def _read_object_body(
129
+ self,
130
+ reader: ODPSVolumeReader,
131
+ tileable: TileableType,
132
+ meta: Dict[str, Any],
133
+ slices: SlicesType = None,
134
+ ) -> Any:
135
+ return pickle.loads(reader.read_file("data"))
136
+
137
+ def _write_object_body(
138
+ self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
139
+ ):
140
+ writer.write_file("data", pickle.dumps(value))
@@ -0,0 +1,76 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import struct
16
+ from io import BytesIO
17
+ from typing import Any, Dict
18
+
19
+ import msgpack
20
+ import numpy as np
21
+
22
+ from ...lib import wrapped_pickle as pickle
23
+ from ...tensor.core import TensorData
24
+ from ...typing_ import SlicesType, TileableType
25
+ from ..odpsio import ODPSVolumeReader, ODPSVolumeWriter
26
+ from .core import AbstractObjectIOHandler, register_object_io_handler
27
+
28
+
29
+ @register_object_io_handler(TensorData)
30
+ class TensorIOHandler(AbstractObjectIOHandler):
31
+ def write_object_meta(
32
+ self,
33
+ writer: ODPSVolumeWriter,
34
+ tileable: TileableType,
35
+ extra_meta: Dict[str, Any] = None,
36
+ ):
37
+ # fixme upload in real slices when tensors are supported in DPE
38
+ extra_meta = extra_meta or dict()
39
+ extra_meta["nsplits"] = ((np.nan,),)
40
+
41
+ super().write_object_meta(writer, tileable, extra_meta=extra_meta)
42
+
43
+ def _read_object_body(
44
+ self,
45
+ reader: ODPSVolumeReader,
46
+ tileable: TileableType,
47
+ meta: Dict[str, Any],
48
+ slices: SlicesType = None,
49
+ ) -> Any:
50
+ # fixme read data with slices when tensors are supported in DPE
51
+ body = reader.read_file("0,0.dat")
52
+ bio = BytesIO(body)
53
+ (header_len,) = struct.unpack("<I", bio.read(4))
54
+ header_data = msgpack.loads(bio.read(header_len))
55
+
56
+ pickled = bio.read(header_data[0])
57
+ bufs = [bio.read(size) for size in header_data[1:]]
58
+ return pickle.loads(pickled, buffers=bufs)
59
+
60
+ def _write_object_body(
61
+ self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
62
+ ):
63
+ # fixme upload in real slices when tensors are supported in DPE
64
+ def data_gen():
65
+ bufs = []
66
+ pickled = pickle.dumps(value, buffer_callback=bufs.append)
67
+ header_data = msgpack.dumps(
68
+ [len(pickled)] + [len(buf.raw()) for buf in bufs]
69
+ )
70
+ yield struct.pack("<I", len(header_data))
71
+ yield header_data
72
+ yield pickled
73
+ for buf in bufs:
74
+ yield buf
75
+
76
+ writer.write_file("0,0.dat", data_gen())
@@ -0,0 +1,13 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
@@ -0,0 +1,97 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import numpy as np
15
+ import pytest
16
+ from odps import ODPS
17
+
18
+ from ....core import OutputType
19
+ from ....core.operator import ObjectOperatorMixin, Operator
20
+ from ....tensor.datasource import ArrayDataSource
21
+ from ....tests.utils import tn
22
+ from ...odpsio import ODPSVolumeReader, ODPSVolumeWriter
23
+ from ..core import get_object_io_handler
24
+
25
+
26
+ class TestObjectOp(Operator, ObjectOperatorMixin):
27
+ def __call__(self):
28
+ self._output_types = [OutputType.object]
29
+ return self.new_tileable([])
30
+
31
+
32
+ @pytest.fixture(scope="module")
33
+ def create_volume(request, oss_config):
34
+ test_vol_name = tn("test_object_io_volume")
35
+ odps_entry = ODPS.from_environments()
36
+
37
+ try:
38
+ odps_entry.delete_volume(test_vol_name, auto_remove_dir=True, recursive=True)
39
+ except:
40
+ pass
41
+
42
+ oss_test_dir_name = tn("test_oss_directory")
43
+ if oss_config is None:
44
+ pytest.skip("Need oss and its config to run this test")
45
+ (
46
+ oss_access_id,
47
+ oss_secret_access_key,
48
+ oss_bucket_name,
49
+ oss_endpoint,
50
+ ) = oss_config.oss_config
51
+ test_location = "oss://%s:%s@%s/%s/%s" % (
52
+ oss_access_id,
53
+ oss_secret_access_key,
54
+ oss_endpoint,
55
+ oss_bucket_name,
56
+ oss_test_dir_name,
57
+ )
58
+ oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
59
+ odps_entry.create_external_volume(test_vol_name, location=test_location)
60
+
61
+ try:
62
+ yield test_vol_name
63
+ finally:
64
+ try:
65
+ odps_entry.delete_volume(
66
+ test_vol_name, auto_remove_dir=True, recursive=True
67
+ )
68
+ except:
69
+ pass
70
+
71
+
72
+ def test_simple_object_io(create_volume):
73
+ obj = TestObjectOp()()
74
+ data = "abcdefg"
75
+
76
+ odps_entry = ODPS.from_environments()
77
+
78
+ reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
79
+ writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
80
+
81
+ handler = get_object_io_handler(obj)()
82
+ handler.write_object(writer, obj, data)
83
+ assert data == handler.read_object(reader, obj)
84
+
85
+
86
+ def test_tensor_object_io(create_volume):
87
+ data = np.array([[4, 9, 2], [3, 5, 7], [8, 1, 6]])
88
+ obj = ArrayDataSource(data, dtype=data.dtype)(data.shape)
89
+
90
+ odps_entry = ODPS.from_environments()
91
+
92
+ reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
93
+ writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
94
+
95
+ handler = get_object_io_handler(obj)()
96
+ handler.write_object(writer, obj, data)
97
+ np.testing.assert_equal(data, handler.read_object(reader, obj))
@@ -14,8 +14,10 @@
14
14
 
15
15
  from .arrow import arrow_to_pandas, pandas_to_arrow
16
16
  from .schema import (
17
+ arrow_schema_to_odps_schema,
17
18
  build_dataframe_table_meta,
18
19
  odps_schema_to_pandas_dtypes,
19
20
  pandas_to_odps_schema,
20
21
  )
21
- from .tableio import HaloTableIO
22
+ from .tableio import HaloTableIO, ODPSTableIO
23
+ from .volumeio import ODPSVolumeReader, ODPSVolumeWriter
@@ -17,10 +17,10 @@ from typing import Any, Tuple, Union
17
17
  import pandas as pd
18
18
  import pyarrow as pa
19
19
 
20
- from ..core import OutputType
21
- from ..protocol import DataFrameTableMeta
22
- from ..tensor.core import TENSOR_TYPE
23
- from ..typing_ import ArrowTableType, PandasObjectTypes
20
+ from ...core import OutputType
21
+ from ...protocol import DataFrameTableMeta
22
+ from ...tensor.core import TENSOR_TYPE
23
+ from ...typing_ import ArrowTableType, PandasObjectTypes
24
24
  from .schema import build_dataframe_table_meta
25
25
 
26
26
 
@@ -45,9 +45,13 @@ def _rebuild_dataframe(
45
45
 
46
46
  def _rebuild_index(df: pd.DataFrame, table_meta: DataFrameTableMeta) -> pd.Index:
47
47
  if df.shape[1] > 1:
48
- df.columns = pd.Index(table_meta.pd_index_level_names)
49
- return pd.MultiIndex.from_frame(df)
50
- return pd.Index(df.iloc[:, 0], name=table_meta.pd_index_level_names[0])
48
+ idx = pd.MultiIndex.from_frame(df)
49
+ idx.names = table_meta.pd_index_level_names
50
+ else:
51
+ # make sure even if None names are updated properly
52
+ idx = pd.Index(df.iloc[:, 0])
53
+ idx.name = table_meta.pd_index_level_names[0]
54
+ return idx
51
55
 
52
56
 
53
57
  def arrow_to_pandas(
@@ -65,20 +69,37 @@ def arrow_to_pandas(
65
69
 
66
70
 
67
71
  def pandas_to_arrow(
68
- df: Any, nthreads=1, ignore_index=False
72
+ df: Any, nthreads=1, ignore_index=False, ms_cols=None
69
73
  ) -> Tuple[ArrowTableType, DataFrameTableMeta]:
70
74
  table_meta = build_dataframe_table_meta(df, ignore_index)
71
75
  df = df.copy() if callable(getattr(df, "copy", None)) else df
76
+ table_datetime_cols = None
72
77
  if table_meta.type in (OutputType.dataframe, OutputType.series):
73
78
  if table_meta.type == OutputType.series:
74
79
  df = df.to_frame("_data" if df.name is None else df.name)
80
+ if ms_cols:
81
+ table_datetime_cols = {"_data"}
82
+ elif ms_cols:
83
+ ms_col_set = set(ms_cols)
84
+ table_datetime_cols = set()
85
+ for pd_col, table_col in zip(
86
+ table_meta.pd_column_dtypes.keys(), table_meta.table_column_names
87
+ ):
88
+ if pd_col in ms_col_set:
89
+ table_datetime_cols.add(table_col)
75
90
  df.columns = pd.Index(table_meta.table_column_names)
76
91
  if not ignore_index:
77
92
  df = df.rename_axis(table_meta.table_index_column_names).reset_index()
78
- elif ignore_index:
93
+ elif ignore_index and table_meta.type != OutputType.index:
79
94
  df = pd.DataFrame([], columns=[])
80
95
  elif table_meta.type == OutputType.index:
81
96
  names = [f"_idx_{idx}" for idx in range(len(df.names))]
97
+ table_datetime_cols = set()
98
+ if ms_cols:
99
+ if isinstance(df, pd.MultiIndex):
100
+ table_datetime_cols = {f"_idx_{idx}" for idx in ms_cols}
101
+ else:
102
+ table_datetime_cols = {"_idx_0"}
82
103
  df = df.to_frame(name=names[0] if len(names) == 1 else names)
83
104
  elif table_meta.type == OutputType.scalar:
84
105
  names = ["_idx_0"]
@@ -88,4 +109,15 @@ def pandas_to_arrow(
88
109
  df = pd.DataFrame([[df]], columns=names)
89
110
  else: # this could never happen # pragma: no cover
90
111
  raise ValueError(f"Does not support meta type {table_meta.type!r}")
91
- return pa.Table.from_pandas(df, nthreads=nthreads, preserve_index=False), table_meta
112
+ pa_table = pa.Table.from_pandas(df, nthreads=nthreads, preserve_index=False)
113
+ if table_datetime_cols:
114
+ col_names = pa_table.schema.names
115
+ col_datas = []
116
+ for idx, col_name in enumerate(pa_table.schema.names):
117
+ if col_name not in table_datetime_cols:
118
+ col_datas.append(pa_table.column(idx))
119
+ continue
120
+ col_data = pa_table.column(idx).cast(pa.timestamp("ms"))
121
+ col_datas.append(col_data)
122
+ pa_table = pa.Table.from_arrays(col_datas, names=col_names)
123
+ return pa_table, table_meta
@@ -16,14 +16,15 @@ import string
16
16
  from collections import defaultdict
17
17
  from typing import Any, Dict, Tuple
18
18
 
19
+ import numpy as np
19
20
  import pandas as pd
20
21
  import pyarrow as pa
21
22
  from odps import types as odps_types
22
23
  from pandas.api import types as pd_types
23
24
 
24
- from ..core import TILEABLE_TYPE, OutputType
25
- from ..protocol import DataFrameTableMeta
26
- from ..tensor.core import TENSOR_TYPE
25
+ from ...core import TILEABLE_TYPE, OutputType
26
+ from ...protocol import DataFrameTableMeta
27
+ from ...tensor.core import TENSOR_TYPE
27
28
 
28
29
  _TEMP_TABLE_PREFIX = "tmp_mf_"
29
30
 
@@ -39,6 +40,7 @@ _arrow_to_odps_types = {
39
40
  pa.float64(): odps_types.double,
40
41
  pa.date32(): odps_types.date,
41
42
  pa.timestamp("ms"): odps_types.datetime,
43
+ pa.timestamp("us"): odps_types.timestamp,
42
44
  pa.timestamp("ns"): odps_types.timestamp,
43
45
  }
44
46
 
@@ -54,7 +56,9 @@ _odps_type_to_arrow = {
54
56
  odps_types.double: pa.float64(),
55
57
  odps_types.date: pa.date32(),
56
58
  odps_types.datetime: pa.timestamp("ms"),
59
+ odps_types.json: pa.string(),
57
60
  odps_types.timestamp: pa.timestamp("ns"),
61
+ odps_types.timestamp_ntz: pa.timestamp("ns"),
58
62
  }
59
63
 
60
64
 
@@ -126,10 +130,15 @@ def odps_type_to_arrow_type(
126
130
  ]
127
131
  col_type = pa.struct(fields)
128
132
  elif isinstance(odps_type, odps_types.Decimal):
129
- col_type = pa.decimal128(
130
- odps_type.precision or odps_types.Decimal._max_precision,
131
- odps_type.scale or odps_types.Decimal._max_scale,
132
- )
133
+ if odps_type.name == "decimal":
134
+ # legacy decimal data without precision or scale
135
+ # precision data from internal compat mode
136
+ col_type = pa.decimal128(38, 18)
137
+ else:
138
+ col_type = pa.decimal128(
139
+ odps_type.precision or odps_types.Decimal._max_precision,
140
+ odps_type.scale or odps_types.Decimal._max_scale,
141
+ )
133
142
  elif isinstance(odps_type, (odps_types.Varchar, odps_types.Char)):
134
143
  col_type = pa.string()
135
144
  else:
@@ -161,7 +170,7 @@ def odps_schema_to_pandas_dtypes(
161
170
  return arrow_schema.empty_table().to_pandas().dtypes
162
171
 
163
172
 
164
- def _is_scalar_object(df_obj: Any) -> bool:
173
+ def is_scalar_object(df_obj: Any) -> bool:
165
174
  return (
166
175
  isinstance(df_obj, TENSOR_TYPE) and df_obj.shape == ()
167
176
  ) or pd_types.is_scalar(df_obj)
@@ -179,10 +188,10 @@ def pandas_to_odps_schema(
179
188
  unknown_as_string: bool = False,
180
189
  ignore_index=False,
181
190
  ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
182
- from .. import dataframe as md
191
+ from ... import dataframe as md
183
192
  from .arrow import pandas_to_arrow
184
193
 
185
- if _is_scalar_object(df_obj):
194
+ if is_scalar_object(df_obj):
186
195
  empty_index = None
187
196
  elif hasattr(df_obj, "index_value"):
188
197
  empty_index = df_obj.index_value.to_pandas()[:0]
@@ -198,20 +207,35 @@ def pandas_to_odps_schema(
198
207
  else:
199
208
  empty_columns = None
200
209
 
210
+ ms_cols = None
201
211
  if isinstance(df_obj, (md.DataFrame, pd.DataFrame)):
202
212
  empty_df_obj = pd.DataFrame(
203
213
  [], columns=empty_columns, index=empty_index
204
214
  ).astype(df_obj.dtypes)
215
+ ms_cols = [
216
+ col for col, dt in df_obj.dtypes.items() if dt == np.dtype("datetime64[ms]")
217
+ ]
205
218
  elif isinstance(df_obj, (md.Series, pd.Series)):
206
219
  empty_df_obj = pd.Series([], name=df_obj.name, index=empty_index).astype(
207
220
  df_obj.dtype
208
221
  )
222
+ ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
209
223
  elif isinstance(df_obj, (md.Index, pd.Index)):
210
224
  empty_df_obj = empty_index
225
+ if isinstance(empty_index, pd.MultiIndex):
226
+ ms_cols = [
227
+ idx
228
+ for idx, dt in enumerate(empty_index.dtypes.values)
229
+ if dt == np.dtype("datetime64[ms]")
230
+ ]
231
+ else:
232
+ ms_cols = df_obj.dtype == np.dtype("datetime64[ms]")
211
233
  else:
212
234
  empty_df_obj = df_obj
213
235
 
214
- arrow_data, table_meta = pandas_to_arrow(empty_df_obj, ignore_index=ignore_index)
236
+ arrow_data, table_meta = pandas_to_arrow(
237
+ empty_df_obj, ignore_index=ignore_index, ms_cols=ms_cols
238
+ )
215
239
  return (
216
240
  arrow_schema_to_odps_schema(
217
241
  arrow_data.schema, unknown_as_string=unknown_as_string
@@ -273,7 +297,7 @@ def build_table_column_name(
273
297
  def build_dataframe_table_meta(
274
298
  df_obj: Any, ignore_index: bool = False
275
299
  ) -> DataFrameTableMeta:
276
- from .. import dataframe as md
300
+ from ... import dataframe as md
277
301
 
278
302
  col_to_count = defaultdict(lambda: 0)
279
303
  col_to_idx = defaultdict(lambda: 0)
@@ -284,13 +308,11 @@ def build_dataframe_table_meta(
284
308
  obj_type = OutputType.series
285
309
  elif isinstance(df_obj, (md.Index, pd.Index)):
286
310
  obj_type = OutputType.index
287
- elif _is_scalar_object(df_obj):
311
+ elif is_scalar_object(df_obj):
288
312
  obj_type = OutputType.scalar
289
313
  else: # pragma: no cover
290
314
  raise TypeError(f"Cannot accept type {type(df_obj)}")
291
315
 
292
- assert not ignore_index or obj_type in (OutputType.dataframe, OutputType.series)
293
-
294
316
  if obj_type == OutputType.scalar:
295
317
  pd_dtypes = pd.Series([])
296
318
  column_index_names = []
@@ -346,7 +368,7 @@ def build_dataframe_table_meta(
346
368
  else:
347
369
  index_dtypes = pd.Series([pd_index_val.dtype], index=pd_index_val.names)
348
370
 
349
- if ignore_index:
371
+ if ignore_index and obj_type != OutputType.index:
350
372
  table_index_column_names = []
351
373
  pd_index_dtypes = pd.Series([], index=[])
352
374
  else: