maxframe-1.0.0rc2-cp38-cp38-win32.whl → maxframe-1.0.0rc4-cp38-cp38-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (134)
  1. maxframe/_utils.cp38-win32.pyd +0 -0
  2. maxframe/codegen.py +4 -2
  3. maxframe/config/config.py +28 -9
  4. maxframe/config/validators.py +42 -12
  5. maxframe/conftest.py +56 -14
  6. maxframe/core/__init__.py +2 -13
  7. maxframe/core/entity/__init__.py +0 -4
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +45 -2
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cp38-win32.pyd +0 -0
  19. maxframe/core/graph/entity.py +9 -33
  20. maxframe/core/operator/__init__.py +2 -9
  21. maxframe/core/operator/base.py +3 -5
  22. maxframe/core/operator/objects.py +0 -9
  23. maxframe/core/operator/utils.py +55 -0
  24. maxframe/dataframe/arithmetic/docstring.py +26 -2
  25. maxframe/dataframe/arithmetic/equal.py +4 -2
  26. maxframe/dataframe/arithmetic/greater.py +4 -2
  27. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  28. maxframe/dataframe/arithmetic/less.py +2 -2
  29. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  30. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  31. maxframe/dataframe/core.py +2 -0
  32. maxframe/dataframe/datasource/read_odps_query.py +67 -8
  33. maxframe/dataframe/datasource/read_odps_table.py +4 -2
  34. maxframe/dataframe/datasource/tests/test_datasource.py +35 -6
  35. maxframe/dataframe/datastore/to_odps.py +8 -1
  36. maxframe/dataframe/extensions/__init__.py +3 -0
  37. maxframe/dataframe/extensions/flatmap.py +326 -0
  38. maxframe/dataframe/extensions/tests/test_extensions.py +62 -1
  39. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  40. maxframe/dataframe/indexing/rename.py +11 -0
  41. maxframe/dataframe/initializer.py +11 -1
  42. maxframe/dataframe/misc/drop_duplicates.py +18 -1
  43. maxframe/dataframe/operators.py +1 -17
  44. maxframe/dataframe/reduction/core.py +2 -2
  45. maxframe/dataframe/tests/test_initializer.py +33 -2
  46. maxframe/io/objects/__init__.py +24 -0
  47. maxframe/io/objects/core.py +140 -0
  48. maxframe/io/objects/tensor.py +76 -0
  49. maxframe/io/objects/tests/__init__.py +13 -0
  50. maxframe/io/objects/tests/test_object_io.py +97 -0
  51. maxframe/{odpsio → io/odpsio}/__init__.py +2 -0
  52. maxframe/{odpsio → io/odpsio}/arrow.py +4 -4
  53. maxframe/{odpsio → io/odpsio}/schema.py +10 -8
  54. maxframe/{odpsio → io/odpsio}/tableio.py +50 -38
  55. maxframe/io/odpsio/tests/__init__.py +13 -0
  56. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +3 -7
  57. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +3 -3
  58. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  59. maxframe/io/odpsio/volumeio.py +63 -0
  60. maxframe/learn/contrib/__init__.py +2 -1
  61. maxframe/learn/contrib/graph/__init__.py +15 -0
  62. maxframe/learn/contrib/graph/connected_components.py +215 -0
  63. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  64. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  65. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  66. maxframe/learn/contrib/xgboost/core.py +87 -2
  67. maxframe/learn/contrib/xgboost/dmatrix.py +1 -4
  68. maxframe/learn/contrib/xgboost/predict.py +27 -44
  69. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  70. maxframe/learn/contrib/xgboost/train.py +27 -16
  71. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  72. maxframe/lib/mmh3.cp38-win32.pyd +0 -0
  73. maxframe/opcodes.py +3 -0
  74. maxframe/protocol.py +7 -16
  75. maxframe/remote/core.py +4 -8
  76. maxframe/serialization/__init__.py +1 -0
  77. maxframe/serialization/core.cp38-win32.pyd +0 -0
  78. maxframe/session.py +9 -2
  79. maxframe/tensor/__init__.py +10 -2
  80. maxframe/tensor/arithmetic/isclose.py +1 -0
  81. maxframe/tensor/arithmetic/tests/test_arithmetic.py +21 -17
  82. maxframe/tensor/core.py +5 -136
  83. maxframe/tensor/datasource/array.py +3 -0
  84. maxframe/tensor/datasource/full.py +1 -1
  85. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  86. maxframe/tensor/indexing/flatnonzero.py +1 -1
  87. maxframe/tensor/indexing/getitem.py +2 -0
  88. maxframe/tensor/merge/__init__.py +2 -0
  89. maxframe/tensor/merge/concatenate.py +101 -0
  90. maxframe/tensor/merge/tests/test_merge.py +30 -1
  91. maxframe/tensor/merge/vstack.py +74 -0
  92. maxframe/tensor/{base → misc}/__init__.py +2 -0
  93. maxframe/tensor/{base → misc}/atleast_1d.py +0 -2
  94. maxframe/tensor/misc/atleast_2d.py +70 -0
  95. maxframe/tensor/misc/atleast_3d.py +85 -0
  96. maxframe/tensor/misc/tests/__init__.py +13 -0
  97. maxframe/tensor/{base → misc}/transpose.py +22 -18
  98. maxframe/tensor/operators.py +1 -7
  99. maxframe/tensor/random/core.py +1 -1
  100. maxframe/tensor/reduction/count_nonzero.py +1 -0
  101. maxframe/tensor/reduction/mean.py +1 -0
  102. maxframe/tensor/reduction/nanmean.py +1 -0
  103. maxframe/tensor/reduction/nanvar.py +2 -0
  104. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  105. maxframe/tensor/reduction/var.py +2 -0
  106. maxframe/tensor/utils.py +2 -22
  107. maxframe/typing_.py +4 -1
  108. maxframe/udf.py +8 -9
  109. maxframe/utils.py +49 -73
  110. maxframe-1.0.0rc4.dist-info/METADATA +104 -0
  111. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/RECORD +129 -114
  112. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/WHEEL +1 -1
  113. maxframe_client/fetcher.py +33 -50
  114. maxframe_client/session/consts.py +3 -0
  115. maxframe_client/session/graph.py +8 -2
  116. maxframe_client/session/odps.py +134 -27
  117. maxframe_client/session/task.py +58 -20
  118. maxframe_client/tests/test_fetcher.py +1 -1
  119. maxframe_client/tests/test_session.py +27 -3
  120. maxframe/core/entity/chunks.py +0 -68
  121. maxframe/core/entity/fuse.py +0 -73
  122. maxframe/core/graph/builder/chunk.py +0 -430
  123. maxframe/odpsio/volumeio.py +0 -95
  124. maxframe-1.0.0rc2.dist-info/METADATA +0 -177
  125. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  126. /maxframe/{tensor/base/tests → io}/__init__.py +0 -0
  127. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  128. /maxframe/tensor/{base → misc}/astype.py +0 -0
  129. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  130. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  131. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  132. /maxframe/tensor/{base → misc}/unique.py +0 -0
  133. /maxframe/tensor/{base → misc}/where.py +0 -0
  134. {maxframe-1.0.0rc2.dist-info → maxframe-1.0.0rc4.dist-info}/top_level.txt +0 -0
maxframe/dataframe/tests/test_initializer.py
@@ -13,12 +13,13 @@
 # limitations under the License.
 
 import pandas as pd
+import pytest
 
 from ..core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
-from ..initializer import read_pandas
+from ..initializer import DataFrame, Series, read_pandas
 
 
-def test_from_pandas():
+def test_read_pandas():
     df_data = pd.DataFrame([["a", 1], ["b", 2]], columns=["a", "b"])
     assert isinstance(read_pandas(df_data), DATAFRAME_TYPE)
 
@@ -27,3 +28,33 @@ def test_from_pandas():
 
     idx_data = pd.Index(["a", "b"])
     assert isinstance(read_pandas(idx_data), INDEX_TYPE)
+
+
+def test_init_dataframe_from_maxframe_series():
+    s = Series([1, 2, 3, 4], index=[1, 2, 3, 4])
+
+    df = DataFrame(s, index=s.index, columns=["col1"])
+
+    assert isinstance(df, DATAFRAME_TYPE)
+    assert df.dtypes.index == ["col1"]
+
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns=[])
+
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns="col1")
+
+    with pytest.raises(ValueError):
+        DataFrame(s, index=s.index, columns="col2")
+
+
+def test_init_dataframe_from_maxframe_dataframe():
+    df1 = DataFrame({"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, index=[1, 2, 3, 4])
+
+    df2 = DataFrame(df1, index=df1.index, columns=["col1", "col2"])
+
+    assert isinstance(df2, DATAFRAME_TYPE)
+    assert list(df2.dtypes.index) == ["col1", "col2"]
+
+    with pytest.raises(ValueError):
+        DataFrame(df1, index=df1.index, columns=["col1", "col2", "col3"])
maxframe/io/objects/__init__.py (new file)
@@ -0,0 +1,24 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .core import (
+    AbstractObjectIOHandler,
+    get_object_io_handler,
+    register_object_io_handler,
+)
+
+# isort: off
+from . import tensor
+
+del tensor
maxframe/io/objects/core.py (new file)
@@ -0,0 +1,140 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABCMeta, abstractmethod
+from typing import Any, Dict, Type, Union
+
+import msgpack
+
+from ...core import Entity, EntityData
+from ...core.entity import ObjectData, TileableData
+from ...lib import wrapped_pickle as pickle
+from ...typing_ import SlicesType, TileableType
+from ...utils import TypeDispatcher
+from ..odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter
+
+_MetaType = Dict[str, Any]
+
+_META_FILE_NAME = ".meta"
+_META_PICKLED_KEYS_KEY = ".pickled_keys"
+
+
+_io_handler_dispatcher = TypeDispatcher()
+
+
+def register_object_io_handler(tileable_data_type: Type[TileableData]):
+    def wrapper(handler_cls):
+        _io_handler_dispatcher.register(tileable_data_type, handler_cls)
+        return handler_cls
+
+    return wrapper
+
+
+def get_object_io_handler(
+    tileable_data_type: Union[Entity, EntityData, Type[EntityData]]
+) -> Type["AbstractObjectIOHandler"]:
+    if not isinstance(tileable_data_type, type):
+        if isinstance(tileable_data_type, Entity):
+            tileable_data_type = tileable_data_type.data
+        tileable_data_type = type(tileable_data_type)
+    return _io_handler_dispatcher.get_handler(tileable_data_type)
+
+
+class AbstractObjectIOHandler(metaclass=ABCMeta):
+    def _prepare_meta_for_serial(
+        self, tileable: TileableType, meta: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        to_pack = meta.copy()
+        pickled_keys = []
+        for k, v in meta.items():
+            if not isinstance(v, (str, bytes, int, float, bool)):
+                to_pack[k] = pickle.dumps(v)
+                pickled_keys.append(k)
+        to_pack[".pickled_keys"] = pickled_keys
+        return to_pack
+
+    def _prepare_meta_for_deserial(
+        self, tileable: TileableType, meta: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        pickled_keys = meta.pop(".pickled_keys", None) or []
+        for k in pickled_keys:
+            meta[k] = pickle.loads(meta[k])
+        return meta
+
+    def read_object_meta(
+        self, reader: ODPSVolumeReader, tileable: TileableType
+    ) -> Dict[str, Any]:
+        meta_obj = msgpack.loads(reader.read_file(_META_FILE_NAME))
+        return self._prepare_meta_for_deserial(tileable, meta_obj)
+
+    @abstractmethod
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        raise NotImplementedError
+
+    def read_object(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        slices: SlicesType = None,
+    ) -> Any:
+        meta = self.read_object_meta(reader, tileable)
+        return self._read_object_body(reader, tileable, meta, slices)
+
+    @abstractmethod
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        raise NotImplementedError
+
+    def write_object_meta(
+        self,
+        writer: ODPSVolumeWriter,
+        tileable: TileableType,
+        extra_meta: Dict[str, Any] = None,
+    ):
+        meta_obj = tileable.params.copy()
+        if extra_meta:
+            meta_obj.update(extra_meta)
+        meta_obj = self._prepare_meta_for_serial(tileable, meta_obj)
+        packed = msgpack.dumps(meta_obj)
+        writer.write_file(_META_FILE_NAME, packed)
+
+    def write_object(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        self.write_object_meta(writer, tileable)
+        self._write_object_body(writer, tileable, value)
+
+
+@register_object_io_handler(ObjectData)
+class ObjectIOHandler(AbstractObjectIOHandler):
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        return pickle.loads(reader.read_file("data"))
+
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        writer.write_file("data", pickle.dumps(value))
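The ".meta" format used by AbstractObjectIOHandler is simple enough to exercise on its own: values that msgpack cannot encode natively are pickled and their keys recorded under ".pickled_keys". Below is a minimal standalone round-trip of that scheme, assuming stdlib pickle behaves like MaxFrame's wrapped_pickle; pack_meta and unpack_meta are illustrative names, not MaxFrame APIs.

import pickle

import msgpack

_PRIMITIVES = (str, bytes, int, float, bool)

def pack_meta(meta: dict) -> bytes:
    # Pickle anything msgpack cannot represent natively and remember which
    # keys were pickled, mirroring _prepare_meta_for_serial above.
    packed = meta.copy()
    pickled_keys = []
    for k, v in meta.items():
        if not isinstance(v, _PRIMITIVES):
            packed[k] = pickle.dumps(v)
            pickled_keys.append(k)
    packed[".pickled_keys"] = pickled_keys
    return msgpack.dumps(packed)

def unpack_meta(blob: bytes) -> dict:
    # Decode with msgpack, then unpickle the recorded keys,
    # mirroring _prepare_meta_for_deserial above.
    meta = msgpack.loads(blob)
    for k in meta.pop(".pickled_keys", None) or []:
        meta[k] = pickle.loads(meta[k])
    return meta

meta = {"key": "abc", "shape": (3, 3), "nsplits": ((3,), (3,))}
assert unpack_meta(pack_meta(meta)) == meta

Pickling only the non-primitive values keeps the common case (string and numeric metadata) readable by any msgpack decoder, while tuples such as shape and nsplits survive the round trip intact.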
maxframe/io/objects/tensor.py (new file)
@@ -0,0 +1,76 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import struct
+from io import BytesIO
+from typing import Any, Dict
+
+import msgpack
+import numpy as np
+
+from ...lib import wrapped_pickle as pickle
+from ...tensor.core import TensorData
+from ...typing_ import SlicesType, TileableType
+from ..odpsio import ODPSVolumeReader, ODPSVolumeWriter
+from .core import AbstractObjectIOHandler, register_object_io_handler
+
+
+@register_object_io_handler(TensorData)
+class TensorIOHandler(AbstractObjectIOHandler):
+    def write_object_meta(
+        self,
+        writer: ODPSVolumeWriter,
+        tileable: TileableType,
+        extra_meta: Dict[str, Any] = None,
+    ):
+        # fixme upload in real slices when tensors are supported in DPE
+        extra_meta = extra_meta or dict()
+        extra_meta["nsplits"] = ((np.nan,),)
+
+        super().write_object_meta(writer, tileable, extra_meta=extra_meta)
+
+    def _read_object_body(
+        self,
+        reader: ODPSVolumeReader,
+        tileable: TileableType,
+        meta: Dict[str, Any],
+        slices: SlicesType = None,
+    ) -> Any:
+        # fixme read data with slices when tensors are supported in DPE
+        body = reader.read_file("0,0.dat")
+        bio = BytesIO(body)
+        (header_len,) = struct.unpack("<I", bio.read(4))
+        header_data = msgpack.loads(bio.read(header_len))
+
+        pickled = bio.read(header_data[0])
+        bufs = [bio.read(size) for size in header_data[1:]]
+        return pickle.loads(pickled, buffers=bufs)
+
+    def _write_object_body(
+        self, writer: ODPSVolumeWriter, tileable: TileableType, value: Any
+    ):
+        # fixme upload in real slices when tensors are supported in DPE
+        def data_gen():
+            bufs = []
+            pickled = pickle.dumps(value, buffer_callback=bufs.append)
+            header_data = msgpack.dumps(
+                [len(pickled)] + [len(buf.raw()) for buf in bufs]
+            )
+            yield struct.pack("<I", len(header_data))
+            yield header_data
+            yield pickled
+            for buf in bufs:
+                yield buf
+
+        writer.write_file("0,0.dat", data_gen())
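The tensor body written to "0,0.dat" uses a small custom framing: a 4-byte little-endian header length, a msgpack-encoded list of section sizes (the pickle payload first, then each out-of-band pickle-protocol-5 buffer), followed by the sections themselves. Here is a standalone round-trip of that framing, assuming wrapped_pickle matches stdlib pickle at protocol 5; pack_body and unpack_body are illustrative names only.

import pickle
import struct
from io import BytesIO

import msgpack
import numpy as np

def pack_body(value) -> bytes:
    # Serialize with out-of-band buffers (pickle protocol 5), then frame:
    # <header length> <msgpack [pickle size, buffer sizes...]> <pickle> <buffers>
    bufs = []
    pickled = pickle.dumps(value, protocol=5, buffer_callback=bufs.append)
    header = msgpack.dumps([len(pickled)] + [len(b.raw()) for b in bufs])
    parts = [struct.pack("<I", len(header)), header, pickled]
    parts.extend(b.raw() for b in bufs)
    return b"".join(parts)

def unpack_body(body: bytes):
    # Reverse the framing and reattach the out-of-band buffers.
    bio = BytesIO(body)
    (header_len,) = struct.unpack("<I", bio.read(4))
    sizes = msgpack.loads(bio.read(header_len))
    pickled = bio.read(sizes[0])
    bufs = [bio.read(size) for size in sizes[1:]]
    return pickle.loads(pickled, buffers=bufs)

data = np.array([[4, 9, 2], [3, 5, 7], [8, 1, 6]])
np.testing.assert_equal(data, unpack_body(pack_body(data)))

Out-of-band buffers let the ndarray payload travel as raw bytes instead of being copied into the pickle stream, which is why the header records every section size up front.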
maxframe/io/objects/tests/__init__.py (new file)
@@ -0,0 +1,13 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
maxframe/io/objects/tests/test_object_io.py (new file)
@@ -0,0 +1,97 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import pytest
+from odps import ODPS
+
+from ....core import OutputType
+from ....core.operator import ObjectOperatorMixin, Operator
+from ....tensor.datasource import ArrayDataSource
+from ....tests.utils import tn
+from ...odpsio import ODPSVolumeReader, ODPSVolumeWriter
+from ..core import get_object_io_handler
+
+
+class TestObjectOp(Operator, ObjectOperatorMixin):
+    def __call__(self):
+        self._output_types = [OutputType.object]
+        return self.new_tileable([])
+
+
+@pytest.fixture(scope="module")
+def create_volume(request, oss_config):
+    test_vol_name = tn("test_object_io_volume")
+    odps_entry = ODPS.from_environments()
+
+    try:
+        odps_entry.delete_volume(test_vol_name, auto_remove_dir=True, recursive=True)
+    except:
+        pass
+
+    oss_test_dir_name = tn("test_oss_directory")
+    if oss_config is None:
+        pytest.skip("Need oss and its config to run this test")
+    (
+        oss_access_id,
+        oss_secret_access_key,
+        oss_bucket_name,
+        oss_endpoint,
+    ) = oss_config.oss_config
+    test_location = "oss://%s:%s@%s/%s/%s" % (
+        oss_access_id,
+        oss_secret_access_key,
+        oss_endpoint,
+        oss_bucket_name,
+        oss_test_dir_name,
+    )
+    oss_config.oss_bucket.put_object(oss_test_dir_name + "/", b"")
+    odps_entry.create_external_volume(test_vol_name, location=test_location)
+
+    try:
+        yield test_vol_name
+    finally:
+        try:
+            odps_entry.delete_volume(
+                test_vol_name, auto_remove_dir=True, recursive=True
+            )
+        except:
+            pass
+
+
+def test_simple_object_io(create_volume):
+    obj = TestObjectOp()()
+    data = "abcdefg"
+
+    odps_entry = ODPS.from_environments()
+
+    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
+    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+
+    handler = get_object_io_handler(obj)()
+    handler.write_object(writer, obj, data)
+    assert data == handler.read_object(reader, obj)
+
+
+def test_tensor_object_io(create_volume):
+    data = np.array([[4, 9, 2], [3, 5, 7], [8, 1, 6]])
+    obj = ArrayDataSource(data, dtype=data.dtype)(data.shape)
+
+    odps_entry = ODPS.from_environments()
+
+    reader = ODPSVolumeReader(odps_entry, create_volume, obj.key)
+    writer = ODPSVolumeWriter(odps_entry, create_volume, obj.key)
+
+    handler = get_object_io_handler(obj)()
+    handler.write_object(writer, obj, data)
+    np.testing.assert_equal(data, handler.read_object(reader, obj))
maxframe/io/odpsio/__init__.py
@@ -14,8 +14,10 @@
 
 from .arrow import arrow_to_pandas, pandas_to_arrow
 from .schema import (
+    arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
     odps_schema_to_pandas_dtypes,
     pandas_to_odps_schema,
 )
 from .tableio import HaloTableIO, ODPSTableIO
+from .volumeio import ODPSVolumeReader, ODPSVolumeWriter
maxframe/io/odpsio/arrow.py
@@ -17,10 +17,10 @@ from typing import Any, Tuple, Union
 import pandas as pd
 import pyarrow as pa
 
-from ..core import OutputType
-from ..protocol import DataFrameTableMeta
-from ..tensor.core import TENSOR_TYPE
-from ..typing_ import ArrowTableType, PandasObjectTypes
+from ...core import OutputType
+from ...protocol import DataFrameTableMeta
+from ...tensor.core import TENSOR_TYPE
+from ...typing_ import ArrowTableType, PandasObjectTypes
 from .schema import build_dataframe_table_meta
 
 
maxframe/io/odpsio/schema.py
@@ -21,9 +21,9 @@ import pyarrow as pa
 from odps import types as odps_types
 from pandas.api import types as pd_types
 
-from ..core import TILEABLE_TYPE, OutputType
-from ..protocol import DataFrameTableMeta
-from ..tensor.core import TENSOR_TYPE
+from ...core import TILEABLE_TYPE, OutputType
+from ...protocol import DataFrameTableMeta
+from ...tensor.core import TENSOR_TYPE
 
 _TEMP_TABLE_PREFIX = "tmp_mf_"
 
@@ -54,7 +54,9 @@ _odps_type_to_arrow = {
     odps_types.double: pa.float64(),
     odps_types.date: pa.date32(),
     odps_types.datetime: pa.timestamp("ms"),
+    odps_types.json: pa.string(),
     odps_types.timestamp: pa.timestamp("ns"),
+    odps_types.timestamp_ntz: pa.timestamp("ns"),
 }
 
 
@@ -166,7 +168,7 @@ def odps_schema_to_pandas_dtypes(
     return arrow_schema.empty_table().to_pandas().dtypes
 
 
-def _is_scalar_object(df_obj: Any) -> bool:
+def is_scalar_object(df_obj: Any) -> bool:
     return (
         isinstance(df_obj, TENSOR_TYPE) and df_obj.shape == ()
     ) or pd_types.is_scalar(df_obj)
@@ -184,10 +186,10 @@ def pandas_to_odps_schema(
     unknown_as_string: bool = False,
     ignore_index=False,
 ) -> Tuple[odps_types.OdpsSchema, DataFrameTableMeta]:
-    from .. import dataframe as md
+    from ... import dataframe as md
     from .arrow import pandas_to_arrow
 
-    if _is_scalar_object(df_obj):
+    if is_scalar_object(df_obj):
         empty_index = None
     elif hasattr(df_obj, "index_value"):
         empty_index = df_obj.index_value.to_pandas()[:0]
@@ -278,7 +280,7 @@ def build_table_column_name(
 def build_dataframe_table_meta(
     df_obj: Any, ignore_index: bool = False
 ) -> DataFrameTableMeta:
-    from .. import dataframe as md
+    from ... import dataframe as md
 
     col_to_count = defaultdict(lambda: 0)
     col_to_idx = defaultdict(lambda: 0)
@@ -289,7 +291,7 @@
         obj_type = OutputType.series
     elif isinstance(df_obj, (md.Index, pd.Index)):
         obj_type = OutputType.index
-    elif _is_scalar_object(df_obj):
+    elif is_scalar_object(df_obj):
         obj_type = OutputType.scalar
     else:  # pragma: no cover
         raise TypeError(f"Cannot accept type {type(df_obj)}")
maxframe/io/odpsio/tableio.py
@@ -18,14 +18,15 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Dict, List, Optional, Union
 
+import numpy as np
 import pyarrow as pa
 from odps import ODPS
+from odps import __version__ as pyodps_version
 from odps.apis.storage_api import (
     StorageApiArrowClient,
     TableBatchScanResponse,
     TableBatchWriteResponse,
 )
-from odps.config import option_context as pyodps_option_context
 from odps.tunnel import TableTunnel
 from odps.types import OdpsSchema, PartitionSpec, timestamp_ntz
 
@@ -34,20 +35,16 @@ try:
 except ImportError:
     pac = None
 
-from ..config import options
-from ..env import ODPS_STORAGE_API_ENDPOINT
+from ...config import options
+from ...env import ODPS_STORAGE_API_ENDPOINT
+from ...lib.version import Version
+from ...utils import sync_pyodps_options
 from .schema import odps_schema_to_arrow_schema
 
 PartitionsType = Union[List[str], str, None]
 
 _DEFAULT_ROW_BATCH_SIZE = 4096
-
-
-@contextmanager
-def _sync_pyodps_timezone():
-    with pyodps_option_context() as cfg:
-        cfg.local_timezone = options.local_timezone
-        yield
+_need_patch_batch = Version(pyodps_version) < Version("0.12.0")
 
 
 class ODPSTableIO(ABC):
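The sync_pyodps_options helper imported above lives in maxframe/utils.py, whose body is not part of this diff; it generalizes the deleted _sync_pyodps_timezone and is used throughout the remaining tableio.py hunks below. A plausible minimal sketch, shown only so the replacement reads clearly (the real helper may mirror more settings than shown here):

from contextlib import contextmanager

from odps.config import option_context as pyodps_option_context

@contextmanager
def sync_pyodps_options():
    # Hypothetical sketch: mirror MaxFrame-level options onto PyODPS
    # globals for the duration of the block, as the removed
    # _sync_pyodps_timezone did for options.local_timezone alone.
    from maxframe.config import options

    with pyodps_option_context() as cfg:
        cfg.local_timezone = options.local_timezone
        yield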
maxframe/io/odpsio/tableio.py (continued)
@@ -163,10 +160,15 @@ class TunnelMultiPartitionReader:
             self._cur_partition_id += 1
 
             part_str = self._partitions[self._cur_partition_id]
-            with _sync_pyodps_timezone():
+
+            # todo make this more formal when PyODPS 0.12.0 is released
+            req_columns = self._columns
+            if not _need_patch_batch:
+                req_columns = self._schema.names
+            with sync_pyodps_options():
                 self._cur_reader = self._table.open_reader(
                     part_str,
-                    columns=self._columns,
+                    columns=req_columns,
                     arrow=True,
                     download_id=self._partition_to_download_ids.get(part_str),
                 )
@@ -177,7 +179,7 @@
             else:
                 count = min(self._count, self._cur_reader.count - start)
 
-            with _sync_pyodps_timezone():
+            with sync_pyodps_options():
                 self._reader_iter = self._cur_reader.read(start, count)
             break
         self._reader_start_pos += self._cur_reader.count
@@ -209,11 +211,12 @@
         for part_col in self._partition_cols or []:
             names.append(part_col)
             col_type = self._schema.field_by_name(part_col).type
-            arrays.append(pa.array([pt_spec[part_col]] * batch.num_rows).cast(col_type))
+            pt_col = np.repeat([pt_spec[part_col]], batch.num_rows)
+            arrays.append(pa.array(pt_col).cast(col_type))
         return pa.RecordBatch.from_arrays(arrays, names)
 
     def read(self):
-        with _sync_pyodps_timezone():
+        with sync_pyodps_options():
             if self._cur_reader is None:
                 self._open_next_reader()
             if self._cur_reader is None:
@@ -224,7 +227,10 @@
                 if batch is not None:
                     if self._row_left is not None:
                         self._row_left -= batch.num_rows
-                    return self._fill_batch_partition(batch)
+                    if _need_patch_batch:
+                        return self._fill_batch_partition(batch)
+                    else:
+                        return batch
             except StopIteration:
                 self._open_next_reader()
         return None
@@ -282,7 +288,9 @@ class TunnelTableIO(ODPSTableIO):
         reverse_range: bool = False,
         row_batch_size: int = _DEFAULT_ROW_BATCH_SIZE,
     ):
-        table = self._odps.get_table(full_table_name)
+        with sync_pyodps_options():
+            table = self._odps.get_table(full_table_name)
+
         if partition_columns is True:
             partition_columns = [c.name for c in table.table_schema.partitions]
 
@@ -293,21 +301,22 @@
             or (stop is not None and stop < 0)
             or (reverse_range and start is None)
         ):
-            table = self._odps.get_table(full_table_name)
-            tunnel = TableTunnel(self._odps)
-            parts = (
-                [partitions]
-                if partitions is None or isinstance(partitions, str)
-                else partitions
-            )
-            part_to_down_id = dict()
-            total_records = 0
-            for part in parts:
-                down_session = tunnel.create_download_session(
-                    table, async_mode=True, partition_spec=part
+            with sync_pyodps_options():
+                table = self._odps.get_table(full_table_name)
+                tunnel = TableTunnel(self._odps)
+                parts = (
+                    [partitions]
+                    if partitions is None or isinstance(partitions, str)
+                    else partitions
                 )
-                part_to_down_id[part] = down_session.id
-                total_records += down_session.count
+                part_to_down_id = dict()
+                total_records = 0
+                for part in parts:
+                    down_session = tunnel.create_download_session(
+                        table, async_mode=True, partition_spec=part
+                    )
+                    part_to_down_id[part] = down_session.id
+                    total_records += down_session.count
 
         count = None
         if start is not None or stop is not None:
@@ -344,7 +353,7 @@
         overwrite: bool = True,
     ):
         table = self._odps.get_table(full_table_name)
-        with _sync_pyodps_timezone():
+        with sync_pyodps_options():
             with table.open_writer(
                 partition=partition,
                 arrow=True,
@@ -354,7 +363,10 @@
             # fixme should yield writer directly once pyodps fixes
             # related arrow timestamp bug when provided schema and
             # table schema is identical.
-            yield TunnelWrappedWriter(writer)
+            if _need_patch_batch:
+                yield TunnelWrappedWriter(writer)
+            else:
+                yield writer
 
 
 class HaloTableArrowReader:
@@ -590,8 +602,8 @@ class HaloTableIO(ODPSTableIO):
     ):
         from odps.apis.storage_api import (
            SessionRequest,
+            SessionStatus,
            SplitOptions,
-            Status,
            TableBatchScanRequest,
        )
 
@@ -622,13 +634,13 @@
         resp = client.create_read_session(req)
 
         session_id = resp.session_id
-        status = resp.status
-        while status == Status.WAIT:
+        status = resp.session_status
+        while status == SessionStatus.INIT:
             resp = client.get_read_session(SessionRequest(session_id))
-            status = resp.status
+            status = resp.session_status
             time.sleep(1.0)
 
-        assert status == Status.OK
+        assert status == SessionStatus.NORMAL
 
         count = None
         if start is not None or stop is not None: