maxframe 0.1.0b4__cp311-cp311-win32.whl → 1.0.0__cp311-cp311-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of maxframe might be problematic. Click here for more details.
- maxframe/__init__.py +1 -0
- maxframe/_utils.cp311-win32.pyd +0 -0
- maxframe/codegen.py +56 -5
- maxframe/config/config.py +78 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +58 -14
- maxframe/core/__init__.py +2 -16
- maxframe/core/entity/__init__.py +1 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +46 -45
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp311-win32.pyd +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +2 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +7 -33
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
- maxframe/dataframe/core.py +58 -12
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +120 -24
- maxframe/dataframe/datasource/read_odps_table.py +9 -4
- maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +28 -0
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +317 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
- maxframe/dataframe/groupby/transform.py +5 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +5 -28
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +237 -3
- maxframe/dataframe/merge/tests/test_merge.py +126 -1
- maxframe/dataframe/misc/__init__.py +4 -0
- maxframe/dataframe/misc/apply.py +6 -11
- maxframe/dataframe/misc/case_when.py +141 -0
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +8 -8
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/pivot_table.py +262 -0
- maxframe/dataframe/misc/tests/test_misc.py +93 -1
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/plotting/core.py +2 -2
- maxframe/dataframe/reduction/core.py +4 -3
- maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +13 -19
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/utils.py +33 -11
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/io/__init__.py +13 -0
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
- maxframe/{odpsio → io/odpsio}/schema.py +38 -16
- maxframe/io/odpsio/tableio.py +719 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/utils.py +52 -0
- maxframe/learn/contrib/xgboost/__init__.py +26 -0
- maxframe/learn/contrib/xgboost/classifier.py +110 -0
- maxframe/learn/contrib/xgboost/core.py +241 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
- maxframe/learn/contrib/xgboost/predict.py +121 -0
- maxframe/learn/contrib/xgboost/regressor.py +71 -0
- maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
- maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
- maxframe/learn/contrib/xgboost/train.py +132 -0
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/learn/utils/__init__.py +15 -0
- maxframe/learn/utils/core.py +29 -0
- maxframe/lib/mmh3.cp311-win32.pyd +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/opcodes.py +11 -0
- maxframe/protocol.py +154 -27
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp311-win32.pyd +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +64 -0
- maxframe/serialization/core.pyx +67 -26
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +52 -17
- maxframe/serialization/serializables/core.py +180 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +54 -5
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +37 -2
- maxframe/tensor/__init__.py +81 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +7 -2
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/scalar.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +4 -0
- maxframe/tensor/misc/atleast_1d.py +72 -0
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/misc/unique.py +205 -0
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +2 -1
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +17 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +62 -3
- maxframe/utils.py +112 -86
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +123 -54
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +223 -40
- maxframe_client/session/task.py +108 -80
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +136 -8
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -300
- maxframe/odpsio/volumeio.py +0 -95
- maxframe_client/clients/spe.py +0 -104
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -18,9 +18,10 @@ import pyarrow as pa
|
|
|
18
18
|
import pytest
|
|
19
19
|
from odps import types as odps_types
|
|
20
20
|
|
|
21
|
-
from
|
|
22
|
-
from
|
|
23
|
-
from
|
|
21
|
+
from .... import dataframe as md
|
|
22
|
+
from .... import tensor as mt
|
|
23
|
+
from ....core import OutputType
|
|
24
|
+
from ....utils import pd_release_version
|
|
24
25
|
from ..schema import (
|
|
25
26
|
arrow_schema_to_odps_schema,
|
|
26
27
|
build_dataframe_table_meta,
|
|
@@ -30,20 +31,23 @@ from ..schema import (
|
|
|
30
31
|
)
|
|
31
32
|
|
|
32
33
|
|
|
33
|
-
def _wrap_maxframe_obj(obj, wrap=
|
|
34
|
-
if
|
|
34
|
+
def _wrap_maxframe_obj(obj, wrap="no"):
|
|
35
|
+
if wrap == "no":
|
|
35
36
|
return obj
|
|
36
37
|
if isinstance(obj, pd.DataFrame):
|
|
37
|
-
|
|
38
|
+
obj = md.DataFrame(obj)
|
|
38
39
|
elif isinstance(obj, pd.Series):
|
|
39
|
-
|
|
40
|
+
obj = md.Series(obj)
|
|
40
41
|
elif isinstance(obj, pd.Index):
|
|
41
|
-
|
|
42
|
+
obj = md.Index(obj)
|
|
42
43
|
else:
|
|
43
|
-
|
|
44
|
+
obj = mt.scalar(obj)
|
|
45
|
+
if wrap == "data":
|
|
46
|
+
return obj.data
|
|
47
|
+
return obj
|
|
44
48
|
|
|
45
49
|
|
|
46
|
-
@pytest.mark.parametrize("wrap_obj", [
|
|
50
|
+
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
47
51
|
def test_pandas_to_odps_schema_dataframe(wrap_obj):
|
|
48
52
|
data = pd.DataFrame(np.random.rand(100, 5), columns=list("ABCDE"))
|
|
49
53
|
|
|
@@ -94,7 +98,7 @@ def test_pandas_to_odps_schema_dataframe(wrap_obj):
|
|
|
94
98
|
assert meta.pd_index_level_names == [None, None]
|
|
95
99
|
|
|
96
100
|
|
|
97
|
-
@pytest.mark.parametrize("wrap_obj", [
|
|
101
|
+
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
98
102
|
def test_pandas_to_odps_schema_series(wrap_obj):
|
|
99
103
|
data = pd.Series(np.random.rand(100))
|
|
100
104
|
|
|
@@ -135,22 +139,22 @@ def test_pandas_to_odps_schema_series(wrap_obj):
|
|
|
135
139
|
assert meta.pd_index_level_names == ["c1", "c2"]
|
|
136
140
|
|
|
137
141
|
|
|
138
|
-
@pytest.mark.parametrize("wrap_obj", [
|
|
142
|
+
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
139
143
|
def test_pandas_to_odps_schema_index(wrap_obj):
|
|
140
144
|
data = pd.Index(np.random.randint(0, 100, 100))
|
|
141
145
|
|
|
142
146
|
test_idx = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
147
|
+
for ignore_idx in (False, True):
|
|
148
|
+
schema, meta = pandas_to_odps_schema(
|
|
149
|
+
test_idx, unknown_as_string=True, ignore_index=ignore_idx
|
|
150
|
+
)
|
|
151
|
+
assert [c.name for c in schema.columns] == ["_idx_0"]
|
|
152
|
+
assert [c.type.name for c in schema.columns] == ["bigint"]
|
|
153
|
+
assert meta.type == OutputType.index
|
|
154
|
+
assert meta.table_column_names == []
|
|
155
|
+
assert meta.table_index_column_names == ["_idx_0"]
|
|
156
|
+
assert meta.pd_column_level_names == []
|
|
157
|
+
assert meta.pd_index_level_names == [None]
|
|
154
158
|
|
|
155
159
|
data = pd.MultiIndex.from_arrays(
|
|
156
160
|
[np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
|
|
@@ -167,11 +171,14 @@ def test_pandas_to_odps_schema_index(wrap_obj):
|
|
|
167
171
|
assert meta.pd_index_level_names == ["c1", "c2"]
|
|
168
172
|
|
|
169
173
|
|
|
170
|
-
@pytest.mark.parametrize("wrap_obj", [
|
|
174
|
+
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
171
175
|
def test_pandas_to_odps_schema_scalar(wrap_obj):
|
|
172
176
|
data = 1234.56
|
|
173
177
|
|
|
174
178
|
test_scalar = _wrap_maxframe_obj(data, wrap=wrap_obj)
|
|
179
|
+
if wrap_obj != "no":
|
|
180
|
+
test_scalar.op.data = None
|
|
181
|
+
|
|
175
182
|
schema, meta = pandas_to_odps_schema(test_scalar, unknown_as_string=True)
|
|
176
183
|
assert schema.columns[0].name == "_idx_0"
|
|
177
184
|
assert schema.columns[0].type.name == "double"
|
|
@@ -181,9 +188,6 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
|
|
|
181
188
|
assert meta.pd_column_level_names == []
|
|
182
189
|
assert meta.pd_index_level_names == [None]
|
|
183
190
|
|
|
184
|
-
with pytest.raises(AssertionError):
|
|
185
|
-
pandas_to_odps_schema(test_scalar, unknown_as_string=True, ignore_index=True)
|
|
186
|
-
|
|
187
191
|
|
|
188
192
|
def test_odps_arrow_schema_conversion():
|
|
189
193
|
odps_schema = odps_types.OdpsSchema(
|
|
@@ -206,10 +210,11 @@ def test_odps_arrow_schema_conversion():
|
|
|
206
210
|
odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
|
|
207
211
|
odps_types.Column("col17", "CHAR(15)"),
|
|
208
212
|
odps_types.Column("col18", "VARCHAR(15)"),
|
|
213
|
+
odps_types.Column("col19", "decimal"),
|
|
209
214
|
]
|
|
210
215
|
)
|
|
211
216
|
arrow_schema = odps_schema_to_arrow_schema(odps_schema)
|
|
212
|
-
assert arrow_schema.names == [f"col{i}" for i in range(1,
|
|
217
|
+
assert arrow_schema.names == [f"col{i}" for i in range(1, 20)]
|
|
213
218
|
assert arrow_schema.types == [
|
|
214
219
|
pa.string(),
|
|
215
220
|
pa.binary(),
|
|
@@ -229,6 +234,7 @@ def test_odps_arrow_schema_conversion():
|
|
|
229
234
|
pa.struct([("a1", pa.string()), ("a2", pa.map_(pa.string(), pa.int64()))]),
|
|
230
235
|
pa.string(),
|
|
231
236
|
pa.string(),
|
|
237
|
+
pa.decimal128(38, 18),
|
|
232
238
|
]
|
|
233
239
|
|
|
234
240
|
expected_odps_schema = odps_types.OdpsSchema(
|
|
@@ -251,6 +257,7 @@ def test_odps_arrow_schema_conversion():
|
|
|
251
257
|
odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
|
|
252
258
|
odps_types.Column("col17", "string"),
|
|
253
259
|
odps_types.Column("col18", "string"),
|
|
260
|
+
odps_types.Column("col19", "decimal(38, 18)"),
|
|
254
261
|
]
|
|
255
262
|
)
|
|
256
263
|
|
|
@@ -264,10 +271,6 @@ def test_odps_arrow_schema_conversion():
|
|
|
264
271
|
|
|
265
272
|
with pytest.raises(TypeError):
|
|
266
273
|
arrow_schema_to_odps_schema(pa.schema([("col1", pa.float16())]))
|
|
267
|
-
with pytest.raises(TypeError):
|
|
268
|
-
odps_schema_to_arrow_schema(
|
|
269
|
-
odps_types.OdpsSchema([odps_types.Column("col1", "json")])
|
|
270
|
-
)
|
|
271
274
|
|
|
272
275
|
|
|
273
276
|
def test_build_column_name():
|
|
@@ -279,7 +282,7 @@ def test_build_column_name():
|
|
|
279
282
|
assert build_table_column_name(4, ("A", 1), records) == "a_1"
|
|
280
283
|
|
|
281
284
|
|
|
282
|
-
@pytest.mark.parametrize("wrap_obj", [
|
|
285
|
+
@pytest.mark.parametrize("wrap_obj", ["no", "yes", "data"])
|
|
283
286
|
def test_build_table_meta(wrap_obj):
|
|
284
287
|
data = pd.DataFrame(
|
|
285
288
|
np.random.rand(100, 7),
|
|
@@ -290,3 +293,42 @@ def test_build_table_meta(wrap_obj):
|
|
|
290
293
|
table_meta = build_dataframe_table_meta(test_df)
|
|
291
294
|
expected_cols = ["a_2", "a_3", "a_0", "a_1_0", "a_1_1", "b", "c"]
|
|
292
295
|
assert table_meta.table_column_names == expected_cols
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
@pytest.mark.skipif(
|
|
299
|
+
pd_release_version[0] < 2, reason="only run under pandas 2.0 or greater"
|
|
300
|
+
)
|
|
301
|
+
def test_table_meta_with_datetime():
|
|
302
|
+
raw_df = pd.DataFrame(
|
|
303
|
+
[
|
|
304
|
+
[1, "abc", "2024-10-01 11:23:12"],
|
|
305
|
+
[3, "uvw", "2024-10-02 22:55:13"],
|
|
306
|
+
],
|
|
307
|
+
columns=["col1", "col2", "col3"],
|
|
308
|
+
)
|
|
309
|
+
df = md.DataFrame(raw_df).astype({"col3": "datetime64[ms]"})
|
|
310
|
+
schema, _ = pandas_to_odps_schema(df, unknown_as_string=True)
|
|
311
|
+
assert schema.columns[3].type == odps_types.datetime
|
|
312
|
+
|
|
313
|
+
raw_series = pd.Series(
|
|
314
|
+
["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
|
|
315
|
+
)
|
|
316
|
+
s = md.Series(raw_series)
|
|
317
|
+
schema, _ = pandas_to_odps_schema(s, unknown_as_string=True)
|
|
318
|
+
assert schema.columns[1].type == odps_types.datetime
|
|
319
|
+
|
|
320
|
+
raw_index = pd.Index(
|
|
321
|
+
["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
|
|
322
|
+
)
|
|
323
|
+
idx = md.Index(raw_index)
|
|
324
|
+
schema, _ = pandas_to_odps_schema(idx, unknown_as_string=True)
|
|
325
|
+
assert schema.columns[0].type == odps_types.datetime
|
|
326
|
+
|
|
327
|
+
src_df = pd.DataFrame(
|
|
328
|
+
[[1, "2024-10-01 11:23:12"], [3, "2024-10-02 22:55:13"]],
|
|
329
|
+
columns=["A", "B"],
|
|
330
|
+
).astype({"B": "datetime64[ms]"})
|
|
331
|
+
raw_multiindex = pd.MultiIndex.from_frame(src_df)
|
|
332
|
+
multiidx = md.Index(raw_multiindex)
|
|
333
|
+
schema, _ = pandas_to_odps_schema(multiidx, unknown_as_string=True)
|
|
334
|
+
assert schema.columns[1].type == odps_types.datetime
|
|
@@ -12,22 +12,37 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import datetime
|
|
16
|
+
|
|
15
17
|
import numpy as np
|
|
16
18
|
import pandas as pd
|
|
17
19
|
import pyarrow as pa
|
|
20
|
+
import pytest
|
|
18
21
|
from odps import ODPS
|
|
19
22
|
|
|
20
|
-
from
|
|
21
|
-
from
|
|
22
|
-
from
|
|
23
|
+
from ....config import options
|
|
24
|
+
from ....tests.utils import flaky, tn
|
|
25
|
+
from ....utils import config_odps_default_options
|
|
26
|
+
from ..tableio import ODPSTableIO
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.fixture
|
|
30
|
+
def switch_table_io(request):
|
|
31
|
+
old_use_common_table = options.use_common_table
|
|
32
|
+
try:
|
|
33
|
+
options.use_common_table = request.param
|
|
34
|
+
yield
|
|
35
|
+
finally:
|
|
36
|
+
options.use_common_table = old_use_common_table
|
|
23
37
|
|
|
24
38
|
|
|
25
39
|
@flaky(max_runs=3)
|
|
26
|
-
|
|
40
|
+
@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
|
|
41
|
+
def test_empty_table_io(switch_table_io):
|
|
27
42
|
config_odps_default_options()
|
|
28
43
|
|
|
29
44
|
o = ODPS.from_environments()
|
|
30
|
-
|
|
45
|
+
table_io = ODPSTableIO(o)
|
|
31
46
|
|
|
32
47
|
# test read from empty table
|
|
33
48
|
empty_table_name = tn("test_empty_table_halo_read")
|
|
@@ -35,42 +50,53 @@ def test_empty_table_io():
|
|
|
35
50
|
tb = o.create_table(empty_table_name, "col1 string", lifecycle=1)
|
|
36
51
|
|
|
37
52
|
try:
|
|
38
|
-
with
|
|
53
|
+
with table_io.open_reader(empty_table_name) as reader:
|
|
39
54
|
assert len(reader.read_all()) == 0
|
|
40
55
|
finally:
|
|
41
56
|
tb.drop()
|
|
42
57
|
|
|
43
58
|
|
|
44
59
|
@flaky(max_runs=3)
|
|
45
|
-
|
|
60
|
+
@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
|
|
61
|
+
def test_table_io_without_parts(switch_table_io):
|
|
46
62
|
config_odps_default_options()
|
|
47
63
|
|
|
48
64
|
o = ODPS.from_environments()
|
|
49
|
-
|
|
65
|
+
table_io = ODPSTableIO(o)
|
|
50
66
|
|
|
51
67
|
# test read and write tables without partition
|
|
52
68
|
no_part_table_name = tn("test_no_part_halo_write")
|
|
53
69
|
o.delete_table(no_part_table_name, if_exists=True)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
)
|
|
70
|
+
col_desc = ",".join(f"{c} double" for c in "abcde") + ", f datetime"
|
|
71
|
+
tb = o.create_table(no_part_table_name, col_desc, lifecycle=1)
|
|
57
72
|
|
|
58
73
|
try:
|
|
59
74
|
pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
|
|
60
|
-
|
|
75
|
+
date_val = [
|
|
76
|
+
(
|
|
77
|
+
datetime.datetime.now().replace(microsecond=0)
|
|
78
|
+
+ datetime.timedelta(seconds=i)
|
|
79
|
+
)
|
|
80
|
+
for i in range(100)
|
|
81
|
+
]
|
|
82
|
+
pd_data["f"] = pd.Series(date_val, dtype="datetime64[ms]").dt.tz_localize(
|
|
83
|
+
options.local_timezone
|
|
84
|
+
)
|
|
85
|
+
with table_io.open_writer(no_part_table_name) as writer:
|
|
61
86
|
writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
|
|
62
|
-
with
|
|
87
|
+
with table_io.open_reader(no_part_table_name) as reader:
|
|
63
88
|
pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
|
|
64
89
|
finally:
|
|
65
90
|
tb.drop()
|
|
66
91
|
|
|
67
92
|
|
|
68
93
|
@flaky(max_runs=3)
|
|
69
|
-
|
|
94
|
+
@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
|
|
95
|
+
def test_table_io_with_range_reader(switch_table_io):
|
|
70
96
|
config_odps_default_options()
|
|
71
97
|
|
|
72
98
|
o = ODPS.from_environments()
|
|
73
|
-
|
|
99
|
+
table_io = ODPSTableIO(o)
|
|
74
100
|
|
|
75
101
|
# test read and write tables without partition
|
|
76
102
|
no_part_table_name = tn("test_no_part_halo_write")
|
|
@@ -81,15 +107,15 @@ def test_table_io_with_range_reader():
|
|
|
81
107
|
|
|
82
108
|
try:
|
|
83
109
|
pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
|
|
84
|
-
with
|
|
110
|
+
with table_io.open_writer(no_part_table_name) as writer:
|
|
85
111
|
writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
|
|
86
112
|
|
|
87
|
-
with
|
|
113
|
+
with table_io.open_reader(
|
|
88
114
|
no_part_table_name, start=None, stop=100, row_batch_size=10
|
|
89
115
|
) as reader:
|
|
90
116
|
pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
|
|
91
117
|
|
|
92
|
-
with
|
|
118
|
+
with table_io.open_reader(
|
|
93
119
|
no_part_table_name,
|
|
94
120
|
start=-2,
|
|
95
121
|
stop=-52,
|
|
@@ -105,11 +131,12 @@ def test_table_io_with_range_reader():
|
|
|
105
131
|
|
|
106
132
|
|
|
107
133
|
@flaky(max_runs=3)
|
|
108
|
-
|
|
134
|
+
@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
|
|
135
|
+
def test_table_io_with_parts(switch_table_io):
|
|
109
136
|
config_odps_default_options()
|
|
110
137
|
|
|
111
138
|
o = ODPS.from_environments()
|
|
112
|
-
|
|
139
|
+
table_io = ODPSTableIO(o)
|
|
113
140
|
|
|
114
141
|
# test read and write tables with partition
|
|
115
142
|
parted_table_name = tn("test_parted_halo_write")
|
|
@@ -122,11 +149,11 @@ def test_table_io_with_parts():
|
|
|
122
149
|
|
|
123
150
|
try:
|
|
124
151
|
pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
|
|
125
|
-
with
|
|
152
|
+
with table_io.open_writer(parted_table_name, "pt=test") as writer:
|
|
126
153
|
writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
|
|
127
|
-
with
|
|
154
|
+
with table_io.open_reader(parted_table_name, "pt=test") as reader:
|
|
128
155
|
pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
|
|
129
|
-
with
|
|
156
|
+
with table_io.open_reader(
|
|
130
157
|
parted_table_name, "pt=test", partition_columns=True
|
|
131
158
|
) as reader:
|
|
132
159
|
expected_data = pd_data.copy()
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
import pytest
|
|
16
16
|
from odps import ODPS
|
|
17
17
|
|
|
18
|
-
from
|
|
18
|
+
from ....tests.utils import tn
|
|
19
19
|
from ..volumeio import ODPSVolumeReader, ODPSVolumeWriter
|
|
20
20
|
|
|
21
21
|
|
|
@@ -69,19 +69,17 @@ def create_volume(request, oss_config):
|
|
|
69
69
|
oss_config.oss_bucket.batch_delete_objects(keys)
|
|
70
70
|
|
|
71
71
|
|
|
72
|
-
@pytest.mark.parametrize("create_volume", ["
|
|
72
|
+
@pytest.mark.parametrize("create_volume", ["external"], indirect=True)
|
|
73
73
|
def test_read_write_volume(create_volume):
|
|
74
74
|
test_vol_dir = "test_vol_dir"
|
|
75
75
|
|
|
76
76
|
odps_entry = ODPS.from_environments()
|
|
77
77
|
|
|
78
78
|
writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
|
|
79
|
-
write_session_id = writer.create_write_session()
|
|
80
79
|
|
|
81
80
|
writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
|
|
82
|
-
writer.write_file("file1", b"content1"
|
|
83
|
-
writer.write_file("file2", b"content2"
|
|
84
|
-
writer.commit(["file1", "file2"], write_session_id)
|
|
81
|
+
writer.write_file("file1", b"content1")
|
|
82
|
+
writer.write_file("file2", b"content2")
|
|
85
83
|
|
|
86
84
|
reader = ODPSVolumeReader(odps_entry, create_volume, test_vol_dir)
|
|
87
85
|
assert reader.read_file("file1") == b"content1"
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import inspect
|
|
16
|
+
from typing import Iterator, List, Optional, Union
|
|
17
|
+
|
|
18
|
+
from odps import ODPS
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ODPSVolumeReader:
|
|
22
|
+
def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
|
|
23
|
+
self._odps_entry = odps_entry
|
|
24
|
+
self._volume = odps_entry.get_volume(volume_name)
|
|
25
|
+
self._volume_dir = volume_dir
|
|
26
|
+
|
|
27
|
+
def list_files(self) -> List[str]:
|
|
28
|
+
def _get_file_name(vol_file):
|
|
29
|
+
if hasattr(vol_file, "name"):
|
|
30
|
+
return vol_file.name
|
|
31
|
+
return vol_file.path.rsplit("/", 1)[-1]
|
|
32
|
+
|
|
33
|
+
return [
|
|
34
|
+
_get_file_name(f)
|
|
35
|
+
for f in self._odps_entry.list_volume_files(
|
|
36
|
+
f"/{self._volume.name}/{self._volume_dir}"
|
|
37
|
+
)
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
def read_file(self, file_name: str) -> bytes:
|
|
41
|
+
with self._volume.open_reader(self._volume_dir + "/" + file_name) as reader:
|
|
42
|
+
return reader.read()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ODPSVolumeWriter:
|
|
46
|
+
def __init__(
|
|
47
|
+
self,
|
|
48
|
+
odps_entry: ODPS,
|
|
49
|
+
volume_name: str,
|
|
50
|
+
volume_dir: str,
|
|
51
|
+
schema_name: Optional[str] = None,
|
|
52
|
+
):
|
|
53
|
+
self._odps_entry = odps_entry
|
|
54
|
+
self._volume = odps_entry.get_volume(volume_name, schema=schema_name)
|
|
55
|
+
self._volume_dir = volume_dir
|
|
56
|
+
|
|
57
|
+
def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
|
|
58
|
+
with self._volume.open_writer(self._volume_dir + "/" + file_name) as writer:
|
|
59
|
+
if not inspect.isgenerator(data):
|
|
60
|
+
writer.write(data)
|
|
61
|
+
else:
|
|
62
|
+
for chunk in data:
|
|
63
|
+
writer.write(chunk)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Copyright 1999-2024 Alibaba Group Holding Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from .connected_components import connected_components
|