maxframe 0.1.0b5-cp310-cp310-win32.whl → 1.0.0-cp310-cp310-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- maxframe/_utils.cp310-win32.pyd +0 -0
- maxframe/codegen.py +10 -4
- maxframe/config/config.py +68 -10
- maxframe/config/validators.py +42 -11
- maxframe/conftest.py +58 -14
- maxframe/core/__init__.py +2 -16
- maxframe/core/entity/__init__.py +1 -12
- maxframe/core/entity/executable.py +1 -1
- maxframe/core/entity/objects.py +46 -45
- maxframe/core/entity/output_types.py +0 -3
- maxframe/core/entity/tests/test_objects.py +43 -0
- maxframe/core/entity/tileables.py +5 -78
- maxframe/core/graph/__init__.py +2 -2
- maxframe/core/graph/builder/__init__.py +0 -1
- maxframe/core/graph/builder/base.py +5 -4
- maxframe/core/graph/builder/tileable.py +4 -4
- maxframe/core/graph/builder/utils.py +4 -8
- maxframe/core/graph/core.cp310-win32.pyd +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/core/graph/entity.py +9 -33
- maxframe/core/operator/__init__.py +2 -9
- maxframe/core/operator/base.py +3 -5
- maxframe/core/operator/objects.py +0 -9
- maxframe/core/operator/utils.py +55 -0
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +7 -33
- maxframe/dataframe/arithmetic/equal.py +4 -2
- maxframe/dataframe/arithmetic/greater.py +4 -2
- maxframe/dataframe/arithmetic/greater_equal.py +4 -2
- maxframe/dataframe/arithmetic/less.py +2 -2
- maxframe/dataframe/arithmetic/less_equal.py +4 -2
- maxframe/dataframe/arithmetic/not_equal.py +4 -2
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
- maxframe/dataframe/core.py +31 -7
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +117 -23
- maxframe/dataframe/datasource/read_odps_table.py +6 -3
- maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +28 -0
- maxframe/dataframe/extensions/__init__.py +5 -0
- maxframe/dataframe/extensions/flatjson.py +131 -0
- maxframe/dataframe/extensions/flatmap.py +317 -0
- maxframe/dataframe/extensions/reshuffle.py +1 -1
- maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
- maxframe/dataframe/groupby/core.py +1 -1
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/fill.py +4 -1
- maxframe/dataframe/groupby/getitem.py +6 -0
- maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
- maxframe/dataframe/groupby/transform.py +5 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/loc.py +6 -4
- maxframe/dataframe/indexing/rename.py +5 -28
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/initializer.py +11 -1
- maxframe/dataframe/merge/__init__.py +9 -1
- maxframe/dataframe/merge/concat.py +41 -31
- maxframe/dataframe/merge/merge.py +237 -3
- maxframe/dataframe/merge/tests/test_merge.py +126 -1
- maxframe/dataframe/misc/apply.py +5 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +8 -8
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/tests/test_misc.py +33 -2
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/operators.py +1 -17
- maxframe/dataframe/reduction/core.py +2 -2
- maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +13 -19
- maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
- maxframe/dataframe/tests/test_initializer.py +33 -2
- maxframe/dataframe/utils.py +26 -11
- maxframe/dataframe/window/expanding.py +5 -3
- maxframe/dataframe/window/tests/test_expanding.py +2 -2
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/io/__init__.py +13 -0
- maxframe/io/objects/__init__.py +24 -0
- maxframe/io/objects/core.py +140 -0
- maxframe/io/objects/tensor.py +76 -0
- maxframe/io/objects/tests/__init__.py +13 -0
- maxframe/io/objects/tests/test_object_io.py +97 -0
- maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
- maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
- maxframe/{odpsio → io/odpsio}/schema.py +38 -16
- maxframe/io/odpsio/tableio.py +719 -0
- maxframe/io/odpsio/tests/__init__.py +13 -0
- maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
- maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
- maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
- maxframe/io/odpsio/volumeio.py +63 -0
- maxframe/learn/contrib/__init__.py +3 -1
- maxframe/learn/contrib/graph/__init__.py +15 -0
- maxframe/learn/contrib/graph/connected_components.py +215 -0
- maxframe/learn/contrib/graph/tests/__init__.py +13 -0
- maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
- maxframe/learn/contrib/llm/__init__.py +16 -0
- maxframe/learn/contrib/llm/core.py +54 -0
- maxframe/learn/contrib/llm/models/__init__.py +14 -0
- maxframe/learn/contrib/llm/models/dashscope.py +73 -0
- maxframe/learn/contrib/llm/multi_modal.py +42 -0
- maxframe/learn/contrib/llm/text.py +42 -0
- maxframe/learn/contrib/xgboost/classifier.py +26 -2
- maxframe/learn/contrib/xgboost/core.py +87 -2
- maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
- maxframe/learn/contrib/xgboost/predict.py +29 -46
- maxframe/learn/contrib/xgboost/regressor.py +3 -10
- maxframe/learn/contrib/xgboost/train.py +29 -18
- maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
- maxframe/lib/mmh3.cp310-win32.pyd +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/sparse/tests/test_sparse.py +15 -15
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/opcodes.py +8 -0
- maxframe/protocol.py +154 -27
- maxframe/remote/core.py +4 -8
- maxframe/serialization/__init__.py +1 -0
- maxframe/serialization/core.cp310-win32.pyd +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +3 -0
- maxframe/serialization/core.pyx +67 -26
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +52 -17
- maxframe/serialization/serializables/core.py +180 -15
- maxframe/serialization/serializables/field_type.py +4 -1
- maxframe/serialization/serializables/tests/test_serializable.py +54 -5
- maxframe/serialization/tests/test_serial.py +2 -1
- maxframe/session.py +9 -2
- maxframe/tensor/__init__.py +81 -2
- maxframe/tensor/arithmetic/isclose.py +1 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
- maxframe/tensor/core.py +5 -136
- maxframe/tensor/datasource/array.py +3 -0
- maxframe/tensor/datasource/full.py +1 -1
- maxframe/tensor/datasource/tests/test_datasource.py +1 -1
- maxframe/tensor/indexing/flatnonzero.py +1 -1
- maxframe/tensor/indexing/getitem.py +2 -0
- maxframe/tensor/merge/__init__.py +2 -0
- maxframe/tensor/merge/concatenate.py +101 -0
- maxframe/tensor/merge/tests/test_merge.py +30 -1
- maxframe/tensor/merge/vstack.py +74 -0
- maxframe/tensor/{base → misc}/__init__.py +2 -0
- maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
- maxframe/tensor/misc/atleast_2d.py +70 -0
- maxframe/tensor/misc/atleast_3d.py +85 -0
- maxframe/tensor/misc/tests/__init__.py +13 -0
- maxframe/tensor/{base → misc}/transpose.py +22 -18
- maxframe/tensor/{base → misc}/unique.py +3 -3
- maxframe/tensor/operators.py +1 -7
- maxframe/tensor/random/core.py +1 -1
- maxframe/tensor/reduction/count_nonzero.py +2 -1
- maxframe/tensor/reduction/mean.py +1 -0
- maxframe/tensor/reduction/nanmean.py +1 -0
- maxframe/tensor/reduction/nanvar.py +2 -0
- maxframe/tensor/reduction/tests/test_reduction.py +12 -1
- maxframe/tensor/reduction/var.py +2 -0
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tensor/utils.py +2 -22
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +17 -2
- maxframe/typing_.py +4 -1
- maxframe/udf.py +8 -9
- maxframe/utils.py +106 -86
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/clients/framedriver.py +4 -1
- maxframe_client/fetcher.py +81 -74
- maxframe_client/session/consts.py +3 -0
- maxframe_client/session/graph.py +8 -2
- maxframe_client/session/odps.py +194 -40
- maxframe_client/session/task.py +94 -39
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +109 -8
- maxframe/core/entity/chunks.py +0 -68
- maxframe/core/entity/fuse.py +0 -73
- maxframe/core/graph/builder/chunk.py +0 -430
- maxframe/odpsio/tableio.py +0 -322
- maxframe/odpsio/volumeio.py +0 -95
- maxframe_client/clients/spe.py +0 -104
- /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
- /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
- /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
- /maxframe/tensor/{base → misc}/astype.py +0 -0
- /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
- /maxframe/tensor/{base → misc}/ravel.py +0 -0
- /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
- /maxframe/tensor/{base → misc}/where.py +0 -0
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0

maxframe/io/odpsio/tests/__init__.py

@@ -0,0 +1,13 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
maxframe/{odpsio → io/odpsio}/tests/test_schema.py

@@ -18,9 +18,10 @@ import pyarrow as pa
 import pytest
 from odps import types as odps_types
 
-from
-from
-from
+from .... import dataframe as md
+from .... import tensor as mt
+from ....core import OutputType
+from ....utils import pd_release_version
 from ..schema import (
     arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
@@ -143,17 +144,17 @@ def test_pandas_to_odps_schema_index(wrap_obj):
     data = pd.Index(np.random.randint(0, 100, 100))
 
     test_idx = _wrap_maxframe_obj(data, wrap=wrap_obj)
-
-
-
-
-
-
-
-
-
-
-
+    for ignore_idx in (False, True):
+        schema, meta = pandas_to_odps_schema(
+            test_idx, unknown_as_string=True, ignore_index=ignore_idx
+        )
+        assert [c.name for c in schema.columns] == ["_idx_0"]
+        assert [c.type.name for c in schema.columns] == ["bigint"]
+        assert meta.type == OutputType.index
+        assert meta.table_column_names == []
+        assert meta.table_index_column_names == ["_idx_0"]
+        assert meta.pd_column_level_names == []
+        assert meta.pd_index_level_names == [None]
 
     data = pd.MultiIndex.from_arrays(
         [np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
@@ -177,6 +178,7 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
     test_scalar = _wrap_maxframe_obj(data, wrap=wrap_obj)
     if wrap_obj != "no":
         test_scalar.op.data = None
+
     schema, meta = pandas_to_odps_schema(test_scalar, unknown_as_string=True)
     assert schema.columns[0].name == "_idx_0"
     assert schema.columns[0].type.name == "double"
@@ -186,9 +188,6 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
     assert meta.pd_column_level_names == []
     assert meta.pd_index_level_names == [None]
 
-    with pytest.raises(AssertionError):
-        pandas_to_odps_schema(test_scalar, unknown_as_string=True, ignore_index=True)
-
 
 def test_odps_arrow_schema_conversion():
     odps_schema = odps_types.OdpsSchema(
@@ -211,10 +210,11 @@ def test_odps_arrow_schema_conversion():
             odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
             odps_types.Column("col17", "CHAR(15)"),
             odps_types.Column("col18", "VARCHAR(15)"),
+            odps_types.Column("col19", "decimal"),
         ]
     )
     arrow_schema = odps_schema_to_arrow_schema(odps_schema)
-    assert arrow_schema.names == [f"col{i}" for i in range(1,
+    assert arrow_schema.names == [f"col{i}" for i in range(1, 20)]
     assert arrow_schema.types == [
         pa.string(),
         pa.binary(),
@@ -234,6 +234,7 @@ def test_odps_arrow_schema_conversion():
         pa.struct([("a1", pa.string()), ("a2", pa.map_(pa.string(), pa.int64()))]),
         pa.string(),
         pa.string(),
+        pa.decimal128(38, 18),
     ]
 
     expected_odps_schema = odps_types.OdpsSchema(
@@ -256,6 +257,7 @@ def test_odps_arrow_schema_conversion():
             odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
             odps_types.Column("col17", "string"),
             odps_types.Column("col18", "string"),
+            odps_types.Column("col19", "decimal(38, 18)"),
         ]
     )
 
@@ -269,10 +271,6 @@ def test_odps_arrow_schema_conversion():
 
     with pytest.raises(TypeError):
         arrow_schema_to_odps_schema(pa.schema([("col1", pa.float16())]))
-    with pytest.raises(TypeError):
-        odps_schema_to_arrow_schema(
-            odps_types.OdpsSchema([odps_types.Column("col1", "json")])
-        )
 
 
 def test_build_column_name():
@@ -295,3 +293,42 @@ def test_build_table_meta(wrap_obj):
     table_meta = build_dataframe_table_meta(test_df)
     expected_cols = ["a_2", "a_3", "a_0", "a_1_0", "a_1_1", "b", "c"]
     assert table_meta.table_column_names == expected_cols
+
+
+@pytest.mark.skipif(
+    pd_release_version[0] < 2, reason="only run under pandas 2.0 or greater"
+)
+def test_table_meta_with_datetime():
+    raw_df = pd.DataFrame(
+        [
+            [1, "abc", "2024-10-01 11:23:12"],
+            [3, "uvw", "2024-10-02 22:55:13"],
+        ],
+        columns=["col1", "col2", "col3"],
+    )
+    df = md.DataFrame(raw_df).astype({"col3": "datetime64[ms]"})
+    schema, _ = pandas_to_odps_schema(df, unknown_as_string=True)
+    assert schema.columns[3].type == odps_types.datetime
+
+    raw_series = pd.Series(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    s = md.Series(raw_series)
+    schema, _ = pandas_to_odps_schema(s, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
+
+    raw_index = pd.Index(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    idx = md.Index(raw_index)
+    schema, _ = pandas_to_odps_schema(idx, unknown_as_string=True)
+    assert schema.columns[0].type == odps_types.datetime
+
+    src_df = pd.DataFrame(
+        [[1, "2024-10-01 11:23:12"], [3, "2024-10-02 22:55:13"]],
+        columns=["A", "B"],
+    ).astype({"B": "datetime64[ms]"})
+    raw_multiindex = pd.MultiIndex.from_frame(src_df)
+    multiidx = md.Index(raw_multiindex)
+    schema, _ = pandas_to_odps_schema(multiidx, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
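The updated schema tests also pin down the new decimal mapping: an ODPS "decimal" column becomes pa.decimal128(38, 18) in Arrow, and converting back yields "decimal(38, 18)". A minimal round-trip sketch using the converters exercised above; the import path follows the odpsio → io/odpsio move, so treat it as an assumption if your layout differs:

import pyarrow as pa
from odps import types as odps_types

from maxframe.io.odpsio.schema import (
    arrow_schema_to_odps_schema,
    odps_schema_to_arrow_schema,
)

# ODPS "decimal" maps to a fixed-precision Arrow decimal, per the updated test
odps_schema = odps_types.OdpsSchema([odps_types.Column("col19", "decimal")])
arrow_schema = odps_schema_to_arrow_schema(odps_schema)
assert arrow_schema.types == [pa.decimal128(38, 18)]

# converting back is expected to produce decimal(38, 18), per expected_odps_schema above
round_tripped = arrow_schema_to_odps_schema(arrow_schema)
print(round_tripped.columns[0].type)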
maxframe/{odpsio → io/odpsio}/tests/test_tableio.py

@@ -12,22 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime
+
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pytest
 from odps import ODPS
 
-from
-from
-from
+from ....config import options
+from ....tests.utils import flaky, tn
+from ....utils import config_odps_default_options
+from ..tableio import ODPSTableIO
+
+
+@pytest.fixture
+def switch_table_io(request):
+    old_use_common_table = options.use_common_table
+    try:
+        options.use_common_table = request.param
+        yield
+    finally:
+        options.use_common_table = old_use_common_table
 
 
 @flaky(max_runs=3)
-
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_empty_table_io(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-
+    table_io = ODPSTableIO(o)
 
     # test read from empty table
     empty_table_name = tn("test_empty_table_halo_read")
@@ -35,42 +50,53 @@ def test_empty_table_io():
     tb = o.create_table(empty_table_name, "col1 string", lifecycle=1)
 
     try:
-        with
+        with table_io.open_reader(empty_table_name) as reader:
             assert len(reader.read_all()) == 0
     finally:
         tb.drop()
 
 
 @flaky(max_runs=3)
-
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_without_parts(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-
+    table_io = ODPSTableIO(o)
 
     # test read and write tables without partition
     no_part_table_name = tn("test_no_part_halo_write")
     o.delete_table(no_part_table_name, if_exists=True)
-
-
-    )
+    col_desc = ",".join(f"{c} double" for c in "abcde") + ", f datetime"
+    tb = o.create_table(no_part_table_name, col_desc, lifecycle=1)
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-
+        date_val = [
+            (
+                datetime.datetime.now().replace(microsecond=0)
+                + datetime.timedelta(seconds=i)
+            )
+            for i in range(100)
+        ]
+        pd_data["f"] = pd.Series(date_val, dtype="datetime64[ms]").dt.tz_localize(
+            options.local_timezone
+        )
+        with table_io.open_writer(no_part_table_name) as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-        with
+        with table_io.open_reader(no_part_table_name) as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
     finally:
         tb.drop()
 
 
 @flaky(max_runs=3)
-
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_with_range_reader(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-
+    table_io = ODPSTableIO(o)
 
     # test read and write tables without partition
     no_part_table_name = tn("test_no_part_halo_write")
@@ -81,15 +107,15 @@ def test_table_io_with_range_reader():
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with
+        with table_io.open_writer(no_part_table_name) as writer:
            writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
 
-        with
+        with table_io.open_reader(
            no_part_table_name, start=None, stop=100, row_batch_size=10
        ) as reader:
            pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
 
-        with
+        with table_io.open_reader(
            no_part_table_name,
            start=-2,
            stop=-52,
@@ -105,11 +131,12 @@
 
 
 @flaky(max_runs=3)
-
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_with_parts(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-
+    table_io = ODPSTableIO(o)
 
     # test read and write tables with partition
     parted_table_name = tn("test_parted_halo_write")
@@ -122,11 +149,11 @@ def test_table_io_with_parts():
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with
+        with table_io.open_writer(parted_table_name, "pt=test") as writer:
            writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-        with
+        with table_io.open_reader(parted_table_name, "pt=test") as reader:
            pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
-        with
+        with table_io.open_reader(
            parted_table_name, "pt=test", partition_columns=True
        ) as reader:
            expected_data = pd_data.copy()
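The new switch_table_io fixture above runs every table-IO test once with options.use_common_table = False and once with True, using pytest's indirect parametrization: @pytest.mark.parametrize(..., indirect=True) routes each parameter value into the fixture through request.param instead of handing it to the test directly. A standalone sketch of the pattern; the FLAGS dict and names here are illustrative, not from maxframe:

import pytest

FLAGS = {"use_common_table": False}  # stand-in for a global options object


@pytest.fixture
def switch_flag(request):
    # request.param carries the value supplied by the parametrize marker
    old = FLAGS["use_common_table"]
    try:
        FLAGS["use_common_table"] = request.param
        yield request.param
    finally:
        FLAGS["use_common_table"] = old  # always restore the previous setting


@pytest.mark.parametrize("switch_flag", [False, True], indirect=True)
def test_runs_under_both_settings(switch_flag):
    # the test body sees the flag already applied by the fixture
    assert FLAGS["use_common_table"] == switch_flag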
maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py

@@ -15,7 +15,7 @@
 import pytest
 from odps import ODPS
 
-from
+from ....tests.utils import tn
 from ..volumeio import ODPSVolumeReader, ODPSVolumeWriter
 
 
@@ -69,19 +69,17 @@ def create_volume(request, oss_config):
     oss_config.oss_bucket.batch_delete_objects(keys)
 
 
-@pytest.mark.parametrize("create_volume", ["
+@pytest.mark.parametrize("create_volume", ["external"], indirect=True)
 def test_read_write_volume(create_volume):
     test_vol_dir = "test_vol_dir"
 
     odps_entry = ODPS.from_environments()
 
     writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
-    write_session_id = writer.create_write_session()
 
     writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
-    writer.write_file("file1", b"content1"
-    writer.write_file("file2", b"content2"
-    writer.commit(["file1", "file2"], write_session_id)
+    writer.write_file("file1", b"content1")
+    writer.write_file("file2", b"content2")
 
     reader = ODPSVolumeReader(odps_entry, create_volume, test_vol_dir)
     assert reader.read_file("file1") == b"content1"
maxframe/io/odpsio/volumeio.py

@@ -0,0 +1,63 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Iterator, List, Optional, Union
+
+from odps import ODPS
+
+
+class ODPSVolumeReader:
+    def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
+        self._odps_entry = odps_entry
+        self._volume = odps_entry.get_volume(volume_name)
+        self._volume_dir = volume_dir
+
+    def list_files(self) -> List[str]:
+        def _get_file_name(vol_file):
+            if hasattr(vol_file, "name"):
+                return vol_file.name
+            return vol_file.path.rsplit("/", 1)[-1]
+
+        return [
+            _get_file_name(f)
+            for f in self._odps_entry.list_volume_files(
+                f"/{self._volume.name}/{self._volume_dir}"
+            )
+        ]
+
+    def read_file(self, file_name: str) -> bytes:
+        with self._volume.open_reader(self._volume_dir + "/" + file_name) as reader:
+            return reader.read()
+
+
+class ODPSVolumeWriter:
+    def __init__(
+        self,
+        odps_entry: ODPS,
+        volume_name: str,
+        volume_dir: str,
+        schema_name: Optional[str] = None,
+    ):
+        self._odps_entry = odps_entry
+        self._volume = odps_entry.get_volume(volume_name, schema=schema_name)
+        self._volume_dir = volume_dir
+
+    def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
+        with self._volume.open_writer(self._volume_dir + "/" + file_name) as writer:
+            if not inspect.isgenerator(data):
+                writer.write(data)
+            else:
+                for chunk in data:
+                    writer.write(chunk)
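A short usage sketch of the reader/writer classes defined above, following the same pattern as the volume tests; the volume name, directory, and file names below are made-up examples, and the volume is assumed to exist already:

from odps import ODPS

from maxframe.io.odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter

o = ODPS.from_environments()

# write two small files into a directory of an existing volume
writer = ODPSVolumeWriter(o, "my_volume", "my_dir")
writer.write_file("part-0", b"hello")
# generators are streamed chunk by chunk instead of being written at once
writer.write_file("part-1", (chunk for chunk in (b"wor", b"ld")))

# read them back
reader = ODPSVolumeReader(o, "my_volume", "my_dir")
print(reader.list_files())         # e.g. ["part-0", "part-1"]
print(reader.read_file("part-0"))  # b"hello"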
maxframe/learn/contrib/graph/__init__.py

@@ -0,0 +1,15 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .connected_components import connected_components
maxframe/learn/contrib/graph/connected_components.py

@@ -0,0 +1,215 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+
+from maxframe import opcodes
+
+from ....core import OutputType
+from ....dataframe.operators import DataFrameOperator, DataFrameOperatorMixin
+from ....dataframe.utils import make_dtypes, parse_index
+from ....serialization.serializables import Int32Field, StringField
+
+
+class DataFrameConnectedComponentsOperator(DataFrameOperator, DataFrameOperatorMixin):
+    _op_type_ = opcodes.CONNECTED_COMPONENTS
+
+    vertex_col1 = StringField("vertex_col1", default=None)
+    vertex_col2 = StringField("vertex_col2", default=None)
+    max_iter = Int32Field("max_iter", default=6)
+
+    def __call__(self, df):
+        node_id_dtype = df.dtypes[self.vertex_col1]
+        dtypes = make_dtypes({"id": node_id_dtype, "component": node_id_dtype})
+        # this will return a dataframe and a bool flag
+        new_dataframe_tileable_kw = {
+            "shape": (np.nan, 2),
+            "index_value": parse_index(pd.RangeIndex(0)),
+            "columns_value": parse_index(dtypes.index, store_data=True),
+            "dtypes": dtypes,
+        }
+        new_scalar_tileable_kw = {"dtype": np.dtype(np.bool_), "shape": ()}
+        return self.new_tileables(
+            [df],
+            kws=[new_dataframe_tileable_kw, new_scalar_tileable_kw],
+        )
+
+    @property
+    def output_limit(self):
+        return 2
+
+
+def connected_components(
+    dataframe, vertex_col1: str, vertex_col2: str, max_iter: int = 6
+):
+    """
+    The connected components algorithm labels each node as belonging to a specific connected component with the ID of
+    its lowest-numbered vertex.
+
+    Parameters
+    ----------
+    dataframe : DataFrame
+        A DataFrame containing the edges of the graph.
+
+    vertex_col1 : str
+        The name of the column in `dataframe` that contains the one of edge vertices. The column value must be an
+        integer.
+
+    vertex_col2 : str
+        The name of the column in `dataframe` that contains the other one of edge vertices. The column value must be an
+        integer.
+
+    max_iter : int
+        The algorithm use large and small star transformation to find all connected components, `max_iter`
+        controls the max round of the iterations before finds all edges. Default is 6.
+
+
+    Returns
+    -------
+    DataFrame
+        Return dataFrame contains all connected component edges by two columns `id` and `component`. `component` is
+        the lowest-numbered vertex in the connected components.
+
+    Notes
+    -------
+    After `execute()`, the dataframe has a bool member `flag` to indicate if the `connected_components` already
+    converged in `max_iter` rounds. `True` means the dataframe already contains all edges of the connected components.
+    If `False` you can run `connected_components` more times to reach the converged state.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import maxframe.dataframe as md
+    >>> import maxframe.learn.contrib.graph.connected_components
+    >>> df = md.DataFrame({'x': [4, 1], 'y': [0, 4]})
+    >>> df.execute()
+       x  y
+    0  4  1
+    1  0  4
+
+    Get connected components with 1 round iteration.
+
+    >>> components, converged = connected_components(df, "x", "y", 1)
+    >>> session.execute(components, converged)
+    >>> components
+       A  B
+    0  1  0
+    1  4  0
+
+    >>> converged
+    True
+
+    Sometimes, a single iteration may not be sufficient to propagate the connectivity of all edges.
+    By default, `connected_components` performs 6 iterations of calculations.
+    If you are unsure whether the connected components have converged, you can check the `flag` variable in
+    the output DataFrame after calling `execute()`.
+
+    >>> df = md.DataFrame({'x': [4, 1, 7, 5, 8, 11, 11], 'y': [0, 4, 4, 7, 7, 9, 13]})
+    >>> df.execute()
+        x   y
+    0   4   0
+    1   1   4
+    2   7   4
+    3   5   7
+    4   8   7
+    5  11   9
+    6  11  13
+
+    >>> components, converged = connected_components(df, "x", "y", 1)
+    >>> session.execute(components, converged)
+    >>> components
+       id  component
+    0   4          0
+    1   7          0
+    2   8          4
+    3  13          9
+    4   1          0
+    5   5          0
+    6  11          9
+
+    If `flag` is True, it means convergence has been achieved.
+
+    >>> converged
+    False
+
+    You can determine whether to continue iterating or to use a larger number of iterations
+    (but not too large, which would result in wasted computational overhead).
+
+    >>> components, converged = connected_components(components, "id", "component", 1)
+    >>> session.execute(components, converged)
+    >>> components
+       id  component
+    0   4          0
+    1   7          0
+    2  13          9
+    3   1          0
+    4   5          0
+    5  11          9
+    6   8          0
+
+    >>> components, converged = connected_components(df, "x", "y")
+    >>> session.execute(components, converged)
+    >>> components
+       id  component
+    0   4          0
+    1   7          0
+    2  13          9
+    3   1          0
+    4   5          0
+    5  11          9
+    6   8          0
+    """
+
+    # Check if vertex columns are provided
+    if not vertex_col1 or not vertex_col2:
+        raise ValueError("Both vertex_col1 and vertex_col2 must be provided.")
+
+    # Check if max_iter is provided and within the valid range
+    if max_iter is None:
+        raise ValueError("max_iter must be provided.")
+    if not (1 <= max_iter <= 50):
+        raise ValueError("max_iter must be an integer between 1 and 50.")
+
+    # Verify that the vertex columns exist in the dataframe
+    missing_cols = [
+        col for col in (vertex_col1, vertex_col2) if col not in dataframe.dtypes
+    ]
+    if missing_cols:
+        raise ValueError(
+            f"The following required columns {missing_cols} are not in {list(dataframe.dtypes.index)}"
+        )
+
+    # Ensure that the vertex columns are of integer type
+    # TODO support string dtype
+    incorrect_dtypes = [
+        col
+        for col in (vertex_col1, vertex_col2)
+        if dataframe[col].dtype != np.dtype("int")
+    ]
+    if incorrect_dtypes:
+        dtypes_str = ", ".join(str(dataframe[col].dtype) for col in incorrect_dtypes)
+        raise ValueError(
+            f"Columns {incorrect_dtypes} should be of integer type, but found {dtypes_str}."
+        )
+
+    op = DataFrameConnectedComponentsOperator(
+        vertex_col1=vertex_col1,
+        vertex_col2=vertex_col2,
+        _output_types=[OutputType.dataframe, OutputType.scalar],
+        max_iter=max_iter,
+    )
+    return op(
+        dataframe,
+    )
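For readers who want to sanity-check the expected output locally, here is a plain-pandas reference of the semantics described in the docstring above: every vertex ends up labeled with the smallest vertex id reachable from it, and vertices that are their own component root are dropped from the result. This is only an illustration; it is not MaxFrame's distributed large-star/small-star implementation and ignores max_iter entirely.

import pandas as pd


def reference_connected_components(edges, col1, col2):
    # start with every vertex labeled by itself
    labels = {v: v for v in pd.concat([edges[col1], edges[col2]]).unique()}
    changed = True
    while changed:  # propagate the minimum label along edges until a fixed point
        changed = False
        for a, b in zip(edges[col1], edges[col2]):
            low = min(labels[a], labels[b])
            for v in (a, b):
                if labels[v] != low:
                    labels[v] = low
                    changed = True
    result = pd.DataFrame({"id": list(labels), "component": list(labels.values())})
    # keep only vertices whose label differs from their own id, as in the docstring examples
    return result[result["id"] != result["component"]].reset_index(drop=True)


edges = pd.DataFrame({"x": [4, 1, 7, 5, 8, 11, 11], "y": [0, 4, 4, 7, 7, 9, 13]})
print(reference_connected_components(edges, "x", "y"))
# produces the same id/component pairs as the converged example in the docstring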
maxframe/learn/contrib/graph/tests/__init__.py

@@ -0,0 +1,13 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.