maxframe-0.1.0b5-cp310-cp310-macosx_10_9_universal2.whl → maxframe-1.0.0-cp310-cp310-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of maxframe might be problematic.

Files changed (203)
  1. maxframe/_utils.cpython-310-darwin.so +0 -0
  2. maxframe/codegen.py +10 -4
  3. maxframe/config/config.py +68 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +58 -14
  6. maxframe/core/__init__.py +2 -16
  7. maxframe/core/entity/__init__.py +1 -12
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +46 -45
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cpython-310-darwin.so +0 -0
  19. maxframe/core/graph/core.pyx +4 -4
  20. maxframe/core/graph/entity.py +9 -33
  21. maxframe/core/operator/__init__.py +2 -9
  22. maxframe/core/operator/base.py +3 -5
  23. maxframe/core/operator/objects.py +0 -9
  24. maxframe/core/operator/utils.py +55 -0
  25. maxframe/dataframe/__init__.py +1 -1
  26. maxframe/dataframe/arithmetic/around.py +5 -17
  27. maxframe/dataframe/arithmetic/core.py +15 -7
  28. maxframe/dataframe/arithmetic/docstring.py +7 -33
  29. maxframe/dataframe/arithmetic/equal.py +4 -2
  30. maxframe/dataframe/arithmetic/greater.py +4 -2
  31. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  32. maxframe/dataframe/arithmetic/less.py +2 -2
  33. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  34. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  36. maxframe/dataframe/core.py +31 -7
  37. maxframe/dataframe/datasource/date_range.py +2 -2
  38. maxframe/dataframe/datasource/read_odps_query.py +117 -23
  39. maxframe/dataframe/datasource/read_odps_table.py +6 -3
  40. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  41. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  42. maxframe/dataframe/datastore/to_odps.py +28 -0
  43. maxframe/dataframe/extensions/__init__.py +5 -0
  44. maxframe/dataframe/extensions/flatjson.py +131 -0
  45. maxframe/dataframe/extensions/flatmap.py +317 -0
  46. maxframe/dataframe/extensions/reshuffle.py +1 -1
  47. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  48. maxframe/dataframe/groupby/core.py +1 -1
  49. maxframe/dataframe/groupby/cum.py +0 -1
  50. maxframe/dataframe/groupby/fill.py +4 -1
  51. maxframe/dataframe/groupby/getitem.py +6 -0
  52. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  53. maxframe/dataframe/groupby/transform.py +5 -1
  54. maxframe/dataframe/indexing/align.py +1 -1
  55. maxframe/dataframe/indexing/loc.py +6 -4
  56. maxframe/dataframe/indexing/rename.py +5 -28
  57. maxframe/dataframe/indexing/sample.py +0 -1
  58. maxframe/dataframe/indexing/set_index.py +68 -1
  59. maxframe/dataframe/initializer.py +11 -1
  60. maxframe/dataframe/merge/__init__.py +9 -1
  61. maxframe/dataframe/merge/concat.py +41 -31
  62. maxframe/dataframe/merge/merge.py +237 -3
  63. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  64. maxframe/dataframe/misc/apply.py +5 -10
  65. maxframe/dataframe/misc/case_when.py +1 -1
  66. maxframe/dataframe/misc/describe.py +2 -2
  67. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  68. maxframe/dataframe/misc/eval.py +4 -0
  69. maxframe/dataframe/misc/memory_usage.py +2 -2
  70. maxframe/dataframe/misc/pct_change.py +1 -83
  71. maxframe/dataframe/misc/tests/test_misc.py +33 -2
  72. maxframe/dataframe/misc/transform.py +1 -30
  73. maxframe/dataframe/misc/value_counts.py +4 -17
  74. maxframe/dataframe/missing/dropna.py +1 -1
  75. maxframe/dataframe/missing/fillna.py +5 -5
  76. maxframe/dataframe/operators.py +1 -17
  77. maxframe/dataframe/reduction/core.py +2 -2
  78. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  79. maxframe/dataframe/sort/sort_values.py +1 -11
  80. maxframe/dataframe/statistics/corr.py +3 -3
  81. maxframe/dataframe/statistics/quantile.py +13 -19
  82. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  83. maxframe/dataframe/tests/test_initializer.py +33 -2
  84. maxframe/dataframe/utils.py +26 -11
  85. maxframe/dataframe/window/expanding.py +5 -3
  86. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  87. maxframe/errors.py +13 -0
  88. maxframe/extension.py +12 -0
  89. maxframe/io/__init__.py +13 -0
  90. maxframe/io/objects/__init__.py +24 -0
  91. maxframe/io/objects/core.py +140 -0
  92. maxframe/io/objects/tensor.py +76 -0
  93. maxframe/io/objects/tests/__init__.py +13 -0
  94. maxframe/io/objects/tests/test_object_io.py +97 -0
  95. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  96. maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
  97. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  98. maxframe/io/odpsio/tableio.py +719 -0
  99. maxframe/io/odpsio/tests/__init__.py +13 -0
  100. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
  101. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  102. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  103. maxframe/io/odpsio/volumeio.py +63 -0
  104. maxframe/learn/contrib/__init__.py +3 -1
  105. maxframe/learn/contrib/graph/__init__.py +15 -0
  106. maxframe/learn/contrib/graph/connected_components.py +215 -0
  107. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  108. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  109. maxframe/learn/contrib/llm/__init__.py +16 -0
  110. maxframe/learn/contrib/llm/core.py +54 -0
  111. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  112. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  113. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  114. maxframe/learn/contrib/llm/text.py +42 -0
  115. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  116. maxframe/learn/contrib/xgboost/core.py +87 -2
  117. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  118. maxframe/learn/contrib/xgboost/predict.py +29 -46
  119. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  120. maxframe/learn/contrib/xgboost/train.py +29 -18
  121. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  122. maxframe/lib/mmh3.cpython-310-darwin.so +0 -0
  123. maxframe/lib/mmh3.pyi +43 -0
  124. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  125. maxframe/lib/wrapped_pickle.py +2 -1
  126. maxframe/opcodes.py +8 -0
  127. maxframe/protocol.py +154 -27
  128. maxframe/remote/core.py +4 -8
  129. maxframe/serialization/__init__.py +1 -0
  130. maxframe/serialization/core.cpython-310-darwin.so +0 -0
  131. maxframe/serialization/core.pxd +3 -0
  132. maxframe/serialization/core.pyi +3 -0
  133. maxframe/serialization/core.pyx +67 -26
  134. maxframe/serialization/exception.py +1 -1
  135. maxframe/serialization/pandas.py +52 -17
  136. maxframe/serialization/serializables/core.py +180 -15
  137. maxframe/serialization/serializables/field_type.py +4 -1
  138. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  139. maxframe/serialization/tests/test_serial.py +2 -1
  140. maxframe/session.py +9 -2
  141. maxframe/tensor/__init__.py +81 -2
  142. maxframe/tensor/arithmetic/isclose.py +1 -0
  143. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  144. maxframe/tensor/core.py +5 -136
  145. maxframe/tensor/datasource/array.py +3 -0
  146. maxframe/tensor/datasource/full.py +1 -1
  147. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  148. maxframe/tensor/indexing/flatnonzero.py +1 -1
  149. maxframe/tensor/indexing/getitem.py +2 -0
  150. maxframe/tensor/merge/__init__.py +2 -0
  151. maxframe/tensor/merge/concatenate.py +101 -0
  152. maxframe/tensor/merge/tests/test_merge.py +30 -1
  153. maxframe/tensor/merge/vstack.py +74 -0
  154. maxframe/tensor/{base → misc}/__init__.py +2 -0
  155. maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
  156. maxframe/tensor/misc/atleast_2d.py +70 -0
  157. maxframe/tensor/misc/atleast_3d.py +85 -0
  158. maxframe/tensor/misc/tests/__init__.py +13 -0
  159. maxframe/tensor/{base → misc}/transpose.py +22 -18
  160. maxframe/tensor/{base → misc}/unique.py +3 -3
  161. maxframe/tensor/operators.py +1 -7
  162. maxframe/tensor/random/core.py +1 -1
  163. maxframe/tensor/reduction/count_nonzero.py +2 -1
  164. maxframe/tensor/reduction/mean.py +1 -0
  165. maxframe/tensor/reduction/nanmean.py +1 -0
  166. maxframe/tensor/reduction/nanvar.py +2 -0
  167. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  168. maxframe/tensor/reduction/var.py +2 -0
  169. maxframe/tensor/statistics/quantile.py +2 -2
  170. maxframe/tensor/utils.py +2 -22
  171. maxframe/tests/test_protocol.py +34 -0
  172. maxframe/tests/test_utils.py +0 -12
  173. maxframe/tests/utils.py +17 -2
  174. maxframe/typing_.py +4 -1
  175. maxframe/udf.py +8 -9
  176. maxframe/utils.py +106 -86
  177. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  178. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
  179. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  180. maxframe_client/__init__.py +0 -1
  181. maxframe_client/clients/framedriver.py +4 -1
  182. maxframe_client/fetcher.py +81 -74
  183. maxframe_client/session/consts.py +3 -0
  184. maxframe_client/session/graph.py +8 -2
  185. maxframe_client/session/odps.py +194 -40
  186. maxframe_client/session/task.py +94 -39
  187. maxframe_client/tests/test_fetcher.py +21 -3
  188. maxframe_client/tests/test_session.py +109 -8
  189. maxframe/core/entity/chunks.py +0 -68
  190. maxframe/core/entity/fuse.py +0 -73
  191. maxframe/core/graph/builder/chunk.py +0 -430
  192. maxframe/odpsio/tableio.py +0 -322
  193. maxframe/odpsio/volumeio.py +0 -95
  194. maxframe_client/clients/spe.py +0 -104
  195. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  196. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  197. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  198. /maxframe/tensor/{base → misc}/astype.py +0 -0
  199. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  200. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  201. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  202. /maxframe/tensor/{base → misc}/where.py +0 -0
  203. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
maxframe/io/odpsio/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
maxframe/{odpsio → io/odpsio}/tests/test_schema.py
@@ -18,9 +18,10 @@ import pyarrow as pa
 import pytest
 from odps import types as odps_types
 
-from ... import dataframe as md
-from ... import tensor as mt
-from ...core import OutputType
+from .... import dataframe as md
+from .... import tensor as mt
+from ....core import OutputType
+from ....utils import pd_release_version
 
 from ..schema import (
     arrow_schema_to_odps_schema,
     build_dataframe_table_meta,
@@ -143,17 +144,17 @@ def test_pandas_to_odps_schema_index(wrap_obj):
     data = pd.Index(np.random.randint(0, 100, 100))
 
     test_idx = _wrap_maxframe_obj(data, wrap=wrap_obj)
-    schema, meta = pandas_to_odps_schema(test_idx, unknown_as_string=True)
-    assert [c.name for c in schema.columns] == ["_idx_0"]
-    assert [c.type.name for c in schema.columns] == ["bigint"]
-    assert meta.type == OutputType.index
-    assert meta.table_column_names == []
-    assert meta.table_index_column_names == ["_idx_0"]
-    assert meta.pd_column_level_names == []
-    assert meta.pd_index_level_names == [None]
-
-    with pytest.raises(AssertionError):
-        pandas_to_odps_schema(test_idx, unknown_as_string=True, ignore_index=True)
+    for ignore_idx in (False, True):
+        schema, meta = pandas_to_odps_schema(
+            test_idx, unknown_as_string=True, ignore_index=ignore_idx
+        )
+        assert [c.name for c in schema.columns] == ["_idx_0"]
+        assert [c.type.name for c in schema.columns] == ["bigint"]
+        assert meta.type == OutputType.index
+        assert meta.table_column_names == []
+        assert meta.table_index_column_names == ["_idx_0"]
+        assert meta.pd_column_level_names == []
+        assert meta.pd_index_level_names == [None]
 
     data = pd.MultiIndex.from_arrays(
         [np.random.choice(list("ABC"), 100), np.random.randint(0, 10, 100)],
@@ -177,6 +178,7 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
     test_scalar = _wrap_maxframe_obj(data, wrap=wrap_obj)
     if wrap_obj != "no":
         test_scalar.op.data = None
+
     schema, meta = pandas_to_odps_schema(test_scalar, unknown_as_string=True)
     assert schema.columns[0].name == "_idx_0"
     assert schema.columns[0].type.name == "double"
@@ -186,9 +188,6 @@ def test_pandas_to_odps_schema_scalar(wrap_obj):
     assert meta.pd_column_level_names == []
     assert meta.pd_index_level_names == [None]
 
-    with pytest.raises(AssertionError):
-        pandas_to_odps_schema(test_scalar, unknown_as_string=True, ignore_index=True)
-
 
 def test_odps_arrow_schema_conversion():
     odps_schema = odps_types.OdpsSchema(
@@ -211,10 +210,11 @@ def test_odps_arrow_schema_conversion():
             odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
             odps_types.Column("col17", "CHAR(15)"),
             odps_types.Column("col18", "VARCHAR(15)"),
+            odps_types.Column("col19", "decimal"),
         ]
     )
     arrow_schema = odps_schema_to_arrow_schema(odps_schema)
-    assert arrow_schema.names == [f"col{i}" for i in range(1, 19)]
+    assert arrow_schema.names == [f"col{i}" for i in range(1, 20)]
     assert arrow_schema.types == [
         pa.string(),
         pa.binary(),
@@ -234,6 +234,7 @@ def test_odps_arrow_schema_conversion():
         pa.struct([("a1", pa.string()), ("a2", pa.map_(pa.string(), pa.int64()))]),
         pa.string(),
        pa.string(),
+        pa.decimal128(38, 18),
     ]
 
     expected_odps_schema = odps_types.OdpsSchema(
@@ -256,6 +257,7 @@ def test_odps_arrow_schema_conversion():
             odps_types.Column("col16", "struct<a1: string, a2: map<string, bigint>>"),
             odps_types.Column("col17", "string"),
             odps_types.Column("col18", "string"),
+            odps_types.Column("col19", "decimal(38, 18)"),
         ]
     )
 
@@ -269,10 +271,6 @@ def test_odps_arrow_schema_conversion():
 
     with pytest.raises(TypeError):
         arrow_schema_to_odps_schema(pa.schema([("col1", pa.float16())]))
-    with pytest.raises(TypeError):
-        odps_schema_to_arrow_schema(
-            odps_types.OdpsSchema([odps_types.Column("col1", "json")])
-        )
 
 
 def test_build_column_name():
@@ -295,3 +293,42 @@ def test_build_table_meta(wrap_obj):
     table_meta = build_dataframe_table_meta(test_df)
     expected_cols = ["a_2", "a_3", "a_0", "a_1_0", "a_1_1", "b", "c"]
     assert table_meta.table_column_names == expected_cols
+
+
+@pytest.mark.skipif(
+    pd_release_version[0] < 2, reason="only run under pandas 2.0 or greater"
+)
+def test_table_meta_with_datetime():
+    raw_df = pd.DataFrame(
+        [
+            [1, "abc", "2024-10-01 11:23:12"],
+            [3, "uvw", "2024-10-02 22:55:13"],
+        ],
+        columns=["col1", "col2", "col3"],
+    )
+    df = md.DataFrame(raw_df).astype({"col3": "datetime64[ms]"})
+    schema, _ = pandas_to_odps_schema(df, unknown_as_string=True)
+    assert schema.columns[3].type == odps_types.datetime
+
+    raw_series = pd.Series(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    s = md.Series(raw_series)
+    schema, _ = pandas_to_odps_schema(s, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
+
+    raw_index = pd.Index(
+        ["2024-10-01 11:23:12", "2024-10-02 22:55:13"], dtype="datetime64[ms]"
+    )
+    idx = md.Index(raw_index)
+    schema, _ = pandas_to_odps_schema(idx, unknown_as_string=True)
+    assert schema.columns[0].type == odps_types.datetime
+
+    src_df = pd.DataFrame(
+        [[1, "2024-10-01 11:23:12"], [3, "2024-10-02 22:55:13"]],
+        columns=["A", "B"],
+    ).astype({"B": "datetime64[ms]"})
+    raw_multiindex = pd.MultiIndex.from_frame(src_df)
+    multiidx = md.Index(raw_multiindex)
+    schema, _ = pandas_to_odps_schema(multiidx, unknown_as_string=True)
+    assert schema.columns[1].type == odps_types.datetime
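Note on the `col19` changes above: an ODPS `decimal` column declared without precision and scale maps to a 128-bit Arrow decimal carrying MaxCompute's defaults, and converting back makes those defaults explicit, which is why the expected ODPS schema reads `decimal(38, 18)` rather than bare `decimal`. A minimal pyarrow-only sketch of the mapping the test asserts (it does not touch maxframe's converters):

    import pyarrow as pa

    # ODPS "decimal" defaults to precision 38 and scale 18; the Arrow
    # counterpart asserted by the test is a 128-bit decimal type.
    arrow_type = pa.decimal128(38, 18)
    assert (arrow_type.precision, arrow_type.scale) == (38, 18)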
maxframe/{odpsio → io/odpsio}/tests/test_tableio.py
@@ -12,22 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import datetime
+
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pytest
 from odps import ODPS
 
-from ...tests.utils import flaky, tn
-from ...utils import config_odps_default_options
-from ..tableio import HaloTableIO
+from ....config import options
+from ....tests.utils import flaky, tn
+from ....utils import config_odps_default_options
+from ..tableio import ODPSTableIO
+
+
+@pytest.fixture
+def switch_table_io(request):
+    old_use_common_table = options.use_common_table
+    try:
+        options.use_common_table = request.param
+        yield
+    finally:
+        options.use_common_table = old_use_common_table
 
 
 @flaky(max_runs=3)
-def test_empty_table_io():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_empty_table_io(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-    halo_table_io = HaloTableIO(o)
+    table_io = ODPSTableIO(o)
 
     # test read from empty table
     empty_table_name = tn("test_empty_table_halo_read")
@@ -35,42 +50,53 @@ def test_empty_table_io():
     tb = o.create_table(empty_table_name, "col1 string", lifecycle=1)
 
     try:
-        with halo_table_io.open_reader(empty_table_name) as reader:
+        with table_io.open_reader(empty_table_name) as reader:
             assert len(reader.read_all()) == 0
     finally:
         tb.drop()
 
 
 @flaky(max_runs=3)
-def test_table_io_without_parts():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_without_parts(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-    halo_table_io = HaloTableIO(o)
+    table_io = ODPSTableIO(o)
 
     # test read and write tables without partition
     no_part_table_name = tn("test_no_part_halo_write")
     o.delete_table(no_part_table_name, if_exists=True)
-    tb = o.create_table(
-        no_part_table_name, ",".join(f"{c} double" for c in "abcde"), lifecycle=1
-    )
+    col_desc = ",".join(f"{c} double" for c in "abcde") + ", f datetime"
+    tb = o.create_table(no_part_table_name, col_desc, lifecycle=1)
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with halo_table_io.open_writer(no_part_table_name) as writer:
+        date_val = [
+            (
+                datetime.datetime.now().replace(microsecond=0)
+                + datetime.timedelta(seconds=i)
+            )
+            for i in range(100)
+        ]
+        pd_data["f"] = pd.Series(date_val, dtype="datetime64[ms]").dt.tz_localize(
+            options.local_timezone
+        )
+        with table_io.open_writer(no_part_table_name) as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-        with halo_table_io.open_reader(no_part_table_name) as reader:
+        with table_io.open_reader(no_part_table_name) as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
     finally:
         tb.drop()
 
 
 @flaky(max_runs=3)
-def test_table_io_with_range_reader():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_with_range_reader(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-    halo_table_io = HaloTableIO(o)
+    table_io = ODPSTableIO(o)
 
     # test read and write tables without partition
     no_part_table_name = tn("test_no_part_halo_write")
@@ -81,15 +107,15 @@ def test_table_io_with_range_reader():
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with halo_table_io.open_writer(no_part_table_name) as writer:
+        with table_io.open_writer(no_part_table_name) as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
 
-        with halo_table_io.open_reader(
+        with table_io.open_reader(
             no_part_table_name, start=None, stop=100, row_batch_size=10
         ) as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
 
-        with halo_table_io.open_reader(
+        with table_io.open_reader(
             no_part_table_name,
             start=-2,
             stop=-52,
@@ -105,11 +131,12 @@
 
 
 @flaky(max_runs=3)
-def test_table_io_with_parts():
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_with_parts(switch_table_io):
     config_odps_default_options()
 
     o = ODPS.from_environments()
-    halo_table_io = HaloTableIO(o)
+    table_io = ODPSTableIO(o)
 
     # test read and write tables with partition
     parted_table_name = tn("test_parted_halo_write")
@@ -122,11 +149,11 @@ def test_table_io_with_parts():
 
     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with halo_table_io.open_writer(parted_table_name, "pt=test") as writer:
+        with table_io.open_writer(parted_table_name, "pt=test") as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-        with halo_table_io.open_reader(parted_table_name, "pt=test") as reader:
+        with table_io.open_reader(parted_table_name, "pt=test") as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
-        with halo_table_io.open_reader(
+        with table_io.open_reader(
             parted_table_name, "pt=test", partition_columns=True
         ) as reader:
             expected_data = pd_data.copy()
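Note on the new `switch_table_io` fixture above: it relies on pytest's indirect parametrization, where `@pytest.mark.parametrize(..., indirect=True)` routes each parameter value into the fixture's `request.param` instead of the test argument, so every test body runs once with `options.use_common_table = False` and once with `True`. A self-contained sketch of the same pattern (the `_Options` object is a stand-in, not maxframe's config):

    import pytest

    class _Options:  # stand-in for maxframe.config.options
        use_common_table = False

    options = _Options()

    @pytest.fixture
    def switch_table_io(request):
        old = options.use_common_table
        try:
            options.use_common_table = request.param  # value arrives indirectly
            yield
        finally:
            options.use_common_table = old

    @pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
    def test_option_is_switched(switch_table_io):
        assert isinstance(options.use_common_table, bool)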
maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py
@@ -15,7 +15,7 @@
 import pytest
 from odps import ODPS
 
-from ...tests.utils import tn
+from ....tests.utils import tn
 from ..volumeio import ODPSVolumeReader, ODPSVolumeWriter
 
 
@@ -69,19 +69,17 @@ def create_volume(request, oss_config):
         oss_config.oss_bucket.batch_delete_objects(keys)
 
 
-@pytest.mark.parametrize("create_volume", ["parted", "external"], indirect=True)
+@pytest.mark.parametrize("create_volume", ["external"], indirect=True)
 def test_read_write_volume(create_volume):
     test_vol_dir = "test_vol_dir"
 
     odps_entry = ODPS.from_environments()
 
     writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
-    write_session_id = writer.create_write_session()
 
     writer = ODPSVolumeWriter(odps_entry, create_volume, test_vol_dir)
-    writer.write_file("file1", b"content1", write_session_id)
-    writer.write_file("file2", b"content2", write_session_id)
-    writer.commit(["file1", "file2"], write_session_id)
+    writer.write_file("file1", b"content1")
+    writer.write_file("file2", b"content2")
 
     reader = ODPSVolumeReader(odps_entry, create_volume, test_vol_dir)
     assert reader.read_file("file1") == b"content1"
maxframe/io/odpsio/volumeio.py
@@ -0,0 +1,63 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Iterator, List, Optional, Union
+
+from odps import ODPS
+
+
+class ODPSVolumeReader:
+    def __init__(self, odps_entry: ODPS, volume_name: str, volume_dir: str):
+        self._odps_entry = odps_entry
+        self._volume = odps_entry.get_volume(volume_name)
+        self._volume_dir = volume_dir
+
+    def list_files(self) -> List[str]:
+        def _get_file_name(vol_file):
+            if hasattr(vol_file, "name"):
+                return vol_file.name
+            return vol_file.path.rsplit("/", 1)[-1]
+
+        return [
+            _get_file_name(f)
+            for f in self._odps_entry.list_volume_files(
+                f"/{self._volume.name}/{self._volume_dir}"
+            )
+        ]
+
+    def read_file(self, file_name: str) -> bytes:
+        with self._volume.open_reader(self._volume_dir + "/" + file_name) as reader:
+            return reader.read()
+
+
+class ODPSVolumeWriter:
+    def __init__(
+        self,
+        odps_entry: ODPS,
+        volume_name: str,
+        volume_dir: str,
+        schema_name: Optional[str] = None,
+    ):
+        self._odps_entry = odps_entry
+        self._volume = odps_entry.get_volume(volume_name, schema=schema_name)
+        self._volume_dir = volume_dir
+
+    def write_file(self, file_name: str, data: Union[bytes, Iterator[bytes]]):
+        with self._volume.open_writer(self._volume_dir + "/" + file_name) as writer:
+            if not inspect.isgenerator(data):
+                writer.write(data)
+            else:
+                for chunk in data:
+                    writer.write(chunk)
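Compared with the removed `maxframe/odpsio/volumeio.py`, the writer above no longer needs an explicit write session or commit step. Going only by the classes shown in this hunk, usage reduces to construct, write, read; a hedged sketch (the volume name `my_volume` and directory `some_dir` are placeholders, and an ODPS project reachable through environment variables is assumed):

    from odps import ODPS

    from maxframe.io.odpsio.volumeio import ODPSVolumeReader, ODPSVolumeWriter

    o = ODPS.from_environments()  # credentials taken from environment variables

    writer = ODPSVolumeWriter(o, "my_volume", "some_dir")
    writer.write_file("file1", b"content1")                # plain bytes
    writer.write_file("file2", (c for c in (b"a", b"b")))  # generator of chunks

    reader = ODPSVolumeReader(o, "my_volume", "some_dir")
    assert reader.read_file("file1") == b"content1"
    print(reader.list_files())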
maxframe/learn/contrib/__init__.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from . import pytorch
+from . import graph, llm, pytorch
 
+del graph
+del llm
 del pytorch
maxframe/learn/contrib/graph/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .connected_components import connected_components
maxframe/learn/contrib/graph/connected_components.py
@@ -0,0 +1,215 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+
+from maxframe import opcodes
+
+from ....core import OutputType
+from ....dataframe.operators import DataFrameOperator, DataFrameOperatorMixin
+from ....dataframe.utils import make_dtypes, parse_index
+from ....serialization.serializables import Int32Field, StringField
+
+
+class DataFrameConnectedComponentsOperator(DataFrameOperator, DataFrameOperatorMixin):
+    _op_type_ = opcodes.CONNECTED_COMPONENTS
+
+    vertex_col1 = StringField("vertex_col1", default=None)
+    vertex_col2 = StringField("vertex_col2", default=None)
+    max_iter = Int32Field("max_iter", default=6)
+
+    def __call__(self, df):
+        node_id_dtype = df.dtypes[self.vertex_col1]
+        dtypes = make_dtypes({"id": node_id_dtype, "component": node_id_dtype})
+        # this operator returns a dataframe and a bool convergence flag
+        new_dataframe_tileable_kw = {
+            "shape": (np.nan, 2),
+            "index_value": parse_index(pd.RangeIndex(0)),
+            "columns_value": parse_index(dtypes.index, store_data=True),
+            "dtypes": dtypes,
+        }
+        new_scalar_tileable_kw = {"dtype": np.dtype(np.bool_), "shape": ()}
+        return self.new_tileables(
+            [df],
+            kws=[new_dataframe_tileable_kw, new_scalar_tileable_kw],
+        )
+
+    @property
+    def output_limit(self):
+        return 2
+
+
+def connected_components(
+    dataframe, vertex_col1: str, vertex_col2: str, max_iter: int = 6
+):
+    """
+    The connected components algorithm labels each node as belonging to a specific
+    connected component with the ID of its lowest-numbered vertex.
+
+    Parameters
+    ----------
+    dataframe : DataFrame
+        A DataFrame containing the edges of the graph.
+
+    vertex_col1 : str
+        The name of the column in `dataframe` that contains one of the edge
+        vertices. The column values must be integers.
+
+    vertex_col2 : str
+        The name of the column in `dataframe` that contains the other edge
+        vertex. The column values must be integers.
+
+    max_iter : int
+        The algorithm uses large-star and small-star transformations to find all
+        connected components; `max_iter` controls the maximum number of iteration
+        rounds before all edges are found. Default is 6.
+
+    Returns
+    -------
+    DataFrame
+        A DataFrame containing all connected component edges in two columns,
+        `id` and `component`, where `component` is the lowest-numbered vertex
+        of the connected component.
+
+    Notes
+    -----
+    After `execute()`, the dataframe has a bool member `flag` indicating whether
+    `connected_components` has converged within `max_iter` rounds. `True` means
+    the dataframe already contains all edges of the connected components. If it
+    is `False`, you can run `connected_components` again to reach the converged
+    state.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import maxframe.dataframe as md
+    >>> from maxframe.learn.contrib.graph import connected_components
+    >>> df = md.DataFrame({'x': [4, 1], 'y': [0, 4]})
+    >>> df.execute()
+       x  y
+    0  4  0
+    1  1  4
+
+    Get connected components with one round of iteration:
+
+    >>> components, converged = connected_components(df, "x", "y", 1)
+    >>> session.execute(components, converged)
+    >>> components
+       id  component
+    0   1          0
+    1   4          0
+
+    >>> converged
+    True
+
+    Sometimes a single iteration may not be sufficient to propagate the
+    connectivity of all edges. By default, `connected_components` performs 6
+    rounds of iterations. If you are unsure whether the connected components
+    have converged, you can check the `flag` variable in the output DataFrame
+    after calling `execute()`.
+
+    >>> df = md.DataFrame({'x': [4, 1, 7, 5, 8, 11, 11], 'y': [0, 4, 4, 7, 7, 9, 13]})
+    >>> df.execute()
+        x   y
+    0   4   0
+    1   1   4
+    2   7   4
+    3   5   7
+    4   8   7
+    5  11   9
+    6  11  13
+
+    >>> components, converged = connected_components(df, "x", "y", 1)
+    >>> session.execute(components, converged)
+    >>> components
+       id  component
+    0   4          0
+    1   7          0
+    2   8          4
+    3  13          9
+    4   1          0
+    5   5          0
+    6  11          9
+
+    A `True` flag means convergence has been achieved; here it has not:
+
+    >>> converged
+    False
+
+    You can then decide whether to continue iterating or to use a larger number
+    of iterations (though not too large a number, which would waste computation).
+
+    >>> components, converged = connected_components(components, "id", "component", 1)
+    >>> session.execute(components, converged)
+    >>> components
+       id  component
+    0   4          0
+    1   7          0
+    2  13          9
+    3   1          0
+    4   5          0
+    5  11          9
+    6   8          0
+
+    >>> components, converged = connected_components(df, "x", "y")
+    >>> session.execute(components, converged)
+    >>> components
+       id  component
+    0   4          0
+    1   7          0
+    2  13          9
+    3   1          0
+    4   5          0
+    5  11          9
+    6   8          0
+    """
+
+    # Check if vertex columns are provided
+    if not vertex_col1 or not vertex_col2:
+        raise ValueError("Both vertex_col1 and vertex_col2 must be provided.")
+
+    # Check if max_iter is provided and within the valid range
+    if max_iter is None:
+        raise ValueError("max_iter must be provided.")
+    if not (1 <= max_iter <= 50):
+        raise ValueError("max_iter must be an integer between 1 and 50.")
+
+    # Verify that the vertex columns exist in the dataframe
+    missing_cols = [
+        col for col in (vertex_col1, vertex_col2) if col not in dataframe.dtypes
+    ]
+    if missing_cols:
+        raise ValueError(
+            f"The following required columns {missing_cols} are not in {list(dataframe.dtypes.index)}"
+        )
+
+    # Ensure that the vertex columns are of integer type
+    # TODO support string dtype
+    incorrect_dtypes = [
+        col
+        for col in (vertex_col1, vertex_col2)
+        if dataframe[col].dtype != np.dtype("int")
+    ]
+    if incorrect_dtypes:
+        dtypes_str = ", ".join(str(dataframe[col].dtype) for col in incorrect_dtypes)
+        raise ValueError(
+            f"Columns {incorrect_dtypes} should be of integer type, but found {dtypes_str}."
+        )
+
+    op = DataFrameConnectedComponentsOperator(
+        vertex_col1=vertex_col1,
+        vertex_col2=vertex_col2,
+        _output_types=[OutputType.dataframe, OutputType.scalar],
+        max_iter=max_iter,
+    )
+    return op(dataframe)
maxframe/learn/contrib/graph/tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 1999-2024 Alibaba Group Holding Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.