maxframe 0.1.0b5__cp310-cp310-macosx_10_9_universal2.whl → 1.0.0__cp310-cp310-macosx_10_9_universal2.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (203)
  1. maxframe/_utils.cpython-310-darwin.so +0 -0
  2. maxframe/codegen.py +10 -4
  3. maxframe/config/config.py +68 -10
  4. maxframe/config/validators.py +42 -11
  5. maxframe/conftest.py +58 -14
  6. maxframe/core/__init__.py +2 -16
  7. maxframe/core/entity/__init__.py +1 -12
  8. maxframe/core/entity/executable.py +1 -1
  9. maxframe/core/entity/objects.py +46 -45
  10. maxframe/core/entity/output_types.py +0 -3
  11. maxframe/core/entity/tests/test_objects.py +43 -0
  12. maxframe/core/entity/tileables.py +5 -78
  13. maxframe/core/graph/__init__.py +2 -2
  14. maxframe/core/graph/builder/__init__.py +0 -1
  15. maxframe/core/graph/builder/base.py +5 -4
  16. maxframe/core/graph/builder/tileable.py +4 -4
  17. maxframe/core/graph/builder/utils.py +4 -8
  18. maxframe/core/graph/core.cpython-310-darwin.so +0 -0
  19. maxframe/core/graph/core.pyx +4 -4
  20. maxframe/core/graph/entity.py +9 -33
  21. maxframe/core/operator/__init__.py +2 -9
  22. maxframe/core/operator/base.py +3 -5
  23. maxframe/core/operator/objects.py +0 -9
  24. maxframe/core/operator/utils.py +55 -0
  25. maxframe/dataframe/__init__.py +1 -1
  26. maxframe/dataframe/arithmetic/around.py +5 -17
  27. maxframe/dataframe/arithmetic/core.py +15 -7
  28. maxframe/dataframe/arithmetic/docstring.py +7 -33
  29. maxframe/dataframe/arithmetic/equal.py +4 -2
  30. maxframe/dataframe/arithmetic/greater.py +4 -2
  31. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  32. maxframe/dataframe/arithmetic/less.py +2 -2
  33. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  34. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  36. maxframe/dataframe/core.py +31 -7
  37. maxframe/dataframe/datasource/date_range.py +2 -2
  38. maxframe/dataframe/datasource/read_odps_query.py +117 -23
  39. maxframe/dataframe/datasource/read_odps_table.py +6 -3
  40. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  41. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  42. maxframe/dataframe/datastore/to_odps.py +28 -0
  43. maxframe/dataframe/extensions/__init__.py +5 -0
  44. maxframe/dataframe/extensions/flatjson.py +131 -0
  45. maxframe/dataframe/extensions/flatmap.py +317 -0
  46. maxframe/dataframe/extensions/reshuffle.py +1 -1
  47. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  48. maxframe/dataframe/groupby/core.py +1 -1
  49. maxframe/dataframe/groupby/cum.py +0 -1
  50. maxframe/dataframe/groupby/fill.py +4 -1
  51. maxframe/dataframe/groupby/getitem.py +6 -0
  52. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  53. maxframe/dataframe/groupby/transform.py +5 -1
  54. maxframe/dataframe/indexing/align.py +1 -1
  55. maxframe/dataframe/indexing/loc.py +6 -4
  56. maxframe/dataframe/indexing/rename.py +5 -28
  57. maxframe/dataframe/indexing/sample.py +0 -1
  58. maxframe/dataframe/indexing/set_index.py +68 -1
  59. maxframe/dataframe/initializer.py +11 -1
  60. maxframe/dataframe/merge/__init__.py +9 -1
  61. maxframe/dataframe/merge/concat.py +41 -31
  62. maxframe/dataframe/merge/merge.py +237 -3
  63. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  64. maxframe/dataframe/misc/apply.py +5 -10
  65. maxframe/dataframe/misc/case_when.py +1 -1
  66. maxframe/dataframe/misc/describe.py +2 -2
  67. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  68. maxframe/dataframe/misc/eval.py +4 -0
  69. maxframe/dataframe/misc/memory_usage.py +2 -2
  70. maxframe/dataframe/misc/pct_change.py +1 -83
  71. maxframe/dataframe/misc/tests/test_misc.py +33 -2
  72. maxframe/dataframe/misc/transform.py +1 -30
  73. maxframe/dataframe/misc/value_counts.py +4 -17
  74. maxframe/dataframe/missing/dropna.py +1 -1
  75. maxframe/dataframe/missing/fillna.py +5 -5
  76. maxframe/dataframe/operators.py +1 -17
  77. maxframe/dataframe/reduction/core.py +2 -2
  78. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  79. maxframe/dataframe/sort/sort_values.py +1 -11
  80. maxframe/dataframe/statistics/corr.py +3 -3
  81. maxframe/dataframe/statistics/quantile.py +13 -19
  82. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  83. maxframe/dataframe/tests/test_initializer.py +33 -2
  84. maxframe/dataframe/utils.py +26 -11
  85. maxframe/dataframe/window/expanding.py +5 -3
  86. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  87. maxframe/errors.py +13 -0
  88. maxframe/extension.py +12 -0
  89. maxframe/io/__init__.py +13 -0
  90. maxframe/io/objects/__init__.py +24 -0
  91. maxframe/io/objects/core.py +140 -0
  92. maxframe/io/objects/tensor.py +76 -0
  93. maxframe/io/objects/tests/__init__.py +13 -0
  94. maxframe/io/objects/tests/test_object_io.py +97 -0
  95. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  96. maxframe/{odpsio → io/odpsio}/arrow.py +42 -10
  97. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  98. maxframe/io/odpsio/tableio.py +719 -0
  99. maxframe/io/odpsio/tests/__init__.py +13 -0
  100. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +59 -22
  101. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  102. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  103. maxframe/io/odpsio/volumeio.py +63 -0
  104. maxframe/learn/contrib/__init__.py +3 -1
  105. maxframe/learn/contrib/graph/__init__.py +15 -0
  106. maxframe/learn/contrib/graph/connected_components.py +215 -0
  107. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  108. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  109. maxframe/learn/contrib/llm/__init__.py +16 -0
  110. maxframe/learn/contrib/llm/core.py +54 -0
  111. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  112. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  113. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  114. maxframe/learn/contrib/llm/text.py +42 -0
  115. maxframe/learn/contrib/xgboost/classifier.py +26 -2
  116. maxframe/learn/contrib/xgboost/core.py +87 -2
  117. maxframe/learn/contrib/xgboost/dmatrix.py +3 -6
  118. maxframe/learn/contrib/xgboost/predict.py +29 -46
  119. maxframe/learn/contrib/xgboost/regressor.py +3 -10
  120. maxframe/learn/contrib/xgboost/train.py +29 -18
  121. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  122. maxframe/lib/mmh3.cpython-310-darwin.so +0 -0
  123. maxframe/lib/mmh3.pyi +43 -0
  124. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  125. maxframe/lib/wrapped_pickle.py +2 -1
  126. maxframe/opcodes.py +8 -0
  127. maxframe/protocol.py +154 -27
  128. maxframe/remote/core.py +4 -8
  129. maxframe/serialization/__init__.py +1 -0
  130. maxframe/serialization/core.cpython-310-darwin.so +0 -0
  131. maxframe/serialization/core.pxd +3 -0
  132. maxframe/serialization/core.pyi +3 -0
  133. maxframe/serialization/core.pyx +67 -26
  134. maxframe/serialization/exception.py +1 -1
  135. maxframe/serialization/pandas.py +52 -17
  136. maxframe/serialization/serializables/core.py +180 -15
  137. maxframe/serialization/serializables/field_type.py +4 -1
  138. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  139. maxframe/serialization/tests/test_serial.py +2 -1
  140. maxframe/session.py +9 -2
  141. maxframe/tensor/__init__.py +81 -2
  142. maxframe/tensor/arithmetic/isclose.py +1 -0
  143. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  144. maxframe/tensor/core.py +5 -136
  145. maxframe/tensor/datasource/array.py +3 -0
  146. maxframe/tensor/datasource/full.py +1 -1
  147. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  148. maxframe/tensor/indexing/flatnonzero.py +1 -1
  149. maxframe/tensor/indexing/getitem.py +2 -0
  150. maxframe/tensor/merge/__init__.py +2 -0
  151. maxframe/tensor/merge/concatenate.py +101 -0
  152. maxframe/tensor/merge/tests/test_merge.py +30 -1
  153. maxframe/tensor/merge/vstack.py +74 -0
  154. maxframe/tensor/{base → misc}/__init__.py +2 -0
  155. maxframe/tensor/{base → misc}/atleast_1d.py +1 -3
  156. maxframe/tensor/misc/atleast_2d.py +70 -0
  157. maxframe/tensor/misc/atleast_3d.py +85 -0
  158. maxframe/tensor/misc/tests/__init__.py +13 -0
  159. maxframe/tensor/{base → misc}/transpose.py +22 -18
  160. maxframe/tensor/{base → misc}/unique.py +3 -3
  161. maxframe/tensor/operators.py +1 -7
  162. maxframe/tensor/random/core.py +1 -1
  163. maxframe/tensor/reduction/count_nonzero.py +2 -1
  164. maxframe/tensor/reduction/mean.py +1 -0
  165. maxframe/tensor/reduction/nanmean.py +1 -0
  166. maxframe/tensor/reduction/nanvar.py +2 -0
  167. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  168. maxframe/tensor/reduction/var.py +2 -0
  169. maxframe/tensor/statistics/quantile.py +2 -2
  170. maxframe/tensor/utils.py +2 -22
  171. maxframe/tests/test_protocol.py +34 -0
  172. maxframe/tests/test_utils.py +0 -12
  173. maxframe/tests/utils.py +17 -2
  174. maxframe/typing_.py +4 -1
  175. maxframe/udf.py +8 -9
  176. maxframe/utils.py +106 -86
  177. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  178. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/RECORD +197 -173
  179. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  180. maxframe_client/__init__.py +0 -1
  181. maxframe_client/clients/framedriver.py +4 -1
  182. maxframe_client/fetcher.py +81 -74
  183. maxframe_client/session/consts.py +3 -0
  184. maxframe_client/session/graph.py +8 -2
  185. maxframe_client/session/odps.py +194 -40
  186. maxframe_client/session/task.py +94 -39
  187. maxframe_client/tests/test_fetcher.py +21 -3
  188. maxframe_client/tests/test_session.py +109 -8
  189. maxframe/core/entity/chunks.py +0 -68
  190. maxframe/core/entity/fuse.py +0 -73
  191. maxframe/core/graph/builder/chunk.py +0 -430
  192. maxframe/odpsio/tableio.py +0 -322
  193. maxframe/odpsio/volumeio.py +0 -95
  194. maxframe_client/clients/spe.py +0 -104
  195. /maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  196. /maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  197. /maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  198. /maxframe/tensor/{base → misc}/astype.py +0 -0
  199. /maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  200. /maxframe/tensor/{base → misc}/ravel.py +0 -0
  201. /maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  202. /maxframe/tensor/{base → misc}/where.py +0 -0
  203. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
@@ -17,19 +17,32 @@ import uuid
17
17
  import numpy as np
18
18
  import pandas as pd
19
19
  import pyarrow as pa
20
+ import pytest
20
21
  from odps import ODPS
21
22
 
22
23
  import maxframe.dataframe as md
23
- from maxframe.odpsio import HaloTableIO
24
+ from maxframe.config import options
25
+ from maxframe.io.odpsio import ODPSTableIO
24
26
  from maxframe.protocol import ODPSTableResultInfo, ResultType
25
27
  from maxframe.tests.utils import tn
26
28
 
27
29
  from ..fetcher import ODPSTableFetcher
28
30
 
29
31
 
30
- async def test_table_fetcher():
32
+ @pytest.fixture
33
+ def switch_table_io(request):
34
+ old_use_common_table = options.use_common_table
35
+ try:
36
+ options.use_common_table = request.param
37
+ yield
38
+ finally:
39
+ options.use_common_table = old_use_common_table
40
+
41
+
42
+ @pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
43
+ async def test_table_fetcher(switch_table_io):
31
44
  odps_entry = ODPS.from_environments()
32
- halo_table_io = HaloTableIO(odps_entry)
45
+ halo_table_io = ODPSTableIO(odps_entry)
33
46
  fetcher = ODPSTableFetcher(odps_entry)
34
47
 
35
48
  data = pd.DataFrame(
@@ -58,6 +71,11 @@ async def test_table_fetcher():
58
71
  assert len(fetched) == 1000
59
72
  pd.testing.assert_frame_equal(raw_data, fetched)
60
73
 
74
+ result_info = ODPSTableResultInfo(ResultType.ODPS_TABLE, full_table_name=table_name)
75
+ fetched = await fetcher.fetch(tileable, result_info, [slice(None, 2000), None])
76
+ assert len(fetched) == 1000
77
+ pd.testing.assert_frame_equal(raw_data, fetched)
78
+
61
79
  result_info = ODPSTableResultInfo(ResultType.ODPS_TABLE, full_table_name=table_name)
62
80
  fetched = await fetcher.fetch(tileable, result_info, [2, None])
63
81
  assert len(fetched) == 1
@@ -13,7 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  import time
16
- from typing import Dict
16
+ from typing import Any, Dict
17
17
 
18
18
  import mock
19
19
  import numpy as np
@@ -23,7 +23,10 @@ from odps import ODPS
23
23
 
24
24
  import maxframe.dataframe as md
25
25
  import maxframe.remote as mr
26
+ from maxframe.config import options
27
+ from maxframe.config.config import option_context
26
28
  from maxframe.core import ExecutableTuple, TileableGraph
29
+ from maxframe.errors import NoTaskServerResponseError
27
30
  from maxframe.lib.aio import stop_isolation
28
31
  from maxframe.protocol import ResultInfo
29
32
  from maxframe.serialization import RemoteException
@@ -35,6 +38,7 @@ from maxframe_framedriver.app.tests.test_framedriver_webapp import ( # noqa: F4
35
38
  )
36
39
 
37
40
  from ..clients.framedriver import FrameDriverClient
41
+ from ..session.odps import MaxFrameRestCaller
38
42
 
39
43
  pytestmark = pytest.mark.maxframe_engine(["MCSQL", "SPE"])
40
44
 
@@ -82,15 +86,32 @@ def test_simple_run_dataframe(start_mock_session):
82
86
  session_id: str,
83
87
  dag: TileableGraph,
84
88
  managed_input_infos: Dict[str, ResultInfo] = None,
89
+ new_settings: Dict[str, Any] = None,
85
90
  ):
86
91
  assert len(dag) == 2
87
- return await original_submit_dag(self, session_id, dag, managed_input_infos)
92
+ return await original_submit_dag(
93
+ self, session_id, dag, managed_input_infos, new_settings
94
+ )
95
+
96
+ no_task_server_raised = False
97
+ original_get_dag_info = MaxFrameRestCaller.get_dag_info
98
+
99
+ async def patched_get_dag_info(self, dag_id: str):
100
+ nonlocal no_task_server_raised
101
+
102
+ if not no_task_server_raised:
103
+ no_task_server_raised = True
104
+ raise NoTaskServerResponseError
105
+ return await original_get_dag_info(self, dag_id)
88
106
 
89
107
  df["H"] = "extra_content"
90
108
 
91
109
  with mock.patch(
92
110
  "maxframe_client.clients.framedriver.FrameDriverClient.submit_dag",
93
111
  new=patched_submit_dag,
112
+ ), mock.patch(
113
+ "maxframe_client.session.odps.MaxFrameRestCaller.get_dag_info",
114
+ new=patched_get_dag_info,
94
115
  ):
95
116
  result = df.execute().fetch()
96
117
  assert len(result) == 1000
@@ -112,13 +133,30 @@ def test_simple_run_dataframe(start_mock_session):
112
133
  )
113
134
  assert odps_entry.exist_table(build_temp_table_name(start_mock_session, key))
114
135
  del df
115
- time.sleep(5)
136
+ retry_times = 10
137
+ while (
138
+ odps_entry.exist_table(
139
+ build_temp_table_name(start_mock_session, intermediate_key)
140
+ )
141
+ and retry_times > 0
142
+ ):
143
+ time.sleep(1)
144
+ retry_times -= 1
116
145
  assert not odps_entry.exist_table(
117
146
  build_temp_table_name(start_mock_session, intermediate_key)
118
147
  )
119
148
  assert not odps_entry.exist_table(build_temp_table_name(start_mock_session, key))
120
149
 
121
150
 
151
+ def test_run_and_fetch_slice(start_mock_session):
152
+ pd_df = pd.DataFrame(np.random.rand(1000, 5), columns=list("ABCDE"))
153
+ df = md.DataFrame(pd_df)
154
+ result = df.execute()
155
+
156
+ sliced = result.head(10).fetch()
157
+ assert len(sliced) == 10
158
+
159
+
122
160
  def test_run_empty_table(start_mock_session):
123
161
  odps_entry = ODPS.from_environments()
124
162
 
@@ -139,6 +177,25 @@ def test_run_empty_table(start_mock_session):
139
177
  empty_table.drop()
140
178
 
141
179
 
180
+ def test_run_odps_query_without_schema(start_mock_session):
181
+ odps_entry = ODPS.from_environments()
182
+
183
+ table_name = tn("test_session_empty_table")
184
+ odps_entry.delete_table(table_name, if_exists=True)
185
+ test_table = odps_entry.create_table(table_name, "a double, b double", lifecycle=1)
186
+
187
+ with test_table.open_writer() as writer:
188
+ writer.write([123, 456])
189
+
190
+ df = md.read_odps_query(
191
+ f"select a, b, a + b as `special: name` from {table_name}", skip_schema=True
192
+ )
193
+ executed = df.execute().fetch()
194
+ assert len(executed.dtypes) == 3
195
+
196
+ test_table.drop()
197
+
198
+
142
199
  def test_run_dataframe_with_pd_source(start_mock_session):
143
200
  odps_entry = ODPS.from_environments()
144
201
 
@@ -171,19 +228,38 @@ def test_run_dataframe_from_to_odps_table(start_mock_session):
171
228
  table_name = build_temp_table_name(start_mock_session, "tmp_save")
172
229
  table_obj = odps_entry.get_table(table_name)
173
230
  try:
174
- md.to_odps_table(md.DataFrame(pd_df), table_obj).execute().fetch()
231
+ md.to_odps_table(md.DataFrame(pd_df), table_obj, lifecycle=1).execute().fetch()
175
232
  with table_obj.open_reader() as reader:
176
233
  result_df = reader.to_pandas()
177
234
  assert len(result_df) == 10
178
235
  assert len(result_df.columns) == 6
179
236
 
180
- df = md.read_odps_table(table_obj, index_col="index").head(10).execute().fetch()
237
+ df = md.read_odps_table(table_obj, index_col="index").head(10).execute()
238
+ assert df.shape == (10, 5)
181
239
  assert len(df) == 10
182
240
  assert len(df.columns) == 5
183
241
  finally:
184
242
  odps_entry.delete_table(table_name, if_exists=True)
185
243
 
186
244
 
245
+ def test_create_session_with_options(framedriver_app): # noqa: F811
246
+ odps_entry = ODPS.from_environments()
247
+ framedriver_addr = f"mf://localhost:{framedriver_app.port}"
248
+ old_value = options.session.max_alive_seconds
249
+ session = None
250
+ try:
251
+ options.session.max_alive_seconds = 10
252
+ session = new_session(framedriver_addr, odps_entry=odps_entry)
253
+ session_id = session.session_id
254
+ session_conf = framedriver_app.session_manager.get_session_settings(session_id)
255
+ with option_context(session_conf) as session_options:
256
+ assert session_options.session.max_alive_seconds == 10
257
+ finally:
258
+ options.session.max_alive_seconds = old_value
259
+ if session is not None:
260
+ session.destroy()
261
+
262
+
187
263
  def test_run_and_fetch_series(start_mock_session):
188
264
  odps_entry = ODPS.from_environments()
189
265
 
@@ -210,7 +286,22 @@ def test_run_and_fetch_series(start_mock_session):
210
286
  )
211
287
 
212
288
 
213
- def test_run_remote_success(start_mock_session):
289
+ def test_execute_with_tensor(oss_config, start_mock_session):
290
+ pd_df = pd.DataFrame(
291
+ {"angles": [0, 3, 4], "degrees": [360, 180, 360]},
292
+ index=["circle", "triangle", "rectangle"],
293
+ )
294
+ df = md.DataFrame(pd_df)
295
+
296
+ result = (df - [1, 2]).execute().fetch()
297
+ expected = pd_df - [1, 2]
298
+ # TODO: currently the record order in tensor reading from table is the index
299
+ # sorting order
300
+ expected.sort_index(axis=0, inplace=True)
301
+ pd.testing.assert_frame_equal(result, expected, check_like=True)
302
+
303
+
304
+ def test_run_remote_success(oss_config, start_mock_session):
214
305
  def func(a, b):
215
306
  return a + b
216
307
 
@@ -221,7 +312,7 @@ def test_run_remote_success(start_mock_session):
221
312
  assert result == 21
222
313
 
223
314
 
224
- def test_run_remote_error(start_mock_session):
315
+ def test_run_remote_error(oss_config, start_mock_session):
225
316
  def func():
226
317
  raise ValueError
227
318
 
@@ -244,7 +335,7 @@ def test_pivot_dataframe(start_mock_session):
244
335
  df = md.DataFrame(pd_df)
245
336
  pivot = df.pivot_table(values="D", index=["A", "B"], columns=["C"], aggfunc="sum")
246
337
  executed = pivot.execute()
247
- assert pivot.shape == (2, 4)
338
+ assert pivot.shape == (4, 2)
248
339
  pd.testing.assert_index_equal(
249
340
  pivot.dtypes.index, pd.Index(["large", "small"], name="C")
250
341
  )
@@ -253,3 +344,13 @@ def test_pivot_dataframe(start_mock_session):
253
344
  values="D", index=["A", "B"], columns=["C"], aggfunc="sum"
254
345
  )
255
346
  pd.testing.assert_frame_equal(executed.to_pandas(), expected)
347
+
348
+
349
+ def test_index_drop_duplicates(start_mock_session):
350
+ pd_idx = pd.Index(["lame", "cow", "lame", "beetle", "lame", "hippo"])
351
+ idx = md.Index(pd_idx)
352
+ executed = idx.drop_duplicates(keep="first").execute()
353
+ expected = pd_idx.drop_duplicates(keep="first")
354
+ pd.testing.assert_index_equal(
355
+ executed.to_pandas().sort_values(), expected.sort_values()
356
+ )
@@ -1,68 +0,0 @@
1
- # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from ...serialization.serializables import BoolField, FieldTypes, TupleField
16
- from ...utils import tokenize
17
- from .core import Entity, EntityData
18
-
19
-
20
- class ChunkData(EntityData):
21
- __slots__ = ()
22
-
23
- is_broadcaster = BoolField("is_broadcaster", default=False)
24
- # If the operator is a shuffle mapper, this flag indicates whether the current chunk is mapper chunk when
25
- # the operator produce multiple chunks such as TensorUnique.
26
- is_mapper = BoolField("is_mapper", default=None)
27
- # optional fields
28
- _index = TupleField("index", FieldTypes.uint32)
29
-
30
- def __repr__(self):
31
- if self.op.stage is None:
32
- return (
33
- f"{type(self).__name__} <op={type(self.op).__name__}, "
34
- f"key={self.key}>"
35
- )
36
- else:
37
- return (
38
- f"{type(self).__name__} <op={type(self.op).__name__}, "
39
- f"stage={self.op.stage.name}, key={self.key}>"
40
- )
41
-
42
- @property
43
- def index(self):
44
- return getattr(self, "_index", None)
45
-
46
- @property
47
- def device(self):
48
- return self.op.device
49
-
50
- def _update_key(self):
51
- object.__setattr__(
52
- self,
53
- "_key",
54
- tokenize(
55
- type(self).__name__,
56
- *(getattr(self, k, None) for k in self._keys_ if k != "_index"),
57
- ),
58
- )
59
-
60
-
61
- class Chunk(Entity):
62
- _allow_data_type_ = (ChunkData,)
63
-
64
- def __repr__(self):
65
- return f"{type(self).__name__}({self._data.__repr__()})"
66
-
67
-
68
- CHUNK_TYPE = (Chunk, ChunkData)
@@ -1,73 +0,0 @@
1
- # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import numpy as np
16
-
17
- from ...serialization.serializables import ReferenceField
18
- from .chunks import CHUNK_TYPE, Chunk, ChunkData
19
-
20
-
21
- class FuseChunkData(ChunkData):
22
- __slots__ = ("_inited",)
23
-
24
- _chunk = ReferenceField(
25
- "chunk", CHUNK_TYPE, on_serialize=lambda x: x.data if hasattr(x, "data") else x
26
- )
27
-
28
- def __init__(self, *args, **kwargs):
29
- self._inited = False
30
- super().__init__(*args, **kwargs)
31
- self._extra_params = {}
32
- self._inited = True
33
-
34
- @property
35
- def chunk(self):
36
- return self._chunk
37
-
38
- @property
39
- def composed(self):
40
- # for compatibility, just return the topological ordering,
41
- # once we apply optimization on the subgraph,
42
- # `composed` is not needed any more and should be removed then.
43
- assert getattr(self._op, "fuse_graph", None) is not None
44
- fuse_graph = self._op.fuse_graph
45
- return list(fuse_graph.topological_iter())
46
-
47
- def __getattr__(self, attr):
48
- if not self._inited:
49
- return object.__getattribute__(self, attr)
50
- if attr in self._extra_params:
51
- return self._extra_params[attr]
52
- try:
53
- return getattr(self._chunk, attr)
54
- except AttributeError:
55
- return object.__getattribute__(self, attr)
56
-
57
- def __setattr__(self, attr, value):
58
- if attr == "params":
59
- self._chunk.params = value
60
- else:
61
- super().__setattr__(attr, value)
62
-
63
- @property
64
- def nbytes(self):
65
- return np.prod(self.shape) * self.dtype.itemsize
66
-
67
-
68
- class FuseChunk(Chunk):
69
- __slots__ = ()
70
- _allow_data_type_ = (FuseChunkData,)
71
-
72
-
73
- FUSE_CHUNK_TYPE = (FuseChunkData, FuseChunk)