maxframe 0.1.0b4__cp310-cp310-macosx_10_9_universal2.whl → 1.0.0__cp310-cp310-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of maxframe might be problematic. Click here for more details.

Files changed (214)
  1. maxframe/__init__.py +1 -0
  2. maxframe/_utils.cpython-310-darwin.so +0 -0
  3. maxframe/codegen.py +56 -5
  4. maxframe/config/config.py +78 -10
  5. maxframe/config/validators.py +42 -11
  6. maxframe/conftest.py +58 -14
  7. maxframe/core/__init__.py +2 -16
  8. maxframe/core/entity/__init__.py +1 -12
  9. maxframe/core/entity/executable.py +1 -1
  10. maxframe/core/entity/objects.py +46 -45
  11. maxframe/core/entity/output_types.py +0 -3
  12. maxframe/core/entity/tests/test_objects.py +43 -0
  13. maxframe/core/entity/tileables.py +5 -78
  14. maxframe/core/graph/__init__.py +2 -2
  15. maxframe/core/graph/builder/__init__.py +0 -1
  16. maxframe/core/graph/builder/base.py +5 -4
  17. maxframe/core/graph/builder/tileable.py +4 -4
  18. maxframe/core/graph/builder/utils.py +4 -8
  19. maxframe/core/graph/core.cpython-310-darwin.so +0 -0
  20. maxframe/core/graph/core.pyx +4 -4
  21. maxframe/core/graph/entity.py +9 -33
  22. maxframe/core/operator/__init__.py +2 -9
  23. maxframe/core/operator/base.py +3 -5
  24. maxframe/core/operator/objects.py +0 -9
  25. maxframe/core/operator/utils.py +55 -0
  26. maxframe/dataframe/__init__.py +2 -1
  27. maxframe/dataframe/arithmetic/around.py +5 -17
  28. maxframe/dataframe/arithmetic/core.py +15 -7
  29. maxframe/dataframe/arithmetic/docstring.py +7 -33
  30. maxframe/dataframe/arithmetic/equal.py +4 -2
  31. maxframe/dataframe/arithmetic/greater.py +4 -2
  32. maxframe/dataframe/arithmetic/greater_equal.py +4 -2
  33. maxframe/dataframe/arithmetic/less.py +2 -2
  34. maxframe/dataframe/arithmetic/less_equal.py +4 -2
  35. maxframe/dataframe/arithmetic/not_equal.py +4 -2
  36. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +39 -16
  37. maxframe/dataframe/core.py +58 -12
  38. maxframe/dataframe/datasource/date_range.py +2 -2
  39. maxframe/dataframe/datasource/read_odps_query.py +120 -24
  40. maxframe/dataframe/datasource/read_odps_table.py +9 -4
  41. maxframe/dataframe/datasource/tests/test_datasource.py +103 -8
  42. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  43. maxframe/dataframe/datastore/to_odps.py +28 -0
  44. maxframe/dataframe/extensions/__init__.py +5 -0
  45. maxframe/dataframe/extensions/flatjson.py +131 -0
  46. maxframe/dataframe/extensions/flatmap.py +317 -0
  47. maxframe/dataframe/extensions/reshuffle.py +1 -1
  48. maxframe/dataframe/extensions/tests/test_extensions.py +108 -3
  49. maxframe/dataframe/groupby/core.py +1 -1
  50. maxframe/dataframe/groupby/cum.py +0 -1
  51. maxframe/dataframe/groupby/fill.py +4 -1
  52. maxframe/dataframe/groupby/getitem.py +6 -0
  53. maxframe/dataframe/groupby/tests/test_groupby.py +5 -1
  54. maxframe/dataframe/groupby/transform.py +5 -1
  55. maxframe/dataframe/indexing/align.py +1 -1
  56. maxframe/dataframe/indexing/loc.py +6 -4
  57. maxframe/dataframe/indexing/rename.py +5 -28
  58. maxframe/dataframe/indexing/sample.py +0 -1
  59. maxframe/dataframe/indexing/set_index.py +68 -1
  60. maxframe/dataframe/initializer.py +11 -1
  61. maxframe/dataframe/merge/__init__.py +9 -1
  62. maxframe/dataframe/merge/concat.py +41 -31
  63. maxframe/dataframe/merge/merge.py +237 -3
  64. maxframe/dataframe/merge/tests/test_merge.py +126 -1
  65. maxframe/dataframe/misc/__init__.py +4 -0
  66. maxframe/dataframe/misc/apply.py +6 -11
  67. maxframe/dataframe/misc/case_when.py +141 -0
  68. maxframe/dataframe/misc/describe.py +2 -2
  69. maxframe/dataframe/misc/drop_duplicates.py +8 -8
  70. maxframe/dataframe/misc/eval.py +4 -0
  71. maxframe/dataframe/misc/memory_usage.py +2 -2
  72. maxframe/dataframe/misc/pct_change.py +1 -83
  73. maxframe/dataframe/misc/pivot_table.py +262 -0
  74. maxframe/dataframe/misc/tests/test_misc.py +93 -1
  75. maxframe/dataframe/misc/transform.py +1 -30
  76. maxframe/dataframe/misc/value_counts.py +4 -17
  77. maxframe/dataframe/missing/dropna.py +1 -1
  78. maxframe/dataframe/missing/fillna.py +5 -5
  79. maxframe/dataframe/operators.py +1 -17
  80. maxframe/dataframe/plotting/core.py +2 -2
  81. maxframe/dataframe/reduction/core.py +4 -3
  82. maxframe/dataframe/reduction/tests/test_reduction.py +2 -4
  83. maxframe/dataframe/sort/sort_values.py +1 -11
  84. maxframe/dataframe/statistics/corr.py +3 -3
  85. maxframe/dataframe/statistics/quantile.py +13 -19
  86. maxframe/dataframe/statistics/tests/test_statistics.py +4 -4
  87. maxframe/dataframe/tests/test_initializer.py +33 -2
  88. maxframe/dataframe/utils.py +33 -11
  89. maxframe/dataframe/window/expanding.py +5 -3
  90. maxframe/dataframe/window/tests/test_expanding.py +2 -2
  91. maxframe/errors.py +13 -0
  92. maxframe/extension.py +12 -0
  93. maxframe/io/__init__.py +13 -0
  94. maxframe/io/objects/__init__.py +24 -0
  95. maxframe/io/objects/core.py +140 -0
  96. maxframe/io/objects/tensor.py +76 -0
  97. maxframe/io/objects/tests/__init__.py +13 -0
  98. maxframe/io/objects/tests/test_object_io.py +97 -0
  99. maxframe/{odpsio → io/odpsio}/__init__.py +3 -1
  100. maxframe/{odpsio → io/odpsio}/arrow.py +43 -12
  101. maxframe/{odpsio → io/odpsio}/schema.py +38 -16
  102. maxframe/io/odpsio/tableio.py +719 -0
  103. maxframe/io/odpsio/tests/__init__.py +13 -0
  104. maxframe/{odpsio → io/odpsio}/tests/test_schema.py +75 -33
  105. maxframe/{odpsio → io/odpsio}/tests/test_tableio.py +50 -23
  106. maxframe/{odpsio → io/odpsio}/tests/test_volumeio.py +4 -6
  107. maxframe/io/odpsio/volumeio.py +63 -0
  108. maxframe/learn/contrib/__init__.py +3 -1
  109. maxframe/learn/contrib/graph/__init__.py +15 -0
  110. maxframe/learn/contrib/graph/connected_components.py +215 -0
  111. maxframe/learn/contrib/graph/tests/__init__.py +13 -0
  112. maxframe/learn/contrib/graph/tests/test_connected_components.py +53 -0
  113. maxframe/learn/contrib/llm/__init__.py +16 -0
  114. maxframe/learn/contrib/llm/core.py +54 -0
  115. maxframe/learn/contrib/llm/models/__init__.py +14 -0
  116. maxframe/learn/contrib/llm/models/dashscope.py +73 -0
  117. maxframe/learn/contrib/llm/multi_modal.py +42 -0
  118. maxframe/learn/contrib/llm/text.py +42 -0
  119. maxframe/learn/contrib/utils.py +52 -0
  120. maxframe/learn/contrib/xgboost/__init__.py +26 -0
  121. maxframe/learn/contrib/xgboost/classifier.py +110 -0
  122. maxframe/learn/contrib/xgboost/core.py +241 -0
  123. maxframe/learn/contrib/xgboost/dmatrix.py +147 -0
  124. maxframe/learn/contrib/xgboost/predict.py +121 -0
  125. maxframe/learn/contrib/xgboost/regressor.py +71 -0
  126. maxframe/learn/contrib/xgboost/tests/__init__.py +13 -0
  127. maxframe/learn/contrib/xgboost/tests/test_core.py +43 -0
  128. maxframe/learn/contrib/xgboost/train.py +132 -0
  129. maxframe/{core/operator/fuse.py → learn/core.py} +7 -10
  130. maxframe/learn/utils/__init__.py +15 -0
  131. maxframe/learn/utils/core.py +29 -0
  132. maxframe/lib/mmh3.cpython-310-darwin.so +0 -0
  133. maxframe/lib/mmh3.pyi +43 -0
  134. maxframe/lib/sparse/tests/test_sparse.py +15 -15
  135. maxframe/lib/wrapped_pickle.py +2 -1
  136. maxframe/opcodes.py +11 -0
  137. maxframe/protocol.py +154 -27
  138. maxframe/remote/core.py +4 -8
  139. maxframe/serialization/__init__.py +1 -0
  140. maxframe/serialization/core.cpython-310-darwin.so +0 -0
  141. maxframe/serialization/core.pxd +3 -0
  142. maxframe/serialization/core.pyi +64 -0
  143. maxframe/serialization/core.pyx +67 -26
  144. maxframe/serialization/exception.py +1 -1
  145. maxframe/serialization/pandas.py +52 -17
  146. maxframe/serialization/serializables/core.py +180 -15
  147. maxframe/serialization/serializables/field_type.py +4 -1
  148. maxframe/serialization/serializables/tests/test_serializable.py +54 -5
  149. maxframe/serialization/tests/test_serial.py +2 -1
  150. maxframe/session.py +37 -2
  151. maxframe/tensor/__init__.py +81 -2
  152. maxframe/tensor/arithmetic/isclose.py +1 -0
  153. maxframe/tensor/arithmetic/tests/test_arithmetic.py +22 -18
  154. maxframe/tensor/core.py +5 -136
  155. maxframe/tensor/datasource/array.py +7 -2
  156. maxframe/tensor/datasource/full.py +1 -1
  157. maxframe/tensor/datasource/scalar.py +1 -1
  158. maxframe/tensor/datasource/tests/test_datasource.py +1 -1
  159. maxframe/tensor/indexing/flatnonzero.py +1 -1
  160. maxframe/tensor/indexing/getitem.py +2 -0
  161. maxframe/tensor/merge/__init__.py +2 -0
  162. maxframe/tensor/merge/concatenate.py +101 -0
  163. maxframe/tensor/merge/tests/test_merge.py +30 -1
  164. maxframe/tensor/merge/vstack.py +74 -0
  165. maxframe/tensor/{base → misc}/__init__.py +4 -0
  166. maxframe/tensor/misc/atleast_1d.py +72 -0
  167. maxframe/tensor/misc/atleast_2d.py +70 -0
  168. maxframe/tensor/misc/atleast_3d.py +85 -0
  169. maxframe/tensor/misc/tests/__init__.py +13 -0
  170. maxframe/tensor/{base → misc}/transpose.py +22 -18
  171. maxframe/tensor/misc/unique.py +205 -0
  172. maxframe/tensor/operators.py +1 -7
  173. maxframe/tensor/random/core.py +1 -1
  174. maxframe/tensor/reduction/count_nonzero.py +2 -1
  175. maxframe/tensor/reduction/mean.py +1 -0
  176. maxframe/tensor/reduction/nanmean.py +1 -0
  177. maxframe/tensor/reduction/nanvar.py +2 -0
  178. maxframe/tensor/reduction/tests/test_reduction.py +12 -1
  179. maxframe/tensor/reduction/var.py +2 -0
  180. maxframe/tensor/statistics/quantile.py +2 -2
  181. maxframe/tensor/utils.py +2 -22
  182. maxframe/tests/test_protocol.py +34 -0
  183. maxframe/tests/test_utils.py +0 -12
  184. maxframe/tests/utils.py +17 -2
  185. maxframe/typing_.py +4 -1
  186. maxframe/udf.py +62 -3
  187. maxframe/utils.py +112 -86
  188. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/METADATA +25 -25
  189. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/RECORD +208 -167
  190. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/WHEEL +1 -1
  191. maxframe_client/__init__.py +0 -1
  192. maxframe_client/clients/framedriver.py +4 -1
  193. maxframe_client/fetcher.py +123 -54
  194. maxframe_client/session/consts.py +3 -0
  195. maxframe_client/session/graph.py +8 -2
  196. maxframe_client/session/odps.py +223 -40
  197. maxframe_client/session/task.py +108 -80
  198. maxframe_client/tests/test_fetcher.py +21 -3
  199. maxframe_client/tests/test_session.py +136 -8
  200. maxframe/core/entity/chunks.py +0 -68
  201. maxframe/core/entity/fuse.py +0 -73
  202. maxframe/core/graph/builder/chunk.py +0 -430
  203. maxframe/odpsio/tableio.py +0 -300
  204. maxframe/odpsio/volumeio.py +0 -95
  205. maxframe_client/clients/spe.py +0 -104
  206. maxframe/{odpsio → core/entity}/tests/__init__.py +0 -0
  207. maxframe/{tensor/base → dataframe/datastore}/tests/__init__.py +0 -0
  208. maxframe/{odpsio → io/odpsio}/tests/test_arrow.py +0 -0
  209. maxframe/tensor/{base → misc}/astype.py +0 -0
  210. maxframe/tensor/{base → misc}/broadcast_to.py +0 -0
  211. maxframe/tensor/{base → misc}/ravel.py +0 -0
  212. maxframe/tensor/{base/tests/test_base.py → misc/tests/test_misc.py} +0 -0
  213. maxframe/tensor/{base → misc}/where.py +0 -0
  214. {maxframe-0.1.0b4.dist-info → maxframe-1.0.0.dist-info}/top_level.txt +0 -0
maxframe/protocol.py CHANGED
@@ -15,7 +15,7 @@
15
15
  import base64
16
16
  import enum
17
17
  import uuid
18
- from typing import Any, Dict, Generic, List, Optional, Tuple, Type, TypeVar
18
+ from typing import Any, Dict, Generic, List, Optional, Type, TypeVar
19
19
 
20
20
  import pandas as pd
21
21
 
@@ -32,12 +32,12 @@ from .serialization.serializables import (
32
32
  EnumField,
33
33
  FieldTypes,
34
34
  Float64Field,
35
+ Int32Field,
35
36
  ListField,
36
37
  ReferenceField,
37
38
  Serializable,
38
39
  SeriesField,
39
40
  StringField,
40
- TupleField,
41
41
  )
42
42
 
43
43
  pickling_support.install()
@@ -71,6 +71,9 @@ class DagStatus(enum.Enum):
71
71
  CANCELLING = 4
72
72
  CANCELLED = 5
73
73
 
74
+ def is_terminated(self):
75
+ return self in (DagStatus.CANCELLED, DagStatus.SUCCEEDED, DagStatus.FAILED)
76
+
74
77
 
75
78
  class DimensionIndex(Serializable):
76
79
  is_slice: bool = BoolField("is_slice", default=None)
@@ -88,19 +91,6 @@ class DataSerializeType(enum.Enum):
88
91
  PICKLE = 0
89
92
 
90
93
 
91
- class VolumeDataMeta(Serializable):
92
- output_type: OutputType = EnumField(
93
- "output_type", OutputType, FieldTypes.int8, default=None
94
- )
95
- serial_type: DataSerializeType = EnumField(
96
- "serial_type", DataSerializeType, FieldTypes.int8, default=None
97
- )
98
- shape: Tuple[int, ...] = TupleField("shape", FieldTypes.int64, default=None)
99
- nsplits: Tuple[Tuple[int, ...], ...] = TupleField(
100
- "nsplits", FieldTypes.tuple(FieldTypes.tuple(FieldTypes.int64)), default=None
101
- )
102
-
103
-
104
94
  _result_type_to_info_cls: Dict[ResultType, Type["ResultInfo"]] = dict()
105
95
 
106
96
 
@@ -150,6 +140,9 @@ class ODPSTableResultInfo(ResultInfo):
150
140
  partition_specs: Optional[List[str]] = ListField(
151
141
  "partition_specs", FieldTypes.string, default=None
152
142
  )
143
+ table_meta: Optional["DataFrameTableMeta"] = ReferenceField(
144
+ "table_meta", default=None
145
+ )
153
146
 
154
147
  def __init__(self, result_type: ResultType = None, **kw):
155
148
  result_type = result_type or ResultType.ODPS_TABLE
@@ -160,8 +153,17 @@ class ODPSTableResultInfo(ResultInfo):
160
153
  ret["full_table_name"] = self.full_table_name
161
154
  if self.partition_specs:
162
155
  ret["partition_specs"] = self.partition_specs
156
+ if self.table_meta:
157
+ ret["table_meta"] = self.table_meta.to_json()
163
158
  return ret
164
159
 
160
+ @classmethod
161
+ def _json_to_kwargs(cls, serialized: dict) -> dict:
162
+ kw = super()._json_to_kwargs(serialized)
163
+ if "table_meta" in kw:
164
+ kw["table_meta"] = DataFrameTableMeta.from_json(kw["table_meta"])
165
+ return kw
166
+
165
167
 
166
168
  class ODPSVolumeResultInfo(ResultInfo):
167
169
  _result_type = ResultType.ODPS_VOLUME
@@ -190,9 +192,9 @@ class ErrorInfo(JsonSerializable):
190
192
  "error_tracebacks", FieldTypes.list
191
193
  )
192
194
  raw_error_source: ErrorSource = EnumField(
193
- "raw_error_source", ErrorSource, FieldTypes.int8
195
+ "raw_error_source", ErrorSource, FieldTypes.int8, default=None
194
196
  )
195
- raw_error_data: Optional[Exception] = AnyField("raw_error_data")
197
+ raw_error_data: Optional[Exception] = AnyField("raw_error_data", default=None)
196
198
 
197
199
  @classmethod
198
200
  def from_exception(cls, exc: Exception):
@@ -201,20 +203,29 @@ class ErrorInfo(JsonSerializable):
201
203
  return cls(messages, tracebacks, ErrorSource.PYTHON, exc)
202
204
 
203
205
  def reraise(self):
204
- if self.raw_error_source == ErrorSource.PYTHON:
206
+ if (
207
+ self.raw_error_source == ErrorSource.PYTHON
208
+ and self.raw_error_data is not None
209
+ ):
205
210
  raise self.raw_error_data
206
211
  raise RemoteException(self.error_messages, self.error_tracebacks, [])
207
212
 
208
213
  @classmethod
209
214
  def from_json(cls, serialized: dict) -> "ErrorInfo":
210
215
  kw = serialized.copy()
211
- kw["raw_error_source"] = ErrorSource(serialized["raw_error_source"])
216
+ if kw.get("raw_error_source") is not None:
217
+ kw["raw_error_source"] = ErrorSource(serialized["raw_error_source"])
218
+ else:
219
+ kw["raw_error_source"] = None
220
+
212
221
  if kw.get("raw_error_data"):
213
222
  bufs = [base64.b64decode(s) for s in kw["raw_error_data"]]
214
223
  try:
215
224
  kw["raw_error_data"] = pickle.loads(bufs[0], buffers=bufs[1:])
216
225
  except:
217
- kw["raw_error_data"] = None
226
+ # both error source and data shall be None to make sure
227
+ # RemoteException is raised.
228
+ kw["raw_error_source"] = kw["raw_error_data"] = None
218
229
  return cls(**kw)
219
230
 
220
231
  def to_json(self) -> dict:
@@ -227,7 +238,12 @@ class ErrorInfo(JsonSerializable):
227
238
  if isinstance(self.raw_error_data, (PickleContainer, RemoteException)):
228
239
  err_data_bufs = self.raw_error_data.get_buffers()
229
240
  elif isinstance(self.raw_error_data, BaseException):
230
- err_data_bufs = pickle_buffers(self.raw_error_data)
241
+ try:
242
+ err_data_bufs = pickle_buffers(self.raw_error_data)
243
+ except:
244
+ err_data_bufs = None
245
+ ret["raw_error_source"] = None
246
+
231
247
  if err_data_bufs:
232
248
  ret["raw_error_data"] = [
233
249
  base64.b64encode(s).decode() for s in err_data_bufs
@@ -249,9 +265,17 @@ class DagInfo(JsonSerializable):
249
265
  error_info: Optional[ErrorInfo] = ReferenceField("error_info", default=None)
250
266
  start_timestamp: Optional[float] = Float64Field("start_timestamp", default=None)
251
267
  end_timestamp: Optional[float] = Float64Field("end_timestamp", default=None)
268
+ subdag_infos: Dict[str, "SubDagInfo"] = DictField(
269
+ "subdag_infos",
270
+ key_type=FieldTypes.string,
271
+ value_type=FieldTypes.reference,
272
+ default_factory=dict,
273
+ )
252
274
 
253
275
  @classmethod
254
- def from_json(cls, serialized: dict) -> "DagInfo":
276
+ def from_json(cls, serialized: dict) -> Optional["DagInfo"]:
277
+ if serialized is None:
278
+ return None
255
279
  kw = serialized.copy()
256
280
  kw["status"] = DagStatus(kw["status"])
257
281
  if kw.get("tileable_to_result_infos"):
@@ -261,6 +285,10 @@ class DagInfo(JsonSerializable):
261
285
  }
262
286
  if kw.get("error_info"):
263
287
  kw["error_info"] = ErrorInfo.from_json(kw["error_info"])
288
+ if kw.get("subdag_infos"):
289
+ kw["subdag_infos"] = {
290
+ k: SubDagInfo.from_json(v) for k, v in kw["subdag_infos"].items()
291
+ }
264
292
  return DagInfo(**kw)
265
293
 
266
294
  def to_json(self) -> dict:
@@ -279,6 +307,8 @@ class DagInfo(JsonSerializable):
279
307
  }
280
308
  if self.error_info:
281
309
  ret["error_info"] = self.error_info.to_json()
310
+ if self.subdag_infos:
311
+ ret["subdag_infos"] = {k: v.to_json() for k, v in self.subdag_infos.items()}
282
312
  return ret
283
313
 
284
314
 
@@ -302,7 +332,9 @@ class SessionInfo(JsonSerializable):
302
332
  error_info: Optional[ErrorInfo] = ReferenceField("error_info", default=None)
303
333
 
304
334
  @classmethod
305
- def from_json(cls, serialized: dict) -> "SessionInfo":
335
+ def from_json(cls, serialized: dict) -> Optional["SessionInfo"]:
336
+ if serialized is None:
337
+ return None
306
338
  kw = serialized.copy()
307
339
  if kw.get("dag_infos"):
308
340
  kw["dag_infos"] = {
@@ -320,7 +352,10 @@ class SessionInfo(JsonSerializable):
320
352
  "idle_timestamp": self.idle_timestamp,
321
353
  }
322
354
  if self.dag_infos:
323
- ret["dag_infos"] = {k: v.to_json() for k, v in self.dag_infos.items()}
355
+ ret["dag_infos"] = {
356
+ k: v.to_json() if v is not None else None
357
+ for k, v in self.dag_infos.items()
358
+ }
324
359
  if self.error_info:
325
360
  ret["error_info"] = self.error_info.to_json()
326
361
  return ret
@@ -340,9 +375,32 @@ class ExecuteDagRequest(Serializable):
340
375
  value_type=FieldTypes.reference,
341
376
  default=None,
342
377
  )
378
+ new_settings: Dict[str, Any] = DictField(
379
+ "new_settings",
380
+ key_type=FieldTypes.string,
381
+ default=None,
382
+ )
383
+
384
+
385
+ class SubDagSubmitInstanceInfo(JsonSerializable):
386
+ submit_reason: str = StringField("submit_reason")
387
+ instance_id: str = StringField("instance_id")
388
+ subquery_id: Optional[int] = Int32Field("subquery_id", default=None)
389
+
390
+ @classmethod
391
+ def from_json(cls, serialized: dict) -> "SubDagSubmitInstanceInfo":
392
+ return SubDagSubmitInstanceInfo(**serialized)
393
+
394
+ def to_json(self) -> dict:
395
+ ret = {
396
+ "submit_reason": self.submit_reason,
397
+ "instance_id": self.instance_id,
398
+ "subquery_id": self.subquery_id,
399
+ }
400
+ return ret
343
401
 
344
402
 
345
- class SubDagInfo(Serializable):
403
+ class SubDagInfo(JsonSerializable):
346
404
  subdag_id: str = StringField("subdag_id")
347
405
  status: DagStatus = EnumField("status", DagStatus, FieldTypes.int8, default=None)
348
406
  progress: float = Float64Field("progress", default=None)
@@ -355,9 +413,52 @@ class SubDagInfo(Serializable):
355
413
  FieldTypes.reference,
356
414
  default_factory=dict,
357
415
  )
416
+ start_timestamp: Optional[float] = Float64Field("start_timestamp", default=None)
417
+ end_timestamp: Optional[float] = Float64Field("end_timestamp", default=None)
418
+ submit_instances: List[SubDagSubmitInstanceInfo] = ListField(
419
+ "submit_instances",
420
+ FieldTypes.reference,
421
+ default_factory=list,
422
+ )
423
+
424
+ @classmethod
425
+ def from_json(cls, serialized: dict) -> "SubDagInfo":
426
+ kw = serialized.copy()
427
+ kw["status"] = DagStatus(kw["status"])
428
+ if kw.get("tileable_to_result_infos"):
429
+ kw["tileable_to_result_infos"] = {
430
+ k: ResultInfo.from_json(s)
431
+ for k, s in kw["tileable_to_result_infos"].items()
432
+ }
433
+ if kw.get("error_info"):
434
+ kw["error_info"] = ErrorInfo.from_json(kw["error_info"])
435
+ if kw.get("submit_instances"):
436
+ kw["submit_instances"] = [
437
+ SubDagSubmitInstanceInfo.from_json(s) for s in kw["submit_instances"]
438
+ ]
439
+ return SubDagInfo(**kw)
440
+
441
+ def to_json(self) -> dict:
442
+ ret = {
443
+ "subdag_id": self.subdag_id,
444
+ "status": self.status.value,
445
+ "progress": self.progress,
446
+ "start_timestamp": self.start_timestamp,
447
+ "end_timestamp": self.end_timestamp,
448
+ }
449
+ if self.error_info:
450
+ ret["error_info"] = self.error_info.to_json()
451
+ if self.tileable_to_result_infos:
452
+ ret["tileable_to_result_infos"] = {
453
+ k: v.to_json() for k, v in self.tileable_to_result_infos.items()
454
+ }
455
+ if self.submit_instances:
456
+ ret["submit_instances"] = [i.to_json() for i in self.submit_instances]
457
+ return ret
358
458
 
359
459
 
360
460
  class ExecuteSubDagRequest(Serializable):
461
+ subdag_id: str = StringField("subdag_id")
361
462
  dag: TileableGraph = ReferenceField(
362
463
  "dag",
363
464
  on_serialize=SerializableGraph.from_graph,
@@ -371,7 +472,7 @@ class DecrefRequest(Serializable):
371
472
  keys: List[str] = ListField("keys", FieldTypes.string, default=None)
372
473
 
373
474
 
374
- class DataFrameTableMeta(Serializable):
475
+ class DataFrameTableMeta(JsonSerializable):
375
476
  __slots__ = "_pd_column_names", "_pd_index_level_names"
376
477
 
377
478
  table_name: Optional[str] = StringField("table_name", default=None)
@@ -402,7 +503,7 @@ class DataFrameTableMeta(Serializable):
402
503
  self._pd_index_level_names = self.pd_index_dtypes.index.tolist()
403
504
  return self._pd_index_level_names
404
505
 
405
- def __eq__(self, other: "Serializable") -> bool:
506
+ def __eq__(self, other: "DataFrameTableMeta") -> bool:
406
507
  if not isinstance(other, type(self)):
407
508
  return False
408
509
  for k in self._FIELDS:
@@ -413,3 +514,29 @@ class DataFrameTableMeta(Serializable):
413
514
  if not is_same:
414
515
  return False
415
516
  return True
517
+
518
+ def to_json(self) -> dict:
519
+ b64_pk = lambda x: base64.b64encode(pickle.dumps(x)).decode()
520
+ ret = {
521
+ "table_name": self.table_name,
522
+ "type": self.type.value,
523
+ "table_column_names": self.table_column_names,
524
+ "table_index_column_names": self.table_index_column_names,
525
+ "pd_column_dtypes": b64_pk(self.pd_column_dtypes),
526
+ "pd_column_level_names": b64_pk(self.pd_column_level_names),
527
+ "pd_index_dtypes": b64_pk(self.pd_index_dtypes),
528
+ }
529
+ return ret
530
+
531
+ @classmethod
532
+ def from_json(cls, serialized: dict) -> "DataFrameTableMeta":
533
+ b64_upk = lambda x: pickle.loads(base64.b64decode(x))
534
+ serialized.update(
535
+ {
536
+ "type": OutputType(serialized["type"]),
537
+ "pd_column_dtypes": b64_upk(serialized["pd_column_dtypes"]),
538
+ "pd_column_level_names": b64_upk(serialized["pd_column_level_names"]),
539
+ "pd_index_dtypes": b64_upk(serialized["pd_index_dtypes"]),
540
+ }
541
+ )
542
+ return DataFrameTableMeta(**serialized)
maxframe/remote/core.py CHANGED
@@ -15,7 +15,7 @@
15
15
  from functools import partial
16
16
 
17
17
  from .. import opcodes
18
- from ..core import ENTITY_TYPE, ChunkData
18
+ from ..core import ENTITY_TYPE
19
19
  from ..core.operator import ObjectOperator, ObjectOperatorMixin
20
20
  from ..dataframe.core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
21
21
  from ..serialization.serializables import (
@@ -26,7 +26,7 @@ from ..serialization.serializables import (
26
26
  ListField,
27
27
  )
28
28
  from ..tensor.core import TENSOR_TYPE
29
- from ..utils import build_fetch_tileable, find_objects, replace_objects
29
+ from ..utils import find_objects, replace_objects
30
30
 
31
31
 
32
32
  class RemoteFunction(ObjectOperatorMixin, ObjectOperator):
@@ -63,12 +63,8 @@ class RemoteFunction(ObjectOperatorMixin, ObjectOperator):
63
63
  if raw_inputs is not None:
64
64
  for raw_inp in raw_inputs:
65
65
  if self._no_prepare(raw_inp):
66
- if not isinstance(self._inputs[0], ChunkData):
67
- # not in tile, set_inputs from tileable
68
- mapping[raw_inp] = next(function_inputs)
69
- else:
70
- # in tile, set_inputs from chunk
71
- mapping[raw_inp] = build_fetch_tileable(raw_inp)
66
+ # not in tile, set_inputs from tileable
67
+ mapping[raw_inp] = next(function_inputs)
72
68
  else:
73
69
  mapping[raw_inp] = next(function_inputs)
74
70
  self.function_args = replace_objects(self.function_args, mapping)
maxframe/serialization/__init__.py CHANGED
@@ -17,6 +17,7 @@ from .core import (
17
17
  PickleContainer,
18
18
  Serializer,
19
19
  deserialize,
20
+ load_type,
20
21
  pickle_buffers,
21
22
  serialize,
22
23
  serialize_with_spawn,
maxframe/serialization/core.pxd CHANGED
@@ -18,6 +18,9 @@ from libc.stdint cimport int32_t, uint64_t
18
18
  cdef class Serializer:
19
19
  cdef int _serializer_id
20
20
 
21
+ cpdef bint is_public_data_exist(self, dict context, object key)
22
+ cpdef put_public_data(self, dict context, object key, object value)
23
+ cpdef get_public_data(self, dict context, object key)
21
24
  cpdef serial(self, object obj, dict context)
22
25
  cpdef deserial(self, list serialized, dict context, list subs)
23
26
  cpdef on_deserial_error(
maxframe/serialization/core.pyi ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright 1999-2024 Alibaba Group Holding Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from concurrent.futures import Executor
16
+ from typing import Any, Callable, Dict, List, TypeVar
17
+
18
+ def buffered(func: Callable) -> Callable: ...
19
+ def fast_id(obj: Any) -> int: ...
20
+
21
+ LoadType = TypeVar("LoadType")
22
+
23
+ def load_type(class_name: str, parent_class: LoadType) -> LoadType: ...
24
+
25
+ class PickleContainer:
26
+ def __init__(self, buffers: List[bytes]): ...
27
+ def get(self) -> Any: ...
28
+ def get_buffers(self) -> List[bytes]: ...
29
+
30
+ class Serializer:
31
+ serializer_id: int
32
+ def is_public_data_exist(self, context: Dict, key: Any) -> bool: ...
33
+ def put_public_data(self, context: Dict, key: Any, value: Any) -> None: ...
34
+ def get_public_data(self, context: Dict, key: Any) -> Any: ...
35
+ def serial(self, obj: Any, context: Dict): ...
36
+ def deserial(self, serialized: List, context: Dict, subs: List[Any]): ...
37
+ def on_deserial_error(
38
+ self,
39
+ serialized: List,
40
+ context: Dict,
41
+ subs_serialized: List,
42
+ error_index: int,
43
+ exc: BaseException,
44
+ ): ...
45
+ @classmethod
46
+ def register(cls, obj_type): ...
47
+ @classmethod
48
+ def unregister(cls, obj_type): ...
49
+
50
+ class Placeholder:
51
+ id: int
52
+ callbacks: List[Callable]
53
+ def __init__(self, id_: int): ...
54
+ def __hash__(self): ...
55
+ def __eq__(self, other): ...
56
+
57
+ def serialize(obj: Any, context: Dict = None): ...
58
+ async def serialize_with_spawn(
59
+ obj: Any,
60
+ context: Dict = None,
61
+ spawn_threshold: int = 100,
62
+ executor: Executor = None,
63
+ ): ...
64
+ def deserialize(headers: List, buffers: List, context: Dict = None): ...
maxframe/serialization/core.pyx CHANGED
@@ -37,7 +37,7 @@ from .._utils import NamedType
37
37
  from .._utils cimport TypeDispatcher
38
38
 
39
39
  from ..lib import wrapped_pickle as pickle
40
- from ..utils import arrow_type_from_str
40
+ from ..utils import NoDefault, arrow_type_from_str, no_default
41
41
 
42
42
  try:
43
43
  from pandas import ArrowDtype
@@ -94,6 +94,7 @@ cdef:
94
94
  int COMPLEX_SERIALIZER = 12
95
95
  int SLICE_SERIALIZER = 13
96
96
  int REGEX_SERIALIZER = 14
97
+ int NO_DEFAULT_SERIALIZER = 15
97
98
  int PLACEHOLDER_SERIALIZER = 4096
98
99
 
99
100
 
@@ -130,11 +131,30 @@ cdef Serializer get_deserializer(int32_t deserializer_id):
130
131
 
131
132
  cdef class Serializer:
132
133
  serializer_id = None
134
+ _public_data_context_key = 0x7fffffff - 1
133
135
 
134
136
  def __cinit__(self):
135
137
  # make the value can be referenced with C code
136
138
  self._serializer_id = self.serializer_id
137
139
 
140
+ cpdef bint is_public_data_exist(self, dict context, object key):
141
+ cdef dict public_dict = context.get(self._public_data_context_key, None)
142
+ if public_dict is None:
143
+ return False
144
+ return key in public_dict
145
+
146
+ cpdef put_public_data(self, dict context, object key, object value):
147
+ cdef dict public_dict = context.get(self._public_data_context_key, None)
148
+ if public_dict is None:
149
+ public_dict = context[self._public_data_context_key] = {}
150
+ public_dict[key] = value
151
+
152
+ cpdef get_public_data(self, dict context, object key):
153
+ cdef dict public_dict = context.get(self._public_data_context_key, None)
154
+ if public_dict is None:
155
+ return None
156
+ return public_dict.get(key)
157
+
138
158
  cpdef serial(self, object obj, dict context):
139
159
  """
140
160
  Returns intermediate serialization result of certain object.
@@ -784,6 +804,16 @@ cdef class RegexSerializer(Serializer):
784
804
  return re.compile((<bytes>(subs[0])).decode(), serialized[0])
785
805
 
786
806
 
807
+ cdef class NoDefaultSerializer(Serializer):
808
+ serializer_id = NO_DEFAULT_SERIALIZER
809
+
810
+ cpdef serial(self, object obj, dict context):
811
+ return [], [], True
812
+
813
+ cpdef deserial(self, list obj, dict context, list subs):
814
+ return no_default
815
+
816
+
787
817
  cdef class Placeholder:
788
818
  """
789
819
  Placeholder object to reduce duplicated serialization
@@ -838,6 +868,7 @@ DtypeSerializer.register(ExtensionDtype)
838
868
  ComplexSerializer.register(complex)
839
869
  SliceSerializer.register(slice)
840
870
  RegexSerializer.register(re.Pattern)
871
+ NoDefaultSerializer.register(NoDefault)
841
872
  PlaceholderSerializer.register(Placeholder)
842
873
 
843
874
 
@@ -993,17 +1024,20 @@ def serialize(obj, dict context = None):
993
1024
  cdef list subs
994
1025
  cdef bint final
995
1026
  cdef _IdContextHolder id_context_holder = _IdContextHolder()
1027
+ cdef tuple result
996
1028
 
997
1029
  context = context if context is not None else dict()
998
1030
  serialized, subs, final = _serial_single(obj, context, id_context_holder)
999
1031
  if final or not subs:
1000
1032
  # marked as a leaf node, return directly
1001
- return [{}, serialized], subs
1002
-
1003
- serial_stack.append(_SerialStackItem(serialized, subs))
1004
- return _serialize_with_stack(
1005
- serial_stack, None, context, id_context_holder, result_bufs_list
1006
- )
1033
+ result = [{}, serialized], subs
1034
+ else:
1035
+ serial_stack.append(_SerialStackItem(serialized, subs))
1036
+ result = _serialize_with_stack(
1037
+ serial_stack, None, context, id_context_holder, result_bufs_list
1038
+ )
1039
+ result[0][0]["_PUB"] = context.get(Serializer._public_data_context_key)
1040
+ return result
1007
1041
 
1008
1042
 
1009
1043
  async def serialize_with_spawn(
@@ -1036,31 +1070,38 @@ async def serialize_with_spawn(
1036
1070
  cdef list subs
1037
1071
  cdef bint final
1038
1072
  cdef _IdContextHolder id_context_holder = _IdContextHolder()
1073
+ cdef tuple result
1039
1074
 
1040
1075
  context = context if context is not None else dict()
1041
1076
  serialized, subs, final = _serial_single(obj, context, id_context_holder)
1042
1077
  if final or not subs:
1043
1078
  # marked as a leaf node, return directly
1044
- return [{}, serialized], subs
1045
-
1046
- serial_stack.append(_SerialStackItem(serialized, subs))
1079
+ result = [{}, serialized], subs
1080
+ else:
1081
+ serial_stack.append(_SerialStackItem(serialized, subs))
1047
1082
 
1048
- try:
1049
- result = _serialize_with_stack(
1050
- serial_stack, None, context, id_context_holder, result_bufs_list, spawn_threshold
1051
- )
1052
- except _SerializeObjectOverflow as ex:
1053
- result = await asyncio.get_running_loop().run_in_executor(
1054
- executor,
1055
- _serialize_with_stack,
1056
- serial_stack,
1057
- ex.cur_serialized,
1058
- context,
1059
- id_context_holder,
1060
- result_bufs_list,
1061
- 0,
1062
- ex.num_total_serialized,
1063
- )
1083
+ try:
1084
+ result = _serialize_with_stack(
1085
+ serial_stack,
1086
+ None,
1087
+ context,
1088
+ id_context_holder,
1089
+ result_bufs_list,
1090
+ spawn_threshold,
1091
+ )
1092
+ except _SerializeObjectOverflow as ex:
1093
+ result = await asyncio.get_running_loop().run_in_executor(
1094
+ executor,
1095
+ _serialize_with_stack,
1096
+ serial_stack,
1097
+ ex.cur_serialized,
1098
+ context,
1099
+ id_context_holder,
1100
+ result_bufs_list,
1101
+ 0,
1102
+ ex.num_total_serialized,
1103
+ )
1104
+ result[0][0]["_PUB"] = context.get(Serializer._public_data_context_key)
1064
1105
  return result
1065
1106
 
1066
1107
 
maxframe/serialization/exception.py CHANGED
@@ -35,7 +35,7 @@ class RemoteException(MaxFrameError):
35
35
  def from_exception(cls, exc: Exception):
36
36
  try:
37
37
  buffers = pickle_buffers(exc)
38
- except (TypeError, pickle.PicklingError):
38
+ except:
39
39
  logger.exception("Cannot pickle exception %s", exc)
40
40
  buffers = []
41
41