maxframe-0.1.0b5-cp39-cp39-win_amd64.whl → maxframe-1.0.0rc2-cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (92)
  1. maxframe/_utils.cp39-win_amd64.pyd +0 -0
  2. maxframe/codegen.py +6 -2
  3. maxframe/config/config.py +38 -2
  4. maxframe/config/validators.py +1 -0
  5. maxframe/conftest.py +2 -0
  6. maxframe/core/__init__.py +0 -3
  7. maxframe/core/entity/__init__.py +1 -8
  8. maxframe/core/entity/objects.py +3 -45
  9. maxframe/core/graph/core.cp39-win_amd64.pyd +0 -0
  10. maxframe/core/graph/core.pyx +4 -4
  11. maxframe/dataframe/__init__.py +1 -1
  12. maxframe/dataframe/arithmetic/around.py +5 -17
  13. maxframe/dataframe/arithmetic/core.py +15 -7
  14. maxframe/dataframe/arithmetic/docstring.py +5 -55
  15. maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
  16. maxframe/dataframe/core.py +5 -5
  17. maxframe/dataframe/datasource/date_range.py +2 -2
  18. maxframe/dataframe/datasource/read_odps_query.py +6 -0
  19. maxframe/dataframe/datasource/read_odps_table.py +2 -1
  20. maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
  21. maxframe/dataframe/datastore/tests/__init__.py +13 -0
  22. maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
  23. maxframe/dataframe/datastore/to_odps.py +21 -0
  24. maxframe/dataframe/groupby/cum.py +0 -1
  25. maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
  26. maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
  27. maxframe/dataframe/indexing/align.py +1 -1
  28. maxframe/dataframe/indexing/rename.py +3 -37
  29. maxframe/dataframe/indexing/sample.py +0 -1
  30. maxframe/dataframe/indexing/set_index.py +68 -1
  31. maxframe/dataframe/merge/merge.py +236 -2
  32. maxframe/dataframe/merge/tests/test_merge.py +123 -0
  33. maxframe/dataframe/misc/apply.py +5 -10
  34. maxframe/dataframe/misc/case_when.py +1 -1
  35. maxframe/dataframe/misc/describe.py +2 -2
  36. maxframe/dataframe/misc/drop_duplicates.py +4 -25
  37. maxframe/dataframe/misc/eval.py +4 -0
  38. maxframe/dataframe/misc/memory_usage.py +2 -2
  39. maxframe/dataframe/misc/pct_change.py +1 -83
  40. maxframe/dataframe/misc/tests/test_misc.py +23 -0
  41. maxframe/dataframe/misc/transform.py +1 -30
  42. maxframe/dataframe/misc/value_counts.py +4 -17
  43. maxframe/dataframe/missing/dropna.py +1 -1
  44. maxframe/dataframe/missing/fillna.py +5 -5
  45. maxframe/dataframe/sort/sort_values.py +1 -11
  46. maxframe/dataframe/statistics/corr.py +3 -3
  47. maxframe/dataframe/statistics/quantile.py +5 -17
  48. maxframe/dataframe/utils.py +4 -7
  49. maxframe/errors.py +13 -0
  50. maxframe/extension.py +12 -0
  51. maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
  52. maxframe/learn/contrib/xgboost/predict.py +2 -2
  53. maxframe/learn/contrib/xgboost/train.py +2 -2
  54. maxframe/lib/mmh3.cp39-win_amd64.pyd +0 -0
  55. maxframe/lib/mmh3.pyi +43 -0
  56. maxframe/lib/wrapped_pickle.py +2 -1
  57. maxframe/odpsio/__init__.py +1 -1
  58. maxframe/odpsio/arrow.py +8 -4
  59. maxframe/odpsio/schema.py +10 -7
  60. maxframe/odpsio/tableio.py +388 -14
  61. maxframe/odpsio/tests/test_schema.py +16 -15
  62. maxframe/odpsio/tests/test_tableio.py +48 -21
  63. maxframe/protocol.py +148 -12
  64. maxframe/serialization/core.cp39-win_amd64.pyd +0 -0
  65. maxframe/serialization/core.pxd +3 -0
  66. maxframe/serialization/core.pyi +3 -0
  67. maxframe/serialization/core.pyx +54 -25
  68. maxframe/serialization/exception.py +1 -1
  69. maxframe/serialization/pandas.py +7 -2
  70. maxframe/serialization/serializables/core.py +158 -12
  71. maxframe/serialization/serializables/tests/test_serializable.py +46 -4
  72. maxframe/tensor/__init__.py +59 -0
  73. maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
  74. maxframe/tensor/base/atleast_1d.py +1 -1
  75. maxframe/tensor/base/unique.py +3 -3
  76. maxframe/tensor/reduction/count_nonzero.py +1 -1
  77. maxframe/tensor/statistics/quantile.py +2 -2
  78. maxframe/tests/test_protocol.py +34 -0
  79. maxframe/tests/test_utils.py +0 -12
  80. maxframe/tests/utils.py +11 -2
  81. maxframe/utils.py +24 -13
  82. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +75 -2
  83. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +91 -89
  84. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
  85. maxframe_client/__init__.py +0 -1
  86. maxframe_client/fetcher.py +38 -27
  87. maxframe_client/session/odps.py +50 -10
  88. maxframe_client/session/task.py +41 -20
  89. maxframe_client/tests/test_fetcher.py +21 -3
  90. maxframe_client/tests/test_session.py +49 -2
  91. maxframe_client/clients/spe.py +0 -104
  92. {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/odpsio/tests/test_tableio.py CHANGED
@@ -12,22 +12,37 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ import datetime
+
  import numpy as np
  import pandas as pd
  import pyarrow as pa
+ import pytest
  from odps import ODPS

+ from ...config import options
  from ...tests.utils import flaky, tn
  from ...utils import config_odps_default_options
- from ..tableio import HaloTableIO
+ from ..tableio import ODPSTableIO
+
+
+ @pytest.fixture
+ def switch_table_io(request):
+     old_use_common_table = options.use_common_table
+     try:
+         options.use_common_table = request.param
+         yield
+     finally:
+         options.use_common_table = old_use_common_table


  @flaky(max_runs=3)
- def test_empty_table_io():
+ @pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+ def test_empty_table_io(switch_table_io):
      config_odps_default_options()

      o = ODPS.from_environments()
-     halo_table_io = HaloTableIO(o)
+     table_io = ODPSTableIO(o)

      # test read from empty table
      empty_table_name = tn("test_empty_table_halo_read")
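
The new switch_table_io fixture above relies on pytest's indirect parametrization: the value listed in the parametrize marker reaches the fixture as request.param, which toggles options.use_common_table around each test. A minimal, self-contained sketch of the same pattern, assuming a stand-in options object rather than the real maxframe.config.options:

import types

import pytest

# stand-in for maxframe.config.options in this sketch
options = types.SimpleNamespace(use_common_table=False)


@pytest.fixture
def switch_flag(request):
    # request.param carries the value supplied via parametrize(..., indirect=True)
    old_value = options.use_common_table
    try:
        options.use_common_table = request.param
        yield options.use_common_table
    finally:
        options.use_common_table = old_value


@pytest.mark.parametrize("switch_flag", [False, True], indirect=True)
def test_flag_is_applied(switch_flag):
    # the fixture has already applied the option before the test body runs
    assert options.use_common_table is switch_flag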
@@ -35,42 +50,53 @@ def test_empty_table_io():
      tb = o.create_table(empty_table_name, "col1 string", lifecycle=1)

      try:
-         with halo_table_io.open_reader(empty_table_name) as reader:
+         with table_io.open_reader(empty_table_name) as reader:
              assert len(reader.read_all()) == 0
      finally:
          tb.drop()


  @flaky(max_runs=3)
- def test_table_io_without_parts():
+ @pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+ def test_table_io_without_parts(switch_table_io):
      config_odps_default_options()

      o = ODPS.from_environments()
-     halo_table_io = HaloTableIO(o)
+     table_io = ODPSTableIO(o)

      # test read and write tables without partition
      no_part_table_name = tn("test_no_part_halo_write")
      o.delete_table(no_part_table_name, if_exists=True)
-     tb = o.create_table(
-         no_part_table_name, ",".join(f"{c} double" for c in "abcde"), lifecycle=1
-     )
+     col_desc = ",".join(f"{c} double" for c in "abcde") + ", f datetime"
+     tb = o.create_table(no_part_table_name, col_desc, lifecycle=1)

      try:
          pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-         with halo_table_io.open_writer(no_part_table_name) as writer:
+         date_val = [
+             (
+                 datetime.datetime.now().replace(microsecond=0)
+                 + datetime.timedelta(seconds=i)
+             )
+             for i in range(100)
+         ]
+         pd_data["f"] = pd.Series(date_val, dtype="datetime64[ms]").dt.tz_localize(
+             options.local_timezone
+         )
+         with table_io.open_writer(no_part_table_name) as writer:
              writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-         with halo_table_io.open_reader(no_part_table_name) as reader:
+         with table_io.open_reader(no_part_table_name) as reader:
              pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
      finally:
          tb.drop()


  @flaky(max_runs=3)
- def test_table_io_with_range_reader():
+ @pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+ def test_table_io_with_range_reader(switch_table_io):
      config_odps_default_options()

      o = ODPS.from_environments()
-     halo_table_io = HaloTableIO(o)
+     table_io = ODPSTableIO(o)

      # test read and write tables without partition
      no_part_table_name = tn("test_no_part_halo_write")
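
The added datetime column is built as a millisecond-precision, timezone-aware series before being written through pyarrow. A small sketch of that construction, assuming pandas 2.x (which supports non-nanosecond units) and using "UTC" in place of options.local_timezone:

import datetime

import pandas as pd

base = datetime.datetime.now().replace(microsecond=0)
values = [base + datetime.timedelta(seconds=i) for i in range(5)]
# "UTC" stands in for options.local_timezone in this sketch
col = pd.Series(values, dtype="datetime64[ms]").dt.tz_localize("UTC")
print(col.dtype)  # datetime64[ms, UTC]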
@@ -81,15 +107,15 @@ def test_table_io_with_range_reader():

      try:
          pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-         with halo_table_io.open_writer(no_part_table_name) as writer:
+         with table_io.open_writer(no_part_table_name) as writer:
              writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))

-         with halo_table_io.open_reader(
+         with table_io.open_reader(
              no_part_table_name, start=None, stop=100, row_batch_size=10
          ) as reader:
              pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)

-         with halo_table_io.open_reader(
+         with table_io.open_reader(
              no_part_table_name,
              start=-2,
              stop=-52,
@@ -105,11 +131,12 @@ def test_table_io_with_range_reader():


  @flaky(max_runs=3)
- def test_table_io_with_parts():
+ @pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+ def test_table_io_with_parts(switch_table_io):
      config_odps_default_options()

      o = ODPS.from_environments()
-     halo_table_io = HaloTableIO(o)
+     table_io = ODPSTableIO(o)

      # test read and write tables with partition
      parted_table_name = tn("test_parted_halo_write")
@@ -122,11 +149,11 @@ def test_table_io_with_parts():

      try:
          pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-         with halo_table_io.open_writer(parted_table_name, "pt=test") as writer:
+         with table_io.open_writer(parted_table_name, "pt=test") as writer:
              writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-         with halo_table_io.open_reader(parted_table_name, "pt=test") as reader:
+         with table_io.open_reader(parted_table_name, "pt=test") as reader:
              pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
-         with halo_table_io.open_reader(
+         with table_io.open_reader(
              parted_table_name, "pt=test", partition_columns=True
          ) as reader:
              expected_data = pd_data.copy()
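
Taken together, these tests exercise the renamed ODPSTableIO reader/writer API (formerly HaloTableIO). A condensed usage sketch based only on the calls visible in this diff; the ODPS credentials are assumed to be in the environment and "my_table" is a hypothetical partitioned table whose schema matches the written data:

import pyarrow as pa
from odps import ODPS

from maxframe.odpsio.tableio import ODPSTableIO  # import path per the test's relative import

o = ODPS.from_environments()
table_io = ODPSTableIO(o)

# write a pyarrow table into one partition, then read it back
with table_io.open_writer("my_table", "pt=test") as writer:
    writer.write(pa.table({"a": [1.0, 2.0]}))

with table_io.open_reader("my_table", "pt=test", partition_columns=True) as reader:
    df = reader.read_all().to_pandas()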
maxframe/protocol.py CHANGED
@@ -32,6 +32,7 @@ from .serialization.serializables import (
      EnumField,
      FieldTypes,
      Float64Field,
+     Int32Field,
      ListField,
      ReferenceField,
      Serializable,
@@ -71,6 +72,9 @@ class DagStatus(enum.Enum):
      CANCELLING = 4
      CANCELLED = 5

+     def is_terminated(self):
+         return self in (DagStatus.CANCELLED, DagStatus.SUCCEEDED, DagStatus.FAILED)
+

  class DimensionIndex(Serializable):
      is_slice: bool = BoolField("is_slice", default=None)
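
DagStatus.is_terminated() gives callers a single check for the three terminal states instead of comparing against each one. A hypothetical polling loop that could use it; get_dag_info below is a placeholder for whatever client call fetches the latest DagInfo:

import time


def wait_for_dag(get_dag_info, interval: float = 1.0):
    # get_dag_info is a hypothetical callable returning an object with a DagStatus .status
    while True:
        info = get_dag_info()
        if info.status.is_terminated():
            return info
        time.sleep(interval)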
@@ -150,6 +154,9 @@ class ODPSTableResultInfo(ResultInfo):
      partition_specs: Optional[List[str]] = ListField(
          "partition_specs", FieldTypes.string, default=None
      )
+     table_meta: Optional["DataFrameTableMeta"] = ReferenceField(
+         "table_meta", default=None
+     )

      def __init__(self, result_type: ResultType = None, **kw):
          result_type = result_type or ResultType.ODPS_TABLE
@@ -160,8 +167,17 @@
          ret["full_table_name"] = self.full_table_name
          if self.partition_specs:
              ret["partition_specs"] = self.partition_specs
+         if self.table_meta:
+             ret["table_meta"] = self.table_meta.to_json()
          return ret

+     @classmethod
+     def _json_to_kwargs(cls, serialized: dict) -> dict:
+         kw = super()._json_to_kwargs(serialized)
+         if "table_meta" in kw:
+             kw["table_meta"] = DataFrameTableMeta.from_json(kw["table_meta"])
+         return kw
+

  class ODPSVolumeResultInfo(ResultInfo):
      _result_type = ResultType.ODPS_VOLUME
@@ -190,9 +206,9 @@ class ErrorInfo(JsonSerializable):
          "error_tracebacks", FieldTypes.list
      )
      raw_error_source: ErrorSource = EnumField(
-         "raw_error_source", ErrorSource, FieldTypes.int8
+         "raw_error_source", ErrorSource, FieldTypes.int8, default=None
      )
-     raw_error_data: Optional[Exception] = AnyField("raw_error_data")
+     raw_error_data: Optional[Exception] = AnyField("raw_error_data", default=None)

      @classmethod
      def from_exception(cls, exc: Exception):
@@ -201,20 +217,29 @@
          return cls(messages, tracebacks, ErrorSource.PYTHON, exc)

      def reraise(self):
-         if self.raw_error_source == ErrorSource.PYTHON:
+         if (
+             self.raw_error_source == ErrorSource.PYTHON
+             and self.raw_error_data is not None
+         ):
              raise self.raw_error_data
          raise RemoteException(self.error_messages, self.error_tracebacks, [])

      @classmethod
      def from_json(cls, serialized: dict) -> "ErrorInfo":
          kw = serialized.copy()
-         kw["raw_error_source"] = ErrorSource(serialized["raw_error_source"])
+         if kw.get("raw_error_source") is not None:
+             kw["raw_error_source"] = ErrorSource(serialized["raw_error_source"])
+         else:
+             kw["raw_error_source"] = None
+
          if kw.get("raw_error_data"):
              bufs = [base64.b64decode(s) for s in kw["raw_error_data"]]
              try:
                  kw["raw_error_data"] = pickle.loads(bufs[0], buffers=bufs[1:])
              except:
-                 kw["raw_error_data"] = None
+                 # both error source and data shall be None to make sure
+                 # RemoteException is raised.
+                 kw["raw_error_source"] = kw["raw_error_data"] = None
          return cls(**kw)

      def to_json(self) -> dict:
@@ -227,7 +252,12 @@
          if isinstance(self.raw_error_data, (PickleContainer, RemoteException)):
              err_data_bufs = self.raw_error_data.get_buffers()
          elif isinstance(self.raw_error_data, BaseException):
-             err_data_bufs = pickle_buffers(self.raw_error_data)
+             try:
+                 err_data_bufs = pickle_buffers(self.raw_error_data)
+             except:
+                 err_data_bufs = None
+                 ret["raw_error_source"] = None
+
          if err_data_bufs:
              ret["raw_error_data"] = [
                  base64.b64encode(s).decode() for s in err_data_bufs
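
Together, the reraise/from_json/to_json changes make the remote-error path degrade gracefully: when the raw exception cannot be pickled or unpickled, both raw_error_source and raw_error_data are cleared so reraise() falls back to a generic RemoteException instead of failing. A standalone sketch of that encode/decode-with-fallback pattern (not the maxframe class itself):

import base64
import pickle


def encode_error(exc):
    # mirror of to_json(): pickle + base64 when possible, otherwise drop both fields
    try:
        payload = [base64.b64encode(pickle.dumps(exc)).decode()]
        source = "PYTHON"
    except Exception:
        payload = source = None
    return {"raw_error_source": source, "raw_error_data": payload}


def decode_error(serialized):
    # mirror of from_json(): clear both fields when the payload cannot be restored
    data = serialized.get("raw_error_data")
    source = serialized.get("raw_error_source")
    if data:
        try:
            return source, pickle.loads(base64.b64decode(data[0]))
        except Exception:
            pass
    return None, None  # caller then raises a generic RemoteException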
@@ -249,9 +279,17 @@ class DagInfo(JsonSerializable):
      error_info: Optional[ErrorInfo] = ReferenceField("error_info", default=None)
      start_timestamp: Optional[float] = Float64Field("start_timestamp", default=None)
      end_timestamp: Optional[float] = Float64Field("end_timestamp", default=None)
+     subdag_infos: Dict[str, "SubDagInfo"] = DictField(
+         "subdag_infos",
+         key_type=FieldTypes.string,
+         value_type=FieldTypes.reference,
+         default_factory=dict,
+     )

      @classmethod
-     def from_json(cls, serialized: dict) -> "DagInfo":
+     def from_json(cls, serialized: dict) -> Optional["DagInfo"]:
+         if serialized is None:
+             return None
          kw = serialized.copy()
          kw["status"] = DagStatus(kw["status"])
          if kw.get("tileable_to_result_infos"):
@@ -261,6 +299,10 @@
              }
          if kw.get("error_info"):
              kw["error_info"] = ErrorInfo.from_json(kw["error_info"])
+         if kw.get("subdag_infos"):
+             kw["subdag_infos"] = {
+                 k: SubDagInfo.from_json(v) for k, v in kw["subdag_infos"].items()
+             }
          return DagInfo(**kw)

      def to_json(self) -> dict:
@@ -279,6 +321,8 @@
              }
          if self.error_info:
              ret["error_info"] = self.error_info.to_json()
+         if self.subdag_infos:
+             ret["subdag_infos"] = {k: v.to_json() for k, v in self.subdag_infos.items()}
          return ret


@@ -302,7 +346,9 @@ class SessionInfo(JsonSerializable):
      error_info: Optional[ErrorInfo] = ReferenceField("error_info", default=None)

      @classmethod
-     def from_json(cls, serialized: dict) -> "SessionInfo":
+     def from_json(cls, serialized: dict) -> Optional["SessionInfo"]:
+         if serialized is None:
+             return None
          kw = serialized.copy()
          if kw.get("dag_infos"):
              kw["dag_infos"] = {
@@ -320,7 +366,10 @@
              "idle_timestamp": self.idle_timestamp,
          }
          if self.dag_infos:
-             ret["dag_infos"] = {k: v.to_json() for k, v in self.dag_infos.items()}
+             ret["dag_infos"] = {
+                 k: v.to_json() if v is not None else None
+                 for k, v in self.dag_infos.items()
+             }
          if self.error_info:
              ret["error_info"] = self.error_info.to_json()
          return ret
@@ -342,7 +391,25 @@ class ExecuteDagRequest(Serializable):
      )


- class SubDagInfo(Serializable):
+ class SubDagSubmitInstanceInfo(JsonSerializable):
+     submit_reason: str = StringField("submit_reason")
+     instance_id: str = StringField("instance_id")
+     subquery_id: Optional[int] = Int32Field("subquery_id", default=None)
+
+     @classmethod
+     def from_json(cls, serialized: dict) -> "SubDagSubmitInstanceInfo":
+         return SubDagSubmitInstanceInfo(**serialized)
+
+     def to_json(self) -> dict:
+         ret = {
+             "submit_reason": self.submit_reason,
+             "instance_id": self.instance_id,
+             "subquery_id": self.subquery_id,
+         }
+         return ret
+
+
+ class SubDagInfo(JsonSerializable):
      subdag_id: str = StringField("subdag_id")
      status: DagStatus = EnumField("status", DagStatus, FieldTypes.int8, default=None)
      progress: float = Float64Field("progress", default=None)
@@ -355,9 +422,52 @@ class SubDagInfo(Serializable):
          FieldTypes.reference,
          default_factory=dict,
      )
+     start_timestamp: Optional[float] = Float64Field("start_timestamp", default=None)
+     end_timestamp: Optional[float] = Float64Field("end_timestamp", default=None)
+     submit_instances: List[SubDagSubmitInstanceInfo] = ListField(
+         "submit_instances",
+         FieldTypes.reference,
+         default_factory=list,
+     )
+
+     @classmethod
+     def from_json(cls, serialized: dict) -> "SubDagInfo":
+         kw = serialized.copy()
+         kw["status"] = DagStatus(kw["status"])
+         if kw.get("tileable_to_result_infos"):
+             kw["tileable_to_result_infos"] = {
+                 k: ResultInfo.from_json(s)
+                 for k, s in kw["tileable_to_result_infos"].items()
+             }
+         if kw.get("error_info"):
+             kw["error_info"] = ErrorInfo.from_json(kw["error_info"])
+         if kw.get("submit_instances"):
+             kw["submit_instances"] = [
+                 SubDagSubmitInstanceInfo.from_json(s) for s in kw["submit_instances"]
+             ]
+         return SubDagInfo(**kw)
+
+     def to_json(self) -> dict:
+         ret = {
+             "subdag_id": self.subdag_id,
+             "status": self.status.value,
+             "progress": self.progress,
+             "start_timestamp": self.start_timestamp,
+             "end_timestamp": self.end_timestamp,
+         }
+         if self.error_info:
+             ret["error_info"] = self.error_info.to_json()
+         if self.tileable_to_result_infos:
+             ret["tileable_to_result_infos"] = {
+                 k: v.to_json() for k, v in self.tileable_to_result_infos.items()
+             }
+         if self.submit_instances:
+             ret["submit_instances"] = [i.to_json() for i in self.submit_instances]
+         return ret


  class ExecuteSubDagRequest(Serializable):
+     subdag_id: str = StringField("subdag_id")
      dag: TileableGraph = ReferenceField(
          "dag",
          on_serialize=SerializableGraph.from_graph,
@@ -371,7 +481,7 @@ class DecrefRequest(Serializable):
      keys: List[str] = ListField("keys", FieldTypes.string, default=None)


- class DataFrameTableMeta(Serializable):
+ class DataFrameTableMeta(JsonSerializable):
      __slots__ = "_pd_column_names", "_pd_index_level_names"

      table_name: Optional[str] = StringField("table_name", default=None)
@@ -402,7 +512,7 @@ class DataFrameTableMeta(Serializable):
          self._pd_index_level_names = self.pd_index_dtypes.index.tolist()
          return self._pd_index_level_names

-     def __eq__(self, other: "Serializable") -> bool:
+     def __eq__(self, other: "DataFrameTableMeta") -> bool:
          if not isinstance(other, type(self)):
              return False
          for k in self._FIELDS:
@@ -413,3 +523,29 @@
              if not is_same:
                  return False
          return True
+
+     def to_json(self) -> dict:
+         b64_pk = lambda x: base64.b64encode(pickle.dumps(x))
+         ret = {
+             "table_name": self.table_name,
+             "type": self.type.value,
+             "table_column_names": self.table_column_names,
+             "table_index_column_names": self.table_index_column_names,
+             "pd_column_dtypes": b64_pk(self.pd_column_dtypes),
+             "pd_column_level_names": b64_pk(self.pd_column_level_names),
+             "pd_index_dtypes": b64_pk(self.pd_index_dtypes),
+         }
+         return ret
+
+     @classmethod
+     def from_json(cls, serialized: dict) -> "DataFrameTableMeta":
+         b64_upk = lambda x: pickle.loads(base64.b64decode(x))
+         serialized.update(
+             {
+                 "type": OutputType(serialized["type"]),
+                 "pd_column_dtypes": b64_upk(serialized["pd_column_dtypes"]),
+                 "pd_column_level_names": b64_upk(serialized["pd_column_level_names"]),
+                 "pd_index_dtypes": b64_upk(serialized["pd_index_dtypes"]),
+             }
+         )
+         return DataFrameTableMeta(**serialized)
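
DataFrameTableMeta now travels inside JSON payloads by pickling the pandas dtype metadata and base64-encoding the result. A minimal sketch of that round trip for a dtype series; a .decode() step is added here so the encoded value is a plain string rather than bytes:

import base64
import pickle

import pandas as pd


def dtypes_to_json_value(dtypes: pd.Series) -> str:
    return base64.b64encode(pickle.dumps(dtypes)).decode()


def dtypes_from_json_value(encoded: str) -> pd.Series:
    return pickle.loads(base64.b64decode(encoded))


dtypes = pd.Series({"a": "float64", "b": "int64"}).map(pd.api.types.pandas_dtype)
restored = dtypes_from_json_value(dtypes_to_json_value(dtypes))
assert list(restored) == list(dtypes) and list(restored.index) == list(dtypes.index)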
maxframe/serialization/core.pxd CHANGED
@@ -18,6 +18,9 @@ from libc.stdint cimport int32_t, uint64_t
  cdef class Serializer:
      cdef int _serializer_id

+     cpdef bint is_public_data_exist(self, dict context, object key)
+     cpdef put_public_data(self, dict context, object key, object value)
+     cpdef get_public_data(self, dict context, object key)
      cpdef serial(self, object obj, dict context)
      cpdef deserial(self, list serialized, dict context, list subs)
      cpdef on_deserial_error(
maxframe/serialization/core.pyi CHANGED
@@ -29,6 +29,9 @@ class PickleContainer:

  class Serializer:
      serializer_id: int
+     def is_public_data_exist(self, context: Dict, key: Any) -> bool: ...
+     def put_public_data(self, context: Dict, key: Any, value: Any) -> None: ...
+     def get_public_data(self, context: Dict, key: Any) -> Any: ...
      def serial(self, obj: Any, context: Dict): ...
      def deserial(self, serialized: List, context: Dict, subs: List[Any]): ...
      def on_deserial_error(
maxframe/serialization/core.pyx CHANGED
@@ -130,11 +130,30 @@ cdef Serializer get_deserializer(int32_t deserializer_id):

  cdef class Serializer:
      serializer_id = None
+     _public_data_context_key = 0x7fffffff - 1

      def __cinit__(self):
          # make the value can be referenced with C code
          self._serializer_id = self.serializer_id

+     cpdef bint is_public_data_exist(self, dict context, object key):
+         cdef dict public_dict = context.get(self._public_data_context_key, None)
+         if public_dict is None:
+             return False
+         return key in public_dict
+
+     cpdef put_public_data(self, dict context, object key, object value):
+         cdef dict public_dict = context.get(self._public_data_context_key, None)
+         if public_dict is None:
+             public_dict = context[self._public_data_context_key] = {}
+         public_dict[key] = value
+
+     cpdef get_public_data(self, dict context, object key):
+         cdef dict public_dict = context.get(self._public_data_context_key, None)
+         if public_dict is None:
+             return None
+         return public_dict.get(key)
+
      cpdef serial(self, object obj, dict context):
          """
          Returns intermediate serialization result of certain object.
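
The new is_public_data_exist / put_public_data / get_public_data helpers let any serializer stash values under a reserved slot of the shared serialization context so that later serializers, or the final envelope, can pick them up. A plain-Python sketch of that mechanism, using the same reserved key as the Cython code:

# reserved slot inside the serialization context dict, as in the Cython code
_PUBLIC_DATA_KEY = 0x7FFFFFFF - 1


def put_public_data(context: dict, key, value):
    context.setdefault(_PUBLIC_DATA_KEY, {})[key] = value


def get_public_data(context: dict, key):
    return context.get(_PUBLIC_DATA_KEY, {}).get(key)


context = {}
put_public_data(context, "table_meta", {"table_name": "tmp_result"})
assert get_public_data(context, "table_meta") == {"table_name": "tmp_result"}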
@@ -993,17 +1012,20 @@ def serialize(obj, dict context = None):
      cdef list subs
      cdef bint final
      cdef _IdContextHolder id_context_holder = _IdContextHolder()
+     cdef tuple result

      context = context if context is not None else dict()
      serialized, subs, final = _serial_single(obj, context, id_context_holder)
      if final or not subs:
          # marked as a leaf node, return directly
-         return [{}, serialized], subs
-
-     serial_stack.append(_SerialStackItem(serialized, subs))
-     return _serialize_with_stack(
-         serial_stack, None, context, id_context_holder, result_bufs_list
-     )
+         result = [{}, serialized], subs
+     else:
+         serial_stack.append(_SerialStackItem(serialized, subs))
+         result = _serialize_with_stack(
+             serial_stack, None, context, id_context_holder, result_bufs_list
+         )
+     result[0][0]["_PUB"] = context.get(Serializer._public_data_context_key)
+     return result


  async def serialize_with_spawn(
@@ -1036,31 +1058,38 @@
      cdef list subs
      cdef bint final
      cdef _IdContextHolder id_context_holder = _IdContextHolder()
+     cdef tuple result

      context = context if context is not None else dict()
      serialized, subs, final = _serial_single(obj, context, id_context_holder)
      if final or not subs:
          # marked as a leaf node, return directly
-         return [{}, serialized], subs
-
-     serial_stack.append(_SerialStackItem(serialized, subs))
+         result = [{}, serialized], subs
+     else:
+         serial_stack.append(_SerialStackItem(serialized, subs))

-     try:
-         result = _serialize_with_stack(
-             serial_stack, None, context, id_context_holder, result_bufs_list, spawn_threshold
-         )
-     except _SerializeObjectOverflow as ex:
-         result = await asyncio.get_running_loop().run_in_executor(
-             executor,
-             _serialize_with_stack,
-             serial_stack,
-             ex.cur_serialized,
-             context,
-             id_context_holder,
-             result_bufs_list,
-             0,
-             ex.num_total_serialized,
-         )
+         try:
+             result = _serialize_with_stack(
+                 serial_stack,
+                 None,
+                 context,
+                 id_context_holder,
+                 result_bufs_list,
+                 spawn_threshold,
+             )
+         except _SerializeObjectOverflow as ex:
+             result = await asyncio.get_running_loop().run_in_executor(
+                 executor,
+                 _serialize_with_stack,
+                 serial_stack,
+                 ex.cur_serialized,
+                 context,
+                 id_context_holder,
+                 result_bufs_list,
+                 0,
+                 ex.num_total_serialized,
+             )
+     result[0][0]["_PUB"] = context.get(Serializer._public_data_context_key)
      return result


maxframe/serialization/exception.py CHANGED
@@ -35,7 +35,7 @@ class RemoteException(MaxFrameError):
      def from_exception(cls, exc: Exception):
          try:
              buffers = pickle_buffers(exc)
-         except (TypeError, pickle.PicklingError):
+         except:
              logger.exception("Cannot pickle exception %s", exc)
              buffers = []

maxframe/serialization/pandas.py CHANGED
@@ -176,11 +176,16 @@ class PdTimestampSerializer(Serializer):

  class PdTimedeltaSerializer(Serializer):
      def serial(self, obj: pd.Timedelta, context: Dict):
-         return [int(obj.seconds), obj.microseconds, obj.nanoseconds], [], True
+         return [int(obj.seconds), obj.microseconds, obj.nanoseconds, obj.days], [], True

      def deserial(self, serialized: List, context: Dict, subs: List):
+         days = 0 if len(serialized) < 4 else serialized[3]
+         seconds, microseconds, nanoseconds = serialized[:3]
          return pd.Timedelta(
-             seconds=serialized[0], microseconds=serialized[1], nanoseconds=serialized[2]
+             days=days,
+             seconds=seconds,
+             microseconds=microseconds,
+             nanoseconds=nanoseconds,
          )

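The PdTimedeltaSerializer change fixes a silent data loss: the old seconds/microseconds/nanoseconds triple drops Timedelta.days, so any delta of a day or more came back shortened. A short sketch of the decomposition the new code round-trips, with the old behaviour shown for comparison:

import pandas as pd

td = pd.Timedelta(days=2, hours=3, microseconds=7)
parts = [int(td.seconds), td.microseconds, td.nanoseconds, td.days]

# new behaviour: days are carried through, so the value survives intact
restored = pd.Timedelta(
    days=parts[3], seconds=parts[0], microseconds=parts[1], nanoseconds=parts[2]
)
assert restored == td

# old behaviour: the day component was silently dropped
old = pd.Timedelta(seconds=parts[0], microseconds=parts[1], nanoseconds=parts[2])
assert old == td - pd.Timedelta(days=td.days)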