maxframe 0.1.0b5__cp311-cp311-macosx_10_9_universal2.whl → 1.0.0rc2__cp311-cp311-macosx_10_9_universal2.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of maxframe might be problematic.
- maxframe/_utils.cpython-311-darwin.so +0 -0
- maxframe/codegen.py +6 -2
- maxframe/config/config.py +38 -2
- maxframe/config/validators.py +1 -0
- maxframe/conftest.py +2 -0
- maxframe/core/__init__.py +0 -3
- maxframe/core/entity/__init__.py +1 -8
- maxframe/core/entity/objects.py +3 -45
- maxframe/core/graph/core.cpython-311-darwin.so +0 -0
- maxframe/core/graph/core.pyx +4 -4
- maxframe/dataframe/__init__.py +1 -1
- maxframe/dataframe/arithmetic/around.py +5 -17
- maxframe/dataframe/arithmetic/core.py +15 -7
- maxframe/dataframe/arithmetic/docstring.py +5 -55
- maxframe/dataframe/arithmetic/tests/test_arithmetic.py +22 -0
- maxframe/dataframe/core.py +5 -5
- maxframe/dataframe/datasource/date_range.py +2 -2
- maxframe/dataframe/datasource/read_odps_query.py +6 -0
- maxframe/dataframe/datasource/read_odps_table.py +2 -1
- maxframe/dataframe/datasource/tests/test_datasource.py +14 -0
- maxframe/dataframe/datastore/tests/__init__.py +13 -0
- maxframe/dataframe/datastore/tests/test_to_odps.py +48 -0
- maxframe/dataframe/datastore/to_odps.py +21 -0
- maxframe/dataframe/groupby/cum.py +0 -1
- maxframe/dataframe/groupby/tests/test_groupby.py +4 -0
- maxframe/dataframe/indexing/add_prefix_suffix.py +1 -1
- maxframe/dataframe/indexing/align.py +1 -1
- maxframe/dataframe/indexing/rename.py +3 -37
- maxframe/dataframe/indexing/sample.py +0 -1
- maxframe/dataframe/indexing/set_index.py +68 -1
- maxframe/dataframe/merge/merge.py +236 -2
- maxframe/dataframe/merge/tests/test_merge.py +123 -0
- maxframe/dataframe/misc/apply.py +5 -10
- maxframe/dataframe/misc/case_when.py +1 -1
- maxframe/dataframe/misc/describe.py +2 -2
- maxframe/dataframe/misc/drop_duplicates.py +4 -25
- maxframe/dataframe/misc/eval.py +4 -0
- maxframe/dataframe/misc/memory_usage.py +2 -2
- maxframe/dataframe/misc/pct_change.py +1 -83
- maxframe/dataframe/misc/tests/test_misc.py +23 -0
- maxframe/dataframe/misc/transform.py +1 -30
- maxframe/dataframe/misc/value_counts.py +4 -17
- maxframe/dataframe/missing/dropna.py +1 -1
- maxframe/dataframe/missing/fillna.py +5 -5
- maxframe/dataframe/sort/sort_values.py +1 -11
- maxframe/dataframe/statistics/corr.py +3 -3
- maxframe/dataframe/statistics/quantile.py +5 -17
- maxframe/dataframe/utils.py +4 -7
- maxframe/errors.py +13 -0
- maxframe/extension.py +12 -0
- maxframe/learn/contrib/xgboost/dmatrix.py +2 -2
- maxframe/learn/contrib/xgboost/predict.py +2 -2
- maxframe/learn/contrib/xgboost/train.py +2 -2
- maxframe/lib/mmh3.cpython-311-darwin.so +0 -0
- maxframe/lib/mmh3.pyi +43 -0
- maxframe/lib/wrapped_pickle.py +2 -1
- maxframe/odpsio/__init__.py +1 -1
- maxframe/odpsio/arrow.py +8 -4
- maxframe/odpsio/schema.py +10 -7
- maxframe/odpsio/tableio.py +388 -14
- maxframe/odpsio/tests/test_schema.py +16 -15
- maxframe/odpsio/tests/test_tableio.py +48 -21
- maxframe/protocol.py +148 -12
- maxframe/serialization/core.cpython-311-darwin.so +0 -0
- maxframe/serialization/core.pxd +3 -0
- maxframe/serialization/core.pyi +3 -0
- maxframe/serialization/core.pyx +54 -25
- maxframe/serialization/exception.py +1 -1
- maxframe/serialization/pandas.py +7 -2
- maxframe/serialization/serializables/core.py +158 -12
- maxframe/serialization/serializables/tests/test_serializable.py +46 -4
- maxframe/tensor/__init__.py +59 -0
- maxframe/tensor/arithmetic/tests/test_arithmetic.py +1 -1
- maxframe/tensor/base/atleast_1d.py +1 -1
- maxframe/tensor/base/unique.py +3 -3
- maxframe/tensor/reduction/count_nonzero.py +1 -1
- maxframe/tensor/statistics/quantile.py +2 -2
- maxframe/tests/test_protocol.py +34 -0
- maxframe/tests/test_utils.py +0 -12
- maxframe/tests/utils.py +11 -2
- maxframe/utils.py +24 -13
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/METADATA +75 -2
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/RECORD +91 -89
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/WHEEL +1 -1
- maxframe_client/__init__.py +0 -1
- maxframe_client/fetcher.py +38 -27
- maxframe_client/session/odps.py +50 -10
- maxframe_client/session/task.py +41 -20
- maxframe_client/tests/test_fetcher.py +21 -3
- maxframe_client/tests/test_session.py +49 -2
- maxframe_client/clients/spe.py +0 -104
- {maxframe-0.1.0b5.dist-info → maxframe-1.0.0rc2.dist-info}/top_level.txt +0 -0
maxframe/odpsio/tests/test_tableio.py
CHANGED

@@ -12,22 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import datetime
+
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pytest
 from odps import ODPS

+from ...config import options
 from ...tests.utils import flaky, tn
 from ...utils import config_odps_default_options
-from ..tableio import
+from ..tableio import ODPSTableIO
+
+
+@pytest.fixture
+def switch_table_io(request):
+    old_use_common_table = options.use_common_table
+    try:
+        options.use_common_table = request.param
+        yield
+    finally:
+        options.use_common_table = old_use_common_table


 @flaky(max_runs=3)
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_empty_table_io(switch_table_io):
     config_odps_default_options()

     o = ODPS.from_environments()
+    table_io = ODPSTableIO(o)

     # test read from empty table
     empty_table_name = tn("test_empty_table_halo_read")
@@ -35,42 +50,53 @@ def test_empty_table_io():
     tb = o.create_table(empty_table_name, "col1 string", lifecycle=1)

     try:
-        with
+        with table_io.open_reader(empty_table_name) as reader:
             assert len(reader.read_all()) == 0
     finally:
         tb.drop()


 @flaky(max_runs=3)
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_without_parts(switch_table_io):
     config_odps_default_options()

     o = ODPS.from_environments()
+    table_io = ODPSTableIO(o)

     # test read and write tables without partition
     no_part_table_name = tn("test_no_part_halo_write")
     o.delete_table(no_part_table_name, if_exists=True)
-    )
+    col_desc = ",".join(f"{c} double" for c in "abcde") + ", f datetime"
+    tb = o.create_table(no_part_table_name, col_desc, lifecycle=1)

     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
+        date_val = [
+            (
+                datetime.datetime.now().replace(microsecond=0)
+                + datetime.timedelta(seconds=i)
+            )
+            for i in range(100)
+        ]
+        pd_data["f"] = pd.Series(date_val, dtype="datetime64[ms]").dt.tz_localize(
+            options.local_timezone
+        )
+        with table_io.open_writer(no_part_table_name) as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-        with
+        with table_io.open_reader(no_part_table_name) as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
     finally:
         tb.drop()


 @flaky(max_runs=3)
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_with_range_reader(switch_table_io):
     config_odps_default_options()

     o = ODPS.from_environments()
+    table_io = ODPSTableIO(o)

     # test read and write tables without partition
     no_part_table_name = tn("test_no_part_halo_write")
@@ -81,15 +107,15 @@ def test_table_io_with_range_reader():

     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with
+        with table_io.open_writer(no_part_table_name) as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))

-        with
+        with table_io.open_reader(
             no_part_table_name, start=None, stop=100, row_batch_size=10
         ) as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)

-        with
+        with table_io.open_reader(
             no_part_table_name,
             start=-2,
             stop=-52,
@@ -105,11 +131,12 @@ def test_table_io_with_range_reader():


 @flaky(max_runs=3)
+@pytest.mark.parametrize("switch_table_io", [False, True], indirect=True)
+def test_table_io_with_parts(switch_table_io):
     config_odps_default_options()

     o = ODPS.from_environments()
+    table_io = ODPSTableIO(o)

     # test read and write tables with partition
     parted_table_name = tn("test_parted_halo_write")
@@ -122,11 +149,11 @@ def test_table_io_with_parts():

     try:
         pd_data = pd.DataFrame(np.random.rand(100, 5), columns=list("abcde"))
-        with
+        with table_io.open_writer(parted_table_name, "pt=test") as writer:
             writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
-        with
+        with table_io.open_reader(parted_table_name, "pt=test") as reader:
             pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
-        with
+        with table_io.open_reader(
             parted_table_name, "pt=test", partition_columns=True
         ) as reader:
             expected_data = pd_data.copy()
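
The updated tests run each scenario twice through the switch_table_io fixture, toggling options.use_common_table, and route all reads and writes through ODPSTableIO. Below is a minimal round-trip sketch of that API, not taken from the diff itself; it assumes ODPS credentials are available in environment variables and that "my_table" is a hypothetical, pre-created table with columns a..e of type double.

import numpy as np
import pandas as pd
import pyarrow as pa
from odps import ODPS

from maxframe.config import options
from maxframe.odpsio.tableio import ODPSTableIO

options.use_common_table = False  # the flag the switch_table_io fixture toggles

o = ODPS.from_environments()
table_io = ODPSTableIO(o)

# "my_table" is a hypothetical table created beforehand, e.g. with
# o.create_table("my_table", "a double, b double, c double, d double, e double")
pd_data = pd.DataFrame(np.random.rand(10, 5), columns=list("abcde"))
with table_io.open_writer("my_table") as writer:
    writer.write(pa.Table.from_pandas(pd_data, preserve_index=False))
with table_io.open_reader("my_table", start=None, stop=10, row_batch_size=5) as reader:
    pd.testing.assert_frame_equal(reader.read_all().to_pandas(), pd_data)
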
maxframe/protocol.py
CHANGED
@@ -32,6 +32,7 @@ from .serialization.serializables import (
     EnumField,
     FieldTypes,
     Float64Field,
+    Int32Field,
     ListField,
     ReferenceField,
     Serializable,
@@ -71,6 +72,9 @@ class DagStatus(enum.Enum):
     CANCELLING = 4
     CANCELLED = 5

+    def is_terminated(self):
+        return self in (DagStatus.CANCELLED, DagStatus.SUCCEEDED, DagStatus.FAILED)
+

 class DimensionIndex(Serializable):
     is_slice: bool = BoolField("is_slice", default=None)
@@ -150,6 +154,9 @@ class ODPSTableResultInfo(ResultInfo):
     partition_specs: Optional[List[str]] = ListField(
         "partition_specs", FieldTypes.string, default=None
     )
+    table_meta: Optional["DataFrameTableMeta"] = ReferenceField(
+        "table_meta", default=None
+    )

     def __init__(self, result_type: ResultType = None, **kw):
         result_type = result_type or ResultType.ODPS_TABLE
@@ -160,8 +167,17 @@ class ODPSTableResultInfo(ResultInfo):
         ret["full_table_name"] = self.full_table_name
         if self.partition_specs:
             ret["partition_specs"] = self.partition_specs
+        if self.table_meta:
+            ret["table_meta"] = self.table_meta.to_json()
         return ret

+    @classmethod
+    def _json_to_kwargs(cls, serialized: dict) -> dict:
+        kw = super()._json_to_kwargs(serialized)
+        if "table_meta" in kw:
+            kw["table_meta"] = DataFrameTableMeta.from_json(kw["table_meta"])
+        return kw
+

 class ODPSVolumeResultInfo(ResultInfo):
     _result_type = ResultType.ODPS_VOLUME
@@ -190,9 +206,9 @@ class ErrorInfo(JsonSerializable):
         "error_tracebacks", FieldTypes.list
     )
     raw_error_source: ErrorSource = EnumField(
-        "raw_error_source", ErrorSource, FieldTypes.int8
+        "raw_error_source", ErrorSource, FieldTypes.int8, default=None
     )
-    raw_error_data: Optional[Exception] = AnyField("raw_error_data")
+    raw_error_data: Optional[Exception] = AnyField("raw_error_data", default=None)

     @classmethod
     def from_exception(cls, exc: Exception):
@@ -201,20 +217,29 @@ class ErrorInfo(JsonSerializable):
         return cls(messages, tracebacks, ErrorSource.PYTHON, exc)

     def reraise(self):
-        if
+        if (
+            self.raw_error_source == ErrorSource.PYTHON
+            and self.raw_error_data is not None
+        ):
             raise self.raw_error_data
         raise RemoteException(self.error_messages, self.error_tracebacks, [])

     @classmethod
     def from_json(cls, serialized: dict) -> "ErrorInfo":
         kw = serialized.copy()
-        kw
+        if kw.get("raw_error_source") is not None:
+            kw["raw_error_source"] = ErrorSource(serialized["raw_error_source"])
+        else:
+            kw["raw_error_source"] = None
+
         if kw.get("raw_error_data"):
             bufs = [base64.b64decode(s) for s in kw["raw_error_data"]]
             try:
                 kw["raw_error_data"] = pickle.loads(bufs[0], buffers=bufs[1:])
             except:
+                # both error source and data shall be None to make sure
+                # RemoteException is raised.
+                kw["raw_error_source"] = kw["raw_error_data"] = None
         return cls(**kw)

     def to_json(self) -> dict:
@@ -227,7 +252,12 @@ class ErrorInfo(JsonSerializable):
         if isinstance(self.raw_error_data, (PickleContainer, RemoteException)):
             err_data_bufs = self.raw_error_data.get_buffers()
         elif isinstance(self.raw_error_data, BaseException):
+            try:
+                err_data_bufs = pickle_buffers(self.raw_error_data)
+            except:
+                err_data_bufs = None
+                ret["raw_error_source"] = None
+
         if err_data_bufs:
             ret["raw_error_data"] = [
                 base64.b64encode(s).decode() for s in err_data_bufs
@@ -249,9 +279,17 @@ class DagInfo(JsonSerializable):
     error_info: Optional[ErrorInfo] = ReferenceField("error_info", default=None)
     start_timestamp: Optional[float] = Float64Field("start_timestamp", default=None)
     end_timestamp: Optional[float] = Float64Field("end_timestamp", default=None)
+    subdag_infos: Dict[str, "SubDagInfo"] = DictField(
+        "subdag_infos",
+        key_type=FieldTypes.string,
+        value_type=FieldTypes.reference,
+        default_factory=dict,
+    )

     @classmethod
-    def from_json(cls, serialized: dict) -> "DagInfo":
+    def from_json(cls, serialized: dict) -> Optional["DagInfo"]:
+        if serialized is None:
+            return None
         kw = serialized.copy()
         kw["status"] = DagStatus(kw["status"])
         if kw.get("tileable_to_result_infos"):
@@ -261,6 +299,10 @@ class DagInfo(JsonSerializable):
             }
         if kw.get("error_info"):
             kw["error_info"] = ErrorInfo.from_json(kw["error_info"])
+        if kw.get("subdag_infos"):
+            kw["subdag_infos"] = {
+                k: SubDagInfo.from_json(v) for k, v in kw["subdag_infos"].items()
+            }
         return DagInfo(**kw)

     def to_json(self) -> dict:
@@ -279,6 +321,8 @@ class DagInfo(JsonSerializable):
         }
         if self.error_info:
             ret["error_info"] = self.error_info.to_json()
+        if self.subdag_infos:
+            ret["subdag_infos"] = {k: v.to_json() for k, v in self.subdag_infos.items()}
         return ret


@@ -302,7 +346,9 @@ class SessionInfo(JsonSerializable):
     error_info: Optional[ErrorInfo] = ReferenceField("error_info", default=None)

     @classmethod
-    def from_json(cls, serialized: dict) -> "SessionInfo":
+    def from_json(cls, serialized: dict) -> Optional["SessionInfo"]:
+        if serialized is None:
+            return None
         kw = serialized.copy()
         if kw.get("dag_infos"):
             kw["dag_infos"] = {
@@ -320,7 +366,10 @@ class SessionInfo(JsonSerializable):
             "idle_timestamp": self.idle_timestamp,
         }
         if self.dag_infos:
-            ret["dag_infos"] = {
+            ret["dag_infos"] = {
+                k: v.to_json() if v is not None else None
+                for k, v in self.dag_infos.items()
+            }
         if self.error_info:
             ret["error_info"] = self.error_info.to_json()
         return ret
@@ -342,7 +391,25 @@ class ExecuteDagRequest(Serializable):
     )


-class
+class SubDagSubmitInstanceInfo(JsonSerializable):
+    submit_reason: str = StringField("submit_reason")
+    instance_id: str = StringField("instance_id")
+    subquery_id: Optional[int] = Int32Field("subquery_id", default=None)
+
+    @classmethod
+    def from_json(cls, serialized: dict) -> "SubDagSubmitInstanceInfo":
+        return SubDagSubmitInstanceInfo(**serialized)
+
+    def to_json(self) -> dict:
+        ret = {
+            "submit_reason": self.submit_reason,
+            "instance_id": self.instance_id,
+            "subquery_id": self.subquery_id,
+        }
+        return ret
+
+
+class SubDagInfo(JsonSerializable):
     subdag_id: str = StringField("subdag_id")
     status: DagStatus = EnumField("status", DagStatus, FieldTypes.int8, default=None)
     progress: float = Float64Field("progress", default=None)
@@ -355,9 +422,52 @@ class SubDagInfo(Serializable):
         FieldTypes.reference,
         default_factory=dict,
     )
+    start_timestamp: Optional[float] = Float64Field("start_timestamp", default=None)
+    end_timestamp: Optional[float] = Float64Field("end_timestamp", default=None)
+    submit_instances: List[SubDagSubmitInstanceInfo] = ListField(
+        "submit_instances",
+        FieldTypes.reference,
+        default_factory=list,
+    )
+
+    @classmethod
+    def from_json(cls, serialized: dict) -> "SubDagInfo":
+        kw = serialized.copy()
+        kw["status"] = DagStatus(kw["status"])
+        if kw.get("tileable_to_result_infos"):
+            kw["tileable_to_result_infos"] = {
+                k: ResultInfo.from_json(s)
+                for k, s in kw["tileable_to_result_infos"].items()
+            }
+        if kw.get("error_info"):
+            kw["error_info"] = ErrorInfo.from_json(kw["error_info"])
+        if kw.get("submit_instances"):
+            kw["submit_instances"] = [
+                SubDagSubmitInstanceInfo.from_json(s) for s in kw["submit_instances"]
+            ]
+        return SubDagInfo(**kw)
+
+    def to_json(self) -> dict:
+        ret = {
+            "subdag_id": self.subdag_id,
+            "status": self.status.value,
+            "progress": self.progress,
+            "start_timestamp": self.start_timestamp,
+            "end_timestamp": self.end_timestamp,
+        }
+        if self.error_info:
+            ret["error_info"] = self.error_info.to_json()
+        if self.tileable_to_result_infos:
+            ret["tileable_to_result_infos"] = {
+                k: v.to_json() for k, v in self.tileable_to_result_infos.items()
+            }
+        if self.submit_instances:
+            ret["submit_instances"] = [i.to_json() for i in self.submit_instances]
+        return ret


 class ExecuteSubDagRequest(Serializable):
+    subdag_id: str = StringField("subdag_id")
     dag: TileableGraph = ReferenceField(
         "dag",
         on_serialize=SerializableGraph.from_graph,
@@ -371,7 +481,7 @@ class DecrefRequest(Serializable):
     keys: List[str] = ListField("keys", FieldTypes.string, default=None)


-class DataFrameTableMeta(
+class DataFrameTableMeta(JsonSerializable):
     __slots__ = "_pd_column_names", "_pd_index_level_names"

     table_name: Optional[str] = StringField("table_name", default=None)
@@ -402,7 +512,7 @@ class DataFrameTableMeta(Serializable):
         self._pd_index_level_names = self.pd_index_dtypes.index.tolist()
         return self._pd_index_level_names

-    def __eq__(self, other: "
+    def __eq__(self, other: "DataFrameTableMeta") -> bool:
         if not isinstance(other, type(self)):
             return False
         for k in self._FIELDS:
@@ -413,3 +523,29 @@ class DataFrameTableMeta(Serializable):
         if not is_same:
             return False
         return True
+
+    def to_json(self) -> dict:
+        b64_pk = lambda x: base64.b64encode(pickle.dumps(x))
+        ret = {
+            "table_name": self.table_name,
+            "type": self.type.value,
+            "table_column_names": self.table_column_names,
+            "table_index_column_names": self.table_index_column_names,
+            "pd_column_dtypes": b64_pk(self.pd_column_dtypes),
+            "pd_column_level_names": b64_pk(self.pd_column_level_names),
+            "pd_index_dtypes": b64_pk(self.pd_index_dtypes),
+        }
+        return ret
+
+    @classmethod
+    def from_json(cls, serialized: dict) -> "DataFrameTableMeta":
+        b64_upk = lambda x: pickle.loads(base64.b64decode(x))
+        serialized.update(
+            {
+                "type": OutputType(serialized["type"]),
+                "pd_column_dtypes": b64_upk(serialized["pd_column_dtypes"]),
+                "pd_column_level_names": b64_upk(serialized["pd_column_level_names"]),
+                "pd_index_dtypes": b64_upk(serialized["pd_index_dtypes"]),
+            }
+        )
+        return DataFrameTableMeta(**serialized)
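
The protocol additions above make sub-DAG progress reporting JSON round-trippable (SubDagSubmitInstanceInfo, the extended SubDagInfo, DataFrameTableMeta) and let clients test for terminal states via DagStatus.is_terminated(). The sketch below is a usage illustration, not code from the diff; it assumes field names double as constructor keyword arguments (as the from_json implementations above do) and uses made-up identifier values.

from maxframe.protocol import DagStatus, SubDagInfo, SubDagSubmitInstanceInfo

# hypothetical identifiers for illustration only
inst = SubDagSubmitInstanceInfo(
    submit_reason="initial submit", instance_id="20240101000000000inst", subquery_id=0
)
info = SubDagInfo(
    subdag_id="subdag_1",
    status=DagStatus.SUCCEEDED,
    progress=1.0,
    submit_instances=[inst],
)

payload = info.to_json()                 # plain dict, safe to ship as JSON
restored = SubDagInfo.from_json(payload)

assert restored.status.is_terminated()   # SUCCEEDED / FAILED / CANCELLED are terminal
assert restored.submit_instances[0].instance_id == inst.instance_id
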
maxframe/serialization/core.cpython-311-darwin.so
CHANGED
Binary file
maxframe/serialization/core.pxd
CHANGED
@@ -18,6 +18,9 @@ from libc.stdint cimport int32_t, uint64_t
 cdef class Serializer:
     cdef int _serializer_id

+    cpdef bint is_public_data_exist(self, dict context, object key)
+    cpdef put_public_data(self, dict context, object key, object value)
+    cpdef get_public_data(self, dict context, object key)
     cpdef serial(self, object obj, dict context)
     cpdef deserial(self, list serialized, dict context, list subs)
     cpdef on_deserial_error(
maxframe/serialization/core.pyi
CHANGED
@@ -29,6 +29,9 @@ class PickleContainer:

 class Serializer:
     serializer_id: int
+    def is_public_data_exist(self, context: Dict, key: Any) -> bool: ...
+    def put_public_data(self, context: Dict, key: Any, value: Any) -> None: ...
+    def get_public_data(self, context: Dict, key: Any) -> Any: ...
     def serial(self, obj: Any, context: Dict): ...
     def deserial(self, serialized: List, context: Dict, subs: List[Any]): ...
     def on_deserial_error(
maxframe/serialization/core.pyx
CHANGED
@@ -130,11 +130,30 @@ cdef Serializer get_deserializer(int32_t deserializer_id):

 cdef class Serializer:
     serializer_id = None
+    _public_data_context_key = 0x7fffffff - 1

     def __cinit__(self):
         # make the value can be referenced with C code
         self._serializer_id = self.serializer_id

+    cpdef bint is_public_data_exist(self, dict context, object key):
+        cdef dict public_dict = context.get(self._public_data_context_key, None)
+        if public_dict is None:
+            return False
+        return key in public_dict
+
+    cpdef put_public_data(self, dict context, object key, object value):
+        cdef dict public_dict = context.get(self._public_data_context_key, None)
+        if public_dict is None:
+            public_dict = context[self._public_data_context_key] = {}
+        public_dict[key] = value
+
+    cpdef get_public_data(self, dict context, object key):
+        cdef dict public_dict = context.get(self._public_data_context_key, None)
+        if public_dict is None:
+            return None
+        return public_dict.get(key)
+
     cpdef serial(self, object obj, dict context):
         """
         Returns intermediate serialization result of certain object.
@@ -993,17 +1012,20 @@ def serialize(obj, dict context = None):
     cdef list subs
     cdef bint final
     cdef _IdContextHolder id_context_holder = _IdContextHolder()
+    cdef tuple result

     context = context if context is not None else dict()
     serialized, subs, final = _serial_single(obj, context, id_context_holder)
     if final or not subs:
         # marked as a leaf node, return directly
+        result = [{}, serialized], subs
+    else:
+        serial_stack.append(_SerialStackItem(serialized, subs))
+        result = _serialize_with_stack(
+            serial_stack, None, context, id_context_holder, result_bufs_list
+        )
+    result[0][0]["_PUB"] = context.get(Serializer._public_data_context_key)
+    return result


 async def serialize_with_spawn(
@@ -1036,31 +1058,38 @@ async def serialize_with_spawn(
     cdef list subs
     cdef bint final
     cdef _IdContextHolder id_context_holder = _IdContextHolder()
+    cdef tuple result

     context = context if context is not None else dict()
     serialized, subs, final = _serial_single(obj, context, id_context_holder)
     if final or not subs:
         # marked as a leaf node, return directly
+        result = [{}, serialized], subs
+    else:
+        serial_stack.append(_SerialStackItem(serialized, subs))

+        try:
+            result = _serialize_with_stack(
+                serial_stack,
+                None,
+                context,
+                id_context_holder,
+                result_bufs_list,
+                spawn_threshold,
+            )
+        except _SerializeObjectOverflow as ex:
+            result = await asyncio.get_running_loop().run_in_executor(
+                executor,
+                _serialize_with_stack,
+                serial_stack,
+                ex.cur_serialized,
+                context,
+                id_context_holder,
+                result_bufs_list,
+                0,
+                ex.num_total_serialized,
+            )
+    result[0][0]["_PUB"] = context.get(Serializer._public_data_context_key)
     return result

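
The new put_public_data / get_public_data / is_public_data_exist hooks let serializers share side-band data through the serialization context; serialize() and serialize_with_spawn() then attach that dict to the result header under the "_PUB" key. Below is a pure-Python sketch of that contract, illustrative only and not the Cython implementation above.

# Illustrative mirror of the "public data" channel used in core.pyx.
_PUBLIC_DATA_KEY = 0x7FFFFFFF - 1  # reserved context key, as in core.pyx


def put_public_data(context: dict, key, value) -> None:
    # lazily create the shared dict inside the serialization context
    context.setdefault(_PUBLIC_DATA_KEY, {})[key] = value


def is_public_data_exist(context: dict, key) -> bool:
    return key in context.get(_PUBLIC_DATA_KEY, {})


def get_public_data(context: dict, key):
    return context.get(_PUBLIC_DATA_KEY, {}).get(key)


context = {}
put_public_data(context, "shared-meta", {"rows": 100})
assert is_public_data_exist(context, "shared-meta")
# serialize() exposes this dict in its header under "_PUB":
header_pub = context.get(_PUBLIC_DATA_KEY)
assert get_public_data(context, "shared-meta") == header_pub["shared-meta"]
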
maxframe/serialization/pandas.py
CHANGED
@@ -176,11 +176,16 @@ class PdTimestampSerializer(Serializer):

 class PdTimedeltaSerializer(Serializer):
     def serial(self, obj: pd.Timedelta, context: Dict):
-        return [int(obj.seconds), obj.microseconds, obj.nanoseconds], [], True
+        return [int(obj.seconds), obj.microseconds, obj.nanoseconds, obj.days], [], True

     def deserial(self, serialized: List, context: Dict, subs: List):
+        days = 0 if len(serialized) < 4 else serialized[3]
+        seconds, microseconds, nanoseconds = serialized[:3]
         return pd.Timedelta(
+            days=days,
+            seconds=seconds,
+            microseconds=microseconds,
+            nanoseconds=nanoseconds,
         )

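
The four-component form above matters because pd.Timedelta.seconds only holds the sub-day remainder, so serializing seconds/microseconds/nanoseconds alone silently dropped whole days; the added days slot (defaulting to 0 when absent, for payloads produced by older clients) restores a lossless round trip. A quick standalone check of that pandas behaviour:

import pandas as pd

td = pd.Timedelta(days=2, hours=3)
# .seconds is only the sub-day remainder (3 h = 10800 s); .days carries the rest.
assert td.days == 2 and td.seconds == 10800

# Rebuilding from all four components round-trips exactly; omitting days
# would yield a value two days short.
rebuilt = pd.Timedelta(
    days=td.days,
    seconds=td.seconds,
    microseconds=td.microseconds,
    nanoseconds=td.nanoseconds,
)
assert rebuilt == td
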