dreadnode 1.0.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dreadnode/__init__.py +51 -0
- dreadnode/api/__init__.py +0 -0
- dreadnode/api/client.py +249 -0
- dreadnode/api/models.py +210 -0
- dreadnode/artifact/__init__.py +0 -0
- dreadnode/artifact/merger.py +599 -0
- dreadnode/artifact/storage.py +126 -0
- dreadnode/artifact/tree_builder.py +455 -0
- dreadnode/constants.py +16 -0
- dreadnode/integrations/__init__.py +0 -0
- dreadnode/integrations/transformers.py +183 -0
- dreadnode/main.py +1042 -0
- dreadnode/metric.py +225 -0
- dreadnode/object.py +29 -0
- dreadnode/py.typed +0 -0
- dreadnode/serialization.py +731 -0
- dreadnode/task.py +447 -0
- dreadnode/tracing/__init__.py +0 -0
- dreadnode/tracing/constants.py +35 -0
- dreadnode/tracing/exporters.py +157 -0
- dreadnode/tracing/span.py +811 -0
- dreadnode/types.py +25 -0
- dreadnode/util.py +150 -0
- dreadnode/version.py +3 -0
- dreadnode-1.0.0rc0.dist-info/METADATA +122 -0
- dreadnode-1.0.0rc0.dist-info/RECORD +27 -0
- dreadnode-1.0.0rc0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,731 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import contextlib
|
|
3
|
+
import dataclasses
|
|
4
|
+
import datetime
|
|
5
|
+
import hashlib
|
|
6
|
+
import io
|
|
7
|
+
import json
|
|
8
|
+
import typing as t
|
|
9
|
+
from collections import deque
|
|
10
|
+
from collections.abc import Callable, Iterable, Mapping, Sequence
|
|
11
|
+
from decimal import Decimal
|
|
12
|
+
from enum import Enum
|
|
13
|
+
from functools import lru_cache
|
|
14
|
+
from ipaddress import (
|
|
15
|
+
IPv4Address,
|
|
16
|
+
IPv4Interface,
|
|
17
|
+
IPv4Network,
|
|
18
|
+
IPv6Address,
|
|
19
|
+
IPv6Interface,
|
|
20
|
+
IPv6Network,
|
|
21
|
+
)
|
|
22
|
+
from pathlib import PosixPath
|
|
23
|
+
from re import Pattern
|
|
24
|
+
from uuid import UUID
|
|
25
|
+
|
|
26
|
+
from dreadnode.types import JsonDict, JsonValue
|
|
27
|
+
from dreadnode.util import safe_repr
|
|
28
|
+
|
|
29
|
+
# Types

# Signature shared by the type handlers in this module:
# (object, ids of the objects currently being serialized) -> (serialized value, JSON schema).
HandlerFunc = Callable[[t.Any, set[int]], tuple[JsonValue, JsonDict]]

# Constants

# Schema returned by `_serialize` for plain JSON primitives (str/int/float/bool/None).
EMPTY_SCHEMA: JsonDict = {}
# Schema returned by the optional-dependency handlers when the object is not of the expected type.
UNKNOWN_OBJECT_SCHEMA: JsonDict = {"type": "object", "x-python-datatype": "unknown"}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Helpers
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
import attrs
|
|
43
|
+
|
|
44
|
+
def _is_attrs_instance(_cls: type) -> bool:
|
|
45
|
+
return attrs.has(_cls)
|
|
46
|
+
|
|
47
|
+
except ModuleNotFoundError:
|
|
48
|
+
|
|
49
|
+
def _is_attrs_instance(_cls: type) -> bool:
|
|
50
|
+
return False
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Specific handlers
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _handle_sequence(
    obj: Sequence[t.Any] | set[t.Any] | frozenset[t.Any],
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a list-like container into a JSON array and an "array" schema.

    Args:
        obj: The container to serialize (list, tuple, set, frozenset, deque or
            any other sequence).
        seen: Ids of the objects currently being serialized, forwarded to
            `_serialize` for cycle detection.

    Returns:
        The list of serialized items and the schema describing the array.
    """
    obj_type = type(obj)
    items_list = list(obj)

    # Sets have no defined iteration order, so sort them (when the items are
    # comparable) to get a stable result. Ordered containers keep the order
    # they were given in: reordering a list, tuple or deque would misreport
    # the data. `sorted` builds a new list, so a failed comparison cannot
    # leave the items partially reordered the way an in-place sort can.
    if isinstance(obj, set | frozenset):
        with contextlib.suppress(TypeError):
            items_list = sorted(items_list)

    serialized: list[JsonValue] = []
    item_schemas: list[JsonDict] = []

    non_empty_schemas_found = False

    for item in items_list:
        s_item, schema_item = _serialize(item, seen)
        serialized.append(s_item)
        item_schemas.append(schema_item)
        if schema_item != EMPTY_SCHEMA:
            non_empty_schemas_found = True

    schema: JsonDict = {"type": "array"}
    if obj_type != list:  # noqa: E721
        # Anything other than a plain list records which Python type it came from.
        schema["title"] = obj_type.__name__
        type_name_map = {tuple: "tuple", set: "set", frozenset: "set", deque: "deque"}
        schema["x-python-datatype"] = type_name_map.get(obj_type, obj_type.__name__)

    if not items_list:  # if empty, basic array schema is sufficient
        return serialized, schema

    if not non_empty_schemas_found:
        # Every item is a JSON primitive; describe the item type only when
        # all items share exactly the same primitive type.
        first_item_type = type(items_list[0])
        if first_item_type in {str, int, float, bool, type(None)} and all(
            type(item) is first_item_type for item in items_list
        ):
            type_map = {
                str: "string",
                int: "integer",
                float: "number",
                bool: "boolean",
                type(None): "null",
            }
            schema["items"] = {"type": type_map[first_item_type]}

    else:
        # Check if all non-empty schemas are the same
        first_real_schema = next((s for s in item_schemas if s != EMPTY_SCHEMA), None)
        if first_real_schema and all(s in (first_real_schema, EMPTY_SCHEMA) for s in item_schemas):
            # All items conform to the same schema (or are primitives implicitly covered)
            schema["items"] = first_real_schema
        else:
            # Mixed schemas, use prefixItems (best for tuples, compromise for lists/sets)
            schema["prefixItems"] = item_schemas  # type: ignore [assignment]

    return serialized, schema
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _handle_mapping(
    obj: Mapping[t.Any, t.Any],
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a mapping into a JSON object and an "object" schema.

    Non-string keys are converted with `safe_repr`. Only values whose schema
    is non-empty get an entry under "properties".

    Args:
        obj: The mapping to serialize.
        seen: Ids of the objects currently being serialized (cycle detection).

    Returns:
        The serialized dictionary and the schema describing it.
    """
    payload: JsonDict = {}
    property_schemas: JsonDict = {}

    for raw_key, raw_value in obj.items():
        name = raw_key if isinstance(raw_key, str) else safe_repr(raw_key)
        payload[name], value_schema = _serialize(raw_value, seen)
        if value_schema != EMPTY_SCHEMA:
            property_schemas[name] = value_schema

    schema: JsonDict = {"type": "object"}

    # Mapping types other than dict (and its subclasses) are tagged with their class name.
    if not isinstance(obj, dict):
        schema.update({"title": obj.__class__.__name__, "x-python-datatype": "Mapping"})

    if property_schemas:
        schema["properties"] = property_schemas

    return payload, schema
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _handle_bytes(
    obj: bytes,
    _seen: set[int],
    schema_extras: JsonDict | None = None,
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize bytes as text when they decode to a printable string, else as base64.

    Args:
        obj: The bytes to serialize.
        _seen: Unused; present to match the handler signature.
        schema_extras: Extra schema entries merged over the generated schema
            (they win on key conflicts, including "format").

    Returns:
        The string form of the bytes and the schema describing it.
    """
    class_name = obj.__class__.__name__
    schema = {
        "type": "string",
        "x-python-datatype": "bytes",
    }
    if class_name != "bytes":
        schema["title"] = class_name

    text: str | None
    try:
        text = obj.decode()
    except (UnicodeDecodeError, ValueError):
        text = None

    if text is None or not text.isprintable():
        # Undecodable or non-printable content is carried as base64 instead.
        text = base64.b64encode(obj).decode()
        schema["format"] = "base64"

    return text, schema | (schema_extras or {})
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _handle_bytearray(
    obj: bytearray,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a bytearray through `_handle_bytes` and relabel the schema.

    Args:
        obj: The bytearray to serialize.
        seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of the content and its schema, tagged as "bytearray".
    """
    text, schema = _handle_bytes(bytes(obj), seen)

    overrides = {"x-python-datatype": "bytearray"}
    class_name = obj.__class__.__name__
    if class_name != "bytearray":
        # Subclasses keep their own name as the title.
        overrides["title"] = class_name
    schema.update(overrides)

    return text, schema
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _handle_enum(
    obj: Enum,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize an enum member as its underlying value.

    The schema lists the serialized values of every member of the enum class
    under "enum" and uses a JSON type derived from the member values.

    Args:
        obj: The enum member to serialize.
        seen: Ids of the objects currently being serialized (cycle detection).

    Returns:
        The serialized member value and the schema describing the enum.
    """
    enum_cls = obj.__class__
    serialized, _ = _serialize(obj.value, seen)  # Process the underlying value

    # A specific JSON type is used only when every member value is an
    # instance of the first member's value type; otherwise "object".
    member_values = [member.value for member in enum_cls]
    value_type = "object"
    if member_values:
        head_type = type(member_values[0])
        if all(isinstance(value, head_type) for value in member_values):
            json_types = {str: "string", int: "integer", float: "number", bool: "boolean"}
            value_type = json_types.get(head_type, "object")

    # Each member is serialized against its own copy of `seen`.
    choices = [_serialize(member.value, seen.copy())[0] for member in enum_cls]

    return serialized, {
        "type": value_type,
        "title": enum_cls.__name__,
        "x-python-datatype": "Enum",
        "enum": choices,
    }
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _handle_datetime_iso(
    obj: datetime.date | datetime.datetime | datetime.time,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a date, datetime or time as its ISO 8601 string.

    Subclasses are classified by their base type, so a `datetime.datetime`
    subclass is reported as "date-time" rather than "unknown-datetime".

    Args:
        obj: The value to serialize.
        _seen: Unused; present to match the handler signature.

    Returns:
        The `isoformat()` string and a string schema carrying the format.
    """
    # `datetime.datetime` is a subclass of `datetime.date`, so it must be
    # tested first.
    if isinstance(obj, datetime.datetime):
        iso_format = "date-time"
    elif isinstance(obj, datetime.date):
        iso_format = "date"
    elif isinstance(obj, datetime.time):
        iso_format = "time"
    else:
        iso_format = "unknown-datetime"

    return obj.isoformat(), {
        "type": "string",
        "format": iso_format,
    }
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _handle_timedelta(
    obj: datetime.timedelta,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a timedelta as its total number of seconds.

    Args:
        obj: The timedelta to serialize.
        _seen: Unused; present to match the handler signature.

    Returns:
        The duration in seconds and a "number" schema tagged as a timedelta.
    """
    seconds = obj.total_seconds()
    schema: JsonDict = {
        "type": "number",
        "format": "time-delta-seconds",
        "x-python-datatype": "timedelta",
    }
    return seconds, schema
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def _handle_decimal(
    obj: Decimal,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a Decimal as its string representation.

    Args:
        obj: The Decimal to serialize.
        _seen: Unused; present to match the handler signature.

    Returns:
        `str(obj)` and a string schema with the "decimal" format.
    """
    text = str(obj)
    schema: JsonDict = {"type": "string", "format": "decimal"}
    return text, schema
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _handle_str_based(
    obj: t.Any,
    _seen: set[int],
    schema_extras: JsonDict | None = None,
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize any object as `str(obj)` with a string schema.

    Args:
        obj: The object to stringify.
        _seen: Unused; present to match the handler signature.
        schema_extras: Extra schema entries applied on top of `{"type": "string"}`.

    Returns:
        The string and the resulting schema.
    """
    text = str(obj)
    schema: JsonDict = {"type": "string"}
    if schema_extras:
        schema.update(schema_extras)
    return text, schema
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _handle_uuid(
    obj: UUID,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """Serialize a UUID as a string with the "uuid" format."""
    extras: JsonDict = {"format": "uuid"}
    return _handle_str_based(obj, seen, schema_extras=extras)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _handle_path(
    obj: PosixPath,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """Serialize a PosixPath as a string with the "path" format."""
    extras: JsonDict = {"format": "path", "x-python-datatype": "PosixPath"}
    return _handle_str_based(obj, seen, schema_extras=extras)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _handle_pattern(
    obj: Pattern[t.Any],
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """Serialize a compiled regex as its pattern, with the "regex" format."""
    source = obj.pattern
    extras: JsonDict = {"format": "regex"}
    return _handle_str_based(source, seen, schema_extras=extras)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _handle_exception(obj: Exception, _seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """Serialize an exception as `str(obj)`, titled with its class name."""
    extras: JsonDict = {
        "title": obj.__class__.__name__,
        "x-python-datatype": "Exception",
    }
    return _handle_str_based(obj, _seen, schema_extras=extras)
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def _handle_range(obj: range, _seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """Serialize a range as the list of integers it produces."""
    values = [*obj]
    schema: JsonDict = {
        "type": "array",
        "items": {"type": "integer"},
        "x-python-datatype": "range",
    }
    return values, schema
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _handle_custom_object(
    obj: t.Any,
    keys: Iterable[str],
    seen: set[int],
    datatype_name: str,
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize selected attributes of an object into a JSON object.

    Args:
        obj: The object whose attributes are read.
        keys: Names of the attributes to serialize.
        seen: Ids of the objects currently being serialized (cycle detection).
        datatype_name: Value stored under "x-python-datatype" in the schema.

    Returns:
        The serialized attributes and an "object" schema titled with the
        object's type name.
    """
    fields: JsonDict = {}
    field_schemas: JsonDict = {}

    for attr_name in keys:
        # An AttributeError while reading or serializing an attribute drops
        # just that attribute.
        try:
            field_value, field_schema = _serialize(getattr(obj, attr_name), seen)
        except AttributeError:
            continue

        fields[attr_name] = field_value
        if field_schema != EMPTY_SCHEMA:
            field_schemas[attr_name] = field_schema

    schema: JsonDict = {
        "type": "object",
        "title": type(obj).__name__,
        "x-python-datatype": datatype_name,
    }
    if field_schemas:
        schema["properties"] = field_schemas

    return fields, schema
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _handle_dataclass(obj: t.Any, seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """Serialize a dataclass instance, keeping only fields declared with `repr=True`."""
    visible_fields = [field.name for field in dataclasses.fields(obj) if field.repr]
    return _handle_custom_object(obj, visible_fields, seen, "dataclass")
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _handle_attrs(obj: t.Any, seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """Serialize an attrs instance using the attributes declared on its class."""
    import attrs

    attribute_names = [attribute.name for attribute in attrs.fields(obj.__class__)]
    return _handle_custom_object(obj, attribute_names, seen, "attrs")
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _handle_pydantic_model(obj: t.Any, _seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a pydantic model with `model_dump(mode="json")`.

    The schema is the model's own `model_json_schema()`; if generating it
    raises, a minimal object schema titled with the class name is used.

    Args:
        obj: The model instance to serialize.
        _seen: Unused; present to match the handler signature.

    Returns:
        The dumped model and its schema, or a repr with the unknown-object
        schema when `obj` is not a `pydantic.BaseModel`.
    """
    import pydantic

    if not isinstance(obj, pydantic.BaseModel):
        return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA

    schema: JsonDict
    try:
        schema = obj.model_json_schema()
    except Exception:  # noqa: BLE001
        schema = {
            "type": "object",
            "title": type(obj).__name__,
            "x-python-datatype": "pydantic.BaseModel",
        }

    return obj.model_dump(mode="json"), schema
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _handle_numpy_array(
    obj: t.Any,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a numpy array from its `tobytes()` content via `_handle_bytes`.

    The array's dtype and shape are recorded in the schema.

    Args:
        obj: The array to serialize.
        seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of the array bytes and its schema, or a repr with the
        unknown-object schema when `obj` is not a `numpy.ndarray`.
    """
    import numpy  # noqa: ICN001

    if isinstance(obj, numpy.ndarray):
        payload, schema = _handle_bytes(obj.tobytes(), seen)
        schema.update(
            {
                "x-python-datatype": "numpy.ndarray",
                "x-numpy-dtype": str(obj.dtype),
                "x-numpy-shape": list(obj.shape),
            },
        )
        return payload, schema

    return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def _handle_pandas_dataframe(
    obj: t.Any,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a pandas DataFrame through its `to_dict()` representation.

    Args:
        obj: The DataFrame to serialize.
        seen: Ids of the objects currently being serialized (cycle detection).

    Returns:
        The serialized dictionary and its schema tagged "pandas.DataFrame", or
        a repr with the unknown-object schema when `obj` is not a DataFrame.
    """
    import pandas as pd

    if isinstance(obj, pd.DataFrame):
        as_dict = obj.to_dict()
        payload, schema = _serialize(as_dict, seen)
        schema["x-python-datatype"] = "pandas.DataFrame"
        return payload, schema

    return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def _handle_pandas_series(
    obj: t.Any,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a pandas Series through its `tolist()` representation.

    Args:
        obj: The Series to serialize.
        seen: Ids of the objects currently being serialized (cycle detection).

    Returns:
        The serialized list and its schema tagged "pandas.Series", or a repr
        with the unknown-object schema when `obj` is not a Series.
    """
    import pandas as pd

    if isinstance(obj, pd.Series):
        as_list = obj.tolist()
        payload, schema = _serialize(as_list, seen)
        schema["x-python-datatype"] = "pandas.Series"
        return payload, schema

    return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def _handle_pil_image(
    obj: t.Any,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a PIL image by saving it to memory and passing the bytes to `_handle_bytes`.

    The image's own `format` is used when it is a string; otherwise PNG.

    Args:
        obj: The image to serialize.
        _seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of the encoded image and its schema, or a repr with
        the unknown-object schema when `obj` is not a `PIL.Image.Image`.
    """
    import PIL.Image

    if not isinstance(obj, PIL.Image.Image):
        return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA

    declared_format = getattr(obj, "format", None)
    export_format = declared_format.lower() if isinstance(declared_format, str) else "PNG"

    stream = io.BytesIO()
    obj.save(stream, format=export_format)

    extras = {
        "x-python-datatype": "PIL.Image",
        "format": export_format.lower(),
    }
    return _handle_bytes(stream.getvalue(), _seen, extras)
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def _handle_pydub_audio_segment(
    obj: t.Any,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a pydub AudioSegment from its `raw_data` bytes.

    The schema records the frame rate, channel count and sample width.

    Args:
        obj: The audio segment to serialize.
        _seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of the audio bytes and its schema, or a repr with the
        unknown-object schema when `obj` is not an `AudioSegment`.
    """
    from pydub import AudioSegment  # type: ignore[import-untyped, unused-ignore, import-not-found]

    if not isinstance(obj, AudioSegment):
        return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA

    # The segment does not expose its source format, so "wav" is always
    # reported. TODO: let the user supply the format (e.g. via tags).
    # NOTE(review): the payload is `raw_data` as-is while the schema says
    # "wav" - confirm that consumers expect this.
    audio_bytes = obj.raw_data
    extras = {
        "x-python-datatype": "pydub.AudioSegment",
        "format": "wav",
        "x-audio-sample-rate": obj.frame_rate,
        "x-audio-channels": obj.channels,
        "x-audio-sample-width": obj.sample_width,
    }

    return _handle_bytes(audio_bytes, _seen, extras)
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _handle_moviepy_video_clip(
    obj: t.Any,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a moviepy VideoFileClip by exporting it and passing the bytes to `_handle_bytes`.

    The container format is taken from the extension of the clip's
    `filename` when it has one, otherwise "mp4". Clip metadata is recorded in
    the schema.

    Args:
        obj: The video clip to serialize.
        _seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of the exported video and its schema, or a repr with
        the unknown-object schema when `obj` is not a `VideoFileClip`.
    """
    import tempfile
    from pathlib import Path

    from moviepy import (  # type: ignore[import-untyped, unused-ignore, import-not-found]
        VideoFileClip,
    )

    if not isinstance(obj, VideoFileClip):
        return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA

    # Infer format from filename if available
    export_format = "mp4"
    if getattr(obj, "filename", None):
        ext = Path(obj.filename).suffix.lstrip(".").lower()
        if ext:
            export_format = ext

    # Export into a private temporary directory rather than an open
    # NamedTemporaryFile: an open named temporary file cannot be reopened by
    # name on every platform (notably Windows), which the writer needs to do.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir) / f"clip.{export_format}"
        obj.write_videofile(str(temp_path))
        raw_bytes_data = temp_path.read_bytes()

    schema = {
        "x-python-datatype": "moviepy.VideoFileClip",
        "format": export_format,
        "start": obj.start,
        "end": obj.end,
        "duration": obj.duration,
        "fps": obj.fps,
        "size": obj.size,
        "rotation": obj.rotation,
        "aspect_ratio": obj.aspect_ratio,
        "w": obj.w,
        "h": obj.h,
        "n_frames": obj.n_frames,
    }

    return _handle_bytes(raw_bytes_data, _seen, schema)
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
def _handle_dataset(obj: t.Any, _seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """
    Serialize a `datasets.Dataset` by writing it as parquet into memory.

    Args:
        obj: The dataset to serialize.
        _seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of the parquet bytes and its schema, or a repr with
        the unknown-object schema when `obj` is not a `datasets.Dataset`.
    """
    import datasets  # type: ignore[import-untyped]

    if isinstance(obj, datasets.Dataset):
        stream = io.BytesIO()
        obj.to_parquet(stream)
        extras = {
            "x-python-datatype": "datasets.Dataset",
            "format": "parquet",
        }
        return _handle_bytes(stream.getvalue(), _seen, extras)

    return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
@lru_cache(maxsize=1)
def _get_handlers() -> dict[type, HandlerFunc]:
    """
    Build the mapping from Python types to their serialization handlers.

    Built-in and standard-library types are always registered. Handlers for
    optional packages (pydantic, numpy, pandas, PIL, datasets, pydub,
    moviepy) are registered only when the package imports cleanly; any
    exception raised while registering a package ends that package's section,
    keeping whatever was registered before the failure. The result is cached,
    so the table is built once.

    Returns:
        The type-to-handler table.
    """

    def _stringify_with(extras: JsonDict) -> HandlerFunc:
        # Handler that stringifies the object and attaches a fresh copy of `extras`.
        return lambda o, s: _handle_str_based(o, s, dict(extras))

    registry: dict[type, HandlerFunc] = {
        list: _handle_sequence,
        tuple: _handle_sequence,
        set: _handle_sequence,
        frozenset: _handle_sequence,
        deque: _handle_sequence,
        dict: _handle_mapping,
        bytes: _handle_bytes,
        bytearray: _handle_bytearray,
        Enum: _handle_enum,
        Decimal: _handle_decimal,
        datetime.datetime: _handle_datetime_iso,
        datetime.date: _handle_datetime_iso,
        datetime.time: _handle_datetime_iso,
        datetime.timedelta: _handle_timedelta,
        UUID: _handle_uuid,
        PosixPath: _handle_path,
        Pattern: _handle_pattern,
        range: _handle_range,
        Exception: _handle_exception,
        IPv4Address: _stringify_with({"format": "ipv4"}),
        IPv6Address: _stringify_with({"format": "ipv6"}),
        IPv4Interface: _stringify_with({"x-python-datatype": "IPv4Interface"}),
        IPv6Interface: _stringify_with({"x-python-datatype": "IPv6Interface"}),
        IPv4Network: _stringify_with({"x-python-datatype": "IPv4Network"}),
        IPv6Network: _stringify_with({"x-python-datatype": "IPv6Network"}),
    }

    # Pydantic

    with contextlib.suppress(Exception):
        import pydantic

        registry[pydantic.NameEmail] = _stringify_with(
            {"format": "email", "x-python-datatype": "pydantic.NameEmail"},
        )
        # Secrets are never emitted: a fixed mask is serialized instead.
        registry[pydantic.SecretStr] = lambda _o, s: _handle_str_based(
            "***",
            s,
            {"x-python-datatype": "pydantic.SecretStr"},
        )
        registry[pydantic.SecretBytes] = lambda _o, s: _handle_bytes(
            b"***",
            s,
            {"x-python-datatype": "pydantic.SecretBytes"},
        )
        registry[pydantic.AnyUrl] = _stringify_with(
            {"format": "url", "x-python-datatype": "pydantic.AnyUrl"},
        )
        registry[pydantic.BaseModel] = _handle_pydantic_model

    # NumPy: scalars are converted to the matching Python primitive first.

    with contextlib.suppress(Exception):
        import numpy as np

        registry[np.ndarray] = _handle_numpy_array
        registry[np.floating] = lambda o, s: _serialize(float(o), s)
        registry[np.integer] = lambda o, s: _serialize(int(o), s)
        registry[np.bool_] = lambda o, s: _serialize(bool(o), s)
        registry[np.str_] = _stringify_with({"x-python-datatype": "numpy.str_"})
        registry[np.bytes_] = lambda o, s: _handle_bytes(
            o,
            s,
            {"x-python-datatype": "numpy.bytes_"},
        )

    # Pandas

    with contextlib.suppress(Exception):
        import pandas as pd

        registry[pd.DataFrame] = _handle_pandas_dataframe
        registry[pd.Series] = _handle_pandas_series

    # Pillow

    with contextlib.suppress(Exception):
        import PIL.Image

        registry[PIL.Image.Image] = _handle_pil_image

    # Hugging Face datasets

    with contextlib.suppress(Exception):
        import datasets

        registry[datasets.Dataset] = _handle_dataset

    # pydub

    with contextlib.suppress(Exception):
        from pydub import AudioSegment

        registry[AudioSegment] = _handle_pydub_audio_segment

    # moviepy

    with contextlib.suppress(Exception):
        from moviepy import VideoFileClip

        registry[VideoFileClip] = _handle_moviepy_video_clip

    return registry
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
# Core functions
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def _serialize(obj: t.Any, seen: set[int] | None = None) -> tuple[JsonValue, JsonDict]:  # noqa: PLR0911
    """
    Convert an object into a JSON-compatible value and a schema describing it.

    Resolution order: JSON primitives, registered type handlers (matched
    along the object's MRO), built-in collections and mappings, dataclasses
    and attrs classes, generic sequences, objects offering `to_dict()` or
    `asdict()`, and finally `safe_repr`.

    Args:
        obj: The object to serialize.
        seen: Ids of the objects on the current serialization path, used to
            detect circular references. Omit at the top level.

    Returns:
        The serialized value and its schema. Primitives get `EMPTY_SCHEMA`.
    """
    # Primitives early

    if isinstance(obj, str | int | float | bool) or obj is None:
        return obj, EMPTY_SCHEMA

    # Cycle tracking

    seen = seen or set()

    obj_id = id(obj)
    if obj_id in seen:
        return "<circular reference>", {}

    # Work on a copy so that siblings do not see each other's ids: only an
    # object that contains itself counts as circular.
    seen = seen.copy()
    seen.add(obj_id)

    obj_type = type(obj)
    handlers = _get_handlers()

    # A handler that raises is ignored and the generic strategies below get a chance.
    with contextlib.suppress(Exception):
        # MRO-based lookup first

        for base in obj_type.__mro__:
            if base in handlers:
                handler = handlers[base]
                return handler(obj, seen)

    # Common collections

    if isinstance(obj, list | tuple | set | frozenset | deque):
        return _handle_sequence(obj, seen)

    if isinstance(obj, Mapping):
        return _handle_mapping(obj, seen)

    # Common struct types

    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
        return _handle_dataclass(obj, seen)

    if _is_attrs_instance(obj_type):
        return _handle_attrs(obj, seen)

    # Generic sequences (if not list/tuple/set/deque and no other handler matched)

    if isinstance(obj, Sequence):
        return _handle_sequence(obj, seen)

    # Common fallbacks: objects that can describe themselves as a dict.
    # The converter call is guarded so that a non-callable attribute, an
    # unbound method (when `obj` is itself a class) or a converter that
    # raises falls through to the repr fallback instead of aborting.

    for converter_name in ("to_dict", "asdict"):
        converter = getattr(obj, converter_name, None)
        if not callable(converter):
            continue
        try:
            converted = converter()
        except Exception:  # noqa: BLE001, S112
            continue
        return _serialize(converted, seen)

    # Fallback to repr

    return safe_repr(obj), {
        "type": "string",
        "title": obj_type.__name__,
        "x-python-datatype": "unknown",
    }
|
|
680
|
+
|
|
681
|
+
|
|
682
|
+
@dataclasses.dataclass
class Serialized:
    """
    Result of serializing an object: the JSON-compatible data, its encoded
    bytes, the schema describing it, and short hashes of the data and schema.
    """

    # JSON-compatible form of the object (None when the object serialized to None).
    data: JsonValue | None
    # Encoded form of `data` that was hashed; None when `data` is None.
    data_bytes: bytes | None
    # Length of `data_bytes`, or 0 when `data` is None.
    data_len: int
    # Truncated hex digest of `data_bytes`, or the all-zero placeholder when `data` is None.
    data_hash: str
    # JSON Schema describing `data` (empty for plain primitives).
    schema: JsonDict
    # Truncated hex digest of the schema, or the all-zero placeholder for an empty schema.
    schema_hash: str
|
|
690
|
+
|
|
691
|
+
|
|
692
|
+
# Placeholder hash used when there is nothing to hash (no data or an empty schema).
EMPTY_HASH = "0" * 16


def serialize(obj: t.Any) -> Serialized:
    """
    Serialize a Python object to a JSON-compatible structure together with a
    matching JSON Schema, and fingerprint both.

    Scalars (str, int, bool, float) are encoded from their `str()` form;
    everything else is encoded as compact JSON. Hashes are the first 16 hex
    characters of a SHA-1 digest.

    Args:
        obj: The Python object to process.

    Returns:
        A `Serialized` holding the data, its bytes, length and hash, and the
        schema with its hash.
    """
    data, schema = _serialize(obj)
    has_data = data is not None

    if isinstance(data, str | int | bool | float):
        payload = str(data).encode()
    else:
        payload = json.dumps(data, separators=(",", ":")).encode()

    if has_data:
        data_hash = hashlib.sha1(payload).hexdigest()[:16]  # noqa: S324 (using sha1 for speed)
    else:
        data_hash = EMPTY_HASH

    if schema and schema != EMPTY_SCHEMA:
        schema_text = json.dumps(schema, separators=(",", ":"))
        schema_hash = hashlib.sha1(schema_text.encode()).hexdigest()[:16]  # noqa: S324
    else:
        schema_hash = EMPTY_HASH

    return Serialized(
        data=data,
        data_bytes=payload if has_data else None,
        data_len=len(payload) if has_data else 0,
        data_hash=data_hash,
        schema=schema,
        schema_hash=schema_hash,
    )
|