dreadnode 1.0.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,731 @@
1
+ import base64
2
+ import contextlib
3
+ import dataclasses
4
+ import datetime
5
+ import hashlib
6
+ import io
7
+ import json
8
+ import typing as t
9
+ from collections import deque
10
+ from collections.abc import Callable, Iterable, Mapping, Sequence
11
+ from decimal import Decimal
12
+ from enum import Enum
13
+ from functools import lru_cache
14
+ from ipaddress import (
15
+ IPv4Address,
16
+ IPv4Interface,
17
+ IPv4Network,
18
+ IPv6Address,
19
+ IPv6Interface,
20
+ IPv6Network,
21
+ )
22
+ from pathlib import PosixPath
23
+ from re import Pattern
24
+ from uuid import UUID
25
+
26
+ from dreadnode.types import JsonDict, JsonValue
27
+ from dreadnode.util import safe_repr
28
+
29
# Types

# Signature shared by every handler in this module: it receives the object and
# the set of `id()` values already visited on the current path, and returns the
# JSON-compatible value together with its JSON Schema fragment.
HandlerFunc = Callable[[t.Any, set[int]], tuple[JsonValue, JsonDict]]

# Constants

# Schema returned for plain primitives (str/int/float/bool/None).
EMPTY_SCHEMA: JsonDict = {}
# Schema returned by the optional-library handlers when they are handed an
# object that is not of the type they expect.
UNKNOWN_OBJECT_SCHEMA: JsonDict = {"type": "object", "x-python-datatype": "unknown"}
37
+
38
+
39
+ # Helpers
40
+
41
# `attrs` is optional: when it cannot be imported, nothing is ever reported as
# an attrs class.
try:
    import attrs
except ModuleNotFoundError:

    def _is_attrs_instance(_cls: type) -> bool:
        """Return False because `attrs` is not installed."""
        return False

else:

    def _is_attrs_instance(_cls: type) -> bool:
        """Return True when `_cls` is an attrs-decorated class."""
        return attrs.has(_cls)
51
+
52
+
53
+ # Specific handlers
54
+
55
+
56
def _handle_sequence(
    obj: Sequence[t.Any] | set[t.Any] | frozenset[t.Any],
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a list-like or set-like collection into a JSON array and schema.

    Ordered collections (lists, tuples, deques, other sequences) keep their
    element order. Sets and frozensets have no inherent order, so their items
    are sorted when they are mutually comparable to make the output stable.

    Args:
        obj: The collection to serialize.
        seen: Ids of the objects already visited on the current path.

    Returns:
        The list of serialized items and an "array" schema. Non-list types get
        a "title" and "x-python-datatype"; uniform items are described with
        "items", mixed items with positional "prefixItems".
    """
    obj_type = type(obj)
    items_list = list(obj)

    # Only unordered collections are sorted. Sorting an ordered sequence would
    # silently discard information. `sorted()` is used instead of `.sort()` so
    # a failed comparison cannot leave the list partially reordered.
    if isinstance(obj, set | frozenset):
        with contextlib.suppress(TypeError):
            items_list = sorted(items_list)

    serialized: list[JsonValue] = []
    item_schemas: list[JsonDict] = []

    for item in items_list:
        s_item, schema_item = _serialize(item, seen)
        serialized.append(s_item)
        item_schemas.append(schema_item)

    non_empty_schemas_found = any(s != EMPTY_SCHEMA for s in item_schemas)

    schema: JsonDict = {"type": "array"}
    if obj_type is not list:
        schema["title"] = obj_type.__name__
        type_name_map = {tuple: "tuple", set: "set", frozenset: "set", deque: "deque"}
        schema["x-python-datatype"] = type_name_map.get(obj_type, obj_type.__name__)

    if not items_list:  # if empty, basic array schema is sufficient
        return serialized, schema

    if not non_empty_schemas_found:
        # Every item produced an empty schema; describe the items only when
        # they all share one exact primitive type.
        first_item_type = type(items_list[0])
        if first_item_type in {str, int, float, bool, type(None)} and all(
            type(item) is first_item_type for item in items_list
        ):
            type_map = {
                str: "string",
                int: "integer",
                float: "number",
                bool: "boolean",
                type(None): "null",
            }
            schema["items"] = {"type": type_map[first_item_type]}

    else:
        # Check if all non-empty schemas are the same
        first_real_schema = next((s for s in item_schemas if s != EMPTY_SCHEMA), None)
        if first_real_schema and all(s in (first_real_schema, EMPTY_SCHEMA) for s in item_schemas):
            # All items conform to the same schema (or are primitives implicitly covered)
            schema["items"] = first_real_schema
        else:
            # Mixed schemas, use prefixItems (best for tuples, compromise for lists/sets)
            schema["prefixItems"] = item_schemas  # type: ignore [assignment]

    return serialized, schema
112
+
113
+
114
def _handle_mapping(
    obj: Mapping[t.Any, t.Any],
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a mapping into a JSON object and an "object" schema.

    Non-string keys are converted with `safe_repr`. Only values that produce a
    non-empty schema are listed under "properties".

    Args:
        obj: The mapping to serialize.
        seen: Ids of the objects already visited on the current path.

    Returns:
        The serialized dictionary and its schema.
    """
    data: JsonDict = {}
    properties: JsonDict = {}

    for raw_key, raw_value in obj.items():
        name = raw_key if isinstance(raw_key, str) else safe_repr(raw_key)
        value_data, value_schema = _serialize(raw_value, seen)
        data[name] = value_data
        if value_schema != EMPTY_SCHEMA:
            properties[name] = value_schema

    schema: JsonDict = {"type": "object"}

    # Anything that is not a plain dict (or dict subclass) is labelled.
    if not isinstance(obj, dict):
        schema.update({"title": obj.__class__.__name__, "x-python-datatype": "Mapping"})

    if properties:
        schema["properties"] = properties

    return data, schema
137
+
138
+
139
+ def _handle_bytes(
140
+ obj: bytes,
141
+ _seen: set[int],
142
+ schema_extras: JsonDict | None = None,
143
+ ) -> tuple[JsonValue, JsonDict]:
144
+ schema = {
145
+ "type": "string",
146
+ "x-python-datatype": "bytes",
147
+ }
148
+
149
+ if obj.__class__.__name__ != "bytes":
150
+ schema["title"] = obj.__class__.__name__
151
+
152
+ try:
153
+ serialized = obj.decode()
154
+ if not serialized.isprintable():
155
+ raise ValueError("Non-printable characters found") # noqa: TRY301
156
+ except (UnicodeDecodeError, ValueError):
157
+ serialized = base64.b64encode(obj).decode()
158
+ schema["format"] = "base64"
159
+
160
+ return serialized, {**schema, **(schema_extras or {})}
161
+
162
+
163
def _handle_bytearray(
    obj: bytearray,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a bytearray through `_handle_bytes` and relabels the schema.

    Args:
        obj: The bytearray to serialize.
        seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of the data and a schema whose "x-python-datatype" is
        "bytearray"; subclasses additionally get a "title".
    """
    payload, schema = _handle_bytes(bytes(obj), seen)
    schema["x-python-datatype"] = "bytearray"

    class_name = obj.__class__.__name__
    if class_name != "bytearray":
        schema["title"] = class_name

    return payload, schema
172
+
173
+
174
def _handle_enum(
    obj: Enum,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes an enum member as its value and lists all values in the schema.

    Args:
        obj: The enum member to serialize.
        seen: Ids of the objects already visited on the current path.

    Returns:
        The serialized member value and a schema carrying the enum class name
        and the serialized values of every member under "enum". The schema
        "type" is derived from the members' values when they share a type,
        otherwise it is "object".
    """
    enum_cls = obj.__class__
    serialized, _ = _serialize(obj.value, seen)  # Process the underlying value

    member_values = [member.value for member in enum_cls]

    # Determine schema type based on enum values
    value_type = "object"
    if member_values:
        reference_type = type(member_values[0])
        if all(isinstance(value, reference_type) for value in member_values):
            json_types = {str: "string", int: "integer", float: "number", bool: "boolean"}
            value_type = json_types.get(reference_type, "object")

    # Serialized representations of all possible enum values
    serialized_enum_values = [_serialize(value, seen.copy())[0] for value in member_values]

    schema: JsonDict = {
        "type": value_type,
        "title": enum_cls.__name__,
        "x-python-datatype": "Enum",
        "enum": serialized_enum_values,
    }

    return serialized, schema
203
+
204
+
205
+ def _handle_datetime_iso(
206
+ obj: datetime.date | datetime.datetime | datetime.time,
207
+ _seen: set[int],
208
+ ) -> tuple[JsonValue, JsonDict]:
209
+ format_map = {
210
+ datetime.datetime: "date-time",
211
+ datetime.date: "date",
212
+ datetime.time: "time",
213
+ }
214
+ return obj.isoformat(), {
215
+ "type": "string",
216
+ "format": format_map.get(type(obj), "unknown-datetime"),
217
+ }
218
+
219
+
220
def _handle_timedelta(
    obj: datetime.timedelta,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a timedelta as its total number of seconds.

    Args:
        obj: The timedelta to serialize.
        _seen: Unused; present to match the handler signature.

    Returns:
        The duration in seconds (float) and a "number" schema.
    """
    seconds = obj.total_seconds()
    schema = {
        "type": "number",
        "format": "time-delta-seconds",
        "x-python-datatype": "timedelta",
    }
    return seconds, schema
229
+
230
+
231
def _handle_decimal(
    obj: Decimal,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a Decimal as its string form.

    Args:
        obj: The Decimal to serialize.
        _seen: Unused; present to match the handler signature.

    Returns:
        `str(obj)` and a string schema with format "decimal".
    """
    text = str(obj)
    schema = {"type": "string", "format": "decimal"}
    return text, schema
236
+
237
+
238
+ def _handle_str_based(
239
+ obj: t.Any,
240
+ _seen: set[int],
241
+ schema_extras: JsonDict | None = None,
242
+ ) -> tuple[JsonValue, JsonDict]:
243
+ return str(obj), {"type": "string", **(schema_extras or {})}
244
+
245
+
246
def _handle_uuid(
    obj: UUID,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a UUID as a string tagged with the "uuid" format.

    Args:
        obj: The UUID to serialize.
        seen: Forwarded to `_handle_str_based`.

    Returns:
        The string form of the UUID and its schema.
    """
    extras = {"format": "uuid"}
    return _handle_str_based(obj, seen, extras)
251
+
252
+
253
def _handle_path(
    obj: PosixPath,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a path as a string tagged with the "path" format.

    Args:
        obj: The path to serialize.
        seen: Forwarded to `_handle_str_based`.

    Returns:
        The string form of the path and its schema, labelled "PosixPath".
    """
    extras = {"format": "path", "x-python-datatype": "PosixPath"}
    return _handle_str_based(obj, seen, extras)
262
+
263
+
264
def _handle_pattern(
    obj: Pattern[t.Any],
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a compiled regular expression as its source pattern.

    Args:
        obj: The compiled pattern.
        seen: Forwarded to `_handle_str_based`.

    Returns:
        `str(obj.pattern)` and a string schema with format "regex".
    """
    source = obj.pattern
    return _handle_str_based(source, seen, {"format": "regex"})
269
+
270
+
271
def _handle_exception(obj: Exception, _seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """
    Serializes an exception as its string form.

    Args:
        obj: The exception to serialize.
        _seen: Forwarded to `_handle_str_based`.

    Returns:
        `str(obj)` and a string schema titled with the exception class name.
    """
    extras = {"title": obj.__class__.__name__, "x-python-datatype": "Exception"}
    return _handle_str_based(obj, _seen, extras)
277
+
278
+
279
def _handle_range(obj: range, _seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a range as the full list of integers it produces.

    Args:
        obj: The range to expand.
        _seen: Unused; present to match the handler signature.

    Returns:
        The expanded list and an integer-array schema labelled "range".
    """
    values = [*obj]
    schema = {
        "type": "array",
        "items": {"type": "integer"},
        "x-python-datatype": "range",
    }
    return values, schema
281
+
282
+
283
def _handle_custom_object(
    obj: t.Any,
    keys: Iterable[str],
    seen: set[int],
    datatype_name: str,
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes selected attributes of an object into a JSON object and schema.

    Args:
        obj: The object whose attributes are read.
        keys: Names of the attributes to serialize.
        seen: Ids of the objects already visited on the current path.
        datatype_name: Value stored under "x-python-datatype" in the schema.

    Returns:
        A dictionary of serialized attribute values and an "object" schema
        titled with the object's class name.
    """
    fields_data: JsonDict = {}
    fields_schema: JsonDict = {}

    for name in keys:
        # An AttributeError raised while reading or serializing an attribute
        # causes that attribute to be skipped.
        try:
            attr_data, attr_schema = _serialize(getattr(obj, name), seen)
            fields_data[name] = attr_data
            if attr_schema != EMPTY_SCHEMA:
                fields_schema[name] = attr_schema
        except AttributeError:
            continue

    schema: JsonDict = {
        "type": "object",
        "title": type(obj).__name__,
        "x-python-datatype": datatype_name,
    }

    if fields_schema:
        schema["properties"] = fields_schema

    return fields_data, schema
311
+
312
+
313
def _handle_dataclass(obj: t.Any, seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a dataclass instance using the fields that take part in its repr.

    Args:
        obj: The dataclass instance.
        seen: Ids of the objects already visited on the current path.

    Returns:
        The serialized fields and an "object" schema labelled "dataclass".
    """
    repr_field_names = [field.name for field in dataclasses.fields(obj) if field.repr]
    return _handle_custom_object(obj, repr_field_names, seen, "dataclass")
316
+
317
+
318
def _handle_attrs(obj: t.Any, seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """
    Serializes an attrs instance using every field declared on its class.

    Args:
        obj: The attrs instance.
        seen: Ids of the objects already visited on the current path.

    Returns:
        The serialized fields and an "object" schema labelled "attrs".
    """
    import attrs

    field_names = [attribute.name for attribute in attrs.fields(obj.__class__)]
    return _handle_custom_object(obj, field_names, seen, "attrs")
323
+
324
+
325
def _handle_pydantic_model(obj: t.Any, _seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a pydantic model with `model_dump(mode="json")`.

    Args:
        obj: The pydantic model instance.
        _seen: Unused; present to match the handler signature.

    Returns:
        The dumped model and its schema. The schema is whatever
        `model_json_schema()` returns; only if that call fails is a minimal
        "object" schema (title plus "x-python-datatype") used instead. Objects
        that are not pydantic models are returned as `safe_repr(obj)` with
        the unknown-object schema.
    """
    import pydantic

    if not isinstance(obj, pydantic.BaseModel):
        return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA

    schema: JsonDict
    try:
        schema = obj.model_json_schema()
    except Exception:  # noqa: BLE001 (best effort: fall back to a minimal schema)
        schema = {
            "type": "object",
            "title": type(obj).__name__,
            "x-python-datatype": "pydantic.BaseModel",
        }

    return obj.model_dump(mode="json"), schema
341
+
342
+
343
def _handle_numpy_array(
    obj: t.Any,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a numpy array from its raw bytes, recording dtype and shape.

    Args:
        obj: The numpy array.
        seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of `obj.tobytes()` (as produced by `_handle_bytes`)
        and a schema extended with the array's dtype and shape. Objects that
        are not numpy arrays are returned as `safe_repr(obj)` with the
        unknown-object schema.
    """
    import numpy  # noqa: ICN001

    if not isinstance(obj, numpy.ndarray):
        return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA

    payload, schema = _handle_bytes(obj.tobytes(), seen)

    schema.update(
        {
            "x-python-datatype": "numpy.ndarray",
            "x-numpy-dtype": str(obj.dtype),
            "x-numpy-shape": list(obj.shape),
        },
    )

    return payload, schema
359
+
360
+
361
def _handle_pandas_dataframe(
    obj: t.Any,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a pandas DataFrame through its `to_dict()` representation.

    Args:
        obj: The DataFrame.
        seen: Ids of the objects already visited on the current path.

    Returns:
        The serialized dictionary and its schema, labelled
        "pandas.DataFrame". Objects that are not DataFrames are returned as
        `safe_repr(obj)` with the unknown-object schema.
    """
    import pandas as pd

    if isinstance(obj, pd.DataFrame):
        frame_data, frame_schema = _serialize(obj.to_dict(), seen)
        frame_schema["x-python-datatype"] = "pandas.DataFrame"
        return frame_data, frame_schema

    return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA
374
+
375
+
376
def _handle_pandas_series(
    obj: t.Any,
    seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a pandas Series through its `tolist()` representation.

    Args:
        obj: The Series.
        seen: Ids of the objects already visited on the current path.

    Returns:
        The serialized list and its schema, labelled "pandas.Series".
        Objects that are not Series are returned as `safe_repr(obj)` with
        the unknown-object schema.
    """
    import pandas as pd

    if isinstance(obj, pd.Series):
        series_data, series_schema = _serialize(obj.tolist(), seen)
        series_schema["x-python-datatype"] = "pandas.Series"
        return series_data, series_schema

    return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA
389
+
390
+
391
def _handle_pil_image(
    obj: t.Any,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a PIL image by saving it to an in-memory buffer.

    The image's own `format` is used for the export when it is a string;
    otherwise the image is saved as PNG.

    Args:
        obj: The PIL image.
        _seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of the encoded image bytes and a schema whose
        "format" is the lower-cased export format. Objects that are not PIL
        images are returned as `safe_repr(obj)` with the unknown-object
        schema.
    """
    import PIL.Image

    if not isinstance(obj, PIL.Image.Image):
        return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA

    declared_format = getattr(obj, "format", None)
    export_format = declared_format.lower() if isinstance(declared_format, str) else "PNG"

    buffer = io.BytesIO()
    obj.save(buffer, format=export_format)

    extras = {
        "x-python-datatype": "PIL.Image",
        "format": export_format.lower(),
    }
    return _handle_bytes(buffer.getvalue(), _seen, extras)
416
+
417
+
418
def _handle_pydub_audio_segment(
    obj: t.Any,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a pydub AudioSegment from its raw audio bytes.

    Args:
        obj: The AudioSegment.
        _seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of `obj.raw_data` and a schema recording the frame
        rate, channel count and sample width. Objects that are not
        AudioSegments are returned as `safe_repr(obj)` with the
        unknown-object schema.
    """
    from pydub import AudioSegment  # type: ignore[import-untyped, unused-ignore, import-not-found]

    if not isinstance(obj, AudioSegment):
        return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA

    # An AudioSegment does not expose the format it was loaded from, so "wav"
    # is always reported. TODO: let the user supply the format, maybe via tags.
    # NOTE(review): the payload below is `raw_data`, not the output of an
    # export to WAV - confirm consumers expect raw samples under "wav".
    export_format = "wav"

    extras = {
        "x-python-datatype": "pydub.AudioSegment",
        "format": export_format,
        "x-audio-sample-rate": obj.frame_rate,
        "x-audio-channels": obj.channels,
        "x-audio-sample-width": obj.sample_width,
    }

    return _handle_bytes(obj.raw_data, _seen, extras)
442
+
443
+
444
def _handle_moviepy_video_clip(
    obj: t.Any,
    _seen: set[int],
) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a moviepy VideoFileClip by re-encoding it to a temporary file.

    The container format is taken from the extension of the clip's
    `filename` when there is one, and defaults to "mp4".

    Args:
        obj: The video clip.
        _seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of the written video file's bytes and a schema
        carrying the format and the clip's timing/geometry attributes.
        Objects that are not VideoFileClips are returned as `safe_repr(obj)`
        with the unknown-object schema.
    """
    import tempfile
    from pathlib import Path

    from moviepy import (  # type: ignore[import-untyped, unused-ignore, import-not-found]
        VideoFileClip,
    )

    if not isinstance(obj, VideoFileClip):
        return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA

    # Infer format from filename if available
    export_format = "mp4"
    source_name = getattr(obj, "filename", None)
    if source_name:
        export_format = Path(source_name).suffix.lstrip(".").lower() or export_format

    # Export video to temp file, then read the encoded bytes back
    with tempfile.NamedTemporaryFile(suffix=f".{export_format}") as scratch:
        scratch_path = scratch.name
        obj.write_videofile(scratch_path)
        video_bytes = Path(scratch_path).read_bytes()

    extras = {
        "x-python-datatype": "moviepy.VideoFileClip",
        "format": export_format,
        "start": obj.start,
        "end": obj.end,
        "duration": obj.duration,
        "fps": obj.fps,
        "size": obj.size,
        "rotation": obj.rotation,
        "aspect_ratio": obj.aspect_ratio,
        "w": obj.w,
        "h": obj.h,
        "n_frames": obj.n_frames,
    }

    return _handle_bytes(video_bytes, _seen, extras)
488
+
489
+
490
def _handle_dataset(obj: t.Any, _seen: set[int]) -> tuple[JsonValue, JsonDict]:
    """
    Serializes a `datasets.Dataset` by writing it to parquet in memory.

    Args:
        obj: The dataset.
        _seen: Forwarded to `_handle_bytes`.

    Returns:
        The string form of the parquet bytes and a schema with format
        "parquet". Objects that are not Datasets are returned as
        `safe_repr(obj)` with the unknown-object schema.
    """
    import datasets  # type: ignore[import-untyped]

    if not isinstance(obj, datasets.Dataset):
        return safe_repr(obj), UNKNOWN_OBJECT_SCHEMA

    parquet_buffer = io.BytesIO()
    obj.to_parquet(parquet_buffer)
    parquet_bytes = parquet_buffer.getvalue()

    extras = {
        "x-python-datatype": "datasets.Dataset",
        "format": "parquet",
    }
    return _handle_bytes(parquet_bytes, _seen, extras)
507
+
508
+
509
@lru_cache(maxsize=1)
def _get_handlers() -> dict[type, HandlerFunc]:
    """
    Builds the mapping from Python type to serialization handler.

    The result is cached, so the optional imports below are attempted only on
    the first call. Handlers for optional third-party libraries are added only
    when the library can be imported; any exception raised while registering a
    group is suppressed, keeping whatever was registered before the failure.

    Returns:
        A dictionary keyed by type, looked up against an object's MRO.
    """

    def stringify_with(extras: JsonDict) -> HandlerFunc:
        # Handler that stringifies the object and attaches fixed schema entries.
        return lambda value, seen: _handle_str_based(value, seen, extras)

    registry: dict[type, HandlerFunc] = {
        # Collections
        list: _handle_sequence,
        tuple: _handle_sequence,
        set: _handle_sequence,
        frozenset: _handle_sequence,
        deque: _handle_sequence,
        dict: _handle_mapping,
        # Binary data
        bytes: _handle_bytes,
        bytearray: _handle_bytearray,
        # Scalars and standard-library value types
        Enum: _handle_enum,
        Decimal: _handle_decimal,
        datetime.datetime: _handle_datetime_iso,
        datetime.date: _handle_datetime_iso,
        datetime.time: _handle_datetime_iso,
        datetime.timedelta: _handle_timedelta,
        UUID: _handle_uuid,
        PosixPath: _handle_path,
        Pattern: _handle_pattern,
        range: _handle_range,
        Exception: _handle_exception,
        # IP addresses, interfaces and networks
        IPv4Address: stringify_with({"format": "ipv4"}),
        IPv6Address: stringify_with({"format": "ipv6"}),
        IPv4Interface: stringify_with({"x-python-datatype": "IPv4Interface"}),
        IPv6Interface: stringify_with({"x-python-datatype": "IPv6Interface"}),
        IPv4Network: stringify_with({"x-python-datatype": "IPv4Network"}),
        IPv6Network: stringify_with({"x-python-datatype": "IPv6Network"}),
    }

    # Pydantic

    with contextlib.suppress(Exception):
        import pydantic

        registry[pydantic.NameEmail] = stringify_with(
            {"format": "email", "x-python-datatype": "pydantic.NameEmail"},
        )
        # Secrets are never serialized: a fixed mask replaces the real value.
        registry[pydantic.SecretStr] = lambda _value, seen: _handle_str_based(
            "***",
            seen,
            {"x-python-datatype": "pydantic.SecretStr"},
        )
        registry[pydantic.SecretBytes] = lambda _value, seen: _handle_bytes(
            b"***",
            seen,
            {"x-python-datatype": "pydantic.SecretBytes"},
        )
        registry[pydantic.AnyUrl] = stringify_with(
            {"format": "url", "x-python-datatype": "pydantic.AnyUrl"},
        )
        registry[pydantic.BaseModel] = _handle_pydantic_model

    # NumPy

    with contextlib.suppress(Exception):
        import numpy as np

        registry[np.ndarray] = _handle_numpy_array
        # Scalars are converted to the matching builtin and serialized as such.
        registry[np.floating] = lambda value, seen: _serialize(float(value), seen)
        registry[np.integer] = lambda value, seen: _serialize(int(value), seen)
        registry[np.bool_] = lambda value, seen: _serialize(bool(value), seen)
        registry[np.str_] = stringify_with({"x-python-datatype": "numpy.str_"})
        registry[np.bytes_] = lambda value, seen: _handle_bytes(
            value,
            seen,
            {"x-python-datatype": "numpy.bytes_"},
        )

    # pandas

    with contextlib.suppress(Exception):
        import pandas as pd

        registry[pd.DataFrame] = _handle_pandas_dataframe
        registry[pd.Series] = _handle_pandas_series

    # Pillow

    with contextlib.suppress(Exception):
        import PIL.Image

        registry[PIL.Image.Image] = _handle_pil_image

    # Hugging Face datasets

    with contextlib.suppress(Exception):
        import datasets

        registry[datasets.Dataset] = _handle_dataset

    # pydub

    with contextlib.suppress(Exception):
        from pydub import AudioSegment

        registry[AudioSegment] = _handle_pydub_audio_segment

    # moviepy

    with contextlib.suppress(Exception):
        from moviepy import VideoFileClip

        registry[VideoFileClip] = _handle_moviepy_video_clip

    return registry
611
+
612
+
613
+ # Core functions
614
+
615
+
616
def _serialize(obj: t.Any, seen: set[int] | None = None) -> tuple[JsonValue, JsonDict]:  # noqa: PLR0911
    """
    Serializes an object into a JSON-compatible value and a JSON Schema.

    Lookup order: exact primitives, handlers registered for any class in the
    object's MRO, common collections and mappings, dataclasses and attrs
    classes, generic sequences, `to_dict()` / `asdict()` methods, and finally
    `safe_repr`. Any exception raised along the way is suppressed and results
    in the fallback.

    Args:
        obj: The object to serialize.
        seen: Ids of the objects already visited on the current path, used to
            detect circular references. A copy is extended for each level, so
            siblings do not affect one another.

    Returns:
        The serialized value and its schema (empty for plain primitives).
    """
    # Primitives early

    obj_type = type(obj)
    if obj is None or obj_type in (str, int, float, bool):
        return obj, EMPTY_SCHEMA

    # Subclasses of primitives (str/int-mixin enums, numpy scalars deriving
    # from float/str, ...) must get a chance to reach their registered
    # handler. Those without a handler are passed through unchanged.
    is_primitive_subclass = isinstance(obj, str | int | float | bool)

    # Cycle tracking

    seen = seen or set()

    obj_id = id(obj)
    if obj_id in seen:
        return "<circular reference>", {}

    seen = seen.copy()
    seen.add(obj_id)

    handlers = _get_handlers()

    with contextlib.suppress(Exception):
        # MRO-based lookup first

        for base in obj_type.__mro__:
            if base in handlers:
                handler = handlers[base]
                return handler(obj, seen)

        # Primitive subclass without a dedicated handler: keep it as-is rather
        # than letting e.g. a str subclass be treated as a generic sequence.

        if is_primitive_subclass:
            return obj, EMPTY_SCHEMA

        # Common collections

        if isinstance(obj, list | tuple | set | frozenset | deque):
            return _handle_sequence(obj, seen)

        if isinstance(obj, Mapping):
            return _handle_mapping(obj, seen)

        # Common struct types

        if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
            return _handle_dataclass(obj, seen)

        if _is_attrs_instance(obj_type):
            return _handle_attrs(obj, seen)

        # Generic sequences (if not list/tuple/set/deque and no other handler matched)

        if isinstance(obj, Sequence):
            return _handle_sequence(obj, seen)

        # Common fallbacks

        if hasattr(obj, "to_dict"):
            return _serialize(obj.to_dict(), seen)

        # Objects exposing a public `asdict()` method. (namedtuples expose
        # `_asdict()` and are tuples, so they are handled by the MRO lookup.)
        if hasattr(obj, "asdict"):
            return _serialize(obj.asdict(), seen)

    # A handler failed for a primitive subclass: the value itself is still
    # JSON-compatible, so pass it through instead of falling back to repr.

    if is_primitive_subclass:
        return obj, EMPTY_SCHEMA

    # Fallback to repr

    return safe_repr(obj), {
        "type": "string",
        "title": obj_type.__name__,
        "x-python-datatype": "unknown",
    }
680
+
681
+
682
@dataclasses.dataclass
class Serialized:
    """Result of `serialize`: the serialized data, its schema, and a short hash of each."""

    # JSON-compatible value produced for the object; None when serialization yielded None.
    data: JsonValue | None
    # Encoded form of `data` (UTF-8 text or compact JSON); None when `data` is None.
    data_bytes: bytes | None
    # Length of `data_bytes`; 0 when `data` is None.
    data_len: int
    # First 16 hex characters of the SHA-1 of `data_bytes`; sixteen zeros when `data` is None.
    data_hash: str
    # JSON Schema describing `data`; empty for plain primitives.
    schema: JsonDict
    # First 16 hex characters of the SHA-1 of the compact-JSON schema; sixteen zeros when the schema is empty.
    schema_hash: str
690
+
691
+
692
# Placeholder hash used when there is nothing to hash (no data / empty schema).
EMPTY_HASH = "0" * 16


def serialize(obj: t.Any) -> Serialized:
    """
    Serializes a Python object into a JSON-compatible structure and
    generates a corresponding JSON Schema, ensuring consistency between
    the serialization format and the schema.

    Primitive results are encoded from their `str()` form; everything else is
    encoded as compact JSON. Hashes are the first 16 hex characters of a SHA-1
    digest.

    Args:
        obj: The Python object to process.

    Returns:
        An object containing the serialized data, schema, and their hashes.
    """

    def short_sha1(raw: bytes) -> str:
        return hashlib.sha1(raw).hexdigest()[:16]  # noqa: S324 (using sha1 for speed)

    data, schema = _serialize(obj)

    if isinstance(data, str | int | bool | float):
        payload = str(data).encode()
    else:
        payload = json.dumps(data, separators=(",", ":")).encode()

    schema_json = json.dumps(schema, separators=(",", ":"))

    has_data = data is not None
    has_schema = bool(schema) and schema != EMPTY_SCHEMA

    return Serialized(
        data=data,
        data_bytes=payload if has_data else None,
        data_len=len(payload) if has_data else 0,
        data_hash=short_sha1(payload) if has_data else EMPTY_HASH,
        schema=schema,
        schema_hash=short_sha1(schema_json.encode()) if has_schema else EMPTY_HASH,
    )