atdata 0.2.0a1__py3-none-any.whl → 0.2.3b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,438 @@
1
+ """Schema codec for dynamic PackableSample type generation.
2
+
3
+ This module provides functionality to reconstruct Python PackableSample types
4
+ from schema records. This enables loading datasets without knowing the sample
5
+ type ahead of time - the type can be dynamically generated from stored schema
6
+ metadata.
7
+
8
+ The schema format follows the ATProto record structure defined in
9
+ ``atmosphere/_types.py``, with field types supporting primitives, ndarrays,
10
+ arrays, and schema references.
11
+
12
+ Examples:
13
+ >>> schema = {
14
+ ... "name": "ImageSample",
15
+ ... "version": "1.0.0",
16
+ ... "fields": [
17
+ ... {"name": "image", "fieldType": {"$type": "...#ndarray", "dtype": "float32"}, "optional": False},
18
+ ... {"name": "label", "fieldType": {"$type": "...#primitive", "primitive": "str"}, "optional": False},
19
+ ... ]
20
+ ... }
21
+ >>> ImageSample = schema_to_type(schema)
22
+ >>> sample = ImageSample(image=np.zeros((64, 64)), label="cat")
23
+ """
24
+
25
+ from dataclasses import field, make_dataclass
26
+ from typing import Any, Optional, Type
27
+ import hashlib
28
+
29
+ from numpy.typing import NDArray
30
+
31
+ # Import PackableSample for inheritance
32
+ from .dataset import PackableSample
33
+
34
+
35
# Upper bound on how many generated types are retained at once.
_TYPE_CACHE_MAX_SIZE = 256

# Generated sample types, keyed by schema cache key, so that identical schemas
# reuse one class instead of regenerating it. A plain dict keeps insertion
# order (Python 3.7+), which is what makes simple FIFO eviction possible.
_type_cache: dict[str, Type[PackableSample]] = {}
39
+
40
+
41
def _schema_cache_key(schema: dict) -> str:
    """Generate a cache key for a schema.

    The key has the form ``<name>@<version>#<hash>``. The hash covers, for every
    field, its name, its complete ``fieldType`` definition, and its ``optional``
    flag. Hashing the whole ``fieldType`` (rather than only its ``$type``) keeps
    two schemas that share a name and version but differ in primitive, dtype, or
    item type from mapping to the same key.

    Args:
        schema: Schema record dict. Missing 'name' / 'version' fall back to
            "Unknown" / "0.0.0"; a missing or empty 'fields' yields an empty
            field signature.

    Returns:
        The cache key string.
    """
    import json

    name = schema.get("name", "Unknown")
    version = schema.get("version", "0.0.0")
    fields = schema.get("fields") or []

    # Canonical JSON (sorted keys) so that dict key order never changes the key.
    # Malformed entries are still given a stable representation instead of
    # raising here; reporting them is left to the schema validation code.
    field_sig = json.dumps(
        [
            [f.get("name"), f.get("fieldType"), bool(f.get("optional", False))]
            if isinstance(f, dict)
            else repr(f)
            for f in fields
        ],
        sort_keys=True,
        default=repr,
    )

    # MD5 is used purely to shorten the signature, not for security.
    sig_hash = hashlib.md5(field_sig.encode(), usedforsecurity=False).hexdigest()[:8]
    return f"{name}@{version}#{sig_hash}"
59
+
60
+
61
def _field_type_to_python(field_type: dict, optional: bool = False) -> Any:
    """Translate a schema field type into a Python type annotation.

    Args:
        field_type: Field type dict carrying '$type' plus kind-specific entries
            ('primitive', 'items', 'ref', ...).
        optional: When True, the resulting annotation is wrapped in ``Optional``.

    Returns:
        A Python type annotation usable as a dataclass field type.

    Raises:
        ValueError: For unknown primitives, unknown kinds, and 'ref' kinds.
    """
    declared = field_type.get("$type", "")
    # "ac.foundation.dataset.schemaType#primitive" -> "primitive". Without a
    # '#', the simplified format's explicit 'kind' entry is used instead.
    kind = declared.split("#")[-1] if "#" in declared else field_type.get("kind", "")

    if kind == "ref":
        # References to other schemas cannot be generated dynamically yet.
        raise ValueError(
            f"Schema references ('ref') are not yet supported for dynamic type generation. "
            f"Referenced schema: {field_type.get('ref')}"
        )
    if kind not in ("primitive", "ndarray", "array"):
        raise ValueError(f"Unknown field type kind: {kind}")

    resolved: Any
    if kind == "primitive":
        primitive_name = field_type.get("primitive", "str")
        resolved = {
            "str": str,
            "int": int,
            "float": float,
            "bool": bool,
            "bytes": bytes,
        }.get(primitive_name)
        if resolved is None:
            raise ValueError(f"Unknown primitive type: {primitive_name}")
    elif kind == "ndarray":
        # The schema's dtype is not reflected in the annotation; a generic
        # NDArray is used for every ndarray field.
        resolved = NDArray
    else:
        # kind == "array": resolve the element type recursively when present.
        item_schema = field_type.get("items")
        resolved = (
            list[_field_type_to_python(item_schema, optional=False)]
            if item_schema
            else list
        )

    return Optional[resolved] if optional else resolved
127
+
128
+
129
def schema_to_type(
    schema: dict,
    *,
    use_cache: bool = True,
) -> Type[PackableSample]:
    """Generate a PackableSample subclass from a schema record.

    This function dynamically creates a dataclass that inherits from PackableSample,
    with fields matching the schema definition. The generated class can be used
    with ``Dataset[T]`` to load and process samples.

    Args:
        schema: Schema record dict with 'name', 'version', 'fields', etc.
            Fields should have 'name', 'fieldType', and 'optional' keys.
        use_cache: If True, cache and reuse generated types for identical schemas.
            Defaults to True.

    Returns:
        A dynamically generated PackableSample subclass. Optional fields default
        to None; the class carries ``__schema_version__`` and ``__schema_ref__``
        attributes taken from the schema's 'version' and '$ref' entries.

    Raises:
        ValueError: If the schema has no name, has no fields, contains a field
            without a name, or uses an unsupported field type.
        TypeError: Propagated from ``dataclasses.make_dataclass`` when field
            names are not valid identifiers, are duplicated, or when a required
            field follows an optional one.

    Examples:
        >>> schema = index.get_schema("local://schemas/MySample@1.0.0")
        >>> MySample = schema_to_type(schema)
        >>> ds = Dataset[MySample]("data.tar")
        >>> for sample in ds.ordered():
        ...     print(sample)
    """
    # Compute the cache key once and reuse it for both lookup and insertion.
    cache_key: Optional[str] = None
    if use_cache:
        try:
            cache_key = _schema_cache_key(schema)
        except (AttributeError, KeyError, TypeError):
            # The key cannot be derived from a malformed schema. Skip the cache
            # so the validation below reports the problem the same way it does
            # with use_cache=False; without a key, nothing is cached.
            cache_key = None
        else:
            cached = _type_cache.get(cache_key)
            if cached is not None:
                return cached

    # Extract schema metadata
    name = schema.get("name")
    if not name:
        raise ValueError("Schema must have a 'name' field")

    version = schema.get("version", "1.0.0")
    fields_data = schema.get("fields", [])

    if not fields_data:
        raise ValueError("Schema must have at least one field")

    # Build field definitions for make_dataclass
    # Format: (name, type) or (name, type, field())
    dataclass_fields: list[tuple[str, Any] | tuple[str, Any, Any]] = []

    for field_def in fields_data:
        field_name = field_def.get("name")
        if not field_name:
            raise ValueError("Each field must have a 'name'")

        is_optional = field_def.get("optional", False)
        python_type = _field_type_to_python(
            field_def.get("fieldType", {}), optional=is_optional
        )

        # Optional fields need a default value of None
        if is_optional:
            dataclass_fields.append((field_name, python_type, field(default=None)))
        else:
            dataclass_fields.append((field_name, python_type))

    # Create the dataclass dynamically, inheriting from PackableSample and
    # forwarding __post_init__ to it.
    generated_class = make_dataclass(
        name,
        dataclass_fields,
        bases=(PackableSample,),
        namespace={
            "__post_init__": lambda self: PackableSample.__post_init__(self),
            "__schema_version__": version,
            "__schema_ref__": schema.get("$ref"),  # original ref, if available
        },
    )

    # Cache the generated type with FIFO eviction
    if cache_key is not None:
        _type_cache[cache_key] = generated_class
        # Dict iteration starts at the oldest insertion, so this drops the
        # oldest entries until the cache is back within its size limit.
        while len(_type_cache) > _TYPE_CACHE_MAX_SIZE:
            del _type_cache[next(iter(_type_cache))]

    return generated_class
222
+
223
+
224
def _field_type_to_stub_str(field_type: dict, optional: bool = False) -> str:
    """Convert a schema field type to a Python type string for stub files.

    Unlike the runtime conversion, this function never raises for unsupported
    input: unknown kinds, schema references, and unrecognised primitive names
    all degrade to ``Any``.

    Args:
        field_type: Field type dict with '$type' and type-specific fields.
        optional: Whether this field is optional (can be None).

    Returns:
        String representation of the Python type for use in .pyi files, with
        ``" | None"`` appended when ``optional`` is true.
    """
    type_str = field_type.get("$type", "")

    # Extract kind from $type, falling back to the simplified 'kind' entry
    if "#" in type_str:
        kind = type_str.split("#")[-1]
    else:
        kind = field_type.get("kind", "")

    if kind == "primitive":
        primitive = field_type.get("primitive", "str")
        # The value is pasted into generated source, so only the known builtin
        # type names are passed through; anything else becomes Any rather than
        # being copied verbatim from the schema.
        if primitive in ("str", "int", "float", "bool", "bytes"):
            py_type = primitive
        else:
            py_type = "Any"
    elif kind == "ndarray":
        py_type = "NDArray[Any]"
    elif kind == "array":
        items = field_type.get("items")
        if items:
            item_type = _field_type_to_stub_str(items, optional=False)
            py_type = f"list[{item_type}]"
        else:
            py_type = "list[Any]"
    else:
        # Schema references ('ref') and unknown kinds are typed as Any for now
        py_type = "Any"

    if optional:
        return f"{py_type} | None"
    return py_type
265
+
266
+
267
def generate_stub(schema: dict) -> str:
    """Generate a .pyi stub file content for a schema.

    This function creates type stub content that can be saved to a .pyi file
    to provide IDE autocomplete and type checking support for dynamically
    decoded sample types.

    Note:
        Types created by ``schema_to_type()`` work correctly at runtime but
        static type checkers cannot analyze dynamically generated classes.
        Stub files bridge this gap by providing static type information.

    Args:
        schema: Schema record dict with 'name', 'version', 'fields', etc.
            Missing 'name' / 'version' default to "UnknownSample" / "1.0.0";
            a field without a 'name' is emitted as "unknown".

    Returns:
        String content for a .pyi stub file.

    Raises:
        ValueError: If the schema name or a field name is not a valid Python
            identifier, or if the version cannot be embedded safely in the
            generated comment and docstring.

    Examples:
        >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
        >>> stub_content = generate_stub(schema.to_dict())
        >>> # Save to a stubs directory configured in your IDE
        >>> with open("stubs/my_sample.pyi", "w") as f:
        ...     f.write(stub_content)
    """
    import keyword

    def _identifier(value: object, what: str) -> str:
        # Names are pasted into source text; anything that is not a plain
        # identifier would produce a broken (or attacker-shaped) stub.
        if (
            not isinstance(value, str)
            or not value.isidentifier()
            or keyword.iskeyword(value)
        ):
            raise ValueError(f"{what} is not a valid Python identifier: {value!r}")
        return value

    name = _identifier(schema.get("name", "UnknownSample"), "Schema name")
    version = str(schema.get("version", "1.0.0"))
    # The version lands in a '#' comment and a docstring: control characters,
    # quotes, and backslashes could terminate either one.
    if not version.isprintable() or '"' in version or "\\" in version:
        raise ValueError(f"Schema version cannot be embedded in generated source: {version!r}")
    fields = schema.get("fields") or []

    lines = [
        "# Auto-generated stub for dynamically decoded schema",
        f"# Schema: {name}@{version}",
        "#",
        "# Save this file to a stubs directory and configure your IDE to include it.",
        "# For VS Code/Pylance: add to python.analysis.extraPaths in settings.json",
        "# For PyCharm: mark the stubs directory as Sources Root",
        "",
        "from typing import Any",
        "from numpy.typing import NDArray",
        "from atdata import PackableSample",
        "",
        f"class {name}(PackableSample):",
        f'    """Dynamically decoded sample type from schema {name}@{version}."""',
    ]

    # One pass over the fields yields both the class-level annotations and the
    # matching __init__ parameters, so each field type is converted only once.
    annotations = []
    init_params = ["self"]
    for field_def in fields:
        fname = _identifier(field_def.get("name", "unknown"), "Field name")
        is_optional = field_def.get("optional", False)
        ftype = _field_type_to_stub_str(field_def.get("fieldType", {}), is_optional)
        annotations.append(f"    {fname}: {ftype}")
        if is_optional:
            init_params.append(f"{fname}: {ftype} = None")
        else:
            init_params.append(f"{fname}: {ftype}")

    # A class without fields still needs a body statement.
    lines.extend(annotations or ["    pass"])

    # Add __init__ signature
    lines.append("")
    lines.append(f"    def __init__({', '.join(init_params)}) -> None: ...")
    lines.append("")

    return "\n".join(lines)
342
+
343
+
344
def generate_module(schema: dict) -> str:
    """Generate an importable Python module for a schema.

    This function creates a Python module that defines a PackableSample subclass
    matching the schema. Unlike stub files, this module can be imported at runtime,
    allowing ``decode_schema`` to return properly typed classes.

    The generated class inherits from PackableSample and uses @dataclass decorator
    for proper initialization. This provides both runtime functionality and static
    type checking support.

    Because the result is meant to be imported (and therefore executed), every
    piece of schema text that ends up in the source is validated first.

    Args:
        schema: Schema record dict with 'name', 'version', 'fields', etc.
            Missing 'name' / 'version' default to "UnknownSample" / "1.0.0";
            a field without a 'name' is emitted as "unknown".

    Returns:
        String content for a .py module file.

    Raises:
        ValueError: If the schema name or a field name is not a valid Python
            identifier, if the version cannot be embedded safely in a
            docstring, or if a field's type string contains characters that
            do not belong in a type annotation.

    Examples:
        >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
        >>> module_content = generate_module(schema.to_dict())
        >>> # The module can be imported after being saved
    """
    import keyword

    def _identifier(value: object, what: str) -> str:
        # Names are pasted into executable source; only plain identifiers are
        # accepted so schema text can never introduce extra statements.
        if (
            not isinstance(value, str)
            or not value.isidentifier()
            or keyword.iskeyword(value)
        ):
            raise ValueError(f"{what} is not a valid Python identifier: {value!r}")
        return value

    name = _identifier(schema.get("name", "UnknownSample"), "Schema name")
    version = str(schema.get("version", "1.0.0"))
    # The version is embedded in two docstrings: control characters, quotes,
    # and backslashes could close the string literal early.
    if not version.isprintable() or '"' in version or "\\" in version:
        raise ValueError(f"Schema version cannot be embedded in generated source: {version!r}")
    fields = schema.get("fields") or []

    lines = [
        '"""Auto-generated module for dynamically decoded schema.',
        "",
        f"Schema: {name}@{version}",
        "",
        "This module is auto-generated by atdata to provide IDE autocomplete",
        "and type checking support for dynamically decoded schema types.",
        '"""',
        "",
        "from dataclasses import dataclass",
        "from typing import Any",
        "from numpy.typing import NDArray",
        "from atdata import PackableSample",
        "",
        "",
        "@dataclass",
        f"class {name}(PackableSample):",
        f'    """Dynamically decoded sample type from schema {name}@{version}."""',
        "",
    ]

    # Add field annotations
    body = []
    for field_def in fields:
        fname = _identifier(field_def.get("name", "unknown"), "Field name")
        is_optional = field_def.get("optional", False)
        ftype = _field_type_to_stub_str(field_def.get("fieldType", {}), is_optional)
        # Annotations are evaluated when the module is imported; restrict them
        # to name/subscript/union characters so they cannot contain calls.
        if not all(ch.isalnum() or ch in "_[]|,. " for ch in ftype):
            raise ValueError(f"Unsafe type annotation for field {fname!r}: {ftype!r}")
        if is_optional:
            body.append(f"    {fname}: {ftype} = None")
        else:
            body.append(f"    {fname}: {ftype}")

    # A class without fields still needs a body statement.
    lines.extend(body or ["    pass"])

    lines.append("")
    lines.append("")
    lines.append(f"__all__ = [{name!r}]")
    lines.append("")

    return "\n".join(lines)
413
+
414
+
415
def clear_type_cache() -> None:
    """Remove every generated type from the module-level cache.

    Handy in tests, or after schema definitions have changed and previously
    generated types should no longer be reused.
    """
    _type_cache.clear()
421
+
422
+
423
def get_cached_types() -> dict[str, Type[PackableSample]]:
    """Return a snapshot of the type cache.

    The result is a shallow copy, so adding or removing keys on it leaves the
    module-level cache untouched.

    Returns:
        Dictionary mapping cache keys to generated types.
    """
    return _type_cache.copy()
430
+
431
+
432
# Public API of this module.
__all__ = [
    # Runtime type generation and source generation
    "schema_to_type",
    "generate_stub",
    "generate_module",
    # Type cache management
    "clear_type_cache",
    "get_cached_types",
]