atdata 0.2.0a1__py3-none-any.whl → 0.2.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,442 @@
1
+ """Schema codec for dynamic PackableSample type generation.
2
+
3
+ This module provides functionality to reconstruct Python PackableSample types
4
+ from schema records. This enables loading datasets without knowing the sample
5
+ type ahead of time - the type can be dynamically generated from stored schema
6
+ metadata.
7
+
8
+ The schema format follows the ATProto record structure defined in
9
+ ``atmosphere/_types.py``, with field types supporting primitives, ndarrays,
10
+ arrays, and schema references.
11
+
12
+ Example:
13
+ ::
14
+
15
+ >>> schema = {
16
+ ... "name": "ImageSample",
17
+ ... "version": "1.0.0",
18
+ ... "fields": [
19
+ ... {"name": "image", "fieldType": {"$type": "...#ndarray", "dtype": "float32"}, "optional": False},
20
+ ... {"name": "label", "fieldType": {"$type": "...#primitive", "primitive": "str"}, "optional": False},
21
+ ... ]
22
+ ... }
23
+ >>> ImageSample = schema_to_type(schema)
24
+ >>> sample = ImageSample(image=np.zeros((64, 64)), label="cat")
25
+ """
26
+
27
import hashlib
import json
import keyword
from dataclasses import field, make_dataclass
from typing import Any, Optional, Type

from numpy.typing import NDArray

# Import PackableSample for inheritance
from .dataset import PackableSample
35
+
36
+
37
# Type cache to avoid regenerating identical types.
# Keys are the strings produced by ``_schema_cache_key``; values are the
# generated PackableSample subclasses.
# Uses insertion order (Python 3.7+) for simple FIFO eviction.
_type_cache: dict[str, Type[PackableSample]] = {}
# Upper bound on cached types; ``schema_to_type`` drops the oldest entries
# once the cache grows beyond this size.
_TYPE_CACHE_MAX_SIZE = 256
41
+
42
+
43
+ def _schema_cache_key(schema: dict) -> str:
44
+ """Generate a cache key for a schema.
45
+
46
+ Uses name + version + field signature to identify unique schemas.
47
+ """
48
+ name = schema.get("name", "Unknown")
49
+ version = schema.get("version", "0.0.0")
50
+ fields = schema.get("fields", [])
51
+
52
+ # Create a stable string representation of fields
53
+ field_sig = ";".join(
54
+ f"{f['name']}:{f['fieldType'].get('$type', '')}:{f.get('optional', False)}"
55
+ for f in fields
56
+ )
57
+
58
+ # Hash for compactness
59
+ sig_hash = hashlib.md5(field_sig.encode()).hexdigest()[:8]
60
+ return f"{name}@{version}#{sig_hash}"
61
+
62
+
63
+ def _field_type_to_python(field_type: dict, optional: bool = False) -> Any:
64
+ """Convert a schema field type to a Python type annotation.
65
+
66
+ Args:
67
+ field_type: Field type dict with '$type' and type-specific fields.
68
+ optional: Whether this field is optional (can be None).
69
+
70
+ Returns:
71
+ Python type annotation suitable for dataclass field.
72
+
73
+ Raises:
74
+ ValueError: If field type is not supported.
75
+ """
76
+ type_str = field_type.get("$type", "")
77
+
78
+ # Extract kind from $type (e.g., "ac.foundation.dataset.schemaType#primitive" -> "primitive")
79
+ if "#" in type_str:
80
+ kind = type_str.split("#")[-1]
81
+ else:
82
+ # Fallback for simplified format
83
+ kind = field_type.get("kind", "")
84
+
85
+ python_type: Any
86
+
87
+ if kind == "primitive":
88
+ primitive = field_type.get("primitive", "str")
89
+ primitive_map = {
90
+ "str": str,
91
+ "int": int,
92
+ "float": float,
93
+ "bool": bool,
94
+ "bytes": bytes,
95
+ }
96
+ python_type = primitive_map.get(primitive)
97
+ if python_type is None:
98
+ raise ValueError(f"Unknown primitive type: {primitive}")
99
+
100
+ elif kind == "ndarray":
101
+ # NDArray type - dtype info is available but we use generic NDArray
102
+ # The dtype is handled at runtime by PackableSample serialization
103
+ python_type = NDArray
104
+
105
+ elif kind == "array":
106
+ # List type - recursively resolve item type
107
+ items = field_type.get("items")
108
+ if items:
109
+ item_type = _field_type_to_python(items, optional=False)
110
+ python_type = list[item_type]
111
+ else:
112
+ python_type = list
113
+
114
+ elif kind == "ref":
115
+ # Reference to another schema - not yet supported for dynamic generation
116
+ raise ValueError(
117
+ f"Schema references ('ref') are not yet supported for dynamic type generation. "
118
+ f"Referenced schema: {field_type.get('ref')}"
119
+ )
120
+
121
+ else:
122
+ raise ValueError(f"Unknown field type kind: {kind}")
123
+
124
+ # Wrap in Optional if needed
125
+ if optional:
126
+ python_type = Optional[python_type]
127
+
128
+ return python_type
129
+
130
+
131
def schema_to_type(
    schema: dict,
    *,
    use_cache: bool = True,
) -> Type[PackableSample]:
    """Build a PackableSample dataclass whose fields mirror a schema record.

    The class is created with ``make_dataclass``, inherits from
    PackableSample, and carries the schema version and original ``$ref`` as
    ``__schema_version__`` and ``__schema_ref__`` class attributes. It can be
    used with ``Dataset[T]`` to load and process samples.

    Args:
        schema: Schema record dict with 'name', 'version', 'fields', etc.
            Each field should have 'name', 'fieldType', and 'optional' keys.
        use_cache: If True, reuse a previously generated type for an
            identical schema and remember newly generated ones.
            Defaults to True.

    Returns:
        A dynamically generated PackableSample subclass.

    Raises:
        ValueError: If the schema has no name, no fields, a field without a
            name, or a field type that cannot be converted.

    Example:
        ::

            >>> schema = index.get_schema("local://schemas/MySample@1.0.0")
            >>> MySample = schema_to_type(schema)
            >>> ds = Dataset[MySample]("data.tar")
            >>> for sample in ds.ordered():
            ...     print(sample)
    """
    # The key depends only on the schema, which is not modified below, so it
    # is derived once and reused for both the lookup and the insertion.
    cache_key = _schema_cache_key(schema) if use_cache else None
    if use_cache and cache_key in _type_cache:
        return _type_cache[cache_key]

    name = schema.get("name")
    if not name:
        raise ValueError("Schema must have a 'name' field")

    version = schema.get("version", "1.0.0")
    fields_data = schema.get("fields", [])
    if not fields_data:
        raise ValueError("Schema must have at least one field")

    # make_dataclass accepts (name, type) or (name, type, field()) tuples.
    specs: list[tuple[str, Any] | tuple[str, Any, Any]] = []
    for entry in fields_data:
        attr_name = entry.get("name")
        if not attr_name:
            raise ValueError("Each field must have a 'name'")

        nullable = entry.get("optional", False)
        annotation = _field_type_to_python(entry.get("fieldType", {}), optional=nullable)

        if nullable:
            # Optional fields default to None so they may be omitted.
            specs.append((attr_name, annotation, field(default=None)))
        else:
            specs.append((attr_name, annotation))

    class_namespace = {
        # Explicitly forward to the base-class hook after dataclass init.
        "__post_init__": lambda self: PackableSample.__post_init__(self),
        "__schema_version__": version,
        "__schema_ref__": schema.get("$ref", None),  # Original ref, if any
    }
    generated_class = make_dataclass(
        name,
        specs,
        bases=(PackableSample,),
        namespace=class_namespace,
    )

    if use_cache:
        _type_cache[cache_key] = generated_class
        # FIFO eviction: dicts iterate in insertion order, so the first key
        # is always the oldest entry.
        while len(_type_cache) > _TYPE_CACHE_MAX_SIZE:
            del _type_cache[next(iter(_type_cache))]

    return generated_class
224
+
225
+
226
+ def _field_type_to_stub_str(field_type: dict, optional: bool = False) -> str:
227
+ """Convert a schema field type to a Python type string for stub files.
228
+
229
+ Args:
230
+ field_type: Field type dict with '$type' and type-specific fields.
231
+ optional: Whether this field is optional (can be None).
232
+
233
+ Returns:
234
+ String representation of the Python type for use in .pyi files.
235
+ """
236
+ type_str = field_type.get("$type", "")
237
+
238
+ # Extract kind from $type
239
+ if "#" in type_str:
240
+ kind = type_str.split("#")[-1]
241
+ else:
242
+ kind = field_type.get("kind", "")
243
+
244
+ if kind == "primitive":
245
+ primitive = field_type.get("primitive", "str")
246
+ py_type = primitive # str, int, float, bool, bytes are all valid Python type names
247
+ elif kind == "ndarray":
248
+ py_type = "NDArray[Any]"
249
+ elif kind == "array":
250
+ items = field_type.get("items")
251
+ if items:
252
+ item_type = _field_type_to_stub_str(items, optional=False)
253
+ py_type = f"list[{item_type}]"
254
+ else:
255
+ py_type = "list[Any]"
256
+ elif kind == "ref":
257
+ # Reference to another schema - use Any for now
258
+ py_type = "Any"
259
+ else:
260
+ py_type = "Any"
261
+
262
+ if optional:
263
+ return f"{py_type} | None"
264
+ return py_type
265
+
266
+
267
def generate_stub(schema: dict) -> str:
    """Generate a .pyi stub file content for a schema.

    This function creates type stub content that can be saved to a .pyi file
    to provide IDE autocomplete and type checking support for dynamically
    decoded sample types.

    Note:
        Types created by ``schema_to_type()`` work correctly at runtime but
        static type checkers cannot analyze dynamically generated classes.
        Stub files bridge this gap by providing static type information.

    Args:
        schema: Schema record dict with 'name', 'version', 'fields', etc.
            A missing name defaults to ``UnknownSample``, a missing version
            to ``1.0.0`` and a missing field name to ``unknown``.

    Returns:
        String content for a .pyi stub file.

    Raises:
        ValueError: If the schema name or a field name is not a valid Python
            identifier, or the version contains characters that cannot be
            embedded in a comment or docstring.

    Example:
        ::

            >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
            >>> stub_content = generate_stub(schema.to_dict())
            >>> # Save to a stubs directory configured in your IDE
            >>> with open("stubs/my_sample.pyi", "w") as f:
            ...     f.write(stub_content)
    """
    name = schema.get("name", "UnknownSample")
    version = schema.get("version", "1.0.0")
    fields = schema.get("fields") or []

    def is_valid_name(value):
        return (
            isinstance(value, str)
            and value.isidentifier()
            and not keyword.iskeyword(value)
        )

    # Name, version and field names are interpolated into source text below.
    # Reject anything that would yield invalid code or smuggle in extra
    # statements (line breaks, quotes, punctuation).
    if not is_valid_name(name):
        raise ValueError(f"Schema name is not a valid Python class name: {name!r}")
    version_text = str(version)
    if not version_text.isprintable() or '"' in version_text or "\\" in version_text:
        raise ValueError(
            f"Schema version cannot be embedded in generated source: {version!r}"
        )

    # Resolve every field once; the annotation and __init__ sections share it.
    members = []
    for field_def in fields:
        fname = field_def.get("name", "unknown")
        if not is_valid_name(fname):
            raise ValueError(f"Field name is not a valid Python identifier: {fname!r}")
        is_optional = field_def.get("optional", False)
        ftype = _field_type_to_stub_str(field_def.get("fieldType", {}), is_optional)
        members.append((fname, ftype, is_optional))

    lines = [
        "# Auto-generated stub for dynamically decoded schema",
        f"# Schema: {name}@{version}",
        "#",
        "# Save this file to a stubs directory and configure your IDE to include it.",
        "# For VS Code/Pylance: add to python.analysis.extraPaths in settings.json",
        "# For PyCharm: mark the stubs directory as Sources Root",
        "",
        "from typing import Any",
        "from numpy.typing import NDArray",
        "from atdata import PackableSample",
        "",
        f"class {name}(PackableSample):",
        f'    """Dynamically decoded sample type from schema {name}@{version}."""',
    ]

    # Field annotations
    if members:
        lines.extend(f"    {fname}: {ftype}" for fname, ftype, _ in members)
    else:
        lines.append("    pass")

    # __init__ signature; optional fields default to None
    init_params = ["self"]
    init_params.extend(
        f"{fname}: {ftype} = None" if is_optional else f"{fname}: {ftype}"
        for fname, ftype, is_optional in members
    )
    lines.append("")
    lines.append(f"    def __init__({', '.join(init_params)}) -> None: ...")
    lines.append("")

    return "\n".join(lines)
344
+
345
+
346
def generate_module(schema: dict) -> str:
    """Generate an importable Python module for a schema.

    This function creates a Python module that defines a PackableSample subclass
    matching the schema. Unlike stub files, this module can be imported at runtime,
    allowing ``decode_schema`` to return properly typed classes.

    The generated class inherits from PackableSample and uses the @dataclass
    decorator for proper initialization. This provides both runtime
    functionality and static type checking support.

    Because the result is meant to be imported (and therefore executed), every
    schema value that ends up in the source text is validated first.

    Note:
        Fields are emitted in schema order. A required field listed after an
        optional one produces a dataclass that Python rejects at import time
        (non-default field after a default one).

    Args:
        schema: Schema record dict with 'name', 'version', 'fields', etc.
            A missing name defaults to ``UnknownSample``, a missing version
            to ``1.0.0`` and a missing field name to ``unknown``.

    Returns:
        String content for a .py module file.

    Raises:
        ValueError: If the schema name or a field name is not a valid Python
            identifier, or the version contains characters that cannot be
            embedded in a docstring.

    Example:
        ::

            >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
            >>> module_content = generate_module(schema.to_dict())
            >>> # The module can be imported after being saved
    """
    name = schema.get("name", "UnknownSample")
    version = schema.get("version", "1.0.0")
    fields = schema.get("fields") or []

    def is_valid_name(value):
        return (
            isinstance(value, str)
            and value.isidentifier()
            and not keyword.iskeyword(value)
        )

    # Name, version and field names are interpolated into executable source.
    # Reject anything that would yield invalid code or smuggle in extra
    # statements (line breaks, quotes, punctuation).
    if not is_valid_name(name):
        raise ValueError(f"Schema name is not a valid Python class name: {name!r}")
    version_text = str(version)
    if not version_text.isprintable() or '"' in version_text or "\\" in version_text:
        raise ValueError(
            f"Schema version cannot be embedded in generated source: {version!r}"
        )

    lines = [
        '"""Auto-generated module for dynamically decoded schema.',
        "",
        f"Schema: {name}@{version}",
        "",
        "This module is auto-generated by atdata to provide IDE autocomplete",
        "and type checking support for dynamically decoded schema types.",
        '"""',
        "",
        "from dataclasses import dataclass",
        "from typing import Any",
        "from numpy.typing import NDArray",
        "from atdata import PackableSample",
        "",
        "",
        "@dataclass",
        f"class {name}(PackableSample):",
        f'    """Dynamically decoded sample type from schema {name}@{version}."""',
        "",
    ]

    # Field annotations; optional fields default to None
    if fields:
        for field_def in fields:
            fname = field_def.get("name", "unknown")
            if not is_valid_name(fname):
                raise ValueError(
                    f"Field name is not a valid Python identifier: {fname!r}"
                )
            is_optional = field_def.get("optional", False)
            ftype = _field_type_to_stub_str(field_def.get("fieldType", {}), is_optional)
            if is_optional:
                lines.append(f"    {fname}: {ftype} = None")
            else:
                lines.append(f"    {fname}: {ftype}")
    else:
        lines.append("    pass")

    lines.append("")
    lines.append("")
    lines.append(f"__all__ = [{name!r}]")
    lines.append("")

    return "\n".join(lines)
417
+
418
+
419
def clear_type_cache() -> None:
    """Clear the cached generated types.

    Empties the module-level type cache in place, so the next
    ``schema_to_type`` call with caching enabled regenerates its class.

    Useful for testing or when schema definitions change.
    """
    _type_cache.clear()
425
+
426
+
427
def get_cached_types() -> dict[str, Type[PackableSample]]:
    """Return a snapshot of the generated-type cache.

    The result is a shallow copy: adding or removing entries in it does not
    affect the cache itself.

    Returns:
        Dictionary mapping cache keys to generated types.
    """
    snapshot = _type_cache.copy()
    return snapshot
434
+
435
+
436
# Public API of this module. The cache-key and field-type conversion helpers
# (underscore-prefixed) are intentionally not exported.
__all__ = [
    "schema_to_type",
    "generate_stub",
    "generate_module",
    "clear_type_cache",
    "get_cached_types",
]