atdata 0.2.0a1__py3-none-any.whl → 0.2.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +43 -10
- atdata/_cid.py +150 -0
- atdata/_hf_api.py +692 -0
- atdata/_protocols.py +519 -0
- atdata/_schema_codec.py +442 -0
- atdata/_sources.py +515 -0
- atdata/_stub_manager.py +529 -0
- atdata/_type_utils.py +90 -0
- atdata/atmosphere/__init__.py +278 -7
- atdata/atmosphere/_types.py +9 -7
- atdata/atmosphere/client.py +146 -6
- atdata/atmosphere/lens.py +29 -25
- atdata/atmosphere/records.py +197 -30
- atdata/atmosphere/schema.py +41 -98
- atdata/atmosphere/store.py +208 -0
- atdata/cli/__init__.py +213 -0
- atdata/cli/diagnose.py +165 -0
- atdata/cli/local.py +280 -0
- atdata/dataset.py +482 -167
- atdata/lens.py +61 -57
- atdata/local.py +1400 -185
- atdata/promote.py +199 -0
- {atdata-0.2.0a1.dist-info → atdata-0.2.2b1.dist-info}/METADATA +105 -14
- atdata-0.2.2b1.dist-info/RECORD +28 -0
- atdata-0.2.0a1.dist-info/RECORD +0 -16
- {atdata-0.2.0a1.dist-info → atdata-0.2.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.0a1.dist-info → atdata-0.2.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.0a1.dist-info → atdata-0.2.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/_schema_codec.py
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
"""Schema codec for dynamic PackableSample type generation.
|
|
2
|
+
|
|
3
|
+
This module provides functionality to reconstruct Python PackableSample types
|
|
4
|
+
from schema records. This enables loading datasets without knowing the sample
|
|
5
|
+
type ahead of time - the type can be dynamically generated from stored schema
|
|
6
|
+
metadata.
|
|
7
|
+
|
|
8
|
+
The schema format follows the ATProto record structure defined in
|
|
9
|
+
``atmosphere/_types.py``, with field types supporting primitives, ndarrays,
|
|
10
|
+
arrays, and schema references.
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
::
|
|
14
|
+
|
|
15
|
+
>>> schema = {
|
|
16
|
+
... "name": "ImageSample",
|
|
17
|
+
... "version": "1.0.0",
|
|
18
|
+
... "fields": [
|
|
19
|
+
... {"name": "image", "fieldType": {"$type": "...#ndarray", "dtype": "float32"}, "optional": False},
|
|
20
|
+
... {"name": "label", "fieldType": {"$type": "...#primitive", "primitive": "str"}, "optional": False},
|
|
21
|
+
... ]
|
|
22
|
+
... }
|
|
23
|
+
>>> ImageSample = schema_to_type(schema)
|
|
24
|
+
>>> sample = ImageSample(image=np.zeros((64, 64)), label="cat")
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from dataclasses import field, make_dataclass
|
|
28
|
+
from typing import Any, Optional, Type
|
|
29
|
+
import hashlib
|
|
30
|
+
|
|
31
|
+
from numpy.typing import NDArray
|
|
32
|
+
|
|
33
|
+
# Import PackableSample for inheritance
|
|
34
|
+
from .dataset import PackableSample
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Type cache to avoid regenerating identical types.
# Keyed by _schema_cache_key(); relies on dict insertion order (Python 3.7+)
# for simple FIFO eviction performed in schema_to_type().
_type_cache: dict[str, Type[PackableSample]] = {}
# Maximum number of generated types retained before FIFO eviction kicks in.
_TYPE_CACHE_MAX_SIZE = 256
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _schema_cache_key(schema: dict) -> str:
|
|
44
|
+
"""Generate a cache key for a schema.
|
|
45
|
+
|
|
46
|
+
Uses name + version + field signature to identify unique schemas.
|
|
47
|
+
"""
|
|
48
|
+
name = schema.get("name", "Unknown")
|
|
49
|
+
version = schema.get("version", "0.0.0")
|
|
50
|
+
fields = schema.get("fields", [])
|
|
51
|
+
|
|
52
|
+
# Create a stable string representation of fields
|
|
53
|
+
field_sig = ";".join(
|
|
54
|
+
f"{f['name']}:{f['fieldType'].get('$type', '')}:{f.get('optional', False)}"
|
|
55
|
+
for f in fields
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Hash for compactness
|
|
59
|
+
sig_hash = hashlib.md5(field_sig.encode()).hexdigest()[:8]
|
|
60
|
+
return f"{name}@{version}#{sig_hash}"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _field_type_to_python(field_type: dict, optional: bool = False) -> Any:
|
|
64
|
+
"""Convert a schema field type to a Python type annotation.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
field_type: Field type dict with '$type' and type-specific fields.
|
|
68
|
+
optional: Whether this field is optional (can be None).
|
|
69
|
+
|
|
70
|
+
Returns:
|
|
71
|
+
Python type annotation suitable for dataclass field.
|
|
72
|
+
|
|
73
|
+
Raises:
|
|
74
|
+
ValueError: If field type is not supported.
|
|
75
|
+
"""
|
|
76
|
+
type_str = field_type.get("$type", "")
|
|
77
|
+
|
|
78
|
+
# Extract kind from $type (e.g., "ac.foundation.dataset.schemaType#primitive" -> "primitive")
|
|
79
|
+
if "#" in type_str:
|
|
80
|
+
kind = type_str.split("#")[-1]
|
|
81
|
+
else:
|
|
82
|
+
# Fallback for simplified format
|
|
83
|
+
kind = field_type.get("kind", "")
|
|
84
|
+
|
|
85
|
+
python_type: Any
|
|
86
|
+
|
|
87
|
+
if kind == "primitive":
|
|
88
|
+
primitive = field_type.get("primitive", "str")
|
|
89
|
+
primitive_map = {
|
|
90
|
+
"str": str,
|
|
91
|
+
"int": int,
|
|
92
|
+
"float": float,
|
|
93
|
+
"bool": bool,
|
|
94
|
+
"bytes": bytes,
|
|
95
|
+
}
|
|
96
|
+
python_type = primitive_map.get(primitive)
|
|
97
|
+
if python_type is None:
|
|
98
|
+
raise ValueError(f"Unknown primitive type: {primitive}")
|
|
99
|
+
|
|
100
|
+
elif kind == "ndarray":
|
|
101
|
+
# NDArray type - dtype info is available but we use generic NDArray
|
|
102
|
+
# The dtype is handled at runtime by PackableSample serialization
|
|
103
|
+
python_type = NDArray
|
|
104
|
+
|
|
105
|
+
elif kind == "array":
|
|
106
|
+
# List type - recursively resolve item type
|
|
107
|
+
items = field_type.get("items")
|
|
108
|
+
if items:
|
|
109
|
+
item_type = _field_type_to_python(items, optional=False)
|
|
110
|
+
python_type = list[item_type]
|
|
111
|
+
else:
|
|
112
|
+
python_type = list
|
|
113
|
+
|
|
114
|
+
elif kind == "ref":
|
|
115
|
+
# Reference to another schema - not yet supported for dynamic generation
|
|
116
|
+
raise ValueError(
|
|
117
|
+
f"Schema references ('ref') are not yet supported for dynamic type generation. "
|
|
118
|
+
f"Referenced schema: {field_type.get('ref')}"
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
else:
|
|
122
|
+
raise ValueError(f"Unknown field type kind: {kind}")
|
|
123
|
+
|
|
124
|
+
# Wrap in Optional if needed
|
|
125
|
+
if optional:
|
|
126
|
+
python_type = Optional[python_type]
|
|
127
|
+
|
|
128
|
+
return python_type
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def schema_to_type(
    schema: dict,
    *,
    use_cache: bool = True,
) -> Type[PackableSample]:
    """Build a PackableSample subclass from a schema record.

    Dynamically creates a dataclass inheriting from PackableSample whose
    fields mirror the schema definition. The resulting class can be used
    with ``Dataset[T]`` to load and process samples.

    Args:
        schema: Schema record dict with 'name', 'version', 'fields', etc.
            Each field entry should carry 'name', 'fieldType', and 'optional'.
        use_cache: When True (the default), identical schemas reuse a
            previously generated type instead of creating a new class.

    Returns:
        A freshly generated (or cached) PackableSample subclass.

    Raises:
        ValueError: If the schema is malformed or uses unsupported types.

    Example:
        ::

            >>> schema = index.get_schema("local://schemas/MySample@1.0.0")
            >>> MySample = schema_to_type(schema)
            >>> ds = Dataset[MySample]("data.tar")
            >>> for sample in ds.ordered():
            ...     print(sample)
    """
    # Compute the cache key once; it serves both the lookup here and the
    # insertion at the end.
    cache_key = _schema_cache_key(schema) if use_cache else None
    if cache_key is not None and cache_key in _type_cache:
        return _type_cache[cache_key]

    class_name = schema.get("name")
    if not class_name:
        raise ValueError("Schema must have a 'name' field")

    schema_version = schema.get("version", "1.0.0")
    field_specs = schema.get("fields", [])
    if not field_specs:
        raise ValueError("Schema must have at least one field")

    # Assemble (name, type) / (name, type, field()) tuples for make_dataclass.
    entries: list[tuple[str, Any] | tuple[str, Any, Any]] = []
    for spec in field_specs:
        attr_name = spec.get("name")
        if not attr_name:
            raise ValueError("Each field must have a 'name'")

        is_optional = spec.get("optional", False)
        annotation = _field_type_to_python(spec.get("fieldType", {}), optional=is_optional)

        # Optional fields default to None so callers may omit them.
        if is_optional:
            entries.append((attr_name, annotation, field(default=None)))
        else:
            entries.append((attr_name, annotation))

    # Build the class; delegate __post_init__ to PackableSample so the
    # generated dataclass keeps the base class's initialization behavior.
    generated = make_dataclass(
        class_name,
        entries,
        bases=(PackableSample,),
        namespace={
            "__post_init__": lambda self: PackableSample.__post_init__(self),
            "__schema_version__": schema_version,
            "__schema_ref__": schema.get("$ref", None),  # original ref, if available
        },
    )

    # Cache the generated type; FIFO eviction relies on dict insertion
    # order (Python 3.7+).
    if cache_key is not None:
        _type_cache[cache_key] = generated
        while len(_type_cache) > _TYPE_CACHE_MAX_SIZE:
            del _type_cache[next(iter(_type_cache))]

    return generated
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def _field_type_to_stub_str(field_type: dict, optional: bool = False) -> str:
|
|
227
|
+
"""Convert a schema field type to a Python type string for stub files.
|
|
228
|
+
|
|
229
|
+
Args:
|
|
230
|
+
field_type: Field type dict with '$type' and type-specific fields.
|
|
231
|
+
optional: Whether this field is optional (can be None).
|
|
232
|
+
|
|
233
|
+
Returns:
|
|
234
|
+
String representation of the Python type for use in .pyi files.
|
|
235
|
+
"""
|
|
236
|
+
type_str = field_type.get("$type", "")
|
|
237
|
+
|
|
238
|
+
# Extract kind from $type
|
|
239
|
+
if "#" in type_str:
|
|
240
|
+
kind = type_str.split("#")[-1]
|
|
241
|
+
else:
|
|
242
|
+
kind = field_type.get("kind", "")
|
|
243
|
+
|
|
244
|
+
if kind == "primitive":
|
|
245
|
+
primitive = field_type.get("primitive", "str")
|
|
246
|
+
py_type = primitive # str, int, float, bool, bytes are all valid Python type names
|
|
247
|
+
elif kind == "ndarray":
|
|
248
|
+
py_type = "NDArray[Any]"
|
|
249
|
+
elif kind == "array":
|
|
250
|
+
items = field_type.get("items")
|
|
251
|
+
if items:
|
|
252
|
+
item_type = _field_type_to_stub_str(items, optional=False)
|
|
253
|
+
py_type = f"list[{item_type}]"
|
|
254
|
+
else:
|
|
255
|
+
py_type = "list[Any]"
|
|
256
|
+
elif kind == "ref":
|
|
257
|
+
# Reference to another schema - use Any for now
|
|
258
|
+
py_type = "Any"
|
|
259
|
+
else:
|
|
260
|
+
py_type = "Any"
|
|
261
|
+
|
|
262
|
+
if optional:
|
|
263
|
+
return f"{py_type} | None"
|
|
264
|
+
return py_type
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def generate_stub(schema: dict) -> str:
    """Generate .pyi stub file content for a schema.

    Creates type stub content that can be saved to a .pyi file to provide
    IDE autocomplete and type checking support for dynamically decoded
    sample types.

    Note:
        Types created by ``schema_to_type()`` work correctly at runtime but
        static type checkers cannot analyze dynamically generated classes.
        Stub files bridge this gap by providing static type information.

    Args:
        schema: Schema record dict with 'name', 'version', 'fields', etc.

    Returns:
        String content for a .pyi stub file.

    Example:
        ::

            >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
            >>> stub_content = generate_stub(schema.to_dict())
            >>> # Save to a stubs directory configured in your IDE
            >>> with open("stubs/my_sample.pyi", "w") as f:
            ...     f.write(stub_content)
    """
    name = schema.get("name", "UnknownSample")
    version = schema.get("version", "1.0.0")
    fields = schema.get("fields", [])

    # Resolve each field's stub type once; the original recomputed
    # _field_type_to_stub_str per field for both the annotation block and
    # the __init__ signature. The (name, type, optional) triples are reused
    # in both passes below.
    resolved = [
        (
            field_def.get("name", "unknown"),
            _field_type_to_stub_str(
                field_def.get("fieldType", {}),
                field_def.get("optional", False),
            ),
            field_def.get("optional", False),
        )
        for field_def in fields
    ]

    lines = [
        "# Auto-generated stub for dynamically decoded schema",
        f"# Schema: {name}@{version}",
        "#",
        "# Save this file to a stubs directory and configure your IDE to include it.",
        "# For VS Code/Pylance: add to python.analysis.extraPaths in settings.json",
        "# For PyCharm: mark the stubs directory as Sources Root",
        "",
        "from typing import Any",
        "from numpy.typing import NDArray",
        "from atdata import PackableSample",
        "",
        f"class {name}(PackableSample):",
        f'    """Dynamically decoded sample type from schema {name}@{version}."""',
    ]

    # Field annotations (or a bare 'pass' when the schema has no fields).
    if resolved:
        lines.extend(f"    {fname}: {ftype}" for fname, ftype, _ in resolved)
    else:
        lines.append("    pass")

    # __init__ signature: optional fields carry a '= None' default.
    lines.append("")
    init_params = ["self"] + [
        f"{fname}: {ftype} = None" if is_optional else f"{fname}: {ftype}"
        for fname, ftype, is_optional in resolved
    ]
    lines.append(f"    def __init__({', '.join(init_params)}) -> None: ...")
    lines.append("")

    return "\n".join(lines)
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def generate_module(schema: dict) -> str:
    """Generate an importable Python module defining the schema's sample type.

    Unlike stub files, the emitted module can be imported at runtime,
    allowing ``decode_schema`` to return properly typed classes. The
    generated class inherits from PackableSample and uses the @dataclass
    decorator for proper initialization, giving both runtime functionality
    and static type checking support.

    Args:
        schema: Schema record dict with 'name', 'version', 'fields', etc.

    Returns:
        String content for a .py module file.

    Example:
        ::

            >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
            >>> module_content = generate_module(schema.to_dict())
            >>> # The module can be imported after being saved
    """
    name = schema.get("name", "UnknownSample")
    version = schema.get("version", "1.0.0")
    fields = schema.get("fields", [])

    header = [
        '"""Auto-generated module for dynamically decoded schema.',
        "",
        f"Schema: {name}@{version}",
        "",
        "This module is auto-generated by atdata to provide IDE autocomplete",
        "and type checking support for dynamically decoded schema types.",
        '"""',
        "",
        "from dataclasses import dataclass",
        "from typing import Any",
        "from numpy.typing import NDArray",
        "from atdata import PackableSample",
        "",
        "",
        "@dataclass",
        f"class {name}(PackableSample):",
        f'    """Dynamically decoded sample type from schema {name}@{version}."""',
        "",
    ]

    body: list[str] = []
    for field_def in fields:
        fname = field_def.get("name", "unknown")
        ftype = _field_type_to_stub_str(
            field_def.get("fieldType", {}),
            field_def.get("optional", False),
        )
        # Optional fields carry a None default so instantiation may omit them.
        suffix = " = None" if field_def.get("optional", False) else ""
        body.append(f"    {fname}: {ftype}{suffix}")
    if not body:
        body.append("    pass")

    footer = ["", "", f"__all__ = [{name!r}]", ""]

    return "\n".join(header + body + footer)
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def clear_type_cache() -> None:
    """Clear the cached generated types.

    Empties the module-level ``_type_cache`` in place, so subsequent
    ``schema_to_type()`` calls regenerate classes from scratch.

    Useful for testing or when schema definitions change.
    """
    _type_cache.clear()
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def get_cached_types() -> dict[str, Type[PackableSample]]:
    """Return a snapshot of the current type cache.

    Returns:
        A shallow copy mapping cache keys to their generated types; mutating
        the returned dict does not affect the cache itself.
    """
    return _type_cache.copy()
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
# Public API of the schema codec module; helpers prefixed with '_' stay private.
__all__ = [
    "schema_to_type",
    "generate_stub",
    "generate_module",
    "clear_type_cache",
    "get_cached_types",
]
|