atdata 0.2.3b1-py3-none-any.whl → 0.3.1b1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/.gitignore +1 -0
- atdata/__init__.py +39 -0
- atdata/_cid.py +0 -21
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +41 -15
- atdata/_hf_api.py +95 -11
- atdata/_logging.py +70 -0
- atdata/_protocols.py +77 -238
- atdata/_schema_codec.py +7 -6
- atdata/_stub_manager.py +5 -25
- atdata/_type_utils.py +28 -2
- atdata/atmosphere/__init__.py +31 -20
- atdata/atmosphere/_types.py +4 -4
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +11 -12
- atdata/atmosphere/records.py +12 -12
- atdata/atmosphere/schema.py +16 -18
- atdata/atmosphere/store.py +6 -7
- atdata/cli/__init__.py +161 -175
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +11 -11
- atdata/cli/inspect.py +69 -0
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +583 -328
- atdata/index/__init__.py +54 -0
- atdata/index/_entry.py +157 -0
- atdata/index/_index.py +1198 -0
- atdata/index/_schema.py +380 -0
- atdata/lens.py +9 -2
- atdata/lexicons/__init__.py +121 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
- atdata/lexicons/ac.foundation.dataset.record.json +96 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +70 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +18 -14
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +123 -0
- atdata/stores/_s3.py +349 -0
- atdata/testing.py +341 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +5 -2
- atdata-0.3.1b1.dist-info/RECORD +67 -0
- atdata/local.py +0 -1720
- atdata-0.2.3b1.dist-info/RECORD +0 -28
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
atdata/_protocols.py
CHANGED
@@ -1,37 +1,25 @@
 """Protocol definitions for atdata index and storage abstractions.
 
-
-
-
-The key insight is that both local and atmosphere implementations solve the
-same problem: indexed dataset storage with external data URLs. These protocols
-formalize that common interface.
-
-Note:
-    Protocol methods use ``...`` (Ellipsis) as the body per PEP 544. This is
-    the standard Python syntax for Protocol definitions - these are interface
-    specifications, not stub implementations. Concrete classes (LocalIndex,
-    AtmosphereIndex, etc.) provide the actual implementations.
+Defines the abstract protocols that enable interchangeable index backends
+(local SQLite/Redis vs ATProto PDS) and data stores (S3, local disk, PDS blobs).
 
 Protocols:
-    Packable: Structural interface for packable sample types
+    Packable: Structural interface for packable sample types
     IndexEntry: Common interface for dataset index entries
     AbstractIndex: Protocol for index operations (schemas, datasets, lenses)
     AbstractDataStore: Protocol for data storage operations
+    DataSource: Protocol for streaming shard data
 
 Examples:
     >>> def process_datasets(index: AbstractIndex) -> None:
     ...     for entry in index.list_datasets():
     ...         print(f"{entry.name}: {entry.data_urls}")
-    ...
-    >>> # Works with either LocalIndex or AtmosphereIndex
-    >>> process_datasets(local_index)
-    >>> process_datasets(atmosphere_index)
 """
 
 from typing import (
     IO,
     Any,
+    Iterable,
     Iterator,
     Optional,
     Protocol,

@@ -77,24 +65,16 @@ class Packable(Protocol):
     """
 
     @classmethod
-    def from_data(cls, data: dict[str, Any]) -> "Packable":
-        """Create instance from unpacked msgpack data dictionary."""
-        ...
+    def from_data(cls, data: dict[str, Any]) -> "Packable": ...
 
     @classmethod
-    def from_bytes(cls, bs: bytes) -> "Packable":
-        """Create instance from raw msgpack bytes."""
-        ...
+    def from_bytes(cls, bs: bytes) -> "Packable": ...
 
     @property
-    def packed(self) -> bytes:
-        """Pack this sample's data into msgpack bytes."""
-        ...
+    def packed(self) -> bytes: ...
 
     @property
-    def as_wds(self) -> dict[str, Any]:
-        """WebDataset-compatible representation with __key__ and msgpack."""
-        ...
+    def as_wds(self) -> dict[str, Any]: ...
 
 
 ##

@@ -116,16 +96,14 @@ class IndexEntry(Protocol):
     """
 
     @property
-    def name(self) -> str:
-        """Human-readable dataset name."""
-        ...
+    def name(self) -> str: ...
 
     @property
     def schema_ref(self) -> str:
-        """
+        """Schema reference string.
 
-
-
+        Local: ``local://schemas/{module.Class}@{version}``
+        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.schema/...``
         """
         ...
 

@@ -139,9 +117,7 @@ class IndexEntry(Protocol):
         ...
 
     @property
-    def metadata(self) -> Optional[dict]:
-        """Arbitrary metadata dictionary, or None if not set."""
-        ...
+    def metadata(self) -> Optional[dict]: ...
 
 
 ##

@@ -149,32 +125,16 @@ class IndexEntry(Protocol):
 
 
 class AbstractIndex(Protocol):
-    """Protocol for index operations
-
-    This protocol defines the common interface for managing dataset metadata:
-    - Publishing and retrieving schemas
-    - Inserting and listing datasets
-    - (Future) Publishing and retrieving lenses
+    """Protocol for index operations — implemented by Index and AtmosphereIndex.
 
-
-
-
-    Optional Extensions:
-        Some index implementations support additional features:
-        - ``data_store``: An AbstractDataStore for reading/writing dataset shards.
-          If present, ``load_dataset`` will use it for S3 credential resolution.
+    Manages dataset metadata: publishing/retrieving schemas, inserting/listing
+    datasets. A single index holds datasets of many sample types, tracked via
+    schema references.
 
     Examples:
         >>> def publish_and_list(index: AbstractIndex) -> None:
-        ...
-        ...     schema1 = index.publish_schema(ImageSample, version="1.0.0")
-        ...     schema2 = index.publish_schema(TextSample, version="1.0.0")
-        ...
-        ...     # Insert datasets of different types
+        ...     index.publish_schema(ImageSample, version="1.0.0")
        ...     index.insert_dataset(image_ds, name="images")
-        ...     index.insert_dataset(text_ds, name="texts")
-        ...
-        ...     # List all datasets (mixed types)
         ...     for entry in index.list_datasets():
         ...         print(f"{entry.name} -> {entry.schema_ref}")
     """

@@ -183,55 +143,58 @@ class AbstractIndex(Protocol):
     def data_store(self) -> Optional["AbstractDataStore"]:
         """Optional data store for reading/writing shards.
 
-        If present, ``load_dataset``
-
-
-        Returns:
-            AbstractDataStore instance, or None if this index doesn't have
-            an associated data store.
-
-        Note:
-            Not all index implementations provide a data_store. Use
-            ``hasattr(index, 'data_store') and index.data_store is not None``
-            for safe access.
+        If present, ``load_dataset`` uses it for credential resolution.
+        Not all implementations provide a data_store; check with
+        ``getattr(index, 'data_store', None)``.
         """
         ...
 
     # Dataset operations
 
-    def
+    def write(
         self,
-
+        samples: Iterable,
         *,
         name: str,
         schema_ref: Optional[str] = None,
         **kwargs,
     ) -> IndexEntry:
-        """
+        """Write samples and create an index entry in one step.
 
-
-
+        Serializes samples to WebDataset tar files, stores them via the
+        appropriate backend, and creates an index entry.
 
         Args:
-
-            name:
-            schema_ref: Optional
-
-            **kwargs: Additional backend-specific options.
+            samples: Iterable of Packable samples. Must be non-empty.
+            name: Dataset name, optionally prefixed with target backend.
+            schema_ref: Optional schema reference.
+            **kwargs: Backend-specific options (maxcount, description, etc.).
 
         Returns:
-            IndexEntry for the
+            IndexEntry for the created dataset.
         """
         ...
 
-    def
-
+    def insert_dataset(
+        self,
+        ds: "Dataset",
+        *,
+        name: str,
+        schema_ref: Optional[str] = None,
+        **kwargs,
+    ) -> IndexEntry:
+        """Register an existing dataset in the index.
 
         Args:
-
+            ds: The Dataset to register.
+            name: Human-readable name.
+            schema_ref: Explicit schema ref; auto-published if ``None``.
+            **kwargs: Backend-specific options.
+        """
+        ...
 
-
-
+    def get_dataset(self, ref: str) -> IndexEntry:
+        """Get a dataset entry by name or reference.
 
         Raises:
             KeyError: If dataset not found.

@@ -239,21 +202,9 @@
         ...
 
     @property
-    def datasets(self) -> Iterator[IndexEntry]:
-        """Lazily iterate over all dataset entries in this index.
-
-        Yields:
-            IndexEntry for each dataset (may be of different sample types).
-        """
-        ...
-
-    def list_datasets(self) -> list[IndexEntry]:
-        """Get all dataset entries as a materialized list.
+    def datasets(self) -> Iterator[IndexEntry]: ...
 
-
-            List of IndexEntry for each dataset.
-        """
-        ...
+    def list_datasets(self) -> list[IndexEntry]: ...
 
     # Schema operations
 

@@ -266,80 +217,39 @@ class AbstractIndex(Protocol):
     ) -> str:
         """Publish a schema for a sample type.
 
-        The sample_type is accepted as ``type`` rather than ``Type[Packable]`` to
-        support ``@packable``-decorated classes, which satisfy the Packable protocol
-        at runtime but cannot be statically verified by type checkers.
-
         Args:
-            sample_type: A Packable type (
-
-
-            **kwargs: Additional backend-specific options.
+            sample_type: A Packable type (``@packable``-decorated or subclass).
+            version: Semantic version string.
+            **kwargs: Backend-specific options.
 
         Returns:
-            Schema reference string
-            - Local: 'local://schemas/{module.Class}@{version}'
-            - Atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+            Schema reference string (``local://...`` or ``at://...``).
         """
         ...
 
     def get_schema(self, ref: str) -> dict:
         """Get a schema record by reference.
 
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            Schema record as a dictionary with fields like 'name', 'version',
-            'fields', etc.
-
         Raises:
             KeyError: If schema not found.
         """
         ...
 
     @property
-    def schemas(self) -> Iterator[dict]:
-        """Lazily iterate over all schema records in this index.
-
-        Yields:
-            Schema records as dictionaries.
-        """
-        ...
+    def schemas(self) -> Iterator[dict]: ...
 
-    def list_schemas(self) -> list[dict]:
-        """Get all schema records as a materialized list.
-
-        Returns:
-            List of schema records as dictionaries.
-        """
-        ...
+    def list_schemas(self) -> list[dict]: ...
 
     def decode_schema(self, ref: str) -> Type[Packable]:
-        """Reconstruct a
-
-        This method enables loading datasets without knowing the sample type
-        ahead of time. The index retrieves the schema record and dynamically
-        generates a Packable class matching the schema definition.
-
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            A dynamically generated Packable class with fields matching
-            the schema definition. The class can be used with
-            ``Dataset[T]`` to load and iterate over samples.
+        """Reconstruct a Packable type from a stored schema.
 
         Raises:
             KeyError: If schema not found.
-            ValueError: If schema
+            ValueError: If schema has unsupported field types.
 
         Examples:
-            >>> entry = index.get_dataset("my-dataset")
             >>> SampleType = index.decode_schema(entry.schema_ref)
             >>> ds = Dataset[SampleType](entry.data_urls[0])
-            >>> for sample in ds.ordered():
-            ...     print(sample)  # sample is instance of SampleType
         """
         ...
 

@@ -349,21 +259,14 @@ class AbstractIndex(Protocol):
 
 
 class AbstractDataStore(Protocol):
-    """Protocol for data storage
-
-    This protocol abstracts over different storage backends for dataset data:
-    - S3DataStore: S3-compatible object storage
-    - PDSBlobStore: ATProto PDS blob storage (future)
+    """Protocol for data storage backends (S3, local disk, PDS blobs).
 
-
-    flexible deployment
-    S3 storage, or atmosphere index with PDS blobs.
+    Separates index (metadata) from data store (shard files), enabling
+    flexible deployment combinations.
 
     Examples:
         >>> store = S3DataStore(credentials, bucket="my-bucket")
         >>> urls = store.write_shards(dataset, prefix="training/v1")
-        >>> print(urls)
-        ['s3://my-bucket/training/v1/shard-000000.tar', ...]
     """
 
     def write_shards(

@@ -377,38 +280,19 @@ class AbstractDataStore(Protocol):
 
         Args:
             ds: The Dataset to write.
-            prefix: Path prefix
-            **kwargs: Backend-specific options (
+            prefix: Path prefix (e.g., ``'datasets/mnist/v1'``).
+            **kwargs: Backend-specific options (``maxcount``, ``maxsize``, etc.).
 
         Returns:
-            List of URLs
-            WebDataset or atdata.Dataset().
+            List of shard URLs suitable for ``atdata.Dataset()``.
         """
         ...
 
     def read_url(self, url: str) -> str:
-        """Resolve a storage URL for reading.
-
-        Some storage backends may need to transform URLs (e.g., signing S3 URLs
-        or resolving blob references). This method returns a URL that can be
-        used directly with WebDataset.
-
-        Args:
-            url: Storage URL to resolve.
-
-        Returns:
-            WebDataset-compatible URL for reading.
-        """
+        """Resolve a storage URL for reading (e.g., sign S3 URLs)."""
         ...
 
-    def supports_streaming(self) -> bool:
-        """Whether this store supports streaming reads.
-
-        Returns:
-            True if the store supports efficient streaming (like S3),
-            False if data must be fully downloaded first.
-        """
-        ...
+    def supports_streaming(self) -> bool: ...
 
 
 ##

@@ -417,77 +301,32 @@ class AbstractDataStore(Protocol):
 
 @runtime_checkable
 class DataSource(Protocol):
-    """Protocol for data sources that
+    """Protocol for data sources that stream shard data to Dataset.
 
-
-
-
-
-
-    The key method is ``shards()``, which yields (identifier, stream) pairs.
-    These are fed directly to WebDataset's tar_file_expander, bypassing URL
-    resolution entirely. This enables:
-    - Private S3 repos with credentials
-    - Custom endpoints (Cloudflare R2, MinIO)
-    - ATProto blob streaming
-    - Any other source that can provide file-like objects
+    Implementations (URLSource, S3Source, BlobSource) yield
+    ``(identifier, stream)`` pairs fed to WebDataset's tar expander,
+    bypassing URL resolution. This enables private S3, custom endpoints,
+    and ATProto blob streaming.
 
     Examples:
-        >>> source = S3Source(
-        ...     bucket="my-bucket",
-        ...     keys=["data-000.tar", "data-001.tar"],
-        ...     endpoint="https://r2.example.com",
-        ...     credentials=creds,
-        ... )
+        >>> source = S3Source(bucket="my-bucket", keys=["data-000.tar"])
         >>> ds = Dataset[MySample](source)
-        >>> for sample in ds.ordered():
-        ...     print(sample)
     """
 
     @property
     def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
-        """Lazily yield (
-
-        The identifier is used for error messages and __url__ metadata.
-        The stream must be a file-like object that can be read by tarfile.
-
-        Yields:
-            Tuple of (shard_identifier, file_like_stream).
-
-        Examples:
-            >>> for shard_id, stream in source.shards:
-            ...     print(f"Processing {shard_id}")
-            ...     data = stream.read()
-        """
+        """Lazily yield ``(shard_id, stream)`` pairs for each shard."""
         ...
 
     def list_shards(self) -> list[str]:
-        """
-
-        Used for metadata queries like counting shards without actually
-        streaming data. Implementations should return identifiers that
-        match what shards would yield.
-
-        Returns:
-            List of shard identifier strings.
-        """
+        """Shard identifiers without opening streams."""
         ...
 
     def open_shard(self, shard_id: str) -> IO[bytes]:
-        """Open a single shard
-
-        This method enables random access to individual shards, which is
-        required for PyTorch DataLoader worker splitting. Each worker opens
-        only its assigned shards rather than iterating all shards.
-
-        Args:
-            shard_id: Shard identifier from shard_list.
-
-        Returns:
-            File-like stream for reading the shard.
+        """Open a single shard for random access (e.g., DataLoader splitting).
 
         Raises:
-            KeyError: If shard_id is not in
+            KeyError: If *shard_id* is not in ``list_shards()``.
        """
        ...
atdata/_schema_codec.py
CHANGED
@@ -28,13 +28,14 @@ import hashlib
 
 from numpy.typing import NDArray
 
-# Import PackableSample for inheritance
+# Import PackableSample for inheritance in dynamic class generation
 from .dataset import PackableSample
+from ._protocols import Packable
 
 
 # Type cache to avoid regenerating identical types
 # Uses insertion order (Python 3.7+) for simple FIFO eviction
-_type_cache: dict[str, Type[
+_type_cache: dict[str, Type[Packable]] = {}
 _TYPE_CACHE_MAX_SIZE = 256
 
 

@@ -130,7 +131,7 @@ def schema_to_type(
     schema: dict,
     *,
     use_cache: bool = True,
-) -> Type[
+) -> Type[Packable]:
     """Generate a PackableSample subclass from a schema record.
 
     This function dynamically creates a dataclass that inherits from PackableSample,

@@ -283,7 +284,7 @@ def generate_stub(schema: dict) -> str:
         String content for a .pyi stub file.
 
     Examples:
-        >>> schema = index.get_schema("atdata://local/
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
         >>> stub_content = generate_stub(schema.to_dict())
         >>> # Save to a stubs directory configured in your IDE
         >>> with open("stubs/my_sample.pyi", "w") as f:

@@ -359,7 +360,7 @@ def generate_module(schema: dict) -> str:
         String content for a .py module file.
 
     Examples:
-        >>> schema = index.get_schema("atdata://local/
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
         >>> module_content = generate_module(schema.to_dict())
         >>> # The module can be imported after being saved
     """

@@ -420,7 +421,7 @@ def clear_type_cache() -> None:
     _type_cache.clear()
 
 
-def get_cached_types() -> dict[str, Type[
+def get_cached_types() -> dict[str, Type[Packable]]:
     """Get a copy of the current type cache.
 
     Returns:
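
The type-cache comments above describe FIFO eviction via dict insertion order, capped at _TYPE_CACHE_MAX_SIZE = 256. A self-contained sketch of that pattern (the cache-insertion helper itself is not shown in this diff):

    from typing import Any

    _type_cache: dict[str, Any] = {}
    _TYPE_CACHE_MAX_SIZE = 256

    def _cache_put(key: str, value: Any) -> None:
        # Dicts preserve insertion order (Python 3.7+), so the first key
        # returned by iter() is the oldest entry; pop it when the cap is hit.
        if len(_type_cache) >= _TYPE_CACHE_MAX_SIZE:
            _type_cache.pop(next(iter(_type_cache)))
        _type_cache[key] = value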
atdata/_stub_manager.py
CHANGED
@@ -15,7 +15,7 @@ Examples:
     >>> index = Index(auto_stubs=True)
     >>>
     >>> # Modules are generated automatically on decode_schema
-    >>> MyType = index.decode_schema("atdata://local/
+    >>> MyType = index.decode_schema("atdata://local/schema/MySample@1.0.0")
     >>> # MyType is now properly typed for IDE autocomplete!
     >>>
     >>> # Get the stub directory path for IDE configuration

@@ -51,8 +51,8 @@ def _extract_authority(schema_ref: Optional[str]) -> str:
     """Extract authority from a schema reference URI.
 
     Args:
-        schema_ref: Schema ref like "atdata://local/
-            or "atdata://alice.bsky.social/
+        schema_ref: Schema ref like "atdata://local/schema/Name@1.0.0"
+            or "atdata://alice.bsky.social/schema/Name@1.0.0"
 
     Returns:
         Authority string (e.g., "local", "alice.bsky.social", "did_plc_xxx").

@@ -149,10 +149,6 @@ class StubManager:
         safe_version = version.replace(".", "_")
         return f"{name}_{safe_version}.py"
 
-    def _stub_filename(self, name: str, version: str) -> str:
-        """Alias for _module_filename for backwards compatibility."""
-        return self._module_filename(name, version)
-
     def _module_path(
         self, name: str, version: str, authority: str = DEFAULT_AUTHORITY
     ) -> Path:

@@ -168,12 +164,6 @@ class StubManager:
         """
         return self._stub_dir / authority / self._module_filename(name, version)
 
-    def _stub_path(
-        self, name: str, version: str, authority: str = DEFAULT_AUTHORITY
-    ) -> Path:
-        """Alias for _module_path for backwards compatibility."""
-        return self._module_path(name, version, authority)
-
     def _module_is_current(self, path: Path, version: str) -> bool:
         """Check if an existing module file matches the expected version.
 

@@ -200,10 +190,6 @@ class StubManager:
         except (OSError, IOError):
             return False
 
-    def _stub_is_current(self, path: Path, version: str) -> bool:
-        """Alias for _module_is_current for backwards compatibility."""
-        return self._module_is_current(path, version)
-
     def _ensure_authority_package(self, authority: str) -> None:
         """Ensure authority subdirectory exists with __init__.py."""
         self._ensure_dir_exists()

@@ -261,12 +247,6 @@ class StubManager:
                 pass  # Temp file cleanup failed, re-raising original error
             raise
 
-    def _write_stub_atomic(self, path: Path, content: str) -> None:
-        """Legacy method - extracts authority from path and calls _write_module_atomic."""
-        # Extract authority from path (parent directory name)
-        authority = path.parent.name
-        self._write_module_atomic(path, content, authority)
-
     def ensure_stub(self, schema: dict) -> Optional[Path]:
         """Ensure a module file exists for the given schema.
 

@@ -426,7 +406,7 @@ class StubManager:
         Returns:
             Path if stub exists, None otherwise
         """
-        path = self.
+        path = self._module_path(name, version, authority)
         return path if path.exists() else None
 
     def list_stubs(self, authority: Optional[str] = None) -> list[Path]:

@@ -513,7 +493,7 @@ class StubManager:
         Returns:
             True if file was removed, False if it didn't exist
         """
-        path = self.
+        path = self._module_path(name, version, authority)
         if path.exists():
             try:
                 path.unlink()
atdata/_type_utils.py
CHANGED
@@ -45,9 +45,13 @@ def numpy_dtype_to_string(dtype: Any) -> str:
         Schema dtype string (e.g., "float32", "int64"). Defaults to "float32".
     """
     dtype_str = str(dtype)
-
+    # Exact match first (handles "float32", "int64", etc.)
+    if dtype_str in NUMPY_DTYPE_MAP:
+        return NUMPY_DTYPE_MAP[dtype_str]
+    # Substring match, longest keys first to avoid "int8" matching "uint8"
+    for key in sorted(NUMPY_DTYPE_MAP, key=len, reverse=True):
         if key in dtype_str:
-            return
+            return NUMPY_DTYPE_MAP[key]
     return "float32"
 
 

@@ -102,3 +106,25 @@ def extract_ndarray_dtype(python_type: Any) -> str:
     if dtype_arg is not None:
         return numpy_dtype_to_string(dtype_arg)
     return "float32"
+
+
+def parse_semver(version: str) -> tuple[int, int, int]:
+    """Parse a semantic version string into a comparable tuple.
+
+    Args:
+        version: A ``"major.minor.patch"`` version string.
+
+    Returns:
+        Tuple of (major, minor, patch) integers.
+
+    Raises:
+        ValueError: If the version string is not valid semver.
+
+    Examples:
+        >>> parse_semver("1.2.3")
+        (1, 2, 3)
+    """
+    parts = version.split(".")
+    if len(parts) != 3:
+        raise ValueError(f"Invalid semver: {version}")
+    return int(parts[0]), int(parts[1]), int(parts[2])