atdata 0.3.0b1-py3-none-any.whl → 0.3.1b1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +9 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +33 -1
- atdata/_protocols.py +64 -182
- atdata/_schema_codec.py +2 -2
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +12 -11
- atdata/atmosphere/_types.py +4 -4
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +11 -12
- atdata/atmosphere/records.py +9 -10
- atdata/atmosphere/schema.py +14 -16
- atdata/atmosphere/store.py +6 -7
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +155 -2
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_index.py +322 -64
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +121 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
- atdata/lexicons/ac.foundation.dataset.record.json +96 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/promote.py +14 -10
- atdata/repository.py +7 -7
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +123 -0
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +2 -2
- atdata-0.3.1b1.dist-info/RECORD +67 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- /atdata/{local → index}/_entry.py +0 -0
- /atdata/{local → stores}/_s3.py +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
atdata/__init__.py
CHANGED
@@ -44,6 +44,7 @@ from .dataset import (
     SampleBatch as SampleBatch,
     Dataset as Dataset,
     packable as packable,
+    write_samples as write_samples,
 )

 from .lens import (
@@ -96,6 +97,14 @@ from .repository import (
     create_repository as create_repository,
 )

+from .index import (
+    Index as Index,
+)
+
+from .stores import (
+    LocalDiskStore as LocalDiskStore,
+)
+
 from ._cid import (
     generate_cid as generate_cid,
     verify_cid as verify_cid,
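With these additions, the 0.3.1b1 top level re-exports the new index and store types directly. A doctest-style sketch in the spirit of the docstring examples below (only the imported names come from this diff; the constructor call and `image_ds` are illustrative assumptions):

    >>> from atdata import Index, LocalDiskStore, write_samples
    >>> index = Index()                                   # assumed default constructor
    >>> entry = index.insert_dataset(image_ds, name="images")
    >>> SampleType = index.decode_schema(entry.schema_ref)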
atdata/_cid.py
CHANGED
@@ -116,29 +116,8 @@ def verify_cid(cid: str, data: Any) -> bool:
     return cid == expected_cid


-def parse_cid(cid: str) -> dict:
-    """Parse a CID string into its components.
-
-    Args:
-        cid: CID string to parse.
-
-    Returns:
-        Dictionary with 'version', 'codec', and 'hash' keys.
-        The 'hash' value is itself a dict with 'code', 'size', and 'digest'.
-
-    Examples:
-        >>> info = parse_cid('bafyrei...')
-        >>> info['version']
-        1
-        >>> info['codec']
-        113  # 0x71 = dag-cbor
-    """
-    return libipld.decode_cid(cid)
-
-
 __all__ = [
     "generate_cid",
     "generate_cid_from_bytes",
     "verify_cid",
-    "parse_cid",
 ]
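Since the removed `parse_cid` was a one-line wrapper around `libipld.decode_cid`, callers that still need it can inline the equivalent themselves. A minimal sketch of that replacement:

    import libipld

    def parse_cid(cid: str) -> dict:
        # Same behavior as the removed helper: returns a dict with 'version',
        # 'codec', and 'hash' (itself {'code', 'size', 'digest'}).
        return libipld.decode_cid(cid)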
atdata/_helpers.py
CHANGED
@@ -65,10 +65,22 @@ def bytes_to_array(b: bytes) -> np.ndarray:
         return np.load(BytesIO(b), allow_pickle=True)

     # Compact format: dtype_len(1B) + dtype_str + ndim(1B) + shape(ndim×8B) + data
+    if len(b) < 2:
+        raise ValueError(f"Array buffer too short ({len(b)} bytes): need at least 2")
     dlen = b[0]
+    min_header = 2 + dlen  # dtype_len + dtype_str + ndim
+    if len(b) < min_header:
+        raise ValueError(
+            f"Array buffer too short ({len(b)} bytes): need at least {min_header} for header"
+        )
     dtype = np.dtype(b[1 : 1 + dlen].decode())
     ndim = b[1 + dlen]
     offset = 2 + dlen
+    min_with_shape = offset + ndim * 8
+    if len(b) < min_with_shape:
+        raise ValueError(
+            f"Array buffer too short ({len(b)} bytes): need at least {min_with_shape} for shape"
+        )
     shape = struct.unpack_from(f"<{ndim}q", b, offset)
     offset += ndim * 8
     return np.frombuffer(b, dtype=dtype, offset=offset).reshape(shape).copy()
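For context on what the new guards validate, here is a sketch of an encoder that produces the compact layout described in the comment above (`pack_compact` is an illustrative name, not the library's actual writer):

    import struct
    import numpy as np

    def pack_compact(arr: np.ndarray) -> bytes:
        # dtype_len(1B) + dtype_str + ndim(1B) + shape(ndim x 8B, little-endian int64) + data
        dtype_str = arr.dtype.str.encode()
        header = bytes([len(dtype_str)]) + dtype_str + bytes([arr.ndim])
        shape = struct.pack(f"<{arr.ndim}q", *arr.shape)
        return header + shape + arr.tobytes()

With the added checks, a buffer truncated in any of the three regions now raises a descriptive ValueError from `bytes_to_array` rather than failing deeper inside numpy or struct.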
atdata/_hf_api.py
CHANGED
@@ -32,6 +32,7 @@ import re
 import threading
 from pathlib import Path
 from typing import (
+    Any,
     TYPE_CHECKING,
     Generic,
     Mapping,
@@ -65,7 +66,7 @@ def get_default_index() -> "Index":  # noqa: F821
     """Get or create the module-level default Index.

     The default Index uses Redis for local storage (backwards-compatible
-    default) and an anonymous
+    default) and an anonymous Atmosphere for read-only public data
     resolution.

     The default is created lazily on first access and cached for the
@@ -189,6 +190,37 @@ class DatasetDict(Generic[ST], dict):
         """
         return {name: len(ds.list_shards()) for name, ds in self.items()}

+    # Methods proxied to the sole Dataset when only one split exists.
+    _DATASET_METHODS = frozenset(
+        {
+            "ordered",
+            "shuffled",
+            "as_type",
+            "list_shards",
+            "head",
+        }
+    )
+
+    def __getattr__(self, name: str) -> Any:
+        """Proxy common Dataset methods when this dict has exactly one split.
+
+        When a ``DatasetDict`` contains a single split, calling iteration
+        methods like ``.ordered()`` or ``.shuffled()`` is forwarded to the
+        contained ``Dataset`` for convenience. Multi-split dicts raise
+        ``AttributeError`` with a hint to select a split explicitly.
+        """
+        if name in self._DATASET_METHODS:
+            if len(self) == 1:
+                return getattr(next(iter(self.values())), name)
+            splits = ", ".join(f"'{k}'" for k in self.keys())
+            raise AttributeError(
+                f"'{type(self).__name__}' has {len(self)} splits ({splits}). "
+                f"Select one first, e.g. ds_dict['{next(iter(self.keys()))}'].{name}()"
+            )
+        raise AttributeError(
+            f"'{type(self).__name__}' object has no attribute '{name}'"
+        )
+

 ##
 # Path resolution utilities
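A quick behavior sketch of the new single-split proxying (doctest style; `train_ds` and `test_ds` stand in for Dataset instances, and building a DatasetDict from a plain dict is assumed here for illustration):

    >>> dd = DatasetDict({"train": train_ds})
    >>> samples = dd.ordered()          # forwarded to train_ds.ordered()
    >>> dd = DatasetDict({"train": train_ds, "test": test_ds})
    >>> dd.ordered()                    # AttributeError: 2 splits; select one, e.g. dd['train'].ordered()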
atdata/_protocols.py
CHANGED
@@ -1,37 +1,25 @@
 """Protocol definitions for atdata index and storage abstractions.

-
-
-
-The key insight is that both local and atmosphere implementations solve the
-same problem: indexed dataset storage with external data URLs. These protocols
-formalize that common interface.
-
-Note:
-    Protocol methods use ``...`` (Ellipsis) as the body per PEP 544. This is
-    the standard Python syntax for Protocol definitions - these are interface
-    specifications, not stub implementations. Concrete classes (Index,
-    AtmosphereIndex, etc.) provide the actual implementations.
+Defines the abstract protocols that enable interchangeable index backends
+(local SQLite/Redis vs ATProto PDS) and data stores (S3, local disk, PDS blobs).

 Protocols:
-    Packable: Structural interface for packable sample types
+    Packable: Structural interface for packable sample types
     IndexEntry: Common interface for dataset index entries
     AbstractIndex: Protocol for index operations (schemas, datasets, lenses)
     AbstractDataStore: Protocol for data storage operations
+    DataSource: Protocol for streaming shard data

 Examples:
     >>> def process_datasets(index: AbstractIndex) -> None:
     ...     for entry in index.list_datasets():
     ...         print(f"{entry.name}: {entry.data_urls}")
-    ...
-    >>> # Works with either Index or AtmosphereIndex
-    >>> process_datasets(local_index)
-    >>> process_datasets(atmosphere_index)
 """

 from typing import (
     IO,
     Any,
+    Iterable,
     Iterator,
     Optional,
     Protocol,
@@ -115,7 +103,7 @@ class IndexEntry(Protocol):
         """Schema reference string.

         Local: ``local://schemas/{module.Class}@{version}``
-        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.
+        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.schema/...``
         """
         ...

@@ -137,32 +125,16 @@


 class AbstractIndex(Protocol):
-    """Protocol for index operations
-
-    This protocol defines the common interface for managing dataset metadata:
-    - Publishing and retrieving schemas
-    - Inserting and listing datasets
-    - (Future) Publishing and retrieving lenses
-
-    A single index can hold datasets of many different sample types. The sample
-    type is tracked via schema references, not as a generic parameter on the index.
+    """Protocol for index operations — implemented by Index and AtmosphereIndex.

-
-
-
-    If present, ``load_dataset`` will use it for S3 credential resolution.
+    Manages dataset metadata: publishing/retrieving schemas, inserting/listing
+    datasets. A single index holds datasets of many sample types, tracked via
+    schema references.

     Examples:
         >>> def publish_and_list(index: AbstractIndex) -> None:
-        ...
-        ...     schema1 = index.publish_schema(ImageSample, version="1.0.0")
-        ...     schema2 = index.publish_schema(TextSample, version="1.0.0")
-        ...
-        ...     # Insert datasets of different types
+        ...     index.publish_schema(ImageSample, version="1.0.0")
         ...     index.insert_dataset(image_ds, name="images")
-        ...     index.insert_dataset(text_ds, name="texts")
-        ...
-        ...     # List all datasets (mixed types)
         ...     for entry in index.list_datasets():
         ...         print(f"{entry.name} -> {entry.schema_ref}")
     """
@@ -171,55 +143,58 @@ class AbstractIndex(Protocol):
     def data_store(self) -> Optional["AbstractDataStore"]:
         """Optional data store for reading/writing shards.

-        If present, ``load_dataset``
-
-
-        Returns:
-            AbstractDataStore instance, or None if this index doesn't have
-            an associated data store.
-
-        Note:
-            Not all index implementations provide a data_store. Use
-            ``hasattr(index, 'data_store') and index.data_store is not None``
-            for safe access.
+        If present, ``load_dataset`` uses it for credential resolution.
+        Not all implementations provide a data_store; check with
+        ``getattr(index, 'data_store', None)``.
         """
         ...

     # Dataset operations

-    def
+    def write(
         self,
-
+        samples: Iterable,
         *,
         name: str,
         schema_ref: Optional[str] = None,
         **kwargs,
     ) -> IndexEntry:
-        """
+        """Write samples and create an index entry in one step.

-
-
+        Serializes samples to WebDataset tar files, stores them via the
+        appropriate backend, and creates an index entry.

         Args:
-
-            name:
-            schema_ref: Optional
-
-            **kwargs: Additional backend-specific options.
+            samples: Iterable of Packable samples. Must be non-empty.
+            name: Dataset name, optionally prefixed with target backend.
+            schema_ref: Optional schema reference.
+            **kwargs: Backend-specific options (maxcount, description, etc.).

         Returns:
-            IndexEntry for the
+            IndexEntry for the created dataset.
         """
         ...

-    def
-
+    def insert_dataset(
+        self,
+        ds: "Dataset",
+        *,
+        name: str,
+        schema_ref: Optional[str] = None,
+        **kwargs,
+    ) -> IndexEntry:
+        """Register an existing dataset in the index.

         Args:
-
+            ds: The Dataset to register.
+            name: Human-readable name.
+            schema_ref: Explicit schema ref; auto-published if ``None``.
+            **kwargs: Backend-specific options.
+        """
+        ...

-
-
+    def get_dataset(self, ref: str) -> IndexEntry:
+        """Get a dataset entry by name or reference.

         Raises:
             KeyError: If dataset not found.
@@ -242,33 +217,19 @@ class AbstractIndex(Protocol):
     ) -> str:
         """Publish a schema for a sample type.

-        The sample_type is accepted as ``type`` rather than ``Type[Packable]`` to
-        support ``@packable``-decorated classes, which satisfy the Packable protocol
-        at runtime but cannot be statically verified by type checkers.
-
         Args:
-            sample_type: A Packable type (
-
-
-            **kwargs: Additional backend-specific options.
+            sample_type: A Packable type (``@packable``-decorated or subclass).
+            version: Semantic version string.
+            **kwargs: Backend-specific options.

         Returns:
-            Schema reference string
-            - Local: 'local://schemas/{module.Class}@{version}'
-            - Atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+            Schema reference string (``local://...`` or ``at://...``).
         """
         ...

     def get_schema(self, ref: str) -> dict:
         """Get a schema record by reference.

-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            Schema record as a dictionary with fields like 'name', 'version',
-            'fields', etc.
-
         Raises:
             KeyError: If schema not found.
         """
@@ -280,30 +241,15 @@ class AbstractIndex(Protocol):
     def list_schemas(self) -> list[dict]: ...

     def decode_schema(self, ref: str) -> Type[Packable]:
-        """Reconstruct a
-
-        This method enables loading datasets without knowing the sample type
-        ahead of time. The index retrieves the schema record and dynamically
-        generates a Packable class matching the schema definition.
-
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            A dynamically generated Packable class with fields matching
-            the schema definition. The class can be used with
-            ``Dataset[T]`` to load and iterate over samples.
+        """Reconstruct a Packable type from a stored schema.

         Raises:
             KeyError: If schema not found.
-            ValueError: If schema
+            ValueError: If schema has unsupported field types.

         Examples:
-            >>> entry = index.get_dataset("my-dataset")
             >>> SampleType = index.decode_schema(entry.schema_ref)
             >>> ds = Dataset[SampleType](entry.data_urls[0])
-            >>> for sample in ds.ordered():
-            ...     print(sample)  # sample is instance of SampleType
         """
         ...

@@ -313,21 +259,14 @@ class AbstractIndex(Protocol):


 class AbstractDataStore(Protocol):
-    """Protocol for data storage
-
-    This protocol abstracts over different storage backends for dataset data:
-    - S3DataStore: S3-compatible object storage
-    - PDSBlobStore: ATProto PDS blob storage (future)
+    """Protocol for data storage backends (S3, local disk, PDS blobs).

-
-    flexible deployment
-    S3 storage, or atmosphere index with PDS blobs.
+    Separates index (metadata) from data store (shard files), enabling
+    flexible deployment combinations.

     Examples:
         >>> store = S3DataStore(credentials, bucket="my-bucket")
         >>> urls = store.write_shards(dataset, prefix="training/v1")
-        >>> print(urls)
-        ['s3://my-bucket/training/v1/shard-000000.tar', ...]
     """

     def write_shards(
@@ -341,28 +280,16 @@ class AbstractDataStore(Protocol):

         Args:
             ds: The Dataset to write.
-            prefix: Path prefix
-            **kwargs: Backend-specific options (
+            prefix: Path prefix (e.g., ``'datasets/mnist/v1'``).
+            **kwargs: Backend-specific options (``maxcount``, ``maxsize``, etc.).

         Returns:
-            List of URLs
-            WebDataset or atdata.Dataset().
+            List of shard URLs suitable for ``atdata.Dataset()``.
         """
         ...

     def read_url(self, url: str) -> str:
-        """Resolve a storage URL for reading.
-
-        Some storage backends may need to transform URLs (e.g., signing S3 URLs
-        or resolving blob references). This method returns a URL that can be
-        used directly with WebDataset.
-
-        Args:
-            url: Storage URL to resolve.
-
-        Returns:
-            WebDataset-compatible URL for reading.
-        """
+        """Resolve a storage URL for reading (e.g., sign S3 URLs)."""
         ...

     def supports_streaming(self) -> bool: ...
@@ -374,77 +301,32 @@ class AbstractDataStore(Protocol):

 @runtime_checkable
 class DataSource(Protocol):
-    """Protocol for data sources that
-
-    A DataSource abstracts over different ways of accessing dataset shards:
-    - URLSource: Standard WebDataset-compatible URLs (http, https, pipe, gs, etc.)
-    - S3Source: S3-compatible storage with explicit credentials
-    - BlobSource: ATProto blob references (future)
+    """Protocol for data sources that stream shard data to Dataset.

-
-
-    resolution
-
-    - Custom endpoints (Cloudflare R2, MinIO)
-    - ATProto blob streaming
-    - Any other source that can provide file-like objects
+    Implementations (URLSource, S3Source, BlobSource) yield
+    ``(identifier, stream)`` pairs fed to WebDataset's tar expander,
+    bypassing URL resolution. This enables private S3, custom endpoints,
+    and ATProto blob streaming.

     Examples:
-        >>> source = S3Source(
-        ...     bucket="my-bucket",
-        ...     keys=["data-000.tar", "data-001.tar"],
-        ...     endpoint="https://r2.example.com",
-        ...     credentials=creds,
-        ... )
+        >>> source = S3Source(bucket="my-bucket", keys=["data-000.tar"])
         >>> ds = Dataset[MySample](source)
-        >>> for sample in ds.ordered():
-        ...     print(sample)
     """

     @property
     def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
-        """Lazily yield (
-
-        The identifier is used for error messages and __url__ metadata.
-        The stream must be a file-like object that can be read by tarfile.
-
-        Yields:
-            Tuple of (shard_identifier, file_like_stream).
-
-        Examples:
-            >>> for shard_id, stream in source.shards:
-            ...     print(f"Processing {shard_id}")
-            ...     data = stream.read()
-        """
+        """Lazily yield ``(shard_id, stream)`` pairs for each shard."""
         ...

     def list_shards(self) -> list[str]:
-        """
-
-        Used for metadata queries like counting shards without actually
-        streaming data. Implementations should return identifiers that
-        match what shards would yield.
-
-        Returns:
-            List of shard identifier strings.
-        """
+        """Shard identifiers without opening streams."""
         ...

     def open_shard(self, shard_id: str) -> IO[bytes]:
-        """Open a single shard
-
-        This method enables random access to individual shards, which is
-        required for PyTorch DataLoader worker splitting. Each worker opens
-        only its assigned shards rather than iterating all shards.
-
-        Args:
-            shard_id: Shard identifier from list_shards().
-
-        Returns:
-            File-like stream for reading the shard.
+        """Open a single shard for random access (e.g., DataLoader splitting).

         Raises:
-            KeyError: If shard_id is not in list_shards()
+            KeyError: If *shard_id* is not in ``list_shards()``.
         """
         ...

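Because `DataSource` is `@runtime_checkable`, any object exposing the three members above satisfies it structurally. A minimal sketch of a conforming source that serves shards from a local directory (`LocalTarSource` is illustrative and not part of the package):

    from pathlib import Path
    from typing import IO, Iterator

    class LocalTarSource:
        """Illustrative DataSource: serve .tar shards from a local directory."""

        def __init__(self, root: str) -> None:
            self._root = Path(root)

        @property
        def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
            # Lazily yield (shard_id, stream) pairs, matching DataSource.shards.
            for shard_id in self.list_shards():
                yield shard_id, self.open_shard(shard_id)

        def list_shards(self) -> list[str]:
            # Identifiers only; no streams are opened.
            return sorted(p.name for p in self._root.glob("*.tar"))

        def open_shard(self, shard_id: str) -> IO[bytes]:
            # Random access, e.g. for DataLoader worker splitting.
            if shard_id not in self.list_shards():
                raise KeyError(shard_id)
            return (self._root / shard_id).open("rb")

An instance of such a class can then be handed to `Dataset[MySample](source)` as in the docstring example above.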
atdata/_schema_codec.py
CHANGED
@@ -284,7 +284,7 @@ def generate_stub(schema: dict) -> str:
         String content for a .pyi stub file.

     Examples:
-        >>> schema = index.get_schema("atdata://local/
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
         >>> stub_content = generate_stub(schema.to_dict())
         >>> # Save to a stubs directory configured in your IDE
         >>> with open("stubs/my_sample.pyi", "w") as f:
@@ -360,7 +360,7 @@ def generate_module(schema: dict) -> str:
         String content for a .py module file.

     Examples:
-        >>> schema = index.get_schema("atdata://local/
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
         >>> module_content = generate_module(schema.to_dict())
         >>> # The module can be imported after being saved
     """
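Both corrected examples end with a saved `.py` module. For completeness, a hedged sketch of loading such a file afterwards (the path and module name are illustrative):

    import importlib.util

    spec = importlib.util.spec_from_file_location(
        "my_sample_1_0_0", "generated/my_sample_1_0_0.py"
    )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)   # the generated Packable class should now be an attribute of `module`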
atdata/_stub_manager.py
CHANGED
@@ -15,7 +15,7 @@ Examples:
     >>> index = Index(auto_stubs=True)
     >>>
     >>> # Modules are generated automatically on decode_schema
-    >>> MyType = index.decode_schema("atdata://local/
+    >>> MyType = index.decode_schema("atdata://local/schema/MySample@1.0.0")
     >>> # MyType is now properly typed for IDE autocomplete!
     >>>
     >>> # Get the stub directory path for IDE configuration
@@ -51,8 +51,8 @@ def _extract_authority(schema_ref: Optional[str]) -> str:
     """Extract authority from a schema reference URI.

     Args:
-        schema_ref: Schema ref like "atdata://local/
-            or "atdata://alice.bsky.social/
+        schema_ref: Schema ref like "atdata://local/schema/Name@1.0.0"
+            or "atdata://alice.bsky.social/schema/Name@1.0.0"

     Returns:
         Authority string (e.g., "local", "alice.bsky.social", "did_plc_xxx").
@@ -149,10 +149,6 @@ class StubManager:
         safe_version = version.replace(".", "_")
         return f"{name}_{safe_version}.py"

-    def _stub_filename(self, name: str, version: str) -> str:
-        """Alias for _module_filename for backwards compatibility."""
-        return self._module_filename(name, version)
-
     def _module_path(
         self, name: str, version: str, authority: str = DEFAULT_AUTHORITY
     ) -> Path:
@@ -168,12 +164,6 @@ class StubManager:
         """
         return self._stub_dir / authority / self._module_filename(name, version)

-    def _stub_path(
-        self, name: str, version: str, authority: str = DEFAULT_AUTHORITY
-    ) -> Path:
-        """Alias for _module_path for backwards compatibility."""
-        return self._module_path(name, version, authority)
-
     def _module_is_current(self, path: Path, version: str) -> bool:
         """Check if an existing module file matches the expected version.

@@ -200,10 +190,6 @@ class StubManager:
         except (OSError, IOError):
             return False

-    def _stub_is_current(self, path: Path, version: str) -> bool:
-        """Alias for _module_is_current for backwards compatibility."""
-        return self._module_is_current(path, version)
-
     def _ensure_authority_package(self, authority: str) -> None:
         """Ensure authority subdirectory exists with __init__.py."""
         self._ensure_dir_exists()
@@ -261,12 +247,6 @@ class StubManager:
                 pass  # Temp file cleanup failed, re-raising original error
             raise

-    def _write_stub_atomic(self, path: Path, content: str) -> None:
-        """Legacy method - extracts authority from path and calls _write_module_atomic."""
-        # Extract authority from path (parent directory name)
-        authority = path.parent.name
-        self._write_module_atomic(path, content, authority)
-
     def ensure_stub(self, schema: dict) -> Optional[Path]:
         """Ensure a module file exists for the given schema.

@@ -426,7 +406,7 @@ class StubManager:
         Returns:
             Path if stub exists, None otherwise
         """
-        path = self.
+        path = self._module_path(name, version, authority)
         return path if path.exists() else None

     def list_stubs(self, authority: Optional[str] = None) -> list[Path]:
@@ -513,7 +493,7 @@ class StubManager:
         Returns:
             True if file was removed, False if it didn't exist
         """
-        path = self.
+        path = self._module_path(name, version, authority)
         if path.exists():
             try:
                 path.unlink()