atdata-0.2.0a1-py3-none-any.whl → atdata-0.2.3b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/_protocols.py ADDED
@@ -0,0 +1,504 @@
+ """Protocol definitions for atdata index and storage abstractions.
+
+ This module defines the abstract protocols that enable interchangeable
+ index backends (local Redis vs ATProto PDS) and data stores (S3 vs PDS blobs).
+
+ The key insight is that both local and atmosphere implementations solve the
+ same problem: indexed dataset storage with external data URLs. These protocols
+ formalize that common interface.
+
+ Note:
+     Protocol methods use ``...`` (Ellipsis) as the body per PEP 544. This is
+     the standard Python syntax for Protocol definitions - these are interface
+     specifications, not stub implementations. Concrete classes (LocalIndex,
+     AtmosphereIndex, etc.) provide the actual implementations.
+
+ Protocols:
+     Packable: Structural interface for packable sample types (lens compatibility)
+     IndexEntry: Common interface for dataset index entries
+     AbstractIndex: Protocol for index operations (schemas, datasets, lenses)
+     AbstractDataStore: Protocol for data storage operations
+
+ Examples:
+     >>> def process_datasets(index: AbstractIndex) -> None:
+     ...     for entry in index.list_datasets():
+     ...         print(f"{entry.name}: {entry.data_urls}")
+     ...
+     >>> # Works with either LocalIndex or AtmosphereIndex
+     >>> process_datasets(local_index)
+     >>> process_datasets(atmosphere_index)
+ """
+
+ from typing import (
+     IO,
+     Any,
+     Iterator,
+     Optional,
+     Protocol,
+     Type,
+     TYPE_CHECKING,
+     runtime_checkable,
+ )
+
+ if TYPE_CHECKING:
+     from .dataset import Dataset
+
+
+ ##
+ # Packable Protocol (for lens type compatibility)
+
+
+ @runtime_checkable
+ class Packable(Protocol):
+     """Structural protocol for packable sample types.
+
+     This protocol allows classes decorated with ``@packable`` to be recognized
+     as valid types for lens transformations and schema operations, even though
+     the decorator doesn't change the class's nominal type at static analysis time.
+
+     Both ``PackableSample`` subclasses and ``@packable``-decorated classes
+     satisfy this protocol structurally.
+
+     The protocol captures the full interface needed for:
+     - Lens type transformations (as_wds, from_data)
+     - Schema publishing (class introspection via dataclass fields)
+     - Serialization/deserialization (packed, from_bytes)
+
+     Examples:
+         >>> @packable
+         ... class MySample:
+         ...     name: str
+         ...     value: int
+         ...
+         >>> def process(sample_type: Type[Packable]) -> None:
+         ...     # Type checker knows sample_type has from_bytes, packed, etc.
+         ...     instance = sample_type.from_bytes(data)
+         ...     print(instance.packed)
+     """
+
+     @classmethod
+     def from_data(cls, data: dict[str, Any]) -> "Packable":
+         """Create instance from unpacked msgpack data dictionary."""
+         ...
+
+     @classmethod
+     def from_bytes(cls, bs: bytes) -> "Packable":
+         """Create instance from raw msgpack bytes."""
+         ...
+
+     @property
+     def packed(self) -> bytes:
+         """Pack this sample's data into msgpack bytes."""
+         ...
+
+     @property
+     def as_wds(self) -> dict[str, Any]:
+         """WebDataset-compatible representation with __key__ and msgpack."""
+         ...
+
+
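Because Packable is ``@runtime_checkable``, satisfying it is purely structural. The sketch below is illustrative only and is not part of the package: a plain dataclass implementing the four members above with the third-party msgpack library. The ``"msgpack"`` key used inside ``as_wds`` and the import path ``atdata._protocols`` are assumptions.

    # Illustrative sketch -- not part of atdata. A minimal class that satisfies
    # the Packable protocol structurally, using msgpack for serialization.
    from dataclasses import asdict, dataclass
    from typing import Any

    import msgpack  # third-party dependency assumed for this sketch

    from atdata._protocols import Packable  # public import path may differ


    @dataclass
    class PointSample:
        x: float
        y: float
        label: str

        @classmethod
        def from_data(cls, data: dict[str, Any]) -> "PointSample":
            # Build an instance from an already-unpacked msgpack dictionary.
            return cls(**data)

        @classmethod
        def from_bytes(cls, bs: bytes) -> "PointSample":
            # Decode raw msgpack bytes, then delegate to from_data.
            return cls.from_data(msgpack.unpackb(bs, raw=False))

        @property
        def packed(self) -> bytes:
            # Serialize the dataclass fields to msgpack bytes.
            return msgpack.packb(asdict(self), use_bin_type=True)

        @property
        def as_wds(self) -> dict[str, Any]:
            # WebDataset-style dict; the "msgpack" key name is an assumption.
            return {"__key__": self.label, "msgpack": self.packed}


    sample = PointSample(1.0, 2.0, "point-0000")
    assert isinstance(sample, Packable)  # structural check at runtime
    assert PointSample.from_bytes(sample.packed) == sample  # round trip
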
+ ##
+ # IndexEntry Protocol
+
+
+ @runtime_checkable
+ class IndexEntry(Protocol):
+     """Common interface for index entries (local or atmosphere).
+
+     Both LocalDatasetEntry and atmosphere DatasetRecord-based entries
+     should satisfy this protocol, enabling code that works with either.
+
+     Properties:
+         name: Human-readable dataset name
+         schema_ref: Reference to schema (local:// path or AT URI)
+         data_urls: WebDataset URLs for the data
+         metadata: Arbitrary metadata dict, or None
+     """
+
+     @property
+     def name(self) -> str:
+         """Human-readable dataset name."""
+         ...
+
+     @property
+     def schema_ref(self) -> str:
+         """Reference to the schema for this dataset.
+
+         For local: 'local://schemas/{module.Class}@{version}'
+         For atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+         """
+         ...
+
+     @property
+     def data_urls(self) -> list[str]:
+         """WebDataset URLs for the data.
+
+         These are the URLs that can be passed to atdata.Dataset() or
+         used with WebDataset directly. May use brace notation for shards.
+         """
+         ...
+
+     @property
+     def metadata(self) -> Optional[dict]:
+         """Arbitrary metadata dictionary, or None if not set."""
+         ...
+
+
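IndexEntry is likewise structural, so any object exposing the four read-only attributes above satisfies it; a plain dataclass is enough for tests or glue code. A small illustrative sketch follows (StubEntry, the schema string, and the shard URL are made-up values, not records from a real index):

    # Illustrative sketch -- a stand-in entry plus a function that is generic
    # over IndexEntry, so it also accepts LocalDatasetEntry or an
    # atmosphere-backed entry.
    from dataclasses import dataclass
    from typing import Optional

    from atdata._protocols import IndexEntry  # public import path may differ


    @dataclass
    class StubEntry:
        name: str
        schema_ref: str
        data_urls: list[str]
        metadata: Optional[dict] = None


    def describe(entry: IndexEntry) -> str:
        # Only protocol members are touched here.
        return f"{entry.name} ({len(entry.data_urls)} shard URL(s)) -> {entry.schema_ref}"


    entry = StubEntry(
        name="mnist-train",
        schema_ref="local://schemas/examples.MnistSample@1.0.0",
        data_urls=["s3://bucket/mnist/train-{000000..000009}.tar"],
    )
    assert isinstance(entry, IndexEntry)
    print(describe(entry))
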
+ ##
+ # AbstractIndex Protocol
+
+
+ class AbstractIndex(Protocol):
+     """Protocol for index operations - implemented by LocalIndex and AtmosphereIndex.
+
+     This protocol defines the common interface for managing dataset metadata:
+     - Publishing and retrieving schemas
+     - Inserting and listing datasets
+     - (Future) Publishing and retrieving lenses
+
+     A single index can hold datasets of many different sample types. The sample
+     type is tracked via schema references, not as a generic parameter on the index.
+
+     Optional Extensions:
+         Some index implementations support additional features:
+         - ``data_store``: An AbstractDataStore for reading/writing dataset shards.
+           If present, ``load_dataset`` will use it for S3 credential resolution.
+
+     Examples:
+         >>> def publish_and_list(index: AbstractIndex) -> None:
+         ...     # Publish schemas for different types
+         ...     schema1 = index.publish_schema(ImageSample, version="1.0.0")
+         ...     schema2 = index.publish_schema(TextSample, version="1.0.0")
+         ...
+         ...     # Insert datasets of different types
+         ...     index.insert_dataset(image_ds, name="images")
+         ...     index.insert_dataset(text_ds, name="texts")
+         ...
+         ...     # List all datasets (mixed types)
+         ...     for entry in index.list_datasets():
+         ...         print(f"{entry.name} -> {entry.schema_ref}")
+     """
+
+     @property
+     def data_store(self) -> Optional["AbstractDataStore"]:
+         """Optional data store for reading/writing shards.
+
+         If present, ``load_dataset`` will use it for credential resolution
+         (e.g., S3 credentials from S3DataStore).
+
+         Returns:
+             AbstractDataStore instance, or None if this index doesn't have
+             an associated data store.
+
+         Note:
+             Not all index implementations provide a data_store. Use
+             ``hasattr(index, 'data_store') and index.data_store is not None``
+             for safe access.
+         """
+         ...
+
+     # Dataset operations
+
+     def insert_dataset(
+         self,
+         ds: "Dataset",
+         *,
+         name: str,
+         schema_ref: Optional[str] = None,
+         **kwargs,
+     ) -> IndexEntry:
+         """Insert a dataset into the index.
+
+         The sample type is inferred from ``ds.sample_type``. If schema_ref is not
+         provided, the schema may be auto-published based on the sample type.
+
+         Args:
+             ds: The Dataset to register in the index (any sample type).
+             name: Human-readable name for the dataset.
+             schema_ref: Optional explicit schema reference. If not provided,
+                 the schema may be auto-published or inferred from ds.sample_type.
+             **kwargs: Additional backend-specific options.
+
+         Returns:
+             IndexEntry for the inserted dataset.
+         """
+         ...
+
+     def get_dataset(self, ref: str) -> IndexEntry:
+         """Get a dataset entry by name or reference.
+
+         Args:
+             ref: Dataset name, path, or full reference string.
+
+         Returns:
+             IndexEntry for the dataset.
+
+         Raises:
+             KeyError: If dataset not found.
+         """
+         ...
+
+     @property
+     def datasets(self) -> Iterator[IndexEntry]:
+         """Lazily iterate over all dataset entries in this index.
+
+         Yields:
+             IndexEntry for each dataset (may be of different sample types).
+         """
+         ...
+
+     def list_datasets(self) -> list[IndexEntry]:
+         """Get all dataset entries as a materialized list.
+
+         Returns:
+             List of IndexEntry for each dataset.
+         """
+         ...
+
+     # Schema operations
+
+     def publish_schema(
+         self,
+         sample_type: type,
+         *,
+         version: str = "1.0.0",
+         **kwargs,
+     ) -> str:
+         """Publish a schema for a sample type.
+
+         The sample_type is accepted as ``type`` rather than ``Type[Packable]`` to
+         support ``@packable``-decorated classes, which satisfy the Packable protocol
+         at runtime but cannot be statically verified by type checkers.
+
+         Args:
+             sample_type: A Packable type (PackableSample subclass or @packable-decorated).
+                 Validated at runtime via the @runtime_checkable Packable protocol.
+             version: Semantic version string for the schema.
+             **kwargs: Additional backend-specific options.
+
+         Returns:
+             Schema reference string:
+             - Local: 'local://schemas/{module.Class}@{version}'
+             - Atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+         """
+         ...
+
+     def get_schema(self, ref: str) -> dict:
+         """Get a schema record by reference.
+
+         Args:
+             ref: Schema reference string (local:// or at://).
+
+         Returns:
+             Schema record as a dictionary with fields like 'name', 'version',
+             'fields', etc.
+
+         Raises:
+             KeyError: If schema not found.
+         """
+         ...
+
+     @property
+     def schemas(self) -> Iterator[dict]:
+         """Lazily iterate over all schema records in this index.
+
+         Yields:
+             Schema records as dictionaries.
+         """
+         ...
+
+     def list_schemas(self) -> list[dict]:
+         """Get all schema records as a materialized list.
+
+         Returns:
+             List of schema records as dictionaries.
+         """
+         ...
+
+     def decode_schema(self, ref: str) -> Type[Packable]:
+         """Reconstruct a Python Packable type from a stored schema.
+
+         This method enables loading datasets without knowing the sample type
+         ahead of time. The index retrieves the schema record and dynamically
+         generates a Packable class matching the schema definition.
+
+         Args:
+             ref: Schema reference string (local:// or at://).
+
+         Returns:
+             A dynamically generated Packable class with fields matching
+             the schema definition. The class can be used with
+             ``Dataset[T]`` to load and iterate over samples.
+
+         Raises:
+             KeyError: If schema not found.
+             ValueError: If schema cannot be decoded (unsupported field types).
+
+         Examples:
+             >>> entry = index.get_dataset("my-dataset")
+             >>> SampleType = index.decode_schema(entry.schema_ref)
+             >>> ds = Dataset[SampleType](entry.data_urls[0])
+             >>> for sample in ds.ordered():
+             ...     print(sample)  # sample is instance of SampleType
+         """
+         ...
+
+
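Since every operation above is declared on the protocol, a publish/insert/reload round trip can be written once and reused with any backend. A minimal sketch under assumptions: the import paths shown may differ from the package's public API, and constructing the concrete index and dataset is left to the caller.

    # Illustrative sketch -- a backend-agnostic round trip against AbstractIndex.
    from atdata._protocols import AbstractIndex  # public import path may differ
    from atdata.dataset import Dataset           # mirrors the TYPE_CHECKING import above


    def register_and_reload(index: AbstractIndex, ds: Dataset, *, name: str, sample_type: type) -> None:
        """Publish the schema, register the dataset, then load it back by name."""
        schema_ref = index.publish_schema(sample_type, version="1.0.0")
        index.insert_dataset(ds, name=name, schema_ref=schema_ref)

        entry = index.get_dataset(name)
        # decode_schema rebuilds a Packable type, so callers do not need to know
        # the sample class in advance when reloading.
        SampleType = index.decode_schema(entry.schema_ref)
        reloaded = Dataset[SampleType](entry.data_urls[0])
        for sample in reloaded.ordered():
            print(sample)
            break

The same function works with either LocalIndex or AtmosphereIndex because it only touches members declared on the protocol.
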
+ ##
+ # AbstractDataStore Protocol
+
+
+ class AbstractDataStore(Protocol):
+     """Protocol for data storage operations.
+
+     This protocol abstracts over different storage backends for dataset data:
+     - S3DataStore: S3-compatible object storage
+     - PDSBlobStore: ATProto PDS blob storage (future)
+
+     The separation of index (metadata) from data store (actual files) allows
+     flexible deployment: local index with S3 storage, atmosphere index with
+     S3 storage, or atmosphere index with PDS blobs.
+
+     Examples:
+         >>> store = S3DataStore(credentials, bucket="my-bucket")
+         >>> urls = store.write_shards(dataset, prefix="training/v1")
+         >>> print(urls)
+         ['s3://my-bucket/training/v1/shard-000000.tar', ...]
+     """
+
+     def write_shards(
+         self,
+         ds: "Dataset",
+         *,
+         prefix: str,
+         **kwargs,
+     ) -> list[str]:
+         """Write dataset shards to storage.
+
+         Args:
+             ds: The Dataset to write.
+             prefix: Path prefix for the shards (e.g., 'datasets/mnist/v1').
+             **kwargs: Backend-specific options (e.g., maxcount for shard size).
+
+         Returns:
+             List of URLs for the written shards, suitable for use with
+             WebDataset or atdata.Dataset().
+         """
+         ...
+
+     def read_url(self, url: str) -> str:
+         """Resolve a storage URL for reading.
+
+         Some storage backends may need to transform URLs (e.g., signing S3 URLs
+         or resolving blob references). This method returns a URL that can be
+         used directly with WebDataset.
+
+         Args:
+             url: Storage URL to resolve.
+
+         Returns:
+             WebDataset-compatible URL for reading.
+         """
+         ...
+
+     def supports_streaming(self) -> bool:
+         """Whether this store supports streaming reads.
+
+         Returns:
+             True if the store supports efficient streaming (like S3),
+             False if data must be fully downloaded first.
+         """
+         ...
+
+
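A helper that stays generic over AbstractDataStore needs only the three members above. The sketch below is illustrative (stage_dataset is not a package function, and the import paths are assumptions): it writes shards, then resolves each returned URL through read_url so the results are ready for WebDataset or atdata.Dataset().

    # Illustrative sketch -- generic over AbstractDataStore.
    from atdata._protocols import AbstractDataStore  # public import path may differ
    from atdata.dataset import Dataset


    def stage_dataset(store: AbstractDataStore, ds: Dataset, *, prefix: str) -> list[str]:
        """Write shards, then resolve read-ready URLs."""
        shard_urls = store.write_shards(ds, prefix=prefix)
        if not store.supports_streaming():
            # Non-streaming backends download shards fully before reading.
            print("note: this backend does not stream; reads will download first")
        return [store.read_url(url) for url in shard_urls]
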
+ ##
+ # DataSource Protocol
+
+
+ @runtime_checkable
+ class DataSource(Protocol):
+     """Protocol for data sources that provide streams to Dataset.
+
+     A DataSource abstracts over different ways of accessing dataset shards:
+     - URLSource: Standard WebDataset-compatible URLs (http, https, pipe, gs, etc.)
+     - S3Source: S3-compatible storage with explicit credentials
+     - BlobSource: ATProto blob references (future)
+
+     The key member is the ``shards`` property, which yields (identifier, stream) pairs.
+     These are fed directly to WebDataset's tar_file_expander, bypassing URL
+     resolution entirely. This enables:
+     - Private S3 repos with credentials
+     - Custom endpoints (Cloudflare R2, MinIO)
+     - ATProto blob streaming
+     - Any other source that can provide file-like objects
+
+     Examples:
+         >>> source = S3Source(
+         ...     bucket="my-bucket",
+         ...     keys=["data-000.tar", "data-001.tar"],
+         ...     endpoint="https://r2.example.com",
+         ...     credentials=creds,
+         ... )
+         >>> ds = Dataset[MySample](source)
+         >>> for sample in ds.ordered():
+         ...     print(sample)
+     """
+
+     @property
+     def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
+         """Lazily yield (identifier, stream) pairs for each shard.
+
+         The identifier is used for error messages and __url__ metadata.
+         The stream must be a file-like object that can be read by tarfile.
+
+         Yields:
+             Tuple of (shard_identifier, file_like_stream).
+
+         Examples:
+             >>> for shard_id, stream in source.shards:
+             ...     print(f"Processing {shard_id}")
+             ...     data = stream.read()
+         """
+         ...
+
+     def list_shards(self) -> list[str]:
+         """Get list of shard identifiers without opening streams.
+
+         Used for metadata queries like counting shards without actually
+         streaming data. Implementations should return identifiers that
+         match what ``shards`` would yield.
+
+         Returns:
+             List of shard identifier strings.
+         """
+         ...
+
+     def open_shard(self, shard_id: str) -> IO[bytes]:
+         """Open a single shard by its identifier.
+
+         This method enables random access to individual shards, which is
+         required for PyTorch DataLoader worker splitting. Each worker opens
+         only its assigned shards rather than iterating all shards.
+
+         Args:
+             shard_id: Shard identifier, as returned by ``list_shards()``.
+
+         Returns:
+             File-like stream for reading the shard.
+
+         Raises:
+             KeyError: If shard_id is not one of the identifiers from ``list_shards()``.
+         """
+         ...
+
+
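Because a DataSource only has to produce (identifier, readable-stream) pairs, a directory of local .tar shards is already enough to satisfy it. An illustrative sketch follows; DirectorySource and the directory path are invented here for demonstration and are not part of atdata.

    # Illustrative sketch -- a DataSource over a local directory of .tar shards.
    from pathlib import Path
    from typing import IO, Iterator

    from atdata._protocols import DataSource  # public import path may differ


    class DirectorySource:
        def __init__(self, root: str, pattern: str = "*.tar") -> None:
            self._paths = sorted(Path(root).glob(pattern))

        @property
        def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
            # Lazily open each shard; the identifier doubles as __url__ metadata.
            for path in self._paths:
                yield str(path), path.open("rb")

        def list_shards(self) -> list[str]:
            # Identifiers only; no streams are opened here.
            return [str(path) for path in self._paths]

        def open_shard(self, shard_id: str) -> IO[bytes]:
            # Random access, e.g. for PyTorch DataLoader worker splitting.
            if shard_id not in self.list_shards():
                raise KeyError(shard_id)
            return open(shard_id, "rb")


    source = DirectorySource("/data/mnist-shards")
    assert isinstance(source, DataSource)  # structural check via @runtime_checkable

The DataSource docstring above shows how such a source is then handed to Dataset, e.g. Dataset[MySample](source).
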
+ ##
+ # Module exports
+
+ __all__ = [
+     "Packable",
+     "IndexEntry",
+     "AbstractIndex",
+     "AbstractDataStore",
+     "DataSource",
+ ]