atdata 0.2.0a1__py3-none-any.whl → 0.2.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/_protocols.py ADDED
@@ -0,0 +1,519 @@
+ """Protocol definitions for atdata index and storage abstractions.
+
+ This module defines the abstract protocols that enable interchangeable
+ index backends (local Redis vs ATProto PDS) and data stores (S3 vs PDS blobs).
+
+ The key insight is that both local and atmosphere implementations solve the
+ same problem: indexed dataset storage with external data URLs. These protocols
+ formalize that common interface.
+
+ Note:
+     Protocol methods use ``...`` (Ellipsis) as the body per PEP 544. This is
+     the standard Python syntax for Protocol definitions - these are interface
+     specifications, not stub implementations. Concrete classes (LocalIndex,
+     AtmosphereIndex, etc.) provide the actual implementations.
+
+ Protocols:
+     Packable: Structural interface for packable sample types (lens compatibility)
+     IndexEntry: Common interface for dataset index entries
+     AbstractIndex: Protocol for index operations (schemas, datasets, lenses)
+     AbstractDataStore: Protocol for data storage operations
+
+ Example:
+     ::
+
+         >>> def process_datasets(index: AbstractIndex) -> None:
+         ...     for entry in index.list_datasets():
+         ...         print(f"{entry.name}: {entry.data_urls}")
+         ...
+         >>> # Works with either LocalIndex or AtmosphereIndex
+         >>> process_datasets(local_index)
+         >>> process_datasets(atmosphere_index)
+ """
+
+ from typing import (
+     IO,
+     Any,
+     ClassVar,
+     Iterator,
+     Optional,
+     Protocol,
+     Type,
+     TYPE_CHECKING,
+     runtime_checkable,
+ )
+
+ if TYPE_CHECKING:
+     from .dataset import Dataset
+
+
+ ##
+ # Packable Protocol (for lens type compatibility)
+
+
+ @runtime_checkable
+ class Packable(Protocol):
+     """Structural protocol for packable sample types.
+
+     This protocol allows classes decorated with ``@packable`` to be recognized
+     as valid types for lens transformations and schema operations, even though
+     the decorator doesn't change the class's nominal type at static analysis time.
+
+     Both ``PackableSample`` subclasses and ``@packable``-decorated classes
+     satisfy this protocol structurally.
+
+     The protocol captures the full interface needed for:
+     - Lens type transformations (as_wds, from_data)
+     - Schema publishing (class introspection via dataclass fields)
+     - Serialization/deserialization (packed, from_bytes)
+
+     Example:
+         ::
+
+             >>> @packable
+             ... class MySample:
+             ...     name: str
+             ...     value: int
+             ...
+             >>> def process(sample_type: Type[Packable]) -> None:
+             ...     # Type checker knows sample_type has from_bytes, packed, etc.
+             ...     instance = sample_type.from_bytes(data)
+             ...     print(instance.packed)
+     """
+
+     @classmethod
+     def from_data(cls, data: dict[str, Any]) -> "Packable":
+         """Create instance from unpacked msgpack data dictionary."""
+         ...
+
+     @classmethod
+     def from_bytes(cls, bs: bytes) -> "Packable":
+         """Create instance from raw msgpack bytes."""
+         ...
+
+     @property
+     def packed(self) -> bytes:
+         """Pack this sample's data into msgpack bytes."""
+         ...
+
+     @property
+     def as_wds(self) -> dict[str, Any]:
+         """WebDataset-compatible representation with __key__ and msgpack."""
+         ...
+
+
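For concreteness, a minimal hand-written class that would satisfy Packable structurally might look like the sketch below. It is illustrative only and not part of the package: PointSample is a made-up type, and the exact as_wds key layout is an assumption; in the package, the @packable decorator or PackableSample supplies this machinery.

from dataclasses import asdict, dataclass
from typing import Any

import msgpack

from atdata._protocols import Packable


@dataclass
class PointSample:
    """Made-up sample type, used only to illustrate the protocol surface."""

    key: str
    x: float
    y: float

    @classmethod
    def from_data(cls, data: dict[str, Any]) -> "PointSample":
        # Rebuild the sample from an already-unpacked msgpack dictionary.
        return cls(**data)

    @classmethod
    def from_bytes(cls, bs: bytes) -> "PointSample":
        # Decode raw msgpack bytes, then delegate to from_data.
        return cls.from_data(msgpack.unpackb(bs, raw=False))

    @property
    def packed(self) -> bytes:
        # Serialize all fields to msgpack bytes.
        return msgpack.packb(asdict(self), use_bin_type=True)

    @property
    def as_wds(self) -> dict[str, Any]:
        # WebDataset-style dict; the "msgpack" extension key is an assumption.
        return {"__key__": self.key, "msgpack": self.packed}


# Structural check works because Packable is @runtime_checkable.
assert isinstance(PointSample(key="0", x=1.0, y=2.0), Packable)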
+ ##
+ # IndexEntry Protocol
+
+
+ @runtime_checkable
+ class IndexEntry(Protocol):
+     """Common interface for index entries (local or atmosphere).
+
+     Both LocalDatasetEntry and atmosphere DatasetRecord-based entries
+     should satisfy this protocol, enabling code that works with either.
+
+     Properties:
+         name: Human-readable dataset name
+         schema_ref: Reference to schema (local:// path or AT URI)
+         data_urls: WebDataset URLs for the data
+         metadata: Arbitrary metadata dict, or None
+     """
+
+     @property
+     def name(self) -> str:
+         """Human-readable dataset name."""
+         ...
+
+     @property
+     def schema_ref(self) -> str:
+         """Reference to the schema for this dataset.
+
+         For local: 'local://schemas/{module.Class}@{version}'
+         For atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+         """
+         ...
+
+     @property
+     def data_urls(self) -> list[str]:
+         """WebDataset URLs for the data.
+
+         These are the URLs that can be passed to atdata.Dataset() or
+         used with WebDataset directly. May use brace notation for shards.
+         """
+         ...
+
+     @property
+     def metadata(self) -> Optional[dict]:
+         """Arbitrary metadata dictionary, or None if not set."""
+         ...
+
+
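An entry can be as small as a frozen dataclass; the sketch below satisfies IndexEntry structurally. SimpleEntry and its field values are illustrative, not the package's LocalDatasetEntry.

from dataclasses import dataclass
from typing import Optional

from atdata._protocols import IndexEntry


@dataclass(frozen=True)
class SimpleEntry:
    """Made-up entry type; read-only protocol properties are satisfied by plain fields."""

    name: str
    schema_ref: str
    data_urls: list[str]
    metadata: Optional[dict] = None


entry = SimpleEntry(
    name="mnist-train",
    schema_ref="local://schemas/examples.ImageSample@1.0.0",
    data_urls=["s3://my-bucket/mnist/train-{000000..000009}.tar"],
)
assert isinstance(entry, IndexEntry)  # @runtime_checkable structural match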
+ ##
+ # AbstractIndex Protocol
+
+
+ class AbstractIndex(Protocol):
+     """Protocol for index operations - implemented by LocalIndex and AtmosphereIndex.
+
+     This protocol defines the common interface for managing dataset metadata:
+     - Publishing and retrieving schemas
+     - Inserting and listing datasets
+     - (Future) Publishing and retrieving lenses
+
+     A single index can hold datasets of many different sample types. The sample
+     type is tracked via schema references, not as a generic parameter on the index.
+
+     Optional Extensions:
+         Some index implementations support additional features:
+         - ``data_store``: An AbstractDataStore for reading/writing dataset shards.
+           If present, ``load_dataset`` will use it for S3 credential resolution.
+
+     Example:
+         ::
+
+             >>> def publish_and_list(index: AbstractIndex) -> None:
+             ...     # Publish schemas for different types
+             ...     schema1 = index.publish_schema(ImageSample, version="1.0.0")
+             ...     schema2 = index.publish_schema(TextSample, version="1.0.0")
+             ...
+             ...     # Insert datasets of different types
+             ...     index.insert_dataset(image_ds, name="images")
+             ...     index.insert_dataset(text_ds, name="texts")
+             ...
+             ...     # List all datasets (mixed types)
+             ...     for entry in index.list_datasets():
+             ...         print(f"{entry.name} -> {entry.schema_ref}")
+     """
+
+     @property
+     def data_store(self) -> Optional["AbstractDataStore"]:
+         """Optional data store for reading/writing shards.
+
+         If present, ``load_dataset`` will use it for credential resolution
+         (e.g., S3 credentials from S3DataStore).
+
+         Returns:
+             AbstractDataStore instance, or None if this index doesn't have
+             an associated data store.
+
+         Note:
+             Not all index implementations provide a data_store. Use
+             ``hasattr(index, 'data_store') and index.data_store is not None``
+             for safe access.
+         """
+         ...
+
+     # Dataset operations
+
+     def insert_dataset(
+         self,
+         ds: "Dataset",
+         *,
+         name: str,
+         schema_ref: Optional[str] = None,
+         **kwargs,
+     ) -> IndexEntry:
+         """Insert a dataset into the index.
+
+         The sample type is inferred from ``ds.sample_type``. If schema_ref is not
+         provided, the schema may be auto-published based on the sample type.
+
+         Args:
+             ds: The Dataset to register in the index (any sample type).
+             name: Human-readable name for the dataset.
+             schema_ref: Optional explicit schema reference. If not provided,
+                 the schema may be auto-published or inferred from ds.sample_type.
+             **kwargs: Additional backend-specific options.
+
+         Returns:
+             IndexEntry for the inserted dataset.
+         """
+         ...
+
+     def get_dataset(self, ref: str) -> IndexEntry:
+         """Get a dataset entry by name or reference.
+
+         Args:
+             ref: Dataset name, path, or full reference string.
+
+         Returns:
+             IndexEntry for the dataset.
+
+         Raises:
+             KeyError: If dataset not found.
+         """
+         ...
+
+     @property
+     def datasets(self) -> Iterator[IndexEntry]:
+         """Lazily iterate over all dataset entries in this index.
+
+         Yields:
+             IndexEntry for each dataset (may be of different sample types).
+         """
+         ...
+
+     def list_datasets(self) -> list[IndexEntry]:
+         """Get all dataset entries as a materialized list.
+
+         Returns:
+             List of IndexEntry for each dataset.
+         """
+         ...
+
+     # Schema operations
+
+     def publish_schema(
+         self,
+         sample_type: type,
+         *,
+         version: str = "1.0.0",
+         **kwargs,
+     ) -> str:
+         """Publish a schema for a sample type.
+
+         The sample_type is accepted as ``type`` rather than ``Type[Packable]`` to
+         support ``@packable``-decorated classes, which satisfy the Packable protocol
+         at runtime but cannot be statically verified by type checkers.
+
+         Args:
+             sample_type: A Packable type (PackableSample subclass or @packable-decorated).
+                 Validated at runtime via the @runtime_checkable Packable protocol.
+             version: Semantic version string for the schema.
+             **kwargs: Additional backend-specific options.
+
+         Returns:
+             Schema reference string:
+             - Local: 'local://schemas/{module.Class}@{version}'
+             - Atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+         """
+         ...
+
+     def get_schema(self, ref: str) -> dict:
+         """Get a schema record by reference.
+
+         Args:
+             ref: Schema reference string (local:// or at://).
+
+         Returns:
+             Schema record as a dictionary with fields like 'name', 'version',
+             'fields', etc.
+
+         Raises:
+             KeyError: If schema not found.
+         """
+         ...
+
+     @property
+     def schemas(self) -> Iterator[dict]:
+         """Lazily iterate over all schema records in this index.
+
+         Yields:
+             Schema records as dictionaries.
+         """
+         ...
+
+     def list_schemas(self) -> list[dict]:
+         """Get all schema records as a materialized list.
+
+         Returns:
+             List of schema records as dictionaries.
+         """
+         ...
+
+     def decode_schema(self, ref: str) -> Type[Packable]:
+         """Reconstruct a Python Packable type from a stored schema.
+
+         This method enables loading datasets without knowing the sample type
+         ahead of time. The index retrieves the schema record and dynamically
+         generates a Packable class matching the schema definition.
+
+         Args:
+             ref: Schema reference string (local:// or at://).
+
+         Returns:
+             A dynamically generated Packable class with fields matching
+             the schema definition. The class can be used with
+             ``Dataset[T]`` to load and iterate over samples.
+
+         Raises:
+             KeyError: If schema not found.
+             ValueError: If schema cannot be decoded (unsupported field types).
+
+         Example:
+             ::
+
+                 >>> entry = index.get_dataset("my-dataset")
+                 >>> SampleType = index.decode_schema(entry.schema_ref)
+                 >>> ds = Dataset[SampleType](entry.data_urls[0])
+                 >>> for sample in ds.ordered():
+                 ...     print(sample)  # sample is instance of SampleType
+         """
+         ...
+
+
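Because every member above is defined on the protocol, generic consumer code needs no knowledge of the backend. A hedged sketch (the helper names are my own, not package API) of resolving an entry, its sample type, and a readable URL against any AbstractIndex:

from atdata._protocols import AbstractIndex, IndexEntry


def resolve(index: AbstractIndex, ref: str) -> tuple[IndexEntry, type]:
    """Look up an entry and rebuild its sample type from the stored schema."""
    entry = index.get_dataset(ref)                 # raises KeyError if unknown
    sample_type = index.decode_schema(entry.schema_ref)
    return entry, sample_type


def first_readable_url(index: AbstractIndex, ref: str) -> str:
    """Pick a URL, routing through the optional data store when one is attached."""
    entry = index.get_dataset(ref)
    url = entry.data_urls[0]
    # Safe-access pattern from the data_store docstring: the attribute is optional.
    store = getattr(index, "data_store", None)
    if store is not None:
        url = store.read_url(url)                  # e.g. a signed S3 URL
    return url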
+ ##
+ # AbstractDataStore Protocol
+
+
+ class AbstractDataStore(Protocol):
+     """Protocol for data storage operations.
+
+     This protocol abstracts over different storage backends for dataset data:
+     - S3DataStore: S3-compatible object storage
+     - PDSBlobStore: ATProto PDS blob storage (future)
+
+     The separation of index (metadata) from data store (actual files) allows
+     flexible deployment: local index with S3 storage, atmosphere index with
+     S3 storage, or atmosphere index with PDS blobs.
+
+     Example:
+         ::
+
+             >>> store = S3DataStore(credentials, bucket="my-bucket")
+             >>> urls = store.write_shards(dataset, prefix="training/v1")
+             >>> print(urls)
+             ['s3://my-bucket/training/v1/shard-000000.tar', ...]
+     """
+
+     def write_shards(
+         self,
+         ds: "Dataset",
+         *,
+         prefix: str,
+         **kwargs,
+     ) -> list[str]:
+         """Write dataset shards to storage.
+
+         Args:
+             ds: The Dataset to write.
+             prefix: Path prefix for the shards (e.g., 'datasets/mnist/v1').
+             **kwargs: Backend-specific options (e.g., maxcount for shard size).
+
+         Returns:
+             List of URLs for the written shards, suitable for use with
+             WebDataset or atdata.Dataset().
+         """
+         ...
+
+     def read_url(self, url: str) -> str:
+         """Resolve a storage URL for reading.
+
+         Some storage backends may need to transform URLs (e.g., signing S3 URLs
+         or resolving blob references). This method returns a URL that can be
+         used directly with WebDataset.
+
+         Args:
+             url: Storage URL to resolve.
+
+         Returns:
+             WebDataset-compatible URL for reading.
+         """
+         ...
+
+     def supports_streaming(self) -> bool:
+         """Whether this store supports streaming reads.
+
+         Returns:
+             True if the store supports efficient streaming (like S3),
+             False if data must be fully downloaded first.
+         """
+         ...
+
+
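To make the shape of an implementation concrete, here is a hedged sketch of a store that satisfies AbstractDataStore by writing shards to a local directory. DirectoryDataStore is a made-up name; the sketch assumes the webdataset package is available and relies on the ds.ordered() / sample.as_wds behavior described in the docstrings above.

import os
from typing import TYPE_CHECKING

import webdataset as wds

if TYPE_CHECKING:
    from atdata.dataset import Dataset


class DirectoryDataStore:
    """Made-up store that writes shards beneath a local root directory."""

    def __init__(self, root: str) -> None:
        self.root = root

    def write_shards(self, ds: "Dataset", *, prefix: str, **kwargs) -> list[str]:
        out_dir = os.path.join(self.root, prefix)
        os.makedirs(out_dir, exist_ok=True)
        pattern = os.path.join(out_dir, "shard-%06d.tar")
        # maxcount is treated as a backend-specific option, as the docstring suggests.
        with wds.ShardWriter(pattern, maxcount=kwargs.get("maxcount", 10_000)) as sink:
            for sample in ds.ordered():
                sink.write(sample.as_wds)  # WebDataset-style dict with __key__
        return sorted(
            os.path.join(out_dir, name)
            for name in os.listdir(out_dir)
            if name.endswith(".tar")
        )

    def read_url(self, url: str) -> str:
        # Local paths need no signing or other transformation.
        return url

    def supports_streaming(self) -> bool:
        return True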
+ ##
+ # DataSource Protocol
+
+
+ @runtime_checkable
+ class DataSource(Protocol):
+     """Protocol for data sources that provide streams to Dataset.
+
+     A DataSource abstracts over different ways of accessing dataset shards:
+     - URLSource: Standard WebDataset-compatible URLs (http, https, pipe, gs, etc.)
+     - S3Source: S3-compatible storage with explicit credentials
+     - BlobSource: ATProto blob references (future)
+
+     The key member is ``shards``, a property that yields (identifier, stream)
+     pairs. These are fed directly to WebDataset's tar_file_expander, bypassing
+     URL resolution entirely. This enables:
+     - Private S3 repos with credentials
+     - Custom endpoints (Cloudflare R2, MinIO)
+     - ATProto blob streaming
+     - Any other source that can provide file-like objects
+
+     Example:
+         ::
+
+             >>> source = S3Source(
+             ...     bucket="my-bucket",
+             ...     keys=["data-000.tar", "data-001.tar"],
+             ...     endpoint="https://r2.example.com",
+             ...     credentials=creds,
+             ... )
+             >>> ds = Dataset[MySample](source)
+             >>> for sample in ds.ordered():
+             ...     print(sample)
+     """
+
+     @property
+     def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
+         """Lazily yield (identifier, stream) pairs for each shard.
+
+         The identifier is used for error messages and __url__ metadata.
+         The stream must be a file-like object that can be read by tarfile.
+
+         Yields:
+             Tuple of (shard_identifier, file_like_stream).
+
+         Example:
+             ::
+
+                 >>> for shard_id, stream in source.shards:
+                 ...     print(f"Processing {shard_id}")
+                 ...     data = stream.read()
+         """
+         ...
+
+     def list_shards(self) -> list[str]:
+         """Get list of shard identifiers without opening streams.
+
+         Used for metadata queries like counting shards without actually
+         streaming data. Implementations should return identifiers that
+         match what ``shards`` would yield.
+
+         Returns:
+             List of shard identifier strings.
+         """
+         ...
+
+     def open_shard(self, shard_id: str) -> IO[bytes]:
+         """Open a single shard by its identifier.
+
+         This method enables random access to individual shards, which is
+         required for PyTorch DataLoader worker splitting. Each worker opens
+         only its assigned shards rather than iterating all shards.
+
+         Args:
+             shard_id: Shard identifier from ``list_shards()``.
+
+         Returns:
+             File-like stream for reading the shard.
+
+         Raises:
+             KeyError: If shard_id is not in ``list_shards()``.
+         """
+         ...
+
+
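The smallest useful DataSource is little more than a dictionary of openable paths. A sketch that serves .tar shards from a local directory (LocalTarSource and the path are illustrative; only the three protocol members are required):

from pathlib import Path
from typing import IO, Iterator

from atdata._protocols import DataSource


class LocalTarSource:
    """Made-up source that serves .tar shards from a local directory."""

    def __init__(self, directory: str) -> None:
        self._paths = {p.name: p for p in sorted(Path(directory).glob("*.tar"))}

    @property
    def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
        # Yield (identifier, stream) pairs; the consumer reads and closes the streams.
        for name in self.list_shards():
            yield name, self.open_shard(name)

    def list_shards(self) -> list[str]:
        return list(self._paths)

    def open_shard(self, shard_id: str) -> IO[bytes]:
        if shard_id not in self._paths:
            raise KeyError(shard_id)
        return self._paths[shard_id].open("rb")


source = LocalTarSource("/data/mnist")   # illustrative path
assert isinstance(source, DataSource)    # @runtime_checkable structural match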
+ ##
+ # Module exports
+
+ __all__ = [
+     "Packable",
+     "IndexEntry",
+     "AbstractIndex",
+     "AbstractDataStore",
+     "DataSource",
+ ]
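One detail worth noting from the decorators above: Packable, IndexEntry, and DataSource are @runtime_checkable, while AbstractIndex and AbstractDataStore are not, so the latter two are usable only as static type annotations. A small illustration:

from atdata._protocols import AbstractIndex, IndexEntry, Packable


def takes_index(index: AbstractIndex) -> None:
    """Static typing only: mypy/pyright verify the structural match here."""
    for entry in index.list_datasets():
        print(entry.name)


class Empty:
    pass


# isinstance() works for the runtime-checkable protocols ...
print(isinstance(Empty(), IndexEntry))   # False: required attributes are missing
print(isinstance(Empty(), Packable))     # False

# ... but raises TypeError for the static-only ones.
try:
    isinstance(Empty(), AbstractIndex)
except TypeError as exc:
    print(f"AbstractIndex is not runtime-checkable: {exc}")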