atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. atdata/__init__.py +11 -0
  2. atdata/_cid.py +0 -21
  3. atdata/_helpers.py +12 -0
  4. atdata/_hf_api.py +46 -1
  5. atdata/_logging.py +43 -0
  6. atdata/_protocols.py +81 -182
  7. atdata/_schema_codec.py +2 -2
  8. atdata/_sources.py +24 -4
  9. atdata/_stub_manager.py +5 -25
  10. atdata/atmosphere/__init__.py +60 -21
  11. atdata/atmosphere/_lexicon_types.py +595 -0
  12. atdata/atmosphere/_types.py +73 -245
  13. atdata/atmosphere/client.py +64 -12
  14. atdata/atmosphere/lens.py +60 -53
  15. atdata/atmosphere/records.py +291 -100
  16. atdata/atmosphere/schema.py +91 -65
  17. atdata/atmosphere/store.py +68 -66
  18. atdata/cli/__init__.py +16 -16
  19. atdata/cli/diagnose.py +2 -2
  20. atdata/cli/{local.py → infra.py} +10 -10
  21. atdata/dataset.py +266 -47
  22. atdata/index/__init__.py +54 -0
  23. atdata/{local → index}/_entry.py +6 -2
  24. atdata/{local → index}/_index.py +617 -72
  25. atdata/{local → index}/_schema.py +5 -5
  26. atdata/lexicons/__init__.py +127 -0
  27. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  28. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  29. atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
  30. atdata/lexicons/ac.foundation.dataset.record.json +117 -0
  31. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  32. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
  34. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  35. atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
  36. atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
  37. atdata/lexicons/ndarray_shim.json +16 -0
  38. atdata/local/__init__.py +12 -13
  39. atdata/local/_repo_legacy.py +3 -3
  40. atdata/manifest/__init__.py +4 -0
  41. atdata/manifest/_proxy.py +321 -0
  42. atdata/promote.py +14 -10
  43. atdata/repository.py +66 -16
  44. atdata/stores/__init__.py +23 -0
  45. atdata/stores/_disk.py +131 -0
  46. atdata/{local → stores}/_s3.py +134 -112
  47. atdata/testing.py +12 -8
  48. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
  49. atdata-0.3.2b1.dist-info/RECORD +71 -0
  50. atdata-0.3.0b1.dist-info/RECORD +0 -54
  51. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
  52. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
  53. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/__init__.py CHANGED
@@ -44,6 +44,7 @@ from .dataset import (
     SampleBatch as SampleBatch,
     Dataset as Dataset,
     packable as packable,
+    write_samples as write_samples,
 )
 
 from .lens import (
@@ -89,6 +90,7 @@ from ._schema_codec import (
 from ._logging import (
     configure_logging as configure_logging,
     get_logger as get_logger,
+    log_operation as log_operation,
 )
 
 from .repository import (
@@ -96,6 +98,14 @@ from .repository import (
     create_repository as create_repository,
 )
 
+from .index import (
+    Index as Index,
+)
+
+from .stores import (
+    LocalDiskStore as LocalDiskStore,
+)
+
 from ._cid import (
     generate_cid as generate_cid,
     verify_cid as verify_cid,
@@ -112,6 +122,7 @@ from .manifest import (
     ManifestWriter as ManifestWriter,
     QueryExecutor as QueryExecutor,
     SampleLocation as SampleLocation,
+    query_fields as query_fields,
 )
 
 # ATProto integration (lazy import to avoid requiring atproto package)
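The new re-exports correspond to the 0.3.2b1 additions: the Index front-end, the local disk store, one-step sample writing, manifest query fields, and operation logging. An illustrative sketch of the import surface; the LocalDiskStore constructor argument is hypothetical, and write_samples follows the AbstractIndex protocol signature shown later in this diff:

    import atdata

    index = atdata.Index()                        # local index backend
    store = atdata.LocalDiskStore("/tmp/shards")  # hypothetical constructor argument
    samples = [...]                               # any iterable of @packable samples

    with atdata.log_operation("write_samples", name="demo"):
        entry = index.write_samples(samples, name="demo", data_store=store)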
atdata/_cid.py CHANGED
@@ -116,29 +116,8 @@ def verify_cid(cid: str, data: Any) -> bool:
     return cid == expected_cid
 
 
-def parse_cid(cid: str) -> dict:
-    """Parse a CID string into its components.
-
-    Args:
-        cid: CID string to parse.
-
-    Returns:
-        Dictionary with 'version', 'codec', and 'hash' keys.
-        The 'hash' value is itself a dict with 'code', 'size', and 'digest'.
-
-    Examples:
-        >>> info = parse_cid('bafyrei...')
-        >>> info['version']
-        1
-        >>> info['codec']
-        113  # 0x71 = dag-cbor
-    """
-    return libipld.decode_cid(cid)
-
-
 __all__ = [
     "generate_cid",
     "generate_cid_from_bytes",
     "verify_cid",
-    "parse_cid",
 ]
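The removed parse_cid was a one-line wrapper over libipld.decode_cid, so callers that still need CID introspection can call the library directly (sketch; the CID string is a placeholder):

    import libipld

    info = libipld.decode_cid("bafyrei...")  # placeholder CID
    info["version"]  # 1
    info["codec"]    # 113 (0x71 = dag-cbor)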
atdata/_helpers.py CHANGED
@@ -65,10 +65,22 @@ def bytes_to_array(b: bytes) -> np.ndarray:
         return np.load(BytesIO(b), allow_pickle=True)
 
     # Compact format: dtype_len(1B) + dtype_str + ndim(1B) + shape(ndim×8B) + data
+    if len(b) < 2:
+        raise ValueError(f"Array buffer too short ({len(b)} bytes): need at least 2")
     dlen = b[0]
+    min_header = 2 + dlen  # dtype_len + dtype_str + ndim
+    if len(b) < min_header:
+        raise ValueError(
+            f"Array buffer too short ({len(b)} bytes): need at least {min_header} for header"
+        )
     dtype = np.dtype(b[1 : 1 + dlen].decode())
     ndim = b[1 + dlen]
     offset = 2 + dlen
+    min_with_shape = offset + ndim * 8
+    if len(b) < min_with_shape:
+        raise ValueError(
+            f"Array buffer too short ({len(b)} bytes): need at least {min_with_shape} for shape"
+        )
     shape = struct.unpack_from(f"<{ndim}q", b, offset)
     offset += ndim * 8
     return np.frombuffer(b, dtype=dtype, offset=offset).reshape(shape).copy()
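The new guards validate the compact layout named in the comment above (dtype length, dtype string, ndim, little-endian 8-byte shape entries, then raw data). A minimal encoder sketch for that layout, assuming C-contiguous input; the package's actual writer may differ:

    import struct
    import numpy as np

    def encode_compact(arr: np.ndarray) -> bytes:
        """Sketch of: dtype_len(1B) + dtype_str + ndim(1B) + shape(ndim x 8B) + data."""
        dtype_str = arr.dtype.str.encode()  # e.g. b"<f4"
        header = bytes([len(dtype_str)]) + dtype_str + bytes([arr.ndim])
        shape = struct.pack(f"<{arr.ndim}q", *arr.shape)
        return header + shape + np.ascontiguousarray(arr).tobytes()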
atdata/_hf_api.py CHANGED
@@ -32,6 +32,7 @@ import re
 import threading
 from pathlib import Path
 from typing import (
+    Any,
     TYPE_CHECKING,
     Generic,
     Mapping,
@@ -65,7 +66,7 @@ def get_default_index() -> "Index": # noqa: F821
     """Get or create the module-level default Index.
 
     The default Index uses Redis for local storage (backwards-compatible
-    default) and an anonymous AtmosphereClient for read-only public data
+    default) and an anonymous Atmosphere for read-only public data
     resolution.
 
     The default is created lazily on first access and cached for the
@@ -189,6 +190,37 @@ class DatasetDict(Generic[ST], dict):
         """
         return {name: len(ds.list_shards()) for name, ds in self.items()}
 
+    # Methods proxied to the sole Dataset when only one split exists.
+    _DATASET_METHODS = frozenset(
+        {
+            "ordered",
+            "shuffled",
+            "as_type",
+            "list_shards",
+            "head",
+        }
+    )
+
+    def __getattr__(self, name: str) -> Any:
+        """Proxy common Dataset methods when this dict has exactly one split.
+
+        When a ``DatasetDict`` contains a single split, calling iteration
+        methods like ``.ordered()`` or ``.shuffled()`` is forwarded to the
+        contained ``Dataset`` for convenience. Multi-split dicts raise
+        ``AttributeError`` with a hint to select a split explicitly.
+        """
+        if name in self._DATASET_METHODS:
+            if len(self) == 1:
+                return getattr(next(iter(self.values())), name)
+            splits = ", ".join(f"'{k}'" for k in self.keys())
+            raise AttributeError(
+                f"'{type(self).__name__}' has {len(self)} splits ({splits}). "
+                f"Select one first, e.g. ds_dict['{next(iter(self.keys()))}'].{name}()"
+            )
+        raise AttributeError(
+            f"'{type(self).__name__}' object has no attribute '{name}'"
+        )
+
 
 ##
 # Path resolution utilities
@@ -682,12 +714,23 @@ def load_dataset(
         >>> index = Index()
         >>> ds = load_dataset("@local/my-dataset", index=index, split="train")
     """
+    from ._logging import get_logger
+
+    log = get_logger()
+    log.info(
+        "load_dataset: path=%s, split=%s, sample_type=%s",
+        path,
+        split,
+        sample_type.__name__ if sample_type is not None else "None",
+    )
+
     # Handle @handle/dataset indexed path resolution
     if _is_indexed_path(path):
         if index is None:
             index = get_default_index()
 
         source, schema_ref = _resolve_indexed_path(path, index)
+        log.debug("load_dataset: resolved indexed path, schema_ref=%s", schema_ref)
 
         # Resolve sample_type from schema if not provided
         resolved_type: Type = (
@@ -714,6 +757,8 @@ def load_dataset(
     if not splits_shards:
         raise FileNotFoundError(f"No data files found at path: {path}")
 
+    log.debug("load_dataset: resolved %d split(s) from path", len(splits_shards))
+
     # Build Dataset for each split
     datasets: dict[str, Dataset] = {}
     for split_name, shards in splits_shards.items():
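The new __getattr__ proxy makes the common single-split case ergonomic. A usage sketch, assuming load_dataset returns a DatasetDict when no split is requested (the dataset name is illustrative):

    from atdata import load_dataset

    ds_dict = load_dataset("@local/my-dataset")

    # Exactly one split: iteration methods are forwarded to the contained Dataset.
    for sample in ds_dict.ordered():
        ...

    # Multiple splits: select one explicitly; otherwise AttributeError names the splits.
    train = ds_dict["train"]
    for sample in train.shuffled():
        ...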
atdata/_logging.py CHANGED
@@ -22,7 +22,10 @@ custom logger implementations.
 
 from __future__ import annotations
 
+import contextlib
 import logging
+import time
+from collections.abc import Generator
 from typing import Any, Protocol, runtime_checkable
 
 
@@ -68,3 +71,43 @@ def get_logger() -> LoggerProtocol:
     whatever was last set via :func:`configure_logging`.
     """
     return _logger
+
+
+@contextlib.contextmanager
+def log_operation(op_name: str, **context: Any) -> Generator[None, None, None]:
+    """Log the start, completion, and duration of an operation.
+
+    Emits an ``info`` message on entry and on successful completion
+    (with elapsed time), or an ``error`` message if an exception
+    propagates out.
+
+    Args:
+        op_name: Short label for the operation (e.g. ``"write_samples"``).
+        **context: Arbitrary key-value pairs included in every log message.
+
+    Examples:
+        >>> with log_operation("write_samples", shard_count=10):
+        ...     do_work()
+    """
+    log = get_logger()
+    ctx_str = ", ".join(f"{k}={v}" for k, v in context.items())
+    if ctx_str:
+        log.info("%s: started (%s)", op_name, ctx_str)
+    else:
+        log.info("%s: started", op_name)
+    t0 = time.monotonic()
+    try:
+        yield
+    except Exception:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.error("%s: failed after %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.error("%s: failed after %.2fs", op_name, elapsed)
+        raise
+    else:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.info("%s: completed in %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.info("%s: completed in %.2fs", op_name, elapsed)
atdata/_protocols.py CHANGED
@@ -1,37 +1,25 @@
 """Protocol definitions for atdata index and storage abstractions.
 
-This module defines the abstract protocols that enable interchangeable
-index backends (local Redis vs ATProto PDS) and data stores (S3 vs PDS blobs).
-
-The key insight is that both local and atmosphere implementations solve the
-same problem: indexed dataset storage with external data URLs. These protocols
-formalize that common interface.
-
-Note:
-    Protocol methods use ``...`` (Ellipsis) as the body per PEP 544. This is
-    the standard Python syntax for Protocol definitions - these are interface
-    specifications, not stub implementations. Concrete classes (Index,
-    AtmosphereIndex, etc.) provide the actual implementations.
+Defines the abstract protocols that enable interchangeable index backends
+(local SQLite/Redis vs ATProto PDS) and data stores (S3, local disk, PDS blobs).
 
 Protocols:
-    Packable: Structural interface for packable sample types (lens compatibility)
+    Packable: Structural interface for packable sample types
     IndexEntry: Common interface for dataset index entries
     AbstractIndex: Protocol for index operations (schemas, datasets, lenses)
     AbstractDataStore: Protocol for data storage operations
+    DataSource: Protocol for streaming shard data
 
 Examples:
     >>> def process_datasets(index: AbstractIndex) -> None:
     ...     for entry in index.list_datasets():
     ...         print(f"{entry.name}: {entry.data_urls}")
-    ...
-    >>> # Works with either Index or AtmosphereIndex
-    >>> process_datasets(local_index)
-    >>> process_datasets(atmosphere_index)
 """
 
 from typing import (
     IO,
     Any,
+    Iterable,
     Iterator,
     Optional,
     Protocol,
@@ -115,7 +103,7 @@ class IndexEntry(Protocol):
         """Schema reference string.
 
         Local: ``local://schemas/{module.Class}@{version}``
-        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.sampleSchema/...``
+        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.schema/...``
         """
         ...
 
@@ -137,32 +125,16 @@
 
 
 class AbstractIndex(Protocol):
-    """Protocol for index operations - implemented by Index and AtmosphereIndex.
-
-    This protocol defines the common interface for managing dataset metadata:
-    - Publishing and retrieving schemas
-    - Inserting and listing datasets
-    - (Future) Publishing and retrieving lenses
-
-    A single index can hold datasets of many different sample types. The sample
-    type is tracked via schema references, not as a generic parameter on the index.
+    """Protocol for index operations implemented by Index and AtmosphereIndex.
 
-    Optional Extensions:
-        Some index implementations support additional features:
-        - ``data_store``: An AbstractDataStore for reading/writing dataset shards.
-          If present, ``load_dataset`` will use it for S3 credential resolution.
+    Manages dataset metadata: publishing/retrieving schemas, inserting/listing
+    datasets. A single index holds datasets of many sample types, tracked via
+    schema references.
 
     Examples:
         >>> def publish_and_list(index: AbstractIndex) -> None:
-        ...     # Publish schemas for different types
-        ...     schema1 = index.publish_schema(ImageSample, version="1.0.0")
-        ...     schema2 = index.publish_schema(TextSample, version="1.0.0")
-        ...
-        ...     # Insert datasets of different types
+        ...     index.publish_schema(ImageSample, version="1.0.0")
         ...     index.insert_dataset(image_ds, name="images")
-        ...     index.insert_dataset(text_ds, name="texts")
-        ...
-        ...     # List all datasets (mixed types)
        ...     for entry in index.list_datasets():
        ...         print(f"{entry.name} -> {entry.schema_ref}")
    """
@@ -171,55 +143,75 @@ class AbstractIndex(Protocol):
     def data_store(self) -> Optional["AbstractDataStore"]:
         """Optional data store for reading/writing shards.
 
-        If present, ``load_dataset`` will use it for credential resolution
-        (e.g., S3 credentials from S3DataStore).
-
-        Returns:
-            AbstractDataStore instance, or None if this index doesn't have
-            an associated data store.
-
-        Note:
-            Not all index implementations provide a data_store. Use
-            ``hasattr(index, 'data_store') and index.data_store is not None``
-            for safe access.
+        If present, ``load_dataset`` uses it for credential resolution.
+        Not all implementations provide a data_store; check with
+        ``getattr(index, 'data_store', None)``.
         """
         ...
 
     # Dataset operations
 
-    def insert_dataset(
+    def write_samples(
         self,
-        ds: "Dataset",
+        samples: Iterable,
         *,
         name: str,
         schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
        **kwargs,
    ) -> IndexEntry:
-        """Insert a dataset into the index.
+        """Write samples and create an index entry in one step.
+
+        Serializes samples to WebDataset tar files, stores them via the
+        appropriate backend, and creates an index entry.
 
-        The sample type is inferred from ``ds.sample_type``. If schema_ref is not
-        provided, the schema may be auto-published based on the sample type.
+        For atmosphere targets, data is uploaded as PDS blobs by default
+        with size guards (50 MB per shard, 1 GB total).
 
         Args:
-            ds: The Dataset to register in the index (any sample type).
-            name: Human-readable name for the dataset.
-            schema_ref: Optional explicit schema reference. If not provided,
-                the schema may be auto-published or inferred from ds.sample_type.
-            **kwargs: Additional backend-specific options.
+            samples: Iterable of Packable samples. Must be non-empty.
+            name: Dataset name, optionally prefixed with target backend.
+            schema_ref: Optional schema reference.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
+            **kwargs: Backend-specific options (maxcount, description, etc.).
 
         Returns:
-            IndexEntry for the inserted dataset.
+            IndexEntry for the created dataset.
         """
         ...
 
-    def get_dataset(self, ref: str) -> IndexEntry:
-        """Get a dataset entry by name or reference.
+    def insert_dataset(
+        self,
+        ds: "Dataset",
+        *,
+        name: str,
+        schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
+        copy: bool = False,
+        **kwargs,
+    ) -> IndexEntry:
+        """Register an existing dataset in the index.
+
+        For atmosphere targets, local sources are uploaded via
+        *data_store* (defaults to PDS blobs). Credentialed sources
+        require ``copy=True``.
 
         Args:
-            ref: Dataset name, path, or full reference string.
+            ds: The Dataset to register.
+            name: Human-readable name.
+            schema_ref: Explicit schema ref; auto-published if ``None``.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
+            copy: Copy data to destination store even for remote sources.
+            **kwargs: Backend-specific options.
+        """
+        ...
 
-        Returns:
-            IndexEntry for the dataset.
+    def get_dataset(self, ref: str) -> IndexEntry:
+        """Get a dataset entry by name or reference.
 
         Raises:
             KeyError: If dataset not found.
@@ -242,33 +234,19 @@
     ) -> str:
         """Publish a schema for a sample type.
 
-        The sample_type is accepted as ``type`` rather than ``Type[Packable]`` to
-        support ``@packable``-decorated classes, which satisfy the Packable protocol
-        at runtime but cannot be statically verified by type checkers.
-
         Args:
-            sample_type: A Packable type (PackableSample subclass or @packable-decorated).
-                Validated at runtime via the @runtime_checkable Packable protocol.
-            version: Semantic version string for the schema.
-            **kwargs: Additional backend-specific options.
+            sample_type: A Packable type (``@packable``-decorated or subclass).
+            version: Semantic version string.
+            **kwargs: Backend-specific options.
 
         Returns:
-            Schema reference string:
-            - Local: 'local://schemas/{module.Class}@{version}'
-            - Atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+            Schema reference string (``local://...`` or ``at://...``).
         """
         ...
 
     def get_schema(self, ref: str) -> dict:
         """Get a schema record by reference.
 
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            Schema record as a dictionary with fields like 'name', 'version',
-            'fields', etc.
-
         Raises:
             KeyError: If schema not found.
         """
@@ -280,30 +258,15 @@
 
     def list_schemas(self) -> list[dict]: ...
 
     def decode_schema(self, ref: str) -> Type[Packable]:
-        """Reconstruct a Python Packable type from a stored schema.
-
-        This method enables loading datasets without knowing the sample type
-        ahead of time. The index retrieves the schema record and dynamically
-        generates a Packable class matching the schema definition.
-
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            A dynamically generated Packable class with fields matching
-            the schema definition. The class can be used with
-            ``Dataset[T]`` to load and iterate over samples.
+        """Reconstruct a Packable type from a stored schema.
 
         Raises:
             KeyError: If schema not found.
-            ValueError: If schema cannot be decoded (unsupported field types).
+            ValueError: If schema has unsupported field types.
 
         Examples:
-            >>> entry = index.get_dataset("my-dataset")
             >>> SampleType = index.decode_schema(entry.schema_ref)
             >>> ds = Dataset[SampleType](entry.data_urls[0])
-            >>> for sample in ds.ordered():
-            ...     print(sample)  # sample is instance of SampleType
         """
         ...
 
@@ -313,21 +276,14 @@
 
 
 class AbstractDataStore(Protocol):
-    """Protocol for data storage operations.
+    """Protocol for data storage backends (S3, local disk, PDS blobs).
 
-    This protocol abstracts over different storage backends for dataset data:
-    - S3DataStore: S3-compatible object storage
-    - PDSBlobStore: ATProto PDS blob storage (future)
-
-    The separation of index (metadata) from data store (actual files) allows
-    flexible deployment: local index with S3 storage, atmosphere index with
-    S3 storage, or atmosphere index with PDS blobs.
+    Separates index (metadata) from data store (shard files), enabling
+    flexible deployment combinations.
 
     Examples:
         >>> store = S3DataStore(credentials, bucket="my-bucket")
         >>> urls = store.write_shards(dataset, prefix="training/v1")
-        >>> print(urls)
-        ['s3://my-bucket/training/v1/shard-000000.tar', ...]
     """
 
     def write_shards(
@@ -341,28 +297,16 @@ class AbstractDataStore(Protocol):
 
         Args:
             ds: The Dataset to write.
-            prefix: Path prefix for the shards (e.g., 'datasets/mnist/v1').
-            **kwargs: Backend-specific options (e.g., maxcount for shard size).
+            prefix: Path prefix (e.g., ``'datasets/mnist/v1'``).
+            **kwargs: Backend-specific options (``maxcount``, ``maxsize``, etc.).
 
         Returns:
-            List of URLs for the written shards, suitable for use with
-            WebDataset or atdata.Dataset().
+            List of shard URLs suitable for ``atdata.Dataset()``.
         """
         ...
 
     def read_url(self, url: str) -> str:
-        """Resolve a storage URL for reading.
-
-        Some storage backends may need to transform URLs (e.g., signing S3 URLs
-        or resolving blob references). This method returns a URL that can be
-        used directly with WebDataset.
-
-        Args:
-            url: Storage URL to resolve.
-
-        Returns:
-            WebDataset-compatible URL for reading.
-        """
+        """Resolve a storage URL for reading (e.g., sign S3 URLs)."""
         ...
 
     def supports_streaming(self) -> bool: ...
@@ -374,77 +318,32 @@
 
 @runtime_checkable
 class DataSource(Protocol):
-    """Protocol for data sources that provide streams to Dataset.
+    """Protocol for data sources that stream shard data to Dataset.
 
-    A DataSource abstracts over different ways of accessing dataset shards:
-    - URLSource: Standard WebDataset-compatible URLs (http, https, pipe, gs, etc.)
-    - S3Source: S3-compatible storage with explicit credentials
-    - BlobSource: ATProto blob references (future)
-
-    The key method is ``shards()``, which yields (identifier, stream) pairs.
-    These are fed directly to WebDataset's tar_file_expander, bypassing URL
-    resolution entirely. This enables:
-    - Private S3 repos with credentials
-    - Custom endpoints (Cloudflare R2, MinIO)
-    - ATProto blob streaming
-    - Any other source that can provide file-like objects
+    Implementations (URLSource, S3Source, BlobSource) yield
+    ``(identifier, stream)`` pairs fed to WebDataset's tar expander,
+    bypassing URL resolution. This enables private S3, custom endpoints,
+    and ATProto blob streaming.
 
     Examples:
-        >>> source = S3Source(
-        ...     bucket="my-bucket",
-        ...     keys=["data-000.tar", "data-001.tar"],
-        ...     endpoint="https://r2.example.com",
-        ...     credentials=creds,
-        ... )
+        >>> source = S3Source(bucket="my-bucket", keys=["data-000.tar"])
         >>> ds = Dataset[MySample](source)
-        >>> for sample in ds.ordered():
-        ...     print(sample)
     """
 
     @property
     def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
-        """Lazily yield (identifier, stream) pairs for each shard.
-
-        The identifier is used for error messages and __url__ metadata.
-        The stream must be a file-like object that can be read by tarfile.
-
-        Yields:
-            Tuple of (shard_identifier, file_like_stream).
-
-        Examples:
-            >>> for shard_id, stream in source.shards:
-            ...     print(f"Processing {shard_id}")
-            ...     data = stream.read()
-        """
+        """Lazily yield ``(shard_id, stream)`` pairs for each shard."""
        ...

    def list_shards(self) -> list[str]:
-        """Get list of shard identifiers without opening streams.
-
-        Used for metadata queries like counting shards without actually
-        streaming data. Implementations should return identifiers that
-        match what shards would yield.
-
-        Returns:
-            List of shard identifier strings.
-        """
+        """Shard identifiers without opening streams."""
        ...

    def open_shard(self, shard_id: str) -> IO[bytes]:
-        """Open a single shard by its identifier.
-
-        This method enables random access to individual shards, which is
-        required for PyTorch DataLoader worker splitting. Each worker opens
-        only its assigned shards rather than iterating all shards.
-
-        Args:
-            shard_id: Shard identifier from list_shards().
-
-        Returns:
-            File-like stream for reading the shard.
+        """Open a single shard for random access (e.g., DataLoader splitting).
 
         Raises:
-            KeyError: If shard_id is not in list_shards().
+            KeyError: If *shard_id* is not in ``list_shards()``.
         """
         ...
 
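For reference, the trimmed DataSource contract can be satisfied by a very small class. A hypothetical in-memory implementation sketch (not part of the package):

    import io
    from typing import IO, Iterator

    class InMemorySource:
        """Hypothetical DataSource: serves pre-built tar shard bytes from memory."""

        def __init__(self, shard_bytes: dict[str, bytes]):
            self._shards = shard_bytes  # shard_id -> tar archive bytes

        @property
        def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
            for shard_id, data in self._shards.items():
                yield shard_id, io.BytesIO(data)

        def list_shards(self) -> list[str]:
            return list(self._shards)

        def open_shard(self, shard_id: str) -> IO[bytes]:
            if shard_id not in self._shards:
                raise KeyError(shard_id)
            return io.BytesIO(self._shards[shard_id])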
atdata/_schema_codec.py CHANGED
@@ -284,7 +284,7 @@ def generate_stub(schema: dict) -> str:
         String content for a .pyi stub file.
 
     Examples:
-        >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
        >>> stub_content = generate_stub(schema.to_dict())
        >>> # Save to a stubs directory configured in your IDE
        >>> with open("stubs/my_sample.pyi", "w") as f:
@@ -360,7 +360,7 @@ def generate_module(schema: dict) -> str:
         String content for a .py module file.
 
     Examples:
-        >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
        >>> module_content = generate_module(schema.to_dict())
        >>> # The module can be imported after being saved
        """