atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +11 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +46 -1
- atdata/_logging.py +43 -0
- atdata/_protocols.py +81 -182
- atdata/_schema_codec.py +2 -2
- atdata/_sources.py +24 -4
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +60 -21
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +73 -245
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +60 -53
- atdata/atmosphere/records.py +291 -100
- atdata/atmosphere/schema.py +91 -65
- atdata/atmosphere/store.py +68 -66
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +266 -47
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_entry.py +6 -2
- atdata/{local → index}/_index.py +617 -72
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +127 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
- atdata/lexicons/ac.foundation.dataset.record.json +117 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/promote.py +14 -10
- atdata/repository.py +66 -16
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +131 -0
- atdata/{local → stores}/_s3.py +134 -112
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
- atdata-0.3.2b1.dist-info/RECORD +71 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/__init__.py
CHANGED
@@ -44,6 +44,7 @@ from .dataset import (
     SampleBatch as SampleBatch,
     Dataset as Dataset,
     packable as packable,
+    write_samples as write_samples,
 )
 
 from .lens import (
@@ -89,6 +90,7 @@ from ._schema_codec import (
 from ._logging import (
     configure_logging as configure_logging,
     get_logger as get_logger,
+    log_operation as log_operation,
 )
 
 from .repository import (
@@ -96,6 +98,14 @@ from .repository import (
     create_repository as create_repository,
 )
 
+from .index import (
+    Index as Index,
+)
+
+from .stores import (
+    LocalDiskStore as LocalDiskStore,
+)
+
 from ._cid import (
     generate_cid as generate_cid,
     verify_cid as verify_cid,
@@ -112,6 +122,7 @@ from .manifest import (
     ManifestWriter as ManifestWriter,
     QueryExecutor as QueryExecutor,
     SampleLocation as SampleLocation,
+    query_fields as query_fields,
 )
 
 # ATProto integration (lazy import to avoid requiring atproto package)
atdata/_cid.py
CHANGED
@@ -116,29 +116,8 @@ def verify_cid(cid: str, data: Any) -> bool:
     return cid == expected_cid
 
 
-def parse_cid(cid: str) -> dict:
-    """Parse a CID string into its components.
-
-    Args:
-        cid: CID string to parse.
-
-    Returns:
-        Dictionary with 'version', 'codec', and 'hash' keys.
-        The 'hash' value is itself a dict with 'code', 'size', and 'digest'.
-
-    Examples:
-        >>> info = parse_cid('bafyrei...')
-        >>> info['version']
-        1
-        >>> info['codec']
-        113  # 0x71 = dag-cbor
-    """
-    return libipld.decode_cid(cid)
-
-
 __all__ = [
     "generate_cid",
     "generate_cid_from_bytes",
     "verify_cid",
-    "parse_cid",
 ]
atdata/_helpers.py
CHANGED
@@ -65,10 +65,22 @@ def bytes_to_array(b: bytes) -> np.ndarray:
         return np.load(BytesIO(b), allow_pickle=True)
 
     # Compact format: dtype_len(1B) + dtype_str + ndim(1B) + shape(ndim×8B) + data
+    if len(b) < 2:
+        raise ValueError(f"Array buffer too short ({len(b)} bytes): need at least 2")
     dlen = b[0]
+    min_header = 2 + dlen  # dtype_len + dtype_str + ndim
+    if len(b) < min_header:
+        raise ValueError(
+            f"Array buffer too short ({len(b)} bytes): need at least {min_header} for header"
+        )
     dtype = np.dtype(b[1 : 1 + dlen].decode())
     ndim = b[1 + dlen]
     offset = 2 + dlen
+    min_with_shape = offset + ndim * 8
+    if len(b) < min_with_shape:
+        raise ValueError(
+            f"Array buffer too short ({len(b)} bytes): need at least {min_with_shape} for shape"
+        )
     shape = struct.unpack_from(f"<{ndim}q", b, offset)
     offset += ndim * 8
     return np.frombuffer(b, dtype=dtype, offset=offset).reshape(shape).copy()
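
The new guards validate the compact array encoding described in the comment (dtype_len byte, dtype string, ndim byte, little-endian int64 shape, raw data). A hedged sketch of that layout and the new failure mode; bytes_to_array is a private helper, and the assumption here is that a buffer without the .npy magic takes the compact branch:

    # Hand-build a buffer in the compact layout, then truncate it to trigger
    # the new ValueError guards. Illustrative only: atdata._helpers is private.
    import struct

    import numpy as np

    from atdata._helpers import bytes_to_array

    arr = np.arange(6, dtype=np.float32).reshape(2, 3)
    dtype_str = arr.dtype.str.encode()              # e.g. b"<f4"
    buf = (
        bytes([len(dtype_str)])                     # dtype_len (1 byte)
        + dtype_str                                 # dtype string
        + bytes([arr.ndim])                         # ndim (1 byte)
        + struct.pack(f"<{arr.ndim}q", *arr.shape)  # shape, little-endian int64
        + arr.tobytes()                             # raw data
    )
    assert np.array_equal(bytes_to_array(buf), arr)

    try:
        bytes_to_array(buf[:3])                     # header cut off mid-dtype
    except ValueError as exc:
        print(exc)  # "Array buffer too short (3 bytes): need at least 5 for header"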
atdata/_hf_api.py
CHANGED
@@ -32,6 +32,7 @@ import re
 import threading
 from pathlib import Path
 from typing import (
+    Any,
     TYPE_CHECKING,
     Generic,
     Mapping,
@@ -65,7 +66,7 @@ def get_default_index() -> "Index": # noqa: F821
     """Get or create the module-level default Index.
 
     The default Index uses Redis for local storage (backwards-compatible
-    default) and an anonymous
+    default) and an anonymous Atmosphere for read-only public data
     resolution.
 
     The default is created lazily on first access and cached for the
@@ -189,6 +190,37 @@ class DatasetDict(Generic[ST], dict):
         """
         return {name: len(ds.list_shards()) for name, ds in self.items()}
 
+    # Methods proxied to the sole Dataset when only one split exists.
+    _DATASET_METHODS = frozenset(
+        {
+            "ordered",
+            "shuffled",
+            "as_type",
+            "list_shards",
+            "head",
+        }
+    )
+
+    def __getattr__(self, name: str) -> Any:
+        """Proxy common Dataset methods when this dict has exactly one split.
+
+        When a ``DatasetDict`` contains a single split, calling iteration
+        methods like ``.ordered()`` or ``.shuffled()`` is forwarded to the
+        contained ``Dataset`` for convenience. Multi-split dicts raise
+        ``AttributeError`` with a hint to select a split explicitly.
+        """
+        if name in self._DATASET_METHODS:
+            if len(self) == 1:
+                return getattr(next(iter(self.values())), name)
+            splits = ", ".join(f"'{k}'" for k in self.keys())
+            raise AttributeError(
+                f"'{type(self).__name__}' has {len(self)} splits ({splits}). "
+                f"Select one first, e.g. ds_dict['{next(iter(self.keys()))}'].{name}()"
+            )
+        raise AttributeError(
+            f"'{type(self).__name__}' object has no attribute '{name}'"
+        )
+
 
 ##
 # Path resolution utilities
@@ -682,12 +714,23 @@ def load_dataset(
         >>> index = Index()
         >>> ds = load_dataset("@local/my-dataset", index=index, split="train")
     """
+    from ._logging import get_logger
+
+    log = get_logger()
+    log.info(
+        "load_dataset: path=%s, split=%s, sample_type=%s",
+        path,
+        split,
+        sample_type.__name__ if sample_type is not None else "None",
+    )
+
     # Handle @handle/dataset indexed path resolution
     if _is_indexed_path(path):
         if index is None:
             index = get_default_index()
 
         source, schema_ref = _resolve_indexed_path(path, index)
+        log.debug("load_dataset: resolved indexed path, schema_ref=%s", schema_ref)
 
         # Resolve sample_type from schema if not provided
         resolved_type: Type = (
@@ -714,6 +757,8 @@ def load_dataset(
     if not splits_shards:
         raise FileNotFoundError(f"No data files found at path: {path}")
 
+    log.debug("load_dataset: resolved %d split(s) from path", len(splits_shards))
+
     # Build Dataset for each split
     datasets: dict[str, Dataset] = {}
     for split_name, shards in splits_shards.items():
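
The single-split proxy changes how a loaded DatasetDict is used. A hedged sketch, assuming load_dataset is re-exported at the package root and returns a DatasetDict when split is omitted (the proxy logic itself is verbatim in the diff); the dataset and split names are made up:

    from atdata import load_dataset  # assumed re-export; defined in atdata/_hf_api.py

    ds_dict = load_dataset("@local/my-dataset")  # hypothetical dataset name

    if len(ds_dict) == 1:
        # One split: proxied methods (.ordered(), .shuffled(), .head(), ...)
        # forward to the single contained Dataset.
        for sample in ds_dict.ordered():
            print(sample)
    else:
        # Several splits: the proxy raises AttributeError with a hint,
        # so select a split explicitly.
        for sample in ds_dict["train"].ordered():
            print(sample)

Separately, the get_logger() calls above mean load_dataset now reports path and split resolution at info/debug level through whatever logger configure_logging installed.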
atdata/_logging.py
CHANGED
@@ -22,7 +22,10 @@ custom logger implementations.
 
 from __future__ import annotations
 
+import contextlib
 import logging
+import time
+from collections.abc import Generator
 from typing import Any, Protocol, runtime_checkable
 
 
@@ -68,3 +71,43 @@ def get_logger() -> LoggerProtocol:
     whatever was last set via :func:`configure_logging`.
     """
     return _logger
+
+
+@contextlib.contextmanager
+def log_operation(op_name: str, **context: Any) -> Generator[None, None, None]:
+    """Log the start, completion, and duration of an operation.
+
+    Emits an ``info`` message on entry and on successful completion
+    (with elapsed time), or an ``error`` message if an exception
+    propagates out.
+
+    Args:
+        op_name: Short label for the operation (e.g. ``"write_samples"``).
+        **context: Arbitrary key-value pairs included in every log message.
+
+    Examples:
+        >>> with log_operation("write_samples", shard_count=10):
+        ...     do_work()
+    """
+    log = get_logger()
+    ctx_str = ", ".join(f"{k}={v}" for k, v in context.items())
+    if ctx_str:
+        log.info("%s: started (%s)", op_name, ctx_str)
+    else:
+        log.info("%s: started", op_name)
+    t0 = time.monotonic()
+    try:
+        yield
+    except Exception:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.error("%s: failed after %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.error("%s: failed after %.2fs", op_name, elapsed)
+        raise
+    else:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.info("%s: completed in %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.info("%s: completed in %.2fs", op_name, elapsed)
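
A hedged usage sketch for the new log_operation context manager, expanding the docstring example above; calling configure_logging with no arguments is an assumption, since its parameters are not part of this diff:

    import time

    from atdata import configure_logging, log_operation

    configure_logging()  # assumed no-arg default setup

    with log_operation("toy_operation", shard_count=10):
        time.sleep(0.1)
    # Logs roughly:
    #   toy_operation: started (shard_count=10)
    #   toy_operation: completed in 0.10s (shard_count=10)

    try:
        with log_operation("toy_operation"):
            raise RuntimeError("boom")
    except RuntimeError:
        pass  # logged as "toy_operation: failed after 0.00s" and re-raised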
atdata/_protocols.py
CHANGED
@@ -1,37 +1,25 @@
 """Protocol definitions for atdata index and storage abstractions.
 
-
-
-
-The key insight is that both local and atmosphere implementations solve the
-same problem: indexed dataset storage with external data URLs. These protocols
-formalize that common interface.
-
-Note:
-    Protocol methods use ``...`` (Ellipsis) as the body per PEP 544. This is
-    the standard Python syntax for Protocol definitions - these are interface
-    specifications, not stub implementations. Concrete classes (Index,
-    AtmosphereIndex, etc.) provide the actual implementations.
+Defines the abstract protocols that enable interchangeable index backends
+(local SQLite/Redis vs ATProto PDS) and data stores (S3, local disk, PDS blobs).
 
 Protocols:
-    Packable: Structural interface for packable sample types
+    Packable: Structural interface for packable sample types
     IndexEntry: Common interface for dataset index entries
     AbstractIndex: Protocol for index operations (schemas, datasets, lenses)
     AbstractDataStore: Protocol for data storage operations
+    DataSource: Protocol for streaming shard data
 
 Examples:
     >>> def process_datasets(index: AbstractIndex) -> None:
     ...     for entry in index.list_datasets():
     ...         print(f"{entry.name}: {entry.data_urls}")
-    ...
-    >>> # Works with either Index or AtmosphereIndex
-    >>> process_datasets(local_index)
-    >>> process_datasets(atmosphere_index)
 """
 
 from typing import (
     IO,
     Any,
+    Iterable,
     Iterator,
     Optional,
     Protocol,
@@ -115,7 +103,7 @@ class IndexEntry(Protocol):
         """Schema reference string.
 
         Local: ``local://schemas/{module.Class}@{version}``
-        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.
+        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.schema/...``
         """
         ...
 
@@ -137,32 +125,16 @@ class IndexEntry(Protocol):
 
 
 class AbstractIndex(Protocol):
-    """Protocol for index operations
-
-    This protocol defines the common interface for managing dataset metadata:
-    - Publishing and retrieving schemas
-    - Inserting and listing datasets
-    - (Future) Publishing and retrieving lenses
-
-    A single index can hold datasets of many different sample types. The sample
-    type is tracked via schema references, not as a generic parameter on the index.
+    """Protocol for index operations — implemented by Index and AtmosphereIndex.
 
-
-
-
-    If present, ``load_dataset`` will use it for S3 credential resolution.
+    Manages dataset metadata: publishing/retrieving schemas, inserting/listing
+    datasets. A single index holds datasets of many sample types, tracked via
+    schema references.
 
     Examples:
         >>> def publish_and_list(index: AbstractIndex) -> None:
-        ...
-        ...     schema1 = index.publish_schema(ImageSample, version="1.0.0")
-        ...     schema2 = index.publish_schema(TextSample, version="1.0.0")
-        ...
-        ...     # Insert datasets of different types
+        ...     index.publish_schema(ImageSample, version="1.0.0")
         ...     index.insert_dataset(image_ds, name="images")
-        ...     index.insert_dataset(text_ds, name="texts")
-        ...
-        ...     # List all datasets (mixed types)
         ...     for entry in index.list_datasets():
         ...         print(f"{entry.name} -> {entry.schema_ref}")
     """
@@ -171,55 +143,75 @@ class AbstractIndex(Protocol):
     def data_store(self) -> Optional["AbstractDataStore"]:
         """Optional data store for reading/writing shards.
 
-        If present, ``load_dataset``
-
-
-        Returns:
-            AbstractDataStore instance, or None if this index doesn't have
-            an associated data store.
-
-        Note:
-            Not all index implementations provide a data_store. Use
-            ``hasattr(index, 'data_store') and index.data_store is not None``
-            for safe access.
+        If present, ``load_dataset`` uses it for credential resolution.
+        Not all implementations provide a data_store; check with
+        ``getattr(index, 'data_store', None)``.
         """
         ...
 
     # Dataset operations
 
-    def
+    def write_samples(
         self,
-
+        samples: Iterable,
         *,
         name: str,
         schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
         **kwargs,
     ) -> IndexEntry:
-        """
+        """Write samples and create an index entry in one step.
+
+        Serializes samples to WebDataset tar files, stores them via the
+        appropriate backend, and creates an index entry.
 
-
-
+        For atmosphere targets, data is uploaded as PDS blobs by default
+        with size guards (50 MB per shard, 1 GB total).
 
         Args:
-
-            name:
-            schema_ref: Optional
-
-
+            samples: Iterable of Packable samples. Must be non-empty.
+            name: Dataset name, optionally prefixed with target backend.
+            schema_ref: Optional schema reference.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
+            **kwargs: Backend-specific options (maxcount, description, etc.).
 
         Returns:
-            IndexEntry for the
+            IndexEntry for the created dataset.
         """
         ...
 
-    def
-
+    def insert_dataset(
+        self,
+        ds: "Dataset",
+        *,
+        name: str,
+        schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
+        copy: bool = False,
+        **kwargs,
+    ) -> IndexEntry:
+        """Register an existing dataset in the index.
+
+        For atmosphere targets, local sources are uploaded via
+        *data_store* (defaults to PDS blobs). Credentialed sources
+        require ``copy=True``.
 
         Args:
-
+            ds: The Dataset to register.
+            name: Human-readable name.
+            schema_ref: Explicit schema ref; auto-published if ``None``.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
+            copy: Copy data to destination store even for remote sources.
+            **kwargs: Backend-specific options.
+        """
+        ...
 
-
-
+    def get_dataset(self, ref: str) -> IndexEntry:
+        """Get a dataset entry by name or reference.
 
         Raises:
             KeyError: If dataset not found.
@@ -242,33 +234,19 @@ class AbstractIndex(Protocol):
     ) -> str:
         """Publish a schema for a sample type.
 
-        The sample_type is accepted as ``type`` rather than ``Type[Packable]`` to
-        support ``@packable``-decorated classes, which satisfy the Packable protocol
-        at runtime but cannot be statically verified by type checkers.
-
         Args:
-            sample_type: A Packable type (
-
-
-            **kwargs: Additional backend-specific options.
+            sample_type: A Packable type (``@packable``-decorated or subclass).
+            version: Semantic version string.
+            **kwargs: Backend-specific options.
 
         Returns:
-            Schema reference string
-            - Local: 'local://schemas/{module.Class}@{version}'
-            - Atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+            Schema reference string (``local://...`` or ``at://...``).
         """
         ...
 
     def get_schema(self, ref: str) -> dict:
         """Get a schema record by reference.
 
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            Schema record as a dictionary with fields like 'name', 'version',
-            'fields', etc.
-
         Raises:
             KeyError: If schema not found.
         """
@@ -280,30 +258,15 @@ class AbstractIndex(Protocol):
     def list_schemas(self) -> list[dict]: ...
 
     def decode_schema(self, ref: str) -> Type[Packable]:
-        """Reconstruct a
-
-        This method enables loading datasets without knowing the sample type
-        ahead of time. The index retrieves the schema record and dynamically
-        generates a Packable class matching the schema definition.
-
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            A dynamically generated Packable class with fields matching
-            the schema definition. The class can be used with
-            ``Dataset[T]`` to load and iterate over samples.
+        """Reconstruct a Packable type from a stored schema.
 
         Raises:
             KeyError: If schema not found.
-            ValueError: If schema
+            ValueError: If schema has unsupported field types.
 
         Examples:
-            >>> entry = index.get_dataset("my-dataset")
             >>> SampleType = index.decode_schema(entry.schema_ref)
             >>> ds = Dataset[SampleType](entry.data_urls[0])
-            >>> for sample in ds.ordered():
-            ...     print(sample)  # sample is instance of SampleType
         """
         ...
 
@@ -313,21 +276,14 @@ class AbstractIndex(Protocol):
 
 
 class AbstractDataStore(Protocol):
-    """Protocol for data storage
+    """Protocol for data storage backends (S3, local disk, PDS blobs).
 
-
-
-    - PDSBlobStore: ATProto PDS blob storage (future)
-
-    The separation of index (metadata) from data store (actual files) allows
-    flexible deployment: local index with S3 storage, atmosphere index with
-    S3 storage, or atmosphere index with PDS blobs.
+    Separates index (metadata) from data store (shard files), enabling
+    flexible deployment combinations.
 
     Examples:
         >>> store = S3DataStore(credentials, bucket="my-bucket")
        >>> urls = store.write_shards(dataset, prefix="training/v1")
-        >>> print(urls)
-        ['s3://my-bucket/training/v1/shard-000000.tar', ...]
     """
 
     def write_shards(
@@ -341,28 +297,16 @@ class AbstractDataStore(Protocol):
 
         Args:
             ds: The Dataset to write.
-            prefix: Path prefix
-            **kwargs: Backend-specific options (
+            prefix: Path prefix (e.g., ``'datasets/mnist/v1'``).
+            **kwargs: Backend-specific options (``maxcount``, ``maxsize``, etc.).
 
         Returns:
-            List of URLs
-            WebDataset or atdata.Dataset().
+            List of shard URLs suitable for ``atdata.Dataset()``.
         """
         ...
 
     def read_url(self, url: str) -> str:
-        """Resolve a storage URL for reading.
-
-        Some storage backends may need to transform URLs (e.g., signing S3 URLs
-        or resolving blob references). This method returns a URL that can be
-        used directly with WebDataset.
-
-        Args:
-            url: Storage URL to resolve.
-
-        Returns:
-            WebDataset-compatible URL for reading.
-        """
+        """Resolve a storage URL for reading (e.g., sign S3 URLs)."""
         ...
 
     def supports_streaming(self) -> bool: ...
@@ -374,77 +318,32 @@ class AbstractDataStore(Protocol):
 
 @runtime_checkable
 class DataSource(Protocol):
-    """Protocol for data sources that
+    """Protocol for data sources that stream shard data to Dataset.
 
-
-
-
-
-
-    The key method is ``shards()``, which yields (identifier, stream) pairs.
-    These are fed directly to WebDataset's tar_file_expander, bypassing URL
-    resolution entirely. This enables:
-    - Private S3 repos with credentials
-    - Custom endpoints (Cloudflare R2, MinIO)
-    - ATProto blob streaming
-    - Any other source that can provide file-like objects
+    Implementations (URLSource, S3Source, BlobSource) yield
+    ``(identifier, stream)`` pairs fed to WebDataset's tar expander,
+    bypassing URL resolution. This enables private S3, custom endpoints,
+    and ATProto blob streaming.
 
     Examples:
-        >>> source = S3Source(
-        ...     bucket="my-bucket",
-        ...     keys=["data-000.tar", "data-001.tar"],
-        ...     endpoint="https://r2.example.com",
-        ...     credentials=creds,
-        ... )
+        >>> source = S3Source(bucket="my-bucket", keys=["data-000.tar"])
         >>> ds = Dataset[MySample](source)
-        >>> for sample in ds.ordered():
-        ...     print(sample)
     """
 
     @property
     def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
-        """Lazily yield (
-
-        The identifier is used for error messages and __url__ metadata.
-        The stream must be a file-like object that can be read by tarfile.
-
-        Yields:
-            Tuple of (shard_identifier, file_like_stream).
-
-        Examples:
-            >>> for shard_id, stream in source.shards:
-            ...     print(f"Processing {shard_id}")
-            ...     data = stream.read()
-        """
+        """Lazily yield ``(shard_id, stream)`` pairs for each shard."""
        ...
 
     def list_shards(self) -> list[str]:
-        """
-
-        Used for metadata queries like counting shards without actually
-        streaming data. Implementations should return identifiers that
-        match what shards would yield.
-
-        Returns:
-            List of shard identifier strings.
-        """
+        """Shard identifiers without opening streams."""
         ...
 
     def open_shard(self, shard_id: str) -> IO[bytes]:
-        """Open a single shard
-
-        This method enables random access to individual shards, which is
-        required for PyTorch DataLoader worker splitting. Each worker opens
-        only its assigned shards rather than iterating all shards.
-
-        Args:
-            shard_id: Shard identifier from list_shards().
-
-        Returns:
-            File-like stream for reading the shard.
+        """Open a single shard for random access (e.g., DataLoader splitting).
 
         Raises:
-            KeyError: If shard_id is not in list_shards()
+            KeyError: If *shard_id* is not in ``list_shards()``.
         """
         ...
 
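
Taken together, the slimmed-down protocol now reads as a concrete call surface. A hedged sketch of code written against AbstractIndex, using only the signatures visible in this diff; the @packable sample definition and the dataset name are assumptions for illustration:

    # Illustrative only: the write_samples/publish_schema signatures come from
    # the protocol above; how @packable classes are declared and constructed
    # is an assumption, and "demo" is a made-up dataset name.
    from typing import Iterable

    from atdata import Index, packable
    from atdata._protocols import AbstractIndex, IndexEntry


    @packable
    class TextSample:
        text: str


    def publish(index: AbstractIndex, samples: Iterable[TextSample]) -> IndexEntry:
        # Any AbstractIndex implementation works here (Index, AtmosphereIndex, ...).
        index.publish_schema(TextSample, version="1.0.0")
        return index.write_samples(samples, name="demo")


    entry = publish(Index(), [TextSample(text="hello")])
    print(entry.name, entry.schema_ref)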
atdata/_schema_codec.py
CHANGED
@@ -284,7 +284,7 @@ def generate_stub(schema: dict) -> str:
         String content for a .pyi stub file.
 
     Examples:
-        >>> schema = index.get_schema("atdata://local/
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
         >>> stub_content = generate_stub(schema.to_dict())
         >>> # Save to a stubs directory configured in your IDE
         >>> with open("stubs/my_sample.pyi", "w") as f:
@@ -360,7 +360,7 @@ def generate_module(schema: dict) -> str:
         String content for a .py module file.
 
     Examples:
-        >>> schema = index.get_schema("atdata://local/
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
        >>> module_content = generate_module(schema.to_dict())
         >>> # The module can be imported after being saved
     """