atdata 0.3.0b1-py3-none-any.whl → 0.3.2b1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +11 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +46 -1
- atdata/_logging.py +43 -0
- atdata/_protocols.py +81 -182
- atdata/_schema_codec.py +2 -2
- atdata/_sources.py +24 -4
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +60 -21
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +73 -245
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +60 -53
- atdata/atmosphere/records.py +291 -100
- atdata/atmosphere/schema.py +91 -65
- atdata/atmosphere/store.py +68 -66
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +266 -47
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_entry.py +6 -2
- atdata/{local → index}/_index.py +617 -72
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +127 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
- atdata/lexicons/ac.foundation.dataset.record.json +117 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/promote.py +14 -10
- atdata/repository.py +66 -16
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +131 -0
- atdata/{local → stores}/_s3.py +134 -112
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
- atdata-0.3.2b1.dist-info/RECORD +71 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/atmosphere/records.py
CHANGED
@@ -8,13 +8,18 @@ and loading them back. Dataset records are published as
 from typing import Type, TypeVar, Optional
 import msgpack
 
-from .client import
+from .client import Atmosphere
 from .schema import SchemaPublisher
-from ._types import
-
-
-
-
+from ._types import AtUri, LEXICON_NAMESPACE
+from ._lexicon_types import (
+    LexDatasetRecord,
+    StorageHttp,
+    StorageS3,
+    StorageBlobs,
+    HttpShardEntry,
+    S3ShardEntry,
+    BlobEntry,
+    ShardChecksum,
 )
 
 # Import for type checking only to avoid circular imports
@@ -27,19 +32,23 @@ if TYPE_CHECKING:
 ST = TypeVar("ST", bound="Packable")
 
 
+def _placeholder_checksum() -> ShardChecksum:
+    """Return an empty checksum placeholder for shards without pre-computed digests."""
+    return ShardChecksum(algorithm="none", digest="")
+
+
 class DatasetPublisher:
     """Publishes dataset index records to ATProto.
 
     This class creates dataset records that reference a schema and point to
-
+    HTTP storage, S3 storage, or ATProto blobs.
 
     Examples:
-        >>> dataset = atdata.Dataset[MySample]("
+        >>> dataset = atdata.Dataset[MySample]("https://example.com/data-000000.tar")
         >>>
-        >>>
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
        >>>
-        >>> publisher = DatasetPublisher(
+        >>> publisher = DatasetPublisher(atmo)
         >>> uri = publisher.publish(
         ...     dataset,
         ...     name="My Training Data",
@@ -48,15 +57,49 @@ class DatasetPublisher:
         ... )
     """
 
-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the dataset publisher.
 
         Args:
-            client: Authenticated
+            client: Authenticated Atmosphere instance.
         """
         self.client = client
         self._schema_publisher = SchemaPublisher(client)
 
+    def _create_record(
+        self,
+        storage: "StorageHttp | StorageS3 | StorageBlobs",
+        *,
+        name: str,
+        schema_uri: str,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Build a LexDatasetRecord and publish it to ATProto."""
+        metadata_bytes: Optional[bytes] = None
+        if metadata is not None:
+            metadata_bytes = msgpack.packb(metadata)
+
+        dataset_record = LexDatasetRecord(
+            name=name,
+            schema_ref=schema_uri,
+            storage=storage,
+            description=description,
+            tags=tags or [],
+            license=license,
+            metadata=metadata_bytes,
+        )
+
+        return self.client.create_record(
+            collection=f"{LEXICON_NAMESPACE}.record",
+            record=dataset_record.to_record(),
+            rkey=rkey,
+            validate=False,
+        )
+
     def publish(
         self,
         dataset: "Dataset[ST]",
@@ -91,46 +134,34 @@ class DatasetPublisher:
         Raises:
             ValueError: If schema_uri is not provided and auto_publish_schema is False.
         """
-        # Ensure we have a schema reference
         if schema_uri is None:
             if not auto_publish_schema:
                 raise ValueError(
                     "schema_uri is required when auto_publish_schema=False"
                 )
-            # Auto-publish the schema
             schema_uri_obj = self._schema_publisher.publish(
                 dataset.sample_type,
                 version=schema_version,
             )
             schema_uri = str(schema_uri_obj)
 
-
-        storage =
-
-
+        shard_urls = dataset.list_shards()
+        storage = StorageHttp(
+            shards=[
+                HttpShardEntry(url=url, checksum=_placeholder_checksum())
+                for url in shard_urls
+            ]
         )
 
-
-
-        if dataset.metadata is not None:
-            metadata_bytes = msgpack.packb(dataset.metadata)
-
-        dataset_record = DatasetRecord(
+        return self._create_record(
+            storage,
             name=name,
-
-            storage=storage,
+            schema_uri=schema_uri,
             description=description,
-            tags=tags
+            tags=tags,
             license=license,
-            metadata=
-        )
-
-        # Publish to ATProto
-        return self.client.create_record(
-            collection=f"{LEXICON_NAMESPACE}.record",
-            record=dataset_record.to_record(),
+            metadata=dataset.metadata,
             rkey=rkey,
-            validate=False,
         )
 
     def publish_with_urls(
@@ -143,50 +174,162 @@ class DatasetPublisher:
         tags: Optional[list[str]] = None,
         license: Optional[str] = None,
         metadata: Optional[dict] = None,
+        checksums: Optional[list[ShardChecksum]] = None,
         rkey: Optional[str] = None,
     ) -> AtUri:
-        """Publish a dataset record with explicit URLs.
+        """Publish a dataset record with explicit HTTP URLs.
 
         This method allows publishing a dataset record without having a
         Dataset object, useful for registering existing WebDataset files.
+        Each URL should be an individual shard (no brace notation).
 
         Args:
-            urls: List of
+            urls: List of individual shard URLs.
             schema_uri: AT URI of the schema record.
             name: Human-readable dataset name.
             description: Human-readable description.
             tags: Searchable tags for discovery.
             license: SPDX license identifier.
             metadata: Arbitrary metadata dictionary.
+            checksums: Per-shard checksums. If not provided, empty checksums
+                are used.
             rkey: Optional explicit record key.
 
         Returns:
             The AT URI of the created dataset record.
         """
-
-
-
+        if checksums and len(checksums) != len(urls):
+            raise ValueError(
+                f"checksums length ({len(checksums)}) must match "
+                f"urls length ({len(urls)})"
+            )
+
+        shards = [
+            HttpShardEntry(
+                url=url,
+                checksum=checksums[i] if checksums else _placeholder_checksum(),
+            )
+            for i, url in enumerate(urls)
+        ]
+
+        return self._create_record(
+            StorageHttp(shards=shards),
+            name=name,
+            schema_uri=schema_uri,
+            description=description,
+            tags=tags,
+            license=license,
+            metadata=metadata,
+            rkey=rkey,
         )
 
-
-
-
+    def publish_with_s3(
+        self,
+        bucket: str,
+        keys: list[str],
+        schema_uri: str,
+        *,
+        name: str,
+        region: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        checksums: Optional[list[ShardChecksum]] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Publish a dataset record with S3 storage.
 
-
+        Args:
+            bucket: S3 bucket name.
+            keys: List of S3 object keys for shard files.
+            schema_uri: AT URI of the schema record.
+            name: Human-readable dataset name.
+            region: AWS region (e.g., 'us-east-1').
+            endpoint: Custom S3-compatible endpoint URL.
+            description: Human-readable description.
+            tags: Searchable tags for discovery.
+            license: SPDX license identifier.
+            metadata: Arbitrary metadata dictionary.
+            checksums: Per-shard checksums.
+            rkey: Optional explicit record key.
+
+        Returns:
+            The AT URI of the created dataset record.
+        """
+        if checksums and len(checksums) != len(keys):
+            raise ValueError(
+                f"checksums length ({len(checksums)}) must match "
+                f"keys length ({len(keys)})"
+            )
+
+        shards = [
+            S3ShardEntry(
+                key=key,
+                checksum=checksums[i] if checksums else _placeholder_checksum(),
+            )
+            for i, key in enumerate(keys)
+        ]
+
+        return self._create_record(
+            StorageS3(bucket=bucket, shards=shards, region=region, endpoint=endpoint),
             name=name,
-
-            storage=storage,
+            schema_uri=schema_uri,
             description=description,
-            tags=tags
+            tags=tags,
             license=license,
-            metadata=
+            metadata=metadata,
+            rkey=rkey,
         )
 
-
-
-
+    def publish_with_blob_refs(
+        self,
+        blob_refs: list[dict],
+        schema_uri: str,
+        *,
+        name: str,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Publish a dataset record with pre-uploaded blob references.
+
+        Unlike ``publish_with_blobs`` (which takes raw bytes and uploads them),
+        this method accepts blob ref dicts that have already been uploaded to
+        the PDS. The refs are embedded directly in the record so the PDS
+        retains the blobs.
+
+        Args:
+            blob_refs: List of blob reference dicts as returned by
+                ``Atmosphere.upload_blob()``. Each dict must contain
+                ``$type``, ``ref`` (with ``$link``), ``mimeType``, and ``size``.
+            schema_uri: AT URI of the schema record.
+            name: Human-readable dataset name.
+            description: Human-readable description.
+            tags: Searchable tags for discovery.
+            license: SPDX license identifier.
+            metadata: Arbitrary metadata dictionary.
+            rkey: Optional explicit record key.
+
+        Returns:
+            The AT URI of the created dataset record.
+        """
+        blob_entries = [
+            BlobEntry(blob=ref, checksum=_placeholder_checksum()) for ref in blob_refs
+        ]
+
+        return self._create_record(
+            StorageBlobs(blobs=blob_entries),
+            name=name,
+            schema_uri=schema_uri,
+            description=description,
+            tags=tags,
+            license=license,
+            metadata=metadata,
             rkey=rkey,
-            validate=False,
         )
 
     def publish_with_blobs(
@@ -226,37 +369,28 @@ class DatasetPublisher:
         Blobs are only retained by the PDS when referenced in a committed
         record. This method handles that automatically.
         """
-
-        blob_refs = []
+        blob_entries = []
         for blob_data in blobs:
             blob_ref = self.client.upload_blob(blob_data, mime_type=mime_type)
-
-
-            # Create storage location with blob references
-        storage = StorageLocation(
-            kind="blobs",
-            blob_refs=blob_refs,
-        )
+            import hashlib
 
-
-
-
+            digest = hashlib.sha256(blob_data).hexdigest()
+            blob_entries.append(
+                BlobEntry(
+                    blob=blob_ref,
+                    checksum=ShardChecksum(algorithm="sha256", digest=digest),
+                )
+            )
 
-
+        return self._create_record(
+            StorageBlobs(blobs=blob_entries),
             name=name,
-
-            storage=storage,
+            schema_uri=schema_uri,
             description=description,
-            tags=tags
+            tags=tags,
             license=license,
-            metadata=
-        )
-
-        return self.client.create_record(
-            collection=f"{LEXICON_NAMESPACE}.record",
-            record=dataset_record.to_record(),
+            metadata=metadata,
             rkey=rkey,
-            validate=False,
         )
 
 
@@ -268,8 +402,8 @@ class DatasetLoader:
     Python class for the sample type.
 
     Examples:
-        >>>
-        >>> loader = DatasetLoader(
+        >>> atmo = Atmosphere.login("handle", "password")
+        >>> loader = DatasetLoader(atmo)
         >>>
         >>> # List available datasets
         >>> datasets = loader.list()
@@ -280,11 +414,11 @@ class DatasetLoader:
        >>> record = loader.get("at://did:plc:abc/ac.foundation.dataset.record/xyz")
     """
 
-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the dataset loader.
 
         Args:
-            client:
+            client: Atmosphere instance.
         """
         self.client = client
 
@@ -311,6 +445,18 @@ class DatasetLoader:
 
         return record
 
+    def get_typed(self, uri: str | AtUri) -> LexDatasetRecord:
+        """Fetch a dataset record and return as a typed object.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            LexDatasetRecord instance.
+        """
+        record = self.get(uri)
+        return LexDatasetRecord.from_record(record)
+
     def list_all(
         self,
         repo: Optional[str] = None,
@@ -334,7 +480,7 @@ class DatasetLoader:
             uri: The AT URI of the dataset record.
 
         Returns:
-
+            One of "http", "s3", "blobs", or "external" (legacy).
 
         Raises:
             ValueError: If storage type is unknown.
@@ -343,16 +489,22 @@ class DatasetLoader:
         storage = record.get("storage", {})
         storage_type = storage.get("$type", "")
 
-        if "
-            return "
+        if "storageHttp" in storage_type:
+            return "http"
+        elif "storageS3" in storage_type:
+            return "s3"
         elif "storageBlobs" in storage_type:
             return "blobs"
+        elif "storageExternal" in storage_type:
+            return "external"
         else:
             raise ValueError(f"Unknown storage type: {storage_type}")
 
     def get_urls(self, uri: str | AtUri) -> list[str]:
         """Get the WebDataset URLs from a dataset record.
 
+        Supports storageHttp, storageS3, and legacy storageExternal formats.
+
         Args:
             uri: The AT URI of the dataset record.
 
@@ -360,22 +512,61 @@ class DatasetLoader:
             List of WebDataset URLs.
 
         Raises:
-            ValueError: If the storage type is
+            ValueError: If the storage type is blob-only.
         """
         record = self.get(uri)
         storage = record.get("storage", {})
-
         storage_type = storage.get("$type", "")
-
+
+        if "storageHttp" in storage_type:
+            return [s["url"] for s in storage.get("shards", [])]
+        elif "storageS3" in storage_type:
+            bucket = storage.get("bucket", "")
+            endpoint = storage.get("endpoint")
+            urls = []
+            for s in storage.get("shards", []):
+                if endpoint:
+                    urls.append(f"{endpoint.rstrip('/')}/{bucket}/{s['key']}")
+                else:
+                    urls.append(f"s3://{bucket}/{s['key']}")
+            return urls
+        elif "storageExternal" in storage_type:
             return storage.get("urls", [])
         elif "storageBlobs" in storage_type:
             raise ValueError(
-                "Dataset uses blob storage, not
-                "Use get_blob_urls() instead."
+                "Dataset uses blob storage, not URLs. Use get_blob_urls() instead."
             )
         else:
             raise ValueError(f"Unknown storage type: {storage_type}")
 
+    def get_s3_info(self, uri: str | AtUri) -> dict:
+        """Get S3 storage details from a dataset record.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            Dict with keys: bucket, keys, region (optional), endpoint (optional).
+
+        Raises:
+            ValueError: If the storage type is not S3.
+        """
+        record = self.get(uri)
+        storage = record.get("storage", {})
+        storage_type = storage.get("$type", "")
+
+        if "storageS3" not in storage_type:
+            raise ValueError(
+                f"Dataset does not use S3 storage. Storage type: {storage_type}"
+            )
+
+        return {
+            "bucket": storage.get("bucket", ""),
+            "keys": [s["key"] for s in storage.get("shards", [])],
+            "region": storage.get("region"),
+            "endpoint": storage.get("endpoint"),
+        }
+
     def get_blobs(self, uri: str | AtUri) -> list[dict]:
         """Get the blob references from a dataset record.
 
@@ -383,7 +574,7 @@ class DatasetLoader:
             uri: The AT URI of the dataset record.
 
         Returns:
-            List of blob
+            List of blob entry dicts.
 
         Raises:
             ValueError: If the storage type is not blobs.
@@ -394,12 +585,11 @@ class DatasetLoader:
         storage_type = storage.get("$type", "")
         if "storageBlobs" in storage_type:
             return storage.get("blobs", [])
-
+        else:
             raise ValueError(
-                "Dataset
+                f"Dataset does not use blob storage. Storage type: {storage_type}. "
+                "Use get_urls() instead."
             )
-        else:
-            raise ValueError(f"Unknown storage type: {storage_type}")
 
     def get_blob_urls(self, uri: str | AtUri) -> list[str]:
         """Get fetchable URLs for blob-stored dataset shards.
@@ -421,12 +611,13 @@ class DatasetLoader:
         else:
             parsed_uri = uri
 
-
+        blob_entries = self.get_blobs(uri)
         did = parsed_uri.authority
 
         urls = []
-        for
-            #
+        for entry in blob_entries:
+            # Handle both new blobEntry format and legacy bare blob format
+            blob = entry.get("blob", entry)
             ref = blob.get("ref", {})
             cid = ref.get("$link") if isinstance(ref, dict) else str(ref)
             if cid:
@@ -463,7 +654,7 @@ class DatasetLoader:
         You must provide the sample type class, which should match the
         schema referenced by the record.
 
-        Supports
+        Supports HTTP, S3, blob, and legacy external storage.
 
         Args:
             uri: The AT URI of the dataset record.
@@ -486,10 +677,10 @@ class DatasetLoader:
 
         storage_type = self.get_storage_type(uri)
 
-        if storage_type == "
-            urls = self.get_urls(uri)
-        else:
+        if storage_type == "blobs":
             urls = self.get_blob_urls(uri)
+        else:
+            urls = self.get_urls(uri)
 
         if not urls:
             raise ValueError("Dataset record has no storage URLs")