atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. atdata/__init__.py +11 -0
  2. atdata/_cid.py +0 -21
  3. atdata/_helpers.py +12 -0
  4. atdata/_hf_api.py +46 -1
  5. atdata/_logging.py +43 -0
  6. atdata/_protocols.py +81 -182
  7. atdata/_schema_codec.py +2 -2
  8. atdata/_sources.py +24 -4
  9. atdata/_stub_manager.py +5 -25
  10. atdata/atmosphere/__init__.py +60 -21
  11. atdata/atmosphere/_lexicon_types.py +595 -0
  12. atdata/atmosphere/_types.py +73 -245
  13. atdata/atmosphere/client.py +64 -12
  14. atdata/atmosphere/lens.py +60 -53
  15. atdata/atmosphere/records.py +291 -100
  16. atdata/atmosphere/schema.py +91 -65
  17. atdata/atmosphere/store.py +68 -66
  18. atdata/cli/__init__.py +16 -16
  19. atdata/cli/diagnose.py +2 -2
  20. atdata/cli/{local.py → infra.py} +10 -10
  21. atdata/dataset.py +266 -47
  22. atdata/index/__init__.py +54 -0
  23. atdata/{local → index}/_entry.py +6 -2
  24. atdata/{local → index}/_index.py +617 -72
  25. atdata/{local → index}/_schema.py +5 -5
  26. atdata/lexicons/__init__.py +127 -0
  27. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  28. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  29. atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
  30. atdata/lexicons/ac.foundation.dataset.record.json +117 -0
  31. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  32. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
  34. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  35. atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
  36. atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
  37. atdata/lexicons/ndarray_shim.json +16 -0
  38. atdata/local/__init__.py +12 -13
  39. atdata/local/_repo_legacy.py +3 -3
  40. atdata/manifest/__init__.py +4 -0
  41. atdata/manifest/_proxy.py +321 -0
  42. atdata/promote.py +14 -10
  43. atdata/repository.py +66 -16
  44. atdata/stores/__init__.py +23 -0
  45. atdata/stores/_disk.py +131 -0
  46. atdata/{local → stores}/_s3.py +134 -112
  47. atdata/testing.py +12 -8
  48. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
  49. atdata-0.3.2b1.dist-info/RECORD +71 -0
  50. atdata-0.3.0b1.dist-info/RECORD +0 -54
  51. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
  52. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
  53. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
@@ -8,13 +8,18 @@ and loading them back. Dataset records are published as
 from typing import Type, TypeVar, Optional
 import msgpack
 
-from .client import AtmosphereClient
+from .client import Atmosphere
 from .schema import SchemaPublisher
-from ._types import (
-    AtUri,
-    DatasetRecord,
-    StorageLocation,
-    LEXICON_NAMESPACE,
+from ._types import AtUri, LEXICON_NAMESPACE
+from ._lexicon_types import (
+    LexDatasetRecord,
+    StorageHttp,
+    StorageS3,
+    StorageBlobs,
+    HttpShardEntry,
+    S3ShardEntry,
+    BlobEntry,
+    ShardChecksum,
 )
 
 # Import for type checking only to avoid circular imports
@@ -27,19 +32,23 @@ if TYPE_CHECKING:
 ST = TypeVar("ST", bound="Packable")
 
 
+def _placeholder_checksum() -> ShardChecksum:
+    """Return an empty checksum placeholder for shards without pre-computed digests."""
+    return ShardChecksum(algorithm="none", digest="")
+
+
 class DatasetPublisher:
     """Publishes dataset index records to ATProto.
 
     This class creates dataset records that reference a schema and point to
-    external storage (WebDataset URLs) or ATProto blobs.
+    HTTP storage, S3 storage, or ATProto blobs.
 
     Examples:
-        >>> dataset = atdata.Dataset[MySample]("s3://bucket/data-{000000..000009}.tar")
+        >>> dataset = atdata.Dataset[MySample]("https://example.com/data-000000.tar")
         >>>
-        >>> client = AtmosphereClient()
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
        >>>
-        >>> publisher = DatasetPublisher(client)
+        >>> publisher = DatasetPublisher(atmo)
         >>> uri = publisher.publish(
         ...     dataset,
         ...     name="My Training Data",
@@ -48,15 +57,49 @@ class DatasetPublisher:
         ... )
     """
 
-    def __init__(self, client: AtmosphereClient):
+    def __init__(self, client: Atmosphere):
         """Initialize the dataset publisher.
 
         Args:
-            client: Authenticated AtmosphereClient instance.
+            client: Authenticated Atmosphere instance.
         """
         self.client = client
         self._schema_publisher = SchemaPublisher(client)
 
+    def _create_record(
+        self,
+        storage: "StorageHttp | StorageS3 | StorageBlobs",
+        *,
+        name: str,
+        schema_uri: str,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Build a LexDatasetRecord and publish it to ATProto."""
+        metadata_bytes: Optional[bytes] = None
+        if metadata is not None:
+            metadata_bytes = msgpack.packb(metadata)
+
+        dataset_record = LexDatasetRecord(
+            name=name,
+            schema_ref=schema_uri,
+            storage=storage,
+            description=description,
+            tags=tags or [],
+            license=license,
+            metadata=metadata_bytes,
+        )
+
+        return self.client.create_record(
+            collection=f"{LEXICON_NAMESPACE}.record",
+            record=dataset_record.to_record(),
+            rkey=rkey,
+            validate=False,
+        )
+
     def publish(
         self,
         dataset: "Dataset[ST]",
@@ -91,46 +134,34 @@ class DatasetPublisher:
         Raises:
             ValueError: If schema_uri is not provided and auto_publish_schema is False.
         """
-        # Ensure we have a schema reference
         if schema_uri is None:
             if not auto_publish_schema:
                 raise ValueError(
                     "schema_uri is required when auto_publish_schema=False"
                 )
-            # Auto-publish the schema
             schema_uri_obj = self._schema_publisher.publish(
                 dataset.sample_type,
                 version=schema_version,
             )
             schema_uri = str(schema_uri_obj)
 
-        # Build the storage location
-        storage = StorageLocation(
-            kind="external",
-            urls=[dataset.url],
+        shard_urls = dataset.list_shards()
+        storage = StorageHttp(
+            shards=[
+                HttpShardEntry(url=url, checksum=_placeholder_checksum())
+                for url in shard_urls
+            ]
         )
 
-        # Build dataset record
-        metadata_bytes: Optional[bytes] = None
-        if dataset.metadata is not None:
-            metadata_bytes = msgpack.packb(dataset.metadata)
-
-        dataset_record = DatasetRecord(
+        return self._create_record(
+            storage,
             name=name,
-            schema_ref=schema_uri,
-            storage=storage,
+            schema_uri=schema_uri,
             description=description,
-            tags=tags or [],
+            tags=tags,
             license=license,
-            metadata=metadata_bytes,
-        )
-
-        # Publish to ATProto
-        return self.client.create_record(
-            collection=f"{LEXICON_NAMESPACE}.record",
-            record=dataset_record.to_record(),
+            metadata=dataset.metadata,
             rkey=rkey,
-            validate=False,
         )
 
     def publish_with_urls(
@@ -143,50 +174,162 @@ class DatasetPublisher:
         tags: Optional[list[str]] = None,
         license: Optional[str] = None,
         metadata: Optional[dict] = None,
+        checksums: Optional[list[ShardChecksum]] = None,
         rkey: Optional[str] = None,
     ) -> AtUri:
-        """Publish a dataset record with explicit URLs.
+        """Publish a dataset record with explicit HTTP URLs.
 
         This method allows publishing a dataset record without having a
         Dataset object, useful for registering existing WebDataset files.
+        Each URL should be an individual shard (no brace notation).
 
         Args:
-            urls: List of WebDataset URLs with brace notation.
+            urls: List of individual shard URLs.
             schema_uri: AT URI of the schema record.
             name: Human-readable dataset name.
             description: Human-readable description.
             tags: Searchable tags for discovery.
             license: SPDX license identifier.
             metadata: Arbitrary metadata dictionary.
+            checksums: Per-shard checksums. If not provided, empty checksums
+                are used.
             rkey: Optional explicit record key.
 
         Returns:
             The AT URI of the created dataset record.
         """
-        storage = StorageLocation(
-            kind="external",
-            urls=urls,
+        if checksums and len(checksums) != len(urls):
+            raise ValueError(
+                f"checksums length ({len(checksums)}) must match "
+                f"urls length ({len(urls)})"
+            )
+
+        shards = [
+            HttpShardEntry(
+                url=url,
+                checksum=checksums[i] if checksums else _placeholder_checksum(),
+            )
+            for i, url in enumerate(urls)
+        ]
+
+        return self._create_record(
+            StorageHttp(shards=shards),
+            name=name,
+            schema_uri=schema_uri,
+            description=description,
+            tags=tags,
+            license=license,
+            metadata=metadata,
+            rkey=rkey,
         )
 
-        metadata_bytes: Optional[bytes] = None
-        if metadata is not None:
-            metadata_bytes = msgpack.packb(metadata)
+    def publish_with_s3(
+        self,
+        bucket: str,
+        keys: list[str],
+        schema_uri: str,
+        *,
+        name: str,
+        region: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        checksums: Optional[list[ShardChecksum]] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Publish a dataset record with S3 storage.
 
-        dataset_record = DatasetRecord(
+        Args:
+            bucket: S3 bucket name.
+            keys: List of S3 object keys for shard files.
+            schema_uri: AT URI of the schema record.
+            name: Human-readable dataset name.
+            region: AWS region (e.g., 'us-east-1').
+            endpoint: Custom S3-compatible endpoint URL.
+            description: Human-readable description.
+            tags: Searchable tags for discovery.
+            license: SPDX license identifier.
+            metadata: Arbitrary metadata dictionary.
+            checksums: Per-shard checksums.
+            rkey: Optional explicit record key.
+
+        Returns:
+            The AT URI of the created dataset record.
+        """
+        if checksums and len(checksums) != len(keys):
+            raise ValueError(
+                f"checksums length ({len(checksums)}) must match "
+                f"keys length ({len(keys)})"
+            )
+
+        shards = [
+            S3ShardEntry(
+                key=key,
+                checksum=checksums[i] if checksums else _placeholder_checksum(),
+            )
+            for i, key in enumerate(keys)
+        ]
+
+        return self._create_record(
+            StorageS3(bucket=bucket, shards=shards, region=region, endpoint=endpoint),
             name=name,
-            schema_ref=schema_uri,
-            storage=storage,
+            schema_uri=schema_uri,
             description=description,
-            tags=tags or [],
+            tags=tags,
             license=license,
-            metadata=metadata_bytes,
+            metadata=metadata,
+            rkey=rkey,
         )
 
-        return self.client.create_record(
-            collection=f"{LEXICON_NAMESPACE}.record",
-            record=dataset_record.to_record(),
+    def publish_with_blob_refs(
+        self,
+        blob_refs: list[dict],
+        schema_uri: str,
+        *,
+        name: str,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Publish a dataset record with pre-uploaded blob references.
+
+        Unlike ``publish_with_blobs`` (which takes raw bytes and uploads them),
+        this method accepts blob ref dicts that have already been uploaded to
+        the PDS. The refs are embedded directly in the record so the PDS
+        retains the blobs.
+
+        Args:
+            blob_refs: List of blob reference dicts as returned by
+                ``Atmosphere.upload_blob()``. Each dict must contain
+                ``$type``, ``ref`` (with ``$link``), ``mimeType``, and ``size``.
+            schema_uri: AT URI of the schema record.
+            name: Human-readable dataset name.
+            description: Human-readable description.
+            tags: Searchable tags for discovery.
+            license: SPDX license identifier.
+            metadata: Arbitrary metadata dictionary.
+            rkey: Optional explicit record key.
+
+        Returns:
+            The AT URI of the created dataset record.
+        """
+        blob_entries = [
+            BlobEntry(blob=ref, checksum=_placeholder_checksum()) for ref in blob_refs
+        ]
+
+        return self._create_record(
+            StorageBlobs(blobs=blob_entries),
+            name=name,
+            schema_uri=schema_uri,
+            description=description,
+            tags=tags,
+            license=license,
+            metadata=metadata,
             rkey=rkey,
-            validate=False,
         )
 
     def publish_with_blobs(
@@ -226,37 +369,28 @@ class DatasetPublisher:
         Blobs are only retained by the PDS when referenced in a committed
         record. This method handles that automatically.
         """
-        # Upload all blobs
-        blob_refs = []
+        blob_entries = []
         for blob_data in blobs:
             blob_ref = self.client.upload_blob(blob_data, mime_type=mime_type)
-            blob_refs.append(blob_ref)
-
-        # Create storage location with blob references
-        storage = StorageLocation(
-            kind="blobs",
-            blob_refs=blob_refs,
-        )
+            import hashlib
 
-        metadata_bytes: Optional[bytes] = None
-        if metadata is not None:
-            metadata_bytes = msgpack.packb(metadata)
+            digest = hashlib.sha256(blob_data).hexdigest()
+            blob_entries.append(
+                BlobEntry(
+                    blob=blob_ref,
+                    checksum=ShardChecksum(algorithm="sha256", digest=digest),
+                )
+            )
 
-        dataset_record = DatasetRecord(
+        return self._create_record(
+            StorageBlobs(blobs=blob_entries),
             name=name,
-            schema_ref=schema_uri,
-            storage=storage,
+            schema_uri=schema_uri,
             description=description,
-            tags=tags or [],
+            tags=tags,
             license=license,
-            metadata=metadata_bytes,
-        )
-
-        return self.client.create_record(
-            collection=f"{LEXICON_NAMESPACE}.record",
-            record=dataset_record.to_record(),
+            metadata=metadata,
             rkey=rkey,
-            validate=False,
         )
 
 
@@ -268,8 +402,8 @@ class DatasetLoader:
     Python class for the sample type.
 
     Examples:
-        >>> client = AtmosphereClient()
-        >>> loader = DatasetLoader(client)
+        >>> atmo = Atmosphere.login("handle", "password")
+        >>> loader = DatasetLoader(atmo)
         >>>
         >>> # List available datasets
        >>> datasets = loader.list()
@@ -280,11 +414,11 @@ class DatasetLoader:
        >>> record = loader.get("at://did:plc:abc/ac.foundation.dataset.record/xyz")
     """
 
-    def __init__(self, client: AtmosphereClient):
+    def __init__(self, client: Atmosphere):
         """Initialize the dataset loader.
 
         Args:
-            client: AtmosphereClient instance.
+            client: Atmosphere instance.
         """
         self.client = client
 
@@ -311,6 +445,18 @@ class DatasetLoader:
 
         return record
 
+    def get_typed(self, uri: str | AtUri) -> LexDatasetRecord:
+        """Fetch a dataset record and return as a typed object.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            LexDatasetRecord instance.
+        """
+        record = self.get(uri)
+        return LexDatasetRecord.from_record(record)
+
     def list_all(
         self,
         repo: Optional[str] = None,
@@ -334,7 +480,7 @@ class DatasetLoader:
             uri: The AT URI of the dataset record.
 
         Returns:
-            Either "external" or "blobs".
+            One of "http", "s3", "blobs", or "external" (legacy).
 
         Raises:
             ValueError: If storage type is unknown.
@@ -343,16 +489,22 @@ class DatasetLoader:
         storage = record.get("storage", {})
         storage_type = storage.get("$type", "")
 
-        if "storageExternal" in storage_type:
-            return "external"
+        if "storageHttp" in storage_type:
+            return "http"
+        elif "storageS3" in storage_type:
+            return "s3"
         elif "storageBlobs" in storage_type:
             return "blobs"
+        elif "storageExternal" in storage_type:
+            return "external"
         else:
             raise ValueError(f"Unknown storage type: {storage_type}")
 
     def get_urls(self, uri: str | AtUri) -> list[str]:
         """Get the WebDataset URLs from a dataset record.
 
+        Supports storageHttp, storageS3, and legacy storageExternal formats.
+
         Args:
             uri: The AT URI of the dataset record.
 
@@ -360,22 +512,61 @@ class DatasetLoader:
             List of WebDataset URLs.
 
         Raises:
-            ValueError: If the storage type is not external URLs.
+            ValueError: If the storage type is blob-only.
         """
         record = self.get(uri)
         storage = record.get("storage", {})
-
         storage_type = storage.get("$type", "")
-        if "storageExternal" in storage_type:
+
+        if "storageHttp" in storage_type:
+            return [s["url"] for s in storage.get("shards", [])]
+        elif "storageS3" in storage_type:
+            bucket = storage.get("bucket", "")
+            endpoint = storage.get("endpoint")
+            urls = []
+            for s in storage.get("shards", []):
+                if endpoint:
+                    urls.append(f"{endpoint.rstrip('/')}/{bucket}/{s['key']}")
+                else:
+                    urls.append(f"s3://{bucket}/{s['key']}")
+            return urls
+        elif "storageExternal" in storage_type:
             return storage.get("urls", [])
         elif "storageBlobs" in storage_type:
             raise ValueError(
-                "Dataset uses blob storage, not external URLs. "
-                "Use get_blob_urls() instead."
+                "Dataset uses blob storage, not URLs. Use get_blob_urls() instead."
             )
         else:
            raise ValueError(f"Unknown storage type: {storage_type}")
 
+    def get_s3_info(self, uri: str | AtUri) -> dict:
+        """Get S3 storage details from a dataset record.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            Dict with keys: bucket, keys, region (optional), endpoint (optional).
+
+        Raises:
+            ValueError: If the storage type is not S3.
+        """
+        record = self.get(uri)
+        storage = record.get("storage", {})
+        storage_type = storage.get("$type", "")
+
+        if "storageS3" not in storage_type:
+            raise ValueError(
+                f"Dataset does not use S3 storage. Storage type: {storage_type}"
+            )
+
+        return {
+            "bucket": storage.get("bucket", ""),
+            "keys": [s["key"] for s in storage.get("shards", [])],
+            "region": storage.get("region"),
+            "endpoint": storage.get("endpoint"),
+        }
+
     def get_blobs(self, uri: str | AtUri) -> list[dict]:
         """Get the blob references from a dataset record.
 
@@ -383,7 +574,7 @@ class DatasetLoader:
             uri: The AT URI of the dataset record.
 
         Returns:
-            List of blob reference dicts with keys: $type, ref, mimeType, size.
+            List of blob entry dicts.
 
         Raises:
             ValueError: If the storage type is not blobs.
@@ -394,12 +585,11 @@ class DatasetLoader:
         storage_type = storage.get("$type", "")
         if "storageBlobs" in storage_type:
             return storage.get("blobs", [])
-        elif "storageExternal" in storage_type:
+        else:
             raise ValueError(
-                "Dataset uses external URL storage, not blobs. Use get_urls() instead."
+                f"Dataset does not use blob storage. Storage type: {storage_type}. "
+                "Use get_urls() instead."
             )
-        else:
-            raise ValueError(f"Unknown storage type: {storage_type}")
 
     def get_blob_urls(self, uri: str | AtUri) -> list[str]:
         """Get fetchable URLs for blob-stored dataset shards.
@@ -421,12 +611,13 @@ class DatasetLoader:
         else:
             parsed_uri = uri
 
-        blobs = self.get_blobs(uri)
+        blob_entries = self.get_blobs(uri)
        did = parsed_uri.authority
 
         urls = []
-        for blob in blobs:
-            # Extract CID from blob reference
+        for entry in blob_entries:
+            # Handle both new blobEntry format and legacy bare blob format
+            blob = entry.get("blob", entry)
             ref = blob.get("ref", {})
             cid = ref.get("$link") if isinstance(ref, dict) else str(ref)
             if cid:
@@ -463,7 +654,7 @@ class DatasetLoader:
         You must provide the sample type class, which should match the
         schema referenced by the record.
 
-        Supports both external URL storage and ATProto blob storage.
+        Supports HTTP, S3, blob, and legacy external storage.
 
         Args:
             uri: The AT URI of the dataset record.
@@ -486,10 +677,10 @@ class DatasetLoader:
 
         storage_type = self.get_storage_type(uri)
 
-        if storage_type == "external":
-            urls = self.get_urls(uri)
-        else:
+        if storage_type == "blobs":
             urls = self.get_blob_urls(uri)
+        else:
+            urls = self.get_urls(uri)
 
         if not urls:
             raise ValueError("Dataset record has no storage URLs")