atdata-0.3.0b1-py3-none-any.whl → atdata-0.3.2b1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Files changed (53):
  1. atdata/__init__.py +11 -0
  2. atdata/_cid.py +0 -21
  3. atdata/_helpers.py +12 -0
  4. atdata/_hf_api.py +46 -1
  5. atdata/_logging.py +43 -0
  6. atdata/_protocols.py +81 -182
  7. atdata/_schema_codec.py +2 -2
  8. atdata/_sources.py +24 -4
  9. atdata/_stub_manager.py +5 -25
  10. atdata/atmosphere/__init__.py +60 -21
  11. atdata/atmosphere/_lexicon_types.py +595 -0
  12. atdata/atmosphere/_types.py +73 -245
  13. atdata/atmosphere/client.py +64 -12
  14. atdata/atmosphere/lens.py +60 -53
  15. atdata/atmosphere/records.py +291 -100
  16. atdata/atmosphere/schema.py +91 -65
  17. atdata/atmosphere/store.py +68 -66
  18. atdata/cli/__init__.py +16 -16
  19. atdata/cli/diagnose.py +2 -2
  20. atdata/cli/{local.py → infra.py} +10 -10
  21. atdata/dataset.py +266 -47
  22. atdata/index/__init__.py +54 -0
  23. atdata/{local → index}/_entry.py +6 -2
  24. atdata/{local → index}/_index.py +617 -72
  25. atdata/{local → index}/_schema.py +5 -5
  26. atdata/lexicons/__init__.py +127 -0
  27. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  28. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  29. atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
  30. atdata/lexicons/ac.foundation.dataset.record.json +117 -0
  31. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  32. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
  34. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  35. atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
  36. atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
  37. atdata/lexicons/ndarray_shim.json +16 -0
  38. atdata/local/__init__.py +12 -13
  39. atdata/local/_repo_legacy.py +3 -3
  40. atdata/manifest/__init__.py +4 -0
  41. atdata/manifest/_proxy.py +321 -0
  42. atdata/promote.py +14 -10
  43. atdata/repository.py +66 -16
  44. atdata/stores/__init__.py +23 -0
  45. atdata/stores/_disk.py +131 -0
  46. atdata/{local → stores}/_s3.py +134 -112
  47. atdata/testing.py +12 -8
  48. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
  49. atdata-0.3.2b1.dist-info/RECORD +71 -0
  50. atdata-0.3.0b1.dist-info/RECORD +0 -54
  51. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
  52. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
  53. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
@@ -7,8 +7,8 @@ from atdata import (
 )
 from atdata._protocols import AbstractDataStore, Packable
 
-from atdata.local._entry import LocalDatasetEntry
-from atdata.local._schema import (
+from atdata.index._entry import LocalDatasetEntry
+from atdata.index._schema import (
     SchemaNamespace,
     LocalSchemaRecord,
     _schema_ref_from_type,
@@ -21,15 +21,17 @@ from atdata.local._schema import (
 from pathlib import Path
 from typing import (
     Any,
+    Iterable,
     Type,
     TypeVar,
     Generator,
     TYPE_CHECKING,
 )
-from redis import Redis
 import json
 
 if TYPE_CHECKING:
+    from redis import Redis
+
     from atdata.providers._base import IndexProvider
     from atdata.repository import Repository, _AtmosphereBackend
     from atdata._protocols import IndexEntry
@@ -37,12 +39,42 @@ if TYPE_CHECKING:
 T = TypeVar("T", bound=Packable)
 
 
+def _is_local_path(url: str) -> bool:
+    """Check if a URL points to the local filesystem."""
+    return (
+        url.startswith("/")
+        or url.startswith("file://")
+        or (len(url) > 1 and url[1] == ":")
+    )
+
+
+def _is_credentialed_source(ds: Dataset) -> bool:
+    """Check if a Dataset uses a credentialed source (e.g. S3Source with keys)."""
+    from atdata._sources import S3Source
+
+    return isinstance(ds.source, S3Source)
+
+
+def _estimate_dataset_bytes(ds: Dataset) -> int:
+    """Best-effort total size estimate from local shard files.
+
+    Returns 0 when size cannot be determined (e.g. remote URLs).
+    """
+    total = 0
+    for shard_url in ds.list_shards():
+        if _is_local_path(shard_url):
+            p = Path(shard_url.removeprefix("file://"))
+            if p.exists():
+                total += p.stat().st_size
+    return total
+
+
 class Index:
     """Unified index for tracking datasets across multiple repositories.
 
     Implements the AbstractIndex protocol. Maintains a registry of
-    dataset entries across a built-in ``"local"`` repository, optional
-    named repositories, and an optional atmosphere (ATProto) backend.
+    dataset entries across named repositories (always including a built-in
+    ``"local"`` repository) and an optional atmosphere (ATProto) backend.
 
     The ``"local"`` repository is always present and uses the storage backend
     determined by the ``provider`` argument. When no provider is given, defaults
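Note: the three module-level helpers added above feed the new copy and size logic in insert_dataset(). A rough illustration of their intent (the example values are invented, not from the diff):

    _is_local_path("/data/shard-000.tar")         # True: absolute POSIX path
    _is_local_path("file:///data/shard-000.tar")  # True: file:// URL
    _is_local_path("C:\\data\\shard-000.tar")     # True: Windows drive letter (url[1] == ":")
    _is_local_path("https://example.com/x.tar")   # False: remote URL
    # _estimate_dataset_bytes() only sums shards that pass _is_local_path()
    # and exist on disk, so it reports 0 for purely remote datasets.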
@@ -52,14 +84,12 @@ class Index:
     Additional named repositories can be mounted via the ``repos`` parameter,
     each pairing an IndexProvider with an optional data store.
 
-    An AtmosphereClient is available by default for anonymous read-only
+    An Atmosphere is available by default for anonymous read-only
     resolution of ``@handle/dataset`` paths. Pass an authenticated client
     for write operations, or ``atmosphere=None`` to disable.
 
     Attributes:
-        _provider: IndexProvider for the built-in ``"local"`` repository.
-        _data_store: Optional AbstractDataStore for the local repository.
-        _repos: Named repositories beyond ``"local"``.
+        _repos: All repositories keyed by name. ``"local"`` is always present.
         _atmosphere: Optional atmosphere backend for ATProto operations.
     """
 
@@ -105,7 +135,7 @@ class Index:
         atmosphere: ATProto client for distributed network operations.
             - Default (sentinel): creates an anonymous read-only client
               lazily on first access.
-            - ``AtmosphereClient`` instance: uses that client directly.
+            - ``Atmosphere`` instance: uses that client directly.
             - ``None``: disables atmosphere backend entirely.
         auto_stubs: If True, automatically generate .pyi stub files when
             schemas are accessed via get_schema() or decode_schema().
@@ -146,12 +176,13 @@ class Index:
         ##
 
         from atdata.providers._base import IndexProvider as _IP
+        from atdata.repository import Repository as _Repo
 
+        # Resolve the local provider
         if isinstance(provider, str):
-            # String-based provider selection
             from atdata.providers._factory import create_provider
 
-            self._provider: _IP = create_provider(
+            local_provider: _IP = create_provider(
                 provider, path=path, dsn=dsn, redis=redis, **kwargs
             )
         elif provider is not None:
@@ -160,27 +191,25 @@ class Index:
                 f"provider must be an IndexProvider or backend name string, "
                 f"got {type(provider).__name__}"
             )
-            self._provider = provider
+            local_provider = provider
         elif redis is not None:
-            # Explicit Redis connection provided
             from atdata.providers._redis import RedisProvider
 
-            self._provider = RedisProvider(redis)
+            local_provider = RedisProvider(redis)
         elif kwargs:
-            # kwargs provided — assume Redis constructor args for compat
+            from redis import Redis as _Redis
             from atdata.providers._redis import RedisProvider
 
-            self._provider = RedisProvider(Redis(**kwargs))
+            local_provider = RedisProvider(_Redis(**kwargs))
         else:
-            # Default: zero-dependency SQLite
             from atdata.providers._sqlite import SqliteProvider
 
-            self._provider = SqliteProvider()
-
-        self._data_store = data_store
+            local_provider = SqliteProvider()
 
-        # Validate and store named repositories
-        from atdata.repository import Repository as _Repo
+        # Build the unified repos dict with "local" always present
+        self._repos: dict[str, _Repo] = {
+            "local": _Repo(provider=local_provider, data_store=data_store),
+        }
 
         if repos is not None:
             if "local" in repos:
@@ -194,9 +223,7 @@ class Index:
                         f"repos[{name!r}] must be a Repository, "
                         f"got {type(repo).__name__}"
                     )
-            self._repos: dict[str, _Repo] = dict(repos)
-        else:
-            self._repos = {}
+            self._repos.update(repos)
 
         # Atmosphere backend (lazy or explicit)
         from atdata.repository import _AtmosphereBackend
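Note: after this change every backend, including the built-in one, lives in self._repos, with "local" as an ordinary Repository. A minimal construction sketch, assuming Index is importable from atdata.index under the new layout and that a Repository may carry data_store=None (the "lab" mount name is invented):

    from atdata.index import Index
    from atdata.providers._sqlite import SqliteProvider
    from atdata.repository import Repository

    index = Index(repos={"lab": Repository(provider=SqliteProvider(), data_store=None)})
    assert set(index.repos) >= {"local", "lab"}  # "local" is always present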
@@ -230,10 +257,10 @@ class Index:
         """Get the atmosphere backend, lazily creating anonymous client if needed."""
         if self._atmosphere_deferred and self._atmosphere is None:
             try:
-                from atdata.atmosphere.client import AtmosphereClient
+                from atdata.atmosphere.client import Atmosphere
                 from atdata.repository import _AtmosphereBackend
 
-                client = AtmosphereClient()
+                client = Atmosphere()
                 self._atmosphere = _AtmosphereBackend(client)
             except ImportError:
                 # atproto package not installed -- atmosphere unavailable
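Note: the client class is now named Atmosphere rather than AtmosphereClient. A hedged sketch of passing an explicit client (an anonymous, read-only client is otherwise created lazily, as in the hunk above):

    from atdata.atmosphere.client import Atmosphere
    from atdata.index import Index

    index = Index(atmosphere=Atmosphere())  # explicit client instead of the lazy default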
@@ -289,13 +316,13 @@ class Index:
         return ("local", ref, None)
 
     @property
-    def repos(self) -> dict[str, Repository]:
-        """Named repositories mounted on this index (excluding ``"local"``)."""
+    def repos(self) -> dict[str, "Repository"]:
+        """All repositories mounted on this index (including ``"local"``)."""
         return dict(self._repos)
 
     @property
     def atmosphere(self) -> Any:
-        """The AtmosphereClient for this index, or None if disabled.
+        """The Atmosphere for this index, or None if disabled.
 
         Returns the underlying client (not the internal backend wrapper).
         """
@@ -304,10 +331,15 @@ class Index:
             return backend.client
         return None
 
+    @property
+    def _provider(self) -> "IndexProvider":  # noqa: F821
+        """IndexProvider for the ``"local"`` repository (backward compat)."""
+        return self._repos["local"].provider
+
     @property
     def provider(self) -> "IndexProvider":  # noqa: F821
-        """The storage provider backing this index."""
-        return self._provider
+        """The storage provider backing the ``"local"`` repository."""
+        return self._repos["local"].provider
 
     @property
     def _redis(self) -> Redis:
@@ -318,17 +350,23 @@ class Index:
         """
         from atdata.providers._redis import RedisProvider
 
-        if isinstance(self._provider, RedisProvider):
-            return self._provider.redis
+        prov = self._repos["local"].provider
+        if isinstance(prov, RedisProvider):
+            return prov.redis
         raise AttributeError(
             "Index._redis is only available with a Redis provider. "
             "Use index.provider instead."
         )
 
+    @property
+    def _data_store(self) -> AbstractDataStore | None:
+        """Data store for the ``"local"`` repository (backward compat)."""
+        return self._repos["local"].data_store
+
     @property
     def data_store(self) -> AbstractDataStore | None:
         """The data store for writing shards, or None if index-only."""
-        return self._data_store
+        return self._repos["local"].data_store
 
     @property
     def stub_dir(self) -> Path | None:
@@ -351,7 +389,7 @@ class Index:
         as attributes on this namespace.
 
         Examples:
-            >>> index.load_schema("atdata://local/sampleSchema/MySample@1.0.0")
+            >>> index.load_schema("atdata://local/schema/MySample@1.0.0")
            >>> MyType = index.types.MySample
            >>> sample = MyType(name="hello", value=42)
 
@@ -368,7 +406,7 @@ class Index:
         in the :attr:`types` namespace for easy access.
 
         Args:
-            ref: Schema reference string (atdata://local/sampleSchema/... or
+            ref: Schema reference string (atdata://local/schema/... or
                 legacy local://schemas/...).
 
         Returns:
@@ -381,11 +419,11 @@ class Index:
 
         Examples:
             >>> # Load and use immediately
-            >>> MyType = index.load_schema("atdata://local/sampleSchema/MySample@1.0.0")
+            >>> MyType = index.load_schema("atdata://local/schema/MySample@1.0.0")
             >>> sample = MyType(field1="hello", field2=42)
             >>>
             >>> # Or access later via namespace
-            >>> index.load_schema("atdata://local/sampleSchema/OtherType@1.0.0")
+            >>> index.load_schema("atdata://local/schema/OtherType@1.0.0")
             >>> other = index.types.OtherType(data="test")
         """
         # Decode the schema (uses generated module if auto_stubs enabled)
@@ -465,6 +503,9 @@ class Index:
     ) -> LocalDatasetEntry:
         """Add a dataset to the local repository index.
 
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.
+
         Args:
             ds: The dataset to add to the index.
             name: Human-readable name for the dataset.
@@ -474,6 +515,13 @@ class Index:
         Returns:
             The created LocalDatasetEntry object.
         """
+        import warnings
+
+        warnings.warn(
+            "Index.add_entry() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self._insert_dataset_to_provider(
             ds,
             name=name,
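Note: the caller-side migration is mechanical; a before/after sketch (ds and the dataset name are placeholders):

    entry = index.add_entry(ds, name="mnist-train")       # now emits DeprecationWarning
    entry = index.insert_dataset(ds, name="mnist-train")  # preferred replacement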
@@ -513,6 +561,23 @@ class Index:
 
     # AbstractIndex protocol methods
 
+    @staticmethod
+    def _ensure_schema_stored(
+        schema_ref: str,
+        sample_type: type,
+        provider: "IndexProvider",  # noqa: F821
+    ) -> None:
+        """Persist the schema definition if not already stored.
+
+        Called during dataset insertion so that ``decode_schema()`` can
+        reconstruct the type later without the caller needing to publish
+        the schema separately.
+        """
+        schema_name, version = _parse_schema_ref(schema_ref)
+        if provider.get_schema_json(schema_name, version) is None:
+            record = _build_schema_record(sample_type, version=version)
+            provider.store_schema(schema_name, version, json.dumps(record))
+
     def _insert_dataset_to_provider(
         self,
         ds: Dataset,
@@ -528,21 +593,36 @@ class Index:
         This is the internal implementation shared by all local and named
         repository inserts.
         """
+        from atdata._logging import get_logger
+
+        log = get_logger()
         metadata = kwargs.get("metadata")
 
         if store is not None:
             prefix = kwargs.get("prefix", name)
             cache_local = kwargs.get("cache_local", False)
+            log.debug(
+                "_insert_dataset_to_provider: name=%s, store=%s",
+                name,
+                type(store).__name__,
+            )
 
             written_urls = store.write_shards(
                 ds,
                 prefix=prefix,
                 cache_local=cache_local,
             )
+            log.info(
+                "_insert_dataset_to_provider: %d shard(s) written for %s",
+                len(written_urls),
+                name,
+            )
 
             if schema_ref is None:
                 schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
 
+            self._ensure_schema_stored(schema_ref, ds.sample_type, provider)
+
             entry_metadata = metadata if metadata is not None else ds._metadata
             entry = LocalDatasetEntry(
                 name=name,
@@ -551,12 +631,15 @@ class Index:
                 metadata=entry_metadata,
             )
             provider.store_entry(entry)
+            log.debug("_insert_dataset_to_provider: entry stored for %s", name)
             return entry
 
         # No data store - just index the existing URL
         if schema_ref is None:
             schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
 
+        self._ensure_schema_stored(schema_ref, ds.sample_type, provider)
+
         data_urls = [ds.url]
         entry_metadata = metadata if metadata is not None else ds._metadata
 
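Note: because _ensure_schema_stored() now runs on both insert paths, an insert alone is enough for the schema to be resolvable later. A rough round-trip sketch, assuming decode_schema() returns the reconstructed class as its docstring describes:

    entry = index.insert_dataset(ds, name="mnist-train")
    SampleType = index.decode_schema(entry.schema_ref)  # no separate schema publish needed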
@@ -567,6 +650,7 @@ class Index:
             metadata=entry_metadata,
         )
         provider.store_entry(entry)
+        log.debug("_insert_dataset_to_provider: entry stored for %s", name)
         return entry
 
     def insert_dataset(
@@ -575,66 +659,379 @@ class Index:
         *,
         name: str,
         schema_ref: str | None = None,
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+        data_store: AbstractDataStore | None = None,
+        force: bool = False,
+        copy: bool = False,
+        metadata: dict | None = None,
+        _data_urls: list[str] | None = None,
+        _blob_refs: list[dict] | None = None,
         **kwargs,
     ) -> "IndexEntry":
-        """Insert a dataset into the index (AbstractIndex protocol).
+        """Insert a dataset into the index.
 
         The target repository is determined by a prefix in the ``name``
         argument (e.g. ``"lab/mnist"``). If no prefix is given, or the
         prefix is ``"local"``, the built-in local repository is used.
 
-        If the target repository has a data_store, shards are written to
-        storage first, then indexed. Otherwise, the dataset's existing URL
-        is indexed directly.
+        For atmosphere targets:
+
+        - **Local sources** are uploaded via *data_store* (defaults to
+          ``PDSBlobStore``).
+        - **Public remote sources** (http/https) are referenced as
+          external URLs unless *copy* is ``True``.
+        - **Credentialed sources** (e.g. ``S3Source``) raise an error
+          unless *copy* is ``True`` or *data_store* is provided, to
+          prevent leaking private endpoints.
 
         Args:
             ds: The Dataset to register.
             name: Human-readable name for the dataset, optionally prefixed
                 with a repository name (e.g. ``"lab/mnist"``).
             schema_ref: Optional schema reference.
-            **kwargs: Additional options:
-                - metadata: Optional metadata dict
-                - prefix: Storage prefix (default: dataset name)
-                - cache_local: If True, cache writes locally first
+            description: Optional dataset description (atmosphere only).
+            tags: Optional tags for discovery (atmosphere only).
+            license: Optional license identifier (atmosphere only).
+            data_store: Explicit data store for shard storage. When
+                provided, data is always copied through this store.
+            force: If True, bypass PDS size limits (50 MB per shard,
+                1 GB total). Default: ``False``.
+            copy: If True, copy data to the destination store even for
+                remote sources. Required for credentialed sources
+                targeting the atmosphere. Default: ``False``.
+            metadata: Optional metadata dict.
 
         Returns:
             IndexEntry for the inserted dataset.
+
+        Raises:
+            ValueError: If atmosphere limits are exceeded (when
+                *force* is ``False``), or if a credentialed source
+                targets the atmosphere without *copy*.
         """
+        from atdata.atmosphere.store import PDS_TOTAL_DATASET_LIMIT_BYTES
+
         backend_key, resolved_name, handle_or_did = self._resolve_prefix(name)
+        is_atmosphere = backend_key == "_atmosphere"
 
-        if backend_key == "_atmosphere":
+        if is_atmosphere:
             atmo = self._get_atmosphere()
             if atmo is None:
                 raise ValueError(
                     f"Atmosphere backend required for name {name!r} but not available."
                 )
-            return atmo.insert_dataset(
-                ds, name=resolved_name, schema_ref=schema_ref, **kwargs
-            )
 
-        if backend_key == "local":
-            return self._insert_dataset_to_provider(
+            # Providing an explicit data_store implies copy behaviour
+            needs_copy = copy or data_store is not None
+
+            # Credentialed source guard
+            if _is_credentialed_source(ds) and not needs_copy:
+                raise ValueError(
+                    "Dataset uses a credentialed source. Referencing "
+                    "these URLs in a public atmosphere record would "
+                    "leak private endpoints. Pass copy=True to copy "
+                    "data to the destination store (default: PDS blobs)."
+                )
+
+            # If we already have pre-written URLs (from write_samples),
+            # go straight to publish.
+            if _data_urls is not None:
+                return atmo.insert_dataset(
+                    ds,
+                    name=resolved_name,
+                    schema_ref=schema_ref,
+                    data_urls=_data_urls,
+                    blob_refs=_blob_refs,
+                    description=description,
+                    tags=tags,
+                    license=license,
+                    metadata=metadata,
+                    **kwargs,
+                )
+
+            # Determine whether data must be copied
+            source_is_local = _is_local_path(ds.url)
+
+            if source_is_local or needs_copy:
+                # Resolve effective store
+                if data_store is not None:
+                    effective_store = data_store
+                else:
+                    from atdata.atmosphere.store import PDSBlobStore
+
+                    effective_store = PDSBlobStore(atmo.client)
+
+                # Size guard
+                if not force:
+                    total_bytes = _estimate_dataset_bytes(ds)
+                    if total_bytes > PDS_TOTAL_DATASET_LIMIT_BYTES:
+                        raise ValueError(
+                            f"Total dataset size ({total_bytes} bytes) "
+                            f"exceeds atmosphere limit "
+                            f"({PDS_TOTAL_DATASET_LIMIT_BYTES} bytes). "
+                            f"Pass force=True to bypass."
+                        )
+
+                result = effective_store.write_shards(ds, prefix=resolved_name)
+
+                # ShardUploadResult carries blob_refs; plain list does not
+                blob_refs = getattr(result, "blob_refs", None) or None
+
+                return atmo.insert_dataset(
+                    ds,
+                    name=resolved_name,
+                    schema_ref=schema_ref,
+                    data_urls=list(result),
+                    blob_refs=blob_refs,
+                    description=description,
+                    tags=tags,
+                    license=license,
+                    metadata=metadata,
+                    **kwargs,
+                )
+
+            # Public remote source — reference existing URLs
+            data_urls = ds.list_shards()
+            return atmo.insert_dataset(
                 ds,
                 name=resolved_name,
                 schema_ref=schema_ref,
-                provider=self._provider,
-                store=self._data_store,
+                data_urls=data_urls,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=metadata,
                 **kwargs,
             )
 
-        # Named repository
+        # --- Local / named repo path ---
         repo = self._repos.get(backend_key)
         if repo is None:
             raise KeyError(f"Unknown repository {backend_key!r} in name {name!r}")
+
+        effective_store = data_store or repo.data_store
         return self._insert_dataset_to_provider(
             ds,
             name=resolved_name,
             schema_ref=schema_ref,
             provider=repo.provider,
-            store=repo.data_store,
+            store=effective_store,
+            metadata=metadata,
             **kwargs,
         )
 
+    def write_samples(
+        self,
+        samples: Iterable,
+        *,
+        name: str,
+        schema_ref: str | None = None,
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+        maxcount: int = 10_000,
+        maxsize: int | None = None,
+        metadata: dict | None = None,
+        manifest: bool = False,
+        data_store: AbstractDataStore | None = None,
+        force: bool = False,
+    ) -> "IndexEntry":
+        """Write samples and create an index entry in one step.
+
+        This is the primary method for publishing data. It serializes
+        samples to WebDataset tar files, stores them via the appropriate
+        backend, and creates an index entry.
+
+        The target backend is determined by the *name* prefix:
+
+        - Bare name (e.g., ``"mnist"``): writes to the local repository.
+        - ``"@handle/name"``: writes and publishes to the atmosphere.
+        - ``"repo/name"``: writes to a named repository.
+
+        For atmosphere targets, data is uploaded as PDS blobs by default.
+        Shard size is capped at 50 MB and total dataset size at 1 GB
+        unless *force* is ``True``.
+
+        When the local backend has no ``data_store`` configured, a
+        ``LocalDiskStore`` is created automatically at
+        ``~/.atdata/data/`` so that samples have persistent storage.
+
+        Args:
+            samples: Iterable of ``Packable`` samples. Must be non-empty.
+            name: Dataset name, optionally prefixed with target.
+            schema_ref: Optional schema reference. Auto-generated if ``None``.
+            description: Optional dataset description (atmosphere only).
+            tags: Optional tags for discovery (atmosphere only).
+            license: Optional license identifier (atmosphere only).
+            maxcount: Max samples per shard. Default: 10,000.
+            maxsize: Max bytes per shard. For atmosphere targets defaults
+                to 50 MB (PDS blob limit). For local targets defaults to
+                ``None`` (unlimited).
+            metadata: Optional metadata dict stored with the entry.
+            manifest: If True, write per-shard manifest sidecar files
+                alongside each tar. Default: ``False``.
+            data_store: Explicit data store for shard storage. Overrides
+                the repository's default store. For atmosphere targets
+                defaults to ``PDSBlobStore``.
+            force: If True, bypass PDS size limits (50 MB per shard,
+                1 GB total dataset). Default: ``False``.
+
+        Returns:
+            IndexEntry for the created dataset.
+
+        Raises:
+            ValueError: If *samples* is empty, or if atmosphere size
+                limits are exceeded (when *force* is ``False``).
+
+        Examples:
+            >>> index = Index()
+            >>> samples = [MySample(key="0", text="hello")]
+            >>> entry = index.write_samples(samples, name="my-dataset")
+        """
+        import tempfile
+
+        from atdata.dataset import write_samples as _write_samples
+        from atdata.atmosphere.store import (
+            PDS_BLOB_LIMIT_BYTES,
+            PDS_TOTAL_DATASET_LIMIT_BYTES,
+        )
+        from atdata._logging import log_operation
+
+        backend_key, resolved_name, _ = self._resolve_prefix(name)
+        is_atmosphere = backend_key == "_atmosphere"
+
+        with log_operation("Index.write_samples", name=name):
+            # --- Atmosphere size guards ---
+            if is_atmosphere and not force:
+                if maxsize is not None and maxsize > PDS_BLOB_LIMIT_BYTES:
+                    raise ValueError(
+                        f"maxsize={maxsize} exceeds PDS blob limit "
+                        f"({PDS_BLOB_LIMIT_BYTES} bytes). "
+                        f"Pass force=True to bypass."
+                    )
+
+            # Default maxsize for atmosphere targets
+            effective_maxsize = maxsize
+            if is_atmosphere and effective_maxsize is None:
+                effective_maxsize = PDS_BLOB_LIMIT_BYTES
+
+            # Resolve the effective data store
+            if is_atmosphere:
+                atmo = self._get_atmosphere()
+                if atmo is None:
+                    raise ValueError(
+                        f"Atmosphere backend required for name {name!r} but not available."
+                    )
+                if data_store is None:
+                    from atdata.atmosphere.store import PDSBlobStore
+
+                    effective_store: AbstractDataStore | None = PDSBlobStore(
+                        atmo.client
+                    )
+                else:
+                    effective_store = data_store
+            else:
+                repo = self._repos.get(backend_key)
+                effective_store = data_store or (
+                    repo.data_store if repo is not None else None
+                )
+                needs_auto_store = repo is not None and effective_store is None
+                if needs_auto_store:
+                    from atdata.stores._disk import LocalDiskStore
+
+                    effective_store = LocalDiskStore()
+
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_path = Path(tmp_dir) / "data.tar"
+                ds = _write_samples(
+                    samples,
+                    tmp_path,
+                    maxcount=maxcount,
+                    maxsize=effective_maxsize,
+                    manifest=manifest,
+                )
+
+                # Atmosphere total-size guard (after writing so we can measure)
+                if is_atmosphere and not force:
+                    total_bytes = _estimate_dataset_bytes(ds)
+                    if total_bytes > PDS_TOTAL_DATASET_LIMIT_BYTES:
+                        raise ValueError(
+                            f"Total dataset size ({total_bytes} bytes) exceeds "
+                            f"atmosphere limit ({PDS_TOTAL_DATASET_LIMIT_BYTES} "
+                            f"bytes). Pass force=True to bypass."
+                        )
+
+                if is_atmosphere:
+                    # Write shards through the store, then publish record
+                    # with the resulting URLs (not the temp paths).
+                    written_urls = effective_store.write_shards(
+                        ds, prefix=resolved_name
+                    )
+
+                    # If write_shards returned blob refs (e.g. ShardUploadResult),
+                    # use storageBlobs so the PDS retains the uploaded blobs.
+                    # Fall back to storageExternal with AT URIs otherwise.
+                    blob_refs = getattr(written_urls, "blob_refs", None) or None
+
+                    return self.insert_dataset(
+                        ds,
+                        name=name,
+                        schema_ref=schema_ref,
+                        metadata=metadata,
+                        description=description,
+                        tags=tags,
+                        license=license,
+                        data_store=data_store,
+                        force=force,
+                        _data_urls=written_urls,
+                        _blob_refs=blob_refs,
+                    )
+
+                # Local / named repo path
+                repo = self._repos.get(backend_key)
+                if repo is not None and effective_store is not None:
+                    return self._insert_dataset_to_provider(
+                        ds,
+                        name=resolved_name,
+                        schema_ref=schema_ref,
+                        provider=repo.provider,
+                        store=effective_store,
+                        metadata=metadata,
+                    )
+
+                return self.insert_dataset(
+                    ds,
+                    name=name,
+                    schema_ref=schema_ref,
+                    metadata=metadata,
+                    description=description,
+                    tags=tags,
+                    license=license,
+                )
+
+    def write(
+        self,
+        samples: Iterable,
+        *,
+        name: str,
+        **kwargs: Any,
+    ) -> "IndexEntry":
+        """Write samples and create an index entry.
+
+        .. deprecated::
+            Use :meth:`write_samples` instead.
+        """
+        import warnings
+
+        warnings.warn(
+            "Index.write() is deprecated, use Index.write_samples()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.write_samples(samples, name=name, **kwargs)
+
     def get_dataset(self, ref: str) -> "IndexEntry":
         """Get a dataset entry by name or prefixed reference.
 
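Note: write_samples() is the new one-step publish path layered on insert_dataset(). A hedged usage sketch (MySample stands in for any Packable dataclass, and the @alice.example handle is invented):

    index = Index()
    samples = [MySample(key="0", text="hello"), MySample(key="1", text="world")]

    # Local target: shards go to the local repository's data store
    # (a LocalDiskStore under ~/.atdata/data/ is created if none is configured).
    entry = index.write_samples(samples, name="mnist-train", maxcount=5_000)

    # Atmosphere target: shards are uploaded as PDS blobs, subject to the
    # 50 MB per-shard / 1 GB total limits unless force=True.
    entry = index.write_samples(samples, name="@alice.example/mnist-train", tags=["demo"])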
@@ -659,14 +1056,10 @@ class Index:
             if atmo is None:
                 raise ValueError(
                     f"Atmosphere backend required for path {ref!r} but not available. "
-                    "Install 'atproto' or pass an AtmosphereClient."
+                    "Install 'atproto' or pass an Atmosphere."
                 )
             return atmo.get_dataset(resolved_ref)
 
-        if backend_key == "local":
-            return self._provider.get_entry_by_name(resolved_ref)
-
-        # Named repository
         repo = self._repos.get(backend_key)
         if repo is None:
             raise KeyError(f"Unknown repository {backend_key!r} in ref {ref!r}")
@@ -676,14 +1069,13 @@ class Index:
     def datasets(self) -> Generator["IndexEntry", None, None]:
         """Lazily iterate over all dataset entries across local repositories.
 
-        Yields entries from the ``"local"`` repository and all named
-        repositories. Atmosphere entries are not included (use
+        Yields entries from all mounted repositories (``"local"`` and named).
+        Atmosphere entries are not included (use
         ``list_datasets(repo="_atmosphere")`` for those).
 
         Yields:
             IndexEntry for each dataset.
         """
-        yield from self._provider.iter_entries()
         for repo in self._repos.values():
             yield from repo.provider.iter_entries()
 
@@ -702,9 +1094,6 @@ class Index:
         if repo is None:
             return list(self.datasets)
 
-        if repo == "local":
-            return self.list_entries()
-
         if repo == "_atmosphere":
             atmo = self._get_atmosphere()
             if atmo is None:
@@ -740,7 +1129,7 @@ class Index:
         the class docstring.
 
         Returns:
-            Schema reference string: 'atdata://local/sampleSchema/{name}@{version}'.
+            Schema reference string: 'atdata://local/schema/{name}@{version}'.
 
         Raises:
             ValueError: If sample_type is not a dataclass.
@@ -794,7 +1183,7 @@ class Index:
 
         Args:
             ref: Schema reference string. Supports both new format
-                (atdata://local/sampleSchema/{name}@{version}) and legacy
+                (atdata://local/schema/{name}@{version}) and legacy
                 format (local://schemas/{module.Class}@{version}).
 
         Returns:
@@ -871,7 +1260,7 @@ class Index:
         The returned class has proper type information that IDEs can understand.
 
         Args:
-            ref: Schema reference string (atdata://local/sampleSchema/... or
+            ref: Schema reference string (atdata://local/schema/... or
                 legacy local://schemas/...).
 
         Returns:
@@ -938,3 +1327,159 @@ class Index:
         if self._stub_manager is not None:
             return self._stub_manager.clear_stubs()
         return 0
+
+    # -- Atmosphere promotion --
+
+    def promote_entry(
+        self,
+        entry_name: str,
+        *,
+        name: str | None = None,
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+    ) -> str:
+        """Promote a locally-indexed dataset to the atmosphere.
+
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.
+
+        Args:
+            entry_name: Name of the local dataset entry to promote.
+            name: Override name for the atmosphere record. Defaults to
+                the local entry name.
+            description: Optional description for the dataset.
+            tags: Optional tags for discovery.
+            license: Optional license identifier.
+
+        Returns:
+            AT URI of the created atmosphere dataset record.
+
+        Raises:
+            ValueError: If atmosphere backend is not available, or
+                the local entry has no data URLs.
+            KeyError: If the entry or its schema is not found.
+
+        Examples:
+            >>> index = Index(atmosphere=client)
+            >>> uri = index.promote_entry("mnist-train")
+        """
+        import warnings
+
+        warnings.warn(
+            "Index.promote_entry() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        from atdata.promote import _find_or_publish_schema
+        from atdata.atmosphere import DatasetPublisher
+        from atdata._schema_codec import schema_to_type
+        from atdata._logging import log_operation
+
+        atmo = self._get_atmosphere()
+        if atmo is None:
+            raise ValueError("Atmosphere backend required but not available.")
+
+        with log_operation("Index.promote_entry", entry_name=entry_name):
+            entry = self.get_entry_by_name(entry_name)
+            if not entry.data_urls:
+                raise ValueError(f"Local entry {entry_name!r} has no data URLs")
+
+            schema_record = self.get_schema(entry.schema_ref)
+            sample_type = schema_to_type(schema_record)
+            schema_version = schema_record.get("version", "1.0.0")
+
+            atmosphere_schema_uri = _find_or_publish_schema(
+                sample_type,
+                schema_version,
+                atmo.client,
+                description=schema_record.get("description"),
+            )
+
+            publisher = DatasetPublisher(atmo.client)
+            uri = publisher.publish_with_urls(
+                urls=entry.data_urls,
+                schema_uri=atmosphere_schema_uri,
+                name=name or entry.name,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=entry.metadata,
+            )
+            return str(uri)
+
+    def promote_dataset(
+        self,
+        dataset: Dataset,
+        *,
+        name: str,
+        sample_type: type | None = None,
+        schema_version: str = "1.0.0",
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+    ) -> str:
+        """Publish a Dataset directly to the atmosphere.
+
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.
+
+        Args:
+            dataset: The Dataset to publish.
+            name: Name for the atmosphere dataset record.
+            sample_type: Sample type for schema publishing. Inferred from
+                ``dataset.sample_type`` if not provided.
+            schema_version: Semantic version for the schema. Default: ``"1.0.0"``.
+            description: Optional description for the dataset.
+            tags: Optional tags for discovery.
+            license: Optional license identifier.
+
+        Returns:
+            AT URI of the created atmosphere dataset record.
+
+        Raises:
+            ValueError: If atmosphere backend is not available.
+
+        Examples:
+            >>> index = Index(atmosphere=client)
+            >>> ds = atdata.load_dataset("./data.tar", MySample, split="train")
+            >>> uri = index.promote_dataset(ds, name="my-dataset")
+        """
+        import warnings
+
+        warnings.warn(
+            "Index.promote_dataset() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        from atdata.promote import _find_or_publish_schema
+        from atdata.atmosphere import DatasetPublisher
+        from atdata._logging import log_operation
+
+        atmo = self._get_atmosphere()
+        if atmo is None:
+            raise ValueError("Atmosphere backend required but not available.")
+
+        with log_operation("Index.promote_dataset", name=name):
+            st = sample_type or dataset.sample_type
+
+            atmosphere_schema_uri = _find_or_publish_schema(
+                st,
+                schema_version,
+                atmo.client,
+                description=description,
+            )
+
+            data_urls = dataset.list_shards()
+
+            publisher = DatasetPublisher(atmo.client)
+            uri = publisher.publish_with_urls(
+                urls=data_urls,
+                schema_uri=atmosphere_schema_uri,
+                name=name,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=dataset._metadata,
+            )
+            return str(uri)
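Note: both promotion helpers are now deprecated in favour of the unified insert path. A migration sketch (the handle is invented for illustration):

    uri = index.promote_dataset(ds, name="my-dataset", tags=["vision"])                   # deprecated
    entry = index.insert_dataset(ds, name="@alice.example/my-dataset", tags=["vision"])   # replacement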