atdata-0.3.0b1-py3-none-any.whl → atdata-0.3.1b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46):
  1. atdata/__init__.py +9 -0
  2. atdata/_cid.py +0 -21
  3. atdata/_helpers.py +12 -0
  4. atdata/_hf_api.py +33 -1
  5. atdata/_protocols.py +64 -182
  6. atdata/_schema_codec.py +2 -2
  7. atdata/_stub_manager.py +5 -25
  8. atdata/atmosphere/__init__.py +12 -11
  9. atdata/atmosphere/_types.py +4 -4
  10. atdata/atmosphere/client.py +64 -12
  11. atdata/atmosphere/lens.py +11 -12
  12. atdata/atmosphere/records.py +9 -10
  13. atdata/atmosphere/schema.py +14 -16
  14. atdata/atmosphere/store.py +6 -7
  15. atdata/cli/__init__.py +16 -16
  16. atdata/cli/diagnose.py +2 -2
  17. atdata/cli/{local.py → infra.py} +10 -10
  18. atdata/dataset.py +155 -2
  19. atdata/index/__init__.py +54 -0
  20. atdata/{local → index}/_index.py +322 -64
  21. atdata/{local → index}/_schema.py +5 -5
  22. atdata/lexicons/__init__.py +121 -0
  23. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  24. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  25. atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
  26. atdata/lexicons/ac.foundation.dataset.record.json +96 -0
  27. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  28. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  29. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
  30. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  31. atdata/lexicons/ndarray_shim.json +16 -0
  32. atdata/local/__init__.py +12 -13
  33. atdata/local/_repo_legacy.py +3 -3
  34. atdata/promote.py +14 -10
  35. atdata/repository.py +7 -7
  36. atdata/stores/__init__.py +23 -0
  37. atdata/stores/_disk.py +123 -0
  38. atdata/testing.py +12 -8
  39. {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +2 -2
  40. atdata-0.3.1b1.dist-info/RECORD +67 -0
  41. atdata-0.3.0b1.dist-info/RECORD +0 -54
  42. /atdata/{local → index}/_entry.py +0 -0
  43. /atdata/{local → stores}/_s3.py +0 -0
  44. {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
  45. {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
  46. {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
@@ -7,8 +7,8 @@ from atdata import (
7
7
  )
8
8
  from atdata._protocols import AbstractDataStore, Packable
9
9
 
10
- from atdata.local._entry import LocalDatasetEntry
11
- from atdata.local._schema import (
10
+ from atdata.index._entry import LocalDatasetEntry
11
+ from atdata.index._schema import (
12
12
  SchemaNamespace,
13
13
  LocalSchemaRecord,
14
14
  _schema_ref_from_type,
@@ -21,6 +21,7 @@ from atdata.local._schema import (
21
21
  from pathlib import Path
22
22
  from typing import (
23
23
  Any,
24
+ Iterable,
24
25
  Type,
25
26
  TypeVar,
26
27
  Generator,
@@ -41,8 +42,8 @@ class Index:
41
42
  """Unified index for tracking datasets across multiple repositories.
42
43
 
43
44
  Implements the AbstractIndex protocol. Maintains a registry of
44
- dataset entries across a built-in ``"local"`` repository, optional
45
- named repositories, and an optional atmosphere (ATProto) backend.
45
+ dataset entries across named repositories (always including a built-in
46
+ ``"local"`` repository) and an optional atmosphere (ATProto) backend.
46
47
 
47
48
  The ``"local"`` repository is always present and uses the storage backend
48
49
  determined by the ``provider`` argument. When no provider is given, defaults
@@ -52,14 +53,12 @@ class Index:
52
53
  Additional named repositories can be mounted via the ``repos`` parameter,
53
54
  each pairing an IndexProvider with an optional data store.
54
55
 
55
- An AtmosphereClient is available by default for anonymous read-only
56
+ An Atmosphere is available by default for anonymous read-only
56
57
  resolution of ``@handle/dataset`` paths. Pass an authenticated client
57
58
  for write operations, or ``atmosphere=None`` to disable.
58
59
 
59
60
  Attributes:
60
- _provider: IndexProvider for the built-in ``"local"`` repository.
61
- _data_store: Optional AbstractDataStore for the local repository.
62
- _repos: Named repositories beyond ``"local"``.
61
+ _repos: All repositories keyed by name. ``"local"`` is always present.
63
62
  _atmosphere: Optional atmosphere backend for ATProto operations.
64
63
  """
65
64
 
@@ -105,7 +104,7 @@ class Index:
105
104
  atmosphere: ATProto client for distributed network operations.
106
105
  - Default (sentinel): creates an anonymous read-only client
107
106
  lazily on first access.
108
- - ``AtmosphereClient`` instance: uses that client directly.
107
+ - ``Atmosphere`` instance: uses that client directly.
109
108
  - ``None``: disables atmosphere backend entirely.
110
109
  auto_stubs: If True, automatically generate .pyi stub files when
111
110
  schemas are accessed via get_schema() or decode_schema().
@@ -146,12 +145,13 @@ class Index:
146
145
  ##
147
146
 
148
147
  from atdata.providers._base import IndexProvider as _IP
148
+ from atdata.repository import Repository as _Repo
149
149
 
150
+ # Resolve the local provider
150
151
  if isinstance(provider, str):
151
- # String-based provider selection
152
152
  from atdata.providers._factory import create_provider
153
153
 
154
- self._provider: _IP = create_provider(
154
+ local_provider: _IP = create_provider(
155
155
  provider, path=path, dsn=dsn, redis=redis, **kwargs
156
156
  )
157
157
  elif provider is not None:
@@ -160,27 +160,24 @@ class Index:
160
160
  f"provider must be an IndexProvider or backend name string, "
161
161
  f"got {type(provider).__name__}"
162
162
  )
163
- self._provider = provider
163
+ local_provider = provider
164
164
  elif redis is not None:
165
- # Explicit Redis connection provided
166
165
  from atdata.providers._redis import RedisProvider
167
166
 
168
- self._provider = RedisProvider(redis)
167
+ local_provider = RedisProvider(redis)
169
168
  elif kwargs:
170
- # kwargs provided — assume Redis constructor args for compat
171
169
  from atdata.providers._redis import RedisProvider
172
170
 
173
- self._provider = RedisProvider(Redis(**kwargs))
171
+ local_provider = RedisProvider(Redis(**kwargs))
174
172
  else:
175
- # Default: zero-dependency SQLite
176
173
  from atdata.providers._sqlite import SqliteProvider
177
174
 
178
- self._provider = SqliteProvider()
179
-
180
- self._data_store = data_store
175
+ local_provider = SqliteProvider()
181
176
 
182
- # Validate and store named repositories
183
- from atdata.repository import Repository as _Repo
177
+ # Build the unified repos dict with "local" always present
178
+ self._repos: dict[str, _Repo] = {
179
+ "local": _Repo(provider=local_provider, data_store=data_store),
180
+ }
184
181
 
185
182
  if repos is not None:
186
183
  if "local" in repos:
@@ -194,9 +191,7 @@ class Index:
194
191
  f"repos[{name!r}] must be a Repository, "
195
192
  f"got {type(repo).__name__}"
196
193
  )
197
- self._repos: dict[str, _Repo] = dict(repos)
198
- else:
199
- self._repos = {}
194
+ self._repos.update(repos)
200
195
 
201
196
  # Atmosphere backend (lazy or explicit)
202
197
  from atdata.repository import _AtmosphereBackend
@@ -230,10 +225,10 @@ class Index:
230
225
  """Get the atmosphere backend, lazily creating anonymous client if needed."""
231
226
  if self._atmosphere_deferred and self._atmosphere is None:
232
227
  try:
233
- from atdata.atmosphere.client import AtmosphereClient
228
+ from atdata.atmosphere.client import Atmosphere
234
229
  from atdata.repository import _AtmosphereBackend
235
230
 
236
- client = AtmosphereClient()
231
+ client = Atmosphere()
237
232
  self._atmosphere = _AtmosphereBackend(client)
238
233
  except ImportError:
239
234
  # atproto package not installed -- atmosphere unavailable
@@ -289,13 +284,13 @@ class Index:
289
284
  return ("local", ref, None)
290
285
 
291
286
  @property
292
- def repos(self) -> dict[str, Repository]:
293
- """Named repositories mounted on this index (excluding ``"local"``)."""
287
+ def repos(self) -> dict[str, "Repository"]:
288
+ """All repositories mounted on this index (including ``"local"``)."""
294
289
  return dict(self._repos)
295
290
 
296
291
  @property
297
292
  def atmosphere(self) -> Any:
298
- """The AtmosphereClient for this index, or None if disabled.
293
+ """The Atmosphere for this index, or None if disabled.
299
294
 
300
295
  Returns the underlying client (not the internal backend wrapper).
301
296
  """
@@ -304,10 +299,15 @@ class Index:
304
299
  return backend.client
305
300
  return None
306
301
 
302
+ @property
303
+ def _provider(self) -> "IndexProvider": # noqa: F821
304
+ """IndexProvider for the ``"local"`` repository (backward compat)."""
305
+ return self._repos["local"].provider
306
+
307
307
  @property
308
308
  def provider(self) -> "IndexProvider": # noqa: F821
309
- """The storage provider backing this index."""
310
- return self._provider
309
+ """The storage provider backing the ``"local"`` repository."""
310
+ return self._repos["local"].provider
311
311
 
312
312
  @property
313
313
  def _redis(self) -> Redis:
@@ -318,17 +318,23 @@ class Index:
318
318
  """
319
319
  from atdata.providers._redis import RedisProvider
320
320
 
321
- if isinstance(self._provider, RedisProvider):
322
- return self._provider.redis
321
+ prov = self._repos["local"].provider
322
+ if isinstance(prov, RedisProvider):
323
+ return prov.redis
323
324
  raise AttributeError(
324
325
  "Index._redis is only available with a Redis provider. "
325
326
  "Use index.provider instead."
326
327
  )
327
328
 
329
+ @property
330
+ def _data_store(self) -> AbstractDataStore | None:
331
+ """Data store for the ``"local"`` repository (backward compat)."""
332
+ return self._repos["local"].data_store
333
+
328
334
  @property
329
335
  def data_store(self) -> AbstractDataStore | None:
330
336
  """The data store for writing shards, or None if index-only."""
331
- return self._data_store
337
+ return self._repos["local"].data_store
332
338
 
333
339
  @property
334
340
  def stub_dir(self) -> Path | None:
@@ -351,7 +357,7 @@ class Index:
351
357
  as attributes on this namespace.
352
358
 
353
359
  Examples:
354
- >>> index.load_schema("atdata://local/sampleSchema/MySample@1.0.0")
360
+ >>> index.load_schema("atdata://local/schema/MySample@1.0.0")
355
361
  >>> MyType = index.types.MySample
356
362
  >>> sample = MyType(name="hello", value=42)
357
363
 
@@ -368,7 +374,7 @@ class Index:
368
374
  in the :attr:`types` namespace for easy access.
369
375
 
370
376
  Args:
371
- ref: Schema reference string (atdata://local/sampleSchema/... or
377
+ ref: Schema reference string (atdata://local/schema/... or
372
378
  legacy local://schemas/...).
373
379
 
374
380
  Returns:
@@ -381,11 +387,11 @@ class Index:
381
387
 
382
388
  Examples:
383
389
  >>> # Load and use immediately
384
- >>> MyType = index.load_schema("atdata://local/sampleSchema/MySample@1.0.0")
390
+ >>> MyType = index.load_schema("atdata://local/schema/MySample@1.0.0")
385
391
  >>> sample = MyType(field1="hello", field2=42)
386
392
  >>>
387
393
  >>> # Or access later via namespace
388
- >>> index.load_schema("atdata://local/sampleSchema/OtherType@1.0.0")
394
+ >>> index.load_schema("atdata://local/schema/OtherType@1.0.0")
389
395
  >>> other = index.types.OtherType(data="test")
390
396
  """
391
397
  # Decode the schema (uses generated module if auto_stubs enabled)
@@ -513,6 +519,23 @@ class Index:
513
519
 
514
520
  # AbstractIndex protocol methods
515
521
 
522
+ @staticmethod
523
+ def _ensure_schema_stored(
524
+ schema_ref: str,
525
+ sample_type: type,
526
+ provider: "IndexProvider", # noqa: F821
527
+ ) -> None:
528
+ """Persist the schema definition if not already stored.
529
+
530
+ Called during dataset insertion so that ``decode_schema()`` can
531
+ reconstruct the type later without the caller needing to publish
532
+ the schema separately.
533
+ """
534
+ schema_name, version = _parse_schema_ref(schema_ref)
535
+ if provider.get_schema_json(schema_name, version) is None:
536
+ record = _build_schema_record(sample_type, version=version)
537
+ provider.store_schema(schema_name, version, json.dumps(record))
538
+
516
539
  def _insert_dataset_to_provider(
517
540
  self,
518
541
  ds: Dataset,
@@ -543,6 +566,8 @@ class Index:
543
566
  if schema_ref is None:
544
567
  schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
545
568
 
569
+ self._ensure_schema_stored(schema_ref, ds.sample_type, provider)
570
+
546
571
  entry_metadata = metadata if metadata is not None else ds._metadata
547
572
  entry = LocalDatasetEntry(
548
573
  name=name,
@@ -557,6 +582,8 @@ class Index:
557
582
  if schema_ref is None:
558
583
  schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
559
584
 
585
+ self._ensure_schema_stored(schema_ref, ds.sample_type, provider)
586
+
560
587
  data_urls = [ds.url]
561
588
  entry_metadata = metadata if metadata is not None else ds._metadata
562
589
 
@@ -612,17 +639,6 @@ class Index:
612
639
  ds, name=resolved_name, schema_ref=schema_ref, **kwargs
613
640
  )
614
641
 
615
- if backend_key == "local":
616
- return self._insert_dataset_to_provider(
617
- ds,
618
- name=resolved_name,
619
- schema_ref=schema_ref,
620
- provider=self._provider,
621
- store=self._data_store,
622
- **kwargs,
623
- )
624
-
625
- # Named repository
626
642
  repo = self._repos.get(backend_key)
627
643
  if repo is None:
628
644
  raise KeyError(f"Unknown repository {backend_key!r} in name {name!r}")
@@ -635,6 +651,117 @@ class Index:
635
651
  **kwargs,
636
652
  )
637
653
 
654
+ def write(
655
+ self,
656
+ samples: Iterable,
657
+ *,
658
+ name: str,
659
+ schema_ref: str | None = None,
660
+ description: str | None = None,
661
+ tags: list[str] | None = None,
662
+ license: str | None = None,
663
+ maxcount: int = 10_000,
664
+ maxsize: int | None = None,
665
+ metadata: dict | None = None,
666
+ manifest: bool = False,
667
+ ) -> "IndexEntry":
668
+ """Write samples and create an index entry in one step.
669
+
670
+ This is the primary method for publishing data. It serializes
671
+ samples to WebDataset tar files, stores them via the appropriate
672
+ backend, and creates an index entry.
673
+
674
+ The target backend is determined by the *name* prefix:
675
+
676
+ - Bare name (e.g., ``"mnist"``): writes to the local repository.
677
+ - ``"@handle/name"``: writes and publishes to the atmosphere.
678
+ - ``"repo/name"``: writes to a named repository.
679
+
680
+ When the local backend has no ``data_store`` configured, a
681
+ ``LocalDiskStore`` is created automatically at
682
+ ``~/.atdata/data/`` so that samples have persistent storage.
683
+
684
+ .. note::
685
+
686
+ This method is synchronous. Samples are written to a temporary
687
+ location first, then copied to permanent storage by the backend.
688
+ Avoid passing lazily-evaluated iterators that depend on external
689
+ state that may change during the call.
690
+
691
+ Args:
692
+ samples: Iterable of ``Packable`` samples. Must be non-empty.
693
+ name: Dataset name, optionally prefixed with target.
694
+ schema_ref: Optional schema reference. Auto-generated if ``None``.
695
+ description: Optional dataset description (atmosphere only).
696
+ tags: Optional tags for discovery (atmosphere only).
697
+ license: Optional license identifier (atmosphere only).
698
+ maxcount: Max samples per shard. Default: 10,000.
699
+ maxsize: Max bytes per shard. Default: ``None``.
700
+ metadata: Optional metadata dict stored with the entry.
701
+ manifest: If True, write per-shard manifest sidecar files
702
+ alongside each tar. Default: ``False``.
703
+
704
+ Returns:
705
+ IndexEntry for the created dataset.
706
+
707
+ Raises:
708
+ ValueError: If *samples* is empty.
709
+
710
+ Examples:
711
+ >>> index = Index()
712
+ >>> samples = [MySample(key="0", text="hello")]
713
+ >>> entry = index.write(samples, name="my-dataset")
714
+ """
715
+ import tempfile
716
+
717
+ from atdata.dataset import write_samples
718
+
719
+ backend_key, resolved_name, _ = self._resolve_prefix(name)
720
+
721
+ # Resolve the target repo's data store; auto-create LocalDiskStore
722
+ # for repos that have no store so write() always persists data.
723
+ repo = self._repos.get(backend_key)
724
+ effective_store = repo.data_store if repo is not None else None
725
+ needs_auto_store = repo is not None and effective_store is None
726
+
727
+ if needs_auto_store and backend_key != "_atmosphere":
728
+ from atdata.stores._disk import LocalDiskStore
729
+
730
+ effective_store = LocalDiskStore()
731
+
732
+ with tempfile.TemporaryDirectory() as tmp_dir:
733
+ tmp_path = Path(tmp_dir) / "data.tar"
734
+ ds = write_samples(
735
+ samples,
736
+ tmp_path,
737
+ maxcount=maxcount,
738
+ maxsize=maxsize,
739
+ manifest=manifest,
740
+ )
741
+
742
+ # When we auto-created a store, write directly through it
743
+ # rather than via insert_dataset (which would just index
744
+ # the temp path).
745
+ if needs_auto_store and repo is not None:
746
+ return self._insert_dataset_to_provider(
747
+ ds,
748
+ name=resolved_name,
749
+ schema_ref=schema_ref,
750
+ provider=repo.provider,
751
+ store=effective_store,
752
+ metadata=metadata,
753
+ )
754
+
755
+ return self.insert_dataset(
756
+ ds,
757
+ name=name,
758
+ schema_ref=schema_ref,
759
+ metadata=metadata,
760
+ description=description,
761
+ tags=tags,
762
+ license=license,
763
+ )
764
+
638
765
  def get_dataset(self, ref: str) -> "IndexEntry":
639
766
  """Get a dataset entry by name or prefixed reference.
640
767
 
@@ -659,14 +786,10 @@ class Index:
659
786
  if atmo is None:
660
787
  raise ValueError(
661
788
  f"Atmosphere backend required for path {ref!r} but not available. "
662
- "Install 'atproto' or pass an AtmosphereClient."
789
+ "Install 'atproto' or pass an Atmosphere."
663
790
  )
664
791
  return atmo.get_dataset(resolved_ref)
665
792
 
666
- if backend_key == "local":
667
- return self._provider.get_entry_by_name(resolved_ref)
668
-
669
- # Named repository
670
793
  repo = self._repos.get(backend_key)
671
794
  if repo is None:
672
795
  raise KeyError(f"Unknown repository {backend_key!r} in ref {ref!r}")
@@ -676,14 +799,13 @@ class Index:
676
799
  def datasets(self) -> Generator["IndexEntry", None, None]:
677
800
  """Lazily iterate over all dataset entries across local repositories.
678
801
 
679
- Yields entries from the ``"local"`` repository and all named
680
- repositories. Atmosphere entries are not included (use
802
+ Yields entries from all mounted repositories (``"local"`` and named).
803
+ Atmosphere entries are not included (use
681
804
  ``list_datasets(repo="_atmosphere")`` for those).
682
805
 
683
806
  Yields:
684
807
  IndexEntry for each dataset.
685
808
  """
686
- yield from self._provider.iter_entries()
687
809
  for repo in self._repos.values():
688
810
  yield from repo.provider.iter_entries()
689
811
 
@@ -702,9 +824,6 @@ class Index:
702
824
  if repo is None:
703
825
  return list(self.datasets)
704
826
 
705
- if repo == "local":
706
- return self.list_entries()
707
-
708
827
  if repo == "_atmosphere":
709
828
  atmo = self._get_atmosphere()
710
829
  if atmo is None:
@@ -740,7 +859,7 @@ class Index:
740
859
  the class docstring.
741
860
 
742
861
  Returns:
743
- Schema reference string: 'atdata://local/sampleSchema/{name}@{version}'.
862
+ Schema reference string: 'atdata://local/schema/{name}@{version}'.
744
863
 
745
864
  Raises:
746
865
  ValueError: If sample_type is not a dataclass.
@@ -794,7 +913,7 @@ class Index:
794
913
 
795
914
  Args:
796
915
  ref: Schema reference string. Supports both new format
797
- (atdata://local/sampleSchema/{name}@{version}) and legacy
916
+ (atdata://local/schema/{name}@{version}) and legacy
798
917
  format (local://schemas/{module.Class}@{version}).
799
918
 
800
919
  Returns:
@@ -871,7 +990,7 @@ class Index:
871
990
  The returned class has proper type information that IDEs can understand.
872
991
 
873
992
  Args:
874
- ref: Schema reference string (atdata://local/sampleSchema/... or
993
+ ref: Schema reference string (atdata://local/schema/... or
875
994
  legacy local://schemas/...).
876
995
 
877
996
  Returns:
@@ -938,3 +1057,142 @@ class Index:
938
1057
  if self._stub_manager is not None:
939
1058
  return self._stub_manager.clear_stubs()
940
1059
  return 0
1060
+
1061
+ # -- Atmosphere promotion --
1062
+
1063
+ def promote_entry(
1064
+ self,
1065
+ entry_name: str,
1066
+ *,
1067
+ name: str | None = None,
1068
+ description: str | None = None,
1069
+ tags: list[str] | None = None,
1070
+ license: str | None = None,
1071
+ ) -> str:
1072
+ """Promote a locally-indexed dataset to the atmosphere.
1073
+
1074
+ Looks up the entry by name in the local index, resolves its
1075
+ schema, and publishes both schema and dataset record to ATProto
1076
+ via the index's atmosphere backend.
1077
+
1078
+ Args:
1079
+ entry_name: Name of the local dataset entry to promote.
1080
+ name: Override name for the atmosphere record. Defaults to
1081
+ the local entry name.
1082
+ description: Optional description for the dataset.
1083
+ tags: Optional tags for discovery.
1084
+ license: Optional license identifier.
1085
+
1086
+ Returns:
1087
+ AT URI of the created atmosphere dataset record.
1088
+
1089
+ Raises:
1090
+ ValueError: If atmosphere backend is not available, or
1091
+ the local entry has no data URLs.
1092
+ KeyError: If the entry or its schema is not found.
1093
+
1094
+ Examples:
1095
+ >>> index = Index(atmosphere=client)
1096
+ >>> uri = index.promote_entry("mnist-train")
1097
+ """
1098
+ from atdata.promote import _find_or_publish_schema
1099
+ from atdata.atmosphere import DatasetPublisher
1100
+ from atdata._schema_codec import schema_to_type
1101
+
1102
+ atmo = self._get_atmosphere()
1103
+ if atmo is None:
1104
+ raise ValueError("Atmosphere backend required but not available.")
1105
+
1106
+ entry = self.get_entry_by_name(entry_name)
1107
+ if not entry.data_urls:
1108
+ raise ValueError(f"Local entry {entry_name!r} has no data URLs")
1109
+
1110
+ schema_record = self.get_schema(entry.schema_ref)
1111
+ sample_type = schema_to_type(schema_record)
1112
+ schema_version = schema_record.get("version", "1.0.0")
1113
+
1114
+ atmosphere_schema_uri = _find_or_publish_schema(
1115
+ sample_type,
1116
+ schema_version,
1117
+ atmo.client,
1118
+ description=schema_record.get("description"),
1119
+ )
1120
+
1121
+ publisher = DatasetPublisher(atmo.client)
1122
+ uri = publisher.publish_with_urls(
1123
+ urls=entry.data_urls,
1124
+ schema_uri=atmosphere_schema_uri,
1125
+ name=name or entry.name,
1126
+ description=description,
1127
+ tags=tags,
1128
+ license=license,
1129
+ metadata=entry.metadata,
1130
+ )
1131
+ return str(uri)
1132
+
1133
+ def promote_dataset(
1134
+ self,
1135
+ dataset: Dataset,
1136
+ *,
1137
+ name: str,
1138
+ sample_type: type | None = None,
1139
+ schema_version: str = "1.0.0",
1140
+ description: str | None = None,
1141
+ tags: list[str] | None = None,
1142
+ license: str | None = None,
1143
+ ) -> str:
1144
+ """Publish a Dataset directly to the atmosphere.
1145
+
1146
+ Publishes the schema (with deduplication) and creates a dataset
1147
+ record on ATProto. Uses the index's atmosphere backend.
1148
+
1149
+ Args:
1150
+ dataset: The Dataset to publish.
1151
+ name: Name for the atmosphere dataset record.
1152
+ sample_type: Sample type for schema publishing. Inferred from
1153
+ ``dataset.sample_type`` if not provided.
1154
+ schema_version: Semantic version for the schema. Default: ``"1.0.0"``.
1155
+ description: Optional description for the dataset.
1156
+ tags: Optional tags for discovery.
1157
+ license: Optional license identifier.
1158
+
1159
+ Returns:
1160
+ AT URI of the created atmosphere dataset record.
1161
+
1162
+ Raises:
1163
+ ValueError: If atmosphere backend is not available.
1164
+
1165
+ Examples:
1166
+ >>> index = Index(atmosphere=client)
1167
+ >>> ds = atdata.load_dataset("./data.tar", MySample, split="train")
1168
+ >>> uri = index.promote_dataset(ds, name="my-dataset")
1169
+ """
1170
+ from atdata.promote import _find_or_publish_schema
1171
+ from atdata.atmosphere import DatasetPublisher
1172
+
1173
+ atmo = self._get_atmosphere()
1174
+ if atmo is None:
1175
+ raise ValueError("Atmosphere backend required but not available.")
1176
+
1177
+ st = sample_type or dataset.sample_type
1178
+
1179
+ atmosphere_schema_uri = _find_or_publish_schema(
1180
+ st,
1181
+ schema_version,
1182
+ atmo.client,
1183
+ description=description,
1184
+ )
1185
+
1186
+ data_urls = dataset.list_shards()
1187
+
1188
+ publisher = DatasetPublisher(atmo.client)
1189
+ uri = publisher.publish_with_urls(
1190
+ urls=data_urls,
1191
+ schema_uri=atmosphere_schema_uri,
1192
+ name=name,
1193
+ description=description,
1194
+ tags=tags,
1195
+ license=license,
1196
+ metadata=dataset._metadata,
1197
+ )
1198
+ return str(uri)
@@ -26,7 +26,7 @@ from typing import (
26
26
  T = TypeVar("T", bound=Packable)
27
27
 
28
28
  # URI scheme prefixes
29
- _ATDATA_URI_PREFIX = "atdata://local/sampleSchema/"
29
+ _ATDATA_URI_PREFIX = "atdata://local/schema/"
30
30
  _LEGACY_URI_PREFIX = "local://schemas/"
31
31
 
32
32
 
@@ -37,7 +37,7 @@ class SchemaNamespace:
37
37
  Supports attribute access, iteration, ``len()``, and ``in`` checks.
38
38
 
39
39
  Examples:
40
- >>> index.load_schema("atdata://local/sampleSchema/MySample@1.0.0")
40
+ >>> index.load_schema("atdata://local/schema/MySample@1.0.0")
41
41
  >>> MyType = index.types.MySample
42
42
  >>> sample = MyType(field1="hello", field2=42)
43
43
 
@@ -207,7 +207,7 @@ class LocalSchemaRecord:
207
207
  """List of field definitions."""
208
208
 
209
209
  ref: str
210
- """Schema reference URI (atdata://local/sampleSchema/{name}@{version})."""
210
+ """Schema reference URI (atdata://local/schema/{name}@{version})."""
211
211
 
212
212
  description: Optional[str] = None
213
213
  """Human-readable description."""
@@ -259,7 +259,7 @@ def _kind_str_for_sample_type(st: Type[Packable]) -> str:
259
259
 
260
260
 
261
261
  def _schema_ref_from_type(sample_type: Type[Packable], version: str) -> str:
262
- """Generate 'atdata://local/sampleSchema/{name}@{version}' reference."""
262
+ """Generate 'atdata://local/schema/{name}@{version}' reference."""
263
263
  return _make_schema_ref(sample_type.__name__, version)
264
264
 
265
265
 
@@ -271,7 +271,7 @@ def _make_schema_ref(name: str, version: str) -> str:
271
271
  def _parse_schema_ref(ref: str) -> tuple[str, str]:
272
272
  """Parse schema reference into (name, version).
273
273
 
274
- Supports both new format: 'atdata://local/sampleSchema/{name}@{version}'
274
+ Supports both new format: 'atdata://local/schema/{name}@{version}'
275
275
  and legacy format: 'local://schemas/{module.Class}@{version}'
276
276
  """
277
277
  if ref.startswith(_ATDATA_URI_PREFIX):