atdata 0.3.1b1-py3-none-any.whl → 0.3.2b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/index/_index.py CHANGED
@@ -27,10 +27,11 @@ from typing import (
     Generator,
     TYPE_CHECKING,
 )
-from redis import Redis
 import json

 if TYPE_CHECKING:
+    from redis import Redis
+
     from atdata.providers._base import IndexProvider
     from atdata.repository import Repository, _AtmosphereBackend
     from atdata._protocols import IndexEntry
@@ -38,6 +39,36 @@ if TYPE_CHECKING:
 T = TypeVar("T", bound=Packable)


+def _is_local_path(url: str) -> bool:
+    """Check if a URL points to the local filesystem."""
+    return (
+        url.startswith("/")
+        or url.startswith("file://")
+        or (len(url) > 1 and url[1] == ":")
+    )
+
+
+def _is_credentialed_source(ds: Dataset) -> bool:
+    """Check if a Dataset uses a credentialed source (e.g. S3Source with keys)."""
+    from atdata._sources import S3Source
+
+    return isinstance(ds.source, S3Source)
+
+
+def _estimate_dataset_bytes(ds: Dataset) -> int:
+    """Best-effort total size estimate from local shard files.
+
+    Returns 0 when size cannot be determined (e.g. remote URLs).
+    """
+    total = 0
+    for shard_url in ds.list_shards():
+        if _is_local_path(shard_url):
+            p = Path(shard_url.removeprefix("file://"))
+            if p.exists():
+                total += p.stat().st_size
+    return total
+
+
 class Index:
     """Unified index for tracking datasets across multiple repositories.

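Reviewer note: the new `_is_local_path` heuristic treats absolute POSIX paths, `file://` URLs, and Windows drive-letter paths as local, and `_estimate_dataset_bytes` only counts shards that pass that check (remote shards contribute 0). A minimal standalone sketch of the heuristic; the example URLs are made up:

    # Mirrors the _is_local_path check added above; illustrative URLs only.
    def is_local_path(url: str) -> bool:
        return (
            url.startswith("/")
            or url.startswith("file://")
            or (len(url) > 1 and url[1] == ":")  # Windows drive letter, e.g. "C:\\data"
        )

    assert is_local_path("/data/shard-000000.tar")
    assert is_local_path("file:///tmp/shard-000000.tar")
    assert is_local_path(r"C:\datasets\shard-000000.tar")
    assert not is_local_path("https://example.org/shard-000000.tar")
    assert not is_local_path("s3://bucket/shard-000000.tar")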
@@ -166,9 +197,10 @@ class Index:

             local_provider = RedisProvider(redis)
         elif kwargs:
+            from redis import Redis as _Redis
             from atdata.providers._redis import RedisProvider

-            local_provider = RedisProvider(Redis(**kwargs))
+            local_provider = RedisProvider(_Redis(**kwargs))
         else:
             from atdata.providers._sqlite import SqliteProvider

@@ -471,6 +503,9 @@ class Index:
     ) -> LocalDatasetEntry:
         """Add a dataset to the local repository index.

+        .. deprecated::
+            Use :meth:`insert_dataset` instead.
+
         Args:
             ds: The dataset to add to the index.
             name: Human-readable name for the dataset.
@@ -480,6 +515,13 @@ class Index:
         Returns:
             The created LocalDatasetEntry object.
         """
+        import warnings
+
+        warnings.warn(
+            "Index.add_entry() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self._insert_dataset_to_provider(
             ds,
             name=name,
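Reviewer note: `add_entry` keeps its behaviour (it still forwards to the shared `_insert_dataset_to_provider` path) but now warns. A migration sketch reusing the dataset-loading pattern from the docstrings in this file; `MySample` and `./data.tar` are the docstring placeholders, and the `Index` import path is an assumption:

    import warnings

    import atdata
    from atdata.index import Index  # assumed import path for the Index class shown above

    index = Index()
    ds = atdata.load_dataset("./data.tar", MySample, split="train")

    # Deprecated spelling: still indexes the dataset, but emits DeprecationWarning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        entry = index.add_entry(ds, name="mnist-train")
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    # Preferred spelling, same local target:
    entry = index.insert_dataset(ds, name="mnist-train")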
@@ -551,17 +593,30 @@ class Index:
         This is the internal implementation shared by all local and named
         repository inserts.
         """
+        from atdata._logging import get_logger
+
+        log = get_logger()
         metadata = kwargs.get("metadata")

         if store is not None:
             prefix = kwargs.get("prefix", name)
             cache_local = kwargs.get("cache_local", False)
+            log.debug(
+                "_insert_dataset_to_provider: name=%s, store=%s",
+                name,
+                type(store).__name__,
+            )

             written_urls = store.write_shards(
                 ds,
                 prefix=prefix,
                 cache_local=cache_local,
             )
+            log.info(
+                "_insert_dataset_to_provider: %d shard(s) written for %s",
+                len(written_urls),
+                name,
+            )

             if schema_ref is None:
                 schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
@@ -576,6 +631,7 @@ class Index:
                 metadata=entry_metadata,
             )
             provider.store_entry(entry)
+            log.debug("_insert_dataset_to_provider: entry stored for %s", name)
             return entry

         # No data store - just index the existing URL
@@ -594,6 +650,7 @@ class Index:
             metadata=entry_metadata,
         )
         provider.store_entry(entry)
+        log.debug("_insert_dataset_to_provider: entry stored for %s", name)
         return entry

     def insert_dataset(
@@ -602,56 +659,170 @@ class Index:
         *,
         name: str,
         schema_ref: str | None = None,
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+        data_store: AbstractDataStore | None = None,
+        force: bool = False,
+        copy: bool = False,
+        metadata: dict | None = None,
+        _data_urls: list[str] | None = None,
+        _blob_refs: list[dict] | None = None,
         **kwargs,
     ) -> "IndexEntry":
-        """Insert a dataset into the index (AbstractIndex protocol).
+        """Insert a dataset into the index.

         The target repository is determined by a prefix in the ``name``
         argument (e.g. ``"lab/mnist"``). If no prefix is given, or the
         prefix is ``"local"``, the built-in local repository is used.

-        If the target repository has a data_store, shards are written to
-        storage first, then indexed. Otherwise, the dataset's existing URL
-        is indexed directly.
+        For atmosphere targets:
+
+        - **Local sources** are uploaded via *data_store* (defaults to
+          ``PDSBlobStore``).
+        - **Public remote sources** (http/https) are referenced as
+          external URLs unless *copy* is ``True``.
+        - **Credentialed sources** (e.g. ``S3Source``) raise an error
+          unless *copy* is ``True`` or *data_store* is provided, to
+          prevent leaking private endpoints.

         Args:
             ds: The Dataset to register.
             name: Human-readable name for the dataset, optionally prefixed
                 with a repository name (e.g. ``"lab/mnist"``).
             schema_ref: Optional schema reference.
-            **kwargs: Additional options:
-                - metadata: Optional metadata dict
-                - prefix: Storage prefix (default: dataset name)
-                - cache_local: If True, cache writes locally first
+            description: Optional dataset description (atmosphere only).
+            tags: Optional tags for discovery (atmosphere only).
+            license: Optional license identifier (atmosphere only).
+            data_store: Explicit data store for shard storage. When
+                provided, data is always copied through this store.
+            force: If True, bypass PDS size limits (50 MB per shard,
+                1 GB total). Default: ``False``.
+            copy: If True, copy data to the destination store even for
+                remote sources. Required for credentialed sources
+                targeting the atmosphere. Default: ``False``.
+            metadata: Optional metadata dict.

         Returns:
             IndexEntry for the inserted dataset.
+
+        Raises:
+            ValueError: If atmosphere limits are exceeded (when
+                *force* is ``False``), or if a credentialed source
+                targets the atmosphere without *copy*.
         """
+        from atdata.atmosphere.store import PDS_TOTAL_DATASET_LIMIT_BYTES
+
         backend_key, resolved_name, handle_or_did = self._resolve_prefix(name)
+        is_atmosphere = backend_key == "_atmosphere"

-        if backend_key == "_atmosphere":
+        if is_atmosphere:
             atmo = self._get_atmosphere()
             if atmo is None:
                 raise ValueError(
                     f"Atmosphere backend required for name {name!r} but not available."
                 )
+
+            # Providing an explicit data_store implies copy behaviour
+            needs_copy = copy or data_store is not None
+
+            # Credentialed source guard
+            if _is_credentialed_source(ds) and not needs_copy:
+                raise ValueError(
+                    "Dataset uses a credentialed source. Referencing "
+                    "these URLs in a public atmosphere record would "
+                    "leak private endpoints. Pass copy=True to copy "
+                    "data to the destination store (default: PDS blobs)."
+                )
+
+            # If we already have pre-written URLs (from write_samples),
+            # go straight to publish.
+            if _data_urls is not None:
+                return atmo.insert_dataset(
+                    ds,
+                    name=resolved_name,
+                    schema_ref=schema_ref,
+                    data_urls=_data_urls,
+                    blob_refs=_blob_refs,
+                    description=description,
+                    tags=tags,
+                    license=license,
+                    metadata=metadata,
+                    **kwargs,
+                )
+
+            # Determine whether data must be copied
+            source_is_local = _is_local_path(ds.url)
+
+            if source_is_local or needs_copy:
+                # Resolve effective store
+                if data_store is not None:
+                    effective_store = data_store
+                else:
+                    from atdata.atmosphere.store import PDSBlobStore
+
+                    effective_store = PDSBlobStore(atmo.client)
+
+                # Size guard
+                if not force:
+                    total_bytes = _estimate_dataset_bytes(ds)
+                    if total_bytes > PDS_TOTAL_DATASET_LIMIT_BYTES:
+                        raise ValueError(
+                            f"Total dataset size ({total_bytes} bytes) "
+                            f"exceeds atmosphere limit "
+                            f"({PDS_TOTAL_DATASET_LIMIT_BYTES} bytes). "
+                            f"Pass force=True to bypass."
+                        )
+
+                result = effective_store.write_shards(ds, prefix=resolved_name)
+
+                # ShardUploadResult carries blob_refs; plain list does not
+                blob_refs = getattr(result, "blob_refs", None) or None
+
+                return atmo.insert_dataset(
+                    ds,
+                    name=resolved_name,
+                    schema_ref=schema_ref,
+                    data_urls=list(result),
+                    blob_refs=blob_refs,
+                    description=description,
+                    tags=tags,
+                    license=license,
+                    metadata=metadata,
+                    **kwargs,
+                )
+
+            # Public remote source — reference existing URLs
+            data_urls = ds.list_shards()
             return atmo.insert_dataset(
-                ds, name=resolved_name, schema_ref=schema_ref, **kwargs
+                ds,
+                name=resolved_name,
+                schema_ref=schema_ref,
+                data_urls=data_urls,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=metadata,
+                **kwargs,
             )

+        # --- Local / named repo path ---
         repo = self._repos.get(backend_key)
         if repo is None:
             raise KeyError(f"Unknown repository {backend_key!r} in name {name!r}")
+
+        effective_store = data_store or repo.data_store
         return self._insert_dataset_to_provider(
             ds,
             name=resolved_name,
             schema_ref=schema_ref,
             provider=repo.provider,
-            store=repo.data_store,
+            store=effective_store,
+            metadata=metadata,
             **kwargs,
         )

-    def write(
+    def write_samples(
         self,
         samples: Iterable,
         *,
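Reviewer note: the rewritten `insert_dataset` now branches on the source type whenever the name resolves to the atmosphere backend. A usage sketch under stated assumptions (the handle, the `local_ds`, `public_ds`, and `s3_ds` datasets, and the authenticated `client` are all placeholders; the keyword names follow the new signature above):

    index = Index(atmosphere=client)  # client: authenticated ATProto client, as in the docstrings

    # Local shards: copied into the PDS as blobs via PDSBlobStore,
    # subject to the 50 MB per-shard / 1 GB total limits unless force=True.
    index.insert_dataset(local_ds, name="@alice.example/mnist", tags=["vision"])

    # Public http(s) shards: referenced as external URLs; pass copy=True to copy them instead.
    index.insert_dataset(public_ds, name="@alice.example/web-corpus")

    # Credentialed S3 source: raises ValueError unless copy=True (or a data_store is given),
    # so private endpoints never end up in a public record.
    index.insert_dataset(s3_ds, name="@alice.example/private-set", copy=True)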
@@ -664,6 +835,8 @@ class Index:
         maxsize: int | None = None,
         metadata: dict | None = None,
         manifest: bool = False,
+        data_store: AbstractDataStore | None = None,
+        force: bool = False,
     ) -> "IndexEntry":
         """Write samples and create an index entry in one step.

@@ -677,17 +850,14 @@ class Index:
         - ``"@handle/name"``: writes and publishes to the atmosphere.
         - ``"repo/name"``: writes to a named repository.

+        For atmosphere targets, data is uploaded as PDS blobs by default.
+        Shard size is capped at 50 MB and total dataset size at 1 GB
+        unless *force* is ``True``.
+
         When the local backend has no ``data_store`` configured, a
         ``LocalDiskStore`` is created automatically at
         ``~/.atdata/data/`` so that samples have persistent storage.

-        .. note::
-
-            This method is synchronous. Samples are written to a temporary
-            location first, then copied to permanent storage by the backend.
-            Avoid passing lazily-evaluated iterators that depend on external
-            state that may change during the call.
-
         Args:
             samples: Iterable of ``Packable`` samples. Must be non-empty.
             name: Dataset name, optionally prefixed with target.
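Reviewer note on the targets documented in this hunk: `write_samples` is the renamed one-step write-and-index entry point, and the old `write` name is kept as a warning shim further down in this diff. A sketch of the three name-prefix forms; `MySample` is the docstring placeholder, and the atmosphere handle and "lab" repository are assumed to be configured on the index:

    index = Index(atmosphere=client)
    samples = [MySample(key=str(i), text="hello") for i in range(3)]

    # Local backend: a LocalDiskStore under ~/.atdata/data/ is auto-created if none is configured.
    entry = index.write_samples(samples, name="my-dataset")

    # Atmosphere target: shards become PDS blobs, 50 MB per shard / 1 GB total unless force=True.
    entry = index.write_samples(samples, name="@alice.example/my-dataset")

    # Named repository target:
    entry = index.write_samples(samples, name="lab/my-dataset")

    # The old spelling still works but emits DeprecationWarning:
    entry = index.write(samples, name="my-dataset")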
@@ -696,71 +866,171 @@ class Index:
             tags: Optional tags for discovery (atmosphere only).
             license: Optional license identifier (atmosphere only).
             maxcount: Max samples per shard. Default: 10,000.
-            maxsize: Max bytes per shard. Default: ``None``.
+            maxsize: Max bytes per shard. For atmosphere targets defaults
+                to 50 MB (PDS blob limit). For local targets defaults to
+                ``None`` (unlimited).
             metadata: Optional metadata dict stored with the entry.
             manifest: If True, write per-shard manifest sidecar files
                 alongside each tar. Default: ``False``.
+            data_store: Explicit data store for shard storage. Overrides
+                the repository's default store. For atmosphere targets
+                defaults to ``PDSBlobStore``.
+            force: If True, bypass PDS size limits (50 MB per shard,
+                1 GB total dataset). Default: ``False``.

         Returns:
             IndexEntry for the created dataset.

         Raises:
-            ValueError: If *samples* is empty.
+            ValueError: If *samples* is empty, or if atmosphere size
+                limits are exceeded (when *force* is ``False``).

         Examples:
             >>> index = Index()
             >>> samples = [MySample(key="0", text="hello")]
-            >>> entry = index.write(samples, name="my-dataset")
+            >>> entry = index.write_samples(samples, name="my-dataset")
         """
         import tempfile

-        from atdata.dataset import write_samples
+        from atdata.dataset import write_samples as _write_samples
+        from atdata.atmosphere.store import (
+            PDS_BLOB_LIMIT_BYTES,
+            PDS_TOTAL_DATASET_LIMIT_BYTES,
+        )
+        from atdata._logging import log_operation

         backend_key, resolved_name, _ = self._resolve_prefix(name)
+        is_atmosphere = backend_key == "_atmosphere"
+
+        with log_operation("Index.write_samples", name=name):
+            # --- Atmosphere size guards ---
+            if is_atmosphere and not force:
+                if maxsize is not None and maxsize > PDS_BLOB_LIMIT_BYTES:
+                    raise ValueError(
+                        f"maxsize={maxsize} exceeds PDS blob limit "
+                        f"({PDS_BLOB_LIMIT_BYTES} bytes). "
+                        f"Pass force=True to bypass."
+                    )

-        # Resolve the target repo's data store; auto-create LocalDiskStore
-        # for repos that have no store so write() always persists data.
-        repo = self._repos.get(backend_key)
-        effective_store = repo.data_store if repo is not None else None
-        needs_auto_store = repo is not None and effective_store is None
-
-        if needs_auto_store and backend_key != "_atmosphere":
-            from atdata.stores._disk import LocalDiskStore
-
-            effective_store = LocalDiskStore()
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_path = Path(tmp_dir) / "data.tar"
-            ds = write_samples(
-                samples,
-                tmp_path,
-                maxcount=maxcount,
-                maxsize=maxsize,
-                manifest=manifest,
-            )
+            # Default maxsize for atmosphere targets
+            effective_maxsize = maxsize
+            if is_atmosphere and effective_maxsize is None:
+                effective_maxsize = PDS_BLOB_LIMIT_BYTES
+
+            # Resolve the effective data store
+            if is_atmosphere:
+                atmo = self._get_atmosphere()
+                if atmo is None:
+                    raise ValueError(
+                        f"Atmosphere backend required for name {name!r} but not available."
+                    )
+                if data_store is None:
+                    from atdata.atmosphere.store import PDSBlobStore
+
+                    effective_store: AbstractDataStore | None = PDSBlobStore(
+                        atmo.client
+                    )
+                else:
+                    effective_store = data_store
+            else:
+                repo = self._repos.get(backend_key)
+                effective_store = data_store or (
+                    repo.data_store if repo is not None else None
+                )
+                needs_auto_store = repo is not None and effective_store is None
+                if needs_auto_store:
+                    from atdata.stores._disk import LocalDiskStore
+
+                    effective_store = LocalDiskStore()
+
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_path = Path(tmp_dir) / "data.tar"
+                ds = _write_samples(
+                    samples,
+                    tmp_path,
+                    maxcount=maxcount,
+                    maxsize=effective_maxsize,
+                    manifest=manifest,
+                )
+
+                # Atmosphere total-size guard (after writing so we can measure)
+                if is_atmosphere and not force:
+                    total_bytes = _estimate_dataset_bytes(ds)
+                    if total_bytes > PDS_TOTAL_DATASET_LIMIT_BYTES:
+                        raise ValueError(
+                            f"Total dataset size ({total_bytes} bytes) exceeds "
+                            f"atmosphere limit ({PDS_TOTAL_DATASET_LIMIT_BYTES} "
+                            f"bytes). Pass force=True to bypass."
+                        )
+
+                if is_atmosphere:
+                    # Write shards through the store, then publish record
+                    # with the resulting URLs (not the temp paths).
+                    written_urls = effective_store.write_shards(
+                        ds, prefix=resolved_name
+                    )
+
+                    # If write_shards returned blob refs (e.g. ShardUploadResult),
+                    # use storageBlobs so the PDS retains the uploaded blobs.
+                    # Fall back to storageExternal with AT URIs otherwise.
+                    blob_refs = getattr(written_urls, "blob_refs", None) or None
+
+                    return self.insert_dataset(
+                        ds,
+                        name=name,
+                        schema_ref=schema_ref,
+                        metadata=metadata,
+                        description=description,
+                        tags=tags,
+                        license=license,
+                        data_store=data_store,
+                        force=force,
+                        _data_urls=written_urls,
+                        _blob_refs=blob_refs,
+                    )

-            # When we auto-created a store, write directly through it
-            # rather than via insert_dataset (which would just index
-            # the temp path).
-            if needs_auto_store and repo is not None:
-                return self._insert_dataset_to_provider(
+                # Local / named repo path
+                repo = self._repos.get(backend_key)
+                if repo is not None and effective_store is not None:
+                    return self._insert_dataset_to_provider(
+                        ds,
+                        name=resolved_name,
+                        schema_ref=schema_ref,
+                        provider=repo.provider,
+                        store=effective_store,
+                        metadata=metadata,
+                    )
+
+                return self.insert_dataset(
                     ds,
-                    name=resolved_name,
+                    name=name,
                     schema_ref=schema_ref,
-                    provider=repo.provider,
-                    store=effective_store,
                     metadata=metadata,
+                    description=description,
+                    tags=tags,
+                    license=license,
                 )

-        return self.insert_dataset(
-            ds,
-            name=name,
-            schema_ref=schema_ref,
-            metadata=metadata,
-            description=description,
-            tags=tags,
-            license=license,
-        )
+    def write(
+        self,
+        samples: Iterable,
+        *,
+        name: str,
+        **kwargs: Any,
+    ) -> "IndexEntry":
+        """Write samples and create an index entry.
+
+        .. deprecated::
+            Use :meth:`write_samples` instead.
+        """
+        import warnings
+
+        warnings.warn(
+            "Index.write() is deprecated, use Index.write_samples()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.write_samples(samples, name=name, **kwargs)

     def get_dataset(self, ref: str) -> "IndexEntry":
         """Get a dataset entry by name or prefixed reference.
@@ -1071,9 +1341,8 @@ class Index:
     ) -> str:
         """Promote a locally-indexed dataset to the atmosphere.

-        Looks up the entry by name in the local index, resolves its
-        schema, and publishes both schema and dataset record to ATProto
-        via the index's atmosphere backend.
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.

         Args:
             entry_name: Name of the local dataset entry to promote.
@@ -1095,40 +1364,49 @@ class Index:
         >>> index = Index(atmosphere=client)
         >>> uri = index.promote_entry("mnist-train")
         """
+        import warnings
+
+        warnings.warn(
+            "Index.promote_entry() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         from atdata.promote import _find_or_publish_schema
         from atdata.atmosphere import DatasetPublisher
         from atdata._schema_codec import schema_to_type
+        from atdata._logging import log_operation

         atmo = self._get_atmosphere()
         if atmo is None:
             raise ValueError("Atmosphere backend required but not available.")

-        entry = self.get_entry_by_name(entry_name)
-        if not entry.data_urls:
-            raise ValueError(f"Local entry {entry_name!r} has no data URLs")
+        with log_operation("Index.promote_entry", entry_name=entry_name):
+            entry = self.get_entry_by_name(entry_name)
+            if not entry.data_urls:
+                raise ValueError(f"Local entry {entry_name!r} has no data URLs")

-        schema_record = self.get_schema(entry.schema_ref)
-        sample_type = schema_to_type(schema_record)
-        schema_version = schema_record.get("version", "1.0.0")
+            schema_record = self.get_schema(entry.schema_ref)
+            sample_type = schema_to_type(schema_record)
+            schema_version = schema_record.get("version", "1.0.0")

-        atmosphere_schema_uri = _find_or_publish_schema(
-            sample_type,
-            schema_version,
-            atmo.client,
-            description=schema_record.get("description"),
-        )
+            atmosphere_schema_uri = _find_or_publish_schema(
+                sample_type,
+                schema_version,
+                atmo.client,
+                description=schema_record.get("description"),
+            )

-        publisher = DatasetPublisher(atmo.client)
-        uri = publisher.publish_with_urls(
-            urls=entry.data_urls,
-            schema_uri=atmosphere_schema_uri,
-            name=name or entry.name,
-            description=description,
-            tags=tags,
-            license=license,
-            metadata=entry.metadata,
-        )
-        return str(uri)
+            publisher = DatasetPublisher(atmo.client)
+            uri = publisher.publish_with_urls(
+                urls=entry.data_urls,
+                schema_uri=atmosphere_schema_uri,
+                name=name or entry.name,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=entry.metadata,
+            )
+            return str(uri)

     def promote_dataset(
         self,
@@ -1143,8 +1421,8 @@ class Index:
     ) -> str:
         """Publish a Dataset directly to the atmosphere.

-        Publishes the schema (with deduplication) and creates a dataset
-        record on ATProto. Uses the index's atmosphere backend.
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.

         Args:
             dataset: The Dataset to publish.
@@ -1167,32 +1445,41 @@ class Index:
         >>> ds = atdata.load_dataset("./data.tar", MySample, split="train")
         >>> uri = index.promote_dataset(ds, name="my-dataset")
         """
+        import warnings
+
+        warnings.warn(
+            "Index.promote_dataset() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         from atdata.promote import _find_or_publish_schema
         from atdata.atmosphere import DatasetPublisher
+        from atdata._logging import log_operation

         atmo = self._get_atmosphere()
         if atmo is None:
             raise ValueError("Atmosphere backend required but not available.")

-        st = sample_type or dataset.sample_type
+        with log_operation("Index.promote_dataset", name=name):
+            st = sample_type or dataset.sample_type

-        atmosphere_schema_uri = _find_or_publish_schema(
-            st,
-            schema_version,
-            atmo.client,
-            description=description,
-        )
+            atmosphere_schema_uri = _find_or_publish_schema(
+                st,
+                schema_version,
+                atmo.client,
+                description=description,
+            )

-        data_urls = dataset.list_shards()
+            data_urls = dataset.list_shards()

-        publisher = DatasetPublisher(atmo.client)
-        uri = publisher.publish_with_urls(
-            urls=data_urls,
-            schema_uri=atmosphere_schema_uri,
-            name=name,
-            description=description,
-            tags=tags,
-            license=license,
-            metadata=dataset._metadata,
-        )
-        return str(uri)
+            publisher = DatasetPublisher(atmo.client)
+            uri = publisher.publish_with_urls(
+                urls=data_urls,
+                schema_uri=atmosphere_schema_uri,
+                name=name,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=dataset._metadata,
+            )
+            return str(uri)
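Reviewer note: `promote_entry` and `promote_dataset` still publish (now wrapped in `log_operation`), but both warn and point at `insert_dataset`. A migration sketch based on the docstring examples; `client`, `MySample`, and the handle are placeholders, and note that the return types differ (an AT URI string versus an IndexEntry):

    index = Index(atmosphere=client)
    ds = atdata.load_dataset("./data.tar", MySample, split="train")

    # Deprecated: returns the dataset record's AT URI as a string.
    uri = index.promote_dataset(ds, name="my-dataset")

    # Preferred: publish through insert_dataset with an atmosphere-prefixed name;
    # this returns an IndexEntry rather than a bare URI string.
    entry = index.insert_dataset(ds, name="@alice.example/my-dataset")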