atdata 0.3.1b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- atdata/__init__.py +2 -0
- atdata/_hf_api.py +13 -0
- atdata/_logging.py +43 -0
- atdata/_protocols.py +18 -1
- atdata/_sources.py +24 -4
- atdata/atmosphere/__init__.py +48 -10
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +71 -243
- atdata/atmosphere/lens.py +49 -41
- atdata/atmosphere/records.py +282 -90
- atdata/atmosphere/schema.py +78 -50
- atdata/atmosphere/store.py +62 -59
- atdata/dataset.py +201 -135
- atdata/index/_entry.py +6 -2
- atdata/index/_index.py +396 -109
- atdata/lexicons/__init__.py +9 -3
- atdata/lexicons/ac.foundation.dataset.lens.json +2 -0
- atdata/lexicons/ac.foundation.dataset.record.json +22 -1
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +26 -4
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +1 -1
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/repository.py +59 -9
- atdata/stores/_disk.py +19 -11
- atdata/stores/_s3.py +134 -112
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +1 -1
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/RECORD +37 -33
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/index/_index.py
CHANGED
```diff
@@ -27,10 +27,11 @@ from typing import (
     Generator,
     TYPE_CHECKING,
 )
-from redis import Redis
 import json
 
 if TYPE_CHECKING:
+    from redis import Redis
+
     from atdata.providers._base import IndexProvider
     from atdata.repository import Repository, _AtmosphereBackend
     from atdata._protocols import IndexEntry
@@ -38,6 +39,36 @@ if TYPE_CHECKING:
 T = TypeVar("T", bound=Packable)
 
 
+def _is_local_path(url: str) -> bool:
+    """Check if a URL points to the local filesystem."""
+    return (
+        url.startswith("/")
+        or url.startswith("file://")
+        or (len(url) > 1 and url[1] == ":")
+    )
+
+
+def _is_credentialed_source(ds: Dataset) -> bool:
+    """Check if a Dataset uses a credentialed source (e.g. S3Source with keys)."""
+    from atdata._sources import S3Source
+
+    return isinstance(ds.source, S3Source)
+
+
+def _estimate_dataset_bytes(ds: Dataset) -> int:
+    """Best-effort total size estimate from local shard files.
+
+    Returns 0 when size cannot be determined (e.g. remote URLs).
+    """
+    total = 0
+    for shard_url in ds.list_shards():
+        if _is_local_path(shard_url):
+            p = Path(shard_url.removeprefix("file://"))
+            if p.exists():
+                total += p.stat().st_size
+    return total
+
+
 class Index:
     """Unified index for tracking datasets across multiple repositories.
 
```
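The three helpers above are private but self-contained; a quick illustrative sketch of how the `_is_local_path` heuristic classifies URLs (example values are mine, not from the package's tests):

```python
# Illustrative inputs only; _is_local_path is the private helper from the
# hunk above. Absolute POSIX paths, file:// URLs, and Windows drive paths
# (matched by url[1] == ":") count as local; remote schemes do not.
assert _is_local_path("/data/shards/shard-000000.tar")
assert _is_local_path("file:///data/shards/shard-000000.tar")
assert _is_local_path("C:\\data\\shard-000000.tar")
assert not _is_local_path("https://example.com/shard-000000.tar")
assert not _is_local_path("s3://bucket/shard-000000.tar")
```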
```diff
@@ -166,9 +197,10 @@ class Index:
 
             local_provider = RedisProvider(redis)
         elif kwargs:
+            from redis import Redis as _Redis
             from atdata.providers._redis import RedisProvider
 
-            local_provider = RedisProvider(
+            local_provider = RedisProvider(_Redis(**kwargs))
         else:
             from atdata.providers._sqlite import SqliteProvider
 
```
```diff
@@ -471,6 +503,9 @@ class Index:
     ) -> LocalDatasetEntry:
         """Add a dataset to the local repository index.
 
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.
+
         Args:
             ds: The dataset to add to the index.
             name: Human-readable name for the dataset.
@@ -480,6 +515,13 @@ class Index:
         Returns:
             The created LocalDatasetEntry object.
         """
+        import warnings
+
+        warnings.warn(
+            "Index.add_entry() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self._insert_dataset_to_provider(
             ds,
             name=name,
```
```diff
@@ -551,17 +593,30 @@ class Index:
         This is the internal implementation shared by all local and named
         repository inserts.
         """
+        from atdata._logging import get_logger
+
+        log = get_logger()
         metadata = kwargs.get("metadata")
 
         if store is not None:
             prefix = kwargs.get("prefix", name)
             cache_local = kwargs.get("cache_local", False)
+            log.debug(
+                "_insert_dataset_to_provider: name=%s, store=%s",
+                name,
+                type(store).__name__,
+            )
 
             written_urls = store.write_shards(
                 ds,
                 prefix=prefix,
                 cache_local=cache_local,
             )
+            log.info(
+                "_insert_dataset_to_provider: %d shard(s) written for %s",
+                len(written_urls),
+                name,
+            )
 
             if schema_ref is None:
                 schema_ref = _schema_ref_from_type(ds.sample_type, version="1.0.0")
@@ -576,6 +631,7 @@ class Index:
                 metadata=entry_metadata,
             )
             provider.store_entry(entry)
+            log.debug("_insert_dataset_to_provider: entry stored for %s", name)
             return entry
 
         # No data store - just index the existing URL
@@ -594,6 +650,7 @@ class Index:
             metadata=entry_metadata,
         )
         provider.store_entry(entry)
+        log.debug("_insert_dataset_to_provider: entry stored for %s", name)
        return entry
 
     def insert_dataset(
```
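The new logging calls are purely diagnostic. Assuming `atdata._logging` routes through the stdlib `logging` module (the diff only shows `get_logger()` plus `%`-style `debug`/`info` calls), the messages can be surfaced the usual way:

```python
import logging

# Assumption: atdata's logger is a stdlib logging.Logger, so standard
# root configuration surfaces the new diagnostics.
logging.basicConfig(level=logging.DEBUG)

# Expected shape of the three new log lines during an insert:
#   _insert_dataset_to_provider: name=mnist, store=LocalDiskStore
#   _insert_dataset_to_provider: 2 shard(s) written for mnist
#   _insert_dataset_to_provider: entry stored for mnist
```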
```diff
@@ -602,56 +659,170 @@ class Index:
         *,
         name: str,
         schema_ref: str | None = None,
+        description: str | None = None,
+        tags: list[str] | None = None,
+        license: str | None = None,
+        data_store: AbstractDataStore | None = None,
+        force: bool = False,
+        copy: bool = False,
+        metadata: dict | None = None,
+        _data_urls: list[str] | None = None,
+        _blob_refs: list[dict] | None = None,
         **kwargs,
     ) -> "IndexEntry":
-        """Insert a dataset into the index
+        """Insert a dataset into the index.
 
         The target repository is determined by a prefix in the ``name``
         argument (e.g. ``"lab/mnist"``). If no prefix is given, or the
         prefix is ``"local"``, the built-in local repository is used.
 
-
-
-
+        For atmosphere targets:
+
+        - **Local sources** are uploaded via *data_store* (defaults to
+          ``PDSBlobStore``).
+        - **Public remote sources** (http/https) are referenced as
+          external URLs unless *copy* is ``True``.
+        - **Credentialed sources** (e.g. ``S3Source``) raise an error
+          unless *copy* is ``True`` or *data_store* is provided, to
+          prevent leaking private endpoints.
 
         Args:
             ds: The Dataset to register.
             name: Human-readable name for the dataset, optionally prefixed
                 with a repository name (e.g. ``"lab/mnist"``).
             schema_ref: Optional schema reference.
-
-
-
-
+            description: Optional dataset description (atmosphere only).
+            tags: Optional tags for discovery (atmosphere only).
+            license: Optional license identifier (atmosphere only).
+            data_store: Explicit data store for shard storage. When
+                provided, data is always copied through this store.
+            force: If True, bypass PDS size limits (50 MB per shard,
+                1 GB total). Default: ``False``.
+            copy: If True, copy data to the destination store even for
+                remote sources. Required for credentialed sources
+                targeting the atmosphere. Default: ``False``.
+            metadata: Optional metadata dict.
 
         Returns:
             IndexEntry for the inserted dataset.
+
+        Raises:
+            ValueError: If atmosphere limits are exceeded (when
+                *force* is ``False``), or if a credentialed source
+                targets the atmosphere without *copy*.
         """
+        from atdata.atmosphere.store import PDS_TOTAL_DATASET_LIMIT_BYTES
+
         backend_key, resolved_name, handle_or_did = self._resolve_prefix(name)
+        is_atmosphere = backend_key == "_atmosphere"
 
-        if
+        if is_atmosphere:
             atmo = self._get_atmosphere()
             if atmo is None:
                 raise ValueError(
                     f"Atmosphere backend required for name {name!r} but not available."
                 )
+
+            # Providing an explicit data_store implies copy behaviour
+            needs_copy = copy or data_store is not None
+
+            # Credentialed source guard
+            if _is_credentialed_source(ds) and not needs_copy:
+                raise ValueError(
+                    "Dataset uses a credentialed source. Referencing "
+                    "these URLs in a public atmosphere record would "
+                    "leak private endpoints. Pass copy=True to copy "
+                    "data to the destination store (default: PDS blobs)."
+                )
+
+            # If we already have pre-written URLs (from write_samples),
+            # go straight to publish.
+            if _data_urls is not None:
+                return atmo.insert_dataset(
+                    ds,
+                    name=resolved_name,
+                    schema_ref=schema_ref,
+                    data_urls=_data_urls,
+                    blob_refs=_blob_refs,
+                    description=description,
+                    tags=tags,
+                    license=license,
+                    metadata=metadata,
+                    **kwargs,
+                )
+
+            # Determine whether data must be copied
+            source_is_local = _is_local_path(ds.url)
+
+            if source_is_local or needs_copy:
+                # Resolve effective store
+                if data_store is not None:
+                    effective_store = data_store
+                else:
+                    from atdata.atmosphere.store import PDSBlobStore
+
+                    effective_store = PDSBlobStore(atmo.client)
+
+                # Size guard
+                if not force:
+                    total_bytes = _estimate_dataset_bytes(ds)
+                    if total_bytes > PDS_TOTAL_DATASET_LIMIT_BYTES:
+                        raise ValueError(
+                            f"Total dataset size ({total_bytes} bytes) "
+                            f"exceeds atmosphere limit "
+                            f"({PDS_TOTAL_DATASET_LIMIT_BYTES} bytes). "
+                            f"Pass force=True to bypass."
+                        )
+
+                result = effective_store.write_shards(ds, prefix=resolved_name)
+
+                # ShardUploadResult carries blob_refs; plain list does not
+                blob_refs = getattr(result, "blob_refs", None) or None
+
+                return atmo.insert_dataset(
+                    ds,
+                    name=resolved_name,
+                    schema_ref=schema_ref,
+                    data_urls=list(result),
+                    blob_refs=blob_refs,
+                    description=description,
+                    tags=tags,
+                    license=license,
+                    metadata=metadata,
+                    **kwargs,
+                )
+
+            # Public remote source — reference existing URLs
+            data_urls = ds.list_shards()
             return atmo.insert_dataset(
-                ds,
+                ds,
+                name=resolved_name,
+                schema_ref=schema_ref,
+                data_urls=data_urls,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=metadata,
+                **kwargs,
             )
 
+        # --- Local / named repo path ---
         repo = self._repos.get(backend_key)
         if repo is None:
             raise KeyError(f"Unknown repository {backend_key!r} in name {name!r}")
+
+        effective_store = data_store or repo.data_store
         return self._insert_dataset_to_provider(
             ds,
             name=resolved_name,
             schema_ref=schema_ref,
             provider=repo.provider,
-            store=
+            store=effective_store,
+            metadata=metadata,
             **kwargs,
         )
 
-    def
+    def write_samples(
         self,
         samples: Iterable,
         *,
```
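Taken together, the new keyword arguments give `insert_dataset` a single entry point for all three atmosphere branches described in its docstring. A usage sketch (the handle and dataset variables are placeholders):

```python
# Local shards: copied up through the default PDSBlobStore.
index.insert_dataset(local_ds, name="@alice.example/mnist")

# Public http(s) shards: referenced in place unless copy=True.
index.insert_dataset(public_ds, name="@alice.example/laion-subset")

# Credentialed S3Source: refused without copy=True, so the private
# endpoint never appears in a public record.
index.insert_dataset(private_ds, name="@alice.example/internal", copy=True)

# Oversized dataset: explicitly bypass the 1 GB total limit.
index.insert_dataset(big_ds, name="@alice.example/big", force=True)
```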
```diff
@@ -664,6 +835,8 @@ class Index:
         maxsize: int | None = None,
         metadata: dict | None = None,
         manifest: bool = False,
+        data_store: AbstractDataStore | None = None,
+        force: bool = False,
     ) -> "IndexEntry":
         """Write samples and create an index entry in one step.
 
@@ -677,17 +850,14 @@ class Index:
         - ``"@handle/name"``: writes and publishes to the atmosphere.
         - ``"repo/name"``: writes to a named repository.
 
+        For atmosphere targets, data is uploaded as PDS blobs by default.
+        Shard size is capped at 50 MB and total dataset size at 1 GB
+        unless *force* is ``True``.
+
         When the local backend has no ``data_store`` configured, a
         ``LocalDiskStore`` is created automatically at
         ``~/.atdata/data/`` so that samples have persistent storage.
 
-        .. note::
-
-            This method is synchronous. Samples are written to a temporary
-            location first, then copied to permanent storage by the backend.
-            Avoid passing lazily-evaluated iterators that depend on external
-            state that may change during the call.
-
         Args:
             samples: Iterable of ``Packable`` samples. Must be non-empty.
             name: Dataset name, optionally prefixed with target.
@@ -696,71 +866,171 @@ class Index:
             tags: Optional tags for discovery (atmosphere only).
             license: Optional license identifier (atmosphere only).
             maxcount: Max samples per shard. Default: 10,000.
-            maxsize: Max bytes per shard.
+            maxsize: Max bytes per shard. For atmosphere targets defaults
+                to 50 MB (PDS blob limit). For local targets defaults to
+                ``None`` (unlimited).
             metadata: Optional metadata dict stored with the entry.
             manifest: If True, write per-shard manifest sidecar files
                 alongside each tar. Default: ``False``.
+            data_store: Explicit data store for shard storage. Overrides
+                the repository's default store. For atmosphere targets
+                defaults to ``PDSBlobStore``.
+            force: If True, bypass PDS size limits (50 MB per shard,
+                1 GB total dataset). Default: ``False``.
 
         Returns:
             IndexEntry for the created dataset.
 
         Raises:
-            ValueError: If *samples* is empty
+            ValueError: If *samples* is empty, or if atmosphere size
+                limits are exceeded (when *force* is ``False``).
 
         Examples:
             >>> index = Index()
             >>> samples = [MySample(key="0", text="hello")]
-            >>> entry = index.
+            >>> entry = index.write_samples(samples, name="my-dataset")
         """
         import tempfile
 
-        from atdata.dataset import write_samples
+        from atdata.dataset import write_samples as _write_samples
+        from atdata.atmosphere.store import (
+            PDS_BLOB_LIMIT_BYTES,
+            PDS_TOTAL_DATASET_LIMIT_BYTES,
+        )
+        from atdata._logging import log_operation
 
         backend_key, resolved_name, _ = self._resolve_prefix(name)
+        is_atmosphere = backend_key == "_atmosphere"
+
+        with log_operation("Index.write_samples", name=name):
+            # --- Atmosphere size guards ---
+            if is_atmosphere and not force:
+                if maxsize is not None and maxsize > PDS_BLOB_LIMIT_BYTES:
+                    raise ValueError(
+                        f"maxsize={maxsize} exceeds PDS blob limit "
+                        f"({PDS_BLOB_LIMIT_BYTES} bytes). "
+                        f"Pass force=True to bypass."
+                    )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Default maxsize for atmosphere targets
+            effective_maxsize = maxsize
+            if is_atmosphere and effective_maxsize is None:
+                effective_maxsize = PDS_BLOB_LIMIT_BYTES
+
+            # Resolve the effective data store
+            if is_atmosphere:
+                atmo = self._get_atmosphere()
+                if atmo is None:
+                    raise ValueError(
+                        f"Atmosphere backend required for name {name!r} but not available."
+                    )
+                if data_store is None:
+                    from atdata.atmosphere.store import PDSBlobStore
+
+                    effective_store: AbstractDataStore | None = PDSBlobStore(
+                        atmo.client
+                    )
+                else:
+                    effective_store = data_store
+            else:
+                repo = self._repos.get(backend_key)
+                effective_store = data_store or (
+                    repo.data_store if repo is not None else None
+                )
+                needs_auto_store = repo is not None and effective_store is None
+                if needs_auto_store:
+                    from atdata.stores._disk import LocalDiskStore
+
+                    effective_store = LocalDiskStore()
+
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_path = Path(tmp_dir) / "data.tar"
+                ds = _write_samples(
+                    samples,
+                    tmp_path,
+                    maxcount=maxcount,
+                    maxsize=effective_maxsize,
+                    manifest=manifest,
+                )
+
+                # Atmosphere total-size guard (after writing so we can measure)
+                if is_atmosphere and not force:
+                    total_bytes = _estimate_dataset_bytes(ds)
+                    if total_bytes > PDS_TOTAL_DATASET_LIMIT_BYTES:
+                        raise ValueError(
+                            f"Total dataset size ({total_bytes} bytes) exceeds "
+                            f"atmosphere limit ({PDS_TOTAL_DATASET_LIMIT_BYTES} "
+                            f"bytes). Pass force=True to bypass."
+                        )
+
+                if is_atmosphere:
+                    # Write shards through the store, then publish record
+                    # with the resulting URLs (not the temp paths).
+                    written_urls = effective_store.write_shards(
+                        ds, prefix=resolved_name
+                    )
+
+                    # If write_shards returned blob refs (e.g. ShardUploadResult),
+                    # use storageBlobs so the PDS retains the uploaded blobs.
+                    # Fall back to storageExternal with AT URIs otherwise.
+                    blob_refs = getattr(written_urls, "blob_refs", None) or None
+
+                    return self.insert_dataset(
+                        ds,
+                        name=name,
+                        schema_ref=schema_ref,
+                        metadata=metadata,
+                        description=description,
+                        tags=tags,
+                        license=license,
+                        data_store=data_store,
+                        force=force,
+                        _data_urls=written_urls,
+                        _blob_refs=blob_refs,
+                    )
 
-
-
-
-
-
+                # Local / named repo path
+                repo = self._repos.get(backend_key)
+                if repo is not None and effective_store is not None:
+                    return self._insert_dataset_to_provider(
+                        ds,
+                        name=resolved_name,
+                        schema_ref=schema_ref,
+                        provider=repo.provider,
+                        store=effective_store,
+                        metadata=metadata,
+                    )
+
+                return self.insert_dataset(
                    ds,
-                    name=
+                    name=name,
                    schema_ref=schema_ref,
-                    provider=repo.provider,
-                    store=effective_store,
                    metadata=metadata,
+                    description=description,
+                    tags=tags,
+                    license=license,
                )
 
-
-
-
-
-
-
-
-
-
+    def write(
+        self,
+        samples: Iterable,
+        *,
+        name: str,
+        **kwargs: Any,
+    ) -> "IndexEntry":
+        """Write samples and create an index entry.
+
+        .. deprecated::
+            Use :meth:`write_samples` instead.
+        """
+        import warnings
+
+        warnings.warn(
+            "Index.write() is deprecated, use Index.write_samples()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.write_samples(samples, name=name, **kwargs)
 
     def get_dataset(self, ref: str) -> "IndexEntry":
         """Get a dataset entry by name or prefixed reference.
```
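End to end, `write_samples` now shards to a temporary directory, enforces the PDS limits, uploads through the resolved store, and re-enters `insert_dataset` with the written URLs. A sketch, where `MySample` is the placeholder `Packable` type from the docstring examples and an existing `index` is assumed:

```python
samples = [MySample(key=str(i), text=f"sample {i}") for i in range(1_000)]

# Atmosphere target: maxsize defaults to the 50 MB PDS blob limit,
# shards are uploaded as blobs, then published via insert_dataset.
entry = index.write_samples(samples, name="@alice.example/tiny-text")

# Local target: no size cap; a LocalDiskStore under ~/.atdata/data/
# is created automatically when the repo has no data_store configured.
entry = index.write_samples(samples, name="tiny-text", manifest=True)
```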
```diff
@@ -1071,9 +1341,8 @@ class Index:
     ) -> str:
         """Promote a locally-indexed dataset to the atmosphere.
 
-
-
-        via the index's atmosphere backend.
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.
 
         Args:
             entry_name: Name of the local dataset entry to promote.
@@ -1095,40 +1364,49 @@ class Index:
             >>> index = Index(atmosphere=client)
             >>> uri = index.promote_entry("mnist-train")
         """
+        import warnings
+
+        warnings.warn(
+            "Index.promote_entry() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         from atdata.promote import _find_or_publish_schema
         from atdata.atmosphere import DatasetPublisher
         from atdata._schema_codec import schema_to_type
+        from atdata._logging import log_operation
 
         atmo = self._get_atmosphere()
         if atmo is None:
             raise ValueError("Atmosphere backend required but not available.")
 
-
-
-
+        with log_operation("Index.promote_entry", entry_name=entry_name):
+            entry = self.get_entry_by_name(entry_name)
+            if not entry.data_urls:
+                raise ValueError(f"Local entry {entry_name!r} has no data URLs")
 
-
-
-
+            schema_record = self.get_schema(entry.schema_ref)
+            sample_type = schema_to_type(schema_record)
+            schema_version = schema_record.get("version", "1.0.0")
 
-
-
-
-
-
-
+            atmosphere_schema_uri = _find_or_publish_schema(
+                sample_type,
+                schema_version,
+                atmo.client,
+                description=schema_record.get("description"),
+            )
 
-
-
-
-
-
-
-
-
-
-
-
+            publisher = DatasetPublisher(atmo.client)
+            uri = publisher.publish_with_urls(
+                urls=entry.data_urls,
+                schema_uri=atmosphere_schema_uri,
+                name=name or entry.name,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=entry.metadata,
+            )
+            return str(uri)
 
     def promote_dataset(
         self,
```
```diff
@@ -1143,8 +1421,8 @@ class Index:
     ) -> str:
         """Publish a Dataset directly to the atmosphere.
 
-
-
+        .. deprecated::
+            Use :meth:`insert_dataset` instead.
 
         Args:
             dataset: The Dataset to publish.
@@ -1167,32 +1445,41 @@ class Index:
             >>> ds = atdata.load_dataset("./data.tar", MySample, split="train")
             >>> uri = index.promote_dataset(ds, name="my-dataset")
         """
+        import warnings
+
+        warnings.warn(
+            "Index.promote_dataset() is deprecated, use Index.insert_dataset()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         from atdata.promote import _find_or_publish_schema
         from atdata.atmosphere import DatasetPublisher
+        from atdata._logging import log_operation
 
         atmo = self._get_atmosphere()
         if atmo is None:
             raise ValueError("Atmosphere backend required but not available.")
 
-
+        with log_operation("Index.promote_dataset", name=name):
+            st = sample_type or dataset.sample_type
 
-
-
-
-
-
-
+            atmosphere_schema_uri = _find_or_publish_schema(
+                st,
+                schema_version,
+                atmo.client,
+                description=description,
+            )
 
-
+            data_urls = dataset.list_shards()
 
-
-
-
-
-
-
-
-
-
-
-
+            publisher = DatasetPublisher(atmo.client)
+            uri = publisher.publish_with_urls(
+                urls=data_urls,
+                schema_uri=atmosphere_schema_uri,
+                name=name,
+                description=description,
+                tags=tags,
+                license=license,
+                metadata=dataset._metadata,
+            )
+            return str(uri)
```