atdata 0.3.0b1-py3-none-any.whl → 0.3.2b1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +11 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +46 -1
- atdata/_logging.py +43 -0
- atdata/_protocols.py +81 -182
- atdata/_schema_codec.py +2 -2
- atdata/_sources.py +24 -4
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +60 -21
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +73 -245
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +60 -53
- atdata/atmosphere/records.py +291 -100
- atdata/atmosphere/schema.py +91 -65
- atdata/atmosphere/store.py +68 -66
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +266 -47
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_entry.py +6 -2
- atdata/{local → index}/_index.py +617 -72
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +127 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
- atdata/lexicons/ac.foundation.dataset.record.json +117 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/promote.py +14 -10
- atdata/repository.py +66 -16
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +131 -0
- atdata/{local → stores}/_s3.py +134 -112
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
- atdata-0.3.2b1.dist-info/RECORD +71 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/atmosphere/records.py
CHANGED
@@ -8,13 +8,18 @@ and loading them back. Dataset records are published as
 from typing import Type, TypeVar, Optional
 import msgpack
 
-from .client import
+from .client import Atmosphere
 from .schema import SchemaPublisher
-from ._types import
-
-
-
-
+from ._types import AtUri, LEXICON_NAMESPACE
+from ._lexicon_types import (
+    LexDatasetRecord,
+    StorageHttp,
+    StorageS3,
+    StorageBlobs,
+    HttpShardEntry,
+    S3ShardEntry,
+    BlobEntry,
+    ShardChecksum,
 )
 
 # Import for type checking only to avoid circular imports
@@ -27,19 +32,23 @@ if TYPE_CHECKING:
 ST = TypeVar("ST", bound="Packable")
 
 
+def _placeholder_checksum() -> ShardChecksum:
+    """Return an empty checksum placeholder for shards without pre-computed digests."""
+    return ShardChecksum(algorithm="none", digest="")
+
+
 class DatasetPublisher:
     """Publishes dataset index records to ATProto.
 
     This class creates dataset records that reference a schema and point to
-
+    HTTP storage, S3 storage, or ATProto blobs.
 
     Examples:
-        >>> dataset = atdata.Dataset[MySample]("
+        >>> dataset = atdata.Dataset[MySample]("https://example.com/data-000000.tar")
         >>>
-        >>>
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
        >>>
-        >>> publisher = DatasetPublisher(
+        >>> publisher = DatasetPublisher(atmo)
         >>> uri = publisher.publish(
         ...     dataset,
         ...     name="My Training Data",
@@ -48,15 +57,49 @@ class DatasetPublisher:
         ... )
     """
 
-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the dataset publisher.
 
         Args:
-            client: Authenticated
+            client: Authenticated Atmosphere instance.
         """
         self.client = client
         self._schema_publisher = SchemaPublisher(client)
 
+    def _create_record(
+        self,
+        storage: "StorageHttp | StorageS3 | StorageBlobs",
+        *,
+        name: str,
+        schema_uri: str,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Build a LexDatasetRecord and publish it to ATProto."""
+        metadata_bytes: Optional[bytes] = None
+        if metadata is not None:
+            metadata_bytes = msgpack.packb(metadata)
+
+        dataset_record = LexDatasetRecord(
+            name=name,
+            schema_ref=schema_uri,
+            storage=storage,
+            description=description,
+            tags=tags or [],
+            license=license,
+            metadata=metadata_bytes,
+        )
+
+        return self.client.create_record(
+            collection=f"{LEXICON_NAMESPACE}.record",
+            record=dataset_record.to_record(),
+            rkey=rkey,
+            validate=False,
+        )
+
     def publish(
         self,
         dataset: "Dataset[ST]",
@@ -91,46 +134,34 @@ class DatasetPublisher:
         Raises:
             ValueError: If schema_uri is not provided and auto_publish_schema is False.
         """
-        # Ensure we have a schema reference
         if schema_uri is None:
             if not auto_publish_schema:
                 raise ValueError(
                     "schema_uri is required when auto_publish_schema=False"
                 )
-            # Auto-publish the schema
             schema_uri_obj = self._schema_publisher.publish(
                 dataset.sample_type,
                 version=schema_version,
             )
             schema_uri = str(schema_uri_obj)
 
-
-        storage =
-
-
+        shard_urls = dataset.list_shards()
+        storage = StorageHttp(
+            shards=[
+                HttpShardEntry(url=url, checksum=_placeholder_checksum())
+                for url in shard_urls
+            ]
         )
 
-
-
-        if dataset.metadata is not None:
-            metadata_bytes = msgpack.packb(dataset.metadata)
-
-        dataset_record = DatasetRecord(
+        return self._create_record(
+            storage,
             name=name,
-
-            storage=storage,
+            schema_uri=schema_uri,
             description=description,
-            tags=tags
+            tags=tags,
             license=license,
-            metadata=
-        )
-
-        # Publish to ATProto
-        return self.client.create_record(
-            collection=f"{LEXICON_NAMESPACE}.record",
-            record=dataset_record.to_record(),
+            metadata=dataset.metadata,
             rkey=rkey,
-            validate=False,
         )
 
     def publish_with_urls(
@@ -143,50 +174,162 @@ class DatasetPublisher:
         tags: Optional[list[str]] = None,
         license: Optional[str] = None,
         metadata: Optional[dict] = None,
+        checksums: Optional[list[ShardChecksum]] = None,
         rkey: Optional[str] = None,
     ) -> AtUri:
-        """Publish a dataset record with explicit URLs.
+        """Publish a dataset record with explicit HTTP URLs.
 
         This method allows publishing a dataset record without having a
         Dataset object, useful for registering existing WebDataset files.
+        Each URL should be an individual shard (no brace notation).
 
         Args:
-            urls: List of
+            urls: List of individual shard URLs.
             schema_uri: AT URI of the schema record.
             name: Human-readable dataset name.
             description: Human-readable description.
             tags: Searchable tags for discovery.
             license: SPDX license identifier.
             metadata: Arbitrary metadata dictionary.
+            checksums: Per-shard checksums. If not provided, empty checksums
+                are used.
             rkey: Optional explicit record key.
 
         Returns:
             The AT URI of the created dataset record.
         """
-
-
-
+        if checksums and len(checksums) != len(urls):
+            raise ValueError(
+                f"checksums length ({len(checksums)}) must match "
+                f"urls length ({len(urls)})"
+            )
+
+        shards = [
+            HttpShardEntry(
+                url=url,
+                checksum=checksums[i] if checksums else _placeholder_checksum(),
+            )
+            for i, url in enumerate(urls)
+        ]
+
+        return self._create_record(
+            StorageHttp(shards=shards),
+            name=name,
+            schema_uri=schema_uri,
+            description=description,
+            tags=tags,
+            license=license,
+            metadata=metadata,
+            rkey=rkey,
         )
 
-
-
-
+    def publish_with_s3(
+        self,
+        bucket: str,
+        keys: list[str],
+        schema_uri: str,
+        *,
+        name: str,
+        region: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        checksums: Optional[list[ShardChecksum]] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Publish a dataset record with S3 storage.
 
-
+        Args:
+            bucket: S3 bucket name.
+            keys: List of S3 object keys for shard files.
+            schema_uri: AT URI of the schema record.
+            name: Human-readable dataset name.
+            region: AWS region (e.g., 'us-east-1').
+            endpoint: Custom S3-compatible endpoint URL.
+            description: Human-readable description.
+            tags: Searchable tags for discovery.
+            license: SPDX license identifier.
+            metadata: Arbitrary metadata dictionary.
+            checksums: Per-shard checksums.
+            rkey: Optional explicit record key.
+
+        Returns:
+            The AT URI of the created dataset record.
+        """
+        if checksums and len(checksums) != len(keys):
+            raise ValueError(
+                f"checksums length ({len(checksums)}) must match "
+                f"keys length ({len(keys)})"
+            )
+
+        shards = [
+            S3ShardEntry(
+                key=key,
+                checksum=checksums[i] if checksums else _placeholder_checksum(),
+            )
+            for i, key in enumerate(keys)
+        ]
+
+        return self._create_record(
+            StorageS3(bucket=bucket, shards=shards, region=region, endpoint=endpoint),
             name=name,
-
-            storage=storage,
+            schema_uri=schema_uri,
             description=description,
-            tags=tags
+            tags=tags,
             license=license,
-            metadata=
+            metadata=metadata,
+            rkey=rkey,
         )
 
-
-
-
+    def publish_with_blob_refs(
+        self,
+        blob_refs: list[dict],
+        schema_uri: str,
+        *,
+        name: str,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Publish a dataset record with pre-uploaded blob references.
+
+        Unlike ``publish_with_blobs`` (which takes raw bytes and uploads them),
+        this method accepts blob ref dicts that have already been uploaded to
+        the PDS. The refs are embedded directly in the record so the PDS
+        retains the blobs.
+
+        Args:
+            blob_refs: List of blob reference dicts as returned by
+                ``Atmosphere.upload_blob()``. Each dict must contain
+                ``$type``, ``ref`` (with ``$link``), ``mimeType``, and ``size``.
+            schema_uri: AT URI of the schema record.
+            name: Human-readable dataset name.
+            description: Human-readable description.
+            tags: Searchable tags for discovery.
+            license: SPDX license identifier.
+            metadata: Arbitrary metadata dictionary.
+            rkey: Optional explicit record key.
+
+        Returns:
+            The AT URI of the created dataset record.
+        """
+        blob_entries = [
+            BlobEntry(blob=ref, checksum=_placeholder_checksum()) for ref in blob_refs
+        ]
+
+        return self._create_record(
+            StorageBlobs(blobs=blob_entries),
+            name=name,
+            schema_uri=schema_uri,
+            description=description,
+            tags=tags,
+            license=license,
+            metadata=metadata,
             rkey=rkey,
-            validate=False,
         )
 
     def publish_with_blobs(
@@ -226,37 +369,28 @@ class DatasetPublisher:
         Blobs are only retained by the PDS when referenced in a committed
         record. This method handles that automatically.
         """
-
-        blob_refs = []
+        blob_entries = []
         for blob_data in blobs:
             blob_ref = self.client.upload_blob(blob_data, mime_type=mime_type)
-
-
-            # Create storage location with blob references
-        storage = StorageLocation(
-            kind="blobs",
-            blob_refs=blob_refs,
-        )
+            import hashlib
 
-
-
-
+            digest = hashlib.sha256(blob_data).hexdigest()
+            blob_entries.append(
+                BlobEntry(
+                    blob=blob_ref,
+                    checksum=ShardChecksum(algorithm="sha256", digest=digest),
+                )
+            )
 
-
+        return self._create_record(
+            StorageBlobs(blobs=blob_entries),
             name=name,
-
-            storage=storage,
+            schema_uri=schema_uri,
             description=description,
-            tags=tags
+            tags=tags,
             license=license,
-            metadata=
-        )
-
-        return self.client.create_record(
-            collection=f"{LEXICON_NAMESPACE}.record",
-            record=dataset_record.to_record(),
+            metadata=metadata,
             rkey=rkey,
-            validate=False,
         )
 
 
@@ -268,8 +402,8 @@ class DatasetLoader:
     Python class for the sample type.
 
     Examples:
-        >>>
-        >>> loader = DatasetLoader(
+        >>> atmo = Atmosphere.login("handle", "password")
+        >>> loader = DatasetLoader(atmo)
         >>>
         >>> # List available datasets
         >>> datasets = loader.list()
@@ -280,11 +414,11 @@ class DatasetLoader:
        >>> record = loader.get("at://did:plc:abc/ac.foundation.dataset.record/xyz")
     """
 
-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the dataset loader.
 
         Args:
-            client:
+            client: Atmosphere instance.
         """
         self.client = client
 
@@ -311,6 +445,18 @@ class DatasetLoader:
 
         return record
 
+    def get_typed(self, uri: str | AtUri) -> LexDatasetRecord:
+        """Fetch a dataset record and return as a typed object.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            LexDatasetRecord instance.
+        """
+        record = self.get(uri)
+        return LexDatasetRecord.from_record(record)
+
     def list_all(
         self,
         repo: Optional[str] = None,
@@ -334,7 +480,7 @@ class DatasetLoader:
             uri: The AT URI of the dataset record.
 
         Returns:
-
+            One of "http", "s3", "blobs", or "external" (legacy).
 
         Raises:
             ValueError: If storage type is unknown.
@@ -343,16 +489,22 @@ class DatasetLoader:
         storage = record.get("storage", {})
         storage_type = storage.get("$type", "")
 
-        if "
-            return "
+        if "storageHttp" in storage_type:
+            return "http"
+        elif "storageS3" in storage_type:
+            return "s3"
         elif "storageBlobs" in storage_type:
             return "blobs"
+        elif "storageExternal" in storage_type:
+            return "external"
         else:
             raise ValueError(f"Unknown storage type: {storage_type}")
 
     def get_urls(self, uri: str | AtUri) -> list[str]:
         """Get the WebDataset URLs from a dataset record.
 
+        Supports storageHttp, storageS3, and legacy storageExternal formats.
+
         Args:
             uri: The AT URI of the dataset record.
 
@@ -360,22 +512,61 @@ class DatasetLoader:
             List of WebDataset URLs.
 
         Raises:
-            ValueError: If the storage type is
+            ValueError: If the storage type is blob-only.
         """
         record = self.get(uri)
         storage = record.get("storage", {})
-
         storage_type = storage.get("$type", "")
-
+
+        if "storageHttp" in storage_type:
+            return [s["url"] for s in storage.get("shards", [])]
+        elif "storageS3" in storage_type:
+            bucket = storage.get("bucket", "")
+            endpoint = storage.get("endpoint")
+            urls = []
+            for s in storage.get("shards", []):
+                if endpoint:
+                    urls.append(f"{endpoint.rstrip('/')}/{bucket}/{s['key']}")
+                else:
+                    urls.append(f"s3://{bucket}/{s['key']}")
+            return urls
+        elif "storageExternal" in storage_type:
             return storage.get("urls", [])
         elif "storageBlobs" in storage_type:
             raise ValueError(
-                "Dataset uses blob storage, not
-                "Use get_blob_urls() instead."
+                "Dataset uses blob storage, not URLs. Use get_blob_urls() instead."
             )
         else:
             raise ValueError(f"Unknown storage type: {storage_type}")
 
+    def get_s3_info(self, uri: str | AtUri) -> dict:
+        """Get S3 storage details from a dataset record.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            Dict with keys: bucket, keys, region (optional), endpoint (optional).
+
+        Raises:
+            ValueError: If the storage type is not S3.
+        """
+        record = self.get(uri)
+        storage = record.get("storage", {})
+        storage_type = storage.get("$type", "")
+
+        if "storageS3" not in storage_type:
+            raise ValueError(
+                f"Dataset does not use S3 storage. Storage type: {storage_type}"
+            )
+
+        return {
+            "bucket": storage.get("bucket", ""),
+            "keys": [s["key"] for s in storage.get("shards", [])],
+            "region": storage.get("region"),
+            "endpoint": storage.get("endpoint"),
+        }
+
     def get_blobs(self, uri: str | AtUri) -> list[dict]:
         """Get the blob references from a dataset record.
 
@@ -383,7 +574,7 @@ class DatasetLoader:
             uri: The AT URI of the dataset record.
 
         Returns:
-            List of blob
+            List of blob entry dicts.
 
         Raises:
             ValueError: If the storage type is not blobs.
@@ -394,12 +585,11 @@ class DatasetLoader:
         storage_type = storage.get("$type", "")
         if "storageBlobs" in storage_type:
             return storage.get("blobs", [])
-
+        else:
             raise ValueError(
-                "Dataset
+                f"Dataset does not use blob storage. Storage type: {storage_type}. "
+                "Use get_urls() instead."
             )
-        else:
-            raise ValueError(f"Unknown storage type: {storage_type}")
 
     def get_blob_urls(self, uri: str | AtUri) -> list[str]:
         """Get fetchable URLs for blob-stored dataset shards.
@@ -421,12 +611,13 @@ class DatasetLoader:
         else:
             parsed_uri = uri
 
-
+        blob_entries = self.get_blobs(uri)
         did = parsed_uri.authority
 
         urls = []
-        for
-            #
+        for entry in blob_entries:
+            # Handle both new blobEntry format and legacy bare blob format
+            blob = entry.get("blob", entry)
             ref = blob.get("ref", {})
             cid = ref.get("$link") if isinstance(ref, dict) else str(ref)
             if cid:
@@ -463,7 +654,7 @@ class DatasetLoader:
         You must provide the sample type class, which should match the
         schema referenced by the record.
 
-        Supports
+        Supports HTTP, S3, blob, and legacy external storage.
 
         Args:
             uri: The AT URI of the dataset record.
@@ -486,10 +677,10 @@ class DatasetLoader:
 
         storage_type = self.get_storage_type(uri)
 
-        if storage_type == "
-            urls = self.get_urls(uri)
-        else:
+        if storage_type == "blobs":
             urls = self.get_blob_urls(uri)
+        else:
+            urls = self.get_urls(uri)
 
         if not urls:
             raise ValueError("Dataset record has no storage URLs")