atdata 0.2.0a1__py3-none-any.whl → 0.2.3b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +43 -10
- atdata/_cid.py +144 -0
- atdata/_helpers.py +7 -5
- atdata/_hf_api.py +690 -0
- atdata/_protocols.py +504 -0
- atdata/_schema_codec.py +438 -0
- atdata/_sources.py +508 -0
- atdata/_stub_manager.py +534 -0
- atdata/_type_utils.py +104 -0
- atdata/atmosphere/__init__.py +269 -1
- atdata/atmosphere/_types.py +4 -2
- atdata/atmosphere/client.py +146 -3
- atdata/atmosphere/lens.py +4 -3
- atdata/atmosphere/records.py +168 -7
- atdata/atmosphere/schema.py +29 -82
- atdata/atmosphere/store.py +204 -0
- atdata/cli/__init__.py +222 -0
- atdata/cli/diagnose.py +169 -0
- atdata/cli/local.py +283 -0
- atdata/dataset.py +615 -257
- atdata/lens.py +53 -54
- atdata/local.py +1456 -228
- atdata/promote.py +195 -0
- {atdata-0.2.0a1.dist-info → atdata-0.2.3b1.dist-info}/METADATA +106 -14
- atdata-0.2.3b1.dist-info/RECORD +28 -0
- atdata-0.2.0a1.dist-info/RECORD +0 -16
- {atdata-0.2.0a1.dist-info → atdata-0.2.3b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.0a1.dist-info → atdata-0.2.3b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.0a1.dist-info → atdata-0.2.3b1.dist-info}/licenses/LICENSE +0 -0
atdata/atmosphere/records.py
CHANGED
@@ -19,6 +19,7 @@ from ._types import (
 
 # Import for type checking only to avoid circular imports
 from typing import TYPE_CHECKING
+
 if TYPE_CHECKING:
     from ..dataset import PackableSample, Dataset
 
@@ -31,7 +32,7 @@ class DatasetPublisher:
     This class creates dataset records that reference a schema and point to
     external storage (WebDataset URLs) or ATProto blobs.
 
-
+    Examples:
         >>> dataset = atdata.Dataset[MySample]("s3://bucket/data-{000000..000009}.tar")
         >>>
         >>> client = AtmosphereClient()
@@ -187,6 +188,76 @@ class DatasetPublisher:
             validate=False,
         )
 
+    def publish_with_blobs(
+        self,
+        blobs: list[bytes],
+        schema_uri: str,
+        *,
+        name: str,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        mime_type: str = "application/x-tar",
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Publish a dataset with data stored as ATProto blobs.
+
+        This method uploads the provided data as blobs to the PDS and creates
+        a dataset record referencing them. Suitable for smaller datasets that
+        fit within blob size limits (typically 50MB per blob, configurable).
+
+        Args:
+            blobs: List of binary data (e.g., tar shards) to upload as blobs.
+            schema_uri: AT URI of the schema record.
+            name: Human-readable dataset name.
+            description: Human-readable description.
+            tags: Searchable tags for discovery.
+            license: SPDX license identifier.
+            metadata: Arbitrary metadata dictionary.
+            mime_type: MIME type for the blobs (default: application/x-tar).
+            rkey: Optional explicit record key.
+
+        Returns:
+            The AT URI of the created dataset record.
+
+        Note:
+            Blobs are only retained by the PDS when referenced in a committed
+            record. This method handles that automatically.
+        """
+        # Upload all blobs
+        blob_refs = []
+        for blob_data in blobs:
+            blob_ref = self.client.upload_blob(blob_data, mime_type=mime_type)
+            blob_refs.append(blob_ref)
+
+        # Create storage location with blob references
+        storage = StorageLocation(
+            kind="blobs",
+            blob_refs=blob_refs,
+        )
+
+        metadata_bytes: Optional[bytes] = None
+        if metadata is not None:
+            metadata_bytes = msgpack.packb(metadata)
+
+        dataset_record = DatasetRecord(
+            name=name,
+            schema_ref=schema_uri,
+            storage=storage,
+            description=description,
+            tags=tags or [],
+            license=license,
+            metadata=metadata_bytes,
+        )
+
+        return self.client.create_record(
+            collection=f"{LEXICON_NAMESPACE}.record",
+            record=dataset_record.to_record(),
+            rkey=rkey,
+            validate=False,
+        )
+
 
 class DatasetLoader:
     """Loads dataset records from ATProto.
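The publish_with_blobs method added here uploads each shard as a PDS blob and then commits a record that references them, so the blobs are retained. A minimal sketch of calling it, assuming DatasetPublisher is constructed with an authenticated client (mirroring the DatasetLoader(client) pattern shown below) and that a schema record already exists; the handle, shard bytes, and schema URI are placeholders:

    from atdata.atmosphere import AtmosphereClient
    from atdata.atmosphere.records import DatasetPublisher

    client = AtmosphereClient()
    client.login("handle.bsky.social", "app-password")

    shards = [b"...tar shard bytes...", b"...tar shard bytes..."]  # placeholder data

    publisher = DatasetPublisher(client)  # assumed constructor, analogous to DatasetLoader(client)
    uri = publisher.publish_with_blobs(
        shards,
        schema_uri="at://...",  # AT URI of an existing schema record (placeholder)
        name="mnist-mini",
        description="Toy example dataset",
        tags=["vision", "demo"],
    )
    print(uri)  # AT URI of the new dataset record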
@@ -195,7 +266,7 @@ class DatasetLoader:
     from them. Note that loading a dataset requires having the corresponding
     Python class for the sample type.
 
-
+    Examples:
         >>> client = AtmosphereClient()
         >>> loader = DatasetLoader(client)
        >>>
@@ -255,6 +326,29 @@
         """
         return self.client.list_datasets(repo=repo, limit=limit)
 
+    def get_storage_type(self, uri: str | AtUri) -> str:
+        """Get the storage type of a dataset record.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            Either "external" or "blobs".
+
+        Raises:
+            ValueError: If storage type is unknown.
+        """
+        record = self.get(uri)
+        storage = record.get("storage", {})
+        storage_type = storage.get("$type", "")
+
+        if "storageExternal" in storage_type:
+            return "external"
+        elif "storageBlobs" in storage_type:
+            return "blobs"
+        else:
+            raise ValueError(f"Unknown storage type: {storage_type}")
+
     def get_urls(self, uri: str | AtUri) -> list[str]:
         """Get the WebDataset URLs from a dataset record.
 
@@ -276,11 +370,70 @@
         elif "storageBlobs" in storage_type:
             raise ValueError(
                 "Dataset uses blob storage, not external URLs. "
-                "Use
+                "Use get_blob_urls() instead."
+            )
+        else:
+            raise ValueError(f"Unknown storage type: {storage_type}")
+
+    def get_blobs(self, uri: str | AtUri) -> list[dict]:
+        """Get the blob references from a dataset record.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            List of blob reference dicts with keys: $type, ref, mimeType, size.
+
+        Raises:
+            ValueError: If the storage type is not blobs.
+        """
+        record = self.get(uri)
+        storage = record.get("storage", {})
+
+        storage_type = storage.get("$type", "")
+        if "storageBlobs" in storage_type:
+            return storage.get("blobs", [])
+        elif "storageExternal" in storage_type:
+            raise ValueError(
+                "Dataset uses external URL storage, not blobs. Use get_urls() instead."
             )
         else:
             raise ValueError(f"Unknown storage type: {storage_type}")
 
+    def get_blob_urls(self, uri: str | AtUri) -> list[str]:
+        """Get fetchable URLs for blob-stored dataset shards.
+
+        This resolves the PDS endpoint and constructs URLs that can be
+        used to fetch the blob data directly.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            List of URLs for fetching the blob data.
+
+        Raises:
+            ValueError: If storage type is not blobs or PDS cannot be resolved.
+        """
+        if isinstance(uri, str):
+            parsed_uri = AtUri.parse(uri)
+        else:
+            parsed_uri = uri
+
+        blobs = self.get_blobs(uri)
+        did = parsed_uri.authority
+
+        urls = []
+        for blob in blobs:
+            # Extract CID from blob reference
+            ref = blob.get("ref", {})
+            cid = ref.get("$link") if isinstance(ref, dict) else str(ref)
+            if cid:
+                url = self.client.get_blob_url(did, cid)
+                urls.append(url)
+
+        return urls
+
     def get_metadata(self, uri: str | AtUri) -> Optional[dict]:
         """Get the metadata from a dataset record.
 
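The three loader additions above give callers a way to branch on how a record stores its data before deciding how to fetch it. A short sketch of the intended call pattern, assuming client is an already-authenticated AtmosphereClient and uri points at a dataset record (both placeholders):

    from atdata.atmosphere.records import DatasetLoader

    loader = DatasetLoader(client)

    kind = loader.get_storage_type(uri)          # "external" or "blobs"
    if kind == "external":
        shard_urls = loader.get_urls(uri)        # WebDataset URLs (e.g. s3:// or https://)
    else:
        shard_urls = loader.get_blob_urls(uri)   # HTTP URLs served by the owning PDS
        blob_refs = loader.get_blobs(uri)        # raw blob references ($type, ref, mimeType, size)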
@@ -309,6 +462,8 @@ class DatasetLoader:
         You must provide the sample type class, which should match the
         schema referenced by the record.
 
+        Supports both external URL storage and ATProto blob storage.
+
         Args:
             uri: The AT URI of the dataset record.
             sample_type: The Python class for the sample type.
@@ -317,9 +472,9 @@
             A Dataset instance configured from the record.
 
         Raises:
-            ValueError: If
+            ValueError: If no storage URLs can be resolved.
 
-
+        Examples:
             >>> loader = DatasetLoader(client)
             >>> dataset = loader.to_dataset(uri, MySampleType)
             >>> for batch in dataset.shuffled(batch_size=32):
@@ -328,9 +483,15 @@
         # Import here to avoid circular import
         from ..dataset import Dataset
 
-
+        storage_type = self.get_storage_type(uri)
+
+        if storage_type == "external":
+            urls = self.get_urls(uri)
+        else:
+            urls = self.get_blob_urls(uri)
+
         if not urls:
-            raise ValueError("Dataset record has no URLs")
+            raise ValueError("Dataset record has no storage URLs")
 
         # Use the first URL (multi-URL support could be added later)
         url = urls[0]
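With the dispatch above, loading no longer depends on how the record was published. A minimal sketch, reusing the loader from the previous example; MySample stands in for whatever @atdata.packable class matches the record's schema:

    dataset = loader.to_dataset(uri, MySample)

    # Iteration is identical for external-URL and blob-backed records.
    for batch in dataset.shuffled(batch_size=32):
        ...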
atdata/atmosphere/schema.py
CHANGED
@@ -6,8 +6,7 @@ records.
 """
 
 from dataclasses import fields, is_dataclass
-from typing import Type, TypeVar, Optional,
-import types
+from typing import Type, TypeVar, Optional, get_type_hints, get_origin, get_args
 
 from .client import AtmosphereClient
 from ._types import (
@@ -17,9 +16,15 @@ from ._types import (
     FieldType,
     LEXICON_NAMESPACE,
 )
+from .._type_utils import (
+    unwrap_optional,
+    is_ndarray_type,
+    extract_ndarray_dtype,
+)
 
 # Import for type checking only to avoid circular imports
 from typing import TYPE_CHECKING
+
 if TYPE_CHECKING:
     from ..dataset import PackableSample
 
@@ -32,7 +37,7 @@ class SchemaPublisher:
     This class introspects a PackableSample class to extract its field
     definitions and publishes them as an ATProto schema record.
 
-
+    Examples:
         >>> @atdata.packable
         ... class MySample:
         ...     image: NDArray
@@ -83,7 +88,9 @@ class SchemaPublisher:
             TypeError: If a field type is not supported.
         """
         if not is_dataclass(sample_type):
-            raise ValueError(
+            raise ValueError(
+                f"{sample_type.__name__} must be a dataclass (use @packable)"
+            )
 
         # Build the schema record
         schema_record = self._build_schema_record(
@@ -130,71 +137,38 @@ class SchemaPublisher:
 
     def _field_to_def(self, name: str, python_type) -> FieldDef:
         """Convert a Python field to a FieldDef."""
-
-        is_optional = False
-        origin = get_origin(python_type)
-
-        # Handle Union types (including Optional which is Union[T, None])
-        if origin is Union or isinstance(python_type, types.UnionType):
-            args = get_args(python_type)
-            non_none_args = [a for a in args if a is not type(None)]
-            if type(None) in args or len(non_none_args) < len(args):
-                is_optional = True
-            if len(non_none_args) == 1:
-                python_type = non_none_args[0]
-            elif len(non_none_args) > 1:
-                # Complex union type - not fully supported yet
-                raise TypeError(f"Complex union types not supported: {python_type}")
-
+        python_type, is_optional = unwrap_optional(python_type)
         field_type = self._python_type_to_field_type(python_type)
-
-        return FieldDef(
-            name=name,
-            field_type=field_type,
-            optional=is_optional,
-        )
+        return FieldDef(name=name, field_type=field_type, optional=is_optional)
 
     def _python_type_to_field_type(self, python_type) -> FieldType:
         """Map a Python type to a FieldType."""
-        # Handle primitives
         if python_type is str:
             return FieldType(kind="primitive", primitive="str")
-
+        if python_type is int:
             return FieldType(kind="primitive", primitive="int")
-
+        if python_type is float:
             return FieldType(kind="primitive", primitive="float")
-
+        if python_type is bool:
             return FieldType(kind="primitive", primitive="bool")
-
+        if python_type is bytes:
             return FieldType(kind="primitive", primitive="bytes")
 
-
-
-
-
-        # Try to extract dtype info if available
-        dtype = "float32"  # Default
-        args = get_args(python_type)
-        if args:
-            # NDArray[np.float64] or similar
-            dtype_arg = args[-1] if args else None
-            if dtype_arg is not None:
-                dtype = self._numpy_dtype_to_string(dtype_arg)
-
-        return FieldType(kind="ndarray", dtype=dtype, shape=None)
+        if is_ndarray_type(python_type):
+            return FieldType(
+                kind="ndarray", dtype=extract_ndarray_dtype(python_type), shape=None
+            )
 
-        # Check for list/array types
         origin = get_origin(python_type)
         if origin is list:
             args = get_args(python_type)
-
-
-
-
-
-
-
-        # Check for nested PackableSample (not yet supported)
+            items = (
+                self._python_type_to_field_type(args[0])
+                if args
+                else FieldType(kind="primitive", primitive="str")
+            )
+            return FieldType(kind="array", items=items)
+
         if is_dataclass(python_type):
             raise TypeError(
                 f"Nested dataclass types not yet supported: {python_type.__name__}. "
@@ -203,33 +177,6 @@ class SchemaPublisher:
 
         raise TypeError(f"Unsupported type for schema field: {python_type}")
 
-    def _numpy_dtype_to_string(self, dtype) -> str:
-        """Convert a numpy dtype annotation to a string."""
-        dtype_str = str(dtype)
-        # Handle common numpy dtypes
-        dtype_map = {
-            "float16": "float16",
-            "float32": "float32",
-            "float64": "float64",
-            "int8": "int8",
-            "int16": "int16",
-            "int32": "int32",
-            "int64": "int64",
-            "uint8": "uint8",
-            "uint16": "uint16",
-            "uint32": "uint32",
-            "uint64": "uint64",
-            "bool": "bool",
-            "complex64": "complex64",
-            "complex128": "complex128",
-        }
-
-        for key, value in dtype_map.items():
-            if key in dtype_str:
-                return value
-
-        return "float32"  # Default fallback
-
 
 class SchemaLoader:
     """Loads PackableSample schemas from ATProto.
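Taken together, the two hunks above replace the inline Union handling and the numpy dtype lookup table with shared helpers from atdata/_type_utils.py. As an illustration of the mapping the new code produces (the FieldType values are inferred from the constructors visible in these hunks; the sample class and the NDArray import path are hypothetical):

    from typing import Optional
    import numpy as np
    from numpy.typing import NDArray  # assumed source of the NDArray annotation

    import atdata

    @atdata.packable
    class MySample:
        image: NDArray[np.float32]  # -> FieldType(kind="ndarray", dtype="float32", shape=None)
        label: int                  # -> FieldType(kind="primitive", primitive="int")
        caption: Optional[str]      # -> optional=True, FieldType(kind="primitive", primitive="str")
        scores: list[float]         # -> FieldType(kind="array", items=FieldType(kind="primitive", primitive="float"))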
@@ -237,7 +184,7 @@ class SchemaLoader:
     This class fetches schema records from ATProto and can list available
     schemas from a repository.
 
-
+    Examples:
         >>> client = AtmosphereClient()
         >>> client.login("handle", "password")
         >>>
atdata/atmosphere/store.py
ADDED
@@ -0,0 +1,204 @@
+"""PDS blob storage for dataset shards.
+
+This module provides ``PDSBlobStore``, an implementation of the AbstractDataStore
+protocol that stores dataset shards as ATProto blobs in a Personal Data Server.
+
+This enables fully decentralized dataset storage where both metadata (records)
+and data (blobs) live on the AT Protocol network.
+
+Examples:
+    >>> from atdata.atmosphere import AtmosphereClient, PDSBlobStore
+    >>>
+    >>> client = AtmosphereClient()
+    >>> client.login("handle.bsky.social", "app-password")
+    >>>
+    >>> store = PDSBlobStore(client)
+    >>> urls = store.write_shards(dataset, prefix="mnist/v1")
+    >>> print(urls)
+    ['at://did:plc:.../blob/bafyrei...', ...]
+"""
+
+from __future__ import annotations
+
+import tempfile
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+import webdataset as wds
+
+if TYPE_CHECKING:
+    from ..dataset import Dataset
+    from .._sources import BlobSource
+    from .client import AtmosphereClient
+
+
+@dataclass
+class PDSBlobStore:
+    """PDS blob store implementing AbstractDataStore protocol.
+
+    Stores dataset shards as ATProto blobs, enabling decentralized dataset
+    storage on the AT Protocol network.
+
+    Each shard is written to a temporary tar file, then uploaded as a blob
+    to the user's PDS. The returned URLs are AT URIs that can be resolved
+    to HTTP URLs for streaming.
+
+    Attributes:
+        client: Authenticated AtmosphereClient instance.
+
+    Examples:
+        >>> store = PDSBlobStore(client)
+        >>> urls = store.write_shards(dataset, prefix="training/v1")
+        >>> # Returns AT URIs like:
+        >>> # ['at://did:plc:abc/blob/bafyrei...', ...]
+    """
+
+    client: "AtmosphereClient"
+
+    def write_shards(
+        self,
+        ds: "Dataset",
+        *,
+        prefix: str,
+        maxcount: int = 10000,
+        maxsize: float = 3e9,
+        **kwargs: Any,
+    ) -> list[str]:
+        """Write dataset shards as PDS blobs.
+
+        Creates tar archives from the dataset and uploads each as a blob
+        to the authenticated user's PDS.
+
+        Args:
+            ds: The Dataset to write.
+            prefix: Logical path prefix for naming (used in shard names only).
+            maxcount: Maximum samples per shard (default: 10000).
+            maxsize: Maximum shard size in bytes (default: 3GB, PDS limit).
+            **kwargs: Additional args passed to wds.ShardWriter.
+
+        Returns:
+            List of AT URIs for the written blobs, in format:
+            ``at://{did}/blob/{cid}``
+
+        Raises:
+            ValueError: If not authenticated.
+            RuntimeError: If no shards were written.
+
+        Note:
+            PDS blobs have size limits (typically 50MB-5GB depending on PDS).
+            Adjust maxcount/maxsize to stay within limits.
+        """
+        if not self.client.did:
+            raise ValueError("Client must be authenticated to upload blobs")
+
+        did = self.client.did
+        blob_urls: list[str] = []
+
+        # Write shards to temp files, upload each as blob
+        with tempfile.TemporaryDirectory() as temp_dir:
+            shard_pattern = f"{temp_dir}/shard-%06d.tar"
+            written_files: list[str] = []
+
+            # Track written files via custom post callback
+            def track_file(fname: str) -> None:
+                written_files.append(fname)
+
+            with wds.writer.ShardWriter(
+                shard_pattern,
+                maxcount=maxcount,
+                maxsize=maxsize,
+                post=track_file,
+                **kwargs,
+            ) as sink:
+                for sample in ds.ordered(batch_size=None):
+                    sink.write(sample.as_wds)
+
+            if not written_files:
+                raise RuntimeError("No shards written")
+
+            # Upload each shard as a blob
+            for shard_path in written_files:
+                with open(shard_path, "rb") as f:
+                    shard_data = f.read()
+
+                blob_ref = self.client.upload_blob(
+                    shard_data,
+                    mime_type="application/x-tar",
+                )
+
+                # Extract CID from blob reference
+                cid = blob_ref["ref"]["$link"]
+                at_uri = f"at://{did}/blob/{cid}"
+                blob_urls.append(at_uri)
+
+        return blob_urls
+
+    def read_url(self, url: str) -> str:
+        """Resolve an AT URI blob reference to an HTTP URL.
+
+        Transforms ``at://did/blob/cid`` URIs to HTTP URLs that can be
+        streamed by WebDataset.
+
+        Args:
+            url: AT URI in format ``at://{did}/blob/{cid}``.
+
+        Returns:
+            HTTP URL for fetching the blob via PDS API.
+
+        Raises:
+            ValueError: If URL format is invalid or PDS cannot be resolved.
+        """
+        if not url.startswith("at://"):
+            # Not an AT URI, return unchanged
+            return url
+
+        # Parse at://did/blob/cid
+        parts = url[5:].split("/")  # Remove 'at://'
+        if len(parts) != 3 or parts[1] != "blob":
+            raise ValueError(f"Invalid blob AT URI format: {url}")
+
+        did, _, cid = parts
+        return self.client.get_blob_url(did, cid)
+
+    def supports_streaming(self) -> bool:
+        """PDS blobs support streaming via HTTP.
+
+        Returns:
+            True.
+        """
+        return True
+
+    def create_source(self, urls: list[str]) -> "BlobSource":
+        """Create a BlobSource for reading these AT URIs.
+
+        This is a convenience method for creating a DataSource that can
+        stream the blobs written by this store.
+
+        Args:
+            urls: List of AT URIs from write_shards().
+
+        Returns:
+            BlobSource configured for the given URLs.
+
+        Raises:
+            ValueError: If URLs are not valid AT URIs.
+        """
+        from .._sources import BlobSource
+
+        blob_refs: list[dict[str, str]] = []
+
+        for url in urls:
+            if not url.startswith("at://"):
+                raise ValueError(f"Not an AT URI: {url}")
+
+            parts = url[5:].split("/")
+            if len(parts) != 3 or parts[1] != "blob":
+                raise ValueError(f"Invalid blob AT URI: {url}")
+
+            did, _, cid = parts
+            blob_refs.append({"did": did, "cid": cid})
+
+        return BlobSource(blob_refs=blob_refs)
+
+
+__all__ = ["PDSBlobStore"]
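The store above completes the blob path end to end: write_shards uploads tar shards and returns at://{did}/blob/{cid} URIs, read_url resolves one of those to a fetchable HTTP URL, and create_source wraps a list of them for streaming. A minimal round trip, assuming an authenticated client and an existing Dataset instance named dataset (both placeholders); the maxsize value is just an example chosen to stay under a typical 50 MB blob limit:

    from atdata.atmosphere import AtmosphereClient, PDSBlobStore

    client = AtmosphereClient()
    client.login("handle.bsky.social", "app-password")

    store = PDSBlobStore(client)

    # Upload: each shard becomes a PDS blob addressed as at://{did}/blob/{cid}.
    at_uris = store.write_shards(dataset, prefix="mnist/v1", maxsize=45e6)

    # Read back: resolve a single AT URI, or build a streaming source for all of them.
    http_url = store.read_url(at_uris[0])
    source = store.create_source(at_uris)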