atdata 0.1.3b3-py3-none-any.whl → 0.2.0a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +39 -1
- atdata/_helpers.py +39 -3
- atdata/atmosphere/__init__.py +61 -0
- atdata/atmosphere/_types.py +329 -0
- atdata/atmosphere/client.py +393 -0
- atdata/atmosphere/lens.py +280 -0
- atdata/atmosphere/records.py +342 -0
- atdata/atmosphere/schema.py +296 -0
- atdata/dataset.py +336 -203
- atdata/lens.py +177 -77
- atdata/local.py +492 -0
- atdata-0.2.0a1.dist-info/METADATA +181 -0
- atdata-0.2.0a1.dist-info/RECORD +16 -0
- {atdata-0.1.3b3.dist-info → atdata-0.2.0a1.dist-info}/WHEEL +1 -1
- atdata-0.1.3b3.dist-info/METADATA +0 -18
- atdata-0.1.3b3.dist-info/RECORD +0 -9
- {atdata-0.1.3b3.dist-info → atdata-0.2.0a1.dist-info}/entry_points.txt +0 -0
- {atdata-0.1.3b3.dist-info → atdata-0.2.0a1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,342 @@
+"""Dataset record publishing and loading for ATProto.
+
+This module provides classes for publishing dataset index records to ATProto
+and loading them back. Dataset records are published as
+``ac.foundation.dataset.record`` records.
+"""
+
+from typing import Type, TypeVar, Optional
+import msgpack
+
+from .client import AtmosphereClient
+from .schema import SchemaPublisher
+from ._types import (
+    AtUri,
+    DatasetRecord,
+    StorageLocation,
+    LEXICON_NAMESPACE,
+)
+
+# Import for type checking only to avoid circular imports
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from ..dataset import PackableSample, Dataset
+
+ST = TypeVar("ST", bound="PackableSample")
+
+
+class DatasetPublisher:
+    """Publishes dataset index records to ATProto.
+
+    This class creates dataset records that reference a schema and point to
+    external storage (WebDataset URLs) or ATProto blobs.
+
+    Example:
+        >>> dataset = atdata.Dataset[MySample]("s3://bucket/data-{000000..000009}.tar")
+        >>>
+        >>> client = AtmosphereClient()
+        >>> client.login("handle", "password")
+        >>>
+        >>> publisher = DatasetPublisher(client)
+        >>> uri = publisher.publish(
+        ...     dataset,
+        ...     name="My Training Data",
+        ...     description="Training data for my model",
+        ...     tags=["computer-vision", "training"],
+        ... )
+    """
+
+    def __init__(self, client: AtmosphereClient):
+        """Initialize the dataset publisher.
+
+        Args:
+            client: Authenticated AtmosphereClient instance.
+        """
+        self.client = client
+        self._schema_publisher = SchemaPublisher(client)
+
+    def publish(
+        self,
+        dataset: "Dataset[ST]",
+        *,
+        name: str,
+        schema_uri: Optional[str] = None,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        auto_publish_schema: bool = True,
+        schema_version: str = "1.0.0",
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Publish a dataset index record to ATProto.
+
+        Args:
+            dataset: The Dataset to publish.
+            name: Human-readable dataset name.
+            schema_uri: AT URI of the schema record. If not provided and
+                auto_publish_schema is True, the schema will be published.
+            description: Human-readable description.
+            tags: Searchable tags for discovery.
+            license: SPDX license identifier (e.g., 'MIT', 'Apache-2.0').
+            auto_publish_schema: If True and schema_uri not provided,
+                automatically publish the schema first.
+            schema_version: Version for auto-published schema.
+            rkey: Optional explicit record key.
+
+        Returns:
+            The AT URI of the created dataset record.
+
+        Raises:
+            ValueError: If schema_uri is not provided and auto_publish_schema is False.
+        """
+        # Ensure we have a schema reference
+        if schema_uri is None:
+            if not auto_publish_schema:
+                raise ValueError(
+                    "schema_uri is required when auto_publish_schema=False"
+                )
+            # Auto-publish the schema
+            schema_uri_obj = self._schema_publisher.publish(
+                dataset.sample_type,
+                version=schema_version,
+            )
+            schema_uri = str(schema_uri_obj)
+
+        # Build the storage location
+        storage = StorageLocation(
+            kind="external",
+            urls=[dataset.url],
+        )
+
+        # Build dataset record
+        metadata_bytes: Optional[bytes] = None
+        if dataset.metadata is not None:
+            metadata_bytes = msgpack.packb(dataset.metadata)
+
+        dataset_record = DatasetRecord(
+            name=name,
+            schema_ref=schema_uri,
+            storage=storage,
+            description=description,
+            tags=tags or [],
+            license=license,
+            metadata=metadata_bytes,
+        )
+
+        # Publish to ATProto
+        return self.client.create_record(
+            collection=f"{LEXICON_NAMESPACE}.record",
+            record=dataset_record.to_record(),
+            rkey=rkey,
+            validate=False,
+        )
+
+    def publish_with_urls(
+        self,
+        urls: list[str],
+        schema_uri: str,
+        *,
+        name: str,
+        description: Optional[str] = None,
+        tags: Optional[list[str]] = None,
+        license: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Publish a dataset record with explicit URLs.
+
+        This method allows publishing a dataset record without having a
+        Dataset object, useful for registering existing WebDataset files.
+
+        Args:
+            urls: List of WebDataset URLs with brace notation.
+            schema_uri: AT URI of the schema record.
+            name: Human-readable dataset name.
+            description: Human-readable description.
+            tags: Searchable tags for discovery.
+            license: SPDX license identifier.
+            metadata: Arbitrary metadata dictionary.
+            rkey: Optional explicit record key.
+
+        Returns:
+            The AT URI of the created dataset record.
+        """
+        storage = StorageLocation(
+            kind="external",
+            urls=urls,
+        )
+
+        metadata_bytes: Optional[bytes] = None
+        if metadata is not None:
+            metadata_bytes = msgpack.packb(metadata)
+
+        dataset_record = DatasetRecord(
+            name=name,
+            schema_ref=schema_uri,
+            storage=storage,
+            description=description,
+            tags=tags or [],
+            license=license,
+            metadata=metadata_bytes,
+        )
+
+        return self.client.create_record(
+            collection=f"{LEXICON_NAMESPACE}.record",
+            record=dataset_record.to_record(),
+            rkey=rkey,
+            validate=False,
+        )
+
+
+class DatasetLoader:
+    """Loads dataset records from ATProto.
+
+    This class fetches dataset index records and can create Dataset objects
+    from them. Note that loading a dataset requires having the corresponding
+    Python class for the sample type.
+
+    Example:
+        >>> client = AtmosphereClient()
+        >>> loader = DatasetLoader(client)
+        >>>
+        >>> # List available datasets
+        >>> datasets = loader.list_all()
+        >>> for ds in datasets:
+        ...     print(ds["name"], ds["schemaRef"])
+        >>>
+        >>> # Get a specific dataset record
+        >>> record = loader.get("at://did:plc:abc/ac.foundation.dataset.record/xyz")
+    """
+
+    def __init__(self, client: AtmosphereClient):
+        """Initialize the dataset loader.
+
+        Args:
+            client: AtmosphereClient instance.
+        """
+        self.client = client
+
+    def get(self, uri: str | AtUri) -> dict:
+        """Fetch a dataset record by AT URI.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            The dataset record as a dictionary.
+
+        Raises:
+            ValueError: If the record is not a dataset record.
+        """
+        record = self.client.get_record(uri)
+
+        expected_type = f"{LEXICON_NAMESPACE}.record"
+        if record.get("$type") != expected_type:
+            raise ValueError(
+                f"Record at {uri} is not a dataset record. "
+                f"Expected $type='{expected_type}', got '{record.get('$type')}'"
+            )
+
+        return record
+
+    def list_all(
+        self,
+        repo: Optional[str] = None,
+        limit: int = 100,
+    ) -> list[dict]:
+        """List dataset records from a repository.
+
+        Args:
+            repo: The DID of the repository. Defaults to authenticated user.
+            limit: Maximum number of records to return.
+
+        Returns:
+            List of dataset records.
+        """
+        return self.client.list_datasets(repo=repo, limit=limit)
+
+    def get_urls(self, uri: str | AtUri) -> list[str]:
+        """Get the WebDataset URLs from a dataset record.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            List of WebDataset URLs.
+
+        Raises:
+            ValueError: If the storage type is not external URLs.
+        """
+        record = self.get(uri)
+        storage = record.get("storage", {})
+
+        storage_type = storage.get("$type", "")
+        if "storageExternal" in storage_type:
+            return storage.get("urls", [])
+        elif "storageBlobs" in storage_type:
+            raise ValueError(
+                "Dataset uses blob storage, not external URLs. "
+                "Use get_blobs() instead."
+            )
+        else:
+            raise ValueError(f"Unknown storage type: {storage_type}")
+
+    def get_metadata(self, uri: str | AtUri) -> Optional[dict]:
+        """Get the metadata from a dataset record.
+
+        Args:
+            uri: The AT URI of the dataset record.
+
+        Returns:
+            The metadata dictionary, or None if no metadata.
+        """
+        record = self.get(uri)
+        metadata_bytes = record.get("metadata")
+
+        if metadata_bytes is None:
+            return None
+
+        return msgpack.unpackb(metadata_bytes, raw=False)
+
+    def to_dataset(
+        self,
+        uri: str | AtUri,
+        sample_type: Type[ST],
+    ) -> "Dataset[ST]":
+        """Create a Dataset object from an ATProto record.
+
+        This method creates a Dataset instance from a published record.
+        You must provide the sample type class, which should match the
+        schema referenced by the record.
+
+        Args:
+            uri: The AT URI of the dataset record.
+            sample_type: The Python class for the sample type.
+
+        Returns:
+            A Dataset instance configured from the record.
+
+        Raises:
+            ValueError: If the storage type is not external URLs.
+
+        Example:
+            >>> loader = DatasetLoader(client)
+            >>> dataset = loader.to_dataset(uri, MySampleType)
+            >>> for batch in dataset.shuffled(batch_size=32):
+            ...     process(batch)
+        """
+        # Import here to avoid circular import
+        from ..dataset import Dataset
+
+        urls = self.get_urls(uri)
+        if not urls:
+            raise ValueError("Dataset record has no URLs")
+
+        # Use the first URL (multi-URL support could be added later)
+        url = urls[0]
+
+        # Get metadata URL if available
+        record = self.get(uri)
+        metadata_url = record.get("metadataUrl")
+
+        return Dataset[sample_type](url, metadata_url=metadata_url)
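Taken together, DatasetPublisher and DatasetLoader support a full publish/load round trip. The sketch below is illustrative rather than part of the diff: the handle, app password, bucket URL, and MySample class are placeholders, and it assumes the classes behave as their docstrings above describe.

# Hypothetical round trip with the new records module. MySample, the
# credentials, and the bucket URL are placeholders.
import atdata
from numpy.typing import NDArray
from atdata.atmosphere.client import AtmosphereClient
from atdata.atmosphere.records import DatasetPublisher, DatasetLoader

@atdata.packable
class MySample:
    image: NDArray
    label: str

client = AtmosphereClient()
client.login("alice.example.com", "app-password")  # placeholder credentials

# Publish: with auto_publish_schema=True (the default), the sample schema
# is published first and the dataset record references it.
dataset = atdata.Dataset[MySample]("s3://bucket/data-{000000..000009}.tar")
uri = DatasetPublisher(client).publish(
    dataset,
    name="My Training Data",
    tags=["computer-vision", "training"],
)

# Load: resolve the record back into a Dataset, supplying the sample class.
loader = DatasetLoader(client)
print(loader.get_urls(uri))  # the WebDataset URL(s) stored in the record
restored = loader.to_dataset(uri, MySample)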
@@ -0,0 +1,296 @@
+"""Schema publishing and loading for ATProto.
+
+This module provides classes for publishing PackableSample schemas to ATProto
+and loading them back. Schemas are published as ``ac.foundation.dataset.sampleSchema``
+records.
+"""
+
+from dataclasses import fields, is_dataclass
+from typing import Type, TypeVar, Optional, Union, get_type_hints, get_origin, get_args
+import types
+
+from .client import AtmosphereClient
+from ._types import (
+    AtUri,
+    SchemaRecord,
+    FieldDef,
+    FieldType,
+    LEXICON_NAMESPACE,
+)
+
+# Import for type checking only to avoid circular imports
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from ..dataset import PackableSample
+
+ST = TypeVar("ST", bound="PackableSample")
+
+
+class SchemaPublisher:
+    """Publishes PackableSample schemas to ATProto.
+
+    This class introspects a PackableSample class to extract its field
+    definitions and publishes them as an ATProto schema record.
+
+    Example:
+        >>> @atdata.packable
+        ... class MySample:
+        ...     image: NDArray
+        ...     label: str
+        ...
+        >>> client = AtmosphereClient()
+        >>> client.login("handle", "password")
+        >>>
+        >>> publisher = SchemaPublisher(client)
+        >>> uri = publisher.publish(MySample, version="1.0.0")
+        >>> print(uri)
+        at://did:plc:.../ac.foundation.dataset.sampleSchema/...
+    """
+
+    def __init__(self, client: AtmosphereClient):
+        """Initialize the schema publisher.
+
+        Args:
+            client: Authenticated AtmosphereClient instance.
+        """
+        self.client = client
+
+    def publish(
+        self,
+        sample_type: Type[ST],
+        *,
+        name: Optional[str] = None,
+        version: str = "1.0.0",
+        description: Optional[str] = None,
+        metadata: Optional[dict] = None,
+        rkey: Optional[str] = None,
+    ) -> AtUri:
+        """Publish a PackableSample schema to ATProto.
+
+        Args:
+            sample_type: The PackableSample class to publish.
+            name: Human-readable name. Defaults to the class name.
+            version: Semantic version string (e.g., '1.0.0').
+            description: Human-readable description.
+            metadata: Arbitrary metadata dictionary.
+            rkey: Optional explicit record key. If not provided, a TID is generated.
+
+        Returns:
+            The AT URI of the created schema record.
+
+        Raises:
+            ValueError: If sample_type is not a dataclass or client is not authenticated.
+            TypeError: If a field type is not supported.
+        """
+        if not is_dataclass(sample_type):
+            raise ValueError(f"{sample_type.__name__} must be a dataclass (use @packable)")
+
+        # Build the schema record
+        schema_record = self._build_schema_record(
+            sample_type,
+            name=name,
+            version=version,
+            description=description,
+            metadata=metadata,
+        )
+
+        # Publish to ATProto
+        return self.client.create_record(
+            collection=f"{LEXICON_NAMESPACE}.sampleSchema",
+            record=schema_record.to_record(),
+            rkey=rkey,
+            validate=False,  # PDS doesn't know our lexicon
+        )
+
+    def _build_schema_record(
+        self,
+        sample_type: Type[ST],
+        *,
+        name: Optional[str],
+        version: str,
+        description: Optional[str],
+        metadata: Optional[dict],
+    ) -> SchemaRecord:
+        """Build a SchemaRecord from a PackableSample class."""
+        field_defs = []
+        type_hints = get_type_hints(sample_type)
+
+        for f in fields(sample_type):
+            field_type = type_hints.get(f.name, f.type)
+            field_def = self._field_to_def(f.name, field_type)
+            field_defs.append(field_def)
+
+        return SchemaRecord(
+            name=name or sample_type.__name__,
+            version=version,
+            description=description,
+            fields=field_defs,
+            metadata=metadata,
+        )
+
+    def _field_to_def(self, name: str, python_type) -> FieldDef:
+        """Convert a Python field to a FieldDef."""
+        # Check for Optional types (Union with None)
+        is_optional = False
+        origin = get_origin(python_type)
+
+        # Handle Union types (including Optional which is Union[T, None])
+        if origin is Union or isinstance(python_type, types.UnionType):
+            args = get_args(python_type)
+            non_none_args = [a for a in args if a is not type(None)]
+            if type(None) in args or len(non_none_args) < len(args):
+                is_optional = True
+            if len(non_none_args) == 1:
+                python_type = non_none_args[0]
+            elif len(non_none_args) > 1:
+                # Complex union type - not fully supported yet
+                raise TypeError(f"Complex union types not supported: {python_type}")
+
+        field_type = self._python_type_to_field_type(python_type)
+
+        return FieldDef(
+            name=name,
+            field_type=field_type,
+            optional=is_optional,
+        )
+
+    def _python_type_to_field_type(self, python_type) -> FieldType:
+        """Map a Python type to a FieldType."""
+        # Handle primitives
+        if python_type is str:
+            return FieldType(kind="primitive", primitive="str")
+        elif python_type is int:
+            return FieldType(kind="primitive", primitive="int")
+        elif python_type is float:
+            return FieldType(kind="primitive", primitive="float")
+        elif python_type is bool:
+            return FieldType(kind="primitive", primitive="bool")
+        elif python_type is bytes:
+            return FieldType(kind="primitive", primitive="bytes")
+
+        # Check for NDArray
+        # NDArray from numpy.typing is a special generic alias
+        type_str = str(python_type)
+        if "NDArray" in type_str or "ndarray" in type_str.lower():
+            # Try to extract dtype info if available
+            dtype = "float32"  # Default
+            args = get_args(python_type)
+            if args:
+                # NDArray[np.float64] or similar
+                dtype_arg = args[-1] if args else None
+                if dtype_arg is not None:
+                    dtype = self._numpy_dtype_to_string(dtype_arg)
+
+            return FieldType(kind="ndarray", dtype=dtype, shape=None)
+
+        # Check for list/array types
+        origin = get_origin(python_type)
+        if origin is list:
+            args = get_args(python_type)
+            if args:
+                items = self._python_type_to_field_type(args[0])
+                return FieldType(kind="array", items=items)
+            else:
+                # Untyped list
+                return FieldType(kind="array", items=FieldType(kind="primitive", primitive="str"))
+
+        # Check for nested PackableSample (not yet supported)
+        if is_dataclass(python_type):
+            raise TypeError(
+                f"Nested dataclass types not yet supported: {python_type.__name__}. "
+                "Publish nested types separately and use references."
+            )
+
+        raise TypeError(f"Unsupported type for schema field: {python_type}")
+
+    def _numpy_dtype_to_string(self, dtype) -> str:
+        """Convert a numpy dtype annotation to a string."""
+        dtype_str = str(dtype)
+        # Handle common numpy dtypes
+        dtype_map = {
+            "float16": "float16",
+            "float32": "float32",
+            "float64": "float64",
+            "int8": "int8",
+            "int16": "int16",
+            "int32": "int32",
+            "int64": "int64",
+            "uint8": "uint8",
+            "uint16": "uint16",
+            "uint32": "uint32",
+            "uint64": "uint64",
+            "bool": "bool",
+            "complex64": "complex64",
+            "complex128": "complex128",
+        }
+
+        for key, value in dtype_map.items():
+            if key in dtype_str:
+                return value
+
+        return "float32"  # Default fallback
+
+
+class SchemaLoader:
+    """Loads PackableSample schemas from ATProto.
+
+    This class fetches schema records from ATProto and can list available
+    schemas from a repository.
+
+    Example:
+        >>> client = AtmosphereClient()
+        >>> client.login("handle", "password")
+        >>>
+        >>> loader = SchemaLoader(client)
+        >>> schema = loader.get("at://did:plc:.../ac.foundation.dataset.sampleSchema/...")
+        >>> print(schema["name"])
+        MySample
+    """
+
+    def __init__(self, client: AtmosphereClient):
+        """Initialize the schema loader.
+
+        Args:
+            client: AtmosphereClient instance (authentication optional for reads).
+        """
+        self.client = client
+
+    def get(self, uri: str | AtUri) -> dict:
+        """Fetch a schema record by AT URI.
+
+        Args:
+            uri: The AT URI of the schema record.
+
+        Returns:
+            The schema record as a dictionary.
+
+        Raises:
+            ValueError: If the record is not a schema record.
+            atproto.exceptions.AtProtocolError: If record not found.
+        """
+        record = self.client.get_record(uri)
+
+        expected_type = f"{LEXICON_NAMESPACE}.sampleSchema"
+        if record.get("$type") != expected_type:
+            raise ValueError(
+                f"Record at {uri} is not a schema record. "
+                f"Expected $type='{expected_type}', got '{record.get('$type')}'"
+            )
+
+        return record
+
+    def list_all(
+        self,
+        repo: Optional[str] = None,
+        limit: int = 100,
+    ) -> list[dict]:
+        """List schema records from a repository.
+
+        Args:
+            repo: The DID of the repository. Defaults to authenticated user.
+            limit: Maximum number of records to return.
+
+        Returns:
+            List of schema records.
+        """
+        return self.client.list_schemas(repo=repo, limit=limit)
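The mapping in _field_to_def and _python_type_to_field_type is easiest to see on a concrete class. The sketch below is illustrative, not from the package: Reading is a placeholder type, and it exercises the private _build_schema_record directly so no login is needed; a real publish would go through SchemaPublisher.publish.

# Hypothetical sample type; the comments show the FieldType each annotation
# should map to, per _python_type_to_field_type above.
from typing import Optional
import numpy as np
from numpy.typing import NDArray

import atdata
from atdata.atmosphere.client import AtmosphereClient
from atdata.atmosphere.schema import SchemaPublisher

@atdata.packable
class Reading:
    values: NDArray[np.float64]  # kind="ndarray", dtype="float64"
    label: str                   # kind="primitive", primitive="str"
    scores: list[float]          # kind="array", items=primitive "float"
    note: Optional[str] = None   # primitive "str" with optional=True

publisher = SchemaPublisher(AtmosphereClient())
record = publisher._build_schema_record(
    Reading,
    name=None,  # defaults to the class name, "Reading"
    version="1.0.0",
    description=None,
    metadata=None,
)
# record.fields now holds one FieldDef per dataclass field, mapped as annotated.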