atdata-0.2.2b1-py3-none-any.whl → atdata-0.3.0b1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/.gitignore +1 -0
- atdata/__init__.py +31 -1
- atdata/_cid.py +29 -35
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +33 -17
- atdata/_hf_api.py +109 -59
- atdata/_logging.py +70 -0
- atdata/_protocols.py +74 -132
- atdata/_schema_codec.py +38 -41
- atdata/_sources.py +57 -64
- atdata/_stub_manager.py +31 -26
- atdata/_type_utils.py +47 -7
- atdata/atmosphere/__init__.py +31 -24
- atdata/atmosphere/_types.py +11 -11
- atdata/atmosphere/client.py +11 -8
- atdata/atmosphere/lens.py +27 -30
- atdata/atmosphere/records.py +34 -39
- atdata/atmosphere/schema.py +35 -31
- atdata/atmosphere/store.py +16 -20
- atdata/cli/__init__.py +163 -168
- atdata/cli/diagnose.py +12 -8
- atdata/cli/inspect.py +69 -0
- atdata/cli/local.py +5 -2
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +678 -533
- atdata/lens.py +85 -83
- atdata/local/__init__.py +71 -0
- atdata/local/_entry.py +157 -0
- atdata/local/_index.py +940 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/local/_s3.py +349 -0
- atdata/local/_schema.py +380 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +20 -24
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/testing.py +337 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +5 -1
- atdata-0.3.0b1.dist-info/RECORD +54 -0
- atdata/local.py +0 -1707
- atdata-0.2.2b1.dist-info/RECORD +0 -28
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/_logging.py
ADDED
@@ -0,0 +1,70 @@
+"""Pluggable logging for atdata.
+
+Provides a thin abstraction over Python's stdlib ``logging`` module that can
+be replaced with ``structlog`` or any other logger implementing the standard
+``debug``/``info``/``warning``/``error`` interface.
+
+Usage::
+
+    # Default: stdlib logging (no config needed)
+    from atdata._logging import get_logger
+    log = get_logger()
+    log.info("processing shard", extra={"shard": "data-000.tar"})
+
+    # Plug in structlog (or any compatible logger):
+    import structlog
+    import atdata
+    atdata.configure_logging(structlog.get_logger())
+
+The module also exports a lightweight ``LoggerProtocol`` for type checking
+custom logger implementations.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Protocol, runtime_checkable
+
+
+@runtime_checkable
+class LoggerProtocol(Protocol):
+    """Minimal interface that a pluggable logger must satisfy."""
+
+    def debug(self, msg: str, *args: Any, **kwargs: Any) -> None: ...
+    def info(self, msg: str, *args: Any, **kwargs: Any) -> None: ...
+    def warning(self, msg: str, *args: Any, **kwargs: Any) -> None: ...
+    def error(self, msg: str, *args: Any, **kwargs: Any) -> None: ...
+
+
+# ---------------------------------------------------------------------------
+# Module-level state
+# ---------------------------------------------------------------------------
+
+_logger: LoggerProtocol = logging.getLogger("atdata")
+
+
+def configure_logging(logger: LoggerProtocol) -> None:
+    """Replace the default logger with a custom implementation.
+
+    The provided logger must implement ``debug``, ``info``, ``warning``, and
+    ``error`` methods. Both ``structlog`` bound loggers and stdlib
+    ``logging.Logger`` instances satisfy this interface.
+
+    Args:
+        logger: A logger instance implementing :class:`LoggerProtocol`.
+
+    Examples:
+        >>> import structlog
+        >>> atdata.configure_logging(structlog.get_logger())
+    """
+    global _logger
+    _logger = logger
+
+
+def get_logger() -> LoggerProtocol:
+    """Return the currently configured logger.
+
+    Returns the stdlib ``logging.getLogger("atdata")`` by default, or
+    whatever was last set via :func:`configure_logging`.
+    """
+    return _logger
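The module keeps the active logger in module-level state, so any object with the four standard methods can be swapped in. Below is a minimal sketch of a custom structured logger that satisfies ``LoggerProtocol``; the ``JsonLogger`` class and its output format are hypothetical illustrations, not part of atdata — only the ``atdata._logging`` names are from the diff above.

# Sketch: plugging a hypothetical custom logger into atdata.
import json
import sys
from typing import Any

from atdata._logging import LoggerProtocol, configure_logging, get_logger


class JsonLogger:
    """Writes one JSON object per call; structurally satisfies LoggerProtocol."""

    def _emit(self, level: str, msg: str, **kwargs: Any) -> None:
        sys.stderr.write(json.dumps({"level": level, "msg": msg, **kwargs}) + "\n")

    def debug(self, msg: str, *args: Any, **kwargs: Any) -> None:
        self._emit("debug", msg, **kwargs)

    def info(self, msg: str, *args: Any, **kwargs: Any) -> None:
        self._emit("info", msg, **kwargs)

    def warning(self, msg: str, *args: Any, **kwargs: Any) -> None:
        self._emit("warning", msg, **kwargs)

    def error(self, msg: str, *args: Any, **kwargs: Any) -> None:
        self._emit("error", msg, **kwargs)


# LoggerProtocol is runtime_checkable, so conformance is verifiable at runtime
# without inheritance.
assert isinstance(JsonLogger(), LoggerProtocol)

configure_logging(JsonLogger())
get_logger().info("processing shard", shard="data-000.tar")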
atdata/_protocols.py
CHANGED
@@ -10,7 +10,7 @@ formalize that common interface.
 Note:
     Protocol methods use ``...`` (Ellipsis) as the body per PEP 544. This is
     the standard Python syntax for Protocol definitions - these are interface
-    specifications, not stub implementations. Concrete classes (
+    specifications, not stub implementations. Concrete classes (Index,
     AtmosphereIndex, etc.) provide the actual implementations.
 
 Protocols:
@@ -19,22 +19,19 @@ Protocols:
     AbstractIndex: Protocol for index operations (schemas, datasets, lenses)
     AbstractDataStore: Protocol for data storage operations
 
-
-
-
-
-
-
-
-
-    >>> process_datasets(local_index)
-    >>> process_datasets(atmosphere_index)
+Examples:
+    >>> def process_datasets(index: AbstractIndex) -> None:
+    ...     for entry in index.list_datasets():
+    ...         print(f"{entry.name}: {entry.data_urls}")
+    ...
+    >>> # Works with either Index or AtmosphereIndex
+    >>> process_datasets(local_index)
+    >>> process_datasets(atmosphere_index)
 """
 
 from typing import (
     IO,
     Any,
-    ClassVar,
     Iterator,
     Optional,
     Protocol,
@@ -67,39 +64,29 @@ class Packable(Protocol):
     - Schema publishing (class introspection via dataclass fields)
     - Serialization/deserialization (packed, from_bytes)
 
-
-
-
-
-
-
-
-
-
-
-        ...     instance = sample_type.from_bytes(data)
-        ...     print(instance.packed)
+    Examples:
+        >>> @packable
+        ... class MySample:
+        ...     name: str
+        ...     value: int
+        ...
+        >>> def process(sample_type: Type[Packable]) -> None:
+        ...     # Type checker knows sample_type has from_bytes, packed, etc.
+        ...     instance = sample_type.from_bytes(data)
+        ...     print(instance.packed)
     """
 
     @classmethod
-    def from_data(cls, data: dict[str, Any]) -> "Packable":
-        """Create instance from unpacked msgpack data dictionary."""
-        ...
+    def from_data(cls, data: dict[str, Any]) -> "Packable": ...
 
     @classmethod
-    def from_bytes(cls, bs: bytes) -> "Packable":
-        """Create instance from raw msgpack bytes."""
-        ...
+    def from_bytes(cls, bs: bytes) -> "Packable": ...
 
     @property
-    def packed(self) -> bytes:
-        """Pack this sample's data into msgpack bytes."""
-        ...
+    def packed(self) -> bytes: ...
 
     @property
-    def as_wds(self) -> dict[str, Any]:
-        """WebDataset-compatible representation with __key__ and msgpack."""
-        ...
+    def as_wds(self) -> dict[str, Any]: ...
 
 
 ##
@@ -121,16 +108,14 @@ class IndexEntry(Protocol):
     """
 
     @property
-    def name(self) -> str:
-        """Human-readable dataset name."""
-        ...
+    def name(self) -> str: ...
 
     @property
     def schema_ref(self) -> str:
-        """
+        """Schema reference string.
 
-
-
+        Local: ``local://schemas/{module.Class}@{version}``
+        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.sampleSchema/...``
         """
         ...
 
@@ -144,9 +129,7 @@ class IndexEntry(Protocol):
         ...
 
     @property
-    def metadata(self) -> Optional[dict]:
-        """Arbitrary metadata dictionary, or None if not set."""
-        ...
+    def metadata(self) -> Optional[dict]: ...
 
 
 ##
@@ -154,7 +137,7 @@ class IndexEntry(Protocol):
 
 
 class AbstractIndex(Protocol):
-    """Protocol for index operations - implemented by
+    """Protocol for index operations - implemented by Index and AtmosphereIndex.
 
     This protocol defines the common interface for managing dataset metadata:
     - Publishing and retrieving schemas
@@ -169,21 +152,19 @@ class AbstractIndex(Protocol):
     - ``data_store``: An AbstractDataStore for reading/writing dataset shards.
       If present, ``load_dataset`` will use it for S3 credential resolution.
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-        ...     for entry in index.list_datasets():
-        ...         print(f"{entry.name} -> {entry.schema_ref}")
+    Examples:
+        >>> def publish_and_list(index: AbstractIndex) -> None:
+        ...     # Publish schemas for different types
+        ...     schema1 = index.publish_schema(ImageSample, version="1.0.0")
+        ...     schema2 = index.publish_schema(TextSample, version="1.0.0")
+        ...
+        ...     # Insert datasets of different types
+        ...     index.insert_dataset(image_ds, name="images")
+        ...     index.insert_dataset(text_ds, name="texts")
+        ...
+        ...     # List all datasets (mixed types)
+        ...     for entry in index.list_datasets():
+        ...         print(f"{entry.name} -> {entry.schema_ref}")
     """
 
     @property
@@ -246,21 +227,9 @@ class AbstractIndex(Protocol):
         ...
 
     @property
-    def datasets(self) -> Iterator[IndexEntry]:
-        """Lazily iterate over all dataset entries in this index.
-
-        Yields:
-            IndexEntry for each dataset (may be of different sample types).
-        """
-        ...
+    def datasets(self) -> Iterator[IndexEntry]: ...
 
-    def list_datasets(self) -> list[IndexEntry]:
-        """Get all dataset entries as a materialized list.
-
-        Returns:
-            List of IndexEntry for each dataset.
-        """
-        ...
+    def list_datasets(self) -> list[IndexEntry]: ...
 
     # Schema operations
 
@@ -306,21 +275,9 @@ class AbstractIndex(Protocol):
         ...
 
     @property
-    def schemas(self) -> Iterator[dict]:
-        """Lazily iterate over all schema records in this index.
-
-        Yields:
-            Schema records as dictionaries.
-        """
-        ...
+    def schemas(self) -> Iterator[dict]: ...
 
-    def list_schemas(self) -> list[dict]:
-        """Get all schema records as a materialized list.
-
-        Returns:
-            List of schema records as dictionaries.
-        """
-        ...
+    def list_schemas(self) -> list[dict]: ...
 
     def decode_schema(self, ref: str) -> Type[Packable]:
         """Reconstruct a Python Packable type from a stored schema.
@@ -341,14 +298,12 @@ class AbstractIndex(Protocol):
             KeyError: If schema not found.
             ValueError: If schema cannot be decoded (unsupported field types).
 
-
-
-
-
-
-
-            >>> for sample in ds.ordered():
-            ...     print(sample)  # sample is instance of SampleType
+        Examples:
+            >>> entry = index.get_dataset("my-dataset")
+            >>> SampleType = index.decode_schema(entry.schema_ref)
+            >>> ds = Dataset[SampleType](entry.data_urls[0])
+            >>> for sample in ds.ordered():
+            ...     print(sample)  # sample is instance of SampleType
         """
         ...
 
@@ -368,13 +323,11 @@ class AbstractDataStore(Protocol):
     flexible deployment: local index with S3 storage, atmosphere index with
     S3 storage, or atmosphere index with PDS blobs.
 
-
-
-
-
-
-        >>> print(urls)
-        ['s3://my-bucket/training/v1/shard-000000.tar', ...]
+    Examples:
+        >>> store = S3DataStore(credentials, bucket="my-bucket")
+        >>> urls = store.write_shards(dataset, prefix="training/v1")
+        >>> print(urls)
+        ['s3://my-bucket/training/v1/shard-000000.tar', ...]
     """
 
     def write_shards(
@@ -412,14 +365,7 @@ class AbstractDataStore(Protocol):
         """
         ...
 
-    def supports_streaming(self) -> bool:
-        """Whether this store supports streaming reads.
-
-        Returns:
-            True if the store supports efficient streaming (like S3),
-            False if data must be fully downloaded first.
-        """
-        ...
+    def supports_streaming(self) -> bool: ...
 
 
 ##
@@ -443,18 +389,16 @@ class DataSource(Protocol):
     - ATProto blob streaming
    - Any other source that can provide file-like objects
 
-
-
-
-
-
-
-
-
-
-
-        >>> for sample in ds.ordered():
-        ...     print(sample)
+    Examples:
+        >>> source = S3Source(
+        ...     bucket="my-bucket",
+        ...     keys=["data-000.tar", "data-001.tar"],
+        ...     endpoint="https://r2.example.com",
+        ...     credentials=creds,
+        ... )
+        >>> ds = Dataset[MySample](source)
+        >>> for sample in ds.ordered():
+        ...     print(sample)
     """
 
     @property
@@ -467,12 +411,10 @@ class DataSource(Protocol):
         Yields:
             Tuple of (shard_identifier, file_like_stream).
 
-
-
-
-
-            ...     print(f"Processing {shard_id}")
-            ...     data = stream.read()
+        Examples:
+            >>> for shard_id, stream in source.shards:
+            ...     print(f"Processing {shard_id}")
+            ...     data = stream.read()
         """
         ...
 
@@ -496,13 +438,13 @@ class DataSource(Protocol):
         only its assigned shards rather than iterating all shards.
 
         Args:
-            shard_id: Shard identifier from
+            shard_id: Shard identifier from list_shards().
 
         Returns:
            File-like stream for reading the shard.
 
        Raises:
-            KeyError: If shard_id is not in
+            KeyError: If shard_id is not in list_shards().
        """
        ...

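These are structural protocols: a class conforms by matching the member signatures, with no shared base class. A minimal sketch under that assumption follows; ``InMemoryEntry`` and ``summarize`` are hypothetical illustrations, not atdata APIs, and only ``AbstractIndex``, ``list_datasets``, and the entry properties shown in the diff are taken from the package.

# Sketch: duck-typed conformance to the slimmed-down protocols above.
from typing import Optional

from atdata._protocols import AbstractIndex


class InMemoryEntry:
    """Hypothetical entry: matching properties are enough, no base class."""

    def __init__(self, name: str, schema_ref: str, data_urls: list[str]) -> None:
        self._name = name
        self._schema_ref = schema_ref
        self._data_urls = data_urls

    @property
    def name(self) -> str:
        return self._name

    @property
    def schema_ref(self) -> str:
        return self._schema_ref

    @property
    def data_urls(self) -> list[str]:
        return self._data_urls

    @property
    def metadata(self) -> Optional[dict]:
        return None


def summarize(index: AbstractIndex) -> list[str]:
    # Accepts Index, AtmosphereIndex, or anything else matching the protocol.
    return [f"{entry.name} -> {entry.schema_ref}" for entry in index.list_datasets()]

Because the protocol methods now end in a bare ``...``, type checkers treat them purely as interface declarations; the behavior lives entirely in the concrete classes.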
atdata/_schema_codec.py
CHANGED
@@ -9,19 +9,17 @@ The schema format follows the ATProto record structure defined in
 ``atmosphere/_types.py``, with field types supporting primitives, ndarrays,
 arrays, and schema references.
 
-
-
-
-
-
-
-
-
-
-
-
-    >>> ImageSample = schema_to_type(schema)
-    >>> sample = ImageSample(image=np.zeros((64, 64)), label="cat")
+Examples:
+    >>> schema = {
+    ...     "name": "ImageSample",
+    ...     "version": "1.0.0",
+    ...     "fields": [
+    ...         {"name": "image", "fieldType": {"$type": "...#ndarray", "dtype": "float32"}, "optional": False},
+    ...         {"name": "label", "fieldType": {"$type": "...#primitive", "primitive": "str"}, "optional": False},
+    ...     ]
+    ... }
+    >>> ImageSample = schema_to_type(schema)
+    >>> sample = ImageSample(image=np.zeros((64, 64)), label="cat")
 """
 
 from dataclasses import field, make_dataclass
@@ -30,13 +28,14 @@ import hashlib
 
 from numpy.typing import NDArray
 
-# Import PackableSample for inheritance
+# Import PackableSample for inheritance in dynamic class generation
 from .dataset import PackableSample
+from ._protocols import Packable
 
 
 # Type cache to avoid regenerating identical types
 # Uses insertion order (Python 3.7+) for simple FIFO eviction
-_type_cache: dict[str, Type[
+_type_cache: dict[str, Type[Packable]] = {}
 _TYPE_CACHE_MAX_SIZE = 256
 
 
@@ -132,7 +131,7 @@ def schema_to_type(
     schema: dict,
     *,
     use_cache: bool = True,
-) -> Type[
+) -> Type[Packable]:
     """Generate a PackableSample subclass from a schema record.
 
     This function dynamically creates a dataclass that inherits from PackableSample,
@@ -151,14 +150,12 @@ def schema_to_type(
     Raises:
         ValueError: If schema is malformed or contains unsupported types.
 
-
-
-
-
-
-
-        >>> for sample in ds.ordered():
-        ...     print(sample)
+    Examples:
+        >>> schema = index.get_schema("local://schemas/MySample@1.0.0")
+        >>> MySample = schema_to_type(schema)
+        >>> ds = Dataset[MySample]("data.tar")
+        >>> for sample in ds.ordered():
+        ...     print(sample)
     """
     # Check cache first
     if use_cache:
@@ -207,7 +204,9 @@ def schema_to_type(
        namespace={
            "__post_init__": lambda self: PackableSample.__post_init__(self),
            "__schema_version__": version,
-            "__schema_ref__": schema.get(
+            "__schema_ref__": schema.get(
+                "$ref", None
+            ),  # Store original ref if available
        },
    )
 
@@ -243,7 +242,9 @@ def _field_type_to_stub_str(field_type: dict, optional: bool = False) -> str:
 
    if kind == "primitive":
        primitive = field_type.get("primitive", "str")
-        py_type =
+        py_type = (
+            primitive  # str, int, float, bool, bytes are all valid Python type names
+        )
    elif kind == "ndarray":
        py_type = "NDArray[Any]"
    elif kind == "array":
@@ -282,14 +283,12 @@ def generate_stub(schema: dict) -> str:
    Returns:
        String content for a .pyi stub file.
 
-
-
-
-
-
-
-        >>> with open("stubs/my_sample.pyi", "w") as f:
-        ...     f.write(stub_content)
+    Examples:
+        >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
+        >>> stub_content = generate_stub(schema.to_dict())
+        >>> # Save to a stubs directory configured in your IDE
+        >>> with open("stubs/my_sample.pyi", "w") as f:
+        ...     f.write(stub_content)
    """
    name = schema.get("name", "UnknownSample")
    version = schema.get("version", "1.0.0")
@@ -360,12 +359,10 @@ def generate_module(schema: dict) -> str:
    Returns:
        String content for a .py module file.
 
-
-
-
-
-        >>> module_content = generate_module(schema.to_dict())
-        >>> # The module can be imported after being saved
+    Examples:
+        >>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
+        >>> module_content = generate_module(schema.to_dict())
+        >>> # The module can be imported after being saved
    """
    name = schema.get("name", "UnknownSample")
    version = schema.get("version", "1.0.0")
@@ -424,7 +421,7 @@ def clear_type_cache() -> None:
    _type_cache.clear()
 
 
-def get_cached_types() -> dict[str, Type[
+def get_cached_types() -> dict[str, Type[Packable]]:
    """Get a copy of the current type cache.
 
    Returns: