atdata 0.2.3b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff shows the contents of two publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- atdata/.gitignore +1 -0
- atdata/__init__.py +30 -0
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +29 -15
- atdata/_hf_api.py +63 -11
- atdata/_logging.py +70 -0
- atdata/_protocols.py +19 -62
- atdata/_schema_codec.py +5 -4
- atdata/_type_utils.py +28 -2
- atdata/atmosphere/__init__.py +19 -9
- atdata/atmosphere/records.py +3 -2
- atdata/atmosphere/schema.py +2 -2
- atdata/cli/__init__.py +157 -171
- atdata/cli/inspect.py +69 -0
- atdata/cli/local.py +1 -1
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +428 -326
- atdata/lens.py +9 -2
- atdata/local/__init__.py +71 -0
- atdata/local/_entry.py +157 -0
- atdata/local/_index.py +940 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/local/_s3.py +349 -0
- atdata/local/_schema.py +380 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +4 -4
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/testing.py +337 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +4 -1
- atdata-0.3.0b1.dist-info/RECORD +54 -0
- atdata/local.py +0 -1720
- atdata-0.2.3b1.dist-info/RECORD +0 -28
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/.gitignore
ADDED
@@ -0,0 +1 @@
+!manifest/
atdata/__init__.py
CHANGED
@@ -55,6 +55,8 @@ from .lens import (
 from ._hf_api import (
     load_dataset as load_dataset,
     DatasetDict as DatasetDict,
+    get_default_index as get_default_index,
+    set_default_index as set_default_index,
 )

 from ._protocols import (
@@ -71,10 +73,29 @@ from ._sources import (
     BlobSource as BlobSource,
 )

+from ._exceptions import (
+    AtdataError as AtdataError,
+    LensNotFoundError as LensNotFoundError,
+    SchemaError as SchemaError,
+    SampleKeyError as SampleKeyError,
+    ShardError as ShardError,
+    PartialFailureError as PartialFailureError,
+)
+
 from ._schema_codec import (
     schema_to_type as schema_to_type,
 )

+from ._logging import (
+    configure_logging as configure_logging,
+    get_logger as get_logger,
+)
+
+from .repository import (
+    Repository as Repository,
+    create_repository as create_repository,
+)
+
 from ._cid import (
     generate_cid as generate_cid,
     verify_cid as verify_cid,
@@ -84,6 +105,15 @@ from .promote import (
     promote_to_atmosphere as promote_to_atmosphere,
 )

+from .manifest import (
+    ManifestField as ManifestField,
+    ManifestBuilder as ManifestBuilder,
+    ShardManifest as ShardManifest,
+    ManifestWriter as ManifestWriter,
+    QueryExecutor as QueryExecutor,
+    SampleLocation as SampleLocation,
+)
+
 # ATProto integration (lazy import to avoid requiring atproto package)
 from . import atmosphere as atmosphere
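Taken together, these re-exports widen the package's public root: exceptions, logging hooks, repository helpers, and manifest types no longer require reaching into private submodules. A sketch of the intended usage (illustrative only; it relies on the indexed-path default added in `_hf_api.py` below, and the dataset name is hypothetical):

    import atdata

    log = atdata.get_logger()
    try:
        ds = atdata.load_dataset("@local/my-dataset", split="train")
    except atdata.AtdataError as err:
        log.error("failed to load dataset: %s", err)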
atdata/_exceptions.py
ADDED
@@ -0,0 +1,168 @@
+"""Custom exception hierarchy for atdata.
+
+Provides actionable error messages with contextual help, available
+alternatives, and suggested fix code snippets.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Type
+
+
+class AtdataError(Exception):
+    """Base exception for all atdata errors."""
+
+
+class LensNotFoundError(AtdataError, ValueError):
+    """No lens registered to transform between two sample types.
+
+    Attributes:
+        source_type: The source sample type.
+        view_type: The target view type.
+        available_targets: Types reachable from the source via registered lenses.
+    """
+
+    def __init__(
+        self,
+        source_type: Type,
+        view_type: Type,
+        available_targets: list[tuple[Type, str]] | None = None,
+    ) -> None:
+        self.source_type = source_type
+        self.view_type = view_type
+        self.available_targets = available_targets or []
+
+        src_name = source_type.__name__
+        view_name = view_type.__name__
+
+        lines = [f"No lens transforms {src_name} \u2192 {view_name}"]
+
+        if self.available_targets:
+            lines.append("")
+            lines.append(f"Available lenses from {src_name}:")
+            for target_type, lens_name in self.available_targets:
+                lines.append(
+                    f"  - {src_name} \u2192 {target_type.__name__} (via {lens_name})"
+                )
+
+        lines.append("")
+        lines.append("Did you mean to define:")
+        lines.append("    @lens")
+        lines.append(
+            f"    def {src_name.lower()}_to_{view_name.lower()}(source: {src_name}) -> {view_name}:"
+        )
+        lines.append(f"        return {view_name}(...)")
+
+        super().__init__("\n".join(lines))
+
+
+class SchemaError(AtdataError):
+    """Schema mismatch during sample deserialization.
+
+    Raised when the data in a shard doesn't match the expected sample type.
+
+    Attributes:
+        expected_fields: Fields expected by the sample type.
+        actual_fields: Fields found in the data.
+        sample_type_name: Name of the target sample type.
+    """
+
+    def __init__(
+        self,
+        sample_type_name: str,
+        expected_fields: list[str],
+        actual_fields: list[str],
+    ) -> None:
+        self.sample_type_name = sample_type_name
+        self.expected_fields = expected_fields
+        self.actual_fields = actual_fields
+
+        missing = sorted(set(expected_fields) - set(actual_fields))
+        extra = sorted(set(actual_fields) - set(expected_fields))
+
+        lines = [f"Schema mismatch for {sample_type_name}"]
+        if missing:
+            lines.append(f"  Missing fields: {', '.join(missing)}")
+        if extra:
+            lines.append(f"  Unexpected fields: {', '.join(extra)}")
+        lines.append("")
+        lines.append(f"Expected: {', '.join(sorted(expected_fields))}")
+        lines.append(f"Got: {', '.join(sorted(actual_fields))}")
+
+        super().__init__("\n".join(lines))
+
+
+class SampleKeyError(AtdataError, KeyError):
+    """Sample with the given key was not found in the dataset.
+
+    Attributes:
+        key: The key that was not found.
+    """
+
+    def __init__(self, key: str) -> None:
+        self.key = key
+        super().__init__(
+            f"Sample with key '{key}' not found in dataset. "
+            f"Note: key lookup requires scanning all shards and is O(n)."
+        )
+
+
+class ShardError(AtdataError):
+    """Error accessing or reading a dataset shard.
+
+    Attributes:
+        shard_id: Identifier of the shard that failed.
+        reason: Human-readable description of what went wrong.
+    """
+
+    def __init__(self, shard_id: str, reason: str) -> None:
+        self.shard_id = shard_id
+        self.reason = reason
+        super().__init__(f"Failed to read shard '{shard_id}': {reason}")
+
+
+class PartialFailureError(AtdataError):
+    """Some shards succeeded but others failed during processing.
+
+    Raised by :meth:`Dataset.process_shards` when at least one shard fails.
+    Provides access to both the successful results and the per-shard errors,
+    enabling retry of only the failed shards.
+
+    Attributes:
+        succeeded_shards: List of shard identifiers that succeeded.
+        failed_shards: List of shard identifiers that failed.
+        errors: Mapping from shard identifier to the exception that occurred.
+        results: Mapping from shard identifier to the result for succeeded shards.
+    """
+
+    def __init__(
+        self,
+        succeeded_shards: list[str],
+        failed_shards: list[str],
+        errors: dict[str, Exception],
+        results: dict[str, object],
+    ) -> None:
+        self.succeeded_shards = succeeded_shards
+        self.failed_shards = failed_shards
+        self.errors = errors
+        self.results = results
+
+        n_ok = len(succeeded_shards)
+        n_fail = len(failed_shards)
+        total = n_ok + n_fail
+
+        lines = [f"{n_fail}/{total} shards failed during processing"]
+        for shard_id in failed_shards[:5]:
+            lines.append(f"  {shard_id}: {errors[shard_id]}")
+        if n_fail > 5:
+            lines.append(f"  ... and {n_fail - 5} more")
+        lines.append("")
+        lines.append(
+            f"Access .succeeded_shards ({n_ok}) and .failed_shards ({n_fail}) "
+            f"to inspect or retry."
+        )
+
+        super().__init__("\n".join(lines))
atdata/_helpers.py
CHANGED
@@ -1,8 +1,7 @@
 """Helper utilities for numpy array serialization.

 This module provides utility functions for converting numpy arrays to and from
-bytes for msgpack serialization.
-format to preserve array dtype and shape information.
+bytes for msgpack serialization.

 Functions:
 - ``array_to_bytes()``: Serialize numpy array to bytes
@@ -15,10 +14,14 @@ handling of NDArray fields during msgpack packing/unpacking.
 ##
 # Imports

+import struct
 from io import BytesIO

 import numpy as np

+# .npy format magic prefix (used for backward-compatible deserialization)
+_NPY_MAGIC = b"\x93NUMPY"
+

 ##

@@ -26,35 +29,46 @@ import numpy as np
 def array_to_bytes(x: np.ndarray) -> bytes:
     """Convert a numpy array to bytes for msgpack serialization.

-    Uses
+    Uses a compact binary format: a short header (dtype + shape) followed by
+    raw array bytes via ``ndarray.tobytes()``. Falls back to numpy's ``.npy``
+    format for object dtypes that cannot be represented as raw bytes.

     Args:
         x: A numpy array to serialize.

     Returns:
         Raw bytes representing the serialized array.
-
-    Note:
-        Uses ``allow_pickle=True`` to support object dtypes.
     """
-
-
-
+    if x.dtype == object:
+        buf = BytesIO()
+        np.save(buf, x, allow_pickle=True)
+        return buf.getvalue()
+
+    dtype_str = x.dtype.str.encode()  # e.g. b'<f4'
+    header = struct.pack(f"<B{len(x.shape)}q", len(x.shape), *x.shape)
+    return struct.pack("<B", len(dtype_str)) + dtype_str + header + x.tobytes()


 def bytes_to_array(b: bytes) -> np.ndarray:
     """Convert serialized bytes back to a numpy array.

-
+    Transparently handles both the compact format produced by the current
+    ``array_to_bytes()`` and the legacy ``.npy`` format.

     Args:
         b: Raw bytes from a serialized numpy array.

     Returns:
         The deserialized numpy array with original dtype and shape.
-
-    Note:
-        Uses ``allow_pickle=True`` to support object dtypes.
     """
-
-
+    if b[:6] == _NPY_MAGIC:
+        return np.load(BytesIO(b), allow_pickle=True)
+
+    # Compact format: dtype_len(1B) + dtype_str + ndim(1B) + shape(ndim×8B) + data
+    dlen = b[0]
+    dtype = np.dtype(b[1 : 1 + dlen].decode())
+    ndim = b[1 + dlen]
+    offset = 2 + dlen
+    shape = struct.unpack_from(f"<{ndim}q", b, offset)
+    offset += ndim * 8
+    return np.frombuffer(b, dtype=dtype, offset=offset).reshape(shape).copy()
atdata/_hf_api.py
CHANGED
@@ -29,6 +29,7 @@ Examples:
 from __future__ import annotations

 import re
+import threading
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
@@ -40,9 +41,9 @@ from typing import (
     overload,
 )

-from .dataset import Dataset,
+from .dataset import Dataset, DictSample
 from ._sources import URLSource, S3Source
-from ._protocols import DataSource
+from ._protocols import DataSource, Packable

 if TYPE_CHECKING:
     from ._protocols import AbstractIndex
@@ -50,7 +51,60 @@ if TYPE_CHECKING:
 ##
 # Type variables

-ST = TypeVar("ST", bound=
+ST = TypeVar("ST", bound=Packable)
+
+
+##
+# Default Index singleton
+
+_default_index: "Index | None" = None  # noqa: F821 (forward ref)
+_default_index_lock = threading.Lock()
+
+
+def get_default_index() -> "Index":  # noqa: F821
+    """Get or create the module-level default Index.
+
+    The default Index uses Redis for local storage (backwards-compatible
+    default) and an anonymous AtmosphereClient for read-only public data
+    resolution.
+
+    The default is created lazily on first access and cached for the
+    lifetime of the process.
+
+    Returns:
+        The default Index instance.
+
+    Examples:
+        >>> index = get_default_index()
+        >>> entry = index.get_dataset("local/mnist")
+    """
+    global _default_index
+    if _default_index is None:
+        with _default_index_lock:
+            if _default_index is None:
+                from .local import Index
+
+                _default_index = Index()
+    return _default_index
+
+
+def set_default_index(index: "Index") -> None:  # noqa: F821
+    """Override the module-level default Index.
+
+    Use this to configure a custom default Index with specific repositories,
+    an authenticated atmosphere client, or non-default providers.
+
+    Args:
+        index: The Index instance to use as the default.
+
+    Examples:
+        >>> from atdata.local import Index
+        >>> from atdata.providers import create_provider
+        >>> custom = Index(provider=create_provider("sqlite"))
+        >>> set_default_index(custom)
+    """
+    global _default_index
+    _default_index = index


 ##
@@ -74,10 +128,11 @@ class DatasetDict(Generic[ST], dict):
     >>>
     >>> # Iterate over all splits
     >>> for split_name, dataset in ds_dict.items():
-    ...     print(f"{split_name}: {len(dataset.
+    ...     print(f"{split_name}: {len(dataset.list_shards())} shards")
     """

-    #
+    # Note: The docstring uses "Parameters:" for type parameters as a workaround
+    # for quartodoc not supporting "Type Parameters:" sections.

     def __init__(
         self,
@@ -459,7 +514,7 @@ def _resolve_indexed_path(
     handle_or_did, dataset_name = _parse_indexed_path(path)

     # For AtmosphereIndex, we need to resolve handle to DID first
-    # For
+    # For local Index, the handle is ignored and we just look up by name
     entry = index.get_dataset(dataset_name)
     data_urls = entry.data_urls

@@ -624,16 +679,13 @@ def load_dataset(
     >>> train_ds = load_dataset("./data/train-*.tar", TextData, split="train")
     >>>
    >>> # Load from index with auto-type resolution
-    >>> index =
+    >>> index = Index()
     >>> ds = load_dataset("@local/my-dataset", index=index, split="train")
     """
     # Handle @handle/dataset indexed path resolution
     if _is_indexed_path(path):
         if index is None:
-
-                f"Index required for indexed path: {path}. "
-                "Pass index=LocalIndex() or index=AtmosphereIndex(client)."
-            )
+            index = get_default_index()

         source, schema_ref = _resolve_indexed_path(path, index)
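The double-checked locking in `get_default_index()` makes the lazy singleton safe under concurrent first access, and `load_dataset` now falls back to it instead of raising. A sketch of the resulting call patterns, built from the docstrings above:

    import atdata

    # Indexed paths no longer require an explicit index; the process-wide
    # default is created lazily on first use.
    ds = atdata.load_dataset("@local/my-dataset", split="train")

    # Or install a custom default once at startup:
    from atdata.local import Index
    from atdata.providers import create_provider

    atdata.set_default_index(Index(provider=create_provider("sqlite")))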
atdata/_logging.py
ADDED
@@ -0,0 +1,70 @@
+"""Pluggable logging for atdata.
+
+Provides a thin abstraction over Python's stdlib ``logging`` module that can
+be replaced with ``structlog`` or any other logger implementing the standard
+``debug``/``info``/``warning``/``error`` interface.
+
+Usage::
+
+    # Default: stdlib logging (no config needed)
+    from atdata._logging import get_logger
+    log = get_logger()
+    log.info("processing shard", extra={"shard": "data-000.tar"})
+
+    # Plug in structlog (or any compatible logger):
+    import structlog
+    import atdata
+    atdata.configure_logging(structlog.get_logger())
+
+The module also exports a lightweight ``LoggerProtocol`` for type checking
+custom logger implementations.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Protocol, runtime_checkable
+
+
+@runtime_checkable
+class LoggerProtocol(Protocol):
+    """Minimal interface that a pluggable logger must satisfy."""
+
+    def debug(self, msg: str, *args: Any, **kwargs: Any) -> None: ...
+    def info(self, msg: str, *args: Any, **kwargs: Any) -> None: ...
+    def warning(self, msg: str, *args: Any, **kwargs: Any) -> None: ...
+    def error(self, msg: str, *args: Any, **kwargs: Any) -> None: ...
+
+
+# ---------------------------------------------------------------------------
+# Module-level state
+# ---------------------------------------------------------------------------
+
+_logger: LoggerProtocol = logging.getLogger("atdata")
+
+
+def configure_logging(logger: LoggerProtocol) -> None:
+    """Replace the default logger with a custom implementation.
+
+    The provided logger must implement ``debug``, ``info``, ``warning``, and
+    ``error`` methods. Both ``structlog`` bound loggers and stdlib
+    ``logging.Logger`` instances satisfy this interface.
+
+    Args:
+        logger: A logger instance implementing :class:`LoggerProtocol`.
+
+    Examples:
+        >>> import structlog
+        >>> atdata.configure_logging(structlog.get_logger())
+    """
+    global _logger
+    _logger = logger
+
+
+def get_logger() -> LoggerProtocol:
+    """Return the currently configured logger.
+
+    Returns the stdlib ``logging.getLogger("atdata")`` by default, or
+    whatever was last set via :func:`configure_logging`.
+    """
+    return _logger
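Because `LoggerProtocol` is structural, any object with those four methods can be plugged in, not just stdlib or structlog loggers. A sketch with a hypothetical adapter (the `PrefixLogger` class is illustrative, not part of the package):

    import logging

    from atdata import configure_logging, get_logger

    class PrefixLogger:
        """Hypothetical adapter; any object with these four methods qualifies."""

        def __init__(self, inner: logging.Logger, prefix: str) -> None:
            self._inner = inner
            self._prefix = prefix

        def debug(self, msg: str, *args, **kwargs) -> None:
            self._inner.debug(self._prefix + msg, *args, **kwargs)

        def info(self, msg: str, *args, **kwargs) -> None:
            self._inner.info(self._prefix + msg, *args, **kwargs)

        def warning(self, msg: str, *args, **kwargs) -> None:
            self._inner.warning(self._prefix + msg, *args, **kwargs)

        def error(self, msg: str, *args, **kwargs) -> None:
            self._inner.error(self._prefix + msg, *args, **kwargs)

    configure_logging(PrefixLogger(logging.getLogger("atdata"), "[worker] "))
    get_logger().info("ready")  # logs "[worker] ready"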
atdata/_protocols.py
CHANGED
@@ -10,7 +10,7 @@ formalize that common interface.
 Note:
     Protocol methods use ``...`` (Ellipsis) as the body per PEP 544. This is
     the standard Python syntax for Protocol definitions - these are interface
-    specifications, not stub implementations. Concrete classes (
+    specifications, not stub implementations. Concrete classes (Index,
     AtmosphereIndex, etc.) provide the actual implementations.

 Protocols:
@@ -24,7 +24,7 @@ Examples:
     ...     for entry in index.list_datasets():
     ...         print(f"{entry.name}: {entry.data_urls}")
     ...
-    >>> # Works with either
+    >>> # Works with either Index or AtmosphereIndex
     >>> process_datasets(local_index)
     >>> process_datasets(atmosphere_index)
 """
@@ -77,24 +77,16 @@ class Packable(Protocol):
     """

     @classmethod
-    def from_data(cls, data: dict[str, Any]) -> "Packable":
-        """Create instance from unpacked msgpack data dictionary."""
-        ...
+    def from_data(cls, data: dict[str, Any]) -> "Packable": ...

     @classmethod
-    def from_bytes(cls, bs: bytes) -> "Packable":
-        """Create instance from raw msgpack bytes."""
-        ...
+    def from_bytes(cls, bs: bytes) -> "Packable": ...

     @property
-    def packed(self) -> bytes:
-        """Pack this sample's data into msgpack bytes."""
-        ...
+    def packed(self) -> bytes: ...

     @property
-    def as_wds(self) -> dict[str, Any]:
-        """WebDataset-compatible representation with __key__ and msgpack."""
-        ...
+    def as_wds(self) -> dict[str, Any]: ...


 ##
@@ -116,16 +108,14 @@ class IndexEntry(Protocol):
     """

     @property
-    def name(self) -> str:
-        """Human-readable dataset name."""
-        ...
+    def name(self) -> str: ...

     @property
     def schema_ref(self) -> str:
-        """
+        """Schema reference string.

-
-
+        Local: ``local://schemas/{module.Class}@{version}``
+        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.sampleSchema/...``
         """
         ...

@@ -139,9 +129,7 @@ class IndexEntry(Protocol):
     ...

     @property
-    def metadata(self) -> Optional[dict]:
-        """Arbitrary metadata dictionary, or None if not set."""
-        ...
+    def metadata(self) -> Optional[dict]: ...


 ##
@@ -149,7 +137,7 @@ class IndexEntry(Protocol):


 class AbstractIndex(Protocol):
-    """Protocol for index operations - implemented by
+    """Protocol for index operations - implemented by Index and AtmosphereIndex.

     This protocol defines the common interface for managing dataset metadata:
     - Publishing and retrieving schemas
@@ -239,21 +227,9 @@ class AbstractIndex(Protocol):
         ...

     @property
-    def datasets(self) -> Iterator[IndexEntry]:
-        """Lazily iterate over all dataset entries in this index.
+    def datasets(self) -> Iterator[IndexEntry]: ...

-
-            IndexEntry for each dataset (may be of different sample types).
-        """
-        ...
-
-    def list_datasets(self) -> list[IndexEntry]:
-        """Get all dataset entries as a materialized list.
-
-        Returns:
-            List of IndexEntry for each dataset.
-        """
-        ...
+    def list_datasets(self) -> list[IndexEntry]: ...

     # Schema operations

@@ -299,21 +275,9 @@ class AbstractIndex(Protocol):
         ...

     @property
-    def schemas(self) -> Iterator[dict]:
-        """Lazily iterate over all schema records in this index.
+    def schemas(self) -> Iterator[dict]: ...

-
-            Schema records as dictionaries.
-        """
-        ...
-
-    def list_schemas(self) -> list[dict]:
-        """Get all schema records as a materialized list.
-
-        Returns:
-            List of schema records as dictionaries.
-        """
-        ...
+    def list_schemas(self) -> list[dict]: ...

     def decode_schema(self, ref: str) -> Type[Packable]:
         """Reconstruct a Python Packable type from a stored schema.
@@ -401,14 +365,7 @@ class AbstractDataStore(Protocol):
         """
         ...

-    def supports_streaming(self) -> bool:
-        """Whether this store supports streaming reads.
-
-        Returns:
-            True if the store supports efficient streaming (like S3),
-            False if data must be fully downloaded first.
-        """
-        ...
+    def supports_streaming(self) -> bool: ...


 ##
@@ -481,13 +438,13 @@ class DataSource(Protocol):
         only its assigned shards rather than iterating all shards.

         Args:
-            shard_id: Shard identifier from
+            shard_id: Shard identifier from list_shards().

         Returns:
             File-like stream for reading the shard.

         Raises:
-            KeyError: If shard_id is not in
+            KeyError: If shard_id is not in list_shards().
         """
         ...

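The trimmed protocol bodies change nothing structurally: any class exposing those four members still satisfies `Packable`. A minimal hand-written conformer for illustration — the class, its fields, and the exact `as_wds` key layout are assumptions based on the removed docstrings, which describe a `__key__` plus msgpack payload:

    from typing import Any

    import msgpack  # assumption: samples round-trip through msgpack

    class PointSample:  # hypothetical sample type
        def __init__(self, x: float, y: float, key: str = "0") -> None:
            self.x, self.y, self.key = x, y, key

        @classmethod
        def from_data(cls, data: dict[str, Any]) -> "PointSample":
            return cls(data["x"], data["y"])

        @classmethod
        def from_bytes(cls, bs: bytes) -> "PointSample":
            return cls.from_data(msgpack.unpackb(bs))

        @property
        def packed(self) -> bytes:
            return msgpack.packb({"x": self.x, "y": self.y})

        @property
        def as_wds(self) -> dict[str, Any]:
            # "__key__" plus a msgpack payload, per the removed as_wds docstring
            return {"__key__": self.key, "msgpack": self.packed}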