atdata 0.2.3b1-py3-none-any.whl → 0.3.1b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +39 -0
  3. atdata/_cid.py +0 -21
  4. atdata/_exceptions.py +168 -0
  5. atdata/_helpers.py +41 -15
  6. atdata/_hf_api.py +95 -11
  7. atdata/_logging.py +70 -0
  8. atdata/_protocols.py +77 -238
  9. atdata/_schema_codec.py +7 -6
  10. atdata/_stub_manager.py +5 -25
  11. atdata/_type_utils.py +28 -2
  12. atdata/atmosphere/__init__.py +31 -20
  13. atdata/atmosphere/_types.py +4 -4
  14. atdata/atmosphere/client.py +64 -12
  15. atdata/atmosphere/lens.py +11 -12
  16. atdata/atmosphere/records.py +12 -12
  17. atdata/atmosphere/schema.py +16 -18
  18. atdata/atmosphere/store.py +6 -7
  19. atdata/cli/__init__.py +161 -175
  20. atdata/cli/diagnose.py +2 -2
  21. atdata/cli/{local.py → infra.py} +11 -11
  22. atdata/cli/inspect.py +69 -0
  23. atdata/cli/preview.py +63 -0
  24. atdata/cli/schema.py +109 -0
  25. atdata/dataset.py +583 -328
  26. atdata/index/__init__.py +54 -0
  27. atdata/index/_entry.py +157 -0
  28. atdata/index/_index.py +1198 -0
  29. atdata/index/_schema.py +380 -0
  30. atdata/lens.py +9 -2
  31. atdata/lexicons/__init__.py +121 -0
  32. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  34. atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
  35. atdata/lexicons/ac.foundation.dataset.record.json +96 -0
  36. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  37. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  38. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
  39. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  40. atdata/lexicons/ndarray_shim.json +16 -0
  41. atdata/local/__init__.py +70 -0
  42. atdata/local/_repo_legacy.py +218 -0
  43. atdata/manifest/__init__.py +28 -0
  44. atdata/manifest/_aggregates.py +156 -0
  45. atdata/manifest/_builder.py +163 -0
  46. atdata/manifest/_fields.py +154 -0
  47. atdata/manifest/_manifest.py +146 -0
  48. atdata/manifest/_query.py +150 -0
  49. atdata/manifest/_writer.py +74 -0
  50. atdata/promote.py +18 -14
  51. atdata/providers/__init__.py +25 -0
  52. atdata/providers/_base.py +140 -0
  53. atdata/providers/_factory.py +69 -0
  54. atdata/providers/_postgres.py +214 -0
  55. atdata/providers/_redis.py +171 -0
  56. atdata/providers/_sqlite.py +191 -0
  57. atdata/repository.py +323 -0
  58. atdata/stores/__init__.py +23 -0
  59. atdata/stores/_disk.py +123 -0
  60. atdata/stores/_s3.py +349 -0
  61. atdata/testing.py +341 -0
  62. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +5 -2
  63. atdata-0.3.1b1.dist-info/RECORD +67 -0
  64. atdata/local.py +0 -1720
  65. atdata-0.2.3b1.dist-info/RECORD +0 -28
  66. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
  67. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
  68. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
atdata/manifest/_query.py ADDED
@@ -0,0 +1,150 @@
+"""Query executor for manifest-based dataset queries.
+
+Provides two-phase filtering: shard-level pruning via aggregates,
+then sample-level filtering via the parquet DataFrame.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable
+
+import pandas as pd
+
+from ._manifest import ShardManifest
+
+
+@dataclass(frozen=True)
+class SampleLocation:
+    """Location of a sample within a shard.
+
+    Attributes:
+        shard: Shard identifier or URL.
+        key: WebDataset ``__key__`` for the sample.
+        offset: Byte offset within the tar file.
+
+    Examples:
+        >>> loc = SampleLocation(shard="data/shard-000000", key="sample_00042", offset=52480)
+        >>> loc.shard
+        'data/shard-000000'
+    """
+
+    shard: str
+    key: str
+    offset: int
+
+
+class QueryExecutor:
+    """Executes queries over per-shard manifests.
+
+    Performs two-phase filtering:
+
+    1. **Shard-level**: uses aggregates to skip shards that cannot contain
+       matching samples (e.g., numeric range exclusion, categorical value absence).
+    2. **Sample-level**: applies the predicate to the parquet DataFrame rows.
+
+    Args:
+        manifests: List of ``ShardManifest`` objects to query over.
+
+    Examples:
+        >>> executor = QueryExecutor(manifests)
+        >>> results = executor.query(
+        ...     where=lambda df: (df["confidence"] > 0.9) & (df["label"].isin(["dog", "cat"]))
+        ... )
+        >>> len(results)
+        42
+    """
+
+    def __init__(self, manifests: list[ShardManifest]) -> None:
+        self._manifests = manifests
+
+    def query(
+        self,
+        where: Callable[[pd.DataFrame], pd.Series],
+    ) -> list[SampleLocation]:
+        """Execute a query across all manifests.
+
+        The ``where`` callable receives a pandas DataFrame with the per-sample
+        manifest columns and must return a boolean Series selecting matching rows.
+
+        Args:
+            where: Predicate function. Receives a DataFrame, returns a boolean Series.
+
+        Returns:
+            List of ``SampleLocation`` for all matching samples.
+        """
+        results: list[SampleLocation] = []
+
+        for manifest in self._manifests:
+            if manifest.samples.empty:
+                continue
+
+            mask = where(manifest.samples)
+            matching = manifest.samples[mask]
+
+            for _, row in matching.iterrows():
+                results.append(
+                    SampleLocation(
+                        shard=manifest.shard_id,
+                        key=row["__key__"],
+                        offset=int(row["__offset__"]),
+                    )
+                )
+
+        return results
+
+    @classmethod
+    def from_directory(cls, directory: str | Path) -> QueryExecutor:
+        """Load all manifests from a directory.
+
+        Discovers ``*.manifest.json`` files and loads each with its
+        companion parquet file.
+
+        Args:
+            directory: Path to scan for manifest files.
+
+        Returns:
+            A ``QueryExecutor`` loaded with all discovered manifests.
+
+        Raises:
+            FileNotFoundError: If the directory does not exist.
+        """
+        directory = Path(directory)
+        manifests: list[ShardManifest] = []
+
+        for json_path in sorted(directory.glob("*.manifest.json")):
+            parquet_path = json_path.with_suffix("").with_suffix(".manifest.parquet")
+            if parquet_path.exists():
+                manifests.append(ShardManifest.from_files(json_path, parquet_path))
+            else:
+                manifests.append(ShardManifest.from_json_only(json_path))
+
+        return cls(manifests)
+
+    @classmethod
+    def from_shard_urls(cls, shard_urls: list[str]) -> QueryExecutor:
+        """Load manifests corresponding to a list of shard URLs.
+
+        Derives manifest paths by replacing the ``.tar`` extension with
+        ``.manifest.json`` and ``.manifest.parquet``.
+
+        Args:
+            shard_urls: List of shard file paths or URLs.
+
+        Returns:
+            A ``QueryExecutor`` with manifests for shards that have them.
+        """
+        manifests: list[ShardManifest] = []
+
+        for url in shard_urls:
+            base = url.removesuffix(".tar")
+            json_path = Path(f"{base}.manifest.json")
+            parquet_path = Path(f"{base}.manifest.parquet")
+
+            if json_path.exists() and parquet_path.exists():
+                manifests.append(ShardManifest.from_files(json_path, parquet_path))
+            elif json_path.exists():
+                manifests.append(ShardManifest.from_json_only(json_path))
+
+        return cls(manifests)
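
The hunk above is the whole query path for the new manifest system. Below is a minimal usage sketch based only on the API visible in this hunk; it imports from the private module because the ``atdata.manifest`` re-exports are not shown here, and the directory path and column names are illustrative (the columns match the docstring example above).

from atdata.manifest._query import QueryExecutor

# Load every *.manifest.json (plus its companion parquet) under a directory.
executor = QueryExecutor.from_directory("/data/manifests")  # path is illustrative

# The predicate receives the per-sample DataFrame and returns a boolean Series;
# shards whose aggregates rule out any match are skipped before this runs.
locations = executor.query(
    where=lambda df: (df["confidence"] > 0.9) & (df["label"].isin(["dog", "cat"]))
)

for loc in locations:
    print(loc.shard, loc.key, loc.offset)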
atdata/manifest/_writer.py ADDED
@@ -0,0 +1,74 @@
+"""ManifestWriter for serializing ShardManifest to JSON + parquet files."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from ._manifest import ShardManifest
+
+
+class ManifestWriter:
+    """Writes a ``ShardManifest`` to companion JSON and parquet files.
+
+    Produces two files alongside each shard:
+
+    - ``{base_path}.manifest.json`` -- header with metadata and aggregates
+    - ``{base_path}.manifest.parquet`` -- per-sample metadata (columnar)
+
+    Args:
+        base_path: The shard path without the ``.tar`` extension.
+
+    Examples:
+        >>> writer = ManifestWriter("/data/shard-000000")
+        >>> json_path, parquet_path = writer.write(manifest)
+    """
+
+    def __init__(self, base_path: str | Path) -> None:
+        self._base_path = Path(base_path)
+
+    @property
+    def json_path(self) -> Path:
+        """Path for the JSON header file."""
+        return self._base_path.with_suffix(".manifest.json")
+
+    @property
+    def parquet_path(self) -> Path:
+        """Path for the parquet per-sample file."""
+        return self._base_path.with_suffix(".manifest.parquet")
+
+    def write(self, manifest: ShardManifest) -> tuple[Path, Path]:
+        """Write the manifest to JSON + parquet files.
+
+        Args:
+            manifest: The ``ShardManifest`` to serialize.
+
+        Returns:
+            Tuple of ``(json_path, parquet_path)``.
+        """
+        json_out = self.json_path
+        parquet_out = self.parquet_path
+
+        # Ensure parent directory exists
+        json_out.parent.mkdir(parents=True, exist_ok=True)
+
+        # Write JSON header + aggregates
+        with open(json_out, "w", encoding="utf-8") as f:
+            json.dump(manifest.header_dict(), f, indent=2)
+
+        # Write per-sample parquet
+        if not manifest.samples.empty:
+            manifest.samples.to_parquet(
+                parquet_out,
+                engine="fastparquet",
+                index=False,
+            )
+        else:
+            # Write an empty parquet with no rows
+            manifest.samples.to_parquet(
+                parquet_out,
+                engine="fastparquet",
+                index=False,
+            )
+
+        return json_out, parquet_out
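
A short sketch of the write half of the round trip implied by the docstrings above. It assumes an existing ``ShardManifest`` instance (its construction lives in ``atdata/manifest/_builder.py``, which is not part of this hunk) and imports from the private module for the same reason as the previous sketch.

from atdata.manifest._writer import ManifestWriter

# `manifest` is assumed to be a ShardManifest built elsewhere (see _builder.py).
writer = ManifestWriter("/data/shard-000000")   # shard path minus the ".tar" suffix
json_path, parquet_path = writer.write(manifest)

print(json_path)     # /data/shard-000000.manifest.json
print(parquet_path)  # /data/shard-000000.manifest.parquet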
atdata/promote.py CHANGED
@@ -5,30 +5,29 @@ ATProto atmosphere network. This enables sharing datasets with the broader
 federation while maintaining schema consistency.
 
 Examples:
-    >>> from atdata.local import LocalIndex, Repo
-    >>> from atdata.atmosphere import AtmosphereClient, AtmosphereIndex
+    >>> from atdata.local import Index, Repo
+    >>> from atdata.atmosphere import Atmosphere
     >>> from atdata.promote import promote_to_atmosphere
    >>>
     >>> # Setup
-    >>> local_index = LocalIndex()
-    >>> client = AtmosphereClient()
-    >>> client.login("handle.bsky.social", "app-password")
+    >>> local_index = Index()
+    >>> atmo = Atmosphere.login("handle.bsky.social", "app-password")
    >>>
     >>> # Promote a dataset
     >>> entry = local_index.get_dataset("my-dataset")
-    >>> at_uri = promote_to_atmosphere(entry, local_index, client)
+    >>> at_uri = promote_to_atmosphere(entry, local_index, atmo)
 """
 
 from typing import TYPE_CHECKING, Type
 
 if TYPE_CHECKING:
-    from .local import LocalDatasetEntry, Index as LocalIndex
-    from .atmosphere import AtmosphereClient
+    from .local import LocalDatasetEntry, Index
+    from .atmosphere import Atmosphere
     from ._protocols import AbstractDataStore, Packable
 
 
 def _find_existing_schema(
-    client: "AtmosphereClient",
+    client: "Atmosphere",
     name: str,
     version: str,
 ) -> str | None:
@@ -55,7 +54,7 @@ def _find_existing_schema(
 def _find_or_publish_schema(
     sample_type: "Type[Packable]",
     version: str,
-    client: "AtmosphereClient",
+    client: "Atmosphere",
     description: str | None = None,
 ) -> str:
     """Find existing schema or publish a new one.
@@ -94,8 +93,8 @@ def _find_or_publish_schema(
 
 def promote_to_atmosphere(
     local_entry: "LocalDatasetEntry",
-    local_index: "LocalIndex",
-    atmosphere_client: "AtmosphereClient",
+    local_index: "Index",
+    atmosphere_client: "Atmosphere",
     *,
     data_store: "AbstractDataStore | None" = None,
     name: str | None = None,
@@ -108,10 +107,15 @@ def promote_to_atmosphere(
     This function takes a locally-indexed dataset and publishes it to ATProto,
     making it discoverable on the federated atmosphere network.
 
+    .. deprecated::
+        Prefer ``Index.promote_entry()`` or ``Index.promote_dataset()``
+        which provide the same functionality through the unified Index
+        interface without requiring separate client and index arguments.
+
     Args:
         local_entry: The LocalDatasetEntry to promote.
         local_index: Local index containing the schema for this entry.
-        atmosphere_client: Authenticated AtmosphereClient.
+        atmosphere_client: Authenticated Atmosphere.
         data_store: Optional data store for copying data to new location.
             If None, the existing data_urls are used as-is.
         name: Override name for the atmosphere record. Defaults to local name.
@@ -128,7 +132,7 @@ def promote_to_atmosphere(
 
     Examples:
         >>> entry = local_index.get_dataset("mnist-train")
-        >>> uri = promote_to_atmosphere(entry, local_index, client)
+        >>> uri = promote_to_atmosphere(entry, local_index, atmo)
         >>> print(uri)
         at://did:plc:abc123/ac.foundation.dataset.datasetIndex/...
     """
atdata/providers/__init__.py ADDED
@@ -0,0 +1,25 @@
+"""Storage provider backends for the local Index.
+
+This package defines the ``IndexProvider`` abstract base class and concrete
+implementations for Redis, SQLite, and PostgreSQL. The ``Index`` class in
+``atdata.local`` delegates all persistence to an ``IndexProvider``.
+
+Providers:
+    RedisProvider: Redis-backed storage (existing default).
+    SqliteProvider: SQLite file-based storage (zero external dependencies).
+    PostgresProvider: PostgreSQL storage (requires ``psycopg``).
+
+Examples:
+    >>> from atdata.providers import IndexProvider, create_provider
+    >>> provider = create_provider("sqlite", path="~/.atdata/index.db")
+    >>> from atdata.local import Index
+    >>> index = Index(provider=provider)
+"""
+
+from ._base import IndexProvider
+from ._factory import create_provider
+
+__all__ = [
+    "IndexProvider",
+    "create_provider",
+]
atdata/providers/_base.py ADDED
@@ -0,0 +1,140 @@
+"""Abstract base class for index storage providers.
+
+The ``IndexProvider`` ABC defines the persistence contract that the ``Index``
+class delegates to. Each provider handles storage and retrieval of two entity
+types — dataset entries and schema records — using whatever backend it wraps.
+
+Concrete implementations live in sibling modules:
+``_redis.py``, ``_sqlite.py``, ``_postgres.py``
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Iterator
+
+if TYPE_CHECKING:
+    from ..local import LocalDatasetEntry
+
+
+class IndexProvider(ABC):
+    """Storage backend for the ``Index`` class.
+
+    Implementations persist ``LocalDatasetEntry`` objects and schema JSON
+    records. The ``Index`` class owns all business logic (CID generation,
+    version bumping, schema building); the provider is a pure persistence
+    layer.
+
+    Examples:
+        >>> from atdata.providers import create_provider
+        >>> provider = create_provider("sqlite", path="/tmp/index.db")
+        >>> provider.store_schema("MySample", "1.0.0", '{"name": "MySample"}')
+        >>> provider.get_schema_json("MySample", "1.0.0")
+        '{"name": "MySample"}'
+    """
+
+    # ------------------------------------------------------------------
+    # Dataset entry operations
+    # ------------------------------------------------------------------
+
+    @abstractmethod
+    def store_entry(self, entry: LocalDatasetEntry) -> None:
+        """Persist a dataset entry (upsert by CID).
+
+        Args:
+            entry: The dataset entry to store. The entry's ``cid`` property
+                is used as the primary key.
+        """
+
+    @abstractmethod
+    def get_entry_by_cid(self, cid: str) -> LocalDatasetEntry:
+        """Load a dataset entry by its content identifier.
+
+        Args:
+            cid: Content-addressable identifier.
+
+        Returns:
+            The matching ``LocalDatasetEntry``.
+
+        Raises:
+            KeyError: If no entry exists for *cid*.
+        """
+
+    @abstractmethod
+    def get_entry_by_name(self, name: str) -> LocalDatasetEntry:
+        """Load a dataset entry by its human-readable name.
+
+        Args:
+            name: Dataset name.
+
+        Returns:
+            The first matching ``LocalDatasetEntry``.
+
+        Raises:
+            KeyError: If no entry exists with *name*.
+        """
+
+    @abstractmethod
+    def iter_entries(self) -> Iterator[LocalDatasetEntry]:
+        """Iterate over all stored dataset entries.
+
+        Yields:
+            ``LocalDatasetEntry`` objects in unspecified order.
+        """
+
+    # ------------------------------------------------------------------
+    # Schema operations
+    # ------------------------------------------------------------------
+
+    @abstractmethod
+    def store_schema(self, name: str, version: str, schema_json: str) -> None:
+        """Persist a schema record (upsert by name + version).
+
+        Args:
+            name: Schema name (e.g. ``"MySample"``).
+            version: Semantic version string (e.g. ``"1.0.0"``).
+            schema_json: JSON-serialized schema record.
+        """
+
+    @abstractmethod
+    def get_schema_json(self, name: str, version: str) -> str | None:
+        """Load a schema's JSON by name and version.
+
+        Args:
+            name: Schema name.
+            version: Semantic version string.
+
+        Returns:
+            The JSON string, or ``None`` if not found.
+        """
+
+    @abstractmethod
+    def iter_schemas(self) -> Iterator[tuple[str, str, str]]:
+        """Iterate over all stored schemas.
+
+        Yields:
+            Tuples of ``(name, version, schema_json)``.
+        """
+
+    @abstractmethod
+    def find_latest_version(self, name: str) -> str | None:
+        """Find the latest semantic version for a schema name.
+
+        Args:
+            name: Schema name to search for.
+
+        Returns:
+            The latest version string (e.g. ``"1.2.3"``), or ``None``
+            if no schema with *name* exists.
+        """
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def close(self) -> None:
+        """Release any resources held by the provider.
+
+        The default implementation is a no-op. Providers that hold
+        connections (SQLite, PostgreSQL) should override this.
+        """
atdata/providers/_factory.py ADDED
@@ -0,0 +1,69 @@
+"""Factory for creating index providers by name.
+
+Examples:
+    >>> from atdata.providers._factory import create_provider
+    >>> provider = create_provider("sqlite", path="/tmp/index.db")
+    >>> provider = create_provider("redis")
+    >>> provider = create_provider("postgres", dsn="postgresql://localhost/mydb")
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from ._base import IndexProvider
+
+
+def create_provider(
+    name: str,
+    *,
+    path: str | Path | None = None,
+    dsn: str | None = None,
+    redis: Any = None,
+    **kwargs: Any,
+) -> IndexProvider:
+    """Instantiate an ``IndexProvider`` by backend name.
+
+    Args:
+        name: One of ``"redis"``, ``"sqlite"``, or ``"postgres"``.
+        path: Database file path (SQLite). Defaults to
+            ``~/.atdata/index.db`` when *name* is ``"sqlite"``.
+        dsn: Connection string (PostgreSQL).
+        redis: An existing ``redis.Redis`` connection (Redis). When
+            ``None`` and *name* is ``"redis"``, a new connection is
+            created from *kwargs*.
+        **kwargs: Extra arguments forwarded to the provider constructor
+            (e.g. Redis host/port).
+
+    Returns:
+        A ready-to-use ``IndexProvider``.
+
+    Raises:
+        ValueError: If *name* is not a recognised backend.
+    """
+    name = name.lower().strip()
+
+    if name == "redis":
+        from ._redis import RedisProvider
+        from redis import Redis as _Redis
+
+        if redis is not None:
+            return RedisProvider(redis)
+        return RedisProvider(_Redis(**kwargs))
+
+    if name == "sqlite":
+        from ._sqlite import SqliteProvider
+
+        return SqliteProvider(path=path)
+
+    if name in ("postgres", "postgresql"):
+        from ._postgres import PostgresProvider
+
+        if dsn is None:
+            raise ValueError("dsn is required for the postgres provider")
+        return PostgresProvider(dsn=dsn)
+
+    raise ValueError(
+        f"Unknown provider {name!r}. Choose from: 'redis', 'sqlite', 'postgres'."
+    )
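
To close the loop on the provider layer, a small sketch of backend selection through the factory. It uses only behaviour visible in this hunk plus the ``Index(provider=...)`` constructor shown in the ``atdata/providers/__init__.py`` docstring above; the database path is illustrative.

from atdata.providers import create_provider
from atdata.local import Index

# SQLite needs no external services; the path here is illustrative.
provider = create_provider("sqlite", path="/tmp/atdata-index.db")
index = Index(provider=provider)

# PostgreSQL refuses to construct without a DSN...
try:
    create_provider("postgres")
except ValueError as exc:
    print(exc)  # dsn is required for the postgres provider

# ...and unknown backend names are rejected rather than silently defaulting.
try:
    create_provider("dynamodb")
except ValueError as exc:
    print(exc)  # Unknown provider 'dynamodb'. Choose from: 'redis', 'sqlite', 'postgres'.

# SQLite and PostgreSQL providers hold connections, so release them when done.
provider.close()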