atdata-0.2.3b1-py3-none-any.whl → atdata-0.3.0b1-py3-none-any.whl
- atdata/.gitignore +1 -0
- atdata/__init__.py +30 -0
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +29 -15
- atdata/_hf_api.py +63 -11
- atdata/_logging.py +70 -0
- atdata/_protocols.py +19 -62
- atdata/_schema_codec.py +5 -4
- atdata/_type_utils.py +28 -2
- atdata/atmosphere/__init__.py +19 -9
- atdata/atmosphere/records.py +3 -2
- atdata/atmosphere/schema.py +2 -2
- atdata/cli/__init__.py +157 -171
- atdata/cli/inspect.py +69 -0
- atdata/cli/local.py +1 -1
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +428 -326
- atdata/lens.py +9 -2
- atdata/local/__init__.py +71 -0
- atdata/local/_entry.py +157 -0
- atdata/local/_index.py +940 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/local/_s3.py +349 -0
- atdata/local/_schema.py +380 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +4 -4
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/testing.py +337 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +4 -1
- atdata-0.3.0b1.dist-info/RECORD +54 -0
- atdata/local.py +0 -1720
- atdata-0.2.3b1.dist-info/RECORD +0 -28
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0

atdata/providers/_sqlite.py
ADDED
@@ -0,0 +1,191 @@
+"""SQLite-backed index provider.
+
+Stores dataset entries and schema records in a local SQLite database file.
+Uses WAL journal mode for concurrent read access and ``INSERT OR REPLACE``
+for upsert semantics.
+
+No external dependencies — uses Python's built-in ``sqlite3`` module.
+"""
+
+from __future__ import annotations
+
+import sqlite3
+from pathlib import Path
+from typing import Iterator
+
+import msgpack
+
+from ._base import IndexProvider
+from .._type_utils import parse_semver
+
+_CREATE_TABLES = """\
+CREATE TABLE IF NOT EXISTS dataset_entries (
+    cid TEXT PRIMARY KEY,
+    name TEXT NOT NULL,
+    schema_ref TEXT NOT NULL,
+    data_urls BLOB NOT NULL,
+    metadata BLOB,
+    legacy_uuid TEXT,
+    created_at TEXT DEFAULT (datetime('now'))
+);
+
+CREATE INDEX IF NOT EXISTS idx_entries_name
+    ON dataset_entries(name);
+
+CREATE TABLE IF NOT EXISTS schemas (
+    name TEXT NOT NULL,
+    version TEXT NOT NULL,
+    schema_json TEXT NOT NULL,
+    created_at TEXT DEFAULT (datetime('now')),
+    PRIMARY KEY (name, version)
+);
+"""
+
+
+class SqliteProvider(IndexProvider):
+    """Index provider backed by a local SQLite database.
+
+    Args:
+        path: Path to the database file. The parent directory is created
+            automatically. Defaults to ``~/.atdata/index.db``.
+
+    Examples:
+        >>> provider = SqliteProvider(path="/tmp/test-index.db")
+        >>> provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')
+        >>> provider.get_schema_json("MySample", "1.0.0")
+        '{"name":"MySample"}'
+    """
+
+    def __init__(self, path: str | Path | None = None) -> None:
+        if path is None:
+            path = Path.home() / ".atdata" / "index.db"
+        self._path = Path(path).expanduser()
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+
+        self._conn = sqlite3.connect(str(self._path))
+        self._conn.execute("PRAGMA journal_mode=WAL")
+        self._conn.executescript(_CREATE_TABLES)
+        self._conn.commit()
+
+    @property
+    def path(self) -> Path:
+        """Path to the SQLite database file."""
+        return self._path
+
+    # ------------------------------------------------------------------
+    # Dataset entry operations
+    # ------------------------------------------------------------------
+
+    def store_entry(self, entry: "LocalDatasetEntry") -> None:  # noqa: F821
+        self._conn.execute(
+            """INSERT OR REPLACE INTO dataset_entries
+            (cid, name, schema_ref, data_urls, metadata, legacy_uuid)
+            VALUES (?, ?, ?, ?, ?, ?)""",
+            (
+                entry.cid,
+                entry.name,
+                entry.schema_ref,
+                msgpack.packb(entry.data_urls),
+                msgpack.packb(entry.metadata) if entry.metadata is not None else None,
+                entry._legacy_uuid,
+            ),
+        )
+        self._conn.commit()
+
+    def get_entry_by_cid(self, cid: str) -> "LocalDatasetEntry":  # noqa: F821
+        row = self._conn.execute(
+            "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
+            "FROM dataset_entries WHERE cid = ?",
+            (cid,),
+        ).fetchone()
+        if row is None:
+            raise KeyError(f"LocalDatasetEntry not found: {cid}")
+        return _row_to_entry(row)
+
+    def get_entry_by_name(self, name: str) -> "LocalDatasetEntry":  # noqa: F821
+        row = self._conn.execute(
+            "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
+            "FROM dataset_entries WHERE name = ? LIMIT 1",
+            (name,),
+        ).fetchone()
+        if row is None:
+            raise KeyError(f"No entry with name: {name}")
+        return _row_to_entry(row)
+
+    def iter_entries(self) -> Iterator["LocalDatasetEntry"]:  # noqa: F821
+        cursor = self._conn.execute(
+            "SELECT cid, name, schema_ref, data_urls, metadata, legacy_uuid "
+            "FROM dataset_entries"
+        )
+        for row in cursor:
+            yield _row_to_entry(row)
+
+    # ------------------------------------------------------------------
+    # Schema operations
+    # ------------------------------------------------------------------
+
+    def store_schema(self, name: str, version: str, schema_json: str) -> None:
+        self._conn.execute(
+            """INSERT OR REPLACE INTO schemas (name, version, schema_json)
+            VALUES (?, ?, ?)""",
+            (name, version, schema_json),
+        )
+        self._conn.commit()
+
+    def get_schema_json(self, name: str, version: str) -> str | None:
+        row = self._conn.execute(
+            "SELECT schema_json FROM schemas WHERE name = ? AND version = ?",
+            (name, version),
+        ).fetchone()
+        if row is None:
+            return None
+        return row[0]
+
+    def iter_schemas(self) -> Iterator[tuple[str, str, str]]:
+        cursor = self._conn.execute("SELECT name, version, schema_json FROM schemas")
+        yield from cursor
+
+    def find_latest_version(self, name: str) -> str | None:
+        cursor = self._conn.execute(
+            "SELECT version FROM schemas WHERE name = ?",
+            (name,),
+        )
+        latest: tuple[int, int, int] | None = None
+        latest_str: str | None = None
+        for (version_str,) in cursor:
+            try:
+                v = parse_semver(version_str)
+                if latest is None or v > latest:
+                    latest = v
+                    latest_str = version_str
+            except ValueError:
+                continue
+        return latest_str
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+
+    def close(self) -> None:
+        """Close the SQLite connection."""
+        self._conn.close()
+
+
+# ------------------------------------------------------------------
+# Helpers
+# ------------------------------------------------------------------
+
+
+def _row_to_entry(row: tuple) -> "LocalDatasetEntry":  # noqa: F821
+    """Convert a database row to a ``LocalDatasetEntry``."""
+    from ..local import LocalDatasetEntry
+
+    cid, name, schema_ref, data_urls_blob, metadata_blob, legacy_uuid = row
+
+    return LocalDatasetEntry(
+        name=name,
+        schema_ref=schema_ref,
+        data_urls=msgpack.unpackb(data_urls_blob),
+        metadata=msgpack.unpackb(metadata_blob) if metadata_blob is not None else None,
+        _cid=cid,
+        _legacy_uuid=legacy_uuid,
+    )
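
A quick way to read the new provider: schemas are keyed by (name, version) and writes are idempotent, so the round-trip below should hold. This is a sketch against only the API shown in this diff; the temporary path is illustrative, and `SqliteProvider` is imported from the private module because the diff does not show what `atdata.providers` re-exports.

from atdata.providers._sqlite import SqliteProvider

# Open (or create) an index database; the parent directory is made automatically.
provider = SqliteProvider(path="/tmp/demo-index.db")

# INSERT OR REPLACE makes repeated stores of the same (name, version) idempotent.
provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')
provider.store_schema("MySample", "1.2.0", '{"name":"MySample"}')

assert provider.get_schema_json("MySample", "1.0.0") == '{"name":"MySample"}'
assert provider.find_latest_version("MySample") == "1.2.0"  # semver-aware, not lexical

# iter_schemas yields raw (name, version, schema_json) rows straight from SQLite.
for name, version, schema_json in provider.iter_schemas():
    print(name, version)

provider.close()

Note that `find_latest_version` compares parsed semver tuples, not strings, and silently skips versions that fail to parse.
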
atdata/repository.py
ADDED
@@ -0,0 +1,323 @@
+"""Repository and atmosphere backend for the unified Index.
+
+A ``Repository`` pairs an ``IndexProvider`` (persistence backend) with an
+optional ``AbstractDataStore`` (shard storage), forming a named storage unit
+that can be mounted into an ``Index``.
+
+The ``_AtmosphereBackend`` is an internal adapter that wraps an
+``AtmosphereClient`` to present the same operational surface as a repository,
+but routes through the ATProto network instead of a local provider.
+
+Examples:
+    >>> from atdata.repository import Repository, create_repository
+    >>> repo = Repository(provider=SqliteProvider("/data/lab.db"))
+    >>> repo = create_repository("sqlite", path="/data/lab.db")
+    >>>
+    >>> # With a data store for shard storage
+    >>> repo = Repository(
+    ...     provider=SqliteProvider(),
+    ...     data_store=S3DataStore(credentials, bucket="lab-data"),
+    ... )
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Iterator, Optional, TYPE_CHECKING
+
+from ._protocols import AbstractDataStore
+
+if TYPE_CHECKING:
+    from .providers._base import IndexProvider
+
+
+@dataclass
+class Repository:
+    """A named storage backend pairing index persistence with optional data storage.
+
+    Repositories are mounted into an ``Index`` by name. The built-in ``"local"``
+    repository uses SQLite by default; additional repositories can be added for
+    multi-source dataset management.
+
+    Attributes:
+        provider: IndexProvider handling dataset/schema persistence.
+        data_store: Optional data store for reading/writing dataset shards.
+            If present, ``insert_dataset`` will write shards to this store.
+
+    Examples:
+        >>> from atdata.providers import create_provider
+        >>> from atdata.repository import Repository
+        >>>
+        >>> provider = create_provider("sqlite", path="/data/lab.db")
+        >>> repo = Repository(provider=provider)
+        >>>
+        >>> # With S3 shard storage
+        >>> repo = Repository(
+        ...     provider=provider,
+        ...     data_store=S3DataStore(credentials, bucket="lab-data"),
+        ... )
+    """
+
+    provider: IndexProvider
+    data_store: AbstractDataStore | None = None
+
+
+def create_repository(
+    provider: str = "sqlite",
+    *,
+    path: str | Path | None = None,
+    dsn: str | None = None,
+    redis: Any = None,
+    data_store: AbstractDataStore | None = None,
+    **kwargs: Any,
+) -> Repository:
+    """Create a Repository with a provider by name.
+
+    This is a convenience factory that combines ``create_provider`` with
+    ``Repository`` construction.
+
+    Args:
+        provider: Backend name: ``"sqlite"``, ``"redis"``, or ``"postgres"``.
+        path: Database file path (SQLite only).
+        dsn: Connection string (PostgreSQL only).
+        redis: Existing Redis connection (Redis only).
+        data_store: Optional data store for shard storage.
+        **kwargs: Extra arguments forwarded to the provider constructor.
+
+    Returns:
+        A ready-to-use Repository.
+
+    Raises:
+        ValueError: If provider name is not recognised.
+
+    Examples:
+        >>> repo = create_repository("sqlite", path="/data/lab.db")
+        >>> repo = create_repository(
+        ...     "sqlite",
+        ...     data_store=S3DataStore(creds, bucket="lab"),
+        ... )
+    """
+    from .providers._factory import create_provider as _create_provider
+
+    backend = _create_provider(provider, path=path, dsn=dsn, redis=redis, **kwargs)
+    return Repository(provider=backend, data_store=data_store)
+
+
+class _AtmosphereBackend:
+    """Internal adapter wrapping AtmosphereClient for Index routing.
+
+    This class extracts the operational logic from ``AtmosphereIndex`` into an
+    internal component that the unified ``Index`` uses for ATProto resolution.
+    It is not part of the public API.
+
+    The backend is lazily initialised -- the publishers/loaders are only
+    created when the client is authenticated or when operations require them.
+    """
+
+    def __init__(
+        self,
+        client: Any,  # AtmosphereClient, typed as Any to avoid hard import
+        *,
+        data_store: Optional[AbstractDataStore] = None,
+    ) -> None:
+        from .atmosphere.client import AtmosphereClient
+
+        if not isinstance(client, AtmosphereClient):
+            raise TypeError(f"Expected AtmosphereClient, got {type(client).__name__}")
+        self.client: AtmosphereClient = client
+        self._data_store = data_store
+        self._schema_publisher: Any = None
+        self._schema_loader: Any = None
+        self._dataset_publisher: Any = None
+        self._dataset_loader: Any = None
+
+    def _ensure_loaders(self) -> None:
+        """Lazily create publishers/loaders on first use."""
+        if self._schema_loader is not None:
+            return
+        from .atmosphere.schema import SchemaPublisher, SchemaLoader
+        from .atmosphere.records import DatasetPublisher, DatasetLoader
+
+        self._schema_publisher = SchemaPublisher(self.client)
+        self._schema_loader = SchemaLoader(self.client)
+        self._dataset_publisher = DatasetPublisher(self.client)
+        self._dataset_loader = DatasetLoader(self.client)
+
+    @property
+    def data_store(self) -> Optional[AbstractDataStore]:
+        """The data store for this atmosphere backend, or None."""
+        return self._data_store
+
+    # -- Dataset operations --
+
+    def get_dataset(self, ref: str) -> Any:
+        """Get a dataset entry by name or AT URI.
+
+        Args:
+            ref: Dataset name or AT URI.
+
+        Returns:
+            AtmosphereIndexEntry for the dataset.
+
+        Raises:
+            ValueError: If record is not a dataset.
+        """
+        self._ensure_loaders()
+        from .atmosphere import AtmosphereIndexEntry
+
+        record = self._dataset_loader.get(ref)
+        return AtmosphereIndexEntry(ref, record)
+
+    def list_datasets(self, repo: str | None = None) -> list[Any]:
+        """List all dataset entries.
+
+        Args:
+            repo: DID of repository. Defaults to authenticated user.
+
+        Returns:
+            List of AtmosphereIndexEntry for each dataset.
+        """
+        self._ensure_loaders()
+        from .atmosphere import AtmosphereIndexEntry
+
+        records = self._dataset_loader.list_all(repo=repo)
+        return [
+            AtmosphereIndexEntry(rec.get("uri", ""), rec.get("value", rec))
+            for rec in records
+        ]
+
+    def iter_datasets(self, repo: str | None = None) -> Iterator[Any]:
+        """Lazily iterate over all dataset entries.
+
+        Args:
+            repo: DID of repository. Defaults to authenticated user.
+
+        Yields:
+            AtmosphereIndexEntry for each dataset.
+        """
+        self._ensure_loaders()
+        from .atmosphere import AtmosphereIndexEntry
+
+        records = self._dataset_loader.list_all(repo=repo)
+        for rec in records:
+            uri = rec.get("uri", "")
+            yield AtmosphereIndexEntry(uri, rec.get("value", rec))
+
+    def insert_dataset(
+        self,
+        ds: Any,
+        *,
+        name: str,
+        schema_ref: str | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Insert a dataset into ATProto.
+
+        Args:
+            ds: The Dataset to publish.
+            name: Human-readable name.
+            schema_ref: Optional schema AT URI. If None, auto-publishes schema.
+            **kwargs: Additional options (description, tags, license).
+
+        Returns:
+            AtmosphereIndexEntry for the inserted dataset.
+        """
+        self._ensure_loaders()
+        from .atmosphere import AtmosphereIndexEntry
+
+        uri = self._dataset_publisher.publish(
+            ds,
+            name=name,
+            schema_uri=schema_ref,
+            description=kwargs.get("description"),
+            tags=kwargs.get("tags"),
+            license=kwargs.get("license"),
+            auto_publish_schema=(schema_ref is None),
+        )
+        record = self._dataset_loader.get(uri)
+        return AtmosphereIndexEntry(str(uri), record)
+
+    # -- Schema operations --
+
+    def publish_schema(
+        self,
+        sample_type: type,
+        *,
+        version: str = "1.0.0",
+        **kwargs: Any,
+    ) -> str:
+        """Publish a schema to ATProto.
+
+        Args:
+            sample_type: A Packable type.
+            version: Semantic version string.
+            **kwargs: Additional options.
+
+        Returns:
+            AT URI of the schema record.
+        """
+        self._ensure_loaders()
+        uri = self._schema_publisher.publish(
+            sample_type,
+            version=version,
+            description=kwargs.get("description"),
+            metadata=kwargs.get("metadata"),
+        )
+        return str(uri)
+
+    def get_schema(self, ref: str) -> dict:
+        """Get a schema record by AT URI.
+
+        Args:
+            ref: AT URI of the schema record.
+
+        Returns:
+            Schema record dictionary.
+        """
+        self._ensure_loaders()
+        return self._schema_loader.get(ref)
+
+    def list_schemas(self, repo: str | None = None) -> list[dict]:
+        """List all schema records.
+
+        Args:
+            repo: DID of repository. Defaults to authenticated user.
+
+        Returns:
+            List of schema records as dictionaries.
+        """
+        self._ensure_loaders()
+        records = self._schema_loader.list_all(repo=repo)
+        return [rec.get("value", rec) for rec in records]
+
+    def iter_schemas(self) -> Iterator[dict]:
+        """Lazily iterate over all schema records.
+
+        Yields:
+            Schema records as dictionaries.
+        """
+        self._ensure_loaders()
+        records = self._schema_loader.list_all()
+        for rec in records:
+            yield rec.get("value", rec)
+
+    def decode_schema(self, ref: str) -> type:
+        """Reconstruct a Python type from a schema record.
+
+        Args:
+            ref: AT URI of the schema record.
+
+        Returns:
+            Dynamically generated Packable type.
+        """
+        from ._schema_codec import schema_to_type
+
+        schema = self.get_schema(ref)
+        return schema_to_type(schema)
+
+
+__all__ = [
+    "Repository",
+    "create_repository",
+]
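
The docstrings above show two equivalent construction paths; the sketch below lines them up. Paths are illustrative, and mounting the result into an `Index` is not shown since that API lives outside this file.

from atdata.providers import create_provider
from atdata.repository import Repository, create_repository

# Explicit: build the provider yourself, then wrap it in a Repository.
repo_a = Repository(provider=create_provider("sqlite", path="/tmp/lab.db"))

# Convenience: one call that forwards path/dsn/redis to the provider factory.
repo_b = create_repository("sqlite", path="/tmp/lab.db")

# Repository is a plain dataclass, so both halves stay directly reachable.
repo_b.provider.store_schema("MySample", "1.0.0", '{"name":"MySample"}')
assert repo_b.data_store is None  # no shard store attached in this sketch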