atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +11 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +46 -1
- atdata/_logging.py +43 -0
- atdata/_protocols.py +81 -182
- atdata/_schema_codec.py +2 -2
- atdata/_sources.py +24 -4
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +60 -21
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +73 -245
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +60 -53
- atdata/atmosphere/records.py +291 -100
- atdata/atmosphere/schema.py +91 -65
- atdata/atmosphere/store.py +68 -66
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +266 -47
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_entry.py +6 -2
- atdata/{local → index}/_index.py +617 -72
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +127 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
- atdata/lexicons/ac.foundation.dataset.record.json +117 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/promote.py +14 -10
- atdata/repository.py +66 -16
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +131 -0
- atdata/{local → stores}/_s3.py +134 -112
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
- atdata-0.3.2b1.dist-info/RECORD +71 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/atmosphere/schema.py
CHANGED
@@ -1,25 +1,19 @@
 """Schema publishing and loading for ATProto.
 
 This module provides classes for publishing PackableSample schemas to ATProto
-and loading them back. Schemas are published as ``ac.foundation.dataset.
+and loading them back. Schemas are published as ``ac.foundation.dataset.schema``
 records.
 """
 
 from dataclasses import fields, is_dataclass
 from typing import Type, TypeVar, Optional, get_type_hints, get_origin, get_args
 
-from .client import
-from ._types import
-
-    SchemaRecord,
-    FieldDef,
-    FieldType,
-    LEXICON_NAMESPACE,
-)
+from .client import Atmosphere
+from ._types import AtUri, LEXICON_NAMESPACE
+from ._lexicon_types import LexSchemaRecord, JsonSchemaFormat
 from .._type_utils import (
     unwrap_optional,
     is_ndarray_type,
-    extract_ndarray_dtype,
 )
 
 # Import for type checking only to avoid circular imports
@@ -43,20 +37,19 @@ class SchemaPublisher:
         ...     image: NDArray
         ...     label: str
         ...
-        >>>
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
         >>>
-        >>> publisher = SchemaPublisher(
+        >>> publisher = SchemaPublisher(atmo)
         >>> uri = publisher.publish(MySample, version="1.0.0")
         >>> print(uri)
-        at://did:plc:.../ac.foundation.dataset.
+        at://did:plc:.../ac.foundation.dataset.schema/...
     """
 
-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the schema publisher.
 
         Args:
-            client: Authenticated
+            client: Authenticated Atmosphere instance.
         """
         self.client = client
 
@@ -87,27 +80,32 @@ class SchemaPublisher:
             ValueError: If sample_type is not a dataclass or client is not authenticated.
             TypeError: If a field type is not supported.
         """
+        from atdata._logging import log_operation
+
         if not is_dataclass(sample_type):
             raise ValueError(
                 f"{sample_type.__name__} must be a dataclass (use @packable)"
             )
 
-
-
-
-
-
-
-
-
+        with log_operation(
+            "SchemaPublisher.publish", schema=sample_type.__name__, version=version
+        ):
+            # Build the schema record
+            schema_record = self._build_schema_record(
+                sample_type,
+                name=name,
+                version=version,
+                description=description,
+                metadata=metadata,
+            )
 
-
-
-
-
-
-
-
+            # Publish to ATProto
+            return self.client.create_record(
+                collection=f"{LEXICON_NAMESPACE}.schema",
+                record=schema_record.to_record(),
+                rkey=rkey,
+                validate=False,  # PDS doesn't know our lexicon
+            )
 
     def _build_schema_record(
         self,
@@ -117,57 +115,74 @@ class SchemaPublisher:
         version: str,
         description: Optional[str],
         metadata: Optional[dict],
-    ) ->
-        """Build a
-        field_defs = []
+    ) -> LexSchemaRecord:
+        """Build a LexSchemaRecord from a PackableSample class."""
         type_hints = get_type_hints(sample_type)
+        properties: dict[str, dict] = {}
+        required_fields: list[str] = []
+        has_ndarray = False
 
         for f in fields(sample_type):
             field_type = type_hints.get(f.name, f.type)
-
-
-
-
+            field_type, is_optional = unwrap_optional(field_type)
+            prop = self._python_type_to_json_schema(field_type)
+            properties[f.name] = prop
+            if not is_optional:
+                required_fields.append(f.name)
+            if is_ndarray_type(field_type):
+                has_ndarray = True
+
+        schema_body = {
+            "$schema": "http://json-schema.org/draft-07/schema#",
+            "type": "object",
+            "properties": properties,
+        }
+        if required_fields:
+            schema_body["required"] = required_fields
+
+        array_format_versions = None
+        if has_ndarray:
+            array_format_versions = {"ndarrayBytes": "1.0.0"}
+
+        return LexSchemaRecord(
             name=name or sample_type.__name__,
             version=version,
+            schema_type="jsonSchema",
+            schema=JsonSchemaFormat(
+                schema_body=schema_body,
+                array_format_versions=array_format_versions,
+            ),
             description=description,
-            fields=field_defs,
             metadata=metadata,
         )
 
-    def
-        """
-        python_type, is_optional = unwrap_optional(python_type)
-        field_type = self._python_type_to_field_type(python_type)
-        return FieldDef(name=name, field_type=field_type, optional=is_optional)
-
-    def _python_type_to_field_type(self, python_type) -> FieldType:
-        """Map a Python type to a FieldType."""
+    def _python_type_to_json_schema(self, python_type) -> dict:
+        """Map a Python type to a JSON Schema property definition."""
         if python_type is str:
-            return
+            return {"type": "string"}
         if python_type is int:
-            return
+            return {"type": "integer"}
         if python_type is float:
-            return
+            return {"type": "number"}
         if python_type is bool:
-            return
+            return {"type": "boolean"}
         if python_type is bytes:
-            return
+            return {"type": "string", "format": "byte", "contentEncoding": "base64"}
 
         if is_ndarray_type(python_type):
-            return
-
-
+            return {
+                "$ref": "https://foundation.ac/schemas/atdata-ndarray-bytes/1.0.0#/$defs/ndarray"
+            }
 
         origin = get_origin(python_type)
         if origin is list:
             args = get_args(python_type)
             items = (
-                self.
+                self._python_type_to_json_schema(args[0])
                 if args
-                else
+                else {"type": "string"}
             )
-            return
+            return {"type": "array", "items": items}
 
         if is_dataclass(python_type):
             raise TypeError(
@@ -185,20 +200,19 @@ class SchemaLoader:
     schemas from a repository.
 
     Examples:
-        >>>
-        >>> client.login("handle", "password")
+        >>> atmo = Atmosphere.login("handle", "password")
        >>>
-        >>> loader = SchemaLoader(
-        >>> schema = loader.get("at://did:plc:.../ac.foundation.dataset.
+        >>> loader = SchemaLoader(atmo)
+        >>> schema = loader.get("at://did:plc:.../ac.foundation.dataset.schema/...")
         >>> print(schema["name"])
         'MySample'
     """
 
-    def __init__(self, client:
+    def __init__(self, client: Atmosphere):
         """Initialize the schema loader.
 
         Args:
-            client:
+            client: Atmosphere instance (authentication optional for reads).
         """
         self.client = client
 
@@ -217,7 +231,7 @@ class SchemaLoader:
         """
         record = self.client.get_record(uri)
 
-        expected_type = f"{LEXICON_NAMESPACE}.
+        expected_type = f"{LEXICON_NAMESPACE}.schema"
         if record.get("$type") != expected_type:
             raise ValueError(
                 f"Record at {uri} is not a schema record. "
@@ -226,6 +240,18 @@ class SchemaLoader:
 
         return record
 
+    def get_typed(self, uri: str | AtUri) -> LexSchemaRecord:
+        """Fetch a schema record and return as a typed object.
+
+        Args:
+            uri: The AT URI of the schema record.
+
+        Returns:
+            LexSchemaRecord instance.
+        """
+        record = self.get(uri)
+        return LexSchemaRecord.from_record(record)
+
     def list_all(
         self,
         repo: Optional[str] = None,
atdata/atmosphere/store.py
CHANGED
@@ -7,12 +7,11 @@ This enables fully decentralized dataset storage where both metadata (records)
 and data (blobs) live on the AT Protocol network.
 
 Examples:
-    >>> from atdata.atmosphere import
+    >>> from atdata.atmosphere import Atmosphere, PDSBlobStore
     >>>
-    >>>
-    >>> client.login("handle.bsky.social", "app-password")
+    >>> atmo = Atmosphere.login("handle.bsky.social", "app-password")
     >>>
-    >>> store = PDSBlobStore(
+    >>> store = PDSBlobStore(atmo)
     >>> urls = store.write_shards(dataset, prefix="mnist/v1")
     >>> print(urls)
     ['at://did:plc:.../blob/bafyrei...', ...]
@@ -20,16 +19,38 @@ Examples:
 
 from __future__ import annotations
 
-import tempfile
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
-
+#: Maximum size in bytes for a single PDS blob upload (50 MB).
+PDS_BLOB_LIMIT_BYTES: int = 50_000_000
+
+#: Maximum total dataset size in bytes for atmosphere uploads (1 GB).
+PDS_TOTAL_DATASET_LIMIT_BYTES: int = 1_000_000_000
 
 if TYPE_CHECKING:
     from ..dataset import Dataset
     from .._sources import BlobSource
-    from .client import
+    from .client import Atmosphere
+
+
+class ShardUploadResult(list):
+    """Return type for ``PDSBlobStore.write_shards()``.
+
+    Extends ``list[str]`` (AT URIs) so it satisfies the ``AbstractDataStore``
+    protocol, while also carrying the raw blob reference dicts needed to
+    create ``storageBlobs`` records.
+
+    Attributes:
+        blob_refs: Blob reference dicts as returned by
+            ``Atmosphere.upload_blob()``.
+    """
+
+    blob_refs: list[dict]
+
+    def __init__(self, urls: list[str], blob_refs: list[dict]) -> None:
+        super().__init__(urls)
+        self.blob_refs = blob_refs
 
 
 @dataclass
@@ -44,7 +65,7 @@ class PDSBlobStore:
     to HTTP URLs for streaming.
 
     Attributes:
-        client: Authenticated
+        client: Authenticated Atmosphere instance.
 
     Examples:
         >>> store = PDSBlobStore(client)
@@ -53,85 +74,61 @@ class PDSBlobStore:
         >>> # ['at://did:plc:abc/blob/bafyrei...', ...]
     """
 
-    client: "
+    client: "Atmosphere"
 
     def write_shards(
         self,
         ds: "Dataset",
         *,
         prefix: str,
-        maxcount: int = 10000,
-        maxsize: float = 3e9,
         **kwargs: Any,
-    ) ->
-        """
+    ) -> "ShardUploadResult":
+        """Upload existing dataset shards as PDS blobs.
 
-
-        to the authenticated user's PDS.
+        Reads the tar archives already written to disk by the caller and
+        uploads each as a blob to the authenticated user's PDS. This
+        avoids re-serializing samples that have already been written.
 
         Args:
-            ds: The Dataset to
-            prefix: Logical path prefix
-
-            maxsize: Maximum shard size in bytes (default: 3GB, PDS limit).
-            **kwargs: Additional args passed to wds.ShardWriter.
+            ds: The Dataset whose shards to upload.
+            prefix: Logical path prefix (unused, kept for protocol compat).
+            **kwargs: Unused, kept for protocol compatibility.
 
         Returns:
-
-            ``
+            A ``ShardUploadResult`` (behaves as ``list[str]`` of AT URIs)
+            with a ``blob_refs`` attribute containing the raw blob reference
+            dicts needed for ``storageBlobs`` records.
 
         Raises:
             ValueError: If not authenticated.
-            RuntimeError: If no shards
-
-        Note:
-            PDS blobs have size limits (typically 50MB-5GB depending on PDS).
-            Adjust maxcount/maxsize to stay within limits.
+            RuntimeError: If no shards are found on the dataset.
         """
         if not self.client.did:
             raise ValueError("Client must be authenticated to upload blobs")
 
         did = self.client.did
         blob_urls: list[str] = []
+        blob_refs: list[dict] = []
+
+        shard_paths = ds.list_shards()
+        if not shard_paths:
+            raise RuntimeError("No shards to upload")
+
+        for shard_url in shard_paths:
+            with open(shard_url, "rb") as f:
+                shard_data = f.read()
+
+            blob_ref = self.client.upload_blob(
+                shard_data,
+                mime_type="application/x-tar",
+            )
+
+            blob_refs.append(blob_ref)
+            cid = blob_ref["ref"]["$link"]
+            at_uri = f"at://{did}/blob/{cid}"
+            blob_urls.append(at_uri)
 
-
-        with tempfile.TemporaryDirectory() as temp_dir:
-            shard_pattern = f"{temp_dir}/shard-%06d.tar"
-            written_files: list[str] = []
-
-            # Track written files via custom post callback
-            def track_file(fname: str) -> None:
-                written_files.append(fname)
-
-            with wds.writer.ShardWriter(
-                shard_pattern,
-                maxcount=maxcount,
-                maxsize=maxsize,
-                post=track_file,
-                **kwargs,
-            ) as sink:
-                for sample in ds.ordered(batch_size=None):
-                    sink.write(sample.as_wds)
-
-            if not written_files:
-                raise RuntimeError("No shards written")
-
-            # Upload each shard as a blob
-            for shard_path in written_files:
-                with open(shard_path, "rb") as f:
-                    shard_data = f.read()
-
-                blob_ref = self.client.upload_blob(
-                    shard_data,
-                    mime_type="application/x-tar",
-                )
-
-                # Extract CID from blob reference
-                cid = blob_ref["ref"]["$link"]
-                at_uri = f"at://{did}/blob/{cid}"
-                blob_urls.append(at_uri)
-
-        return blob_urls
+        return ShardUploadResult(blob_urls, blob_refs)
 
     def read_url(self, url: str) -> str:
         """Resolve an AT URI blob reference to an HTTP URL.
@@ -201,4 +198,9 @@ class PDSBlobStore:
         return BlobSource(blob_refs=blob_refs)
 
 
-__all__ = [
+__all__ = [
+    "PDS_BLOB_LIMIT_BYTES",
+    "PDS_TOTAL_DATASET_LIMIT_BYTES",
+    "PDSBlobStore",
+    "ShardUploadResult",
+]
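
The store no longer re-shards the dataset through webdataset's ShardWriter; it uploads the tar shards already on disk and hands back both AT URIs and the raw blob refs. A sketch based on the docstrings above; the `dataset` object and its existing shards are assumptions, while the blob-ref layout follows what `write_shards` itself reads (`ref.$link`):

    from atdata.atmosphere import Atmosphere, PDSBlobStore

    atmo = Atmosphere.login("handle.bsky.social", "app-password")
    store = PDSBlobStore(atmo)

    # `dataset` is assumed to be an atdata Dataset whose shards already exist
    # on disk (ds.list_shards() must return their paths).
    result = store.write_shards(dataset, prefix="mnist/v1")

    print(list(result))          # AT URIs: ['at://did:plc:.../blob/bafyrei...', ...]
    print(result.blob_refs[0])   # raw blob ref dict, e.g. {'ref': {'$link': ...}, ...}

The old `maxcount`/`maxsize` tuning arguments are gone from the signature; the new module-level `PDS_BLOB_LIMIT_BYTES` and `PDS_TOTAL_DATASET_LIMIT_BYTES` constants now document the PDS limits and are exported alongside the store.
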
atdata/cli/__init__.py
CHANGED
@@ -1,12 +1,12 @@
 """Command-line interface for atdata.
 
-This module provides CLI commands for managing
+This module provides CLI commands for managing development infrastructure,
 inspecting datasets, and diagnosing configuration issues.
 
 Commands:
-    atdata
-    atdata
-    atdata
+    atdata infra up       Start Redis and MinIO containers for development
+    atdata infra down     Stop development containers
+    atdata infra status   Show status of infrastructure
     atdata diagnose       Check Redis configuration and connectivity
     atdata inspect        Show dataset summary information
     atdata schema show    Display dataset schema
@@ -30,12 +30,12 @@ app = typer.Typer(
     no_args_is_help=True,
 )
 
-
-    name="
-    help="Manage
+infra_app = typer.Typer(
+    name="infra",
+    help="Manage development infrastructure.",
     no_args_is_help=True,
 )
-app.add_typer(
+app.add_typer(infra_app, name="infra")
 
 schema_app = typer.Typer(
     name="schema",
@@ -101,11 +101,11 @@ def diagnose(
 
 
 # ---------------------------------------------------------------------------
-#
+# infra sub-commands
 # ---------------------------------------------------------------------------
 
 
-@
+@infra_app.command()
 def up(
     redis_port: int = typer.Option(6379, help="Redis port."),
     minio_port: int = typer.Option(9000, help="MinIO API port."),
@@ -115,7 +115,7 @@ def up(
     ),
 ) -> None:
     """Start Redis and MinIO containers."""
-    from .
+    from .infra import local_up
 
     code = local_up(
         redis_port=redis_port,
@@ -126,23 +126,23 @@ def up(
     raise typer.Exit(code=code)
 
 
-@
+@infra_app.command()
 def down(
     volumes: bool = typer.Option(
         False, "--volumes", "-v", help="Also remove volumes (deletes all data)."
     ),
 ) -> None:
     """Stop local development containers."""
-    from .
+    from .infra import local_down
 
     code = local_down(remove_volumes=volumes)
     raise typer.Exit(code=code)
 
 
-@
+@infra_app.command()
 def status() -> None:
-    """Show status of
-    from .
+    """Show status of infrastructure."""
+    from .infra import local_status
 
     code = local_status()
     raise typer.Exit(code=code)
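
Because the old `local` Typer sub-app is renamed rather than removed, the quickest sanity check is to drive the top-level `app` with typer's test runner. A minimal sketch, assuming `app` is importable from `atdata.cli` as the hunks above suggest:

    from typer.testing import CliRunner

    from atdata.cli import app  # top-level Typer app wired via add_typer(..., name="infra")

    runner = CliRunner()

    # The old `atdata local ...` commands now live under `atdata infra ...`.
    result = runner.invoke(app, ["infra", "status"])
    print(result.exit_code, result.output)

    # --help should list infra alongside diagnose, inspect and schema.
    print(runner.invoke(app, ["--help"]).output)
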
atdata/cli/diagnose.py
CHANGED
@@ -51,7 +51,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
         _print_status("Connection", False, str(e))
         print()
         print("Cannot connect to Redis. Make sure Redis is running:")
-        print("  atdata
+        print("  atdata infra up")
         return 1
 
     # Check Redis version
@@ -162,7 +162,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
         print("    maxmemory-policy noeviction")
         print()
         print("  # Or use atdata's preconfigured local setup:")
-        print("  atdata
+        print("  atdata infra up")
         return 1
     else:
         print("All checks passed. Redis is properly configured for atdata.")
atdata/cli/{local.py → infra.py}
RENAMED
@@ -1,6 +1,6 @@
-"""
+"""Infrastructure management for atdata.
 
-This module provides commands to start and stop
+This module provides commands to start and stop development infrastructure:
 - Redis: For index storage and metadata
 - MinIO: S3-compatible object storage for dataset files
 
@@ -179,7 +179,7 @@ def local_up(
     if not _check_docker():
         return 1
 
-    print("Starting atdata
+    print("Starting atdata infrastructure...")
 
     compose_content = _get_compose_file(redis_port, minio_port, minio_console_port)
     command = ["up"]
@@ -202,7 +202,7 @@ def local_up(
 
     # Show status
     print()
-    print("
+    print("Infrastructure started:")
     print(f"  Redis:         localhost:{redis_port}")
     print(f"  MinIO API:     http://localhost:{minio_port}")
     print(f"  MinIO Console: http://localhost:{minio_console_port}")
@@ -210,7 +210,7 @@ def local_up(
     print("MinIO credentials: minioadmin / minioadmin")
     print()
     print("Example usage:")
-    print("  from atdata.
+    print("  from atdata.stores import S3DataStore")
     print("  ")
     print("  store = S3DataStore.from_credentials({")
     print(f"      'AWS_ENDPOINT': 'http://localhost:{minio_port}',")
@@ -234,7 +234,7 @@ def local_down(remove_volumes: bool = False) -> int:
     if not _check_docker():
         return 1
 
-    print("Stopping atdata
+    print("Stopping atdata infrastructure...")
 
     # Use default ports for compose file (actual ports don't matter for down)
     compose_content = _get_compose_file(6379, 9000, 9001)
@@ -252,7 +252,7 @@ def local_down(remove_volumes: bool = False) -> int:
         print(f"Error: {e}", file=sys.stderr)
         return 1
 
-    print("
+    print("Infrastructure stopped.")
     return 0
 
 
@@ -268,16 +268,16 @@ def local_status() -> int:
     redis_running = _container_running(REDIS_CONTAINER)
     minio_running = _container_running(MINIO_CONTAINER)
 
-    print("atdata
+    print("atdata infrastructure status:")
     print()
     print(f"  Redis ({REDIS_CONTAINER}): {'running' if redis_running else 'stopped'}")
     print(f"  MinIO ({MINIO_CONTAINER}): {'running' if minio_running else 'stopped'}")
 
     if redis_running or minio_running:
         print()
-        print("To stop: atdata
+        print("To stop: atdata infra down")
     else:
         print()
-        print("To start: atdata
+        print("To start: atdata infra up")
 
     return 0