atdata-0.3.1b1-py3-none-any.whl → atdata-0.3.2b1-py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries; it is provided for informational purposes only.
- atdata/__init__.py +2 -0
- atdata/_hf_api.py +13 -0
- atdata/_logging.py +43 -0
- atdata/_protocols.py +18 -1
- atdata/_sources.py +24 -4
- atdata/atmosphere/__init__.py +48 -10
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +71 -243
- atdata/atmosphere/lens.py +49 -41
- atdata/atmosphere/records.py +282 -90
- atdata/atmosphere/schema.py +78 -50
- atdata/atmosphere/store.py +62 -59
- atdata/dataset.py +201 -135
- atdata/index/_entry.py +6 -2
- atdata/index/_index.py +396 -109
- atdata/lexicons/__init__.py +9 -3
- atdata/lexicons/ac.foundation.dataset.lens.json +2 -0
- atdata/lexicons/ac.foundation.dataset.record.json +22 -1
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +26 -4
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +1 -1
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/repository.py +59 -9
- atdata/stores/_disk.py +19 -11
- atdata/stores/_s3.py +134 -112
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +1 -1
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/RECORD +37 -33
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/atmosphere/schema.py
CHANGED
```diff
@@ -9,17 +9,11 @@ from dataclasses import fields, is_dataclass
 from typing import Type, TypeVar, Optional, get_type_hints, get_origin, get_args
 
 from .client import Atmosphere
-from ._types import (
-
-    SchemaRecord,
-    FieldDef,
-    FieldType,
-    LEXICON_NAMESPACE,
-)
+from ._types import AtUri, LEXICON_NAMESPACE
+from ._lexicon_types import LexSchemaRecord, JsonSchemaFormat
 from .._type_utils import (
     unwrap_optional,
     is_ndarray_type,
-    extract_ndarray_dtype,
 )
 
 # Import for type checking only to avoid circular imports
@@ -86,27 +80,32 @@ class SchemaPublisher:
             ValueError: If sample_type is not a dataclass or client is not authenticated.
             TypeError: If a field type is not supported.
         """
+        from atdata._logging import log_operation
+
         if not is_dataclass(sample_type):
             raise ValueError(
                 f"{sample_type.__name__} must be a dataclass (use @packable)"
             )
 
-
-
-
-
-
-
-
-
+        with log_operation(
+            "SchemaPublisher.publish", schema=sample_type.__name__, version=version
+        ):
+            # Build the schema record
+            schema_record = self._build_schema_record(
+                sample_type,
+                name=name,
+                version=version,
+                description=description,
+                metadata=metadata,
+            )
 
-
-
-
-
-
-
-
+            # Publish to ATProto
+            return self.client.create_record(
+                collection=f"{LEXICON_NAMESPACE}.schema",
+                record=schema_record.to_record(),
+                rkey=rkey,
+                validate=False,  # PDS doesn't know our lexicon
+            )
 
     def _build_schema_record(
         self,
@@ -116,57 +115,74 @@ class SchemaPublisher:
         version: str,
         description: Optional[str],
         metadata: Optional[dict],
-    ) ->
-        """Build a
-        field_defs = []
+    ) -> LexSchemaRecord:
+        """Build a LexSchemaRecord from a PackableSample class."""
         type_hints = get_type_hints(sample_type)
+        properties: dict[str, dict] = {}
+        required_fields: list[str] = []
+        has_ndarray = False
 
         for f in fields(sample_type):
             field_type = type_hints.get(f.name, f.type)
-
-
-
-
+            field_type, is_optional = unwrap_optional(field_type)
+            prop = self._python_type_to_json_schema(field_type)
+            properties[f.name] = prop
+            if not is_optional:
+                required_fields.append(f.name)
+            if is_ndarray_type(field_type):
+                has_ndarray = True
+
+        schema_body = {
+            "$schema": "http://json-schema.org/draft-07/schema#",
+            "type": "object",
+            "properties": properties,
+        }
+        if required_fields:
+            schema_body["required"] = required_fields
+
+        array_format_versions = None
+        if has_ndarray:
+            array_format_versions = {"ndarrayBytes": "1.0.0"}
+
+        return LexSchemaRecord(
             name=name or sample_type.__name__,
             version=version,
+            schema_type="jsonSchema",
+            schema=JsonSchemaFormat(
+                schema_body=schema_body,
+                array_format_versions=array_format_versions,
+            ),
             description=description,
-            fields=field_defs,
             metadata=metadata,
         )
 
-    def
-        """
-        python_type, is_optional = unwrap_optional(python_type)
-        field_type = self._python_type_to_field_type(python_type)
-        return FieldDef(name=name, field_type=field_type, optional=is_optional)
-
-    def _python_type_to_field_type(self, python_type) -> FieldType:
-        """Map a Python type to a FieldType."""
+    def _python_type_to_json_schema(self, python_type) -> dict:
+        """Map a Python type to a JSON Schema property definition."""
         if python_type is str:
-            return
+            return {"type": "string"}
         if python_type is int:
-            return
+            return {"type": "integer"}
         if python_type is float:
-            return
+            return {"type": "number"}
         if python_type is bool:
-            return
+            return {"type": "boolean"}
         if python_type is bytes:
-            return
+            return {"type": "string", "format": "byte", "contentEncoding": "base64"}
 
         if is_ndarray_type(python_type):
-            return
-
-
+            return {
+                "$ref": "https://foundation.ac/schemas/atdata-ndarray-bytes/1.0.0#/$defs/ndarray"
+            }
 
         origin = get_origin(python_type)
         if origin is list:
            args = get_args(python_type)
            items = (
-                self.
+                self._python_type_to_json_schema(args[0])
                if args
-                else
+                else {"type": "string"}
            )
-            return
+            return {"type": "array", "items": items}
 
        if is_dataclass(python_type):
            raise TypeError(
@@ -224,6 +240,18 @@ class SchemaLoader:
 
         return record
 
+    def get_typed(self, uri: str | AtUri) -> LexSchemaRecord:
+        """Fetch a schema record and return as a typed object.
+
+        Args:
+            uri: The AT URI of the schema record.
+
+        Returns:
+            LexSchemaRecord instance.
+        """
+        record = self.get(uri)
+        return LexSchemaRecord.from_record(record)
+
     def list_all(
         self,
         repo: Optional[str] = None,
```
atdata/atmosphere/store.py
CHANGED
```diff
@@ -19,11 +19,14 @@ Examples:
 
 from __future__ import annotations
 
-import tempfile
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any
 
-
+#: Maximum size in bytes for a single PDS blob upload (50 MB).
+PDS_BLOB_LIMIT_BYTES: int = 50_000_000
+
+#: Maximum total dataset size in bytes for atmosphere uploads (1 GB).
+PDS_TOTAL_DATASET_LIMIT_BYTES: int = 1_000_000_000
 
 if TYPE_CHECKING:
     from ..dataset import Dataset
@@ -31,6 +34,25 @@ if TYPE_CHECKING:
     from .client import Atmosphere
 
 
+class ShardUploadResult(list):
+    """Return type for ``PDSBlobStore.write_shards()``.
+
+    Extends ``list[str]`` (AT URIs) so it satisfies the ``AbstractDataStore``
+    protocol, while also carrying the raw blob reference dicts needed to
+    create ``storageBlobs`` records.
+
+    Attributes:
+        blob_refs: Blob reference dicts as returned by
+            ``Atmosphere.upload_blob()``.
+    """
+
+    blob_refs: list[dict]
+
+    def __init__(self, urls: list[str], blob_refs: list[dict]) -> None:
+        super().__init__(urls)
+        self.blob_refs = blob_refs
+
+
 @dataclass
 class PDSBlobStore:
     """PDS blob store implementing AbstractDataStore protocol.
@@ -59,78 +81,54 @@
         ds: "Dataset",
         *,
         prefix: str,
-        maxcount: int = 10000,
-        maxsize: float = 3e9,
         **kwargs: Any,
-    ) ->
-        """
+    ) -> "ShardUploadResult":
+        """Upload existing dataset shards as PDS blobs.
 
-
-        to the authenticated user's PDS.
+        Reads the tar archives already written to disk by the caller and
+        uploads each as a blob to the authenticated user's PDS. This
+        avoids re-serializing samples that have already been written.
 
         Args:
-            ds: The Dataset to
-            prefix: Logical path prefix
-
-            maxsize: Maximum shard size in bytes (default: 3GB, PDS limit).
-            **kwargs: Additional args passed to wds.ShardWriter.
+            ds: The Dataset whose shards to upload.
+            prefix: Logical path prefix (unused, kept for protocol compat).
+            **kwargs: Unused, kept for protocol compatibility.
 
         Returns:
-
-            ``
+            A ``ShardUploadResult`` (behaves as ``list[str]`` of AT URIs)
+            with a ``blob_refs`` attribute containing the raw blob reference
+            dicts needed for ``storageBlobs`` records.
 
         Raises:
             ValueError: If not authenticated.
-            RuntimeError: If no shards
-
-        Note:
-            PDS blobs have size limits (typically 50MB-5GB depending on PDS).
-            Adjust maxcount/maxsize to stay within limits.
+            RuntimeError: If no shards are found on the dataset.
         """
         if not self.client.did:
             raise ValueError("Client must be authenticated to upload blobs")
 
         did = self.client.did
         blob_urls: list[str] = []
+        blob_refs: list[dict] = []
+
+        shard_paths = ds.list_shards()
+        if not shard_paths:
+            raise RuntimeError("No shards to upload")
+
+        for shard_url in shard_paths:
+            with open(shard_url, "rb") as f:
+                shard_data = f.read()
+
+            blob_ref = self.client.upload_blob(
+                shard_data,
+                mime_type="application/x-tar",
+            )
+
+            blob_refs.append(blob_ref)
+            cid = blob_ref["ref"]["$link"]
+            at_uri = f"at://{did}/blob/{cid}"
+            blob_urls.append(at_uri)
 
-
-        with tempfile.TemporaryDirectory() as temp_dir:
-            shard_pattern = f"{temp_dir}/shard-%06d.tar"
-            written_files: list[str] = []
-
-            # Track written files via custom post callback
-            def track_file(fname: str) -> None:
-                written_files.append(fname)
-
-            with wds.writer.ShardWriter(
-                shard_pattern,
-                maxcount=maxcount,
-                maxsize=maxsize,
-                post=track_file,
-                **kwargs,
-            ) as sink:
-                for sample in ds.ordered(batch_size=None):
-                    sink.write(sample.as_wds)
-
-            if not written_files:
-                raise RuntimeError("No shards written")
-
-            # Upload each shard as a blob
-            for shard_path in written_files:
-                with open(shard_path, "rb") as f:
-                    shard_data = f.read()
-
-                blob_ref = self.client.upload_blob(
-                    shard_data,
-                    mime_type="application/x-tar",
-                )
-
-                # Extract CID from blob reference
-                cid = blob_ref["ref"]["$link"]
-                at_uri = f"at://{did}/blob/{cid}"
-                blob_urls.append(at_uri)
-
-        return blob_urls
+        return ShardUploadResult(blob_urls, blob_refs)
 
     def read_url(self, url: str) -> str:
         """Resolve an AT URI blob reference to an HTTP URL.
@@ -200,4 +198,9 @@
         return BlobSource(blob_refs=blob_refs)
 
 
-__all__ = [
+__all__ = [
+    "PDS_BLOB_LIMIT_BYTES",
+    "PDS_TOTAL_DATASET_LIMIT_BYTES",
+    "PDSBlobStore",
+    "ShardUploadResult",
+]
```
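With this change, `write_shards()` returns a `ShardUploadResult` instead of a plain list. A short sketch of how the result behaves, using made-up AT URIs and blob reference values purely for illustration:

```python
from atdata.atmosphere.store import ShardUploadResult

# Placeholder values; real URIs and blob refs come from Atmosphere.upload_blob().
urls = ["at://did:plc:example/blob/bafyexamplecid"]
refs = [{"ref": {"$link": "bafyexamplecid"}, "mimeType": "application/x-tar"}]

result = ShardUploadResult(urls, refs)

assert list(result) == urls                 # behaves as a list[str] of AT URIs
assert result.blob_refs[0]["ref"]["$link"] == "bafyexamplecid"
```

Keeping the list behavior preserves the `AbstractDataStore` return contract, while the extra `blob_refs` attribute lets callers build `storageBlobs` records without re-fetching blob metadata.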