atdata-0.3.1b1-py3-none-any.whl → atdata-0.3.2b1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +2 -0
- atdata/_hf_api.py +13 -0
- atdata/_logging.py +43 -0
- atdata/_protocols.py +18 -1
- atdata/_sources.py +24 -4
- atdata/atmosphere/__init__.py +48 -10
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +71 -243
- atdata/atmosphere/lens.py +49 -41
- atdata/atmosphere/records.py +282 -90
- atdata/atmosphere/schema.py +78 -50
- atdata/atmosphere/store.py +62 -59
- atdata/dataset.py +201 -135
- atdata/index/_entry.py +6 -2
- atdata/index/_index.py +396 -109
- atdata/lexicons/__init__.py +9 -3
- atdata/lexicons/ac.foundation.dataset.lens.json +2 -0
- atdata/lexicons/ac.foundation.dataset.record.json +22 -1
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +26 -4
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +1 -1
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/repository.py +59 -9
- atdata/stores/_disk.py +19 -11
- atdata/stores/_s3.py +134 -112
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +1 -1
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/RECORD +37 -33
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/__init__.py
CHANGED
@@ -90,6 +90,7 @@ from ._schema_codec import (
 from ._logging import (
     configure_logging as configure_logging,
     get_logger as get_logger,
+    log_operation as log_operation,
 )
 
 from .repository import (
@@ -121,6 +122,7 @@ from .manifest import (
     ManifestWriter as ManifestWriter,
     QueryExecutor as QueryExecutor,
     SampleLocation as SampleLocation,
+    query_fields as query_fields,
 )
 
 # ATProto integration (lazy import to avoid requiring atproto package)
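Both new re-exports become importable from the package root after this change. A minimal sketch, assuming the default logger needs no extra configuration (the query_fields call signature is not shown in this diff, so only its import is illustrated; log_operation is defined in the _logging.py hunk further down):

    from atdata import log_operation, query_fields  # both newly re-exported in 0.3.2b1

    # log_operation is a context manager (see atdata/_logging.py below)
    with log_operation("demo", items=3):
        pass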
atdata/_hf_api.py
CHANGED
@@ -714,12 +714,23 @@ def load_dataset(
         >>> index = Index()
         >>> ds = load_dataset("@local/my-dataset", index=index, split="train")
     """
+    from ._logging import get_logger
+
+    log = get_logger()
+    log.info(
+        "load_dataset: path=%s, split=%s, sample_type=%s",
+        path,
+        split,
+        sample_type.__name__ if sample_type is not None else "None",
+    )
+
     # Handle @handle/dataset indexed path resolution
     if _is_indexed_path(path):
         if index is None:
             index = get_default_index()
 
         source, schema_ref = _resolve_indexed_path(path, index)
+        log.debug("load_dataset: resolved indexed path, schema_ref=%s", schema_ref)
 
     # Resolve sample_type from schema if not provided
     resolved_type: Type = (
@@ -746,6 +757,8 @@ def load_dataset(
     if not splits_shards:
         raise FileNotFoundError(f"No data files found at path: {path}")
 
+    log.debug("load_dataset: resolved %d split(s) from path", len(splits_shards))
+
     # Build Dataset for each split
     datasets: dict[str, Dataset] = {}
     for split_name, shards in splits_shards.items():
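The call below mirrors the docstring example already present in load_dataset; the top-level import path is assumed for illustration, and the commented lines sketch roughly what the new log.info/log.debug calls emit once a logger is configured:

    from atdata import Index, load_dataset  # top-level import path assumed

    index = Index()
    ds = load_dataset("@local/my-dataset", index=index, split="train")
    # With a logger configured, 0.3.2b1 reports roughly:
    #   load_dataset: path=@local/my-dataset, split=train, sample_type=None
    #   load_dataset: resolved indexed path, schema_ref=...
    #   load_dataset: resolved 1 split(s) from path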
atdata/_logging.py
CHANGED
@@ -22,7 +22,10 @@ custom logger implementations.
 
 from __future__ import annotations
 
+import contextlib
 import logging
+import time
+from collections.abc import Generator
 from typing import Any, Protocol, runtime_checkable
 
 
@@ -68,3 +71,43 @@ def get_logger() -> LoggerProtocol:
     whatever was last set via :func:`configure_logging`.
     """
     return _logger
+
+
+@contextlib.contextmanager
+def log_operation(op_name: str, **context: Any) -> Generator[None, None, None]:
+    """Log the start, completion, and duration of an operation.
+
+    Emits an ``info`` message on entry and on successful completion
+    (with elapsed time), or an ``error`` message if an exception
+    propagates out.
+
+    Args:
+        op_name: Short label for the operation (e.g. ``"write_samples"``).
+        **context: Arbitrary key-value pairs included in every log message.
+
+    Examples:
+        >>> with log_operation("write_samples", shard_count=10):
+        ...     do_work()
+    """
+    log = get_logger()
+    ctx_str = ", ".join(f"{k}={v}" for k, v in context.items())
+    if ctx_str:
+        log.info("%s: started (%s)", op_name, ctx_str)
+    else:
+        log.info("%s: started", op_name)
+    t0 = time.monotonic()
+    try:
+        yield
+    except Exception:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.error("%s: failed after %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.error("%s: failed after %.2fs", op_name, elapsed)
+        raise
+    else:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.info("%s: completed in %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.info("%s: completed in %.2fs", op_name, elapsed)
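A short usage sketch of the new context manager; the logged lines follow the format strings above (the elapsed times are illustrative):

    import time

    from atdata import log_operation  # re-exported at the package root in 0.3.2b1

    with log_operation("write_samples", shard_count=10):
        time.sleep(0.1)  # stand-in for real work
    # Emitted via the configured logger:
    #   write_samples: started (shard_count=10)
    #   write_samples: completed in 0.10s (shard_count=10)
    #
    # An exception inside the block is logged as an error and re-raised:
    #   write_samples: failed after 0.10s (shard_count=10)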
atdata/_protocols.py
CHANGED
@@ -151,12 +151,14 @@ class AbstractIndex(Protocol):
 
     # Dataset operations
 
-    def
+    def write_samples(
         self,
         samples: Iterable,
         *,
         name: str,
         schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
         **kwargs,
     ) -> IndexEntry:
         """Write samples and create an index entry in one step.
@@ -164,10 +166,15 @@ class AbstractIndex(Protocol):
         Serializes samples to WebDataset tar files, stores them via the
         appropriate backend, and creates an index entry.
 
+        For atmosphere targets, data is uploaded as PDS blobs by default
+        with size guards (50 MB per shard, 1 GB total).
+
         Args:
             samples: Iterable of Packable samples. Must be non-empty.
            name: Dataset name, optionally prefixed with target backend.
            schema_ref: Optional schema reference.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
            **kwargs: Backend-specific options (maxcount, description, etc.).
 
         Returns:
@@ -181,14 +188,24 @@ class AbstractIndex(Protocol):
         *,
         name: str,
         schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
+        copy: bool = False,
         **kwargs,
     ) -> IndexEntry:
         """Register an existing dataset in the index.
 
+        For atmosphere targets, local sources are uploaded via
+        *data_store* (defaults to PDS blobs). Credentialed sources
+        require ``copy=True``.
+
         Args:
            ds: The Dataset to register.
            name: Human-readable name.
            schema_ref: Explicit schema ref; auto-published if ``None``.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
+            copy: Copy data to destination store even for remote sources.
            **kwargs: Backend-specific options.
         """
         ...
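A hedged sketch of the widened write_samples call; index stands for any concrete AbstractIndex implementation and samples for a non-empty iterable of Packable samples, neither of which is constructed here. The second method in the hunk (registering an existing Dataset) gains the same data_store/force keywords plus copy, but its name falls outside the diff context above, so it is not called by name below:

    from atdata._protocols import AbstractIndex  # protocol shown above

    def publish(index: AbstractIndex, samples) -> None:
        # samples: non-empty iterable of Packable samples (see the docstring above).
        entry = index.write_samples(
            samples,
            name="my-dataset",
            data_store=None,  # default: PDS blobs for atmosphere targets
            force=False,      # True bypasses the 50 MB/shard and 1 GB total guards
        )
        print(entry)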
atdata/_sources.py
CHANGED
@@ -64,10 +64,20 @@ class URLSource:
         """Expand brace pattern and return list of shard URLs."""
         return list(braceexpand.braceexpand(self.url))
 
-    # Legacy alias for backwards compatibility
     @property
     def shard_list(self) -> list[str]:
-        """Expand brace pattern and return list of shard URLs
+        """Expand brace pattern and return list of shard URLs.
+
+        .. deprecated::
+            Use :meth:`list_shards` instead.
+        """
+        import warnings
+
+        warnings.warn(
+            "shard_list is deprecated, use list_shards()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self.list_shards()
 
     @property
@@ -178,10 +188,20 @@ class S3Source:
         """Return list of S3 URIs for the shards."""
         return [f"s3://{self.bucket}/{key}" for key in self.keys]
 
-    # Legacy alias for backwards compatibility
     @property
     def shard_list(self) -> list[str]:
-        """Return list of S3 URIs for the shards
+        """Return list of S3 URIs for the shards.
+
+        .. deprecated::
+            Use :meth:`list_shards` instead.
+        """
+        import warnings
+
+        warnings.warn(
+            "shard_list is deprecated, use list_shards()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self.list_shards()
 
     @property
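A small sketch of the behavioural change: reading the deprecated shard_list property now emits a DeprecationWarning, while list_shards() stays silent. Constructing URLSource directly from a brace-pattern url is assumed here for illustration:

    import warnings

    from atdata._sources import URLSource  # module shown above

    src = URLSource(url="https://example.com/shard-{000000..000003}.tar")  # constructor assumed

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        urls = src.shard_list  # deprecated alias, still returns the shard URLs
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    urls = src.list_shards()  # preferred replacement
    print(len(urls))          # 4 shards after brace expansion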
atdata/atmosphere/__init__.py
CHANGED
@@ -34,11 +34,23 @@ from .schema import SchemaPublisher, SchemaLoader
 from .records import DatasetPublisher, DatasetLoader
 from .lens import LensPublisher, LensLoader
 from .store import PDSBlobStore
-from ._types import
-
-
-
-
+from ._types import AtUri, LEXICON_NAMESPACE
+from ._lexicon_types import (
+    LexSchemaRecord,
+    LexDatasetRecord,
+    LexLensRecord,
+    LexCodeReference,
+    JsonSchemaFormat,
+    StorageHttp,
+    StorageS3,
+    StorageBlobs,
+    ShardChecksum,
+    HttpShardEntry,
+    S3ShardEntry,
+    BlobEntry,
+    DatasetSize,
+    StorageUnion,
+    storage_from_record,
 )
 
 if TYPE_CHECKING:
@@ -70,11 +82,23 @@ class AtmosphereIndexEntry:
 
     @property
     def data_urls(self) -> list[str]:
-        """WebDataset URLs from
+        """WebDataset URLs from storage.
+
+        Handles storageHttp (shard URLs), storageS3 (s3:// URLs),
+        storageExternal (legacy), and storageBlobs (PDS blob URLs).
+        """
         storage = self._record.get("storage", {})
         storage_type = storage.get("$type", "")
+        if "storageHttp" in storage_type:
+            return [s["url"] for s in storage.get("shards", [])]
+        if "storageS3" in storage_type:
+            bucket = storage.get("bucket", "")
+            return [f"s3://{bucket}/{s['key']}" for s in storage.get("shards", [])]
         if "storageExternal" in storage_type:
             return storage.get("urls", [])
+        if "storageBlobs" in storage_type:
+            # Blob URLs must be resolved via PDS; return empty for now
+            return []
         return []
 
     @property
@@ -332,9 +356,23 @@ __all__ = [
     # Lens operations
     "LensPublisher",
     "LensLoader",
-    #
+    # Core types
     "AtUri",
-    "
-
-    "
+    "LEXICON_NAMESPACE",
+    # Lexicon-mirror types (Tier 1)
+    "LexSchemaRecord",
+    "LexDatasetRecord",
+    "LexLensRecord",
+    "LexCodeReference",
+    "JsonSchemaFormat",
+    "StorageHttp",
+    "StorageS3",
+    "StorageBlobs",
+    "StorageUnion",
+    "storage_from_record",
+    "ShardChecksum",
+    "HttpShardEntry",
+    "S3ShardEntry",
+    "BlobEntry",
+    "DatasetSize",
 ]