atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +11 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +46 -1
- atdata/_logging.py +43 -0
- atdata/_protocols.py +81 -182
- atdata/_schema_codec.py +2 -2
- atdata/_sources.py +24 -4
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +60 -21
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +73 -245
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +60 -53
- atdata/atmosphere/records.py +291 -100
- atdata/atmosphere/schema.py +91 -65
- atdata/atmosphere/store.py +68 -66
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +266 -47
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_entry.py +6 -2
- atdata/{local → index}/_index.py +617 -72
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +127 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
- atdata/lexicons/ac.foundation.dataset.record.json +117 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/promote.py +14 -10
- atdata/repository.py +66 -16
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +131 -0
- atdata/{local → stores}/_s3.py +134 -112
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
- atdata-0.3.2b1.dist-info/RECORD +71 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/__init__.py
CHANGED
@@ -44,6 +44,7 @@ from .dataset import (
     SampleBatch as SampleBatch,
     Dataset as Dataset,
     packable as packable,
+    write_samples as write_samples,
 )
 
 from .lens import (
@@ -89,6 +90,7 @@ from ._schema_codec import (
 from ._logging import (
     configure_logging as configure_logging,
     get_logger as get_logger,
+    log_operation as log_operation,
 )
 
 from .repository import (
@@ -96,6 +98,14 @@ from .repository import (
     create_repository as create_repository,
 )
 
+from .index import (
+    Index as Index,
+)
+
+from .stores import (
+    LocalDiskStore as LocalDiskStore,
+)
+
 from ._cid import (
     generate_cid as generate_cid,
     verify_cid as verify_cid,
@@ -112,6 +122,7 @@ from .manifest import (
     ManifestWriter as ManifestWriter,
     QueryExecutor as QueryExecutor,
     SampleLocation as SampleLocation,
+    query_fields as query_fields,
 )
 
 # ATProto integration (lazy import to avoid requiring atproto package)
atdata/_cid.py
CHANGED
@@ -116,29 +116,8 @@ def verify_cid(cid: str, data: Any) -> bool:
     return cid == expected_cid
 
 
-def parse_cid(cid: str) -> dict:
-    """Parse a CID string into its components.
-
-    Args:
-        cid: CID string to parse.
-
-    Returns:
-        Dictionary with 'version', 'codec', and 'hash' keys.
-        The 'hash' value is itself a dict with 'code', 'size', and 'digest'.
-
-    Examples:
-        >>> info = parse_cid('bafyrei...')
-        >>> info['version']
-        1
-        >>> info['codec']
-        113  # 0x71 = dag-cbor
-    """
-    return libipld.decode_cid(cid)
-
-
 __all__ = [
     "generate_cid",
     "generate_cid_from_bytes",
     "verify_cid",
-    "parse_cid",
 ]
atdata/_helpers.py
CHANGED
@@ -65,10 +65,22 @@ def bytes_to_array(b: bytes) -> np.ndarray:
         return np.load(BytesIO(b), allow_pickle=True)
 
     # Compact format: dtype_len(1B) + dtype_str + ndim(1B) + shape(ndim×8B) + data
+    if len(b) < 2:
+        raise ValueError(f"Array buffer too short ({len(b)} bytes): need at least 2")
     dlen = b[0]
+    min_header = 2 + dlen  # dtype_len + dtype_str + ndim
+    if len(b) < min_header:
+        raise ValueError(
+            f"Array buffer too short ({len(b)} bytes): need at least {min_header} for header"
+        )
     dtype = np.dtype(b[1 : 1 + dlen].decode())
     ndim = b[1 + dlen]
     offset = 2 + dlen
+    min_with_shape = offset + ndim * 8
+    if len(b) < min_with_shape:
+        raise ValueError(
+            f"Array buffer too short ({len(b)} bytes): need at least {min_with_shape} for shape"
+        )
     shape = struct.unpack_from(f"<{ndim}q", b, offset)
     offset += ndim * 8
     return np.frombuffer(b, dtype=dtype, offset=offset).reshape(shape).copy()
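
The new guards validate the compact array encoding described in the comment (dtype_len byte, dtype string, ndim byte, little-endian int64 shape, raw data). A hedged sketch of that layout and the new failure mode; bytes_to_array is a private helper, and the assumption here is that a buffer without the .npy magic takes the compact branch:

    # Hand-build a buffer in the compact layout, then truncate it to trigger
    # the new ValueError guards. Illustrative only: atdata._helpers is private.
    import struct

    import numpy as np

    from atdata._helpers import bytes_to_array

    arr = np.arange(6, dtype=np.float32).reshape(2, 3)
    dtype_str = arr.dtype.str.encode()              # e.g. b"<f4"
    buf = (
        bytes([len(dtype_str)])                     # dtype_len (1 byte)
        + dtype_str                                 # dtype string
        + bytes([arr.ndim])                         # ndim (1 byte)
        + struct.pack(f"<{arr.ndim}q", *arr.shape)  # shape, little-endian int64
        + arr.tobytes()                             # raw data
    )
    assert np.array_equal(bytes_to_array(buf), arr)

    try:
        bytes_to_array(buf[:3])                     # header cut off mid-dtype
    except ValueError as exc:
        print(exc)  # "Array buffer too short (3 bytes): need at least 5 for header"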
atdata/_hf_api.py
CHANGED
@@ -32,6 +32,7 @@ import re
 import threading
 from pathlib import Path
 from typing import (
+    Any,
     TYPE_CHECKING,
     Generic,
     Mapping,
@@ -65,7 +66,7 @@ def get_default_index() -> "Index": # noqa: F821
     """Get or create the module-level default Index.
 
     The default Index uses Redis for local storage (backwards-compatible
-    default) and an anonymous
+    default) and an anonymous Atmosphere for read-only public data
     resolution.
 
     The default is created lazily on first access and cached for the
@@ -189,6 +190,37 @@ class DatasetDict(Generic[ST], dict):
         """
         return {name: len(ds.list_shards()) for name, ds in self.items()}
 
+    # Methods proxied to the sole Dataset when only one split exists.
+    _DATASET_METHODS = frozenset(
+        {
+            "ordered",
+            "shuffled",
+            "as_type",
+            "list_shards",
+            "head",
+        }
+    )
+
+    def __getattr__(self, name: str) -> Any:
+        """Proxy common Dataset methods when this dict has exactly one split.
+
+        When a ``DatasetDict`` contains a single split, calling iteration
+        methods like ``.ordered()`` or ``.shuffled()`` is forwarded to the
+        contained ``Dataset`` for convenience. Multi-split dicts raise
+        ``AttributeError`` with a hint to select a split explicitly.
+        """
+        if name in self._DATASET_METHODS:
+            if len(self) == 1:
+                return getattr(next(iter(self.values())), name)
+            splits = ", ".join(f"'{k}'" for k in self.keys())
+            raise AttributeError(
+                f"'{type(self).__name__}' has {len(self)} splits ({splits}). "
+                f"Select one first, e.g. ds_dict['{next(iter(self.keys()))}'].{name}()"
+            )
+        raise AttributeError(
+            f"'{type(self).__name__}' object has no attribute '{name}'"
+        )
+
 
 ##
 # Path resolution utilities
@@ -682,12 +714,23 @@ def load_dataset(
         >>> index = Index()
         >>> ds = load_dataset("@local/my-dataset", index=index, split="train")
     """
+    from ._logging import get_logger
+
+    log = get_logger()
+    log.info(
+        "load_dataset: path=%s, split=%s, sample_type=%s",
+        path,
+        split,
+        sample_type.__name__ if sample_type is not None else "None",
+    )
+
     # Handle @handle/dataset indexed path resolution
     if _is_indexed_path(path):
         if index is None:
             index = get_default_index()
 
         source, schema_ref = _resolve_indexed_path(path, index)
+        log.debug("load_dataset: resolved indexed path, schema_ref=%s", schema_ref)
 
         # Resolve sample_type from schema if not provided
         resolved_type: Type = (
@@ -714,6 +757,8 @@ def load_dataset(
     if not splits_shards:
         raise FileNotFoundError(f"No data files found at path: {path}")
 
+    log.debug("load_dataset: resolved %d split(s) from path", len(splits_shards))
+
     # Build Dataset for each split
     datasets: dict[str, Dataset] = {}
     for split_name, shards in splits_shards.items():
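
The single-split proxy changes how a loaded DatasetDict is used. A hedged sketch, assuming load_dataset is re-exported at the package root and returns a DatasetDict when split is omitted (the proxy logic itself is verbatim in the diff); the dataset and split names are made up:

    from atdata import load_dataset  # assumed re-export; defined in atdata/_hf_api.py

    ds_dict = load_dataset("@local/my-dataset")  # hypothetical dataset name

    if len(ds_dict) == 1:
        # One split: proxied methods (.ordered(), .shuffled(), .head(), ...)
        # forward to the single contained Dataset.
        for sample in ds_dict.ordered():
            print(sample)
    else:
        # Several splits: the proxy raises AttributeError with a hint,
        # so select a split explicitly.
        for sample in ds_dict["train"].ordered():
            print(sample)

Separately, the get_logger() calls above mean load_dataset now reports path and split resolution at info/debug level through whatever logger configure_logging installed.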
atdata/_logging.py
CHANGED
@@ -22,7 +22,10 @@ custom logger implementations.
 
 from __future__ import annotations
 
+import contextlib
 import logging
+import time
+from collections.abc import Generator
 from typing import Any, Protocol, runtime_checkable
 
 
@@ -68,3 +71,43 @@ def get_logger() -> LoggerProtocol:
     whatever was last set via :func:`configure_logging`.
     """
     return _logger
+
+
+@contextlib.contextmanager
+def log_operation(op_name: str, **context: Any) -> Generator[None, None, None]:
+    """Log the start, completion, and duration of an operation.
+
+    Emits an ``info`` message on entry and on successful completion
+    (with elapsed time), or an ``error`` message if an exception
+    propagates out.
+
+    Args:
+        op_name: Short label for the operation (e.g. ``"write_samples"``).
+        **context: Arbitrary key-value pairs included in every log message.
+
+    Examples:
+        >>> with log_operation("write_samples", shard_count=10):
+        ...     do_work()
+    """
+    log = get_logger()
+    ctx_str = ", ".join(f"{k}={v}" for k, v in context.items())
+    if ctx_str:
+        log.info("%s: started (%s)", op_name, ctx_str)
+    else:
+        log.info("%s: started", op_name)
+    t0 = time.monotonic()
+    try:
+        yield
+    except Exception:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.error("%s: failed after %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.error("%s: failed after %.2fs", op_name, elapsed)
+        raise
+    else:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.info("%s: completed in %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.info("%s: completed in %.2fs", op_name, elapsed)
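
A hedged usage sketch for the new log_operation context manager, expanding the docstring example above; calling configure_logging with no arguments is an assumption, since its parameters are not part of this diff:

    import time

    from atdata import configure_logging, log_operation

    configure_logging()  # assumed no-arg default setup

    with log_operation("toy_operation", shard_count=10):
        time.sleep(0.1)
    # Logs roughly:
    #   toy_operation: started (shard_count=10)
    #   toy_operation: completed in 0.10s (shard_count=10)

    try:
        with log_operation("toy_operation"):
            raise RuntimeError("boom")
    except RuntimeError:
        pass  # logged as "toy_operation: failed after 0.00s" and re-raised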
atdata/_protocols.py
CHANGED
@@ -1,37 +1,25 @@
 """Protocol definitions for atdata index and storage abstractions.
 
-
-
-
-The key insight is that both local and atmosphere implementations solve the
-same problem: indexed dataset storage with external data URLs. These protocols
-formalize that common interface.
-
-Note:
-    Protocol methods use ``...`` (Ellipsis) as the body per PEP 544. This is
-    the standard Python syntax for Protocol definitions - these are interface
-    specifications, not stub implementations. Concrete classes (Index,
-    AtmosphereIndex, etc.) provide the actual implementations.
+Defines the abstract protocols that enable interchangeable index backends
+(local SQLite/Redis vs ATProto PDS) and data stores (S3, local disk, PDS blobs).
 
 Protocols:
-    Packable: Structural interface for packable sample types
+    Packable: Structural interface for packable sample types
     IndexEntry: Common interface for dataset index entries
     AbstractIndex: Protocol for index operations (schemas, datasets, lenses)
     AbstractDataStore: Protocol for data storage operations
+    DataSource: Protocol for streaming shard data
 
 Examples:
     >>> def process_datasets(index: AbstractIndex) -> None:
     ...     for entry in index.list_datasets():
     ...         print(f"{entry.name}: {entry.data_urls}")
-    ...
-    >>> # Works with either Index or AtmosphereIndex
-    >>> process_datasets(local_index)
-    >>> process_datasets(atmosphere_index)
 """
 
 from typing import (
     IO,
     Any,
+    Iterable,
     Iterator,
     Optional,
     Protocol,
@@ -115,7 +103,7 @@ class IndexEntry(Protocol):
         """Schema reference string.
 
         Local: ``local://schemas/{module.Class}@{version}``
-        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.
+        Atmosphere: ``at://did:plc:.../ac.foundation.dataset.schema/...``
         """
         ...
 
@@ -137,32 +125,16 @@ class IndexEntry(Protocol):
 
 
 class AbstractIndex(Protocol):
-    """Protocol for index operations
-
-    This protocol defines the common interface for managing dataset metadata:
-    - Publishing and retrieving schemas
-    - Inserting and listing datasets
-    - (Future) Publishing and retrieving lenses
-
-    A single index can hold datasets of many different sample types. The sample
-    type is tracked via schema references, not as a generic parameter on the index.
+    """Protocol for index operations — implemented by Index and AtmosphereIndex.
 
-
-
-
-    If present, ``load_dataset`` will use it for S3 credential resolution.
+    Manages dataset metadata: publishing/retrieving schemas, inserting/listing
+    datasets. A single index holds datasets of many sample types, tracked via
+    schema references.
 
     Examples:
         >>> def publish_and_list(index: AbstractIndex) -> None:
-        ...
-        ...     schema1 = index.publish_schema(ImageSample, version="1.0.0")
-        ...     schema2 = index.publish_schema(TextSample, version="1.0.0")
-        ...
-        ...     # Insert datasets of different types
+        ...     index.publish_schema(ImageSample, version="1.0.0")
         ...     index.insert_dataset(image_ds, name="images")
-        ...     index.insert_dataset(text_ds, name="texts")
-        ...
-        ...     # List all datasets (mixed types)
         ...     for entry in index.list_datasets():
         ...         print(f"{entry.name} -> {entry.schema_ref}")
     """
@@ -171,55 +143,75 @@ class AbstractIndex(Protocol):
     def data_store(self) -> Optional["AbstractDataStore"]:
         """Optional data store for reading/writing shards.
 
-        If present, ``load_dataset``
-
-
-        Returns:
-            AbstractDataStore instance, or None if this index doesn't have
-            an associated data store.
-
-        Note:
-            Not all index implementations provide a data_store. Use
-            ``hasattr(index, 'data_store') and index.data_store is not None``
-            for safe access.
+        If present, ``load_dataset`` uses it for credential resolution.
+        Not all implementations provide a data_store; check with
+        ``getattr(index, 'data_store', None)``.
         """
         ...
 
     # Dataset operations
 
-    def
+    def write_samples(
         self,
-
+        samples: Iterable,
         *,
         name: str,
         schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
         **kwargs,
     ) -> IndexEntry:
-        """
+        """Write samples and create an index entry in one step.
+
+        Serializes samples to WebDataset tar files, stores them via the
+        appropriate backend, and creates an index entry.
 
-
-
+        For atmosphere targets, data is uploaded as PDS blobs by default
+        with size guards (50 MB per shard, 1 GB total).
 
         Args:
-
-            name:
-            schema_ref: Optional
-
-
+            samples: Iterable of Packable samples. Must be non-empty.
+            name: Dataset name, optionally prefixed with target backend.
+            schema_ref: Optional schema reference.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
+            **kwargs: Backend-specific options (maxcount, description, etc.).
 
         Returns:
-            IndexEntry for the
+            IndexEntry for the created dataset.
         """
         ...
 
-    def
-
+    def insert_dataset(
+        self,
+        ds: "Dataset",
+        *,
+        name: str,
+        schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
+        copy: bool = False,
+        **kwargs,
+    ) -> IndexEntry:
+        """Register an existing dataset in the index.
+
+        For atmosphere targets, local sources are uploaded via
+        *data_store* (defaults to PDS blobs). Credentialed sources
+        require ``copy=True``.
 
         Args:
-
+            ds: The Dataset to register.
+            name: Human-readable name.
+            schema_ref: Explicit schema ref; auto-published if ``None``.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
+            copy: Copy data to destination store even for remote sources.
+            **kwargs: Backend-specific options.
+        """
+        ...
 
-
-
+    def get_dataset(self, ref: str) -> IndexEntry:
+        """Get a dataset entry by name or reference.
 
         Raises:
             KeyError: If dataset not found.
@@ -242,33 +234,19 @@ class AbstractIndex(Protocol):
     ) -> str:
         """Publish a schema for a sample type.
 
-        The sample_type is accepted as ``type`` rather than ``Type[Packable]`` to
-        support ``@packable``-decorated classes, which satisfy the Packable protocol
-        at runtime but cannot be statically verified by type checkers.
-
         Args:
-            sample_type: A Packable type (
-
-
-            **kwargs: Additional backend-specific options.
+            sample_type: A Packable type (``@packable``-decorated or subclass).
+            version: Semantic version string.
+            **kwargs: Backend-specific options.
 
         Returns:
-            Schema reference string
-            - Local: 'local://schemas/{module.Class}@{version}'
-            - Atmosphere: 'at://did:plc:.../ac.foundation.dataset.sampleSchema/...'
+            Schema reference string (``local://...`` or ``at://...``).
         """
         ...
 
     def get_schema(self, ref: str) -> dict:
         """Get a schema record by reference.
 
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            Schema record as a dictionary with fields like 'name', 'version',
-            'fields', etc.
-
         Raises:
             KeyError: If schema not found.
         """
@@ -280,30 +258,15 @@ class AbstractIndex(Protocol):
     def list_schemas(self) -> list[dict]: ...
 
     def decode_schema(self, ref: str) -> Type[Packable]:
-        """Reconstruct a
-
-        This method enables loading datasets without knowing the sample type
-        ahead of time. The index retrieves the schema record and dynamically
-        generates a Packable class matching the schema definition.
-
-        Args:
-            ref: Schema reference string (local:// or at://).
-
-        Returns:
-            A dynamically generated Packable class with fields matching
-            the schema definition. The class can be used with
-            ``Dataset[T]`` to load and iterate over samples.
+        """Reconstruct a Packable type from a stored schema.
 
         Raises:
             KeyError: If schema not found.
-            ValueError: If schema
+            ValueError: If schema has unsupported field types.
 
         Examples:
-            >>> entry = index.get_dataset("my-dataset")
             >>> SampleType = index.decode_schema(entry.schema_ref)
             >>> ds = Dataset[SampleType](entry.data_urls[0])
-            >>> for sample in ds.ordered():
-            ...     print(sample)  # sample is instance of SampleType
         """
         ...
 
@@ -313,21 +276,14 @@ class AbstractIndex(Protocol):
 
 
 class AbstractDataStore(Protocol):
-    """Protocol for data storage
+    """Protocol for data storage backends (S3, local disk, PDS blobs).
 
-
-
-    - PDSBlobStore: ATProto PDS blob storage (future)
-
-    The separation of index (metadata) from data store (actual files) allows
-    flexible deployment: local index with S3 storage, atmosphere index with
-    S3 storage, or atmosphere index with PDS blobs.
+    Separates index (metadata) from data store (shard files), enabling
+    flexible deployment combinations.
 
     Examples:
         >>> store = S3DataStore(credentials, bucket="my-bucket")
        >>> urls = store.write_shards(dataset, prefix="training/v1")
-        >>> print(urls)
-        ['s3://my-bucket/training/v1/shard-000000.tar', ...]
     """
 
     def write_shards(
@@ -341,28 +297,16 @@ class AbstractDataStore(Protocol):
 
         Args:
             ds: The Dataset to write.
-            prefix: Path prefix
-            **kwargs: Backend-specific options (
+            prefix: Path prefix (e.g., ``'datasets/mnist/v1'``).
+            **kwargs: Backend-specific options (``maxcount``, ``maxsize``, etc.).
 
         Returns:
-            List of URLs
-            WebDataset or atdata.Dataset().
+            List of shard URLs suitable for ``atdata.Dataset()``.
         """
         ...
 
     def read_url(self, url: str) -> str:
-        """Resolve a storage URL for reading.
-
-        Some storage backends may need to transform URLs (e.g., signing S3 URLs
-        or resolving blob references). This method returns a URL that can be
-        used directly with WebDataset.
-
-        Args:
-            url: Storage URL to resolve.
-
-        Returns:
-            WebDataset-compatible URL for reading.
-        """
+        """Resolve a storage URL for reading (e.g., sign S3 URLs)."""
         ...
 
     def supports_streaming(self) -> bool: ...
@@ -374,77 +318,32 @@ class AbstractDataStore(Protocol):
 
 @runtime_checkable
 class DataSource(Protocol):
-    """Protocol for data sources that
+    """Protocol for data sources that stream shard data to Dataset.
 
-
-
-
-
-
-    The key method is ``shards()``, which yields (identifier, stream) pairs.
-    These are fed directly to WebDataset's tar_file_expander, bypassing URL
-    resolution entirely. This enables:
-    - Private S3 repos with credentials
-    - Custom endpoints (Cloudflare R2, MinIO)
-    - ATProto blob streaming
-    - Any other source that can provide file-like objects
+    Implementations (URLSource, S3Source, BlobSource) yield
+    ``(identifier, stream)`` pairs fed to WebDataset's tar expander,
+    bypassing URL resolution. This enables private S3, custom endpoints,
+    and ATProto blob streaming.
 
     Examples:
-        >>> source = S3Source(
-        ...     bucket="my-bucket",
-        ...     keys=["data-000.tar", "data-001.tar"],
-        ...     endpoint="https://r2.example.com",
-        ...     credentials=creds,
-        ... )
+        >>> source = S3Source(bucket="my-bucket", keys=["data-000.tar"])
         >>> ds = Dataset[MySample](source)
-        >>> for sample in ds.ordered():
-        ...     print(sample)
     """
 
     @property
     def shards(self) -> Iterator[tuple[str, IO[bytes]]]:
-        """Lazily yield (
-
-        The identifier is used for error messages and __url__ metadata.
-        The stream must be a file-like object that can be read by tarfile.
-
-        Yields:
-            Tuple of (shard_identifier, file_like_stream).
-
-        Examples:
-            >>> for shard_id, stream in source.shards:
-            ...     print(f"Processing {shard_id}")
-            ...     data = stream.read()
-        """
+        """Lazily yield ``(shard_id, stream)`` pairs for each shard."""
        ...
 
     def list_shards(self) -> list[str]:
-        """
-
-        Used for metadata queries like counting shards without actually
-        streaming data. Implementations should return identifiers that
-        match what shards would yield.
-
-        Returns:
-            List of shard identifier strings.
-        """
+        """Shard identifiers without opening streams."""
         ...
 
     def open_shard(self, shard_id: str) -> IO[bytes]:
-        """Open a single shard
-
-        This method enables random access to individual shards, which is
-        required for PyTorch DataLoader worker splitting. Each worker opens
-        only its assigned shards rather than iterating all shards.
-
-        Args:
-            shard_id: Shard identifier from list_shards().
-
-        Returns:
-            File-like stream for reading the shard.
+        """Open a single shard for random access (e.g., DataLoader splitting).
 
         Raises:
-            KeyError: If shard_id is not in list_shards()
+            KeyError: If *shard_id* is not in ``list_shards()``.
         """
         ...
 
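
Taken together, the slimmed-down protocol now reads as a concrete call surface. A hedged sketch of code written against AbstractIndex, using only the signatures visible in this diff; the @packable sample definition and the dataset name are assumptions for illustration:

    # Illustrative only: the write_samples/publish_schema signatures come from
    # the protocol above; how @packable classes are declared and constructed
    # is an assumption, and "demo" is a made-up dataset name.
    from typing import Iterable

    from atdata import Index, packable
    from atdata._protocols import AbstractIndex, IndexEntry


    @packable
    class TextSample:
        text: str


    def publish(index: AbstractIndex, samples: Iterable[TextSample]) -> IndexEntry:
        # Any AbstractIndex implementation works here (Index, AtmosphereIndex, ...).
        index.publish_schema(TextSample, version="1.0.0")
        return index.write_samples(samples, name="demo")


    entry = publish(Index(), [TextSample(text="hello")])
    print(entry.name, entry.schema_ref)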
atdata/_schema_codec.py
CHANGED
@@ -284,7 +284,7 @@ def generate_stub(schema: dict) -> str:
         String content for a .pyi stub file.
 
     Examples:
-        >>> schema = index.get_schema("atdata://local/
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
         >>> stub_content = generate_stub(schema.to_dict())
         >>> # Save to a stubs directory configured in your IDE
         >>> with open("stubs/my_sample.pyi", "w") as f:
@@ -360,7 +360,7 @@ def generate_module(schema: dict) -> str:
         String content for a .py module file.
 
     Examples:
-        >>> schema = index.get_schema("atdata://local/
+        >>> schema = index.get_schema("atdata://local/schema/MySample@1.0.0")
        >>> module_content = generate_module(schema.to_dict())
         >>> # The module can be imported after being saved
     """