atdata 0.2.2b1__py3-none-any.whl → 0.2.3b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +1 -1
- atdata/_cid.py +29 -35
- atdata/_helpers.py +7 -5
- atdata/_hf_api.py +48 -50
- atdata/_protocols.py +56 -71
- atdata/_schema_codec.py +33 -37
- atdata/_sources.py +57 -64
- atdata/_stub_manager.py +31 -26
- atdata/_type_utils.py +19 -5
- atdata/atmosphere/__init__.py +20 -23
- atdata/atmosphere/_types.py +11 -11
- atdata/atmosphere/client.py +11 -8
- atdata/atmosphere/lens.py +27 -30
- atdata/atmosphere/records.py +31 -37
- atdata/atmosphere/schema.py +33 -29
- atdata/atmosphere/store.py +16 -20
- atdata/cli/__init__.py +12 -3
- atdata/cli/diagnose.py +12 -8
- atdata/cli/local.py +4 -1
- atdata/dataset.py +284 -241
- atdata/lens.py +77 -82
- atdata/local.py +182 -169
- atdata/promote.py +18 -22
- {atdata-0.2.2b1.dist-info → atdata-0.2.3b1.dist-info}/METADATA +2 -1
- atdata-0.2.3b1.dist-info/RECORD +28 -0
- atdata-0.2.2b1.dist-info/RECORD +0 -28
- {atdata-0.2.2b1.dist-info → atdata-0.2.3b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.2.3b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.2.3b1.dist-info}/licenses/LICENSE +0 -0
atdata/__init__.py
CHANGED
atdata/_cid.py
CHANGED
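The hunks below regroup this module's doctest examples under an "Examples:" heading and wrap the raw-CID construction across lines. As context for the comments in those hunks (CIDv1 = version + codec + multihash; multihash = code + size + digest), here is a hedged, standard-library-only sketch of how such a CID string could be assembled. The constant values assume the usual multicodec table, and the package itself delegates the final encoding to libipld.encode_cid rather than this manual base32 step.

import base64
import hashlib

# Assumed multicodec values: CIDv1 = 0x01, dag-cbor = 0x71, sha2-256 = 0x12.
CID_VERSION_1 = 0x01
CODEC_DAG_CBOR = 0x71
HASH_SHA256 = 0x12
SHA256_SIZE = 32

def sketch_cid(dag_cbor_bytes: bytes) -> str:
    # CIDv1 = version + codec + multihash; multihash = code + size + digest.
    digest = hashlib.sha256(dag_cbor_bytes).digest()
    raw = bytes([CID_VERSION_1, CODEC_DAG_CBOR, HASH_SHA256, SHA256_SIZE]) + digest
    # Base32 multibase: 'b' prefix, lowercase RFC 4648 alphabet, no padding.
    return "b" + base64.b32encode(raw).decode("ascii").lower().rstrip("=")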
@@ -12,13 +12,11 @@ The CIDs generated here use:
 This ensures compatibility with ATProto's CID requirements and enables
 seamless promotion from local storage to atmosphere (ATProto network).

-
-
-
-
-
->>> print(cid)
-bafyreihffx5a2e7k6r5zqgp5iwpjqr2gfyheqhzqtlxagvqjqyxzqpzqaa
+Examples:
+>>> schema = {"name": "ImageSample", "version": "1.0.0", "fields": [...]}
+>>> cid = generate_cid(schema)
+>>> print(cid)
+bafyreihffx5a2e7k6r5zqgp5iwpjqr2gfyheqhzqtlxagvqjqyxzqpzqaa
 """

 import hashlib

@@ -50,11 +48,9 @@ def generate_cid(data: Any) -> str:
 Raises:
 ValueError: If the data cannot be encoded as DAG-CBOR.

-
-
-
->>> generate_cid({"name": "test", "value": 42})
-'bafyrei...'
+Examples:
+>>> generate_cid({"name": "test", "value": 42})
+'bafyrei...'
 """
 # Encode data as DAG-CBOR
 try:

@@ -68,7 +64,9 @@ def generate_cid(data: Any) -> str:
 # Build raw CID bytes:
 # CIDv1 = version(1) + codec(dag-cbor) + multihash
 # Multihash = code(sha256) + size(32) + digest
-raw_cid_bytes =
+raw_cid_bytes = (
+bytes([CID_VERSION_1, CODEC_DAG_CBOR, HASH_SHA256, SHA256_SIZE]) + sha256_hash
+)

 # Encode to base32 multibase string
 return libipld.encode_cid(raw_cid_bytes)

@@ -86,14 +84,14 @@ def generate_cid_from_bytes(data_bytes: bytes) -> str:
 Returns:
 CIDv1 string in base32 multibase format.

-
-
-
->>> cbor_bytes = libipld.encode_dag_cbor({"key": "value"})
->>> cid = generate_cid_from_bytes(cbor_bytes)
+Examples:
+>>> cbor_bytes = libipld.encode_dag_cbor({"key": "value"})
+>>> cid = generate_cid_from_bytes(cbor_bytes)
 """
 sha256_hash = hashlib.sha256(data_bytes).digest()
-raw_cid_bytes =
+raw_cid_bytes = (
+bytes([CID_VERSION_1, CODEC_DAG_CBOR, HASH_SHA256, SHA256_SIZE]) + sha256_hash
+)
 return libipld.encode_cid(raw_cid_bytes)


@@ -107,14 +105,12 @@ def verify_cid(cid: str, data: Any) -> bool:
 Returns:
 True if the CID matches the data, False otherwise.

-
-
-
-
-
-
->>> verify_cid(cid, {"name": "different"})
-False
+Examples:
+>>> cid = generate_cid({"name": "test"})
+>>> verify_cid(cid, {"name": "test"})
+True
+>>> verify_cid(cid, {"name": "different"})
+False
 """
 expected_cid = generate_cid(data)
 return cid == expected_cid

@@ -130,14 +126,12 @@ def parse_cid(cid: str) -> dict:
 Dictionary with 'version', 'codec', and 'hash' keys.
 The 'hash' value is itself a dict with 'code', 'size', and 'digest'.

-
-
-
-
-
-
->>> info['codec']
-113 # 0x71 = dag-cbor
+Examples:
+>>> info = parse_cid('bafyrei...')
+>>> info['version']
+1
+>>> info['codec']
+113 # 0x71 = dag-cbor
 """
 return libipld.decode_cid(cid)

atdata/_helpers.py
CHANGED
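The two hunks below complete the truncated bodies of array_to_bytes and bytes_to_array, which round-trip arrays through numpy's native .npy format. A minimal stand-alone sketch of that round trip (plain numpy, no atdata imports):

from io import BytesIO

import numpy as np

arr = np.arange(6, dtype=np.float32).reshape(2, 3)

# Serialize: np.save writes the .npy header (dtype + shape) plus the data.
buf = BytesIO()
np.save(buf, arr, allow_pickle=True)
payload = buf.getvalue()  # bytes suitable for msgpack packing

# Deserialize: np.load restores an identical array from those bytes.
restored = np.load(BytesIO(payload), allow_pickle=True)
assert restored.dtype == arr.dtype and np.array_equal(restored, arr)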
@@ -22,7 +22,8 @@ import numpy as np

 ##

-
+
+def array_to_bytes(x: np.ndarray) -> bytes:
 """Convert a numpy array to bytes for msgpack serialization.

 Uses numpy's native ``save()`` format to preserve array dtype and shape.

@@ -37,10 +38,11 @@ def array_to_bytes( x: np.ndarray ) -> bytes:
 Uses ``allow_pickle=True`` to support object dtypes.
 """
 np_bytes = BytesIO()
-np.save(
+np.save(np_bytes, x, allow_pickle=True)
 return np_bytes.getvalue()

-
+
+def bytes_to_array(b: bytes) -> np.ndarray:
 """Convert serialized bytes back to a numpy array.

 Reverses the serialization performed by ``array_to_bytes()``.

@@ -54,5 +56,5 @@ def bytes_to_array( b: bytes ) -> np.ndarray:
 Note:
 Uses ``allow_pickle=True`` to support object dtypes.
 """
-np_bytes = BytesIO(
-return np.load(
+np_bytes = BytesIO(b)
+return np.load(np_bytes, allow_pickle=True)
atdata/_hf_api.py
CHANGED
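Besides the docstring rewrites, the hunks below wrap two long expressions and finish a truncated hasattr check in _resolve_indexed_path. That check is an optional-attribute probe: an index may or may not expose a data_store, so the code feature-tests for it rather than depending on a concrete class. A small illustrative sketch of the pattern (pick_store and MiniIndex are hypothetical names, not part of atdata):

from typing import Any, Optional

def pick_store(index: Any) -> Optional[Any]:
    # Use the index's data store only when one is actually attached.
    if hasattr(index, "data_store") and index.data_store is not None:
        return index.data_store
    return None

class MiniIndex:
    data_store = None  # no store configured

assert pick_store(MiniIndex()) is None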
@@ -9,23 +9,21 @@ Key differences from HuggingFace Datasets:
 - Built on WebDataset for efficient streaming of large datasets
 - No Arrow caching layer (WebDataset handles remote/local transparently)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
->>> ds_dict = load_dataset("path/to/{train,test}-*.tar", MyData)
->>> train_ds = ds_dict["train"]
+Examples:
+>>> import atdata
+>>> from atdata import load_dataset
+>>>
+>>> @atdata.packable
+... class MyData:
+... text: str
+... label: int
+>>>
+>>> # Load a single split
+>>> ds = load_dataset("path/to/train-{000000..000099}.tar", MyData, split="train")
+>>>
+>>> # Load all splits (returns DatasetDict)
+>>> ds_dict = load_dataset("path/to/{train,test}-*.tar", MyData)
+>>> train_ds = ds_dict["train"]
 """

 from __future__ import annotations

@@ -48,7 +46,6 @@ from ._protocols import DataSource

 if TYPE_CHECKING:
 from ._protocols import AbstractIndex
-from .local import S3DataStore

 ##
 # Type variables

@@ -70,17 +67,16 @@ class DatasetDict(Generic[ST], dict):
 Parameters:
 ST: The sample type for all datasets in this dict.

-
-
-
-
-
-
-
-
->>> for split_name, dataset in ds_dict.items():
-... print(f"{split_name}: {len(dataset.shard_list)} shards")
+Examples:
+>>> ds_dict = load_dataset("path/to/data", MyData)
+>>> train = ds_dict["train"]
+>>> test = ds_dict["test"]
+>>>
+>>> # Iterate over all splits
+>>> for split_name, dataset in ds_dict.items():
+... print(f"{split_name}: {len(dataset.shard_list)} shards")
 """
+
 # TODO The above has a line for "Parameters:" that should be "Type Parameters:"; this is a temporary fix for `quartodoc` auto-generation bugs.

 def __init__(

@@ -468,7 +464,7 @@ def _resolve_indexed_path(
 data_urls = entry.data_urls

 # Check if index has a data store
-if hasattr(index,
+if hasattr(index, "data_store") and index.data_store is not None:
 store = index.data_store

 # Import here to avoid circular imports at module level

@@ -613,25 +609,23 @@ def load_dataset(
 FileNotFoundError: If no data files are found at the path.
 KeyError: If dataset not found in index.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
->>> index = LocalIndex()
->>> ds = load_dataset("@local/my-dataset", index=index, split="train")
+Examples:
+>>> # Load without type - get DictSample for exploration
+>>> ds = load_dataset("./data/train.tar", split="train")
+>>> for sample in ds.ordered():
+... print(sample.keys()) # Explore fields
+... print(sample["text"]) # Dict-style access
+... print(sample.label) # Attribute access
+>>>
+>>> # Convert to typed schema
+>>> typed_ds = ds.as_type(TextData)
+>>>
+>>> # Or load with explicit type directly
+>>> train_ds = load_dataset("./data/train-*.tar", TextData, split="train")
+>>>
+>>> # Load from index with auto-type resolution
+>>> index = LocalIndex()
+>>> ds = load_dataset("@local/my-dataset", index=index, split="train")
 """
 # Handle @handle/dataset indexed path resolution
 if _is_indexed_path(path):

@@ -644,7 +638,9 @@
 source, schema_ref = _resolve_indexed_path(path, index)

 # Resolve sample_type from schema if not provided
-resolved_type: Type =
+resolved_type: Type = (
+sample_type if sample_type is not None else index.decode_schema(schema_ref)
+)

 # Create dataset from the resolved source (includes credentials if S3)
 ds = Dataset[resolved_type](source)

@@ -653,7 +649,9 @@
 # Indexed datasets are single-split by default
 return ds

-return DatasetDict(
+return DatasetDict(
+{"train": ds}, sample_type=resolved_type, streaming=streaming
+)

 # Use DictSample as default when no type specified
 resolved_type = sample_type if sample_type is not None else DictSample
atdata/_protocols.py
CHANGED
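The hunks below move this module's doctest examples under "Examples:" headings and drop an unused ClassVar import. The module's point, per its docstring, is structural typing: any object with the right methods satisfies a Protocol, whether it is a LocalIndex or an AtmosphereIndex. A minimal sketch of that idea using typing.Protocol (the SupportsListDatasets and MiniIndex names are illustrative, not atdata's API):

from typing import Iterator, Protocol

class SupportsListDatasets(Protocol):
    def list_datasets(self) -> Iterator[str]: ...

class MiniIndex:  # does not inherit from the protocol
    def list_datasets(self) -> Iterator[str]:
        yield "my-dataset"

def show(index: SupportsListDatasets) -> None:
    for name in index.list_datasets():
        print(name)

show(MiniIndex())  # accepted via structural (duck) typing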
@@ -19,22 +19,19 @@ Protocols:
 AbstractIndex: Protocol for index operations (schemas, datasets, lenses)
 AbstractDataStore: Protocol for data storage operations

-
-
-
-
-
-
-
-
->>> process_datasets(local_index)
->>> process_datasets(atmosphere_index)
+Examples:
+>>> def process_datasets(index: AbstractIndex) -> None:
+... for entry in index.list_datasets():
+... print(f"{entry.name}: {entry.data_urls}")
+...
+>>> # Works with either LocalIndex or AtmosphereIndex
+>>> process_datasets(local_index)
+>>> process_datasets(atmosphere_index)
 """

 from typing import (
 IO,
 Any,
-ClassVar,
 Iterator,
 Optional,
 Protocol,

@@ -67,18 +64,16 @@ class Packable(Protocol):
 - Schema publishing (class introspection via dataclass fields)
 - Serialization/deserialization (packed, from_bytes)

-
-
-
-
-
-
-
-
-
-
-... instance = sample_type.from_bytes(data)
-... print(instance.packed)
+Examples:
+>>> @packable
+... class MySample:
+... name: str
+... value: int
+...
+>>> def process(sample_type: Type[Packable]) -> None:
+... # Type checker knows sample_type has from_bytes, packed, etc.
+... instance = sample_type.from_bytes(data)
+... print(instance.packed)
 """

 @classmethod

@@ -169,21 +164,19 @@ class AbstractIndex(Protocol):
 - ``data_store``: An AbstractDataStore for reading/writing dataset shards.
 If present, ``load_dataset`` will use it for S3 credential resolution.

-
-
-
-
-
-
-
-
-
-
-
-
-
-... for entry in index.list_datasets():
-... print(f"{entry.name} -> {entry.schema_ref}")
+Examples:
+>>> def publish_and_list(index: AbstractIndex) -> None:
+... # Publish schemas for different types
+... schema1 = index.publish_schema(ImageSample, version="1.0.0")
+... schema2 = index.publish_schema(TextSample, version="1.0.0")
+...
+... # Insert datasets of different types
+... index.insert_dataset(image_ds, name="images")
+... index.insert_dataset(text_ds, name="texts")
+...
+... # List all datasets (mixed types)
+... for entry in index.list_datasets():
+... print(f"{entry.name} -> {entry.schema_ref}")
 """

 @property

@@ -341,14 +334,12 @@ class AbstractIndex(Protocol):
 KeyError: If schema not found.
 ValueError: If schema cannot be decoded (unsupported field types).

-
-
-
-
-
-
->>> for sample in ds.ordered():
-... print(sample) # sample is instance of SampleType
+Examples:
+>>> entry = index.get_dataset("my-dataset")
+>>> SampleType = index.decode_schema(entry.schema_ref)
+>>> ds = Dataset[SampleType](entry.data_urls[0])
+>>> for sample in ds.ordered():
+... print(sample) # sample is instance of SampleType
 """
 ...


@@ -368,13 +359,11 @@ class AbstractDataStore(Protocol):
 flexible deployment: local index with S3 storage, atmosphere index with
 S3 storage, or atmosphere index with PDS blobs.

-
-
-
-
-
->>> print(urls)
-['s3://my-bucket/training/v1/shard-000000.tar', ...]
+Examples:
+>>> store = S3DataStore(credentials, bucket="my-bucket")
+>>> urls = store.write_shards(dataset, prefix="training/v1")
+>>> print(urls)
+['s3://my-bucket/training/v1/shard-000000.tar', ...]
 """

 def write_shards(

@@ -443,18 +432,16 @@ class DataSource(Protocol):
 - ATProto blob streaming
 - Any other source that can provide file-like objects

-
-
-
-
-
-
-
-
-
-
->>> for sample in ds.ordered():
-... print(sample)
+Examples:
+>>> source = S3Source(
+... bucket="my-bucket",
+... keys=["data-000.tar", "data-001.tar"],
+... endpoint="https://r2.example.com",
+... credentials=creds,
+... )
+>>> ds = Dataset[MySample](source)
+>>> for sample in ds.ordered():
+... print(sample)
 """

 @property

@@ -467,12 +454,10 @@ class DataSource(Protocol):
 Yields:
 Tuple of (shard_identifier, file_like_stream).

-
-
-
-
-... print(f"Processing {shard_id}")
-... data = stream.read()
+Examples:
+>>> for shard_id, stream in source.shards:
+... print(f"Processing {shard_id}")
+... data = stream.read()
 """
 ...

atdata/_schema_codec.py
CHANGED
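The first hunk below shows the codec importing make_dataclass, which schema_to_type uses to build a sample class from a schema dict at runtime. A simplified sketch of that construction for primitive-only fields (the real codec also handles ndarray/array/ref field types and attaches PackableSample behavior, which this sketch omits):

from dataclasses import make_dataclass

schema = {
    "name": "TextSample",
    "version": "1.0.0",
    "fields": [
        {"name": "text", "fieldType": {"primitive": "str"}},
        {"name": "label", "fieldType": {"primitive": "int"}},
    ],
}

# Map schema primitive names onto Python types.
_PRIMITIVES = {"str": str, "int": int, "float": float, "bool": bool, "bytes": bytes}

TextSample = make_dataclass(
    schema["name"],
    [(f["name"], _PRIMITIVES[f["fieldType"]["primitive"]]) for f in schema["fields"]],
    namespace={"__schema_version__": schema["version"]},
)

sample = TextSample(text="hello", label=1)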
@@ -9,19 +9,17 @@ The schema format follows the ATProto record structure defined in
 ``atmosphere/_types.py``, with field types supporting primitives, ndarrays,
 arrays, and schema references.

-
-
-
-
-
-
-
-
-
-
-
->>> ImageSample = schema_to_type(schema)
->>> sample = ImageSample(image=np.zeros((64, 64)), label="cat")
+Examples:
+>>> schema = {
+... "name": "ImageSample",
+... "version": "1.0.0",
+... "fields": [
+... {"name": "image", "fieldType": {"$type": "...#ndarray", "dtype": "float32"}, "optional": False},
+... {"name": "label", "fieldType": {"$type": "...#primitive", "primitive": "str"}, "optional": False},
+... ]
+... }
+>>> ImageSample = schema_to_type(schema)
+>>> sample = ImageSample(image=np.zeros((64, 64)), label="cat")
 """

 from dataclasses import field, make_dataclass

@@ -151,14 +149,12 @@ def schema_to_type(
 Raises:
 ValueError: If schema is malformed or contains unsupported types.

-
-
-
-
-
-
->>> for sample in ds.ordered():
-... print(sample)
+Examples:
+>>> schema = index.get_schema("local://schemas/MySample@1.0.0")
+>>> MySample = schema_to_type(schema)
+>>> ds = Dataset[MySample]("data.tar")
+>>> for sample in ds.ordered():
+... print(sample)
 """
 # Check cache first
 if use_cache:

@@ -207,7 +203,9 @@ schema_to_type(
 namespace={
 "__post_init__": lambda self: PackableSample.__post_init__(self),
 "__schema_version__": version,
-"__schema_ref__": schema.get(
+"__schema_ref__": schema.get(
+"$ref", None
+), # Store original ref if available
 },
 )

@@ -243,7 +241,9 @@ def _field_type_to_stub_str(field_type: dict, optional: bool = False) -> str:

 if kind == "primitive":
 primitive = field_type.get("primitive", "str")
-py_type =
+py_type = (
+primitive # str, int, float, bool, bytes are all valid Python type names
+)
 elif kind == "ndarray":
 py_type = "NDArray[Any]"
 elif kind == "array":

@@ -282,14 +282,12 @@ def generate_stub(schema: dict) -> str:
 Returns:
 String content for a .pyi stub file.

-
-
-
-
-
-
->>> with open("stubs/my_sample.pyi", "w") as f:
-... f.write(stub_content)
+Examples:
+>>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
+>>> stub_content = generate_stub(schema.to_dict())
+>>> # Save to a stubs directory configured in your IDE
+>>> with open("stubs/my_sample.pyi", "w") as f:
+... f.write(stub_content)
 """
 name = schema.get("name", "UnknownSample")
 version = schema.get("version", "1.0.0")

@@ -360,12 +358,10 @@ def generate_module(schema: dict) -> str:
 Returns:
 String content for a .py module file.

-
-
-
-
->>> module_content = generate_module(schema.to_dict())
->>> # The module can be imported after being saved
+Examples:
+>>> schema = index.get_schema("atdata://local/sampleSchema/MySample@1.0.0")
+>>> module_content = generate_module(schema.to_dict())
+>>> # The module can be imported after being saved
 """
 name = schema.get("name", "UnknownSample")
 version = schema.get("version", "1.0.0")