atdata 0.2.2b1-py3-none-any.whl → 0.2.3b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/dataset.py CHANGED
@@ -13,18 +13,16 @@ The implementation handles automatic conversion between numpy arrays and bytes
  during serialization, enabling efficient storage of numerical data in WebDataset
  archives.

- Example:
- ::
-
- >>> @packable
- ... class ImageSample:
- ... image: NDArray
- ... label: str
- ...
- >>> ds = Dataset[ImageSample]("data-{000000..000009}.tar")
- >>> for batch in ds.shuffled(batch_size=32):
- ... images = batch.image # Stacked numpy array (32, H, W, C)
- ... labels = batch.label # List of 32 strings
+ Examples:
+ >>> @packable
+ ... class ImageSample:
+ ... image: NDArray
+ ... label: str
+ ...
+ >>> ds = Dataset[ImageSample]("data-{000000..000009}.tar")
+ >>> for batch in ds.shuffled(batch_size=32):
+ ... images = batch.image # Stacked numpy array (32, H, W, C)
+ ... labels = batch.label # List of 32 strings
  """

  ##
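Throughout this release, docstring examples move from reST ``Example:`` / ``::`` literal blocks to Google-style ``Examples:`` sections with ``>>>`` prompts; the same mechanical change repeats in most hunks below. A side effect is that standard doctest tooling can now discover the snippets. A minimal collection sketch; it only counts docstrings, since most snippets reference names (``ds``, ``packable``) that a real run would have to inject via ``extraglobs``:

```python
import doctest

import atdata.dataset as dataset_module

# DocTestFinder picks up any ">>>" prompts in docstrings, including the
# new "Examples:" sections; this checks collection, not execution.
tests = doctest.DocTestFinder().find(dataset_module)
print(f"{len(tests)} docstring(s) with runnable-looking examples")
```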
@@ -43,7 +41,7 @@ from dataclasses import (
  )
  from abc import ABC

- from ._sources import URLSource, S3Source
+ from ._sources import URLSource
  from ._protocols import DataSource

  from tqdm import tqdm
@@ -66,6 +64,7 @@ from typing import (
  TypeVar,
  TypeAlias,
  dataclass_transform,
+ overload,
  )
  from numpy.typing import NDArray

@@ -85,30 +84,31 @@ WDSRawSample: TypeAlias = Dict[str, Any]
  WDSRawBatch: TypeAlias = Dict[str, Any]

  SampleExportRow: TypeAlias = Dict[str, Any]
- SampleExportMap: TypeAlias = Callable[['PackableSample'], SampleExportRow]
+ SampleExportMap: TypeAlias = Callable[["PackableSample"], SampleExportRow]


  ##
  # Main base classes

- DT = TypeVar( 'DT' )
+ DT = TypeVar("DT")


- def _make_packable( x ):
+ def _make_packable(x):
  """Convert numpy arrays to bytes; pass through other values unchanged."""
- if isinstance( x, np.ndarray ):
- return eh.array_to_bytes( x )
+ if isinstance(x, np.ndarray):
+ return eh.array_to_bytes(x)
  return x


- def _is_possibly_ndarray_type( t ):
+ def _is_possibly_ndarray_type(t):
  """Return True if type annotation is NDArray or Optional[NDArray]."""
  if t == NDArray:
  return True
- if isinstance( t, types.UnionType ):
- return any( x == NDArray for x in t.__args__ )
+ if isinstance(t, types.UnionType):
+ return any(x == NDArray for x in t.__args__)
  return False

+
  class DictSample:
  """Dynamic sample type providing dict-like access to raw msgpack data.

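``_is_possibly_ndarray_type`` (reformatted above; logic unchanged) recognizes PEP 604 unions by checking for ``types.UnionType`` and scanning ``__args__``. A self-contained illustration of that runtime mechanism, using builtin types in place of the ``NDArray`` alias:

```python
import types

# A union written with "|" between runtime types is a types.UnionType,
# and its members are exposed via __args__ -- exactly what the helper
# above walks when looking for NDArray.
u = bytes | None
print(isinstance(u, types.UnionType))  # True
print(u.__args__)                      # (<class 'bytes'>, <class 'NoneType'>)
```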
@@ -126,24 +126,22 @@ class DictSample:
  ``@packable``-decorated class. Every ``@packable`` class automatically
  registers a lens from ``DictSample``, making this conversion seamless.

- Example:
- ::
-
- >>> ds = load_dataset("path/to/data.tar") # Returns Dataset[DictSample]
- >>> for sample in ds.ordered():
- ... print(sample.some_field) # Attribute access
- ... print(sample["other_field"]) # Dict access
- ... print(sample.keys()) # Inspect available fields
- ...
- >>> # Convert to typed schema
- >>> typed_ds = ds.as_type(MyTypedSample)
+ Examples:
+ >>> ds = load_dataset("path/to/data.tar") # Returns Dataset[DictSample]
+ >>> for sample in ds.ordered():
+ ... print(sample.some_field) # Attribute access
+ ... print(sample["other_field"]) # Dict access
+ ... print(sample.keys()) # Inspect available fields
+ ...
+ >>> # Convert to typed schema
+ >>> typed_ds = ds.as_type(MyTypedSample)

  Note:
  NDArray fields are stored as raw bytes in DictSample. They are only
  converted to numpy arrays when accessed through a typed sample class.
  """

- __slots__ = ('_data',)
+ __slots__ = ("_data",)

  def __init__(self, _data: dict[str, Any] | None = None, **kwargs: Any) -> None:
  """Create a DictSample from a dictionary or keyword arguments.
@@ -153,12 +151,12 @@ class DictSample:
  **kwargs: Field values if _data is not provided.
  """
  if _data is not None:
- object.__setattr__(self, '_data', _data)
+ object.__setattr__(self, "_data", _data)
  else:
- object.__setattr__(self, '_data', kwargs)
+ object.__setattr__(self, "_data", kwargs)

  @classmethod
- def from_data(cls, data: dict[str, Any]) -> 'DictSample':
+ def from_data(cls, data: dict[str, Any]) -> "DictSample":
  """Create a DictSample from unpacked msgpack data.

  Args:
@@ -170,7 +168,7 @@ class DictSample:
  return cls(_data=data)

  @classmethod
- def from_bytes(cls, bs: bytes) -> 'DictSample':
+ def from_bytes(cls, bs: bytes) -> "DictSample":
  """Create a DictSample from raw msgpack bytes.

  Args:
@@ -194,7 +192,7 @@ class DictSample:
  AttributeError: If the field doesn't exist.
  """
  # Avoid infinite recursion for _data lookup
- if name == '_data':
+ if name == "_data":
  raise AttributeError(name)
  try:
  return self._data[name]
@@ -260,24 +258,24 @@ class DictSample:
  return msgpack.packb(self._data)

  @property
- def as_wds(self) -> 'WDSRawSample':
+ def as_wds(self) -> "WDSRawSample":
  """Pack this sample's data for writing to WebDataset.

  Returns:
  A dictionary with ``__key__`` and ``msgpack`` fields.
  """
  return {
- '__key__': str(uuid.uuid1(0, 0)),
- 'msgpack': self.packed,
+ "__key__": str(uuid.uuid1(0, 0)),
+ "msgpack": self.packed,
  }

  def __repr__(self) -> str:
- fields = ', '.join(f'{k}=...' for k in self._data.keys())
- return f'DictSample({fields})'
+ fields = ", ".join(f"{k}=..." for k in self._data.keys())
+ return f"DictSample({fields})"


  @dataclass
- class PackableSample( ABC ):
+ class PackableSample(ABC):
  """Base class for samples that can be serialized with msgpack.

  This abstract base class provides automatic serialization/deserialization
@@ -289,54 +287,52 @@ class PackableSample( ABC ):
  1. Direct inheritance with the ``@dataclass`` decorator
  2. Using the ``@packable`` decorator (recommended)

- Example:
- ::
-
- >>> @packable
- ... class MyData:
- ... name: str
- ... embeddings: NDArray
- ...
- >>> sample = MyData(name="test", embeddings=np.array([1.0, 2.0]))
- >>> packed = sample.packed # Serialize to bytes
- >>> restored = MyData.from_bytes(packed) # Deserialize
+ Examples:
+ >>> @packable
+ ... class MyData:
+ ... name: str
+ ... embeddings: NDArray
+ ...
+ >>> sample = MyData(name="test", embeddings=np.array([1.0, 2.0]))
+ >>> packed = sample.packed # Serialize to bytes
+ >>> restored = MyData.from_bytes(packed) # Deserialize
  """

- def _ensure_good( self ):
+ def _ensure_good(self):
  """Convert bytes to NDArray for fields annotated as NDArray or NDArray | None."""

  # Auto-convert known types when annotated
  # for var_name, var_type in vars( self.__class__ )['__annotations__'].items():
- for field in dataclasses.fields( self ):
+ for field in dataclasses.fields(self):
  var_name = field.name
  var_type = field.type

  # Annotation for this variable is to be an NDArray
- if _is_possibly_ndarray_type( var_type ):
+ if _is_possibly_ndarray_type(var_type):
  # ... so, we'll always auto-convert to numpy

- var_cur_value = getattr( self, var_name )
+ var_cur_value = getattr(self, var_name)

  # Execute the appropriate conversion for intermediate data
  # based on what is provided

- if isinstance( var_cur_value, np.ndarray ):
+ if isinstance(var_cur_value, np.ndarray):
  # Already the correct type, no conversion needed
  continue

- elif isinstance( var_cur_value, bytes ):
+ elif isinstance(var_cur_value, bytes):
  # Design note: bytes in NDArray-typed fields are always interpreted
  # as serialized arrays. This means raw bytes fields must not be
  # annotated as NDArray.
- setattr( self, var_name, eh.bytes_to_array( var_cur_value ) )
+ setattr(self, var_name, eh.bytes_to_array(var_cur_value))

- def __post_init__( self ):
+ def __post_init__(self):
  self._ensure_good()

  ##

  @classmethod
- def from_data( cls, data: WDSRawSample ) -> Self:
+ def from_data(cls, data: WDSRawSample) -> Self:
  """Create a sample instance from unpacked msgpack data.

  Args:
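``_ensure_good`` is the heart of the ``PackableSample`` contract: any field annotated ``NDArray`` (or ``NDArray | None``) may arrive as serialized bytes and is decoded back to an array in ``__post_init__``. A round-trip sketch of that contract, assuming the package is installed; the import path is the module shown in this diff:

```python
import numpy as np
from numpy.typing import NDArray

from atdata.dataset import packable  # module path as in this diff

@packable
class Reading:
    tag: str
    values: NDArray

r = Reading(tag="t0", values=np.arange(4, dtype=np.float32))
packed = r.packed                      # msgpack bytes; array serialized inline
restored = Reading.from_bytes(packed)  # bytes auto-decoded back to an ndarray
assert np.array_equal(r.values, restored.values)
```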
@@ -345,10 +341,10 @@ class PackableSample( ABC ):
  Returns:
  New instance with NDArray fields auto-converted from bytes.
  """
- return cls( **data )
-
+ return cls(**data)
+
  @classmethod
- def from_bytes( cls, bs: bytes ) -> Self:
+ def from_bytes(cls, bs: bytes) -> Self:
  """Create a sample instance from raw msgpack bytes.

  Args:
@@ -357,10 +353,10 @@ class PackableSample( ABC ):
  Returns:
  A new instance of this sample class deserialized from the bytes.
  """
- return cls.from_data( ormsgpack.unpackb( bs ) )
+ return cls.from_data(ormsgpack.unpackb(bs))

  @property
- def packed( self ) -> bytes:
+ def packed(self) -> bytes:
  """Pack this sample's data into msgpack bytes.

  NDArray fields are automatically converted to bytes before packing.
@@ -375,20 +371,17 @@ class PackableSample( ABC ):

  # Make sure that all of our (possibly unpackable) data is in a packable
  # format
- o = {
- k: _make_packable( v )
- for k, v in vars( self ).items()
- }
+ o = {k: _make_packable(v) for k, v in vars(self).items()}

- ret = msgpack.packb( o )
+ ret = msgpack.packb(o)

  if ret is None:
- raise RuntimeError( f'Failed to pack sample to bytes: {o}' )
+ raise RuntimeError(f"Failed to pack sample to bytes: {o}")

  return ret
-
+
  @property
- def as_wds( self ) -> WDSRawSample:
+ def as_wds(self) -> WDSRawSample:
  """Pack this sample's data for writing to WebDataset.

  Returns:
@@ -401,19 +394,21 @@ class PackableSample( ABC ):
  """
  return {
  # Generates a UUID that is timelike-sortable
- '__key__': str( uuid.uuid1( 0, 0 ) ),
- 'msgpack': self.packed,
+ "__key__": str(uuid.uuid1(0, 0)),
+ "msgpack": self.packed,
  }

- def _batch_aggregate( xs: Sequence ):
+
+ def _batch_aggregate(xs: Sequence):
  """Stack arrays into numpy array with batch dim; otherwise return list."""
  if not xs:
  return []
- if isinstance( xs[0], np.ndarray ):
- return np.array( list( xs ) )
- return list( xs )
+ if isinstance(xs[0], np.ndarray):
+ return np.array(list(xs))
+ return list(xs)
+

- class SampleBatch( Generic[DT] ):
+ class SampleBatch(Generic[DT]):
  """A batch of samples with automatic attribute aggregation.

  This class wraps a sequence of samples and provides magic ``__getattr__``
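``_batch_aggregate`` is small enough to exercise in isolation; its rule is the one ``SampleBatch`` attribute access relies on: arrays gain a leading batch dimension, everything else passes through as a list. The function body here is copied verbatim from the hunk above:

```python
import numpy as np

def _batch_aggregate(xs):
    # Verbatim logic from the diff: stack arrays, pass other values through.
    if not xs:
        return []
    if isinstance(xs[0], np.ndarray):
        return np.array(list(xs))
    return list(xs)

print(_batch_aggregate([np.zeros((2, 3)), np.ones((2, 3))]).shape)  # (2, 2, 3)
print(_batch_aggregate(["a", "b"]))                                 # ['a', 'b']
```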
@@ -430,12 +425,10 @@ class SampleBatch( Generic[DT] ):
  Attributes:
  samples: The list of sample instances in this batch.

- Example:
- ::
-
- >>> batch = SampleBatch[MyData]([sample1, sample2, sample3])
- >>> batch.embeddings # Returns stacked numpy array of shape (3, ...)
- >>> batch.names # Returns list of names
+ Examples:
+ >>> batch = SampleBatch[MyData]([sample1, sample2, sample3])
+ >>> batch.embeddings # Returns stacked numpy array of shape (3, ...)
+ >>> batch.names # Returns list of names

  Note:
  This class uses Python's ``__orig_class__`` mechanism to extract the
@@ -443,10 +436,11 @@ class SampleBatch( Generic[DT] ):
  subscripted syntax ``SampleBatch[MyType](samples)`` rather than
  calling the constructor directly with an unsubscripted class.
  """
+
  # Design note: The docstring uses "Parameters:" for type parameters because
  # quartodoc doesn't yet support "Type Parameters:" sections in generated docs.

- def __init__( self, samples: Sequence[DT] ):
+ def __init__(self, samples: Sequence[DT]):
  """Create a batch from a sequence of samples.

  Args:
@@ -454,23 +448,23 @@ class SampleBatch( Generic[DT] ):
  Each sample must be an instance of a type derived from
  ``PackableSample``.
  """
- self.samples = list( samples )
+ self.samples = list(samples)
  self._aggregate_cache = dict()
  self._sample_type_cache: Type | None = None

  @property
- def sample_type( self ) -> Type:
+ def sample_type(self) -> Type:
  """The type of each sample in this batch.

  Returns:
  The type parameter ``DT`` used when creating this ``SampleBatch[DT]``.
  """
  if self._sample_type_cache is None:
- self._sample_type_cache = typing.get_args( self.__orig_class__)[0]
+ self._sample_type_cache = typing.get_args(self.__orig_class__)[0]
  assert self._sample_type_cache is not None
  return self._sample_type_cache

- def __getattr__( self, name ):
+ def __getattr__(self, name):
  """Aggregate an attribute across all samples in the batch.

  This magic method enables attribute-style access to aggregated sample
@@ -487,20 +481,19 @@ class SampleBatch( Generic[DT] ):
  AttributeError: If the attribute doesn't exist on the sample type.
  """
  # Aggregate named params of sample type
- if name in vars( self.sample_type )['__annotations__']:
+ if name in vars(self.sample_type)["__annotations__"]:
  if name not in self._aggregate_cache:
  self._aggregate_cache[name] = _batch_aggregate(
- [ getattr( x, name )
- for x in self.samples ]
+ [getattr(x, name) for x in self.samples]
  )

  return self._aggregate_cache[name]

- raise AttributeError( f'No sample attribute named {name}' )
+ raise AttributeError(f"No sample attribute named {name}")


- ST = TypeVar( 'ST', bound = PackableSample )
- RT = TypeVar( 'RT', bound = PackableSample )
+ ST = TypeVar("ST", bound=PackableSample)
+ RT = TypeVar("RT", bound=PackableSample)


  class _ShardListStage(wds.utils.PipelineStage):
@@ -538,7 +531,7 @@ class _StreamOpenerStage(wds.utils.PipelineStage):
  yield sample


- class Dataset( Generic[ST] ):
+ class Dataset(Generic[ST]):
  """A typed dataset built on WebDataset with lens transformations.

  This class wraps WebDataset tar archives and provides type-safe iteration
@@ -557,16 +550,14 @@ class Dataset( Generic[ST] ):
  Attributes:
  url: WebDataset brace-notation URL for the tar file(s).

- Example:
- ::
-
- >>> ds = Dataset[MyData]("path/to/data-{000000..000009}.tar")
- >>> for sample in ds.ordered(batch_size=32):
- ... # sample is SampleBatch[MyData] with batch_size samples
- ... embeddings = sample.embeddings # shape: (32, ...)
- ...
- >>> # Transform to a different view
- >>> ds_view = ds.as_type(MyDataView)
+ Examples:
+ >>> ds = Dataset[MyData]("path/to/data-{000000..000009}.tar")
+ >>> for sample in ds.ordered(batch_size=32):
+ ... # sample is SampleBatch[MyData] with batch_size samples
+ ... embeddings = sample.embeddings # shape: (32, ...)
+ ...
+ >>> # Transform to a different view
+ >>> ds_view = ds.as_type(MyDataView)

  Note:
  This class uses Python's ``__orig_class__`` mechanism to extract the
@@ -574,22 +565,24 @@ class Dataset( Generic[ST] ):
  subscripted syntax ``Dataset[MyType](url)`` rather than calling the
  constructor directly with an unsubscripted class.
  """
+
  # Design note: The docstring uses "Parameters:" for type parameters because
  # quartodoc doesn't yet support "Type Parameters:" sections in generated docs.

  @property
- def sample_type( self ) -> Type:
+ def sample_type(self) -> Type:
  """The type of each returned sample from this dataset's iterator.

  Returns:
  The type parameter ``ST`` used when creating this ``Dataset[ST]``.
  """
  if self._sample_type_cache is None:
- self._sample_type_cache = typing.get_args( self.__orig_class__ )[0]
+ self._sample_type_cache = typing.get_args(self.__orig_class__)[0]
  assert self._sample_type_cache is not None
  return self._sample_type_cache
+
  @property
- def batch_type( self ) -> Type:
+ def batch_type(self) -> Type:
  """The type of batches produced by this dataset.

  Returns:
@@ -597,12 +590,13 @@ class Dataset( Generic[ST] ):
  """
  return SampleBatch[self.sample_type]

- def __init__( self,
- source: DataSource | str | None = None,
- metadata_url: str | None = None,
- *,
- url: str | None = None,
- ) -> None:
+ def __init__(
+ self,
+ source: DataSource | str | None = None,
+ metadata_url: str | None = None,
+ *,
+ url: str | None = None,
+ ) -> None:
  """Create a dataset from a DataSource or URL.

  Args:
@@ -650,7 +644,7 @@ class Dataset( Generic[ST] ):
  """The underlying data source for this dataset."""
  return self._source

- def as_type( self, other: Type[RT] ) -> 'Dataset[RT]':
+ def as_type(self, other: Type[RT]) -> "Dataset[RT]":
  """View this dataset through a different sample type using a registered lens.

  Args:
@@ -666,10 +660,10 @@ class Dataset( Generic[ST] ):
  ValueError: If no registered lens exists between the current
  sample type and the target type.
  """
- ret = Dataset[other]( self._source )
+ ret = Dataset[other](self._source)
  # Get the singleton lens registry
  lenses = LensNetwork()
- ret._output_lens = lenses.transform( self.sample_type, ret.sample_type )
+ ret._output_lens = lenses.transform(self.sample_type, ret.sample_type)
  return ret

  @property
@@ -679,11 +673,9 @@ class Dataset( Generic[ST] ):
  Yields:
  Shard identifiers (e.g., 'train-000000.tar', 'train-000001.tar').

- Example:
- ::
-
- >>> for shard in ds.shards:
- ... print(f"Processing {shard}")
+ Examples:
+ >>> for shard in ds.shards:
+ ... print(f"Processing {shard}")
  """
  return iter(self._source.list_shards())

@@ -705,6 +697,7 @@ class Dataset( Generic[ST] ):
  Use :meth:`list_shards` instead.
  """
  import warnings
+
  warnings.warn(
  "shard_list is deprecated, use list_shards() instead",
  DeprecationWarning,
@@ -713,7 +706,7 @@ class Dataset( Generic[ST] ):
  return self.list_shards()

  @property
- def metadata( self ) -> dict[str, Any] | None:
+ def metadata(self) -> dict[str, Any] | None:
  """Fetch and cache metadata from metadata_url.

  Returns:
@@ -726,27 +719,47 @@ class Dataset( Generic[ST] ):
  return None

  if self._metadata is None:
- with requests.get( self.metadata_url, stream = True ) as response:
+ with requests.get(self.metadata_url, stream=True) as response:
  response.raise_for_status()
- self._metadata = msgpack.unpackb( response.content, raw = False )
-
+ self._metadata = msgpack.unpackb(response.content, raw=False)
+
  # Use our cached values
  return self._metadata
-
- def ordered( self,
- batch_size: int | None = None,
- ) -> Iterable[ST]:
- """Iterate over the dataset in order
+
+ @overload
+ def ordered(
+ self,
+ batch_size: None = None,
+ ) -> Iterable[ST]: ...
+
+ @overload
+ def ordered(
+ self,
+ batch_size: int,
+ ) -> Iterable[SampleBatch[ST]]: ...
+
+ def ordered(
+ self,
+ batch_size: int | None = None,
+ ) -> Iterable[ST] | Iterable[SampleBatch[ST]]:
+ """Iterate over the dataset in order.

  Args:
- batch_size (:obj:`int`, optional): The size of iterated batches.
- Default: None (unbatched). If ``None``, iterates over one
- sample at a time with no batch dimension.
+ batch_size: The size of iterated batches. Default: None (unbatched).
+ If ``None``, iterates over one sample at a time with no batch
+ dimension.

  Returns:
- :obj:`webdataset.DataPipeline` A data pipeline that iterates over
- the dataset in its original sample order
+ A data pipeline that iterates over the dataset in its original
+ sample order. When ``batch_size`` is ``None``, yields individual
+ samples of type ``ST``. When ``batch_size`` is an integer, yields
+ ``SampleBatch[ST]`` instances containing that many samples.
+
+ Examples:
+ >>> for sample in ds.ordered():
+ ... process(sample) # sample is ST
+ >>> for batch in ds.ordered(batch_size=32):
+ ... process(batch) # batch is SampleBatch[ST]
  """
  if batch_size is None:
  return wds.pipeline.DataPipeline(
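The ``@overload`` pair added in this hunk is the main typing improvement of the release: the return type of ``ordered`` now depends on whether ``batch_size`` is passed. A sketch of what a type checker infers under these overloads (``MyData`` mirrors the docstring examples and is defined here only for illustration):

```python
from typing import reveal_type  # Python 3.11+; typing_extensions before that

from numpy.typing import NDArray

from atdata.dataset import Dataset, packable

@packable
class MyData:
    name: str
    embeddings: NDArray

ds = Dataset[MyData]("data-{000000..000009}.tar")
reveal_type(ds.ordered())              # Iterable[MyData]
reveal_type(ds.ordered(batch_size=8))  # Iterable[SampleBatch[MyData]]
```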
@@ -755,7 +768,7 @@ class Dataset( Generic[ST] ):
  _StreamOpenerStage(self._source),
  wds.tariterators.tar_file_expander,
  wds.tariterators.group_by_keys,
- wds.filters.map( self.wrap ),
+ wds.filters.map(self.wrap),
  )

  return wds.pipeline.DataPipeline(
@@ -764,15 +777,33 @@ class Dataset( Generic[ST] ):
  _StreamOpenerStage(self._source),
  wds.tariterators.tar_file_expander,
  wds.tariterators.group_by_keys,
- wds.filters.batched( batch_size ),
- wds.filters.map( self.wrap_batch ),
+ wds.filters.batched(batch_size),
+ wds.filters.map(self.wrap_batch),
  )

- def shuffled( self,
- buffer_shards: int = 100,
- buffer_samples: int = 10_000,
- batch_size: int | None = None,
- ) -> Iterable[ST]:
+ @overload
+ def shuffled(
+ self,
+ buffer_shards: int = 100,
+ buffer_samples: int = 10_000,
+ batch_size: None = None,
+ ) -> Iterable[ST]: ...
+
+ @overload
+ def shuffled(
+ self,
+ buffer_shards: int = 100,
+ buffer_samples: int = 10_000,
+ *,
+ batch_size: int,
+ ) -> Iterable[SampleBatch[ST]]: ...
+
+ def shuffled(
+ self,
+ buffer_shards: int = 100,
+ buffer_samples: int = 10_000,
+ batch_size: int | None = None,
+ ) -> Iterable[ST] | Iterable[SampleBatch[ST]]:
  """Iterate over the dataset in random order.

  Args:
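Note one asymmetry with ``ordered``: the batched ``shuffled`` overload declares ``batch_size`` keyword-only (the bare ``*``), so only the keyword form selects the ``SampleBatch`` overload during type checking, even though the implementation still accepts a positional value at runtime. Continuing the sketch above:

```python
batches = ds.shuffled(batch_size=32)     # Iterable[SampleBatch[MyData]]
samples = ds.shuffled(buffer_shards=50)  # Iterable[MyData]
ds.shuffled(100, 10_000, 32)  # runs, but matches neither overload statically
```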
@@ -787,42 +818,50 @@ class Dataset( Generic[ST] ):
  dimension.

  Returns:
- A WebDataset data pipeline that iterates over the dataset in
- randomized order. If ``batch_size`` is not ``None``, yields
- ``SampleBatch[ST]`` instances; otherwise yields individual ``ST``
- samples.
+ A data pipeline that iterates over the dataset in randomized order.
+ When ``batch_size`` is ``None``, yields individual samples of type
+ ``ST``. When ``batch_size`` is an integer, yields ``SampleBatch[ST]``
+ instances containing that many samples.
+
+ Examples:
+ >>> for sample in ds.shuffled():
+ ... process(sample) # sample is ST
+ >>> for batch in ds.shuffled(batch_size=32):
+ ... process(batch) # batch is SampleBatch[ST]
  """
  if batch_size is None:
  return wds.pipeline.DataPipeline(
  _ShardListStage(self._source),
- wds.filters.shuffle( buffer_shards ),
+ wds.filters.shuffle(buffer_shards),
  wds.shardlists.split_by_worker,
  _StreamOpenerStage(self._source),
  wds.tariterators.tar_file_expander,
  wds.tariterators.group_by_keys,
- wds.filters.shuffle( buffer_samples ),
- wds.filters.map( self.wrap ),
+ wds.filters.shuffle(buffer_samples),
+ wds.filters.map(self.wrap),
  )

  return wds.pipeline.DataPipeline(
  _ShardListStage(self._source),
- wds.filters.shuffle( buffer_shards ),
+ wds.filters.shuffle(buffer_shards),
  wds.shardlists.split_by_worker,
  _StreamOpenerStage(self._source),
  wds.tariterators.tar_file_expander,
  wds.tariterators.group_by_keys,
- wds.filters.shuffle( buffer_samples ),
- wds.filters.batched( batch_size ),
- wds.filters.map( self.wrap_batch ),
+ wds.filters.shuffle(buffer_samples),
+ wds.filters.batched(batch_size),
+ wds.filters.map(self.wrap_batch),
  )
-
+
  # Design note: Uses pandas for parquet export. Could be replaced with
  # direct fastparquet calls to reduce dependencies if needed.
- def to_parquet( self, path: Pathlike,
- sample_map: Optional[SampleExportMap] = None,
- maxcount: Optional[int] = None,
- **kwargs,
- ):
+ def to_parquet(
+ self,
+ path: Pathlike,
+ sample_map: Optional[SampleExportMap] = None,
+ maxcount: Optional[int] = None,
+ **kwargs,
+ ):
  """Export dataset contents to parquet format.

  Converts all samples to a pandas DataFrame and saves to parquet file(s).
@@ -851,63 +890,62 @@ class Dataset( Generic[ST] ):
  This creates multiple parquet files: ``output-000000.parquet``,
  ``output-000001.parquet``, etc.

- Example:
- ::
-
- >>> ds = Dataset[MySample]("data.tar")
- >>> # Small dataset - load all at once
- >>> ds.to_parquet("output.parquet")
- >>>
- >>> # Large dataset - process in chunks
- >>> ds.to_parquet("output.parquet", maxcount=50000)
+ Examples:
+ >>> ds = Dataset[MySample]("data.tar")
+ >>> # Small dataset - load all at once
+ >>> ds.to_parquet("output.parquet")
+ >>>
+ >>> # Large dataset - process in chunks
+ >>> ds.to_parquet("output.parquet", maxcount=50000)
  """
  ##

  # Normalize args
- path = Path( path )
+ path = Path(path)
  if sample_map is None:
  sample_map = asdict
-
- verbose = kwargs.get( 'verbose', False )

- it = self.ordered( batch_size = None )
+ verbose = kwargs.get("verbose", False)
+
+ it = self.ordered(batch_size=None)
  if verbose:
- it = tqdm( it )
+ it = tqdm(it)

  #

  if maxcount is None:
  # Load and save full dataset
- df = pd.DataFrame( [ sample_map( x )
- for x in self.ordered( batch_size = None ) ] )
- df.to_parquet( path, **kwargs )
-
+ df = pd.DataFrame([sample_map(x) for x in self.ordered(batch_size=None)])
+ df.to_parquet(path, **kwargs)
+
  else:
  # Load and save dataset in segments of size `maxcount`

  cur_segment = 0
  cur_buffer = []
- path_template = (path.parent / f'{path.stem}-{{:06d}}{path.suffix}').as_posix()
+ path_template = (
+ path.parent / f"{path.stem}-{{:06d}}{path.suffix}"
+ ).as_posix()

- for x in self.ordered( batch_size = None ):
- cur_buffer.append( sample_map( x ) )
+ for x in self.ordered(batch_size=None):
+ cur_buffer.append(sample_map(x))

- if len( cur_buffer ) >= maxcount:
+ if len(cur_buffer) >= maxcount:
  # Write current segment
- cur_path = path_template.format( cur_segment )
- df = pd.DataFrame( cur_buffer )
- df.to_parquet( cur_path, **kwargs )
+ cur_path = path_template.format(cur_segment)
+ df = pd.DataFrame(cur_buffer)
+ df.to_parquet(cur_path, **kwargs)

  cur_segment += 1
  cur_buffer = []
-
- if len( cur_buffer ) > 0:
+
+ if len(cur_buffer) > 0:
  # Write one last segment with remainder
- cur_path = path_template.format( cur_segment )
- df = pd.DataFrame( cur_buffer )
- df.to_parquet( cur_path, **kwargs )
+ cur_path = path_template.format(cur_segment)
+ df = pd.DataFrame(cur_buffer)
+ df.to_parquet(cur_path, **kwargs)

- def wrap( self, sample: WDSRawSample ) -> ST:
+ def wrap(self, sample: WDSRawSample) -> ST:
  """Wrap a raw msgpack sample into the appropriate dataset-specific type.

  Args:
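The segmented branch of ``to_parquet`` derives its shard names from a format template. Isolated, the naming logic introduced above behaves like this:

```python
from pathlib import Path

# The template logic from to_parquet, extracted for illustration:
path = Path("exports/output.parquet")
path_template = (path.parent / f"{path.stem}-{{:06d}}{path.suffix}").as_posix()
print(path_template.format(0))  # exports/output-000000.parquet
print(path_template.format(1))  # exports/output-000001.parquet
```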
@@ -918,18 +956,22 @@ class Dataset( Generic[ST] ):
  A deserialized sample of type ``ST``, optionally transformed through
  a lens if ``as_type()`` was called.
  """
- if 'msgpack' not in sample:
- raise ValueError(f"Sample missing 'msgpack' key, got keys: {list(sample.keys())}")
- if not isinstance(sample['msgpack'], bytes):
- raise ValueError(f"Expected sample['msgpack'] to be bytes, got {type(sample['msgpack']).__name__}")
+ if "msgpack" not in sample:
+ raise ValueError(
+ f"Sample missing 'msgpack' key, got keys: {list(sample.keys())}"
+ )
+ if not isinstance(sample["msgpack"], bytes):
+ raise ValueError(
+ f"Expected sample['msgpack'] to be bytes, got {type(sample['msgpack']).__name__}"
+ )

  if self._output_lens is None:
- return self.sample_type.from_bytes( sample['msgpack'] )
+ return self.sample_type.from_bytes(sample["msgpack"])

- source_sample = self._output_lens.source_type.from_bytes( sample['msgpack'] )
- return self._output_lens( source_sample )
+ source_sample = self._output_lens.source_type.from_bytes(sample["msgpack"])
+ return self._output_lens(source_sample)

- def wrap_batch( self, batch: WDSRawBatch ) -> SampleBatch[ST]:
+ def wrap_batch(self, batch: WDSRawBatch) -> SampleBatch[ST]:
  """Wrap a batch of raw msgpack samples into a typed SampleBatch.

  Args:
@@ -945,26 +987,29 @@ class Dataset( Generic[ST] ):
  aggregates them into a batch.
  """

- if 'msgpack' not in batch:
- raise ValueError(f"Batch missing 'msgpack' key, got keys: {list(batch.keys())}")
+ if "msgpack" not in batch:
+ raise ValueError(
+ f"Batch missing 'msgpack' key, got keys: {list(batch.keys())}"
+ )

  if self._output_lens is None:
- batch_unpacked = [ self.sample_type.from_bytes( bs )
- for bs in batch['msgpack'] ]
- return SampleBatch[self.sample_type]( batch_unpacked )
+ batch_unpacked = [
+ self.sample_type.from_bytes(bs) for bs in batch["msgpack"]
+ ]
+ return SampleBatch[self.sample_type](batch_unpacked)

- batch_source = [ self._output_lens.source_type.from_bytes( bs )
- for bs in batch['msgpack'] ]
- batch_view = [ self._output_lens( s )
- for s in batch_source ]
- return SampleBatch[self.sample_type]( batch_view )
+ batch_source = [
+ self._output_lens.source_type.from_bytes(bs) for bs in batch["msgpack"]
+ ]
+ batch_view = [self._output_lens(s) for s in batch_source]
+ return SampleBatch[self.sample_type](batch_view)


- _T = TypeVar('_T')
+ _T = TypeVar("_T")


  @dataclass_transform()
- def packable( cls: type[_T] ) -> type[_T]:
+ def packable(cls: type[_T]) -> type[_T]:
  """Decorator to convert a regular class into a ``PackableSample``.

  This decorator transforms a class into a dataclass that inherits from
@@ -984,19 +1029,17 @@ def packable( cls: type[_T] ) -> type[_T]:
  ``Packable`` protocol and can be used with ``Type[Packable]`` signatures.

  Examples:
- This is a test of the functionality::
-
- @packable
- class MyData:
- name: str
- values: NDArray
-
- sample = MyData(name="test", values=np.array([1, 2, 3]))
- bytes_data = sample.packed
- restored = MyData.from_bytes(bytes_data)
-
- # Works with Packable-typed APIs
- index.publish_schema(MyData, version="1.0.0") # Type-safe
+ >>> @packable
+ ... class MyData:
+ ... name: str
+ ... values: NDArray
+ ...
+ >>> sample = MyData(name="test", values=np.array([1, 2, 3]))
+ >>> bytes_data = sample.packed
+ >>> restored = MyData.from_bytes(bytes_data)
+ >>>
+ >>> # Works with Packable-typed APIs
+ >>> index.publish_schema(MyData, version="1.0.0") # Type-safe
  """

  ##
@@ -1005,14 +1048,14 @@ def packable( cls: type[_T] ) -> type[_T]:
  class_annotations = cls.__annotations__

  # Add in dataclass niceness to original class
- as_dataclass = dataclass( cls )
+ as_dataclass = dataclass(cls)

  # This triggers a bunch of behind-the-scenes stuff for the newly annotated class
  @dataclass
- class as_packable( as_dataclass, PackableSample ):
- def __post_init__( self ):
- return PackableSample.__post_init__( self )
-
+ class as_packable(as_dataclass, PackableSample):
+ def __post_init__(self):
+ return PackableSample.__post_init__(self)
+
  # Restore original class identity for better repr/debugging
  as_packable.__name__ = class_name
  as_packable.__qualname__ = class_name
@@ -1023,10 +1066,10 @@ def packable( cls: type[_T] ) -> type[_T]:

  # Fix qualnames of dataclass-generated methods so they don't show
  # 'packable.<locals>.as_packable' in help() and IDE hints
- old_qualname_prefix = 'packable.<locals>.as_packable'
- for attr_name in ('__init__', '__repr__', '__eq__', '__post_init__'):
+ old_qualname_prefix = "packable.<locals>.as_packable"
+ for attr_name in ("__init__", "__repr__", "__eq__", "__post_init__"):
  attr = getattr(as_packable, attr_name, None)
- if attr is not None and hasattr(attr, '__qualname__'):
+ if attr is not None and hasattr(attr, "__qualname__"):
  if attr.__qualname__.startswith(old_qualname_prefix):
  attr.__qualname__ = attr.__qualname__.replace(
  old_qualname_prefix, class_name, 1
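The qualname repair above addresses a general Python wrinkle rather than anything atdata-specific: methods generated on a class defined inside a function carry the factory's ``<locals>`` prefix. A standalone illustration of the same technique:

```python
def make():
    class Inner:
        def ping(self):
            return "pong"
    return Inner

C = make()
print(C.ping.__qualname__)  # make.<locals>.Inner.ping

# The same string surgery packable() performs on dataclass-generated methods:
C.ping.__qualname__ = C.ping.__qualname__.replace("make.<locals>.Inner", "C", 1)
print(C.ping.__qualname__)  # C.ping
```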
@@ -1042,4 +1085,4 @@ def packable( cls: type[_T] ) -> type[_T]:

  ##

- return as_packable
+ return as_packable