PyPI - atdata - Versions diffs - 0.1.3b3__py3-none-any.whl → 0.2.0a1__py3-none-any.whl - Mend

atdata 0.1.3b3__py3-none-any.whl → 0.2.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

atdata/__init__.py +39 -1
atdata/_helpers.py +39 -3
atdata/atmosphere/__init__.py +61 -0
atdata/atmosphere/_types.py +329 -0
atdata/atmosphere/client.py +393 -0
atdata/atmosphere/lens.py +280 -0
atdata/atmosphere/records.py +342 -0
atdata/atmosphere/schema.py +296 -0
atdata/dataset.py +336 -203
atdata/lens.py +177 -77
atdata/local.py +492 -0
atdata-0.2.0a1.dist-info/METADATA +181 -0
atdata-0.2.0a1.dist-info/RECORD +16 -0
{atdata-0.1.3b3.dist-info → atdata-0.2.0a1.dist-info}/WHEEL +1 -1
atdata-0.1.3b3.dist-info/METADATA +0 -18
atdata-0.1.3b3.dist-info/RECORD +0 -9
{atdata-0.1.3b3.dist-info → atdata-0.2.0a1.dist-info}/entry_points.txt +0 -0
{atdata-0.1.3b3.dist-info → atdata-0.2.0a1.dist-info}/licenses/LICENSE +0 -0

atdata/dataset.py CHANGED

@@ -1,4 +1,29 @@
-"""Schematized WebDatasets"""
+"""Core dataset and sample infrastructure for typed WebDatasets.
+This module provides the core components for working with typed, msgpack-serialized
+samples in WebDataset format:
+- ``PackableSample``: Base class for msgpack-serializable samples with automatic
+  NDArray handling
+- ``SampleBatch``: Automatic batching with attribute aggregation
+- ``Dataset``: Generic typed dataset wrapper for WebDataset tar files
+- ``@packable``: Decorator to convert regular classes into PackableSample subclasses
+The implementation handles automatic conversion between numpy arrays and bytes
+during serialization, enabling efficient storage of numerical data in WebDataset
+archives.
+Example:
+    >>> @packable
+    ... class ImageSample:
+    ...     image: NDArray
+    ...     label: str
+    ...
+    >>> ds = Dataset[ImageSample]("data-{000000..000009}.tar")
+    >>> for batch in ds.shuffled(batch_size=32):
+    ...     images = batch.image  # Stacked numpy array (32, H, W, C)
+    ...     labels = batch.label  # List of 32 strings
+"""
 ##
 # Imports
@@ -7,7 +32,6 @@ import webdataset as wds
 from pathlib import Path
 import uuid
-import functools
 import dataclasses
 import types
@@ -15,14 +39,12 @@ from dataclasses import (
     dataclass,
     asdict,
 )
-from abc import (
-    ABC,
-    abstractmethod,
-)
+from abc import ABC
 from tqdm import tqdm
 import numpy as np
 import pandas as pd
+import requests
 import typing
 from typing import (
@@ -40,15 +62,7 @@ from typing import (
     TypeVar,
     TypeAlias,
 )
-# from typing_inspect import get_bound, get_parameters
-from numpy.typing import (
-    NDArray,
-    ArrayLike,
-)
-#
-# import ekumen.atmosphere as eat
+from numpy.typing import NDArray
 import msgpack
 import ormsgpack
@@ -71,50 +85,35 @@ SampleExportMap: TypeAlias = Callable[['PackableSample'], SampleExportRow]
 ##
 # Main base classes
-# TODO Check for best way to ensure this typevar is used as a dataclass type
-# DT = TypeVar( 'DT', bound = dataclass.__class__ )
 DT = TypeVar( 'DT' )
 MsgpackRawSample: TypeAlias = Dict[str, Any]
-# @dataclass
-# class ArrayBytes:
-#     """Annotates bytes that should be interpreted as the raw contents of a
-#     numpy NDArray"""
-#     raw_bytes: bytes
-#     """The raw bytes of the corresponding NDArray"""
-#     def __init__( self,
-#             array: Optional[ArrayLike] = None,
-#             raw: Optional[bytes] = None,
-#         ):
-#         """TODO"""
-#         if array is not None:
-#             array = np.array( array )
-#             self.raw_bytes = eh.array_to_bytes( array )
-#         elif raw is not None:
-#             self.raw_bytes = raw
-#         else:
-#             raise ValueError( 'Must provide either `array` or `raw` bytes' )
-#     @property
-#     def to_numpy( self ) -> NDArray:
-#         """Return the `raw_bytes` data as an NDArray"""
-#         return eh.bytes_to_array( self.raw_bytes )
 def _make_packable( x ):
-    # if isinstance( x, ArrayBytes ):
-    #     return x.raw_bytes
+    """Convert a value to a msgpack-compatible format.
+    Args:
+        x: A value to convert. If it's a numpy array, converts to bytes.
+            Otherwise returns the value unchanged.
+    Returns:
+        The value in a format suitable for msgpack serialization.
+    """
     if isinstance( x, np.ndarray ):
         return eh.array_to_bytes( x )
     return x
 def _is_possibly_ndarray_type( t ):
-    """Checks if a type annotation is possibly an NDArray."""
+    """Check if a type annotation is or contains NDArray.
+    Args:
+        t: A type annotation to check.
+    Returns:
+        ``True`` if the type is ``NDArray`` or a union containing ``NDArray``
+        (e.g., ``NDArray | None``), ``False`` otherwise.
+    """
     # Directly an NDArray
     if t == NDArray:
@@ -133,10 +132,40 @@ def _is_possibly_ndarray_type( t ):
 @dataclass
 class PackableSample( ABC ):
-    """A sample that can be packed and unpacked with msgpack"""
+    """Base class for samples that can be serialized with msgpack.
+    This abstract base class provides automatic serialization/deserialization
+    for dataclass-based samples. Fields annotated as ``NDArray`` or
+    ``NDArray | None`` are automatically converted between numpy arrays and
+    bytes during packing/unpacking.
+    Subclasses should be defined either by:
+    1. Direct inheritance with the ``@dataclass`` decorator
+    2. Using the ``@packable`` decorator (recommended)
+    Example:
+        >>> @packable
+        ... class MyData:
+        ...     name: str
+        ...     embeddings: NDArray
+        ...
+        >>> sample = MyData(name="test", embeddings=np.array([1.0, 2.0]))
+        >>> packed = sample.packed  # Serialize to bytes
+        >>> restored = MyData.from_bytes(packed)  # Deserialize
+    """
     def _ensure_good( self ):
-        """TODO Stupid kludge because of __post_init__ nonsense for wrapped classes"""
+        """Auto-convert annotated NDArray fields from bytes to numpy arrays.
+        This method scans all dataclass fields and for any field annotated as
+        ``NDArray`` or ``NDArray | None``, automatically converts bytes values
+        to numpy arrays using the helper deserialization function. This enables
+        transparent handling of array serialization in msgpack data.
+        Note:
+            This is called during ``__post_init__`` to ensure proper type
+            conversion after deserialization.
+        """
         # Auto-convert known types when annotated
         # for var_name, var_type in vars( self.__class__ )['__annotations__'].items():
@@ -154,11 +183,8 @@ class PackableSample( ABC ):
                 # based on what is provided
                 if isinstance( var_cur_value, np.ndarray ):
-                    # we're good!
-                    pass
-                # elif isinstance( var_cur_value, ArrayBytes ):
-                #     setattr( self, var_name, var_cur_value.to_numpy )
+                    # Already the correct type, no conversion needed
+                    continue
                 elif isinstance( var_cur_value, bytes ):
                     # TODO This does create a constraint that serialized bytes
@@ -173,19 +199,45 @@ class PackableSample( ABC ):
     @classmethod
     def from_data( cls, data: MsgpackRawSample ) -> Self:
-        """Create a sample instance from unpacked msgpack data"""
+        """Create a sample instance from unpacked msgpack data.
+        Args:
+            data: A dictionary of unpacked msgpack data with keys matching
+                the sample's field names.
+        Returns:
+            A new instance of this sample class with fields populated from
+            the data dictionary and NDArray fields auto-converted from bytes.
+        """
         ret = cls( **data )
         ret._ensure_good()
         return ret
     @classmethod
     def from_bytes( cls, bs: bytes ) -> Self:
-        """Create a sample instance from raw msgpack bytes"""
+        """Create a sample instance from raw msgpack bytes.
+        Args:
+            bs: Raw bytes from a msgpack-serialized sample.
+        Returns:
+            A new instance of this sample class deserialized from the bytes.
+        """
         return cls.from_data( ormsgpack.unpackb( bs ) )
     @property
     def packed( self ) -> bytes:
-        """Pack this sample's data into msgpack bytes"""
+        """Pack this sample's data into msgpack bytes.
+        NDArray fields are automatically converted to bytes before packing.
+        All other fields are packed as-is if they're msgpack-compatible.
+        Returns:
+            Raw msgpack bytes representing this sample's data.
+        Raises:
+            RuntimeError: If msgpack serialization fails.
+        """
         # Make sure that all of our (possibly unpackable) data is in a packable
         # format
@@ -204,7 +256,15 @@ class PackableSample( ABC ):
     # TODO Expand to allow for specifying explicit __key__
     @property
     def as_wds( self ) -> WDSRawSample:
-        """Pack this sample's data for writing to webdataset"""
+        """Pack this sample's data for writing to WebDataset.
+        Returns:
+            A dictionary with ``__key__`` (UUID v1 for sortable keys) and
+            ``msgpack`` (packed sample data) fields suitable for WebDataset.
+        Note:
+            TODO: Expand to allow specifying explicit ``__key__`` values.
+        """
         return {
             # Generates a UUID that is timelike-sortable
             '__key__': str( uuid.uuid1( 0, 0 ) ),
@@ -212,30 +272,86 @@ class PackableSample( ABC ):
         }
 def _batch_aggregate( xs: Sequence ):
+    """Aggregate a sequence of values into a batch-appropriate format.
+    Args:
+        xs: A sequence of values to aggregate. If the first element is a numpy
+            array, all elements are stacked into a single array. Otherwise,
+            returns a list.
+    Returns:
+        A numpy array (if elements are arrays) or a list (otherwise).
+    """
     if not xs:
         # Empty sequence
         return []
-    # Aggregate
+    # Aggregate
     if isinstance( xs[0], np.ndarray ):
         return np.array( list( xs ) )
     return list( xs )
 class SampleBatch( Generic[DT] ):
+    """A batch of samples with automatic attribute aggregation.
+    This class wraps a sequence of samples and provides magic ``__getattr__``
+    access to aggregate sample attributes. When you access an attribute that
+    exists on the sample type, it automatically aggregates values across all
+    samples in the batch.
+    NDArray fields are stacked into a numpy array with a batch dimension.
+    Other fields are aggregated into a list.
+    Type Parameters:
+        DT: The sample type, must derive from ``PackableSample``.
+    Attributes:
+        samples: The list of sample instances in this batch.
+    Example:
+        >>> batch = SampleBatch[MyData]([sample1, sample2, sample3])
+        >>> batch.embeddings  # Returns stacked numpy array of shape (3, ...)
+        >>> batch.names  # Returns list of names
+    """
     def __init__( self, samples: Sequence[DT] ):
-        """TODO"""
+        """Create a batch from a sequence of samples.
+        Args:
+            samples: A sequence of sample instances to aggregate into a batch.
+                Each sample must be an instance of a type derived from
+                ``PackableSample``.
+        """
         self.samples = list( samples )
         self._aggregate_cache = dict()
     @property
     def sample_type( self ) -> Type:
-        """The type of each sample in this batch"""
+        """The type of each sample in this batch.
+        Returns:
+            The type parameter ``DT`` used when creating this ``SampleBatch[DT]``.
+        """
         return typing.get_args( self.__orig_class__)[0]
     def __getattr__( self, name ):
+        """Aggregate an attribute across all samples in the batch.
+        This magic method enables attribute-style access to aggregated sample
+        fields. Results are cached for efficiency.
+        Args:
+            name: The attribute name to aggregate across samples.
+        Returns:
+            For NDArray fields: a stacked numpy array with batch dimension.
+            For other fields: a list of values from each sample.
+        Raises:
+            AttributeError: If the attribute doesn't exist on the sample type.
+        """
         # Aggregate named params of sample type
         if name in vars( self.sample_type )['__annotations__']:
             if name not in self._aggregate_cache:
@@ -243,91 +359,112 @@ class SampleBatch( Generic[DT] ):
                     [ getattr( x, name )
                       for x in self.samples ]
                 )
-            return self._aggregate_cache[name]
-        raise AttributeError( f'No sample attribute named {name}' )
-# class AnySample( BaseModel ):
-#     """A sample that can hold anything"""
-#     value: Any
+            return self._aggregate_cache[name]
-# class AnyBatch( BaseModel ):
-#     """A batch of `AnySample`s"""
-#     values: list[AnySample]
+        raise AttributeError( f'No sample attribute named {name}' )
 ST = TypeVar( 'ST', bound = PackableSample )
-# BT = TypeVar( 'BT' )
 RT = TypeVar( 'RT', bound = PackableSample )
-# TODO For python 3.13
-# BT = TypeVar( 'BT', default = None )
-# IT = TypeVar( 'IT', default = Any )
 class Dataset( Generic[ST] ):
-    """A dataset that ingests and formats raw samples from a WebDataset
+    """A typed dataset built on WebDataset with lens transformations.
+    This class wraps WebDataset tar archives and provides type-safe iteration
+    over samples of a specific ``PackableSample`` type. Samples are stored as
+    msgpack-serialized data within WebDataset shards.
+    The dataset supports:
+    - Ordered and shuffled iteration
+    - Automatic batching with ``SampleBatch``
+    - Type transformations via the lens system (``as_type()``)
+    - Export to parquet format
+    Type Parameters:
+        ST: The sample type for this dataset, must derive from ``PackableSample``.
+    Attributes:
+        url: WebDataset brace-notation URL for the tar file(s).
+    Example:
+        >>> ds = Dataset[MyData]("path/to/data-{000000..000009}.tar")
+        >>> for sample in ds.ordered(batch_size=32):
+        ...     # sample is SampleBatch[MyData] with batch_size samples
+        ...     embeddings = sample.embeddings  # shape: (32, ...)
+        ...
+        >>> # Transform to a different view
+        >>> ds_view = ds.as_type(MyDataView)
-    (Abstract base for subclassing)
     """
-    # sample_class: Type = get_parameters( )
-    # """The type of each returned sample from this `Dataset`'s iterator"""
-    # batch_class: Type = get_bound( BT )
-    # """The type of a batch built from `sample_class`"""
     @property
     def sample_type( self ) -> Type:
-        """The type of each returned sample from this `Dataset`'s iterator"""
-        # TODO Figure out why linting fails here
+        """The type of each returned sample from this dataset's iterator.
+        Returns:
+            The type parameter ``ST`` used when creating this ``Dataset[ST]``.
+        Note:
+            Extracts the type parameter at runtime using ``__orig_class__``.
+        """
+        # NOTE: Linting may fail here due to __orig_class__ being a runtime attribute
         return typing.get_args( self.__orig_class__ )[0]
     @property
     def batch_type( self ) -> Type:
-        """The type of a batch built from `sample_class`"""
-        # return self.__orig_class__.__args__[1]
-        return SampleBatch[self.sample_type]
+        """The type of batches produced by this dataset.
-    # _schema_registry_sample: dict[str, Type]
-    # _schema_registry_batch: dict[str, Type | None]
+        Returns:
+            ``SampleBatch[ST]`` where ``ST`` is this dataset's sample type.
+        """
+        return SampleBatch[self.sample_type]
-    #
+    def __init__( self, url: str,
+                 metadata_url: str | None = None,
+             ) -> None:
+        """Create a dataset from a WebDataset URL.
-    def __init__( self, url: str ) -> None:
-        """TODO"""
+        Args:
+            url: WebDataset brace-notation URL pointing to tar files, e.g.,
+                ``"path/to/file-{000000..000009}.tar"`` for multiple shards or
+                ``"path/to/file-000000.tar"`` for a single shard.
+        """
         super().__init__()
         self.url = url
+        """WebDataset brace-notation URL pointing to tar files, e.g.,
+                ``"path/to/file-{000000..000009}.tar"`` for multiple shards or
+                ``"path/to/file-000000.tar"`` for a single shard.
+        """
+        self._metadata: dict[str, Any] | None = None
+        self.metadata_url: str | None = metadata_url
+        """Optional URL to msgpack-encoded metadata for this dataset."""
         # Allow addition of automatic transformation of raw underlying data
         self._output_lens: Lens | None = None
     def as_type( self, other: Type[RT] ) -> 'Dataset[RT]':
-        """TODO"""
+        """View this dataset through a different sample type using a registered lens.
+        Args:
+            other: The target sample type to transform into. Must be a type
+                derived from ``PackableSample``.
+        Returns:
+            A new ``Dataset`` instance that yields samples of type ``other``
+            by applying the appropriate lens transformation from the global
+            ``LensNetwork`` registry.
+        Raises:
+            ValueError: If no registered lens exists between the current
+                sample type and the target type.
+        """
         ret = Dataset[other]( self.url )
         # Get the singleton lens registry
         lenses = LensNetwork()
         ret._output_lens = lenses.transform( self.sample_type, ret.sample_type )
         return ret
-    # @classmethod
-    # def register( cls, uri: str,
-    #             sample_class: Type,
-    #             batch_class: Optional[Type] = None,
-    #         ):
-    #     """Register an `ekumen` schema to use a particular dataset sample class"""
-    #     cls._schema_registry_sample[uri] = sample_class
-    #     cls._schema_registry_batch[uri] = batch_class
-    # @classmethod
-    # def at( cls, uri: str ) -> 'Dataset':
-    #     """Create a Dataset for the `ekumen` index entry at `uri`"""
-    #     client = eat.Client()
-    #     return cls( )
-    # Common functionality
     @property
     def shard_list( self ) -> list[str]:
         """List of individual dataset shards
@@ -341,6 +478,27 @@ class Dataset( Generic[ST] ):
             wds.filters.map( lambda x: x['url'] )
         )
         return list( pipe )
+    @property
+    def metadata( self ) -> dict[str, Any] | None:
+        """Fetch and cache metadata from metadata_url.
+        Returns:
+            Deserialized metadata dictionary, or None if no metadata_url is set.
+        Raises:
+            requests.HTTPError: If metadata fetch fails.
+        """
+        if self.metadata_url is None:
+            return None
+        if self._metadata is None:
+            with requests.get( self.metadata_url, stream = True ) as response:
+                response.raise_for_status()
+                self._metadata = msgpack.unpackb( response.content, raw = False )
+        # Use our cached values
+        return self._metadata
     def ordered( self,
                 batch_size: int | None = 1,
@@ -359,22 +517,17 @@ class Dataset( Generic[ST] ):
         """
         if batch_size is None:
-            # TODO Duplication here
             return wds.pipeline.DataPipeline(
                 wds.shardlists.SimpleShardList( self.url ),
                 wds.shardlists.split_by_worker,
-                #
                 wds.tariterators.tarfile_to_samples(),
-                # wds.map( self.preprocess ),
                 wds.filters.map( self.wrap ),
             )
         return wds.pipeline.DataPipeline(
             wds.shardlists.SimpleShardList( self.url ),
             wds.shardlists.split_by_worker,
-            #
             wds.tariterators.tarfile_to_samples(),
-            # wds.map( self.preprocess ),
             wds.filters.batched( batch_size ),
             wds.filters.map( self.wrap_batch ),
         )
@@ -384,30 +537,30 @@ class Dataset( Generic[ST] ):
                 buffer_samples: int = 10_000,
                 batch_size: int | None = 1,
             ) -> Iterable[ST]:
-        """Iterate over the dataset in random order
+        """Iterate over the dataset in random order.
         Args:
-            buffer_shards (int): Asdf
-            batch_size (:obj:`int`, optional) The size of iterated batches.
-                Default: 1. If ``None``, iterates over one sample at a time
-                with no batch dimension.
+            buffer_shards: Number of shards to buffer for shuffling at the
+                shard level. Larger values increase randomness but use more
+                memory. Default: 100.
+            buffer_samples: Number of samples to buffer for shuffling within
+                shards. Larger values increase randomness but use more memory.
+                Default: 10,000.
+            batch_size: The size of iterated batches. Default: 1. If ``None``,
+                iterates over one sample at a time with no batch dimension.
         Returns:
-            :obj:`webdataset.DataPipeline` A data pipeline that iterates over
-                the dataset in its original sample order
+            A WebDataset data pipeline that iterates over the dataset in
+            randomized order. If ``batch_size`` is not ``None``, yields
+            ``SampleBatch[ST]`` instances; otherwise yields individual ``ST``
+            samples.
         """
         if batch_size is None:
-            # TODO Duplication here
             return wds.pipeline.DataPipeline(
                 wds.shardlists.SimpleShardList( self.url ),
                 wds.filters.shuffle( buffer_shards ),
                 wds.shardlists.split_by_worker,
-                #
                 wds.tariterators.tarfile_to_samples(),
-                # wds.shuffle( buffer_samples ),
-                # wds.map( self.preprocess ),
                 wds.filters.shuffle( buffer_samples ),
                 wds.filters.map( self.wrap ),
             )
@@ -416,10 +569,7 @@ class Dataset( Generic[ST] ):
             wds.shardlists.SimpleShardList( self.url ),
             wds.filters.shuffle( buffer_shards ),
             wds.shardlists.split_by_worker,
-            #
             wds.tariterators.tarfile_to_samples(),
-            # wds.shuffle( buffer_samples ),
-            # wds.map( self.preprocess ),
             wds.filters.shuffle( buffer_samples ),
             wds.filters.batched( batch_size ),
             wds.filters.map( self.wrap_batch ),
@@ -462,11 +612,11 @@ class Dataset( Generic[ST] ):
             cur_segment = 0
             cur_buffer = []
-            path_template = (path.parent / f'{path.stem}-%06d.{path.suffix}').as_posix()
+            path_template = (path.parent / f'{path.stem}-{{:06d}}{path.suffix}').as_posix()
             for x in self.ordered( batch_size = None ):
                 cur_buffer.append( sample_map( x ) )
                 if len( cur_buffer ) >= maxcount:
                     # Write current segment
                     cur_path = path_template.format( cur_segment )
@@ -482,25 +632,17 @@ class Dataset( Generic[ST] ):
                 df = pd.DataFrame( cur_buffer )
                 df.to_parquet( cur_path, **kwargs )
+    def wrap( self, sample: MsgpackRawSample ) -> ST:
+        """Wrap a raw msgpack sample into the appropriate dataset-specific type.
-    # Implemented by specific subclasses
-    # @property
-    # @abstractmethod
-    # def url( self ) -> str:
-    #     """str: Brace-notation URL of the underlying full WebDataset"""
-    #     pass
-    # @classmethod
-    # # TODO replace Any with IT
-    # def preprocess( cls, sample: WDSRawSample ) -> Any:
-    #     """Pre-built preprocessor for a raw `sample` from the given dataset"""
-    #     return sample
+        Args:
+            sample: A dictionary containing at minimum a ``'msgpack'`` key with
+                serialized sample bytes.
-    # @classmethod
-    # TODO replace Any with IT
-    def wrap( self, sample: MsgpackRawSample ) -> ST:
-        """Wrap a `sample` into the appropriate dataset-specific type"""
+        Returns:
+            A deserialized sample of type ``ST``, optionally transformed through
+            a lens if ``as_type()`` was called.
+        """
         assert 'msgpack' in sample
         assert type( sample['msgpack'] ) == bytes
@@ -509,24 +651,21 @@ class Dataset( Generic[ST] ):
         source_sample = self._output_lens.source_type.from_bytes( sample['msgpack'] )
         return self._output_lens( source_sample )
-        # try:
-        #     assert type( sample ) == dict
-        #     return cls.sample_class( **{
-        #         k: v
-        #         for k, v in sample.items() if k != '__key__'
-        #     } )
-        # except Exception as e:
-        #     # Sample constructor failed -- revert to default
-        #     return AnySample(
-        #         value = sample,
-        #     )
     def wrap_batch( self, batch: WDSRawBatch ) -> SampleBatch[ST]:
-        """Wrap a `batch` of samples into the appropriate dataset-specific type
-        This default implementation simply creates a list one sample at a time
+        """Wrap a batch of raw msgpack samples into a typed SampleBatch.
+        Args:
+            batch: A dictionary containing a ``'msgpack'`` key with a list of
+                serialized sample bytes.
+        Returns:
+            A ``SampleBatch[ST]`` containing deserialized samples, optionally
+            transformed through a lens if ``as_type()`` was called.
+        Note:
+            This implementation deserializes samples one at a time, then
+            aggregates them into a batch.
         """
         assert 'msgpack' in batch
@@ -542,38 +681,32 @@ class Dataset( Generic[ST] ):
                        for s in batch_source ]
         return SampleBatch[self.sample_type]( batch_view )
-    # # @classmethod
-    # def wrap_batch( self, batch: WDSRawBatch ) -> BT:
-    #     """Wrap a `batch` of samples into the appropriate dataset-specific type
-    #     This default implementation simply creates a list one sample at a time
-    #     """
-    #     assert cls.batch_class is not None, 'No batch class specified'
-    #     return cls.batch_class( **batch )
-##
-# Shortcut decorators
-# def packable( cls ):
-#     """TODO"""
-#     def decorator( cls ):
-#         # Create a new class dynamically
-#         # The new class inherits from the new_parent_class first, then the original cls
-#         new_bases = (PackableSample,) + cls.__bases__
-#         new_cls = type(cls.__name__, new_bases, dict(cls.__dict__))
-#         # Optionally, update __module__ and __qualname__ for better introspection
-#         new_cls.__module__ = cls.__module__
-#         new_cls.__qualname__ = cls.__qualname__
-#         return new_cls
-#     return decorator
 def packable( cls ):
-    """TODO"""
+    """Decorator to convert a regular class into a ``PackableSample``.
+    This decorator transforms a class into a dataclass that inherits from
+    ``PackableSample``, enabling automatic msgpack serialization/deserialization
+    with special handling for NDArray fields.
+    Args:
+        cls: The class to convert. Should have type annotations for its fields.
+    Returns:
+        A new dataclass that inherits from ``PackableSample`` with the same
+        name and annotations as the original class.
+    Example:
+        >>> @packable
+        ... class MyData:
+        ...     name: str
+        ...     values: NDArray
+        ...
+        >>> sample = MyData(name="test", values=np.array([1, 2, 3]))
+        >>> bytes_data = sample.packed
+        >>> restored = MyData.from_bytes(bytes_data)
+    """
     ##
     class_name = cls.__name__

atdata 0.1.3b3__py3-none-any.whl → 0.2.0a1__py3-none-any.whl

atdata 0.1.3b3__py3-none-any.whl → 0.2.0a1__py3-none-any.whl