PyPI - atdata - Versions diffs - 0.1.3b3__py3-none-any.whl → 0.1.3b4__py3-none-any.whl - Mend

atdata 0.1.3b3__py3-none-any.whl → 0.1.3b4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

atdata/__init__.py +36 -1
atdata/_helpers.py +39 -3
atdata/dataset.py +299 -37
atdata/lens.py +179 -26
atdata-0.1.3b4.dist-info/METADATA +172 -0
atdata-0.1.3b4.dist-info/RECORD +9 -0
atdata-0.1.3b3.dist-info/METADATA +0 -18
atdata-0.1.3b3.dist-info/RECORD +0 -9
{atdata-0.1.3b3.dist-info → atdata-0.1.3b4.dist-info}/WHEEL +0 -0
{atdata-0.1.3b3.dist-info → atdata-0.1.3b4.dist-info}/entry_points.txt +0 -0
{atdata-0.1.3b3.dist-info → atdata-0.1.3b4.dist-info}/licenses/LICENSE +0 -0

atdata/__init__.py CHANGED Viewed

@@ -1,4 +1,39 @@
-"""A loose federation of distributed, typed datasets"""
+"""A loose federation of distributed, typed datasets.
+``atdata`` provides a typed dataset abstraction built on WebDataset, with support
+for:
+- **Typed samples** with automatic msgpack serialization
+- **NDArray handling** with transparent bytes conversion
+- **Lens transformations** for viewing datasets through different type schemas
+- **Batch aggregation** with automatic numpy array stacking
+- **WebDataset integration** for efficient large-scale dataset storage
+Quick Start:
+    >>> import atdata
+    >>> import numpy as np
+    >>>
+    >>> @atdata.packable
+    ... class MyData:
+    ...     features: np.ndarray
+    ...     label: str
+    >>>
+    >>> # Create dataset from WebDataset tar files
+    >>> ds = atdata.Dataset[MyData]("path/to/data-{000000..000009}.tar")
+    >>>
+    >>> # Iterate with automatic batching
+    >>> for batch in ds.shuffled(batch_size=32):
+    ...     features = batch.features  # numpy array (32, ...)
+    ...     labels = batch.label  # list of 32 strings
+Main Components:
+    - ``PackableSample``: Base class for msgpack-serializable samples
+    - ``Dataset``: Typed dataset wrapper for WebDataset
+    - ``SampleBatch``: Automatic batch aggregation
+    - ``Lens``: Bidirectional type transformations
+    - ``@packable``: Decorator for creating PackableSample classes
+    - ``@lens``: Decorator for creating lens transformations
+"""
 ##
 # Expose components

atdata/_helpers.py CHANGED Viewed

@@ -1,4 +1,16 @@
-"""Assorted helper methods for `atdata`"""
+"""Helper utilities for numpy array serialization.
+This module provides utility functions for converting numpy arrays to and from
+bytes for msgpack serialization. The functions use numpy's native save/load
+format to preserve array dtype and shape information.
+Functions:
+    - ``array_to_bytes()``: Serialize numpy array to bytes
+    - ``bytes_to_array()``: Deserialize bytes to numpy array
+These helpers are used internally by ``PackableSample`` to enable transparent
+handling of NDArray fields during msgpack packing/unpacking.
+"""
 ##
 # Imports
@@ -11,12 +23,36 @@ import numpy as np
 ##
 def array_to_bytes( x: np.ndarray ) -> bytes:
-    """Convert `numpy` array to a format suitable for packing"""
+    """Convert a numpy array to bytes for msgpack serialization.
+    Uses numpy's native ``save()`` format to preserve array dtype and shape.
+    Args:
+        x: A numpy array to serialize.
+    Returns:
+        Raw bytes representing the serialized array.
+    Note:
+        Uses ``allow_pickle=True`` to support object dtypes.
+    """
     np_bytes = BytesIO()
     np.save( np_bytes, x, allow_pickle = True )
     return np_bytes.getvalue()
 def bytes_to_array( b: bytes ) -> np.ndarray:
-    """Convert packed bytes back to a `numpy` array"""
+    """Convert serialized bytes back to a numpy array.
+    Reverses the serialization performed by ``array_to_bytes()``.
+    Args:
+        b: Raw bytes from a serialized numpy array.
+    Returns:
+        The deserialized numpy array with original dtype and shape.
+    Note:
+        Uses ``allow_pickle=True`` to support object dtypes.
+    """
     np_bytes = BytesIO( b )
     return np.load( np_bytes, allow_pickle = True )

atdata/dataset.py CHANGED Viewed

@@ -1,4 +1,29 @@
-"""Schematized WebDatasets"""
+"""Core dataset and sample infrastructure for typed WebDatasets.
+This module provides the core components for working with typed, msgpack-serialized
+samples in WebDataset format:
+- ``PackableSample``: Base class for msgpack-serializable samples with automatic
+  NDArray handling
+- ``SampleBatch``: Automatic batching with attribute aggregation
+- ``Dataset``: Generic typed dataset wrapper for WebDataset tar files
+- ``@packable``: Decorator to convert regular classes into PackableSample subclasses
+The implementation handles automatic conversion between numpy arrays and bytes
+during serialization, enabling efficient storage of numerical data in WebDataset
+archives.
+Example:
+    >>> @packable
+    ... class ImageSample:
+    ...     image: NDArray
+    ...     label: str
+    ...
+    >>> ds = Dataset[ImageSample]("data-{000000..000009}.tar")
+    >>> for batch in ds.shuffled(batch_size=32):
+    ...     images = batch.image  # Stacked numpy array (32, H, W, C)
+    ...     labels = batch.label  # List of 32 strings
+"""
 ##
 # Imports
@@ -107,6 +132,15 @@ MsgpackRawSample: TypeAlias = Dict[str, Any]
 #         return eh.bytes_to_array( self.raw_bytes )
 def _make_packable( x ):
+    """Convert a value to a msgpack-compatible format.
+    Args:
+        x: A value to convert. If it's a numpy array, converts to bytes.
+            Otherwise returns the value unchanged.
+    Returns:
+        The value in a format suitable for msgpack serialization.
+    """
     # if isinstance( x, ArrayBytes ):
     #     return x.raw_bytes
     if isinstance( x, np.ndarray ):
@@ -114,7 +148,15 @@ def _make_packable( x ):
     return x
 def _is_possibly_ndarray_type( t ):
-    """Checks if a type annotation is possibly an NDArray."""
+    """Check if a type annotation is or contains NDArray.
+    Args:
+        t: A type annotation to check.
+    Returns:
+        ``True`` if the type is ``NDArray`` or a union containing ``NDArray``
+        (e.g., ``NDArray | None``), ``False`` otherwise.
+    """
     # Directly an NDArray
     if t == NDArray:
@@ -133,10 +175,40 @@ def _is_possibly_ndarray_type( t ):
 @dataclass
 class PackableSample( ABC ):
-    """A sample that can be packed and unpacked with msgpack"""
+    """Base class for samples that can be serialized with msgpack.
+    This abstract base class provides automatic serialization/deserialization
+    for dataclass-based samples. Fields annotated as ``NDArray`` or
+    ``NDArray | None`` are automatically converted between numpy arrays and
+    bytes during packing/unpacking.
+    Subclasses should be defined either by:
+    1. Direct inheritance with the ``@dataclass`` decorator
+    2. Using the ``@packable`` decorator (recommended)
+    Example:
+        >>> @packable
+        ... class MyData:
+        ...     name: str
+        ...     embeddings: NDArray
+        ...
+        >>> sample = MyData(name="test", embeddings=np.array([1.0, 2.0]))
+        >>> packed = sample.packed  # Serialize to bytes
+        >>> restored = MyData.from_bytes(packed)  # Deserialize
+    """
     def _ensure_good( self ):
-        """TODO Stupid kludge because of __post_init__ nonsense for wrapped classes"""
+        """Auto-convert annotated NDArray fields from bytes to numpy arrays.
+        This method scans all dataclass fields and for any field annotated as
+        ``NDArray`` or ``NDArray | None``, automatically converts bytes values
+        to numpy arrays using the helper deserialization function. This enables
+        transparent handling of array serialization in msgpack data.
+        Note:
+            This is called during ``__post_init__`` to ensure proper type
+            conversion after deserialization.
+        """
         # Auto-convert known types when annotated
         # for var_name, var_type in vars( self.__class__ )['__annotations__'].items():
@@ -173,19 +245,45 @@ class PackableSample( ABC ):
     @classmethod
     def from_data( cls, data: MsgpackRawSample ) -> Self:
-        """Create a sample instance from unpacked msgpack data"""
+        """Create a sample instance from unpacked msgpack data.
+        Args:
+            data: A dictionary of unpacked msgpack data with keys matching
+                the sample's field names.
+        Returns:
+            A new instance of this sample class with fields populated from
+            the data dictionary and NDArray fields auto-converted from bytes.
+        """
         ret = cls( **data )
         ret._ensure_good()
         return ret
     @classmethod
     def from_bytes( cls, bs: bytes ) -> Self:
-        """Create a sample instance from raw msgpack bytes"""
+        """Create a sample instance from raw msgpack bytes.
+        Args:
+            bs: Raw bytes from a msgpack-serialized sample.
+        Returns:
+            A new instance of this sample class deserialized from the bytes.
+        """
         return cls.from_data( ormsgpack.unpackb( bs ) )
     @property
     def packed( self ) -> bytes:
-        """Pack this sample's data into msgpack bytes"""
+        """Pack this sample's data into msgpack bytes.
+        NDArray fields are automatically converted to bytes before packing.
+        All other fields are packed as-is if they're msgpack-compatible.
+        Returns:
+            Raw msgpack bytes representing this sample's data.
+        Raises:
+            RuntimeError: If msgpack serialization fails.
+        """
         # Make sure that all of our (possibly unpackable) data is in a packable
         # format
@@ -204,7 +302,15 @@ class PackableSample( ABC ):
     # TODO Expand to allow for specifying explicit __key__
     @property
     def as_wds( self ) -> WDSRawSample:
-        """Pack this sample's data for writing to webdataset"""
+        """Pack this sample's data for writing to WebDataset.
+        Returns:
+            A dictionary with ``__key__`` (UUID v1 for sortable keys) and
+            ``msgpack`` (packed sample data) fields suitable for WebDataset.
+        Note:
+            TODO: Expand to allow specifying explicit ``__key__`` values.
+        """
         return {
             # Generates a UUID that is timelike-sortable
             '__key__': str( uuid.uuid1( 0, 0 ) ),
@@ -212,30 +318,86 @@ class PackableSample( ABC ):
         }
 def _batch_aggregate( xs: Sequence ):
+    """Aggregate a sequence of values into a batch-appropriate format.
+    Args:
+        xs: A sequence of values to aggregate. If the first element is a numpy
+            array, all elements are stacked into a single array. Otherwise,
+            returns a list.
+    Returns:
+        A numpy array (if elements are arrays) or a list (otherwise).
+    """
     if not xs:
         # Empty sequence
         return []
-    # Aggregate
+    # Aggregate
     if isinstance( xs[0], np.ndarray ):
         return np.array( list( xs ) )
     return list( xs )
 class SampleBatch( Generic[DT] ):
+    """A batch of samples with automatic attribute aggregation.
+    This class wraps a sequence of samples and provides magic ``__getattr__``
+    access to aggregate sample attributes. When you access an attribute that
+    exists on the sample type, it automatically aggregates values across all
+    samples in the batch.
+    NDArray fields are stacked into a numpy array with a batch dimension.
+    Other fields are aggregated into a list.
+    Type Parameters:
+        DT: The sample type, must derive from ``PackableSample``.
+    Attributes:
+        samples: The list of sample instances in this batch.
+    Example:
+        >>> batch = SampleBatch[MyData]([sample1, sample2, sample3])
+        >>> batch.embeddings  # Returns stacked numpy array of shape (3, ...)
+        >>> batch.names  # Returns list of names
+    """
     def __init__( self, samples: Sequence[DT] ):
-        """TODO"""
+        """Create a batch from a sequence of samples.
+        Args:
+            samples: A sequence of sample instances to aggregate into a batch.
+                Each sample must be an instance of a type derived from
+                ``PackableSample``.
+        """
         self.samples = list( samples )
         self._aggregate_cache = dict()
     @property
     def sample_type( self ) -> Type:
-        """The type of each sample in this batch"""
+        """The type of each sample in this batch.
+        Returns:
+            The type parameter ``DT`` used when creating this ``SampleBatch[DT]``.
+        """
         return typing.get_args( self.__orig_class__)[0]
     def __getattr__( self, name ):
+        """Aggregate an attribute across all samples in the batch.
+        This magic method enables attribute-style access to aggregated sample
+        fields. Results are cached for efficiency.
+        Args:
+            name: The attribute name to aggregate across samples.
+        Returns:
+            For NDArray fields: a stacked numpy array with batch dimension.
+            For other fields: a list of values from each sample.
+        Raises:
+            AttributeError: If the attribute doesn't exist on the sample type.
+        """
         # Aggregate named params of sample type
         if name in vars( self.sample_type )['__annotations__']:
             if name not in self._aggregate_cache:
@@ -243,9 +405,9 @@ class SampleBatch( Generic[DT] ):
                     [ getattr( x, name )
                       for x in self.samples ]
                 )
             return self._aggregate_cache[name]
         raise AttributeError( f'No sample attribute named {name}' )
@@ -268,9 +430,32 @@ RT = TypeVar( 'RT', bound = PackableSample )
 # IT = TypeVar( 'IT', default = Any )
 class Dataset( Generic[ST] ):
-    """A dataset that ingests and formats raw samples from a WebDataset
-    (Abstract base for subclassing)
+    """A typed dataset built on WebDataset with lens transformations.
+    This class wraps WebDataset tar archives and provides type-safe iteration
+    over samples of a specific ``PackableSample`` type. Samples are stored as
+    msgpack-serialized data within WebDataset shards.
+    The dataset supports:
+    - Ordered and shuffled iteration
+    - Automatic batching with ``SampleBatch``
+    - Type transformations via the lens system (``as_type()``)
+    - Export to parquet format
+    Type Parameters:
+        ST: The sample type for this dataset, must derive from ``PackableSample``.
+    Attributes:
+        url: WebDataset brace-notation URL for the tar file(s).
+    Example:
+        >>> ds = Dataset[MyData]("path/to/data-{000000..000009}.tar")
+        >>> for sample in ds.ordered(batch_size=32):
+        ...     # sample is SampleBatch[MyData] with batch_size samples
+        ...     embeddings = sample.embeddings  # shape: (32, ...)
+        ...
+        >>> # Transform to a different view
+        >>> ds_view = ds.as_type(MyDataView)
     """
     # sample_class: Type = get_parameters( )
@@ -280,12 +465,23 @@ class Dataset( Generic[ST] ):
     @property
     def sample_type( self ) -> Type:
-        """The type of each returned sample from this `Dataset`'s iterator"""
-        # TODO Figure out why linting fails here
+        """The type of each returned sample from this dataset's iterator.
+        Returns:
+            The type parameter ``ST`` used when creating this ``Dataset[ST]``.
+        Note:
+            Extracts the type parameter at runtime using ``__orig_class__``.
+        """
+        # NOTE: Linting may fail here due to __orig_class__ being a runtime attribute
         return typing.get_args( self.__orig_class__ )[0]
     @property
     def batch_type( self ) -> Type:
-        """The type of a batch built from `sample_class`"""
+        """The type of batches produced by this dataset.
+        Returns:
+            ``SampleBatch[ST]`` where ``ST`` is this dataset's sample type.
+        """
         # return self.__orig_class__.__args__[1]
         return SampleBatch[self.sample_type]
@@ -296,7 +492,13 @@ class Dataset( Generic[ST] ):
     #
     def __init__( self, url: str ) -> None:
-        """TODO"""
+        """Create a dataset from a WebDataset URL.
+        Args:
+            url: WebDataset brace-notation URL pointing to tar files, e.g.,
+                ``"path/to/file-{000000..000009}.tar"`` for multiple shards or
+                ``"path/to/file-000000.tar"`` for a single shard.
+        """
         super().__init__()
         self.url = url
@@ -304,7 +506,21 @@ class Dataset( Generic[ST] ):
         self._output_lens: Lens | None = None
     def as_type( self, other: Type[RT] ) -> 'Dataset[RT]':
-        """TODO"""
+        """View this dataset through a different sample type using a registered lens.
+        Args:
+            other: The target sample type to transform into. Must be a type
+                derived from ``PackableSample``.
+        Returns:
+            A new ``Dataset`` instance that yields samples of type ``other``
+            by applying the appropriate lens transformation from the global
+            ``LensNetwork`` registry.
+        Raises:
+            ValueError: If no registered lens exists between the current
+                sample type and the target type.
+        """
         ret = Dataset[other]( self.url )
         # Get the singleton lens registry
         lenses = LensNetwork()
@@ -384,18 +600,23 @@ class Dataset( Generic[ST] ):
                 buffer_samples: int = 10_000,
                 batch_size: int | None = 1,
             ) -> Iterable[ST]:
-        """Iterate over the dataset in random order
+        """Iterate over the dataset in random order.
         Args:
-            buffer_shards (int): Asdf
-            batch_size (:obj:`int`, optional) The size of iterated batches.
-                Default: 1. If ``None``, iterates over one sample at a time
-                with no batch dimension.
+            buffer_shards: Number of shards to buffer for shuffling at the
+                shard level. Larger values increase randomness but use more
+                memory. Default: 100.
+            buffer_samples: Number of samples to buffer for shuffling within
+                shards. Larger values increase randomness but use more memory.
+                Default: 10,000.
+            batch_size: The size of iterated batches. Default: 1. If ``None``,
+                iterates over one sample at a time with no batch dimension.
         Returns:
-            :obj:`webdataset.DataPipeline` A data pipeline that iterates over
-                the dataset in its original sample order
+            A WebDataset data pipeline that iterates over the dataset in
+            randomized order. If ``batch_size`` is not ``None``, yields
+            ``SampleBatch[ST]`` instances; otherwise yields individual ``ST``
+            samples.
         """
         if batch_size is None:
@@ -500,7 +721,16 @@ class Dataset( Generic[ST] ):
     # @classmethod
     # TODO replace Any with IT
     def wrap( self, sample: MsgpackRawSample ) -> ST:
-        """Wrap a `sample` into the appropriate dataset-specific type"""
+        """Wrap a raw msgpack sample into the appropriate dataset-specific type.
+        Args:
+            sample: A dictionary containing at minimum a ``'msgpack'`` key with
+                serialized sample bytes.
+        Returns:
+            A deserialized sample of type ``ST``, optionally transformed through
+            a lens if ``as_type()`` was called.
+        """
         assert 'msgpack' in sample
         assert type( sample['msgpack'] ) == bytes
@@ -524,9 +754,19 @@ class Dataset( Generic[ST] ):
         #     )
     def wrap_batch( self, batch: WDSRawBatch ) -> SampleBatch[ST]:
-        """Wrap a `batch` of samples into the appropriate dataset-specific type
-        This default implementation simply creates a list one sample at a time
+        """Wrap a batch of raw msgpack samples into a typed SampleBatch.
+        Args:
+            batch: A dictionary containing a ``'msgpack'`` key with a list of
+                serialized sample bytes.
+        Returns:
+            A ``SampleBatch[ST]`` containing deserialized samples, optionally
+            transformed through a lens if ``as_type()`` was called.
+        Note:
+            This implementation deserializes samples one at a time, then
+            aggregates them into a batch.
         """
         assert 'msgpack' in batch
@@ -572,8 +812,30 @@ class Dataset( Generic[ST] ):
 #     return decorator
 def packable( cls ):
-    """TODO"""
+    """Decorator to convert a regular class into a ``PackableSample``.
+    This decorator transforms a class into a dataclass that inherits from
+    ``PackableSample``, enabling automatic msgpack serialization/deserialization
+    with special handling for NDArray fields.
+    Args:
+        cls: The class to convert. Should have type annotations for its fields.
+    Returns:
+        A new dataclass that inherits from ``PackableSample`` with the same
+        name and annotations as the original class.
+    Example:
+        >>> @packable
+        ... class MyData:
+        ...     name: str
+        ...     values: NDArray
+        ...
+        >>> sample = MyData(name="test", values=np.array([1, 2, 3]))
+        >>> bytes_data = sample.packed
+        >>> restored = MyData.from_bytes(bytes_data)
+    """
     ##
     class_name = cls.__name__

atdata/lens.py CHANGED Viewed

@@ -1,4 +1,42 @@
-"""Lenses between typed datasets"""
+"""Lens-based type transformations for datasets.
+This module implements a lens system for bidirectional transformations between
+different sample types. Lenses enable viewing a dataset through different type
+schemas without duplicating the underlying data.
+Key components:
+- ``Lens``: Bidirectional transformation with getter (S -> V) and optional
+  putter (V, S -> S)
+- ``LensNetwork``: Global singleton registry for lens transformations
+- ``@lens``: Decorator to create and register lens transformations
+Lenses support the functional programming concept of composable, well-behaved
+transformations that satisfy lens laws (GetPut and PutGet).
+Example:
+    >>> @packable
+    ... class FullData:
+    ...     name: str
+    ...     age: int
+    ...     embedding: NDArray
+    ...
+    >>> @packable
+    ... class NameOnly:
+    ...     name: str
+    ...
+    >>> @lens
+    ... def name_view(full: FullData) -> NameOnly:
+    ...     return NameOnly(name=full.name)
+    ...
+    >>> @name_view.putter
+    ... def name_view_put(view: NameOnly, source: FullData) -> FullData:
+    ...     return FullData(name=view.name, age=source.age,
+    ...                     embedding=source.embedding)
+    ...
+    >>> ds = Dataset[FullData]("data.tar")
+    >>> ds_names = ds.as_type(NameOnly)  # Uses registered lens
+"""
 ##
 # Imports
@@ -39,24 +77,45 @@ type LensPutter[S, V] = Callable[[V, S], S]
 # Shortcut decorators
 class Lens( Generic[S, V] ):
-    """TODO"""
-    # @property
-    # def source_type( self ) -> Type[S]:
-    #     """The source type (S) for the lens; what is put to"""
-    #     # TODO Figure out why linting fails here
-    #     return self.__orig_class__.__args__[0]
-    # @property
-    # def view_type( self ) -> Type[V]:
-    #     """The view type (V) for the lens; what is get'd from"""
-    #     # TODO FIgure out why linting fails here
-    #     return self.__orig_class__.__args__[1]
+    """A bidirectional transformation between two sample types.
+    A lens provides a way to view and update data of type ``S`` (source) as if
+    it were type ``V`` (view). It consists of a getter that transforms ``S -> V``
+    and an optional putter that transforms ``(V, S) -> S``, enabling updates to
+    the view to be reflected back in the source.
+    Type Parameters:
+        S: The source type, must derive from ``PackableSample``.
+        V: The view type, must derive from ``PackableSample``.
+    Example:
+        >>> @lens
+        ... def name_lens(full: FullData) -> NameOnly:
+        ...     return NameOnly(name=full.name)
+        ...
+        >>> @name_lens.putter
+        ... def name_lens_put(view: NameOnly, source: FullData) -> FullData:
+        ...     return FullData(name=view.name, age=source.age)
+    """
     def __init__( self, get: LensGetter[S, V],
                 put: Optional[LensPutter[S, V]] = None
             ) -> None:
-        """TODO"""
+        """Initialize a lens with a getter and optional putter function.
+        Args:
+            get: A function that transforms from source type ``S`` to view type
+                ``V``. Must accept exactly one parameter annotated with the
+                source type.
+            put: An optional function that updates the source based on a modified
+                view. Takes a view of type ``V`` and original source of type ``S``,
+                and returns an updated source of type ``S``. If not provided, a
+                trivial putter is used that ignores updates to the view.
+        Raises:
+            AssertionError: If the getter function doesn't have exactly one
+                parameter.
+        """
         ##
         # Check argument validity
@@ -70,11 +129,11 @@ class Lens( Generic[S, V] ):
         functools.update_wrapper( self, get )
         self.source_type: Type[PackableSample] = input_types[0].annotation
-        self.view_type = sig.return_annotation
+        self.view_type: Type[PackableSample] = sig.return_annotation
         # Store the getter
         self._getter = get
         # Determine and store the putter
         if put is None:
             # Trivial putter does not update the source
@@ -86,7 +145,20 @@ class Lens( Generic[S, V] ):
     #
     def putter( self, put: LensPutter[S, V] ) -> LensPutter[S, V]:
-        """TODO"""
+        """Decorator to register a putter function for this lens.
+        Args:
+            put: A function that takes a view of type ``V`` and source of type
+                ``S``, and returns an updated source of type ``S``.
+        Returns:
+            The putter function, allowing this to be used as a decorator.
+        Example:
+            >>> @my_lens.putter
+            ... def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
+            ...     return SourceType(...)
+        """
         ##
         self._putter = put
         return put
@@ -94,16 +166,39 @@ class Lens( Generic[S, V] ):
     # Methods to actually execute transformations
     def put( self, v: V, s: S ) -> S:
-        """TODO"""
+        """Update the source based on a modified view.
+        Args:
+            v: The modified view of type ``V``.
+            s: The original source of type ``S``.
+        Returns:
+            An updated source of type ``S`` that reflects changes from the view.
+        """
         return self._putter( v, s )
     def get( self, s: S ) -> V:
-        """TODO"""
+        """Transform the source into the view type.
+        Args:
+            s: The source sample of type ``S``.
+        Returns:
+            A view of the source as type ``V``.
+        """
         return self( s )
     # Convenience to enable calling the lens as its getter
     def __call__( self, s: S ) -> V:
+        """Apply the lens transformation (same as ``get()``).
+        Args:
+            s: The source sample of type ``S``.
+        Returns:
+            A view of the source as type ``V``.
+        """
         return self._getter( s )
 # TODO Figure out how to properly parameterize this
@@ -124,6 +219,28 @@ class Lens( Generic[S, V] ):
 # lens = _lens_factory
 def lens(  f: LensGetter[S, V] ) -> Lens[S, V]:
+    """Decorator to create and register a lens transformation.
+    This decorator converts a getter function into a ``Lens`` object and
+    automatically registers it in the global ``LensNetwork`` registry.
+    Args:
+        f: A getter function that transforms from source type ``S`` to view
+            type ``V``. Must have exactly one parameter with a type annotation.
+    Returns:
+        A ``Lens[S, V]`` object that can be called to apply the transformation
+        or decorated with ``@lens_name.putter`` to add a putter function.
+    Example:
+        >>> @lens
+        ... def extract_name(full: FullData) -> NameOnly:
+        ...     return NameOnly(name=full.name)
+        ...
+        >>> @extract_name.putter
+        ... def extract_name_put(view: NameOnly, source: FullData) -> FullData:
+        ...     return FullData(name=view.name, age=source.age)
+    """
     ret = Lens[S, V]( f )
     _network.register( ret )
     return ret
@@ -136,25 +253,46 @@ def lens(  f: LensGetter[S, V] ) -> Lens[S, V]:
 # """TODO"""
 class LensNetwork:
-    """TODO"""
+    """Global registry for lens transformations between sample types.
+    This class implements a singleton pattern to maintain a global registry of
+    all lenses decorated with ``@lens``. It enables looking up transformations
+    between different ``PackableSample`` types.
+    Attributes:
+        _instance: The singleton instance of this class.
+        _registry: Dictionary mapping ``(source_type, view_type)`` tuples to
+            their corresponding ``Lens`` objects.
+    """
     _instance = None
     """The singleton instance"""
     def __new__(cls, *args, **kwargs):
+        """Ensure only one instance of LensNetwork exists (singleton pattern)."""
         if cls._instance is None:
             # If no instance exists, create a new one
             cls._instance = super().__new__(cls)
         return cls._instance  # Return the existing (or newly created) instance
     def __init__(self):
+        """Initialize the lens registry (only on first instantiation)."""
         if not hasattr(self, '_initialized'):  # Check if already initialized
             self._registry: Dict[LensSignature, Lens] = dict()
             self._initialized = True
     def register( self, _lens: Lens ):
-        """Set `lens` as the canonical view between its source and view types"""
+        """Register a lens as the canonical transformation between two types.
+        Args:
+            _lens: The lens to register. Will be stored in the registry under
+                the key ``(_lens.source_type, _lens.view_type)``.
+        Note:
+            If a lens already exists for the same type pair, it will be
+            overwritten.
+        """
         # sig = inspect.signature( _lens.get )
         # input_types = list( sig.parameters.values() )
         # assert len( input_types ) == 1, \
@@ -169,13 +307,28 @@ class LensNetwork:
         self._registry[_lens.source_type, _lens.view_type] = _lens
     def transform( self, source: DatasetType, view: DatasetType ) -> Lens:
-        """TODO"""
+        """Look up the lens transformation between two sample types.
+        Args:
+            source: The source sample type (must derive from ``PackableSample``).
+            view: The target view type (must derive from ``PackableSample``).
+        Returns:
+            The registered ``Lens`` that transforms from ``source`` to ``view``.
+        Raises:
+            ValueError: If no lens has been registered for the given type pair.
+        Note:
+            Currently only supports direct transformations. Compositional
+            transformations (chaining multiple lenses) are not yet implemented.
+        """
         # TODO Handle compositional closure
         ret = self._registry.get( (source, view), None )
         if ret is None:
             raise ValueError( f'No registered lens from source {source} to view {view}' )
         return ret

atdata-0.1.3b4.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,172 @@
+Metadata-Version: 2.4
+Name: atdata
+Version: 0.1.3b4
+Summary: A loose federation of distributed, typed datasets
+Author-email: Maxine Levesque <hello@maxine.science>
+License-File: LICENSE
+Requires-Python: >=3.12
+Requires-Dist: fastparquet>=2024.11.0
+Requires-Dist: msgpack>=1.1.2
+Requires-Dist: numpy>=2.3.4
+Requires-Dist: ormsgpack>=1.11.0
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: tqdm>=4.67.1
+Requires-Dist: webdataset>=1.0.2
+Description-Content-Type: text/markdown
+# atdata
+[![codecov](https://codecov.io/gh/foundation-ac/atdata/branch/main/graph/badge.svg)](https://codecov.io/gh/foundation-ac/atdata)
+A loose federation of distributed, typed datasets built on WebDataset.
+**atdata** provides a type-safe, composable framework for working with large-scale datasets. It combines the efficiency of WebDataset's tar-based storage with Python's type system and functional programming patterns.
+## Features
+- **Typed Samples** - Define dataset schemas using Python dataclasses with automatic msgpack serialization
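+- **Lens Transformations** - Bidirectional, composable transformations between different dataset views (note: only directly registered lenses are looked up for now; automatic chaining of multiple lenses is not yet implemented)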
+- **Automatic Batching** - Smart batch aggregation with numpy array stacking
+- **WebDataset Integration** - Efficient storage and streaming for large-scale datasets
+## Installation
+```bash
+pip install atdata
+```
+Requires Python 3.12 or later.
+## Quick Start
+### Defining Sample Types
+Use the `@packable` decorator to create typed dataset samples:
+```python
+import atdata
+from numpy.typing import NDArray
+@atdata.packable
+class ImageSample:
+    image: NDArray
+    label: str
+    metadata: dict
+```
+### Creating Datasets
+```python
+# Create a dataset
+dataset = atdata.Dataset[ImageSample]("path/to/data-{000000..000009}.tar")
+# Iterate over samples in order
+for sample in dataset.ordered(batch_size=None):
+    print(f"Label: {sample.label}, Image shape: {sample.image.shape}")
+# Iterate with shuffling and batching
+for batch in dataset.shuffled(batch_size=32):
+    # batch.image is automatically stacked into shape (32, ...)
+    # batch.label is a list of 32 labels
+    process_batch(batch.image, batch.label)
+```
+### Lens Transformations
+Define reusable transformations between sample types:
+```python
+@atdata.packable
+class ProcessedSample:
+    features: NDArray
+    label: str
+@atdata.lens
+def preprocess(sample: ImageSample) -> ProcessedSample:
+    features = extract_features(sample.image)
+    return ProcessedSample(features=features, label=sample.label)
+# Apply lens to view dataset as ProcessedSample
+processed_ds = dataset.as_type(ProcessedSample)
+for sample in processed_ds.ordered(batch_size=None):
+    # sample is now a ProcessedSample
+    print(sample.features.shape)
+```
+## Core Concepts
+### PackableSample
+Base class for serializable samples. Fields annotated as `NDArray` are automatically converted to and from bytes during serialization:
+```python
+@atdata.packable
+class MySample:
+    array_field: NDArray      # Automatically serialized
+    optional_array: NDArray | None
+    regular_field: str
+```
+### Lens
+Bidirectional transformations with getter/putter semantics:
+```python
+@atdata.lens
+def my_lens(source: SourceType) -> ViewType:
+    # Transform source -> view
+    return ViewType(...)
+@my_lens.putter
+def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
+    # Transform view -> source
+    return SourceType(...)
+```
+### Dataset URLs
+Uses WebDataset brace expansion for sharded datasets:
+- Single file: `"data/dataset-000000.tar"`
+- Multiple shards: `"data/dataset-{000000..000099}.tar"`
+- Multiple patterns: `"data/{train,val}/dataset-{000000..000009}.tar"`
+## Development
+### Setup
+```bash
+# Install uv if not already available
+python -m pip install uv
+# Install dependencies
+uv sync
+```
+### Testing
+```bash
+# Run all tests with coverage
+pytest
+# Run specific test file
+pytest tests/test_dataset.py
+# Run single test
+pytest tests/test_lens.py::test_lens
+```
+### Building
+```bash
+uv build
+```
+## Contributing
+Contributions are welcome! This project is in beta, so the API may still evolve.
+## License
+This project is licensed under the Mozilla Public License 2.0. See [LICENSE](LICENSE) for details.

atdata-0.1.3b4.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+atdata/__init__.py,sha256=_363ZuJfwbBQTMYsoKOiyoBe4AHr3iplK-EQyrAeTdg,1545
+atdata/_helpers.py,sha256=RvA-Xlj3AvgSWuiPdS8YTBp8AJT-u32BaLpxsu4PIIA,1564
+atdata/dataset.py,sha256=O_7b3ub_M4IMRuhv95oz1PVFdsOhNiyXgtY8NphPdBk,27842
+atdata/lens.py,sha256=ynn1DQkR89eRL6JV9EsawuPY9JTrZ67pAX4cRvZ6UVk,11157
+atdata-0.1.3b4.dist-info/METADATA,sha256=SdZSI_SonE-pt4nhmFh5bz9zKD79wT2CKXKFxrTfvgc,4162
+atdata-0.1.3b4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+atdata-0.1.3b4.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
+atdata-0.1.3b4.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
+atdata-0.1.3b4.dist-info/RECORD,,

atdata-0.1.3b3.dist-info/METADATA DELETED Viewed

@@ -1,18 +0,0 @@
-Metadata-Version: 2.4
-Name: atdata
-Version: 0.1.3b3
-Summary: A loose federation of distributed, typed datasets
-Author-email: Maxine Levesque <hello@maxine.science>
-License-File: LICENSE
-Requires-Python: >=3.12
-Requires-Dist: fastparquet>=2024.11.0
-Requires-Dist: msgpack>=1.1.2
-Requires-Dist: numpy>=2.3.4
-Requires-Dist: ormsgpack>=1.11.0
-Requires-Dist: pandas>=2.3.3
-Requires-Dist: tqdm>=4.67.1
-Requires-Dist: webdataset>=1.0.2
-Description-Content-Type: text/markdown
-# atdata
-A loose federation of distributed, typed datasets

atdata-0.1.3b3.dist-info/RECORD DELETED Viewed

@@ -1,9 +0,0 @@
-atdata/__init__.py,sha256=V2qBg7i2mfCNG9nww6Gi_fDp7iwolDMrNzhmNO6VA7M,233
-atdata/_helpers.py,sha256=R63JhXewAKZYnZ9Th7R6yZh0IOUPYGBsth3FpRUMD-U,503
-atdata/dataset.py,sha256=qyAiKSjjYqFVWmaLz5LAIZ3_YVHbm5lg32zmctqjjlE,18085
-atdata/lens.py,sha256=HvXuRqYTeJBpMyIQVdGZXxEvbGKBuFCF8lbiib4TqsA,5306
-atdata-0.1.3b3.dist-info/METADATA,sha256=jrGZ592QbkJdZCq8FLmXOznQ0LkTUyUkqLVIH3ZRj4U,529
-atdata-0.1.3b3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-atdata-0.1.3b3.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
-atdata-0.1.3b3.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
-atdata-0.1.3b3.dist-info/RECORD,,

{atdata-0.1.3b3.dist-info → atdata-0.1.3b4.dist-info}/WHEEL RENAMED Viewed

File without changes

{atdata-0.1.3b3.dist-info → atdata-0.1.3b4.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{atdata-0.1.3b3.dist-info → atdata-0.1.3b4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

atdata 0.1.3b3__py3-none-any.whl → 0.1.3b4__py3-none-any.whl

atdata 0.1.3b3-py3-none-any.whl → 0.1.3b4-py3-none-any.whl