atdata 0.1.2a4__tar.gz → 0.1.3b3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {atdata-0.1.2a4 → atdata-0.1.3b3}/PKG-INFO +4 -1
- {atdata-0.1.2a4 → atdata-0.1.3b3}/pyproject.toml +4 -1
- {atdata-0.1.2a4 → atdata-0.1.3b3}/src/atdata/__init__.py +6 -0
- {atdata-0.1.2a4 → atdata-0.1.3b3}/src/atdata/dataset.py +187 -56
- atdata-0.1.3b3/src/atdata/lens.py +200 -0
- {atdata-0.1.2a4 → atdata-0.1.3b3}/tests/test_dataset.py +84 -1
- atdata-0.1.3b3/tests/test_lens.py +166 -0
- {atdata-0.1.2a4 → atdata-0.1.3b3}/.github/workflows/uv-publish-pypi.yml +0 -0
- {atdata-0.1.2a4 → atdata-0.1.3b3}/.github/workflows/uv-test.yml +0 -0
- {atdata-0.1.2a4 → atdata-0.1.3b3}/.gitignore +0 -0
- {atdata-0.1.2a4 → atdata-0.1.3b3}/.python-version +0 -0
- {atdata-0.1.2a4 → atdata-0.1.3b3}/LICENSE +0 -0
- {atdata-0.1.2a4 → atdata-0.1.3b3}/README.md +0 -0
- {atdata-0.1.2a4 → atdata-0.1.3b3}/src/atdata/_helpers.py +0 -0
{atdata-0.1.2a4 → atdata-0.1.3b3}/PKG-INFO

@@ -1,13 +1,16 @@
 Metadata-Version: 2.4
 Name: atdata
-Version: 0.1.2a4
+Version: 0.1.3b3
 Summary: A loose federation of distributed, typed datasets
 Author-email: Maxine Levesque <hello@maxine.science>
 License-File: LICENSE
 Requires-Python: >=3.12
+Requires-Dist: fastparquet>=2024.11.0
 Requires-Dist: msgpack>=1.1.2
 Requires-Dist: numpy>=2.3.4
 Requires-Dist: ormsgpack>=1.11.0
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: tqdm>=4.67.1
 Requires-Dist: webdataset>=1.0.2
 Description-Content-Type: text/markdown
 
{atdata-0.1.2a4 → atdata-0.1.3b3}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "atdata"
-version = "0.1.2a4"
+version = "0.1.3b3"
 description = "A loose federation of distributed, typed datasets"
 readme = "README.md"
 authors = [
@@ -8,9 +8,12 @@ authors = [
 ]
 requires-python = ">=3.12"
 dependencies = [
+    "fastparquet>=2024.11.0",
     "msgpack>=1.1.2",
     "numpy>=2.3.4",
     "ormsgpack>=1.11.0",
+    "pandas>=2.3.3",
+    "tqdm>=4.67.1",
     "webdataset>=1.0.2",
 ]
 
{atdata-0.1.2a4 → atdata-0.1.3b3}/src/atdata/dataset.py

@@ -5,21 +5,34 @@
 
 import webdataset as wds
 
-import
-from dataclasses import dataclass
+from pathlib import Path
 import uuid
+import functools
 
-import
-
+import dataclasses
+import types
+from dataclasses import (
+    dataclass,
+    asdict,
+)
 from abc import (
     ABC,
     abstractmethod,
 )
+
+from tqdm import tqdm
+import numpy as np
+import pandas as pd
+
+import typing
 from typing import (
     Any,
     Optional,
     Dict,
     Sequence,
+    Iterable,
+    Callable,
+    Union,
     #
     Self,
     Generic,
@@ -40,14 +53,20 @@ from numpy.typing import (
 import msgpack
 import ormsgpack
 from . import _helpers as eh
+from .lens import Lens, LensNetwork
 
 
 ##
 # Typing help
 
+Pathlike = str | Path
+
 WDSRawSample: TypeAlias = Dict[str, Any]
 WDSRawBatch: TypeAlias = Dict[str, Any]
 
+SampleExportRow: TypeAlias = Dict[str, Any]
+SampleExportMap: TypeAlias = Callable[['PackableSample'], SampleExportRow]
+
 
 ##
 # Main base classes
@@ -94,6 +113,25 @@ def _make_packable( x ):
         return eh.array_to_bytes( x )
     return x
 
+def _is_possibly_ndarray_type( t ):
+    """Checks if a type annotation is possibly an NDArray."""
+
+    # Directly an NDArray
+    if t == NDArray:
+        # print( 'is an NDArray' )
+        return True
+
+    # Check for Optionals (i.e., NDArray | None)
+    if isinstance( t, types.UnionType ):
+        t_parts = t.__args__
+        if any( x == NDArray
+                for x in t_parts ):
+            return True
+
+    # Not an NDArray
+    return False
+
+@dataclass
 class PackableSample( ABC ):
     """A sample that can be packed and unpacked with msgpack"""
 
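For orientation, the union branch above is what lets a field annotated `NDArray | None` (as in the new `NumpyOptionalSampleDecorated` test case below) pass the check. A quick illustration of the predicate; these calls are for reference only, not part of the diff:

    from numpy.typing import NDArray

    _is_possibly_ndarray_type( NDArray )         # True: direct annotation
    _is_possibly_ndarray_type( NDArray | None )  # True: union containing NDArray
    _is_possibly_ndarray_type( int )             # False: unrelated type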
@@ -101,10 +139,13 @@ class PackableSample( ABC ):
         """TODO Stupid kludge because of __post_init__ nonsense for wrapped classes"""
 
         # Auto-convert known types when annotated
-        for var_name, var_type in vars( self.__class__ )['__annotations__'].items():
+        # for var_name, var_type in vars( self.__class__ )['__annotations__'].items():
+        for field in dataclasses.fields( self ):
+            var_name = field.name
+            var_type = field.type
 
             # Annotation for this variable is to be an NDArray
-            if var_type
+            if _is_possibly_ndarray_type( var_type ):
                 # ... so, we'll always auto-convert to numpy
 
                 var_cur_value = getattr( self, var_name )
@@ -120,6 +161,9 @@ class PackableSample( ABC ):
                 # setattr( self, var_name, var_cur_value.to_numpy )
 
                 elif isinstance( var_cur_value, bytes ):
+                    # TODO This does create a constraint that serialized bytes
+                    # in a field that might be an NDArray are always interpreted
+                    # as being the NDArray interpretation
                     setattr( self, var_name, eh.bytes_to_array( var_cur_value ) )
 
     def __post_init__( self ):
@@ -189,7 +233,7 @@ class SampleBatch( Generic[DT] ):
     @property
     def sample_type( self ) -> Type:
         """The type of each sample in this batch"""
-        return self.__orig_class__
+        return typing.get_args( self.__orig_class__)[0]
 
     def __getattr__( self, name ):
         # Aggregate named params of sample type
@@ -217,6 +261,8 @@ class SampleBatch( Generic[DT] ):
 ST = TypeVar( 'ST', bound = PackableSample )
 # BT = TypeVar( 'BT' )
 
+RT = TypeVar( 'RT', bound = PackableSample )
+
 # TODO For python 3.13
 # BT = TypeVar( 'BT', default = None )
 # IT = TypeVar( 'IT', default = Any )
@@ -235,7 +281,8 @@ class Dataset( Generic[ST] ):
     @property
     def sample_type( self ) -> Type:
         """The type of each returned sample from this `Dataset`'s iterator"""
-        return self.__orig_class__
+        # TODO Figure out why linting fails here
+        return typing.get_args( self.__orig_class__ )[0]
     @property
     def batch_type( self ) -> Type:
         """The type of a batch built from `sample_class`"""
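Both `sample_type` properties now unwrap the concrete type parameter from `__orig_class__` instead of returning the parameterized alias itself. A minimal standalone illustration of the mechanism, with a made-up class name:

    from typing import Generic, TypeVar, get_args

    T = TypeVar( 'T' )

    class Box( Generic[T] ):
        pass

    b = Box[int]()
    # __orig_class__ records the parameterized generic on the instance;
    # get_args() recovers its type arguments
    assert get_args( b.__orig_class__ )[0] is int

Note that `__orig_class__` is only set on the instance after `__init__` returns, which may be related to the linting TODO above.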
@@ -253,6 +300,17 @@ class Dataset( Generic[ST] ):
         super().__init__()
         self.url = url
 
+        # Allow addition of automatic transformation of raw underlying data
+        self._output_lens: Lens | None = None
+
+    def as_type( self, other: Type[RT] ) -> 'Dataset[RT]':
+        """TODO"""
+        ret = Dataset[other]( self.url )
+        # Get the singleton lens registry
+        lenses = LensNetwork()
+        ret._output_lens = lenses.transform( self.sample_type, ret.sample_type )
+        return ret
+
     # @classmethod
     # def register( cls, uri: str,
     #         sample_class: Type,
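`as_type` produces a re-typed handle on the same underlying shards: it looks up a registered lens from the current sample type to the requested one and stashes it as `_output_lens`, so samples are decoded in the source type and then passed through the lens. A sketch of intended use, with type names assumed rather than taken from the package:

    # Assumes a lens from RawSample to CleanSample was registered
    # beforehand with the @atdata.lens decorator (see lens.py below)
    ds = atdata.Dataset[RawSample]( 'shards-{000000..000009}.tar' )
    clean = ds.as_type( CleanSample )   # behaves as Dataset[CleanSample]

    # LensNetwork.transform raises ValueError if no lens is registered
    # for the (RawSample, CleanSample) pair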
@@ -278,15 +336,15 @@ class Dataset( Generic[ST] ):
         A full (non-lazy) list of the individual ``tar`` files within the
         source WebDataset.
         """
-        pipe = wds.DataPipeline(
-            wds.SimpleShardList( self.url ),
-            wds.map( lambda x: x['url'] )
+        pipe = wds.pipeline.DataPipeline(
+            wds.shardlists.SimpleShardList( self.url ),
+            wds.filters.map( lambda x: x['url'] )
         )
         return list( pipe )
 
     def ordered( self,
             batch_size: int | None = 1,
-            ) ->
+            ) -> Iterable[ST]:
         """Iterate over the dataset in order
 
         Args:
@@ -302,30 +360,30 @@ class Dataset( Generic[ST] ):
 
         if batch_size is None:
             # TODO Duplication here
-            return wds.DataPipeline(
-                wds.SimpleShardList( self.url ),
-                wds.split_by_worker,
+            return wds.pipeline.DataPipeline(
+                wds.shardlists.SimpleShardList( self.url ),
+                wds.shardlists.split_by_worker,
                 #
-                wds.tarfile_to_samples(),
+                wds.tariterators.tarfile_to_samples(),
                 # wds.map( self.preprocess ),
-                wds.map( self.wrap ),
+                wds.filters.map( self.wrap ),
             )
 
-        return wds.DataPipeline(
-            wds.SimpleShardList( self.url ),
-            wds.split_by_worker,
+        return wds.pipeline.DataPipeline(
+            wds.shardlists.SimpleShardList( self.url ),
+            wds.shardlists.split_by_worker,
             #
-            wds.tarfile_to_samples(),
+            wds.tariterators.tarfile_to_samples(),
             # wds.map( self.preprocess ),
-            wds.batched( batch_size ),
-            wds.map( self.wrap_batch ),
+            wds.filters.batched( batch_size ),
+            wds.filters.map( self.wrap_batch ),
         )
 
     def shuffled( self,
             buffer_shards: int = 100,
             buffer_samples: int = 10_000,
             batch_size: int | None = 1,
-            ) ->
+            ) -> Iterable[ST]:
         """Iterate over the dataset in random order
 
         Args:
@@ -342,30 +400,88 @@ class Dataset( Generic[ST] ):
 
         if batch_size is None:
             # TODO Duplication here
-            return wds.DataPipeline(
-                wds.SimpleShardList( self.url ),
-                wds.shuffle( buffer_shards ),
-                wds.split_by_worker,
+            return wds.pipeline.DataPipeline(
+                wds.shardlists.SimpleShardList( self.url ),
+                wds.filters.shuffle( buffer_shards ),
+                wds.shardlists.split_by_worker,
                 #
-                wds.tarfile_to_samples(),
+                wds.tariterators.tarfile_to_samples(),
                 # wds.shuffle( buffer_samples ),
                 # wds.map( self.preprocess ),
-                wds.shuffle( buffer_samples ),
-                wds.map( self.wrap ),
+                wds.filters.shuffle( buffer_samples ),
+                wds.filters.map( self.wrap ),
             )
 
-        return wds.DataPipeline(
-            wds.SimpleShardList( self.url ),
-            wds.shuffle( buffer_shards ),
-            wds.split_by_worker,
+        return wds.pipeline.DataPipeline(
+            wds.shardlists.SimpleShardList( self.url ),
+            wds.filters.shuffle( buffer_shards ),
+            wds.shardlists.split_by_worker,
             #
-            wds.tarfile_to_samples(),
+            wds.tariterators.tarfile_to_samples(),
             # wds.shuffle( buffer_samples ),
             # wds.map( self.preprocess ),
-            wds.shuffle( buffer_samples ),
-            wds.batched( batch_size ),
-            wds.map( self.wrap_batch ),
+            wds.filters.shuffle( buffer_samples ),
+            wds.filters.batched( batch_size ),
+            wds.filters.map( self.wrap_batch ),
         )
+
+    # TODO Rewrite to eliminate `pandas` dependency directly calling
+    # `fastparquet`
+    def to_parquet( self, path: Pathlike,
+            sample_map: Optional[SampleExportMap] = None,
+            maxcount: Optional[int] = None,
+            **kwargs,
+            ):
+        """Save dataset contents to a `parquet` file at `path`
+
+        `kwargs` sent to `pandas.to_parquet`
+        """
+        ##
+
+        # Normalize args
+        path = Path( path )
+        if sample_map is None:
+            sample_map = asdict
+
+        verbose = kwargs.get( 'verbose', False )
+
+        it = self.ordered( batch_size = None )
+        if verbose:
+            it = tqdm( it )
+
+        #
+
+        if maxcount is None:
+            # Load and save full dataset
+            df = pd.DataFrame( [ sample_map( x )
+                                 for x in self.ordered( batch_size = None ) ] )
+            df.to_parquet( path, **kwargs )
+
+        else:
+            # Load and save dataset in segments of size `maxcount`
+
+            cur_segment = 0
+            cur_buffer = []
+            path_template = (path.parent / f'{path.stem}-%06d.{path.suffix}').as_posix()
+
+            for x in self.ordered( batch_size = None ):
+                cur_buffer.append( sample_map( x ) )
+
+                if len( cur_buffer ) >= maxcount:
+                    # Write current segment
+                    cur_path = path_template.format( cur_segment )
+                    df = pd.DataFrame( cur_buffer )
+                    df.to_parquet( cur_path, **kwargs )
+
+                    cur_segment += 1
+                    cur_buffer = []
+
+            if len( cur_buffer ) > 0:
+                # Write one last segment with remainder
+                cur_path = path_template.format( cur_segment )
+                df = pd.DataFrame( cur_buffer )
+                df.to_parquet( cur_path, **kwargs )
+
 
     # Implemented by specific subclasses
 
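A sketch of how the new export entry point might be called; paths and type names here are assumed, not taken from the package:

    ds = atdata.Dataset[MySample]( 'shards-{000000..000009}.tar' )

    # Single file: each sample becomes one row via dataclasses.asdict
    ds.to_parquet( 'export/full.parquet' )

    # Segmented: flush a new file after every 10_000 samples
    ds.to_parquet( 'export/by-segment.parquet', maxcount = 10_000 )

    # A custom row mapping can flatten or drop fields first
    ds.to_parquet( 'export/names-only.parquet',
                   sample_map = lambda s: { 'name': s.name } )

Two details worth noting in the added code: the `verbose`/`tqdm` iterator `it` is built but never consumed (both branches iterate `self.ordered(...)` directly), and the segment template mixes a `%06d` placeholder with `str.format`, so segmented file names may not come out numbered as intended.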
@@ -388,20 +504,24 @@ class Dataset( Generic[ST] ):
         assert 'msgpack' in sample
         assert type( sample['msgpack'] ) == bytes
 
-        return self.sample_type.from_bytes( sample['msgpack'] )
+        if self._output_lens is None:
+            return self.sample_type.from_bytes( sample['msgpack'] )
+
+        source_sample = self._output_lens.source_type.from_bytes( sample['msgpack'] )
+        return self._output_lens( source_sample )
 
-        try:
-            assert type( sample ) == dict
-            return cls.sample_class( **{
-                k: v
-                for k, v in sample.items() if k != '__key__'
-            } )
+        # try:
+        #     assert type( sample ) == dict
+        #     return cls.sample_class( **{
+        #         k: v
+        #         for k, v in sample.items() if k != '__key__'
+        #     } )
 
-        except Exception as e:
-            # Sample constructor failed -- revert to default
-            return AnySample(
-                value = sample,
-            )
+        # except Exception as e:
+        #     # Sample constructor failed -- revert to default
+        #     return AnySample(
+        #         value = sample,
+        #     )
 
     def wrap_batch( self, batch: WDSRawBatch ) -> SampleBatch[ST]:
         """Wrap a `batch` of samples into the appropriate dataset-specific type
@@ -410,10 +530,17 @@ class Dataset( Generic[ST] ):
         """
 
         assert 'msgpack' in batch
-        batch_unpacked = [ self.sample_type.from_bytes( bs )
-                           for bs in batch['msgpack'] ]
-        return SampleBatch[self.sample_type]( batch_unpacked )
 
+        if self._output_lens is None:
+            batch_unpacked = [ self.sample_type.from_bytes( bs )
+                               for bs in batch['msgpack'] ]
+            return SampleBatch[self.sample_type]( batch_unpacked )
+
+        batch_source = [ self._output_lens.source_type.from_bytes( bs )
+                         for bs in batch['msgpack'] ]
+        batch_view = [ self._output_lens( s )
+                       for s in batch_source ]
+        return SampleBatch[self.sample_type]( batch_view )
 
     # # @classmethod
     # def wrap_batch( self, batch: WDSRawBatch ) -> BT:
@@ -449,6 +576,9 @@ def packable( cls ):
 
     ##
 
+    class_name = cls.__name__
+    class_annotations = cls.__annotations__
+
     # Add in dataclass niceness to original class
     as_dataclass = dataclass( cls )
 
@@ -458,8 +588,9 @@ def packable( cls ):
         def __post_init__( self ):
             return PackableSample.__post_init__( self )
 
-
-    as_packable.
+    # TODO This doesn't properly carry over the original
+    as_packable.__name__ = class_name
+    as_packable.__annotations__ = class_annotations
 
     ##
 
atdata-0.1.3b3/src/atdata/lens.py

@@ -0,0 +1,200 @@
+"""Lenses between typed datasets"""
+
+##
+# Imports
+
+import functools
+import inspect
+
+from typing import (
+    TypeAlias,
+    Type,
+    TypeVar,
+    Tuple,
+    Dict,
+    Callable,
+    Optional,
+    Generic,
+    #
+    TYPE_CHECKING
+)
+
+if TYPE_CHECKING:
+    from .dataset import PackableSample
+
+
+##
+# Typing helpers
+
+DatasetType: TypeAlias = Type['PackableSample']
+LensSignature: TypeAlias = Tuple[DatasetType, DatasetType]
+
+S = TypeVar( 'S', bound = 'PackableSample' )
+V = TypeVar( 'V', bound = 'PackableSample' )
+type LensGetter[S, V] = Callable[[S], V]
+type LensPutter[S, V] = Callable[[V, S], S]
+
+
+##
+# Shortcut decorators
+
+class Lens( Generic[S, V] ):
+    """TODO"""
+
+    # @property
+    # def source_type( self ) -> Type[S]:
+    #     """The source type (S) for the lens; what is put to"""
+    #     # TODO Figure out why linting fails here
+    #     return self.__orig_class__.__args__[0]
+
+    # @property
+    # def view_type( self ) -> Type[V]:
+    #     """The view type (V) for the lens; what is get'd from"""
+    #     # TODO FIgure out why linting fails here
+    #     return self.__orig_class__.__args__[1]
+
+    def __init__( self, get: LensGetter[S, V],
+            put: Optional[LensPutter[S, V]] = None
+            ) -> None:
+        """TODO"""
+        ##
+
+        # Check argument validity
+
+        sig = inspect.signature( get )
+        input_types = list( sig.parameters.values() )
+        assert len( input_types ) == 1, \
+            'Wrong number of input args for lens: should only have one'
+
+        # Update function details for this object as returned by annotation
+        functools.update_wrapper( self, get )
+
+        self.source_type: Type[PackableSample] = input_types[0].annotation
+        self.view_type = sig.return_annotation
+
+        # Store the getter
+        self._getter = get
+
+        # Determine and store the putter
+        if put is None:
+            # Trivial putter does not update the source
+            def _trivial_put( v: V, s: S ) -> S:
+                return s
+            put = _trivial_put
+        self._putter = put
+
+    #
+
+    def putter( self, put: LensPutter[S, V] ) -> LensPutter[S, V]:
+        """TODO"""
+        ##
+        self._putter = put
+        return put
+
+    # Methods to actually execute transformations
+
+    def put( self, v: V, s: S ) -> S:
+        """TODO"""
+        return self._putter( v, s )
+
+    def get( self, s: S ) -> V:
+        """TODO"""
+        return self( s )
+
+    # Convenience to enable calling the lens as its getter
+
+    def __call__( self, s: S ) -> V:
+        return self._getter( s )
+
+# TODO Figure out how to properly parameterize this
+# def _lens_factory[S, V]( register: bool = True ):
+#     """Register the annotated function `f` as the getter of a sample lens"""
+
+#     # The actual lens decorator taking a lens getter function to a lens object
+#     def _decorator( f: LensGetter[S, V] ) -> Lens[S, V]:
+#         ret = Lens[S, V]( f )
+#         if register:
+#             _network.register( ret )
+#         return ret
+
+#     # Return the lens decorator
+#     return _decorator
+
+# # For convenience
+# lens = _lens_factory
+
+def lens( f: LensGetter[S, V] ) -> Lens[S, V]:
+    ret = Lens[S, V]( f )
+    _network.register( ret )
+    return ret
+
+
+##
+# Global registry of used lenses
+
+# _registered_lenses: Dict[LensSignature, Lens] = dict()
+# """TODO"""
+
+class LensNetwork:
+    """TODO"""
+
+    _instance = None
+    """The singleton instance"""
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            # If no instance exists, create a new one
+            cls._instance = super().__new__(cls)
+        return cls._instance  # Return the existing (or newly created) instance
+
+    def __init__(self):
+        if not hasattr(self, '_initialized'):  # Check if already initialized
+            self._registry: Dict[LensSignature, Lens] = dict()
+            self._initialized = True
+
+    def register( self, _lens: Lens ):
+        """Set `lens` as the canonical view between its source and view types"""
+
+        # sig = inspect.signature( _lens.get )
+        # input_types = list( sig.parameters.values() )
+        # assert len( input_types ) == 1, \
+        #     'Wrong number of input args for lens: should only have one'
+
+        # input_type = input_types[0].annotation
+        # print( input_type )
+        # output_type = sig.return_annotation
+
+        # self._registry[input_type, output_type] = _lens
+        # print( _lens.source_type )
+        self._registry[_lens.source_type, _lens.view_type] = _lens
+
+    def transform( self, source: DatasetType, view: DatasetType ) -> Lens:
+        """TODO"""
+
+        # TODO Handle compositional closure
+        ret = self._registry.get( (source, view), None )
+        if ret is None:
+            raise ValueError( f'No registered lens from source {source} to view {view}' )
+
+        return ret
+
+
+# Create global singleton registry instance
+_network = LensNetwork()
+
+# def lens( f: LensPutter ) -> Lens:
+#     """Register the annotated function `f` as a sample lens"""
+#     ##
+
+#     sig = inspect.signature( f )
+
+#     input_types = list( sig.parameters.values() )
+#     output_type = sig.return_annotation
+
+#     _registered_lenses[]
+
+#     f.lens = Lens(
+
+#     )
+
+#     return f
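End to end, the new module is used roughly as follows; a sketch with assumed type names, mirroring the tests below:

    import atdata

    @atdata.packable
    class Source:
        name: str
        age: int

    @atdata.packable
    class View:
        name: str

    # Registers a Source -> View lens in the LensNetwork singleton
    @atdata.lens
    def forget_age( s: Source ) -> View:
        return View( name = s.name )

    # Optional putter; without one, put( v, s ) just returns s unchanged
    @forget_age.putter
    def remember_age( v: View, s: Source ) -> Source:
        return Source( name = v.name, age = s.age )

    # Datasets can then be re-typed through the registry
    ds = atdata.Dataset[Source]( 'source.tar' ).as_type( View )

The lens's source and view types are read from the getter's annotations in `Lens.__init__`, so the getter must be fully annotated; `transform` currently does exact-pair lookup only, with compositional closure left as a TODO.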
{atdata-0.1.2a4 → atdata-0.1.3b3}/tests/test_dataset.py

@@ -50,6 +50,12 @@ class NumpyTestSampleDecorated:
     label: int
     image: NDArray
 
+@atdata.packable
+class NumpyOptionalSampleDecorated:
+    label: int
+    image: NDArray
+    embeddings: NDArray | None = None
+
 test_cases = [
     {
         'SampleType': BasicTestSample,
@@ -59,6 +65,7 @@ test_cases = [
             'value': 1024.768,
         },
         'sample_wds_stem': 'basic_test',
+        'test_parquet': True,
     },
     {
         'SampleType': NumpyTestSample,
@@ -68,6 +75,7 @@ test_cases = [
             'image': np.random.randn( 1024, 1024 ),
         },
         'sample_wds_stem': 'numpy_test',
+        'test_parquet': False,
     },
     {
         'SampleType': BasicTestSampleDecorated,
@@ -77,6 +85,7 @@ test_cases = [
             'value': 1024.768,
         },
         'sample_wds_stem': 'basic_test_decorated',
+        'test_parquet': True,
     },
     {
         'SampleType': NumpyTestSampleDecorated,
@@ -86,6 +95,29 @@ test_cases = [
            'image': np.random.randn( 1024, 1024 ),
         },
         'sample_wds_stem': 'numpy_test_decorated',
+        'test_parquet': False,
+    },
+    {
+        'SampleType': NumpyOptionalSampleDecorated,
+        'sample_data':
+        {
+            'label': 9_001,
+            'image': np.random.randn( 1024, 1024 ),
+            'embeddings': np.random.randn( 512 ),
+        },
+        'sample_wds_stem': 'numpy_optional_decorated',
+        'test_parquet': False,
+    },
+    {
+        'SampleType': NumpyOptionalSampleDecorated,
+        'sample_data':
+        {
+            'label': 9_001,
+            'image': np.random.randn( 1024, 1024 ),
+            'embeddings': None,
+        },
+        'sample_wds_stem': 'numpy_optional_decorated_none',
+        'test_parquet': False,
     },
 ]
 
@@ -175,7 +207,7 @@ def test_wds(
     ).as_posix()
     file_wds_pattern = file_pattern.format( shard_id = '%06d' )
 
-    with wds.ShardWriter(
+    with wds.writer.ShardWriter(
             pattern = file_wds_pattern,
             maxcount = shard_maxcount,
             ) as sink:
@@ -323,5 +355,56 @@ def test_wds(
     assert iterations_run == n_iterate, \
         "Only found {iterations_run} samples, not {n_iterate}"
 
+#
+
+@pytest.mark.parametrize(
+    ('SampleType', 'sample_data', 'sample_wds_stem', 'test_parquet'),
+    [ (
+        case['SampleType'],
+        case['sample_data'],
+        case['sample_wds_stem'],
+        case['test_parquet']
+    )
+    for case in test_cases ]
+)
+def test_parquet_export(
+        SampleType: Type[atdata.PackableSample],
+        sample_data: atds.MsgpackRawSample,
+        sample_wds_stem: str,
+        test_parquet: bool,
+        tmp_path
+        ):
+    """Test our ability to export a dataset to `parquet` format"""
+
+    # Skip irrelevant test cases
+    if not test_parquet:
+        return
+
+    ## Testing hyperparameters
+
+    n_copies_dataset = 1_000
+    n_per_file = 100
+
+    ## Start out by writing tar dataset
+
+    wds_filename = (tmp_path / f'{sample_wds_stem}.tar').as_posix()
+    with wds.writer.TarWriter( wds_filename ) as sink:
+        for _ in range( n_copies_dataset ):
+            new_sample = SampleType.from_data( sample_data )
+            sink.write( new_sample.as_wds )
+
+    ## Now export to `parquet`
+
+    dataset = atdata.Dataset[SampleType]( wds_filename )
+    parquet_filename = tmp_path / f'{sample_wds_stem}.parquet'
+    dataset.to_parquet( parquet_filename )
+
+    parquet_filename = tmp_path / f'{sample_wds_stem}-segments.parquet'
+    dataset.to_parquet( parquet_filename, maxcount = n_per_file )
+
+    ## Double-check our `parquet` export
+
+    # TODO
+
 
 ##
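The final `## Double-check` step is still a TODO in the test; a minimal sketch of what the single-file verification might look like, assuming pandas is used to read the export back:

    import pandas as pd

    df = pd.read_parquet( tmp_path / f'{sample_wds_stem}.parquet' )
    assert len( df ) == n_copies_dataset, \
        f'Expected {n_copies_dataset} rows, found {len( df )}'
    # Every exported row should match the repeated source sample
    for key, value in sample_data.items():
        assert df.iloc[0][key] == value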
atdata-0.1.3b3/tests/test_lens.py

@@ -0,0 +1,166 @@
+"""Test lens functionality."""
+
+##
+# Imports
+
+import pytest
+
+from dataclasses import dataclass
+import webdataset as wds
+import atdata
+
+import numpy as np
+from numpy.typing import NDArray
+
+
+##
+# Tests
+
+def test_lens():
+    """Test a lens between sample types"""
+
+    # Set up the lens scenario
+
+    @atdata.packable
+    class Source:
+        name: str
+        age: int
+        height: float
+
+    @atdata.packable
+    class View:
+        name: str
+        height: float
+
+    @atdata.lens
+    def polite( s: Source ) -> View:
+        return View(
+            name = s.name,
+            height = s.height,
+        )
+
+    @polite.putter
+    def polite_update( v: View, s: Source ) -> Source:
+        return Source(
+            name = v.name,
+            height = v.height,
+            #
+            age = s.age,
+        )
+
+    # Test with an example sample
+
+    test_source = Source(
+        name = 'Hello World',
+        age = 42,
+        height = 182.9,
+    )
+    correct_view = View(
+        name = test_source.name,
+        height = test_source.height,
+    )
+
+    test_view = polite( test_source )
+    assert test_view == correct_view, \
+        f'Incorrect lens behavior: {test_view}, and not {correct_view}'
+
+    # This lens should be well-behaved
+
+    update_view = View(
+        name = 'Now Taller',
+        height = 192.9,
+    )
+
+    x = polite( polite.put( update_view, test_source ) )
+    assert x == update_view, \
+        f'Violation of GetPut: {x} =/= {update_view}'
+
+    y = polite.put( polite( test_source ), test_source )
+    assert y == test_source, \
+        f'Violation of PutGet: {y} =/= {test_source}'
+
+    # TODO Test PutPut
+
+def test_conversion( tmp_path ):
+    """Test automatic interconversion between sample types"""
+
+    @dataclass
+    class Source( atdata.PackableSample ):
+        name: str
+        height: float
+        favorite_pizza: str
+        favorite_image: NDArray
+
+    @dataclass
+    class View( atdata.PackableSample ):
+        name: str
+        favorite_pizza: str
+        favorite_image: NDArray
+
+    @atdata.lens
+    def polite( s: Source ) -> View:
+        return View(
+            name = s.name,
+            favorite_pizza = s.favorite_pizza,
+            favorite_image = s.favorite_image,
+        )
+
+    lens_network = atdata.LensNetwork()
+    print( lens_network._registry )
+
+    # Map a test sample through the view
+    test_source = Source(
+        name = 'Larry',
+        height = 42.,
+        favorite_pizza = 'pineapple',
+        favorite_image = np.random.randn( 224, 224 )
+    )
+    test_view = polite( test_source )
+
+    # Create a test dataset
+
+    k_test = 100
+    test_filename = (
+        tmp_path
+        / 'test-source.tar'
+    ).as_posix()
+
+    with wds.writer.TarWriter( test_filename ) as dest:
+        for i in range( k_test ):
+            # Create a new copied sample
+            cur_sample = Source(
+                name = test_source.name,
+                height = test_source.height,
+                favorite_pizza = test_source.favorite_pizza,
+                favorite_image = test_source.favorite_image,
+            )
+            dest.write( cur_sample.as_wds )
+
+    # Try reading the test dataset
+
+    ds = (
+        atdata.Dataset[Source]( test_filename )
+        .as_type( View )
+    )
+
+    assert ds.sample_type == View, \
+        'Auto-mapped'
+
+    sample: View | None = None
+    for sample in ds.ordered( batch_size = None ):
+        # Load only the first sample
+        break
+
+    assert sample is not None, \
+        'Did not load any samples from `Source` dataset'
+
+    assert sample.name == test_view.name, \
+        f'Divergence on auto-mapped dataset: `name` should be {test_view.name}, but is {sample.name}'
+    # assert sample.height == test_view.height, \
+    #     f'Divergence on auto-mapped dataset: `height` should be {test_view.height}, but is {sample.height}'
+    assert sample.favorite_pizza == test_view.favorite_pizza, \
+        f'Divergence on auto-mapped dataset: `favorite_pizza` should be {test_view.favorite_pizza}, but is {sample.favorite_pizza}'
+    assert np.all( sample.favorite_image == test_view.favorite_image ), \
+        f'Divergence on auto-mapped dataset: `favorite_image`'
+
+##
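`test_lens` leaves the PutPut law as a TODO; the missing check could reuse the fixtures above, along these lines (the extra view value is assumed):

    # PutPut: a second put fully overwrites the first
    final_view = View( name = 'Final Form', height = 200.0 )
    z = polite.put( final_view, polite.put( update_view, test_source ) )
    assert z == polite.put( final_view, test_source ), \
        f'Violation of PutPut: {z}'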