PyPI - atdata - Versions diffs - 0.1.2b1__tar.gz → 0.1.3a2__tar.gz - Mend

atdata 0.1.2b1__tar.gz → 0.1.3a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

{atdata-0.1.2b1 → atdata-0.1.3a2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: atdata
-Version: 0.1.2b1
+Version: 0.1.3a2
 Summary: A loose federation of distributed, typed datasets
 Author-email: Maxine Levesque <hello@maxine.science>
 License-File: LICENSE

{atdata-0.1.2b1 → atdata-0.1.3a2}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "atdata"
-version = "0.1.2b1"
+version = "0.1.3a2"
 description = "A loose federation of distributed, typed datasets"
 readme = "README.md"
 authors = [

{atdata-0.1.2b1 → atdata-0.1.3a2}/src/atdata/__init__.py RENAMED Viewed

@@ -12,6 +12,7 @@ from .dataset import (
 from .lens import (
     Lens,
+    LensNetwork,
     lens,
 )

{atdata-0.1.2b1 → atdata-0.1.3a2}/src/atdata/dataset.py RENAMED Viewed

@@ -48,6 +48,7 @@ from numpy.typing import (
 import msgpack
 import ormsgpack
 from . import _helpers as eh
+from .lens import Lens, LensNetwork
 ##
@@ -231,6 +232,8 @@ class SampleBatch( Generic[DT] ):
 ST = TypeVar( 'ST', bound = PackableSample )
 # BT = TypeVar( 'BT' )
+RT = TypeVar( 'RT', bound = PackableSample )
 # TODO For python 3.13
 # BT = TypeVar( 'BT', default = None )
 # IT = TypeVar( 'IT', default = Any )
@@ -268,6 +271,17 @@ class Dataset( Generic[ST] ):
         super().__init__()
         self.url = url
+        # Allow addition of automatic transformation of raw underlying data
+        self._output_lens: Lens | None = None
+    def as_type( self, other: Type[RT] ) -> 'Dataset[RT]':
+        """TODO"""
+        ret = Dataset[other]( self.url )
+        # Get the singleton lens registry
+        lenses = LensNetwork()
+        ret._output_lens = lenses.transform( self.sample_type, ret.sample_type )
+        return ret
     # @classmethod
     # def register( cls, uri: str,
     #             sample_class: Type,
@@ -293,9 +307,9 @@ class Dataset( Generic[ST] ):
             A full (non-lazy) list of the individual ``tar`` files within the
             source WebDataset.
         """
-        pipe = wds.DataPipeline(
-            wds.SimpleShardList( self.url ),
-            wds.map( lambda x: x['url'] )
+        pipe = wds.pipeline.DataPipeline(
+            wds.shardlists.SimpleShardList( self.url ),
+            wds.filters.map( lambda x: x['url'] )
         )
         return list( pipe )
@@ -317,23 +331,23 @@ class Dataset( Generic[ST] ):
         if batch_size is None:
             # TODO Duplication here
-            return wds.DataPipeline(
-                wds.SimpleShardList( self.url ),
-                wds.split_by_worker,
+            return wds.pipeline.DataPipeline(
+                wds.shardlists.SimpleShardList( self.url ),
+                wds.shardlists.split_by_worker,
                 #
-                wds.tarfile_to_samples(),
+                wds.tariterators.tarfile_to_samples(),
                 # wds.map( self.preprocess ),
-                wds.map( self.wrap ),
+                wds.filters.map( self.wrap ),
             )
-        return wds.DataPipeline(
-            wds.SimpleShardList( self.url ),
-            wds.split_by_worker,
+        return wds.pipeline.DataPipeline(
+            wds.shardlists.SimpleShardList( self.url ),
+            wds.shardlists.split_by_worker,
             #
-            wds.tarfile_to_samples(),
+            wds.tariterators.tarfile_to_samples(),
             # wds.map( self.preprocess ),
-            wds.batched( batch_size ),
-            wds.map( self.wrap_batch ),
+            wds.filters.batched( batch_size ),
+            wds.filters.map( self.wrap_batch ),
         )
     def shuffled( self,
@@ -461,7 +475,11 @@ class Dataset( Generic[ST] ):
         assert 'msgpack' in sample
         assert type( sample['msgpack'] ) == bytes
-        return self.sample_type.from_bytes( sample['msgpack'] )
+        if self._output_lens is None:
+            return self.sample_type.from_bytes( sample['msgpack'] )
+        source_sample = self._output_lens.source_type.from_bytes( sample['msgpack'] )
+        return self._output_lens( source_sample )
         # try:
         #     assert type( sample ) == dict

atdata-0.1.3a2/src/atdata/lens.py ADDED Viewed

@@ -0,0 +1,200 @@
+"""Lenses between typed datasets"""
+##
+# Imports
+import functools
+import inspect
+from typing import (
+    TypeAlias,
+    Type,
+    TypeVar,
+    Tuple,
+    Dict,
+    Callable,
+    Optional,
+    Generic,
+    #
+    TYPE_CHECKING
+)
+if TYPE_CHECKING:
+    from .dataset import PackableSample
+##
+# Typing helpers
+DatasetType: TypeAlias = Type['PackableSample']
+LensSignature: TypeAlias = Tuple[DatasetType, DatasetType]
+S = TypeVar( 'S', bound = 'PackableSample' )
+V = TypeVar( 'V', bound = 'PackableSample' )
+type LensGetter[S, V] = Callable[[S], V]
+type LensPutter[S, V] = Callable[[V, S], S]
+##
+# Shortcut decorators
+class Lens( Generic[S, V] ):
+    """TODO"""
+    # @property
+    # def source_type( self ) -> Type[S]:
+    #     """The source type (S) for the lens; what is put to"""
+    #     # TODO Figure out why linting fails here
+    #     return self.__orig_class__.__args__[0]
+    # @property
+    # def view_type( self ) -> Type[V]:
+    #     """The view type (V) for the lens; what is get'd from"""
+    #     # TODO FIgure out why linting fails here
+    #     return self.__orig_class__.__args__[1]
+    def __init__( self, get: LensGetter[S, V],
+                put: Optional[LensPutter[S, V]] = None
+            ) -> None:
+        """TODO"""
+        ##
+        # Check argument validity
+        sig = inspect.signature( get )
+        input_types = list( sig.parameters.values() )
+        assert len( input_types ) == 1, \
+            'Wrong number of input args for lens: should only have one'
+        # Update function details for this object as returned by annotation
+        functools.update_wrapper( self, get )
+        self.source_type: Type[PackableSample] = input_types[0].annotation
+        self.view_type = sig.return_annotation
+        # Store the getter
+        self._getter = get
+        # Determine and store the putter
+        if put is None:
+            # Trivial putter does not update the source
+            def _trivial_put( v: V, s: S ) -> S:
+                return s
+            put = _trivial_put
+        self._putter = put
+    #
+    def putter( self, put: LensPutter[S, V] ) -> LensPutter[S, V]:
+        """TODO"""
+        ##
+        self._putter = put
+        return put
+    # Methods to actually execute transformations
+    def put( self, v: V, s: S ) -> S:
+        """TODO"""
+        return self._putter( v, s )
+    def get( self, s: S ) -> V:
+        """TODO"""
+        return self( s )
+    # Convenience to enable calling the lens as its getter
+    def __call__( self, s: S ) -> V:
+        return self._getter( s )
+# TODO Figure out how to properly parameterize this
+# def _lens_factory[S, V]( register: bool = True ):
+#     """Register the annotated function `f` as the getter of a sample lens"""
+#     # The actual lens decorator taking a lens getter function to a lens object
+#     def _decorator( f: LensGetter[S, V] ) -> Lens[S, V]:
+#         ret = Lens[S, V]( f )
+#         if register:
+#             _network.register( ret )
+#         return ret
+#     # Return the lens decorator
+#     return _decorator
+# # For convenience
+# lens = _lens_factory
+def lens(  f: LensGetter[S, V] ) -> Lens[S, V]:
+    ret = Lens[S, V]( f )
+    _network.register( ret )
+    return ret
+##
+# Global registry of used lenses
+# _registered_lenses: Dict[LensSignature, Lens] = dict()
+# """TODO"""
+class LensNetwork:
+    """TODO"""
+    _instance = None
+    """The singleton instance"""
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            # If no instance exists, create a new one
+            cls._instance = super().__new__(cls)
+        return cls._instance  # Return the existing (or newly created) instance
+    def __init__(self):
+        if not hasattr(self, '_initialized'):  # Check if already initialized
+            self._registry: Dict[LensSignature, Lens] = dict()
+            self._initialized = True
+    def register( self, _lens: Lens ):
+        """Set `lens` as the canonical view between its source and view types"""
+        # sig = inspect.signature( _lens.get )
+        # input_types = list( sig.parameters.values() )
+        # assert len( input_types ) == 1, \
+        #     'Wrong number of input args for lens: should only have one'
+        # input_type = input_types[0].annotation
+        # print( input_type )
+        # output_type = sig.return_annotation
+        # self._registry[input_type, output_type] = _lens
+        print( _lens.source_type )
+        self._registry[_lens.source_type, _lens.view_type] = _lens
+    def transform( self, source: DatasetType, view: DatasetType ) -> Lens:
+        """TODO"""
+        # TODO Handle compositional closure
+        ret = self._registry.get( (source, view), None )
+        if ret is None:
+            raise ValueError( f'No registered lens from source {source} to view {view}' )
+        return ret
+# Create global singleton registry instance
+_network = LensNetwork()
+# def lens( f: LensPutter ) -> Lens:
+#     """Register the annotated function `f` as a sample lens"""
+#     ##
+#     sig = inspect.signature( f )
+#     input_types = list( sig.parameters.values() )
+#     output_type = sig.return_annotation
+#     _registered_lenses[]
+#     f.lens = Lens(
+#     )
+#     return f

{atdata-0.1.2b1 → atdata-0.1.3a2}/tests/test_dataset.py RENAMED Viewed

@@ -179,7 +179,7 @@ def test_wds(
     ).as_posix()
     file_wds_pattern = file_pattern.format( shard_id = '%06d' )
-    with wds.ShardWriter(
+    with wds.writer.ShardWriter(
         pattern = file_wds_pattern,
         maxcount = shard_maxcount,
     ) as sink:
@@ -339,7 +339,7 @@ def test_wds(
       )
       for case in test_cases ]
 )
-def test_create_sample(
+def test_parquet_export(
             SampleType: Type[atdata.PackableSample],
             sample_data: atds.MsgpackRawSample,
             sample_wds_stem: str,
@@ -360,7 +360,7 @@ def test_create_sample(
     ## Start out by writing tar dataset
     wds_filename = (tmp_path / f'{sample_wds_stem}.tar').as_posix()
-    with wds.TarWriter( wds_filename ) as sink:
+    with wds.writer.TarWriter( wds_filename ) as sink:
         for _ in range( n_copies_dataset ):
             new_sample = SampleType.from_data( sample_data )
             sink.write( new_sample.as_wds )
@@ -371,73 +371,12 @@ def test_create_sample(
     parquet_filename = tmp_path / f'{sample_wds_stem}.parquet'
     dataset.to_parquet( parquet_filename )
+    parquet_filename = tmp_path / f'{sample_wds_stem}-segments.parquet'
+    dataset.to_parquet( parquet_filename, maxcount = n_per_file )
     ## Double-check our `parquet` export
     # TODO
-def test_lens():
-    """Test a lens between sample types"""
-    # Set up the lens scenario
-    @atdata.packable
-    class Source:
-        name: str
-        age: int
-        height: float
-    @atdata.packable
-    class View:
-        name: str
-        height: float
-    @atdata.lens
-    def polite( s: Source ) -> View:
-        return View(
-            name = s.name,
-            height = s.height,
-        )
-    @polite.putter
-    def polite_update( v: View, s: Source ) -> Source:
-        return Source(
-            name = v.name,
-            height = v.height,
-            #
-            age = s.age,
-        )
-    # Test with an example sample
-    test_source = Source(
-        name = 'Hello World',
-        age = 42,
-        height = 182.9,
-    )
-    correct_view = View(
-        name = test_source.name,
-        height = test_source.height,
-    )
-    test_view = polite( test_source )
-    assert test_view == correct_view, \
-        f'Incorrect lens behavior: {test_view}, and not {correct_view}'
-    # This lens should be well-behaved
-    update_view = View(
-        name = 'Now Taller',
-        height = 192.9,
-    )
-    x = polite( polite.put( update_view, test_source ) )
-    assert x == update_view, \
-        f'Violation of GetPut: {x} =/= {update_view}'
-    y = polite.put( polite( test_source ), test_source )
-    assert y == test_source, \
-        f'Violation of PutGet: {y} =/= {test_source}'
-    # TODO Test PutPut
 ##

atdata-0.1.3a2/tests/test_lens.py ADDED Viewed

@@ -0,0 +1,166 @@
+"""Test lens functionality."""
+##
+# Imports
+import pytest
+from dataclasses import dataclass
+import webdataset as wds
+import atdata
+import numpy as np
+from numpy.typing import NDArray
+##
+# Tests
+def test_lens():
+    """Test a lens between sample types"""
+    # Set up the lens scenario
+    @atdata.packable
+    class Source:
+        name: str
+        age: int
+        height: float
+    @atdata.packable
+    class View:
+        name: str
+        height: float
+    @atdata.lens
+    def polite( s: Source ) -> View:
+        return View(
+            name = s.name,
+            height = s.height,
+        )
+    @polite.putter
+    def polite_update( v: View, s: Source ) -> Source:
+        return Source(
+            name = v.name,
+            height = v.height,
+            #
+            age = s.age,
+        )
+    # Test with an example sample
+    test_source = Source(
+        name = 'Hello World',
+        age = 42,
+        height = 182.9,
+    )
+    correct_view = View(
+        name = test_source.name,
+        height = test_source.height,
+    )
+    test_view = polite( test_source )
+    assert test_view == correct_view, \
+        f'Incorrect lens behavior: {test_view}, and not {correct_view}'
+    # This lens should be well-behaved
+    update_view = View(
+        name = 'Now Taller',
+        height = 192.9,
+    )
+    x = polite( polite.put( update_view, test_source ) )
+    assert x == update_view, \
+        f'Violation of GetPut: {x} =/= {update_view}'
+    y = polite.put( polite( test_source ), test_source )
+    assert y == test_source, \
+        f'Violation of PutGet: {y} =/= {test_source}'
+    # TODO Test PutPut
+def test_conversion( tmp_path ):
+    """Test automatic interconversion between sample types"""
+    @dataclass
+    class Source( atdata.PackableSample ):
+        name: str
+        height: float
+        favorite_pizza: str
+        favorite_image: NDArray
+    @dataclass
+    class View( atdata.PackableSample ):
+        name: str
+        favorite_pizza: str
+        favorite_image: NDArray
+    @atdata.lens
+    def polite( s: Source ) -> View:
+        return View(
+            name = s.name,
+            favorite_pizza = s.favorite_pizza,
+            favorite_image = s.favorite_image,
+        )
+    lens_network = atdata.LensNetwork()
+    print( lens_network._registry )
+    # Map a test sample through the view
+    test_source = Source(
+        name = 'Larry',
+        height = 42.,
+        favorite_pizza = 'pineapple',
+        favorite_image = np.random.randn( 224, 224 )
+    )
+    test_view = polite( test_source )
+    # Create a test dataset
+    k_test = 100
+    test_filename = (
+        tmp_path
+        / 'test-source.tar'
+    ).as_posix()
+    with wds.writer.TarWriter( test_filename ) as dest:
+        for i in range( k_test ):
+            # Create a new copied sample
+            cur_sample = Source(
+                name = test_source.name,
+                height = test_source.height,
+                favorite_pizza = test_source.favorite_pizza,
+                favorite_image = test_source.favorite_image,
+            )
+            dest.write( cur_sample.as_wds )
+    # Try reading the test dataset
+    ds = (
+        atdata.Dataset[Source]( test_filename )
+            .as_type( View )
+    )
+    assert ds.sample_type == View, \
+        'Auto-mapped'
+    sample: View | None = None
+    for sample in ds.ordered( batch_size = None ):
+        # Load only the first sample
+        break
+    assert sample is not None, \
+        'Did not load any samples from `Source` dataset'
+    assert sample.name == test_view.name, \
+        f'Divergence on auto-mapped dataset: `name` should be {test_view.name}, but is {sample.name}'
+    # assert sample.height == test_view.height, \
+    #     f'Divergence on auto-mapped dataset: `height` should be {test_view.height}, but is {sample.height}'
+    assert sample.favorite_pizza == test_view.favorite_pizza, \
+        f'Divergence on auto-mapped dataset: `favorite_pizza` should be {test_view.favorite_pizza}, but is {sample.favorite_pizza}'
+    assert np.all( sample.favorite_image == test_view.favorite_image ), \
+        f'Divergence on auto-mapped dataset: `favorite_image`'
+##

atdata-0.1.2b1/src/atdata/lens.py DELETED Viewed

@@ -1,122 +0,0 @@
-"""Lenses between typed datasets"""
-##
-# Imports
-from .dataset import PackableSample
-import functools
-import inspect
-from typing import (
-    TypeAlias,
-    Type,
-    TypeVar,
-    Tuple,
-    Dict,
-    Callable,
-    Optional,
-    Generic,
-)
-##
-# Typing helpers
-DatasetType: TypeAlias = Type[PackableSample]
-LensSignature: TypeAlias = Tuple[DatasetType, DatasetType]
-S = TypeVar( 'S', bound = PackableSample )
-V = TypeVar( 'V', bound = PackableSample )
-type LensGetter[S, V] = Callable[[S], V]
-type LensPutter[S, V] = Callable[[V, S], S]
-##
-# Shortcut decorators
-class Lens( Generic[S, V] ):
-    """TODO"""
-    def __init__( self, get: LensGetter[S, V],
-                put: Optional[LensPutter[S, V]] = None
-            ) -> None:
-        """TODO"""
-        ##
-        # Update
-        functools.update_wrapper( self, get )
-        # Store the getter
-        self._getter = get
-        # Determine and store the putter
-        if put is None:
-            # Trivial putter does not update the source
-            def _trivial_put( v: V, s: S ) -> S:
-                return s
-            put = _trivial_put
-        self._putter = put
-        # Register this lens for this type signature
-        sig = inspect.signature( get )
-        input_types = list( sig.parameters.values() )
-        assert len( input_types ) == 1, \
-            'Wrong number of input args for lens: should only have one'
-        input_type = input_types[0].annotation
-        output_type = sig.return_annotation
-        _registered_lenses[(input_type, output_type)] = self
-        print( _registered_lenses )
-    #
-    def putter( self, put: LensPutter[S, V] ) -> LensPutter[S, V]:
-        """TODO"""
-        ##
-        self._putter = put
-        return put
-    def put( self, v: V, s: S ) -> S:
-        """TODO"""
-        return self._putter( v, s )
-    def get( self, s: S ) -> V:
-        """TODO"""
-        return self( s )
-    #
-    def __call__( self, s: S ) -> V:
-        return self._getter( s )
-def lens( f: LensGetter[S, V] ) -> Lens[S, V]:
-    """Register the annotated function `f` as the getter of a sample lens"""
-    return Lens[S, V]( f )
-##
-# Global registration of used lenses
-_registered_lenses: Dict[LensSignature, Lens] = dict()
-"""TODO"""
-# def lens( f: LensPutter ) -> Lens:
-#     """Register the annotated function `f` as a sample lens"""
-#     ##
-#     sig = inspect.signature( f )
-#     input_types = list( sig.parameters.values() )
-#     output_type = sig.return_annotation
-#     _registered_lenses[]
-#     f.lens = Lens(
-#     )
-#     return f

{atdata-0.1.2b1 → atdata-0.1.3a2}/.github/workflows/uv-publish-pypi.yml RENAMED Viewed

File without changes

{atdata-0.1.2b1 → atdata-0.1.3a2}/.github/workflows/uv-test.yml RENAMED Viewed

File without changes

{atdata-0.1.2b1 → atdata-0.1.3a2}/.gitignore RENAMED Viewed

File without changes

{atdata-0.1.2b1 → atdata-0.1.3a2}/.python-version RENAMED Viewed

File without changes

{atdata-0.1.2b1 → atdata-0.1.3a2}/LICENSE RENAMED Viewed

File without changes

{atdata-0.1.2b1 → atdata-0.1.3a2}/README.md RENAMED Viewed

File without changes

{atdata-0.1.2b1 → atdata-0.1.3a2}/src/atdata/_helpers.py RENAMED Viewed

File without changes

atdata 0.1.2b1__tar.gz → 0.1.3a2__tar.gz

atdata 0.1.2b1__tar.gz → 0.1.3a2__tar.gz