atdata 0.1.2a4__tar.gz → 0.1.2b1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,16 @@
  Metadata-Version: 2.4
  Name: atdata
- Version: 0.1.2a4
+ Version: 0.1.2b1
  Summary: A loose federation of distributed, typed datasets
  Author-email: Maxine Levesque <hello@maxine.science>
  License-File: LICENSE
  Requires-Python: >=3.12
+ Requires-Dist: fastparquet>=2024.11.0
  Requires-Dist: msgpack>=1.1.2
  Requires-Dist: numpy>=2.3.4
  Requires-Dist: ormsgpack>=1.11.0
+ Requires-Dist: pandas>=2.3.3
+ Requires-Dist: tqdm>=4.67.1
  Requires-Dist: webdataset>=1.0.2
  Description-Content-Type: text/markdown
 
@@ -1,6 +1,6 @@
  [project]
  name = "atdata"
- version = "0.1.2a4"
+ version = "0.1.2b1"
  description = "A loose federation of distributed, typed datasets"
  readme = "README.md"
  authors = [
@@ -8,9 +8,12 @@ authors = [
  ]
  requires-python = ">=3.12"
  dependencies = [
+     "fastparquet>=2024.11.0",
      "msgpack>=1.1.2",
      "numpy>=2.3.4",
      "ormsgpack>=1.11.0",
+     "pandas>=2.3.3",
+     "tqdm>=4.67.1",
      "webdataset>=1.0.2",
  ]
 
@@ -10,5 +10,10 @@ from .dataset import (
      packable,
  )
 
+ from .lens import (
+     Lens,
+     lens,
+ )
+
 
  #
@@ -5,21 +5,29 @@
 
  import webdataset as wds
 
- import functools
- from dataclasses import dataclass
+ from pathlib import Path
  import uuid
-
- import numpy as np
-
+ import functools
+ from dataclasses import (
+     dataclass,
+     asdict,
+ )
  from abc import (
      ABC,
      abstractmethod,
  )
+
+ from tqdm import tqdm
+ import numpy as np
+ import pandas as pd
+
  from typing import (
      Any,
      Optional,
      Dict,
      Sequence,
+     Iterable,
+     Callable,
      #
      Self,
      Generic,
@@ -45,9 +53,14 @@ from . import _helpers as eh
  ##
  # Typing help
 
+ Pathlike = str | Path
+
  WDSRawSample: TypeAlias = Dict[str, Any]
  WDSRawBatch: TypeAlias = Dict[str, Any]
 
+ SampleExportRow: TypeAlias = Dict[str, Any]
+ SampleExportMap: TypeAlias = Callable[['PackableSample'], SampleExportRow]
+
 
  ##
  # Main base classes
@@ -94,6 +107,7 @@ def _make_packable( x ):
          return eh.array_to_bytes( x )
      return x
 
+ @dataclass
  class PackableSample( ABC ):
      """A sample that can be packed and unpacked with msgpack"""
 
@@ -235,6 +249,7 @@ class Dataset( Generic[ST] ):
      @property
      def sample_type( self ) -> Type:
          """The type of each returned sample from this `Dataset`'s iterator"""
+         # TODO Figure out why linting fails here
          return self.__orig_class__.__args__[0]
      @property
      def batch_type( self ) -> Type:
@@ -286,7 +301,7 @@ class Dataset( Generic[ST] ):
 
      def ordered( self,
              batch_size: int | None = 1,
-         ) -> wds.DataPipeline:
+         ) -> Iterable[ST]:
          """Iterate over the dataset in order
 
          Args:
@@ -325,7 +340,7 @@ class Dataset( Generic[ST] ):
              buffer_shards: int = 100,
              buffer_samples: int = 10_000,
              batch_size: int | None = 1,
-         ) -> wds.DataPipeline:
+         ) -> Iterable[ST]:
          """Iterate over the dataset in random order
 
          Args:
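
Note: with the return type of `ordered` and `shuffled` loosened from `wds.DataPipeline` to `Iterable[ST]`, callers can treat either result as a plain iterator of typed samples. A minimal sketch, assuming a hypothetical `MySample` packable type and shard path (constructor usage mirrors the tests further below):

    dataset = atdata.Dataset[MySample]( 'train-shards.tar' )
    for sample in dataset.ordered( batch_size = None ):
        ...  # each `sample` is a `MySample`, not a raw wds dict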
@@ -366,6 +381,64 @@ class Dataset( Generic[ST] ):
              wds.batched( batch_size ),
              wds.map( self.wrap_batch ),
          )
+
+     # TODO Rewrite to eliminate the `pandas` dependency by calling
+     # `fastparquet` directly
+     def to_parquet( self, path: Pathlike,
+             sample_map: Optional[SampleExportMap] = None,
+             maxcount: Optional[int] = None,
+             **kwargs,
+         ):
+         """Save dataset contents to a `parquet` file at `path`
+
+         Remaining `kwargs` are forwarded to `pandas.DataFrame.to_parquet`
+         """
+         ##
+
+         # Normalize args
+         path = Path( path )
+         if sample_map is None:
+             sample_map = asdict
+
+         # Consume `verbose` here so it is not forwarded to pandas
+         verbose = kwargs.pop( 'verbose', False )
+
+         it = self.ordered( batch_size = None )
+         if verbose:
+             it = tqdm( it )
+
+         #
+
+         if maxcount is None:
+             # Load and save the full dataset in one shot
+             df = pd.DataFrame( [ sample_map( x ) for x in it ] )
+             df.to_parquet( path, **kwargs )
+
+         else:
+             # Load and save the dataset in segments of size `maxcount`
+
+             cur_segment = 0
+             cur_buffer = []
+             path_template = (path.parent / f'{path.stem}-{{:06d}}{path.suffix}').as_posix()
+
+             for x in it:
+                 cur_buffer.append( sample_map( x ) )
+
+                 if len( cur_buffer ) >= maxcount:
+                     # Write the current segment
+                     cur_path = path_template.format( cur_segment )
+                     df = pd.DataFrame( cur_buffer )
+                     df.to_parquet( cur_path, **kwargs )
+
+                     cur_segment += 1
+                     cur_buffer = []
+
+             if len( cur_buffer ) > 0:
+                 # Write one last segment with the remainder
+                 cur_path = path_template.format( cur_segment )
+                 df = pd.DataFrame( cur_buffer )
+                 df.to_parquet( cur_path, **kwargs )
+
 
      # Implemented by specific subclasses
 
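
Note: a sketch of how the new `to_parquet` export could be invoked; the `MySample` type and its fields are hypothetical. With `maxcount` set, output is segmented into files named like `data-000000.parquet` via `path_template` above, and `verbose` is consumed locally to wrap iteration in `tqdm` rather than being forwarded to pandas:

    def row( s: MySample ) -> SampleExportRow:
        # Custom row mapping; the default is `dataclasses.asdict`
        return { 'name': s.name, 'value': s.value }

    dataset.to_parquet( 'export/data.parquet',
        sample_map = row,
        maxcount = 10_000,
        verbose = True,
    )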
@@ -390,18 +463,18 @@ class Dataset( Generic[ST] ):
 
          return self.sample_type.from_bytes( sample['msgpack'] )
 
-         try:
-             assert type( sample ) == dict
-             return cls.sample_class( **{
-                 k: v
-                 for k, v in sample.items() if k != '__key__'
-             } )
+         # try:
+         #     assert type( sample ) == dict
+         #     return cls.sample_class( **{
+         #         k: v
+         #         for k, v in sample.items() if k != '__key__'
+         #     } )
 
-         except Exception as e:
-             # Sample constructor failed -- revert to default
-             return AnySample(
-                 value = sample,
-             )
+         # except Exception as e:
+         #     # Sample constructor failed -- revert to default
+         #     return AnySample(
+         #         value = sample,
+         #     )
 
      def wrap_batch( self, batch: WDSRawBatch ) -> SampleBatch[ST]:
          """Wrap a `batch` of samples into the appropriate dataset-specific type
@@ -449,6 +522,9 @@ def packable( cls ):
 
      ##
 
+     class_name = cls.__name__
+     class_annotations = cls.__annotations__
+
      # Add in dataclass niceness to original class
      as_dataclass = dataclass( cls )
 
@@ -458,8 +534,9 @@ def packable( cls ):
          def __post_init__( self ):
              return PackableSample.__post_init__( self )
 
-     as_packable.__name__ = cls.__name__
-     as_packable.__annotations__ = cls.__annotations__
+     # TODO This doesn't properly carry over the original
+     as_packable.__name__ = class_name
+     as_packable.__annotations__ = class_annotations
 
      ##
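
Note: capturing `class_name` and `class_annotations` before wrapping is what lets the decorated class keep its identity. A minimal usage sketch of `@packable` (the field names are illustrative; construction matches the `test_lens` classes below):

    @atdata.packable
    class Point:
        x: float
        y: float

    p = Point( x = 1.0, y = 2.0 )  # dataclass-style construction, msgpack-packable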
@@ -0,0 +1,122 @@
+ """Lenses between typed datasets"""
+
+ ##
+ # Imports
+
+ from .dataset import PackableSample
+
+ import functools
+ import inspect
+
+ from typing import (
+     TypeAlias,
+     Type,
+     TypeVar,
+     Tuple,
+     Dict,
+     Callable,
+     Optional,
+     Generic,
+ )
+
+
+ ##
+ # Typing helpers
+
+ DatasetType: TypeAlias = Type[PackableSample]
+ LensSignature: TypeAlias = Tuple[DatasetType, DatasetType]
+
+ S = TypeVar( 'S', bound = PackableSample )
+ V = TypeVar( 'V', bound = PackableSample )
+ type LensGetter[S, V] = Callable[[S], V]
+ type LensPutter[S, V] = Callable[[V, S], S]
+
+
+ ##
+ # Shortcut decorators
+
+ class Lens( Generic[S, V] ):
+     """TODO"""
+
+     def __init__( self, get: LensGetter[S, V],
+             put: Optional[LensPutter[S, V]] = None
+         ) -> None:
+         """TODO"""
+         ##
+
+         # Update
+         functools.update_wrapper( self, get )
+
+         # Store the getter
+         self._getter = get
+
+         # Determine and store the putter
+         if put is None:
+             # Trivial putter does not update the source
+             def _trivial_put( v: V, s: S ) -> S:
+                 return s
+             put = _trivial_put
+
+         self._putter = put
+
+         # Register this lens for this type signature
+
+         sig = inspect.signature( get )
+         input_types = list( sig.parameters.values() )
+         assert len( input_types ) == 1, \
+             'Wrong number of input args for lens: should only have one'
+
+         input_type = input_types[0].annotation
+         output_type = sig.return_annotation
+
+         _registered_lenses[(input_type, output_type)] = self
+         print( _registered_lenses )
+
+     #
+
+     def putter( self, put: LensPutter[S, V] ) -> LensPutter[S, V]:
+         """TODO"""
+         ##
+         self._putter = put
+         return put
+
+     def put( self, v: V, s: S ) -> S:
+         """TODO"""
+         return self._putter( v, s )
+
+     def get( self, s: S ) -> V:
+         """TODO"""
+         return self( s )
+
+     #
+
+     def __call__( self, s: S ) -> V:
+         return self._getter( s )
+
+ def lens( f: LensGetter[S, V] ) -> Lens[S, V]:
+     """Register the annotated function `f` as the getter of a sample lens"""
+     return Lens[S, V]( f )
+
+
+ ##
+ # Global registration of used lenses
+
+ _registered_lenses: Dict[LensSignature, Lens] = dict()
+ """TODO"""
+
+ # def lens( f: LensPutter ) -> Lens:
+ #     """Register the annotated function `f` as a sample lens"""
+ #     ##
+
+ #     sig = inspect.signature( f )
+
+ #     input_types = list( sig.parameters.values() )
+ #     output_type = sig.return_annotation
+
+ #     _registered_lenses[]
+
+ #     f.lens = Lens(
+
+ #     )
+
+ #     return f
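
Note: each `Lens` self-registers in `_registered_lenses`, keyed by the `(input_type, output_type)` pair read off the getter's annotations, so there is one lens per type signature. Schematically, a well-behaved lens satisfies the round-trip laws that `test_lens` below asserts:

    # For a lens `ln` with source `s` and view `v` (schematic):
    assert ln( ln.put( v, s ) ) == v   # GetPut
    assert ln.put( ln( s ), s ) == s   # PutGet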
@@ -59,6 +59,7 @@ test_cases = [
              'value': 1024.768,
          },
          'sample_wds_stem': 'basic_test',
+         'test_parquet': True,
      },
      {
          'SampleType': NumpyTestSample,
@@ -68,6 +69,7 @@ test_cases = [
              'image': np.random.randn( 1024, 1024 ),
          },
          'sample_wds_stem': 'numpy_test',
+         'test_parquet': False,
      },
      {
          'SampleType': BasicTestSampleDecorated,
@@ -77,6 +79,7 @@ test_cases = [
              'value': 1024.768,
          },
          'sample_wds_stem': 'basic_test_decorated',
+         'test_parquet': True,
      },
      {
          'SampleType': NumpyTestSampleDecorated,
@@ -86,6 +89,7 @@ test_cases = [
              'image': np.random.randn( 1024, 1024 ),
          },
          'sample_wds_stem': 'numpy_test_decorated',
+         'test_parquet': False,
      },
  ]
 
@@ -323,5 +327,117 @@ def test_wds(
      assert iterations_run == n_iterate, \
          "Only found {iterations_run} samples, not {n_iterate}"
 
+ #
+
+ @pytest.mark.parametrize(
+     ('SampleType', 'sample_data', 'sample_wds_stem', 'test_parquet'),
+     [ (
+         case['SampleType'],
+         case['sample_data'],
+         case['sample_wds_stem'],
+         case['test_parquet'],
+     )
+     for case in test_cases ]
+ )
+ def test_to_parquet(
+         SampleType: Type[atdata.PackableSample],
+         sample_data: atds.MsgpackRawSample,
+         sample_wds_stem: str,
+         test_parquet: bool,
+         tmp_path,
+     ):
+     """Test our ability to export a dataset to `parquet` format"""
+
+     # Skip irrelevant test cases
+     if not test_parquet:
+         return
+
+     ## Testing hyperparameters
+
+     n_copies_dataset = 1_000
+     n_per_file = 100
+
+     ## Start out by writing a tar dataset
+
+     wds_filename = (tmp_path / f'{sample_wds_stem}.tar').as_posix()
+     with wds.TarWriter( wds_filename ) as sink:
+         for _ in range( n_copies_dataset ):
+             new_sample = SampleType.from_data( sample_data )
+             sink.write( new_sample.as_wds )
+
+     ## Now export to `parquet`
+
+     dataset = atdata.Dataset[SampleType]( wds_filename )
+     parquet_filename = tmp_path / f'{sample_wds_stem}.parquet'
+     dataset.to_parquet( parquet_filename )
+
+     ## Double-check our `parquet` export
+
+     # TODO
+
+ def test_lens():
+     """Test a lens between sample types"""
+
+     # Set up the lens scenario
+
+     @atdata.packable
+     class Source:
+         name: str
+         age: int
+         height: float
+
+     @atdata.packable
+     class View:
+         name: str
+         height: float
+
+     @atdata.lens
+     def polite( s: Source ) -> View:
+         return View(
+             name = s.name,
+             height = s.height,
+         )
+
+     @polite.putter
+     def polite_update( v: View, s: Source ) -> Source:
+         return Source(
+             name = v.name,
+             height = v.height,
+             #
+             age = s.age,
+         )
+
+     # Test with an example sample
+
+     test_source = Source(
+         name = 'Hello World',
+         age = 42,
+         height = 182.9,
+     )
+     correct_view = View(
+         name = test_source.name,
+         height = test_source.height,
+     )
+
+     test_view = polite( test_source )
+     assert test_view == correct_view, \
+         f'Incorrect lens behavior: {test_view}, and not {correct_view}'
+
+     # This lens should be well-behaved
+
+     update_view = View(
+         name = 'Now Taller',
+         height = 192.9,
+     )
+
+     x = polite( polite.put( update_view, test_source ) )
+     assert x == update_view, \
+         f'Violation of GetPut: {x} =/= {update_view}'
+
+     y = polite.put( polite( test_source ), test_source )
+     assert y == test_source, \
+         f'Violation of PutGet: {y} =/= {test_source}'
+
+     # TODO Test PutPut
 
  ##
4 files without changes