atdata 0.1.2a3__py3-none-any.whl → 0.1.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +5 -0
- atdata/dataset.py +102 -22
- atdata/lens.py +122 -0
- {atdata-0.1.2a3.dist-info → atdata-0.1.2b1.dist-info}/METADATA +4 -1
- atdata-0.1.2b1.dist-info/RECORD +9 -0
- atdata-0.1.2a3.dist-info/RECORD +0 -8
- {atdata-0.1.2a3.dist-info → atdata-0.1.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.1.2a3.dist-info → atdata-0.1.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.1.2a3.dist-info → atdata-0.1.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/__init__.py
CHANGED
atdata/dataset.py
CHANGED
@@ -5,21 +5,29 @@
 
 import webdataset as wds
 
-import
-from dataclasses import dataclass
+from pathlib import Path
 import uuid
-
-
-
+import functools
+from dataclasses import (
+    dataclass,
+    asdict,
+)
 from abc import (
     ABC,
    abstractmethod,
 )
+
+from tqdm import tqdm
+import numpy as np
+import pandas as pd
+
 from typing import (
     Any,
     Optional,
     Dict,
     Sequence,
+    Iterable,
+    Callable,
     #
     Self,
     Generic,
@@ -45,9 +53,14 @@ from . import _helpers as eh
 ##
 # Typing help
 
+Pathlike = str | Path
+
 WDSRawSample: TypeAlias = Dict[str, Any]
 WDSRawBatch: TypeAlias = Dict[str, Any]
 
+SampleExportRow: TypeAlias = Dict[str, Any]
+SampleExportMap: TypeAlias = Callable[['PackableSample'], SampleExportRow]
+
 
 ##
 # Main base classes
@@ -94,6 +107,7 @@ def _make_packable( x ):
         return eh.array_to_bytes( x )
     return x
 
+@dataclass
 class PackableSample( ABC ):
     """A sample that can be packed and unpacked with msgpack"""
 
@@ -235,6 +249,7 @@ class Dataset( Generic[ST] ):
     @property
     def sample_type( self ) -> Type:
         """The type of each returned sample from this `Dataset`'s iterator"""
+        # TODO Figure out why linting fails here
         return self.__orig_class__.__args__[0]
     @property
     def batch_type( self ) -> Type:
@@ -286,7 +301,7 @@ class Dataset( Generic[ST] ):
 
     def ordered( self,
             batch_size: int | None = 1,
-            ) ->
+            ) -> Iterable[ST]:
        """Iterate over the dataset in order
 
        Args:
@@ -325,7 +340,7 @@ class Dataset( Generic[ST] ):
             buffer_shards: int = 100,
             buffer_samples: int = 10_000,
             batch_size: int | None = 1,
-            ) ->
+            ) -> Iterable[ST]:
        """Iterate over the dataset in random order
 
        Args:
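
Both iterator entry points now declare an `Iterable[ST]` return type. A minimal usage sketch of the ordered iterator, assuming an already constructed `Dataset` instance named `ds` (not part of this diff):

```python
# Sketch only: `ds` stands in for a hypothetical, already constructed Dataset.
# Unbatched sequential pass; batch_size = None yields individual samples,
# as the new to_parquet method below does internally.
for sample in ds.ordered( batch_size = None ):
    ...

# Batched sequential pass; batches are wrapped via wrap_batch in the pipeline.
for batch in ds.ordered( batch_size = 32 ):
    ...
```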
@@ -366,6 +381,64 @@ class Dataset( Generic[ST] ):
             wds.batched( batch_size ),
             wds.map( self.wrap_batch ),
         )
+
+    # TODO Rewrite to eliminate `pandas` dependency directly calling
+    # `fastparquet`
+    def to_parquet( self, path: Pathlike,
+            sample_map: Optional[SampleExportMap] = None,
+            maxcount: Optional[int] = None,
+            **kwargs,
+            ):
+        """Save dataset contents to a `parquet` file at `path`
+
+        `kwargs` sent to `pandas.to_parquet`
+        """
+        ##
+
+        # Normalize args
+        path = Path( path )
+        if sample_map is None:
+            sample_map = asdict
+
+        verbose = kwargs.get( 'verbose', False )
+
+        it = self.ordered( batch_size = None )
+        if verbose:
+            it = tqdm( it )
+
+        #
+
+        if maxcount is None:
+            # Load and save full dataset
+            df = pd.DataFrame( [ sample_map( x )
+                for x in self.ordered( batch_size = None ) ] )
+            df.to_parquet( path, **kwargs )
+
+        else:
+            # Load and save dataset in segments of size `maxcount`
+
+            cur_segment = 0
+            cur_buffer = []
+            path_template = (path.parent / f'{path.stem}-%06d.{path.suffix}').as_posix()
+
+            for x in self.ordered( batch_size = None ):
+                cur_buffer.append( sample_map( x ) )
+
+                if len( cur_buffer ) >= maxcount:
+                    # Write current segment
+                    cur_path = path_template.format( cur_segment )
+                    df = pd.DataFrame( cur_buffer )
+                    df.to_parquet( cur_path, **kwargs )
+
+                    cur_segment += 1
+                    cur_buffer = []
+
+            if len( cur_buffer ) > 0:
+                # Write one last segment with remainder
+                cur_path = path_template.format( cur_segment )
+                df = pd.DataFrame( cur_buffer )
+                df.to_parquet( cur_path, **kwargs )
+
 
     # Implemented by specific subclasses
 
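
A hedged usage sketch of the new `to_parquet` export; the dataset instance and the row field are illustrative, not taken from this diff:

```python
# Sketch only: `ds` is a hypothetical Dataset instance.
# Single-file export; keyword arguments are forwarded to pandas' to_parquet.
ds.to_parquet( 'samples.parquet' )

# Segmented export: start a new parquet file every 10_000 samples and map each
# sample to a row dict instead of relying on the default dataclasses.asdict.
ds.to_parquet(
    'samples.parquet',
    sample_map = lambda s: { 'label': s.label },  # illustrative field
    maxcount = 10_000,
)
```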
@@ -390,18 +463,18 @@ class Dataset( Generic[ST] ):
 
         return self.sample_type.from_bytes( sample['msgpack'] )
 
-        try:
-            assert type( sample ) == dict
-            return cls.sample_class( **{
-                k: v
-                for k, v in sample.items() if k != '__key__'
-            } )
+        # try:
+        #     assert type( sample ) == dict
+        #     return cls.sample_class( **{
+        #         k: v
+        #         for k, v in sample.items() if k != '__key__'
+        #     } )
 
-        except Exception as e:
-            # Sample constructor failed -- revert to default
-            return AnySample(
-                value = sample,
-            )
+        # except Exception as e:
+        #     # Sample constructor failed -- revert to default
+        #     return AnySample(
+        #         value = sample,
+        #     )
 
     def wrap_batch( self, batch: WDSRawBatch ) -> SampleBatch[ST]:
         """Wrap a `batch` of samples into the appropriate dataset-specific type
@@ -449,15 +522,22 @@ def packable( cls ):
 
     ##
 
+    class_name = cls.__name__
+    class_annotations = cls.__annotations__
+
+    # Add in dataclass niceness to original class
     as_dataclass = dataclass( cls )
 
-
+    # This triggers a bunch of behind-the-scenes stuff for the newly annotated class
+    @dataclass
+    class as_packable( as_dataclass, PackableSample ):
         def __post_init__( self ):
             return PackableSample.__post_init__( self )
 
-
-    as_packable.
+    # TODO This doesn't properly carry over the original
+    as_packable.__name__ = class_name
+    as_packable.__annotations__ = class_annotations
 
     ##
-
+
     return as_packable
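
For context, a hedged sketch of how the `packable` decorator above would typically be applied; the sample class and its fields are illustrative, not taken from this diff:

```python
import numpy as np

from atdata.dataset import packable

# Sketch only: the class and field names are illustrative.
@packable
class ImageSample:
    image: np.ndarray
    label: str

# The decorated class gains dataclass behavior plus PackableSample's msgpack
# round-trip machinery (e.g. the from_bytes path used by wrap_sample).
sample = ImageSample( image = np.zeros( (2, 2) ), label = 'blank' )
```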
atdata/lens.py
ADDED
@@ -0,0 +1,122 @@
+"""Lenses between typed datasets"""
+
+##
+# Imports
+
+from .dataset import PackableSample
+
+import functools
+import inspect
+
+from typing import (
+    TypeAlias,
+    Type,
+    TypeVar,
+    Tuple,
+    Dict,
+    Callable,
+    Optional,
+    Generic,
+)
+
+
+##
+# Typing helpers
+
+DatasetType: TypeAlias = Type[PackableSample]
+LensSignature: TypeAlias = Tuple[DatasetType, DatasetType]
+
+S = TypeVar( 'S', bound = PackableSample )
+V = TypeVar( 'V', bound = PackableSample )
+type LensGetter[S, V] = Callable[[S], V]
+type LensPutter[S, V] = Callable[[V, S], S]
+
+
+##
+# Shortcut decorators
+
+class Lens( Generic[S, V] ):
+    """TODO"""
+
+    def __init__( self, get: LensGetter[S, V],
+            put: Optional[LensPutter[S, V]] = None
+            ) -> None:
+        """TODO"""
+        ##
+
+        # Update
+        functools.update_wrapper( self, get )
+
+        # Store the getter
+        self._getter = get
+
+        # Determine and store the putter
+        if put is None:
+            # Trivial putter does not update the source
+            def _trivial_put( v: V, s: S ) -> S:
+                return s
+            put = _trivial_put
+
+        self._putter = put
+
+        # Register this lens for this type signature
+
+        sig = inspect.signature( get )
+        input_types = list( sig.parameters.values() )
+        assert len( input_types ) == 1, \
+            'Wrong number of input args for lens: should only have one'
+
+        input_type = input_types[0].annotation
+        output_type = sig.return_annotation
+
+        _registered_lenses[(input_type, output_type)] = self
+        print( _registered_lenses )
+
+    #
+
+    def putter( self, put: LensPutter[S, V] ) -> LensPutter[S, V]:
+        """TODO"""
+        ##
+        self._putter = put
+        return put
+
+    def put( self, v: V, s: S ) -> S:
+        """TODO"""
+        return self._putter( v, s )
+
+    def get( self, s: S ) -> V:
+        """TODO"""
+        return self( s )
+
+    #
+
+    def __call__( self, s: S ) -> V:
+        return self._getter( s )
+
+def lens( f: LensGetter[S, V] ) -> Lens[S, V]:
+    """Register the annotated function `f` as the getter of a sample lens"""
+    return Lens[S, V]( f )
+
+
+##
+# Global registration of used lenses
+
+_registered_lenses: Dict[LensSignature, Lens] = dict()
+"""TODO"""
+
+# def lens( f: LensPutter ) -> Lens:
+#     """Register the annotated function `f` as a sample lens"""
+#     ##
+
+#     sig = inspect.signature( f )
+
+#     input_types = list( sig.parameters.values() )
+#     output_type = sig.return_annotation
+
+#     _registered_lenses[]
+
+#     f.lens = Lens(
+
+#     )
+
+#     return f
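
A hedged sketch of how the new lens API appears intended to be used, combining the `lens` decorator with `Lens.putter` registration; the sample types and fields are illustrative, and they assume the `packable` decorator from `atdata.dataset`:

```python
from atdata.dataset import packable
from atdata.lens import lens

# Sketch only: these sample types and fields are illustrative, not from this diff.
@packable
class WideSample:
    text: str
    label: str

@packable
class NarrowSample:
    label: str

@lens
def to_narrow( s: WideSample ) -> NarrowSample:
    # Getter: project the wide sample down to its label-only view. The Lens
    # constructor registers this lens in _registered_lenses under the
    # (WideSample, NarrowSample) signature read from these annotations.
    return NarrowSample( label = s.label )

@to_narrow.putter
def _put_narrow( v: NarrowSample, s: WideSample ) -> WideSample:
    # Putter: push an edited view back into the source sample.
    s.label = v.label
    return s

wide = WideSample( text = 'hello', label = 'greeting' )
narrow = to_narrow( wide )             # calls the registered getter
wide2 = to_narrow.put( narrow, wide )  # calls the registered putter
```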
{atdata-0.1.2a3.dist-info → atdata-0.1.2b1.dist-info}/METADATA
CHANGED
@@ -1,13 +1,16 @@
 Metadata-Version: 2.4
 Name: atdata
-Version: 0.1.2a3
+Version: 0.1.2b1
 Summary: A loose federation of distributed, typed datasets
 Author-email: Maxine Levesque <hello@maxine.science>
 License-File: LICENSE
 Requires-Python: >=3.12
+Requires-Dist: fastparquet>=2024.11.0
 Requires-Dist: msgpack>=1.1.2
 Requires-Dist: numpy>=2.3.4
 Requires-Dist: ormsgpack>=1.11.0
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: tqdm>=4.67.1
 Requires-Dist: webdataset>=1.0.2
 Description-Content-Type: text/markdown
 
atdata-0.1.2b1.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+atdata/__init__.py,sha256=YnlohxQwTUK6V84XHm2gdeCQH5sIrTHVLSApB-nt_z8,216
+atdata/_helpers.py,sha256=R63JhXewAKZYnZ9Th7R6yZh0IOUPYGBsth3FpRUMD-U,503
+atdata/dataset.py,sha256=pBaND2D33JiJoiL9CtCTBa3octtifa21P19K076sW3Q,15905
+atdata/lens.py,sha256=ikExMWdGP3QH-bEuUDNAYO_ZjeaKJTfL9lpaN9CrRB4,2624
+atdata-0.1.2b1.dist-info/METADATA,sha256=WtHM3N0kMxJKwPXl5cluRNnvk49WU_e72lVhmcInraY,529
+atdata-0.1.2b1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+atdata-0.1.2b1.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
+atdata-0.1.2b1.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
+atdata-0.1.2b1.dist-info/RECORD,,
atdata-0.1.2a3.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-atdata/__init__.py,sha256=jPZVd_6UIo0DSbCnXAnYZ2eMwHYzOk--5vtEDTZvwqw,173
-atdata/_helpers.py,sha256=R63JhXewAKZYnZ9Th7R6yZh0IOUPYGBsth3FpRUMD-U,503
-atdata/dataset.py,sha256=HXctGwIbU5kr2pqiQCYDyGP1mkph1gIt-x1_PRtWyew,13372
-atdata-0.1.2a3.dist-info/METADATA,sha256=Jj5vP4NW-HtckIsPRzzpXVQXgcQ8HaFSGehdAu4Vfbo,434
-atdata-0.1.2a3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-atdata-0.1.2a3.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
-atdata-0.1.2a3.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
-atdata-0.1.2a3.dist-info/RECORD,,
{atdata-0.1.2a3.dist-info → atdata-0.1.2b1.dist-info}/WHEEL
File without changes
{atdata-0.1.2a3.dist-info → atdata-0.1.2b1.dist-info}/entry_points.txt
File without changes
{atdata-0.1.2a3.dist-info → atdata-0.1.2b1.dist-info}/licenses/LICENSE
File without changes