PyPI - atdata - Versions diffs - 0.1.1a1__tar.gz → 0.1.1a3__tar.gz - Mend

atdata 0.1.1a1__tar.gz → 0.1.1a3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

atdata-0.1.1a3/.github/workflows/uv-publish-pypi.yml +46 -0
atdata-0.1.1a3/.github/workflows/uv-test.yml +40 -0
{atdata-0.1.1a1 → atdata-0.1.1a3}/.gitignore +2 -0
{atdata-0.1.1a1 → atdata-0.1.1a3}/PKG-INFO +2 -2
{atdata-0.1.1a1 → atdata-0.1.1a3}/README.md +1 -1
{atdata-0.1.1a1 → atdata-0.1.1a3}/pyproject.toml +5 -2
atdata-0.1.1a3/src/atdata/__init__.py +13 -0
atdata-0.1.1a3/src/atdata/_helpers.py +22 -0
{atdata-0.1.1a1 → atdata-0.1.1a3}/src/atdata/dataset.py +32 -32
atdata-0.1.1a3/tests/test_dataset.py +272 -0
atdata-0.1.1a1/src/atdata/__init__.py +0 -2
atdata-0.1.1a1/src/atdata/_helpers.py +0 -30
atdata-0.1.1a1/tests/test_dataset.py +0 -69
{atdata-0.1.1a1 → atdata-0.1.1a3}/.python-version +0 -0
{atdata-0.1.1a1 → atdata-0.1.1a3}/LICENSE +0 -0

atdata-0.1.1a3/.github/workflows/uv-publish-pypi.yml ADDED Viewed

@@ -0,0 +1,46 @@
+#
+name: Build and upload package to PyPI
+on:
+  release:
+    types:
+      - published
+permissions:
+  contents: read
+jobs:
+  uv-build-release-pypi-publish:
+    name: "Build release distribution and publish to PyPI"
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+    steps:
+      - uses: actions/checkout@v5
+      - name: "Set up Python"
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: "pyproject.toml"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+      - name: Install project
+        run: uv sync --all-extras --dev
+        # TODO Better to use --locked for author control over versions?
+        # run: uv sync --locked --all-extras --dev
+      - name: Build release distributions
+        run: uv build
+      - name: Publish to PyPI
+        env:
+          UV_PUBLISH_TOKEN: ${{ secrets.UV_PUBLISH_TOKEN }}
+        run: uv publish
+##

atdata-0.1.1a3/.github/workflows/uv-test.yml ADDED Viewed

@@ -0,0 +1,40 @@
+#
+name: Run tests with `uv`
+on:
+  push:
+    branches:
+      - main
+      - release/*
+  pull_request:
+    branches:
+      - main
+jobs:
+  uv-test:
+    name: Run tests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - name: "Set up Python"
+        uses: actions/setup-python@v5
+        with:
+          python-version-file: "pyproject.toml"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+      - name: Install the project
+        run: uv sync --all-extras --dev
+        # TODO Better to use --locked for author control over versions?
+        # run: uv sync --locked --all-extras --dev
+      - name: Run tests
+        # For example, using `pytest`
+        run: uv run pytest tests
+#

{atdata-0.1.1a1 → atdata-0.1.1a3}/.gitignore RENAMED Viewed

@@ -1,5 +1,7 @@
 ## Custom
+# mac garbage
+**/.DS_Store
 # Don't commit any .env files
 **/*.env
 # Don't commit `uv` lockfiles

{atdata-0.1.1a1 → atdata-0.1.1a3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: atdata
-Version: 0.1.1a1
+Version: 0.1.1a3
 Summary: A loose federation of distributed, typed datasets
 Author-email: Maxine Levesque <hello@maxine.science>
 License-File: LICENSE
@@ -11,5 +11,5 @@ Requires-Dist: ormsgpack>=1.11.0
 Requires-Dist: webdataset>=1.0.2
 Description-Content-Type: text/markdown
-# ekumen
+# atdata
 A loose federation of distributed, typed datasets

{atdata-0.1.1a1 → atdata-0.1.1a3}/README.md RENAMED Viewed

@@ -1,2 +1,2 @@
-# ekumen
+# atdata
 A loose federation of distributed, typed datasets

{atdata-0.1.1a1 → atdata-0.1.1a3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "atdata"
-version = "0.1.1a1"
+version = "0.1.1a3"
 description = "A loose federation of distributed, typed datasets"
 readme = "README.md"
 authors = [
@@ -15,12 +15,15 @@ dependencies = [
 ]
 [project.scripts]
-ekumen = "atdata:main"
+atdata = "atdata:main"
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
+[tool.pytest.ini_options]
+addopts = "--cov=atdata --cov-report=html"
 [dependency-groups]
 dev = [
     "pytest>=8.4.2",

atdata-0.1.1a3/src/atdata/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""A loose federation of distributed, typed datasets"""
+##
+# Expose components
+from .dataset import (
+    PackableSample,
+    SampleBatch,
+    Dataset,
+)
+#

atdata-0.1.1a3/src/atdata/_helpers.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""Assorted helper methods for `atdata`"""
+##
+# Imports
+from io import BytesIO
+import numpy as np
+##
+def array_to_bytes( x: np.ndarray ) -> bytes:
+    """Convert `numpy` array to a format suitable for packing"""
+    np_bytes = BytesIO()
+    np.save( np_bytes, x, allow_pickle = True )
+    return np_bytes.getvalue()
+def bytes_to_array( b: bytes ) -> np.ndarray:
+    """Convert packed bytes back to a `numpy` array"""
+    np_bytes = BytesIO( b )
+    return np.load( np_bytes, allow_pickle = True )

{atdata-0.1.1a1 → atdata-0.1.1a3}/src/atdata/dataset.py RENAMED Viewed

@@ -57,38 +57,38 @@ DT = TypeVar( 'DT' )
 MsgpackRawSample: TypeAlias = Dict[str, Any]
-@dataclass
-class ArrayBytes:
-    """Annotates bytes that should be interpreted as the raw contents of a
-    numpy NDArray"""
+# @dataclass
+# class ArrayBytes:
+#     """Annotates bytes that should be interpreted as the raw contents of a
+#     numpy NDArray"""
-    raw_bytes: bytes
-    """The raw bytes of the corresponding NDArray"""
-    def __init__( self,
-            array: Optional[ArrayLike] = None,
-            raw: Optional[bytes] = None,
-        ):
-        """TODO"""
-        if array is not None:
-            array = np.array( array )
-            self.raw_bytes = eh.array_to_bytes( array )
+#     raw_bytes: bytes
+#     """The raw bytes of the corresponding NDArray"""
+#     def __init__( self,
+#             array: Optional[ArrayLike] = None,
+#             raw: Optional[bytes] = None,
+#         ):
+#         """TODO"""
+#         if array is not None:
+#             array = np.array( array )
+#             self.raw_bytes = eh.array_to_bytes( array )
-        elif raw is not None:
-            self.raw_bytes = raw
+#         elif raw is not None:
+#             self.raw_bytes = raw
-        else:
-            raise ValueError( 'Must provide either `array` or `raw` bytes' )
+#         else:
+#             raise ValueError( 'Must provide either `array` or `raw` bytes' )
-    @property
-    def to_numpy( self ) -> NDArray:
-        """Return the `raw_bytes` data as an NDArray"""
-        return eh.bytes_to_array( self.raw_bytes )
+#     @property
+#     def to_numpy( self ) -> NDArray:
+#         """Return the `raw_bytes` data as an NDArray"""
+#         return eh.bytes_to_array( self.raw_bytes )
 def _make_packable( x ):
-    if isinstance( x, ArrayBytes ):
-        return x.raw_bytes
+    # if isinstance( x, ArrayBytes ):
+    #     return x.raw_bytes
     if isinstance( x, np.ndarray ):
         return eh.array_to_bytes( x )
     return x
@@ -114,8 +114,8 @@ class PackableSample( ABC ):
                     # we're good!
                     pass
-                elif isinstance( var_cur_value, ArrayBytes ):
-                    setattr( self, var_name, var_cur_value.to_numpy )
+                # elif isinstance( var_cur_value, ArrayBytes ):
+                #     setattr( self, var_name, var_cur_value.to_numpy )
                 elif isinstance( var_cur_value, bytes ):
                     setattr( self, var_name, eh.bytes_to_array( var_cur_value ) )
@@ -172,7 +172,7 @@ def _batch_aggregate( xs: Sequence ):
     return list( xs )
-class SamlpeBatch( Generic[DT] ):
+class SampleBatch( Generic[DT] ):
     def __init__( self, samples: Sequence[DT] ):
         """TODO"""
@@ -233,7 +233,7 @@ class Dataset( Generic[ST] ):
     def batch_type( self ) -> Type:
         """The type of a batch built from `sample_class`"""
         # return self.__orig_class__.__args__[1]
-        return SamlpeBatch[self.sample_type]
+        return SampleBatch[self.sample_type]
     # _schema_registry_sample: dict[str, Type]
@@ -396,7 +396,7 @@ class Dataset( Generic[ST] ):
                 value = sample,
             )
-    def wrap_batch( self, batch: WDSRawBatch ) -> SamlpeBatch[ST]:
+    def wrap_batch( self, batch: WDSRawBatch ) -> SampleBatch[ST]:
         """Wrap a `batch` of samples into the appropriate dataset-specific type
         This default implementation simply creates a list one sample at a time
@@ -405,7 +405,7 @@ class Dataset( Generic[ST] ):
         assert 'msgpack' in batch
         batch_unpacked = [ self.sample_type.from_bytes( bs )
                            for bs in batch['msgpack'] ]
-        return SamlpeBatch[self.sample_type]( batch_unpacked )
+        return SampleBatch[self.sample_type]( batch_unpacked )
     # # @classmethod

atdata-0.1.1a3/tests/test_dataset.py ADDED Viewed

@@ -0,0 +1,272 @@
+"""Test dataaset functionality."""
+##
+# Imports
+# Tests
+import pytest
+# System
+from dataclasses import dataclass
+# External
+import numpy as np
+import webdataset as wds
+# Local
+import atdata
+import atdata.dataset as atds
+# Typing
+from numpy.typing import NDArray
+from typing import (
+    Type,
+    Any,
+)
+##
+# Sample test cases
+@dataclass
+class BasicTestSample( atdata.PackableSample ):
+    name: str
+    position: int
+    value: float
+@dataclass
+class NumpyTestSample( atdata.PackableSample ):
+    label: int
+    image: NDArray
+test_cases = [
+    {
+        'SampleType': BasicTestSample,
+        'sample_data': {
+            'name': 'Hello, world!',
+            'position': 42,
+            'value': 1024.768,
+        },
+        'sample_wds_stem': 'basic_test',
+    },
+    {
+        'SampleType': NumpyTestSample,
+        'sample_data':
+        {
+            'label': 9_001,
+            'image': np.random.randn( 1024, 1024 ),
+        },
+        'sample_wds_stem': 'numpy_test',
+    },
+]
+## Tests
+@pytest.mark.parametrize(
+    ('SampleType', 'sample_data'),
+    [ (case['SampleType'], case['sample_data'])
+      for case in test_cases ]
+)
+def test_create_sample(
+            SampleType: Type[atdata.PackableSample],
+            sample_data: atds.MsgpackRawSample,
+        ):
+    """Test our ability to create samples from semi-structured data"""
+    sample = SampleType.from_data( sample_data )
+    assert isinstance( sample, SampleType ), \
+        f'Did not properly form sample for test type {SampleType}'
+    for k, v in sample_data.items():
+        cur_assertion: bool
+        if isinstance( v, np.ndarray ):
+            cur_assertion = np.all( getattr( sample, k ) == v ) == True
+        else:
+            cur_assertion = getattr( sample, k ) == v
+        assert cur_assertion, \
+            f'Did not properly incorporate property {k} of test type {SampleType}'
+#
+@pytest.mark.parametrize(
+    ('SampleType', 'sample_data', 'sample_wds_stem'),
+    [ (case['SampleType'], case['sample_data'], case['sample_wds_stem'])
+      for case in test_cases ]
+)
+def test_wds(
+            SampleType: Type[atdata.PackableSample],
+            sample_data: atds.MsgpackRawSample,
+            sample_wds_stem: str,
+            tmp_path
+        ):
+    """Test our ability to write samples as `WebDatasets` to disk"""
+    ## Testing hyperparameters
+    n_copies = 100
+    shard_maxcount = 10
+    batch_size = 4
+    n_iterate = 10
+    ## Write sharded dataset
+    file_pattern = (
+        tmp_path
+        / (f'{sample_wds_stem}' + '-{shard_id}.tar')
+    ).as_posix()
+    file_wds_pattern = file_pattern.format( shard_id = '%06d' )
+    with wds.ShardWriter(
+        pattern = file_wds_pattern,
+        maxcount = shard_maxcount,
+    ) as sink:
+        for i_sample in range( n_copies ):
+            new_sample = SampleType.from_data( sample_data )
+            assert isinstance( new_sample, SampleType ), \
+                f'Did not properly form sample for test type {SampleType}'
+            sink.write( new_sample.as_wds )
+    ## Ordered
+    # Read first shard, no batches
+    first_filename = file_pattern.format( shard_id = f'{0:06d}' )
+    dataset = atdata.Dataset[SampleType]( first_filename )
+    iterations_run = 0
+    for i_iterate, cur_sample in enumerate( dataset.ordered( batch_size = None ) ):
+        assert isinstance( cur_sample, SampleType ), \
+            f'Single sample for {SampleType} written to `wds` is of wrong type'
+        # Check sample values
+        for k, v in sample_data.items():
+            if isinstance( v, np.ndarray ):
+                is_correct = np.all( getattr( cur_sample, k ) == v )
+            else:
+                is_correct = getattr( cur_sample, k ) == v
+            assert is_correct, \
+                f'{SampleType}: Incorrect sample value found for {k}'
+        iterations_run += 1
+        if iterations_run >= n_iterate:
+            break
+    assert iterations_run == n_iterate, \
+        f"Only found {iterations_run} samples, not {n_iterate}"
+    # Read all shards, batches
+    start_id = f'{0:06d}'
+    end_id = f'{9:06d}'
+    first_filename = file_pattern.format( shard_id = '{' + start_id + '..' + end_id + '}' )
+    print( first_filename )
+    dataset = atdata.Dataset[SampleType]( first_filename )
+    iterations_run = 0
+    for i_iterate, cur_batch in enumerate( dataset.ordered( batch_size = batch_size ) ):
+        assert isinstance( cur_batch, atdata.SampleBatch ), \
+            f'{SampleType}: Batch sample is not correctly a batch'
+        assert cur_batch.sample_type == SampleType, \
+            f'{SampleType}: Batch `sample_type` is incorrect type'
+        if i_iterate == 0:
+            cur_n = len( cur_batch.samples )
+            assert cur_n == batch_size, \
+                f'{SampleType}: Batch has {cur_n} samples, not {batch_size}'
+        assert isinstance( cur_batch.samples[0], SampleType ), \
+            f'{SampleType}: Batch sample of wrong type ({type( cur_batch.samples[0])})'
+        # Check batch values
+        for k, v in sample_data.items():
+            cur_batch_data = getattr( cur_batch, k )
+            if isinstance( v, np.ndarray ):
+                assert isinstance( cur_batch_data, np.ndarray ), \
+                    f'{SampleType}: `NDArray` not carried through to batch'
+                is_correct = all(
+                    [ np.all( cur_batch_data[i] == v )
+                      for i in range( cur_batch_data.shape[0] ) ]
+                )
+            else:
+                is_correct = all(
+                    [ cur_batch_data[i] == v
+                      for i in range( len( cur_batch_data ) ) ]
+                )
+            assert is_correct, \
+                f'{SampleType}: Incorrect sample value found for {k}'
+        iterations_run += 1
+        if iterations_run >= n_iterate:
+            break
+    assert iterations_run == n_iterate, \
+        "Only found {iterations_run} samples, not {n_iterate}"
+    ## Shuffled
+    # Read first shard, no batches
+    first_filename = file_pattern.format( shard_id = f'{0:06d}' )
+    dataset = atdata.Dataset[SampleType]( first_filename )
+    iterations_run = 0
+    for i_iterate, cur_sample in enumerate( dataset.shuffled( batch_size = None ) ):
+        assert isinstance( cur_sample, SampleType ), \
+            f'Single sample for {SampleType} written to `wds` is of wrong type'
+        iterations_run += 1
+        if iterations_run >= n_iterate:
+            break
+    assert iterations_run == n_iterate, \
+        f"Only found {iterations_run} samples, not {n_iterate}"
+    # Read all shards, batches
+    start_id = f'{0:06d}'
+    end_id = f'{9:06d}'
+    first_filename = file_pattern.format( shard_id = '{' + start_id + '..' + end_id + '}' )
+    print( first_filename )
+    dataset = atdata.Dataset[SampleType]( first_filename )
+    iterations_run = 0
+    for i_iterate, cur_sample in enumerate( dataset.shuffled( batch_size = batch_size ) ):
+        assert isinstance( cur_sample, atdata.SampleBatch ), \
+            f'{SampleType}: Batch sample is not correctly a batch'
+        assert cur_sample.sample_type == SampleType, \
+            f'{SampleType}: Batch `sample_type` is incorrect type'
+        if i_iterate == 0:
+            cur_n = len( cur_sample.samples )
+            assert cur_n == batch_size, \
+                f'{SampleType}: Batch has {cur_n} samples, not {batch_size}'
+        assert isinstance( cur_sample.samples[0], SampleType ), \
+            f'{SampleType}: Batch sample of wrong type ({type( cur_sample.samples[0])})'
+        iterations_run += 1
+        if iterations_run >= n_iterate:
+            break
+    assert iterations_run == n_iterate, \
+        "Only found {iterations_run} samples, not {n_iterate}"
+##

atdata-0.1.1a1/src/atdata/__init__.py DELETED Viewed

@@ -1,2 +0,0 @@
-def main() -> None:
-    print("Hello from ekumen!")

atdata-0.1.1a1/src/atdata/_helpers.py DELETED Viewed

@@ -1,30 +0,0 @@
-"""Assorted helper methods for `ekumen`"""
-##
-# Imports
-from io import BytesIO
-import ormsgpack as omp
-import numpy as np
-##
-#
-def pack_instance( x ) -> bytes:
-    return omp.packb( x )
-def unpack( bs: bytes ):
-    return omp.unpackb( bs )
-##
-def array_to_bytes(x: np.ndarray) -> bytes:
-    np_bytes = BytesIO()
-    np.save(np_bytes, x, allow_pickle=True)
-    return np_bytes.getvalue()
-def bytes_to_array(b: bytes) -> np.ndarray:
-    np_bytes = BytesIO(b)
-    return np.load(np_bytes, allow_pickle=True)

atdata-0.1.1a1/tests/test_dataset.py DELETED Viewed

@@ -1,69 +0,0 @@
-"""Test dataaset functionality."""
-##
-import pytest
-from dataclasses import dataclass
-import numpy as np
-from numpy.typing import NDArray
-from typing import (
-    Type,
-    Any,
-)
-import atdata.dataset as ekd
-## Sample test cases
-@dataclass
-class BasicTestSample( ekd.PackableSample ):
-    name: str
-    position: int
-    value: float
-@dataclass
-class NumpyTestSample( ekd.PackableSample ):
-    label: int
-    image: NDArray
-test_sample_classes = [
-    (
-        BasicTestSample, {
-            'name': 'Hello, world!',
-            'position': 42,
-            'value': 1024.768,
-        }
-    ),
-    (
-        NumpyTestSample, {
-            'label': 9_001,
-            'image': np.random.randn( 1024, 1024 ),
-        }
-    )
-]
-## Tests
-@pytest.mark.parametrize( ('SampleType', 'sample_data'), test_sample_classes )
-def test_create_sample(
-            SampleType: Type[ekd.PackableSample],
-            sample_data: ekd.MsgpackRawSample,
-        ):
-    """
-    Test our ability to create samples from semi-structured data
-    """
-    sample = SampleType.from_data( sample_data )
-    assert isinstance( sample, SampleType ), f'Did not properly form sample for test type {SampleType}'
-    for k, v in sample_data.items():
-        cur_assertion: bool
-        if isinstance( v, np.ndarray ):
-            cur_assertion = np.all( getattr( sample, k ) == v ) == True
-        else:
-            cur_assertion = getattr( sample, k ) == v
-        assert cur_assertion, f'Did not properly incorporate property {k} of test type {SampleType}'

{atdata-0.1.1a1 → atdata-0.1.1a3}/.python-version RENAMED Viewed

File without changes

{atdata-0.1.1a1 → atdata-0.1.1a3}/LICENSE RENAMED Viewed

File without changes

atdata 0.1.1a1__tar.gz → 0.1.1a3__tar.gz

atdata 0.1.1a1__tar.gz → 0.1.1a3__tar.gz