atdata 0.1.1a1__tar.gz → 0.1.1a3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
1
+ #
2
+
3
+ name: Build and upload package to PyPI
4
+
5
+ on:
6
+ release:
7
+ types:
8
+ - published
9
+
10
+ permissions:
11
+ contents: read
12
+
13
+ jobs:
14
+
15
+ uv-build-release-pypi-publish:
16
+ name: "Build release distribution and publish to PyPI"
17
+ runs-on: ubuntu-latest
18
+ environment:
19
+ name: pypi
20
+
21
+ steps:
22
+ - uses: actions/checkout@v5
23
+
24
+ - name: "Set up Python"
25
+ uses: actions/setup-python@v5
26
+ with:
27
+ python-version-file: "pyproject.toml"
28
+
29
+ - name: Install uv
30
+ uses: astral-sh/setup-uv@v6
31
+
32
+ - name: Install project
33
+ run: uv sync --all-extras --dev
34
+ # TODO Better to use --locked for author control over versions?
35
+ # run: uv sync --locked --all-extras --dev
36
+
37
+ - name: Build release distributions
38
+ run: uv build
39
+
40
+ - name: Publish to PyPI
41
+ env:
42
+ UV_PUBLISH_TOKEN: ${{ secrets.UV_PUBLISH_TOKEN }}
43
+ run: uv publish
44
+
45
+
46
+ ##
@@ -0,0 +1,40 @@
1
+ #
2
+
3
+ name: Run tests with `uv`
4
+
5
+ on:
6
+ push:
7
+ branches:
8
+ - main
9
+ - release/*
10
+ pull_request:
11
+ branches:
12
+ - main
13
+
14
+ jobs:
15
+ uv-test:
16
+ name: Run tests
17
+ runs-on: ubuntu-latest
18
+
19
+ steps:
20
+ - uses: actions/checkout@v5
21
+
22
+ - name: "Set up Python"
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version-file: "pyproject.toml"
26
+
27
+ - name: Install uv
28
+ uses: astral-sh/setup-uv@v6
29
+
30
+ - name: Install the project
31
+ run: uv sync --all-extras --dev
32
+ # TODO Better to use --locked for author control over versions?
33
+ # run: uv sync --locked --all-extras --dev
34
+
35
+ - name: Run tests
36
+ # For example, using `pytest`
37
+ run: uv run pytest tests
38
+
39
+
40
+ #
@@ -1,5 +1,7 @@
1
1
  ## Custom
2
2
 
3
+ # mac garbage
4
+ **/.DS_Store
3
5
  # Don't commit any .env files
4
6
  **/*.env
5
7
  # Don't commit `uv` lockfiles
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: atdata
3
- Version: 0.1.1a1
3
+ Version: 0.1.1a3
4
4
  Summary: A loose federation of distributed, typed datasets
5
5
  Author-email: Maxine Levesque <hello@maxine.science>
6
6
  License-File: LICENSE
@@ -11,5 +11,5 @@ Requires-Dist: ormsgpack>=1.11.0
11
11
  Requires-Dist: webdataset>=1.0.2
12
12
  Description-Content-Type: text/markdown
13
13
 
14
- # ekumen
14
+ # atdata
15
15
  A loose federation of distributed, typed datasets
@@ -1,2 +1,2 @@
1
- # ekumen
1
+ # atdata
2
2
  A loose federation of distributed, typed datasets
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "atdata"
3
- version = "0.1.1a1"
3
+ version = "0.1.1a3"
4
4
  description = "A loose federation of distributed, typed datasets"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -15,12 +15,15 @@ dependencies = [
15
15
  ]
16
16
 
17
17
  [project.scripts]
18
- ekumen = "atdata:main"
18
+ atdata = "atdata:main"
19
19
 
20
20
  [build-system]
21
21
  requires = ["hatchling"]
22
22
  build-backend = "hatchling.build"
23
23
 
24
+ [tool.pytest.ini_options]
25
+ addopts = "--cov=atdata --cov-report=html"
26
+
24
27
  [dependency-groups]
25
28
  dev = [
26
29
  "pytest>=8.4.2",
@@ -0,0 +1,13 @@
1
+ """A loose federation of distributed, typed datasets"""
2
+
3
+ ##
4
+ # Expose components
5
+
6
+ from .dataset import (
7
+ PackableSample,
8
+ SampleBatch,
9
+ Dataset,
10
+ )
11
+
12
+
13
+ #
@@ -0,0 +1,22 @@
1
+ """Assorted helper methods for `atdata`"""
2
+
3
+ ##
4
+ # Imports
5
+
6
+ from io import BytesIO
7
+
8
+ import numpy as np
9
+
10
+
11
+ ##
12
+
13
+ def array_to_bytes( x: np.ndarray ) -> bytes:
14
+ """Convert `numpy` array to a format suitable for packing"""
15
+ np_bytes = BytesIO()
16
+ np.save( np_bytes, x, allow_pickle = True )
17
+ return np_bytes.getvalue()
18
+
19
+ def bytes_to_array( b: bytes ) -> np.ndarray:
20
+ """Convert packed bytes back to a `numpy` array"""
21
+ np_bytes = BytesIO( b )
22
+ return np.load( np_bytes, allow_pickle = True )
@@ -57,38 +57,38 @@ DT = TypeVar( 'DT' )
57
57
 
58
58
  MsgpackRawSample: TypeAlias = Dict[str, Any]
59
59
 
60
- @dataclass
61
- class ArrayBytes:
62
- """Annotates bytes that should be interpreted as the raw contents of a
63
- numpy NDArray"""
60
+ # @dataclass
61
+ # class ArrayBytes:
62
+ # """Annotates bytes that should be interpreted as the raw contents of a
63
+ # numpy NDArray"""
64
64
 
65
- raw_bytes: bytes
66
- """The raw bytes of the corresponding NDArray"""
67
-
68
- def __init__( self,
69
- array: Optional[ArrayLike] = None,
70
- raw: Optional[bytes] = None,
71
- ):
72
- """TODO"""
73
-
74
- if array is not None:
75
- array = np.array( array )
76
- self.raw_bytes = eh.array_to_bytes( array )
65
+ # raw_bytes: bytes
66
+ # """The raw bytes of the corresponding NDArray"""
67
+
68
+ # def __init__( self,
69
+ # array: Optional[ArrayLike] = None,
70
+ # raw: Optional[bytes] = None,
71
+ # ):
72
+ # """TODO"""
73
+
74
+ # if array is not None:
75
+ # array = np.array( array )
76
+ # self.raw_bytes = eh.array_to_bytes( array )
77
77
 
78
- elif raw is not None:
79
- self.raw_bytes = raw
78
+ # elif raw is not None:
79
+ # self.raw_bytes = raw
80
80
 
81
- else:
82
- raise ValueError( 'Must provide either `array` or `raw` bytes' )
81
+ # else:
82
+ # raise ValueError( 'Must provide either `array` or `raw` bytes' )
83
83
 
84
- @property
85
- def to_numpy( self ) -> NDArray:
86
- """Return the `raw_bytes` data as an NDArray"""
87
- return eh.bytes_to_array( self.raw_bytes )
84
+ # @property
85
+ # def to_numpy( self ) -> NDArray:
86
+ # """Return the `raw_bytes` data as an NDArray"""
87
+ # return eh.bytes_to_array( self.raw_bytes )
88
88
 
89
89
  def _make_packable( x ):
90
- if isinstance( x, ArrayBytes ):
91
- return x.raw_bytes
90
+ # if isinstance( x, ArrayBytes ):
91
+ # return x.raw_bytes
92
92
  if isinstance( x, np.ndarray ):
93
93
  return eh.array_to_bytes( x )
94
94
  return x
@@ -114,8 +114,8 @@ class PackableSample( ABC ):
114
114
  # we're good!
115
115
  pass
116
116
 
117
- elif isinstance( var_cur_value, ArrayBytes ):
118
- setattr( self, var_name, var_cur_value.to_numpy )
117
+ # elif isinstance( var_cur_value, ArrayBytes ):
118
+ # setattr( self, var_name, var_cur_value.to_numpy )
119
119
 
120
120
  elif isinstance( var_cur_value, bytes ):
121
121
  setattr( self, var_name, eh.bytes_to_array( var_cur_value ) )
@@ -172,7 +172,7 @@ def _batch_aggregate( xs: Sequence ):
172
172
 
173
173
  return list( xs )
174
174
 
175
- class SamlpeBatch( Generic[DT] ):
175
+ class SampleBatch( Generic[DT] ):
176
176
 
177
177
  def __init__( self, samples: Sequence[DT] ):
178
178
  """TODO"""
@@ -233,7 +233,7 @@ class Dataset( Generic[ST] ):
233
233
  def batch_type( self ) -> Type:
234
234
  """The type of a batch built from `sample_class`"""
235
235
  # return self.__orig_class__.__args__[1]
236
- return SamlpeBatch[self.sample_type]
236
+ return SampleBatch[self.sample_type]
237
237
 
238
238
 
239
239
  # _schema_registry_sample: dict[str, Type]
@@ -396,7 +396,7 @@ class Dataset( Generic[ST] ):
396
396
  value = sample,
397
397
  )
398
398
 
399
- def wrap_batch( self, batch: WDSRawBatch ) -> SamlpeBatch[ST]:
399
+ def wrap_batch( self, batch: WDSRawBatch ) -> SampleBatch[ST]:
400
400
  """Wrap a `batch` of samples into the appropriate dataset-specific type
401
401
 
402
402
  This default implementation simply creates a list one sample at a time
@@ -405,7 +405,7 @@ class Dataset( Generic[ST] ):
405
405
  assert 'msgpack' in batch
406
406
  batch_unpacked = [ self.sample_type.from_bytes( bs )
407
407
  for bs in batch['msgpack'] ]
408
- return SamlpeBatch[self.sample_type]( batch_unpacked )
408
+ return SampleBatch[self.sample_type]( batch_unpacked )
409
409
 
410
410
 
411
411
  # # @classmethod
@@ -0,0 +1,272 @@
1
+ """Test dataset functionality."""
2
+
3
+ ##
4
+ # Imports
5
+
6
+ # Tests
7
+ import pytest
8
+
9
+ # System
10
+ from dataclasses import dataclass
11
+
12
+ # External
13
+ import numpy as np
14
+ import webdataset as wds
15
+
16
+ # Local
17
+ import atdata
18
+ import atdata.dataset as atds
19
+
20
+ # Typing
21
+ from numpy.typing import NDArray
22
+ from typing import (
23
+ Type,
24
+ Any,
25
+ )
26
+
27
+
28
+ ##
29
+ # Sample test cases
30
+
31
+ @dataclass
32
+ class BasicTestSample( atdata.PackableSample ):
33
+ name: str
34
+ position: int
35
+ value: float
36
+
37
+ @dataclass
38
+ class NumpyTestSample( atdata.PackableSample ):
39
+ label: int
40
+ image: NDArray
41
+
42
+ test_cases = [
43
+ {
44
+ 'SampleType': BasicTestSample,
45
+ 'sample_data': {
46
+ 'name': 'Hello, world!',
47
+ 'position': 42,
48
+ 'value': 1024.768,
49
+ },
50
+ 'sample_wds_stem': 'basic_test',
51
+ },
52
+ {
53
+ 'SampleType': NumpyTestSample,
54
+ 'sample_data':
55
+ {
56
+ 'label': 9_001,
57
+ 'image': np.random.randn( 1024, 1024 ),
58
+ },
59
+ 'sample_wds_stem': 'numpy_test',
60
+ },
61
+ ]
62
+
63
+
64
+ ## Tests
65
+
66
+ @pytest.mark.parametrize(
67
+ ('SampleType', 'sample_data'),
68
+ [ (case['SampleType'], case['sample_data'])
69
+ for case in test_cases ]
70
+ )
71
+ def test_create_sample(
72
+ SampleType: Type[atdata.PackableSample],
73
+ sample_data: atds.MsgpackRawSample,
74
+ ):
75
+ """Test our ability to create samples from semi-structured data"""
76
+
77
+ sample = SampleType.from_data( sample_data )
78
+ assert isinstance( sample, SampleType ), \
79
+ f'Did not properly form sample for test type {SampleType}'
80
+
81
+ for k, v in sample_data.items():
82
+ cur_assertion: bool
83
+ if isinstance( v, np.ndarray ):
84
+ cur_assertion = np.all( getattr( sample, k ) == v ) == True
85
+ else:
86
+ cur_assertion = getattr( sample, k ) == v
87
+ assert cur_assertion, \
88
+ f'Did not properly incorporate property {k} of test type {SampleType}'
89
+
90
+ #
91
+
92
+ @pytest.mark.parametrize(
93
+ ('SampleType', 'sample_data', 'sample_wds_stem'),
94
+ [ (case['SampleType'], case['sample_data'], case['sample_wds_stem'])
95
+ for case in test_cases ]
96
+ )
97
+ def test_wds(
98
+ SampleType: Type[atdata.PackableSample],
99
+ sample_data: atds.MsgpackRawSample,
100
+ sample_wds_stem: str,
101
+ tmp_path
102
+ ):
103
+ """Test our ability to write samples as `WebDatasets` to disk"""
104
+
105
+ ## Testing hyperparameters
106
+
107
+ n_copies = 100
108
+ shard_maxcount = 10
109
+ batch_size = 4
110
+ n_iterate = 10
111
+
112
+
113
+ ## Write sharded dataset
114
+
115
+ file_pattern = (
116
+ tmp_path
117
+ / (f'{sample_wds_stem}' + '-{shard_id}.tar')
118
+ ).as_posix()
119
+ file_wds_pattern = file_pattern.format( shard_id = '%06d' )
120
+
121
+ with wds.ShardWriter(
122
+ pattern = file_wds_pattern,
123
+ maxcount = shard_maxcount,
124
+ ) as sink:
125
+
126
+ for i_sample in range( n_copies ):
127
+ new_sample = SampleType.from_data( sample_data )
128
+ assert isinstance( new_sample, SampleType ), \
129
+ f'Did not properly form sample for test type {SampleType}'
130
+
131
+ sink.write( new_sample.as_wds )
132
+
133
+
134
+ ## Ordered
135
+
136
+ # Read first shard, no batches
137
+
138
+ first_filename = file_pattern.format( shard_id = f'{0:06d}' )
139
+ dataset = atdata.Dataset[SampleType]( first_filename )
140
+
141
+ iterations_run = 0
142
+ for i_iterate, cur_sample in enumerate( dataset.ordered( batch_size = None ) ):
143
+
144
+ assert isinstance( cur_sample, SampleType ), \
145
+ f'Single sample for {SampleType} written to `wds` is of wrong type'
146
+
147
+ # Check sample values
148
+
149
+ for k, v in sample_data.items():
150
+ if isinstance( v, np.ndarray ):
151
+ is_correct = np.all( getattr( cur_sample, k ) == v )
152
+ else:
153
+ is_correct = getattr( cur_sample, k ) == v
154
+ assert is_correct, \
155
+ f'{SampleType}: Incorrect sample value found for {k}'
156
+
157
+ iterations_run += 1
158
+ if iterations_run >= n_iterate:
159
+ break
160
+
161
+ assert iterations_run == n_iterate, \
162
+ f"Only found {iterations_run} samples, not {n_iterate}"
163
+
164
+ # Read all shards, batches
165
+
166
+ start_id = f'{0:06d}'
167
+ end_id = f'{9:06d}'
168
+ first_filename = file_pattern.format( shard_id = '{' + start_id + '..' + end_id + '}' )
169
+ print( first_filename )
170
+ dataset = atdata.Dataset[SampleType]( first_filename )
171
+
172
+ iterations_run = 0
173
+ for i_iterate, cur_batch in enumerate( dataset.ordered( batch_size = batch_size ) ):
174
+
175
+ assert isinstance( cur_batch, atdata.SampleBatch ), \
176
+ f'{SampleType}: Batch sample is not correctly a batch'
177
+
178
+ assert cur_batch.sample_type == SampleType, \
179
+ f'{SampleType}: Batch `sample_type` is incorrect type'
180
+
181
+ if i_iterate == 0:
182
+ cur_n = len( cur_batch.samples )
183
+ assert cur_n == batch_size, \
184
+ f'{SampleType}: Batch has {cur_n} samples, not {batch_size}'
185
+
186
+ assert isinstance( cur_batch.samples[0], SampleType ), \
187
+ f'{SampleType}: Batch sample of wrong type ({type( cur_batch.samples[0])})'
188
+
189
+ # Check batch values
190
+ for k, v in sample_data.items():
191
+ cur_batch_data = getattr( cur_batch, k )
192
+
193
+ if isinstance( v, np.ndarray ):
194
+ assert isinstance( cur_batch_data, np.ndarray ), \
195
+ f'{SampleType}: `NDArray` not carried through to batch'
196
+
197
+ is_correct = all(
198
+ [ np.all( cur_batch_data[i] == v )
199
+ for i in range( cur_batch_data.shape[0] ) ]
200
+ )
201
+
202
+ else:
203
+ is_correct = all(
204
+ [ cur_batch_data[i] == v
205
+ for i in range( len( cur_batch_data ) ) ]
206
+ )
207
+
208
+ assert is_correct, \
209
+ f'{SampleType}: Incorrect sample value found for {k}'
210
+
211
+ iterations_run += 1
212
+ if iterations_run >= n_iterate:
213
+ break
214
+
215
+ assert iterations_run == n_iterate, \
216
+ f"Only found {iterations_run} samples, not {n_iterate}"
217
+
218
+
219
+ ## Shuffled
220
+
221
+ # Read first shard, no batches
222
+
223
+ first_filename = file_pattern.format( shard_id = f'{0:06d}' )
224
+ dataset = atdata.Dataset[SampleType]( first_filename )
225
+
226
+ iterations_run = 0
227
+ for i_iterate, cur_sample in enumerate( dataset.shuffled( batch_size = None ) ):
228
+
229
+ assert isinstance( cur_sample, SampleType ), \
230
+ f'Single sample for {SampleType} written to `wds` is of wrong type'
231
+
232
+ iterations_run += 1
233
+ if iterations_run >= n_iterate:
234
+ break
235
+
236
+ assert iterations_run == n_iterate, \
237
+ f"Only found {iterations_run} samples, not {n_iterate}"
238
+
239
+ # Read all shards, batches
240
+
241
+ start_id = f'{0:06d}'
242
+ end_id = f'{9:06d}'
243
+ first_filename = file_pattern.format( shard_id = '{' + start_id + '..' + end_id + '}' )
244
+ print( first_filename )
245
+ dataset = atdata.Dataset[SampleType]( first_filename )
246
+
247
+ iterations_run = 0
248
+ for i_iterate, cur_sample in enumerate( dataset.shuffled( batch_size = batch_size ) ):
249
+
250
+ assert isinstance( cur_sample, atdata.SampleBatch ), \
251
+ f'{SampleType}: Batch sample is not correctly a batch'
252
+
253
+ assert cur_sample.sample_type == SampleType, \
254
+ f'{SampleType}: Batch `sample_type` is incorrect type'
255
+
256
+ if i_iterate == 0:
257
+ cur_n = len( cur_sample.samples )
258
+ assert cur_n == batch_size, \
259
+ f'{SampleType}: Batch has {cur_n} samples, not {batch_size}'
260
+
261
+ assert isinstance( cur_sample.samples[0], SampleType ), \
262
+ f'{SampleType}: Batch sample of wrong type ({type( cur_sample.samples[0])})'
263
+
264
+ iterations_run += 1
265
+ if iterations_run >= n_iterate:
266
+ break
267
+
268
+ assert iterations_run == n_iterate, \
269
+ f"Only found {iterations_run} samples, not {n_iterate}"
270
+
271
+
272
+ ##
@@ -1,2 +0,0 @@
1
- def main() -> None:
2
- print("Hello from ekumen!")
@@ -1,30 +0,0 @@
1
- """Assorted helper methods for `ekumen`"""
2
-
3
- ##
4
- # Imports
5
-
6
- from io import BytesIO
7
- import ormsgpack as omp
8
-
9
- import numpy as np
10
-
11
-
12
- ##
13
- #
14
-
15
- def pack_instance( x ) -> bytes:
16
- return omp.packb( x )
17
-
18
- def unpack( bs: bytes ):
19
- return omp.unpackb( bs )
20
-
21
- ##
22
-
23
- def array_to_bytes(x: np.ndarray) -> bytes:
24
- np_bytes = BytesIO()
25
- np.save(np_bytes, x, allow_pickle=True)
26
- return np_bytes.getvalue()
27
-
28
- def bytes_to_array(b: bytes) -> np.ndarray:
29
- np_bytes = BytesIO(b)
30
- return np.load(np_bytes, allow_pickle=True)
@@ -1,69 +0,0 @@
1
- """Test dataaset functionality."""
2
-
3
- ##
4
-
5
- import pytest
6
-
7
- from dataclasses import dataclass
8
-
9
- import numpy as np
10
-
11
- from numpy.typing import NDArray
12
- from typing import (
13
- Type,
14
- Any,
15
- )
16
-
17
- import atdata.dataset as ekd
18
-
19
-
20
- ## Sample test cases
21
-
22
- @dataclass
23
- class BasicTestSample( ekd.PackableSample ):
24
- name: str
25
- position: int
26
- value: float
27
-
28
- @dataclass
29
- class NumpyTestSample( ekd.PackableSample ):
30
- label: int
31
- image: NDArray
32
-
33
- test_sample_classes = [
34
- (
35
- BasicTestSample, {
36
- 'name': 'Hello, world!',
37
- 'position': 42,
38
- 'value': 1024.768,
39
- }
40
- ),
41
- (
42
- NumpyTestSample, {
43
- 'label': 9_001,
44
- 'image': np.random.randn( 1024, 1024 ),
45
- }
46
- )
47
- ]
48
-
49
-
50
- ## Tests
51
-
52
- @pytest.mark.parametrize( ('SampleType', 'sample_data'), test_sample_classes )
53
- def test_create_sample(
54
- SampleType: Type[ekd.PackableSample],
55
- sample_data: ekd.MsgpackRawSample,
56
- ):
57
- """
58
- Test our ability to create samples from semi-structured data
59
- """
60
- sample = SampleType.from_data( sample_data )
61
- assert isinstance( sample, SampleType ), f'Did not properly form sample for test type {SampleType}'
62
-
63
- for k, v in sample_data.items():
64
- cur_assertion: bool
65
- if isinstance( v, np.ndarray ):
66
- cur_assertion = np.all( getattr( sample, k ) == v ) == True
67
- else:
68
- cur_assertion = getattr( sample, k ) == v
69
- assert cur_assertion, f'Did not properly incorporate property {k} of test type {SampleType}'
File without changes
File without changes