atdata 0.1.3b3__tar.gz → 0.1.3b4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {atdata-0.1.3b3 → atdata-0.1.3b4}/.github/workflows/uv-test.yml +11 -3
- atdata-0.1.3b4/.vscode/settings.json +10 -0
- atdata-0.1.3b4/CLAUDE.md +149 -0
- atdata-0.1.3b4/PKG-INFO +172 -0
- atdata-0.1.3b4/README.md +156 -0
- {atdata-0.1.3b3 → atdata-0.1.3b4}/pyproject.toml +1 -1
- atdata-0.1.3b4/src/atdata/__init__.py +55 -0
- atdata-0.1.3b4/src/atdata/_helpers.py +58 -0
- {atdata-0.1.3b3 → atdata-0.1.3b4}/src/atdata/dataset.py +299 -37
- atdata-0.1.3b4/src/atdata/lens.py +353 -0
- atdata-0.1.3b3/PKG-INFO +0 -18
- atdata-0.1.3b3/README.md +0 -2
- atdata-0.1.3b3/src/atdata/__init__.py +0 -20
- atdata-0.1.3b3/src/atdata/_helpers.py +0 -22
- atdata-0.1.3b3/src/atdata/lens.py +0 -200
- {atdata-0.1.3b3 → atdata-0.1.3b4}/.github/workflows/uv-publish-pypi.yml +0 -0
- {atdata-0.1.3b3 → atdata-0.1.3b4}/.gitignore +0 -0
- {atdata-0.1.3b3 → atdata-0.1.3b4}/.python-version +0 -0
- {atdata-0.1.3b3 → atdata-0.1.3b4}/LICENSE +0 -0
- {atdata-0.1.3b3 → atdata-0.1.3b4}/tests/test_dataset.py +0 -0
- {atdata-0.1.3b3 → atdata-0.1.3b4}/tests/test_lens.py +0 -0
|
@@ -15,6 +15,8 @@ jobs:
|
|
|
15
15
|
uv-test:
|
|
16
16
|
name: Run tests
|
|
17
17
|
runs-on: ubuntu-latest
|
|
18
|
+
environment:
|
|
19
|
+
name: test
|
|
18
20
|
|
|
19
21
|
steps:
|
|
20
22
|
- uses: actions/checkout@v5
|
|
@@ -32,9 +34,15 @@ jobs:
|
|
|
32
34
|
# TODO Better to use --locked for author control over versions?
|
|
33
35
|
# run: uv sync --locked --all-extras --dev
|
|
34
36
|
|
|
35
|
-
- name: Run tests
|
|
36
|
-
|
|
37
|
-
|
|
37
|
+
- name: Run tests with coverage
|
|
38
|
+
run: uv run pytest --cov=atdata --cov-report=xml --cov-report=term
|
|
39
|
+
|
|
40
|
+
- name: Upload coverage to Codecov
|
|
41
|
+
uses: codecov/codecov-action@v5
|
|
42
|
+
with:
|
|
43
|
+
# files: ./coverage.xml  # optional: codecov-action v5 auto-discovers coverage reports; `file` is the deprecated v4 name of this input
|
|
44
|
+
fail_ci_if_error: false
|
|
45
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
|
38
46
|
|
|
39
47
|
|
|
40
48
|
#
|
atdata-0.1.3b4/CLAUDE.md
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
`atdata` is a Python library that implements a loose federation of distributed, typed datasets built on top of WebDataset. It provides:
|
|
8
|
+
|
|
9
|
+
- **Typed samples** with automatic serialization via msgpack
|
|
10
|
+
- **Lens-based transformations** between different dataset schemas
|
|
11
|
+
- **Batch aggregation** with automatic numpy array stacking
|
|
12
|
+
- **WebDataset integration** for efficient large-scale dataset storage
|
|
13
|
+
|
|
14
|
+
## Development Commands
|
|
15
|
+
|
|
16
|
+
### Environment Setup
|
|
17
|
+
```bash
|
|
18
|
+
# Uses uv for dependency management
|
|
19
|
+
python -m pip install uv # if not already installed
|
|
20
|
+
uv sync
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Testing
|
|
24
|
+
```bash
|
|
25
|
+
# Run all tests with coverage
|
|
26
|
+
pytest
|
|
27
|
+
|
|
28
|
+
# Run specific test file
|
|
29
|
+
pytest tests/test_dataset.py
|
|
30
|
+
pytest tests/test_lens.py
|
|
31
|
+
|
|
32
|
+
# Run single test
|
|
33
|
+
pytest tests/test_dataset.py::test_create_sample
|
|
34
|
+
pytest tests/test_lens.py::test_lens
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Building
|
|
38
|
+
```bash
|
|
39
|
+
# Build the package
|
|
40
|
+
uv build
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Architecture
|
|
44
|
+
|
|
45
|
+
### Core Components
|
|
46
|
+
|
|
47
|
+
The codebase has three main modules under `src/atdata/`:
|
|
48
|
+
|
|
49
|
+
1. **dataset.py** - Core dataset and sample infrastructure
|
|
50
|
+
- `PackableSample`: Base class for samples that can be serialized with msgpack
|
|
51
|
+
- `Dataset[ST]`: Generic typed dataset wrapping WebDataset tar files
|
|
52
|
+
- `SampleBatch[DT]`: Automatic batching with attribute aggregation
|
|
53
|
+
- `@packable` decorator: Converts dataclasses into PackableSample subclasses
|
|
54
|
+
|
|
55
|
+
2. **lens.py** - Type transformation system
|
|
56
|
+
- `Lens[S, V]`: Bidirectional transformations between sample types (getter/putter)
|
|
57
|
+
- `LensNetwork`: Singleton registry for lens transformations
|
|
58
|
+
- `@lens` decorator: Registers lens getters globally
|
|
59
|
+
|
|
60
|
+
3. **_helpers.py** - Serialization utilities
|
|
61
|
+
- `array_to_bytes()` / `bytes_to_array()`: numpy array serialization
|
|
62
|
+
|
|
63
|
+
### Key Design Patterns
|
|
64
|
+
|
|
65
|
+
**Sample Type Definition**
|
|
66
|
+
|
|
67
|
+
Two approaches for defining sample types:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
# Approach 1: Explicit inheritance
|
|
71
|
+
@dataclass
|
|
72
|
+
class MySample(atdata.PackableSample):
|
|
73
|
+
field1: str
|
|
74
|
+
field2: NDArray
|
|
75
|
+
|
|
76
|
+
# Approach 2: Decorator (recommended)
|
|
77
|
+
@atdata.packable
|
|
78
|
+
class MySample:
|
|
79
|
+
field1: str
|
|
80
|
+
field2: NDArray
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
**NDArray Handling**
|
|
84
|
+
|
|
85
|
+
Fields annotated as `NDArray` or `NDArray | None` are automatically:
|
|
86
|
+
- Converted from bytes during deserialization
|
|
87
|
+
- Converted to bytes during serialization (via `_helpers.array_to_bytes`)
|
|
88
|
+
- Handled by `_ensure_good()` method in `PackableSample.__post_init__`
|
|
89
|
+
|
|
90
|
+
**Lens Transformations**
|
|
91
|
+
|
|
92
|
+
Lenses enable viewing datasets through different type schemas:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
@atdata.lens
|
|
96
|
+
def my_lens(source: SourceType) -> ViewType:
|
|
97
|
+
return ViewType(...)
|
|
98
|
+
|
|
99
|
+
@my_lens.putter
|
|
100
|
+
def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
|
|
101
|
+
return SourceType(...)
|
|
102
|
+
|
|
103
|
+
# Use with datasets
|
|
104
|
+
ds = atdata.Dataset[SourceType](url).as_type(ViewType)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
The `LensNetwork` singleton (in `lens.py:183`) maintains a global registry of all lenses decorated with `@lens`.
|
|
108
|
+
|
|
109
|
+
**Batch Aggregation**
|
|
110
|
+
|
|
111
|
+
`SampleBatch` uses `__getattr__` magic to aggregate sample attributes:
|
|
112
|
+
- For `NDArray` fields: stacks into numpy array with batch dimension
|
|
113
|
+
- For other fields: creates list
|
|
114
|
+
- Results are cached in `_aggregate_cache`
|
|
115
|
+
|
|
116
|
+
### Dataset URLs
|
|
117
|
+
|
|
118
|
+
Datasets use WebDataset brace-notation URLs:
|
|
119
|
+
- Single shard: `path/to/file-000000.tar`
|
|
120
|
+
- Multiple shards: `path/to/file-{000000..000009}.tar`
|
|
121
|
+
|
|
122
|
+
### Important Implementation Details
|
|
123
|
+
|
|
124
|
+
**Type Parameters**
|
|
125
|
+
|
|
126
|
+
The codebase uses Python 3.12+ generics heavily:
|
|
127
|
+
- `Dataset[ST]` where `ST` is the sample type
|
|
128
|
+
- `SampleBatch[DT]` where `DT` is the sample type
|
|
129
|
+
- Uses `__orig_class__.__args__[0]` at runtime to extract type parameters
|
|
130
|
+
|
|
131
|
+
**Serialization Flow**
|
|
132
|
+
|
|
133
|
+
1. Sample → `as_wds` property → dict with `__key__` and `msgpack` bytes
|
|
134
|
+
2. Msgpack bytes created by `packed` property calling `_make_packable()` on fields
|
|
135
|
+
3. Deserialization: `from_bytes()` → `from_data()` → `__init__` → `_ensure_good()`
|
|
136
|
+
|
|
137
|
+
**WebDataset Integration**
|
|
138
|
+
|
|
139
|
+
- Uses `wds.ShardWriter` / `wds.TarWriter` for writing
|
|
140
|
+
- Dataset iteration via `wds.DataPipeline` with custom `wrap()` / `wrap_batch()` methods
|
|
141
|
+
- Supports `ordered()` and `shuffled()` iteration modes
|
|
142
|
+
|
|
143
|
+
## Testing Notes
|
|
144
|
+
|
|
145
|
+
- Tests use parametrization heavily via `@pytest.mark.parametrize`
|
|
146
|
+
- Test cases cover both decorator and inheritance syntax
|
|
147
|
+
- Temporary WebDataset tar files created in `tmp_path` fixture
|
|
148
|
+
- Tests verify both serialization and batch aggregation behavior
|
|
149
|
+
- Lens tests verify well-behavedness (GetPut/PutGet laws)
|
atdata-0.1.3b4/PKG-INFO
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: atdata
|
|
3
|
+
Version: 0.1.3b4
|
|
4
|
+
Summary: A loose federation of distributed, typed datasets
|
|
5
|
+
Author-email: Maxine Levesque <hello@maxine.science>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: fastparquet>=2024.11.0
|
|
9
|
+
Requires-Dist: msgpack>=1.1.2
|
|
10
|
+
Requires-Dist: numpy>=2.3.4
|
|
11
|
+
Requires-Dist: ormsgpack>=1.11.0
|
|
12
|
+
Requires-Dist: pandas>=2.3.3
|
|
13
|
+
Requires-Dist: tqdm>=4.67.1
|
|
14
|
+
Requires-Dist: webdataset>=1.0.2
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# atdata
|
|
18
|
+
|
|
19
|
+
[![codecov](https://codecov.io/gh/foundation-ac/atdata/graph/badge.svg)](https://codecov.io/gh/foundation-ac/atdata)
|
|
20
|
+
|
|
21
|
+
A loose federation of distributed, typed datasets built on WebDataset.
|
|
22
|
+
|
|
23
|
+
**atdata** provides a type-safe, composable framework for working with large-scale datasets. It combines the efficiency of WebDataset's tar-based storage with Python's type system and functional programming patterns.
|
|
24
|
+
|
|
25
|
+
## Features
|
|
26
|
+
|
|
27
|
+
- **Typed Samples** - Define dataset schemas using Python dataclasses with automatic msgpack serialization
|
|
28
|
+
- **Lens Transformations** - Bidirectional, composable transformations between different dataset views
|
|
29
|
+
- **Automatic Batching** - Smart batch aggregation with numpy array stacking
|
|
30
|
+
- **WebDataset Integration** - Efficient storage and streaming for large-scale datasets
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install atdata
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Requires Python 3.12 or later.
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
### Defining Sample Types
|
|
43
|
+
|
|
44
|
+
Use the `@packable` decorator to create typed dataset samples:
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import atdata
|
|
48
|
+
from numpy.typing import NDArray
|
|
49
|
+
|
|
50
|
+
@atdata.packable
|
|
51
|
+
class ImageSample:
|
|
52
|
+
image: NDArray
|
|
53
|
+
label: str
|
|
54
|
+
metadata: dict
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Creating Datasets
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
# Create a dataset
|
|
61
|
+
dataset = atdata.Dataset[ImageSample]("path/to/data-{000000..000009}.tar")
|
|
62
|
+
|
|
63
|
+
# Iterate over samples in order
|
|
64
|
+
for sample in dataset.ordered(batch_size=None):
|
|
65
|
+
print(f"Label: {sample.label}, Image shape: {sample.image.shape}")
|
|
66
|
+
|
|
67
|
+
# Iterate with shuffling and batching
|
|
68
|
+
for batch in dataset.shuffled(batch_size=32):
|
|
69
|
+
# batch.image is automatically stacked into shape (32, ...)
|
|
70
|
+
# batch.label is a list of 32 labels
|
|
71
|
+
process_batch(batch.image, batch.label)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Lens Transformations
|
|
75
|
+
|
|
76
|
+
Define reusable transformations between sample types:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
@atdata.packable
|
|
80
|
+
class ProcessedSample:
|
|
81
|
+
features: NDArray
|
|
82
|
+
label: str
|
|
83
|
+
|
|
84
|
+
@atdata.lens
|
|
85
|
+
def preprocess(sample: ImageSample) -> ProcessedSample:
|
|
86
|
+
features = extract_features(sample.image)
|
|
87
|
+
return ProcessedSample(features=features, label=sample.label)
|
|
88
|
+
|
|
89
|
+
# Apply lens to view dataset as ProcessedSample
|
|
90
|
+
processed_ds = dataset.as_type(ProcessedSample)
|
|
91
|
+
|
|
92
|
+
for sample in processed_ds.ordered(batch_size=None):
|
|
93
|
+
# sample is now a ProcessedSample
|
|
94
|
+
print(sample.features.shape)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Core Concepts
|
|
98
|
+
|
|
99
|
+
### PackableSample
|
|
100
|
+
|
|
101
|
+
Base class for serializable samples. Fields annotated as `NDArray` are automatically handled:
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
@atdata.packable
|
|
105
|
+
class MySample:
|
|
106
|
+
array_field: NDArray # Automatically serialized
|
|
107
|
+
optional_array: NDArray | None
|
|
108
|
+
regular_field: str
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Lens
|
|
112
|
+
|
|
113
|
+
Bidirectional transformations with getter/putter semantics:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
@atdata.lens
|
|
117
|
+
def my_lens(source: SourceType) -> ViewType:
|
|
118
|
+
# Transform source -> view
|
|
119
|
+
return ViewType(...)
|
|
120
|
+
|
|
121
|
+
@my_lens.putter
|
|
122
|
+
def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
|
|
123
|
+
# Transform view -> source
|
|
124
|
+
return SourceType(...)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Dataset URLs
|
|
128
|
+
|
|
129
|
+
Uses WebDataset brace expansion for sharded datasets:
|
|
130
|
+
|
|
131
|
+
- Single file: `"data/dataset-000000.tar"`
|
|
132
|
+
- Multiple shards: `"data/dataset-{000000..000099}.tar"`
|
|
133
|
+
- Multiple patterns: `"data/{train,val}/dataset-{000000..000009}.tar"`
|
|
134
|
+
|
|
135
|
+
## Development
|
|
136
|
+
|
|
137
|
+
### Setup
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
# Install uv if not already available
|
|
141
|
+
python -m pip install uv
|
|
142
|
+
|
|
143
|
+
# Install dependencies
|
|
144
|
+
uv sync
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Testing
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
# Run all tests with coverage
|
|
151
|
+
pytest
|
|
152
|
+
|
|
153
|
+
# Run specific test file
|
|
154
|
+
pytest tests/test_dataset.py
|
|
155
|
+
|
|
156
|
+
# Run single test
|
|
157
|
+
pytest tests/test_lens.py::test_lens
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Building
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
uv build
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Contributing
|
|
167
|
+
|
|
168
|
+
Contributions are welcome! This project is in beta, so the API may still evolve.
|
|
169
|
+
|
|
170
|
+
## License
|
|
171
|
+
|
|
172
|
+
This project is licensed under the Mozilla Public License 2.0. See [LICENSE](LICENSE) for details.
|
atdata-0.1.3b4/README.md
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# atdata
|
|
2
|
+
|
|
3
|
+
[![codecov](https://codecov.io/gh/foundation-ac/atdata/graph/badge.svg)](https://codecov.io/gh/foundation-ac/atdata)
|
|
4
|
+
|
|
5
|
+
A loose federation of distributed, typed datasets built on WebDataset.
|
|
6
|
+
|
|
7
|
+
**atdata** provides a type-safe, composable framework for working with large-scale datasets. It combines the efficiency of WebDataset's tar-based storage with Python's type system and functional programming patterns.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Typed Samples** - Define dataset schemas using Python dataclasses with automatic msgpack serialization
|
|
12
|
+
- **Lens Transformations** - Bidirectional, composable transformations between different dataset views
|
|
13
|
+
- **Automatic Batching** - Smart batch aggregation with numpy array stacking
|
|
14
|
+
- **WebDataset Integration** - Efficient storage and streaming for large-scale datasets
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install atdata
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
Requires Python 3.12 or later.
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
### Defining Sample Types
|
|
27
|
+
|
|
28
|
+
Use the `@packable` decorator to create typed dataset samples:
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import atdata
|
|
32
|
+
from numpy.typing import NDArray
|
|
33
|
+
|
|
34
|
+
@atdata.packable
|
|
35
|
+
class ImageSample:
|
|
36
|
+
image: NDArray
|
|
37
|
+
label: str
|
|
38
|
+
metadata: dict
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Creating Datasets
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
# Create a dataset
|
|
45
|
+
dataset = atdata.Dataset[ImageSample]("path/to/data-{000000..000009}.tar")
|
|
46
|
+
|
|
47
|
+
# Iterate over samples in order
|
|
48
|
+
for sample in dataset.ordered(batch_size=None):
|
|
49
|
+
print(f"Label: {sample.label}, Image shape: {sample.image.shape}")
|
|
50
|
+
|
|
51
|
+
# Iterate with shuffling and batching
|
|
52
|
+
for batch in dataset.shuffled(batch_size=32):
|
|
53
|
+
# batch.image is automatically stacked into shape (32, ...)
|
|
54
|
+
# batch.label is a list of 32 labels
|
|
55
|
+
process_batch(batch.image, batch.label)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Lens Transformations
|
|
59
|
+
|
|
60
|
+
Define reusable transformations between sample types:
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
@atdata.packable
|
|
64
|
+
class ProcessedSample:
|
|
65
|
+
features: NDArray
|
|
66
|
+
label: str
|
|
67
|
+
|
|
68
|
+
@atdata.lens
|
|
69
|
+
def preprocess(sample: ImageSample) -> ProcessedSample:
|
|
70
|
+
features = extract_features(sample.image)
|
|
71
|
+
return ProcessedSample(features=features, label=sample.label)
|
|
72
|
+
|
|
73
|
+
# Apply lens to view dataset as ProcessedSample
|
|
74
|
+
processed_ds = dataset.as_type(ProcessedSample)
|
|
75
|
+
|
|
76
|
+
for sample in processed_ds.ordered(batch_size=None):
|
|
77
|
+
# sample is now a ProcessedSample
|
|
78
|
+
print(sample.features.shape)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Core Concepts
|
|
82
|
+
|
|
83
|
+
### PackableSample
|
|
84
|
+
|
|
85
|
+
Base class for serializable samples. Fields annotated as `NDArray` are automatically handled:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
@atdata.packable
|
|
89
|
+
class MySample:
|
|
90
|
+
array_field: NDArray # Automatically serialized
|
|
91
|
+
optional_array: NDArray | None
|
|
92
|
+
regular_field: str
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Lens
|
|
96
|
+
|
|
97
|
+
Bidirectional transformations with getter/putter semantics:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
@atdata.lens
|
|
101
|
+
def my_lens(source: SourceType) -> ViewType:
|
|
102
|
+
# Transform source -> view
|
|
103
|
+
return ViewType(...)
|
|
104
|
+
|
|
105
|
+
@my_lens.putter
|
|
106
|
+
def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
|
|
107
|
+
# Transform view -> source
|
|
108
|
+
return SourceType(...)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Dataset URLs
|
|
112
|
+
|
|
113
|
+
Uses WebDataset brace expansion for sharded datasets:
|
|
114
|
+
|
|
115
|
+
- Single file: `"data/dataset-000000.tar"`
|
|
116
|
+
- Multiple shards: `"data/dataset-{000000..000099}.tar"`
|
|
117
|
+
- Multiple patterns: `"data/{train,val}/dataset-{000000..000009}.tar"`
|
|
118
|
+
|
|
119
|
+
## Development
|
|
120
|
+
|
|
121
|
+
### Setup
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
# Install uv if not already available
|
|
125
|
+
python -m pip install uv
|
|
126
|
+
|
|
127
|
+
# Install dependencies
|
|
128
|
+
uv sync
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Testing
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
# Run all tests with coverage
|
|
135
|
+
pytest
|
|
136
|
+
|
|
137
|
+
# Run specific test file
|
|
138
|
+
pytest tests/test_dataset.py
|
|
139
|
+
|
|
140
|
+
# Run single test
|
|
141
|
+
pytest tests/test_lens.py::test_lens
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Building
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
uv build
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Contributing
|
|
151
|
+
|
|
152
|
+
Contributions are welcome! This project is in beta, so the API may still evolve.
|
|
153
|
+
|
|
154
|
+
## License
|
|
155
|
+
|
|
156
|
+
This project is licensed under the Mozilla Public License 2.0. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""A loose federation of distributed, typed datasets.
|
|
2
|
+
|
|
3
|
+
``atdata`` provides a typed dataset abstraction built on WebDataset, with support
|
|
4
|
+
for:
|
|
5
|
+
|
|
6
|
+
- **Typed samples** with automatic msgpack serialization
|
|
7
|
+
- **NDArray handling** with transparent bytes conversion
|
|
8
|
+
- **Lens transformations** for viewing datasets through different type schemas
|
|
9
|
+
- **Batch aggregation** with automatic numpy array stacking
|
|
10
|
+
- **WebDataset integration** for efficient large-scale dataset storage
|
|
11
|
+
|
|
12
|
+
Quick Start:
|
|
13
|
+
>>> import atdata
|
|
14
|
+
>>> import numpy as np
|
|
15
|
+
>>>
|
|
16
|
+
>>> @atdata.packable
|
|
17
|
+
... class MyData:
|
|
18
|
+
... features: np.ndarray
|
|
19
|
+
... label: str
|
|
20
|
+
>>>
|
|
21
|
+
>>> # Create dataset from WebDataset tar files
|
|
22
|
+
>>> ds = atdata.Dataset[MyData]("path/to/data-{000000..000009}.tar")
|
|
23
|
+
>>>
|
|
24
|
+
>>> # Iterate with automatic batching
|
|
25
|
+
>>> for batch in ds.shuffled(batch_size=32):
|
|
26
|
+
... features = batch.features # numpy array (32, ...)
|
|
27
|
+
... labels = batch.label # list of 32 strings
|
|
28
|
+
|
|
29
|
+
Main Components:
|
|
30
|
+
- ``PackableSample``: Base class for msgpack-serializable samples
|
|
31
|
+
- ``Dataset``: Typed dataset wrapper for WebDataset
|
|
32
|
+
- ``SampleBatch``: Automatic batch aggregation
|
|
33
|
+
- ``Lens``: Bidirectional type transformations
|
|
34
|
+
- ``@packable``: Decorator for creating PackableSample classes
|
|
35
|
+
- ``@lens``: Decorator for creating lens transformations
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
##
|
|
39
|
+
# Expose components
|
|
40
|
+
|
|
41
|
+
from .dataset import (
|
|
42
|
+
PackableSample,
|
|
43
|
+
SampleBatch,
|
|
44
|
+
Dataset,
|
|
45
|
+
packable,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
from .lens import (
|
|
49
|
+
Lens,
|
|
50
|
+
LensNetwork,
|
|
51
|
+
lens,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
#
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Helper utilities for numpy array serialization.
|
|
2
|
+
|
|
3
|
+
This module provides utility functions for converting numpy arrays to and from
|
|
4
|
+
bytes for msgpack serialization. The functions use numpy's native save/load
|
|
5
|
+
format to preserve array dtype and shape information.
|
|
6
|
+
|
|
7
|
+
Functions:
|
|
8
|
+
- ``array_to_bytes()``: Serialize numpy array to bytes
|
|
9
|
+
- ``bytes_to_array()``: Deserialize bytes to numpy array
|
|
10
|
+
|
|
11
|
+
These helpers are used internally by ``PackableSample`` to enable transparent
|
|
12
|
+
handling of NDArray fields during msgpack packing/unpacking.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
##
|
|
16
|
+
# Imports
|
|
17
|
+
|
|
18
|
+
from io import BytesIO
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
##
|
|
24
|
+
|
|
25
|
+
def array_to_bytes( x: np.ndarray, *, allow_pickle: bool = True ) -> bytes:
    """Convert a numpy array to bytes for msgpack serialization.

    Uses numpy's native ``save()`` (``.npy``) format, which stores the array's
    dtype and shape alongside its data.

    Args:
        x: A numpy array to serialize.
        allow_pickle: Forwarded to ``np.save()``. The default (``True``) keeps
            the historical behaviour of pickling object-dtype arrays. Pass
            ``False`` to refuse object arrays, so the produced bytes can always
            be read back without unpickling.

    Returns:
        Raw bytes representing the serialized array.

    Raises:
        ValueError: If ``allow_pickle`` is ``False`` and ``x`` has an object
            dtype (raised by ``np.save()``).
    """
    buffer = BytesIO()
    np.save( buffer, x, allow_pickle = allow_pickle )
    return buffer.getvalue()
|
|
42
|
+
|
|
43
|
+
def bytes_to_array( b: bytes, *, allow_pickle: bool = True ) -> np.ndarray:
    """Convert serialized bytes back to a numpy array.

    Reverses the serialization performed by ``array_to_bytes()``.

    Args:
        b: Raw bytes from a serialized numpy array (``.npy`` format).
        allow_pickle: Forwarded to ``np.load()``. The default (``True``) keeps
            the historical behaviour of loading object-dtype arrays. Pass
            ``False`` when the bytes come from an untrusted source.

    Returns:
        The deserialized numpy array with original dtype and shape.

    Raises:
        ValueError: If ``allow_pickle`` is ``False`` and the payload contains
            a pickled object array (raised by ``np.load()``).

    Warning:
        Loading with ``allow_pickle=True`` unpickles object arrays, and
        unpickling can execute arbitrary code. Only keep the default for
        data you trust.
    """
    # SECURITY: with allow_pickle=True, np.load() will unpickle object arrays.
    # The default is left unchanged for backward compatibility; callers that
    # read untrusted shards should pass allow_pickle=False.
    return np.load( BytesIO( b ), allow_pickle = allow_pickle )
|