atdata 0.1.3b4__py3-none-any.whl → 0.2.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/promote.py ADDED
@@ -0,0 +1,199 @@
1
+ """Promotion workflow for migrating datasets from local to atmosphere.
2
+
3
+ This module provides functionality to promote locally-indexed datasets to the
4
+ ATProto atmosphere network. This enables sharing datasets with the broader
5
+ federation while maintaining schema consistency.
6
+
7
+ Example:
8
+ ::
9
+
10
+ >>> from atdata.local import LocalIndex, Repo
11
+ >>> from atdata.atmosphere import AtmosphereClient, AtmosphereIndex
12
+ >>> from atdata.promote import promote_to_atmosphere
13
+ >>>
14
+ >>> # Setup
15
+ >>> local_index = LocalIndex()
16
+ >>> client = AtmosphereClient()
17
+ >>> client.login("handle.bsky.social", "app-password")
18
+ >>>
19
+ >>> # Promote a dataset
20
+ >>> entry = local_index.get_dataset("my-dataset")
21
+ >>> at_uri = promote_to_atmosphere(entry, local_index, client)
22
+ """
23
+
24
+ from typing import TYPE_CHECKING, Type
25
+
26
+ if TYPE_CHECKING:
27
+ from .local import LocalDatasetEntry, Index as LocalIndex
28
+ from .atmosphere import AtmosphereClient
29
+ from ._protocols import AbstractDataStore, Packable
30
+
31
+
32
+ def _find_existing_schema(
33
+ client: "AtmosphereClient",
34
+ name: str,
35
+ version: str,
36
+ ) -> str | None:
37
+ """Check if a schema with the given name and version already exists.
38
+
39
+ Args:
40
+ client: Authenticated atmosphere client.
41
+ name: Schema name to search for.
42
+ version: Schema version to match.
43
+
44
+ Returns:
45
+ AT URI of existing schema if found, None otherwise.
46
+ """
47
+ from .atmosphere import SchemaLoader
48
+
49
+ loader = SchemaLoader(client)
50
+ for record in loader.list_all():
51
+ rec_value = record.get("value", record)
52
+ if rec_value.get("name") == name and rec_value.get("version") == version:
53
+ return record.get("uri", "")
54
+ return None
55
+
56
+
57
+ def _find_or_publish_schema(
58
+ sample_type: "Type[Packable]",
59
+ version: str,
60
+ client: "AtmosphereClient",
61
+ description: str | None = None,
62
+ ) -> str:
63
+ """Find existing schema or publish a new one.
64
+
65
+ Checks if a schema with the same name and version already exists on the
66
+ user's atmosphere repository. If found, returns the existing URI to avoid
67
+ duplicates. Otherwise, publishes a new schema record.
68
+
69
+ Args:
70
+ sample_type: The PackableSample subclass to publish.
71
+ version: Semantic version string.
72
+ client: Authenticated atmosphere client.
73
+ description: Optional schema description.
74
+
75
+ Returns:
76
+ AT URI of the schema (existing or newly published).
77
+ """
78
+ from .atmosphere import SchemaPublisher
79
+
80
+ schema_name = f"{sample_type.__module__}.{sample_type.__name__}"
81
+
82
+ # Check for existing schema
83
+ existing = _find_existing_schema(client, schema_name, version)
84
+ if existing:
85
+ return existing
86
+
87
+ # Publish new schema
88
+ publisher = SchemaPublisher(client)
89
+ uri = publisher.publish(
90
+ sample_type,
91
+ version=version,
92
+ description=description,
93
+ )
94
+ return str(uri)
95
+
96
+
97
+ def promote_to_atmosphere(
98
+ local_entry: "LocalDatasetEntry",
99
+ local_index: "LocalIndex",
100
+ atmosphere_client: "AtmosphereClient",
101
+ *,
102
+ data_store: "AbstractDataStore | None" = None,
103
+ name: str | None = None,
104
+ description: str | None = None,
105
+ tags: list[str] | None = None,
106
+ license: str | None = None,
107
+ ) -> str:
108
+ """Promote a local dataset to the atmosphere network.
109
+
110
+ This function takes a locally-indexed dataset and publishes it to ATProto,
111
+ making it discoverable on the federated atmosphere network.
112
+
113
+ Args:
114
+ local_entry: The LocalDatasetEntry to promote.
115
+ local_index: Local index containing the schema for this entry.
116
+ atmosphere_client: Authenticated AtmosphereClient.
117
+ data_store: Optional data store for copying data to new location.
118
+ If None, the existing data_urls are used as-is.
119
+ name: Override name for the atmosphere record. Defaults to local name.
120
+ description: Optional description for the dataset.
121
+ tags: Optional tags for discovery.
122
+ license: Optional license identifier.
123
+
124
+ Returns:
125
+ AT URI of the created atmosphere dataset record.
126
+
127
+ Raises:
128
+ KeyError: If schema not found in local index.
129
+ ValueError: If local entry has no data URLs.
130
+
131
+ Example:
132
+ ::
133
+
134
+ >>> entry = local_index.get_dataset("mnist-train")
135
+ >>> uri = promote_to_atmosphere(entry, local_index, client)
136
+ >>> print(uri)
137
+ at://did:plc:abc123/ac.foundation.dataset.datasetIndex/...
138
+ """
139
+ from .atmosphere import DatasetPublisher
140
+ from ._schema_codec import schema_to_type
141
+
142
+ # Validate entry has data
143
+ if not local_entry.data_urls:
144
+ raise ValueError(f"Local entry '{local_entry.name}' has no data URLs")
145
+
146
+ # Get schema from local index
147
+ schema_ref = local_entry.schema_ref
148
+ schema_record = local_index.get_schema(schema_ref)
149
+
150
+ # Reconstruct sample type from schema
151
+ sample_type = schema_to_type(schema_record)
152
+ schema_version = schema_record.get("version", "1.0.0")
153
+
154
+ # Find or publish schema on atmosphere (deduplication)
155
+ atmosphere_schema_uri = _find_or_publish_schema(
156
+ sample_type,
157
+ schema_version,
158
+ atmosphere_client,
159
+ description=schema_record.get("description"),
160
+ )
161
+
162
+ # Determine data URLs
163
+ if data_store is not None:
164
+ # Copy data to new storage location
165
+ # Create a temporary Dataset to write through the data store
166
+ from .dataset import Dataset
167
+
168
+ # Build WDS URL from data_urls
169
+ if len(local_entry.data_urls) == 1:
170
+ wds_url = local_entry.data_urls[0]
171
+ else:
172
+ # Join multiple URLs into a single space-separated spec
173
+ wds_url = " ".join(local_entry.data_urls)
174
+
175
+ ds = Dataset[sample_type](wds_url)
176
+ prefix = f"promoted/{local_entry.name}"
177
+ data_urls = data_store.write_shards(ds, prefix=prefix)
178
+ else:
179
+ # Use existing URLs as-is
180
+ data_urls = local_entry.data_urls
181
+
182
+ # Publish dataset record to atmosphere
183
+ publisher = DatasetPublisher(atmosphere_client)
184
+ uri = publisher.publish_with_urls(
185
+ urls=data_urls,
186
+ schema_uri=atmosphere_schema_uri,
187
+ name=name or local_entry.name,
188
+ description=description,
189
+ tags=tags,
190
+ license=license,
191
+ metadata=local_entry.metadata,
192
+ )
193
+
194
+ return str(uri)
195
+
196
+
197
+ __all__ = [
198
+ "promote_to_atmosphere",
199
+ ]
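For context on the `data_store` path above: when a store is supplied, `promote_to_atmosphere` re-writes the shards under a `promoted/<name>` prefix via `write_shards` before publishing. A minimal sketch follows; `DemoStore` and the URL it returns are illustrative stand-ins for a real `AbstractDataStore` implementation, not part of the package.

```python
from atdata.local import LocalIndex
from atdata.atmosphere import AtmosphereClient
from atdata.promote import promote_to_atmosphere


class DemoStore:
    """Illustrative stand-in for an AbstractDataStore implementation."""

    def write_shards(self, dataset, *, prefix):
        # A real store would re-shard `dataset` under `prefix` and return
        # the URLs of the written shards; here we fake a single shard URL.
        return [f"s3://my-bucket/{prefix}-000000.tar"]


local_index = LocalIndex()
client = AtmosphereClient()
client.login("handle.bsky.social", "app-password")

entry = local_index.get_dataset("my-dataset")
at_uri = promote_to_atmosphere(
    entry,
    local_index,
    client,
    data_store=DemoStore(),  # shards are copied instead of reusing data_urls
    tags=["demo"],
    license="MPL-2.0",
)
print(at_uri)
```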
@@ -0,0 +1,272 @@
1
+ Metadata-Version: 2.4
2
+ Name: atdata
3
+ Version: 0.2.2b1
4
+ Summary: A loose federation of distributed, typed datasets
5
+ Author-email: Maxine Levesque <hello@maxine.science>, "Maxine @ Forecast Bio" <maxine@forecast.bio>
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: atproto>=0.0.65
9
+ Requires-Dist: fastparquet>=2024.11.0
10
+ Requires-Dist: libipld>=3.3.2
11
+ Requires-Dist: msgpack>=1.1.2
12
+ Requires-Dist: numpy>=2.3.4
13
+ Requires-Dist: ormsgpack>=1.11.0
14
+ Requires-Dist: pandas>=2.3.3
15
+ Requires-Dist: pydantic>=2.12.5
16
+ Requires-Dist: python-dotenv>=1.2.1
17
+ Requires-Dist: redis-om>=0.3.5
18
+ Requires-Dist: requests>=2.32.5
19
+ Requires-Dist: s3fs>=2025.12.0
20
+ Requires-Dist: schemamodels>=0.9.1
21
+ Requires-Dist: tqdm>=4.67.1
22
+ Requires-Dist: webdataset>=1.0.2
23
+ Provides-Extra: atmosphere
24
+ Requires-Dist: atproto>=0.0.55; extra == 'atmosphere'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # atdata
28
+
29
+ [![codecov](https://codecov.io/gh/foundation-ac/atdata/branch/main/graph/badge.svg)](https://codecov.io/gh/foundation-ac/atdata)
30
+
31
+ A loose federation of distributed, typed datasets built on WebDataset.
32
+
33
+ **atdata** provides a type-safe, composable framework for working with large-scale datasets. It combines the efficiency of WebDataset's tar-based storage with Python's type system and functional programming patterns.
34
+
35
+ ## Features
36
+
37
+ - **Typed Samples** - Define dataset schemas using Python dataclasses with automatic msgpack serialization
38
+ - **Schema-free Exploration** - Load datasets without defining a schema first using `DictSample`
39
+ - **Lens Transformations** - Bidirectional, composable transformations between different dataset views
40
+ - **Automatic Batching** - Smart batch aggregation with numpy array stacking
41
+ - **WebDataset Integration** - Efficient storage and streaming for large-scale datasets
42
+ - **Flexible Data Sources** - Stream from local files, HTTP URLs, or S3-compatible storage
43
+ - **HuggingFace-style API** - `load_dataset()` with path resolution and split handling
44
+ - **Local & Atmosphere Storage** - Index datasets locally with Redis or publish to ATProto network
45
+
46
+ ## Installation
47
+
48
+ ```bash
49
+ pip install atdata
50
+ ```
51
+
52
+ Requires Python 3.12 or later.
53
+
54
+ ## Quick Start
55
+
56
+ ### Loading Datasets
57
+
58
+ The primary way to load datasets is with `load_dataset()`:
59
+
60
+ ```python
61
+ from atdata import load_dataset
62
+
63
+ # Load without specifying a type - returns Dataset[DictSample]
64
+ ds = load_dataset("path/to/data.tar", split="train")
65
+
66
+ # Explore the data
67
+ for sample in ds.ordered():
68
+ print(sample.keys()) # See available fields
69
+ print(sample["text"]) # Dict-style access
70
+ print(sample.label) # Attribute access
71
+ break
72
+ ```
73
+
74
+ ### Defining Typed Schemas
75
+
76
+ Once you understand your data, define a typed schema with `@packable`:
77
+
78
+ ```python
79
+ import atdata
80
+ from numpy.typing import NDArray
81
+
82
+ @atdata.packable
83
+ class ImageSample:
84
+ image: NDArray
85
+ label: str
86
+ metadata: dict
87
+ ```
88
+
89
+ ### Loading with Types
90
+
91
+ ```python
92
+ # Load with explicit type
93
+ ds = load_dataset("path/to/data-{000000..000009}.tar", ImageSample, split="train")
94
+
95
+ # Or convert from DictSample
96
+ ds = load_dataset("path/to/data.tar", split="train").as_type(ImageSample)
97
+
98
+ # Iterate over samples
99
+ for sample in ds.ordered():
100
+ print(f"Label: {sample.label}, Image shape: {sample.image.shape}")
101
+
102
+ # Iterate with shuffling and batching
103
+ for batch in ds.shuffled(batch_size=32):
104
+ # batch.image is automatically stacked into shape (32, ...)
105
+ # batch.label is a list of 32 labels
106
+ process_batch(batch.image, batch.label)
107
+ ```
108
+
109
+ ### Lens Transformations
110
+
111
+ Define reusable transformations between sample types:
112
+
113
+ ```python
114
+ @atdata.packable
115
+ class ProcessedSample:
116
+ features: NDArray
117
+ label: str
118
+
119
+ @atdata.lens
120
+ def preprocess(sample: ImageSample) -> ProcessedSample:
121
+ features = extract_features(sample.image)
122
+ return ProcessedSample(features=features, label=sample.label)
123
+
124
+ # Apply lens to view dataset as ProcessedSample
125
+ processed_ds = ds.as_type(ProcessedSample)
126
+
127
+ for sample in processed_ds.ordered(batch_size=None):
128
+ # sample is now a ProcessedSample
129
+ print(sample.features.shape)
130
+ ```
131
+
132
+ ## Core Concepts
133
+
134
+ ### DictSample
135
+
136
+ The default sample type for schema-free exploration. Provides both attribute and dict-style access:
137
+
138
+ ```python
139
+ ds = load_dataset("data.tar", split="train")
140
+
141
+ for sample in ds.ordered():
142
+ # Dict-style access
143
+ print(sample["field_name"])
144
+
145
+ # Attribute access
146
+ print(sample.field_name)
147
+
148
+ # Introspection
149
+ print(sample.keys())
150
+ print(sample.to_dict())
151
+ ```
152
+
153
+ ### PackableSample
154
+
155
+ Base class for typed, serializable samples. Fields annotated as `NDArray` are automatically handled:
156
+
157
+ ```python
158
+ @atdata.packable
159
+ class MySample:
160
+ array_field: NDArray # Automatically serialized
161
+ optional_array: NDArray | None
162
+ regular_field: str
163
+ ```
164
+
165
+ Every `@packable` class automatically registers a lens from `DictSample`, enabling seamless conversion via `.as_type()`.
166
+
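A minimal sketch of that automatic registration in action, assuming `data.tar` contains fields whose names match the class (the `array_field`/`regular_field` names here are illustrative):

```python
import atdata
from atdata import load_dataset
from numpy.typing import NDArray

@atdata.packable
class MySample:
    array_field: NDArray
    regular_field: str

# load_dataset without a type yields Dataset[DictSample]; the lens that
# @packable registered for MySample lets .as_type() convert it directly.
ds = load_dataset("data.tar", split="train").as_type(MySample)

for sample in ds.ordered():
    print(sample.regular_field, sample.array_field.shape)
    break
```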
167
+ ### Lens
168
+
169
+ Bidirectional transformations with getter/putter semantics:
170
+
171
+ ```python
172
+ @atdata.lens
173
+ def my_lens(source: SourceType) -> ViewType:
174
+ # Transform source -> view
175
+ return ViewType(...)
176
+
177
+ @my_lens.putter
178
+ def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
179
+ # Transform view -> source
180
+ return SourceType(...)
181
+ ```
182
+
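A concrete (if toy) instance of the skeleton above; the review types and the star-threshold rule are invented for illustration, and plain `int`/`bool` fields are assumed to serialize like the other field types shown in this README:

```python
import atdata

@atdata.packable
class RawReview:
    text: str
    stars: int          # 1-5 rating

@atdata.packable
class BinaryReview:
    text: str
    positive: bool

@atdata.lens
def to_binary(source: RawReview) -> BinaryReview:
    # getter: treat 4-5 stars as positive
    return BinaryReview(text=source.text, positive=source.stars >= 4)

@to_binary.putter
def to_binary_put(view: BinaryReview, source: RawReview) -> RawReview:
    # putter: keep the original stars unless the polarity was changed
    if (source.stars >= 4) == view.positive:
        stars = source.stars
    else:
        stars = 5 if view.positive else 1
    return RawReview(text=view.text, stars=stars)

# Datasets of RawReview can then be viewed as BinaryReview:
# binary_ds = raw_ds.as_type(BinaryReview)
```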
183
+ ### Data Sources
184
+
185
+ Datasets support multiple backends via the `DataSource` protocol:
186
+
187
+ ```python
188
+ # String URLs (most common) - automatically wrapped in URLSource
189
+ dataset = atdata.Dataset[ImageSample]("data-{000000..000009}.tar")
190
+
191
+ # S3 with authentication (private buckets, Cloudflare R2, MinIO)
192
+ source = atdata.S3Source(
193
+ bucket="my-bucket",
194
+ keys=["data-000000.tar", "data-000001.tar"],
195
+ endpoint="https://my-account.r2.cloudflarestorage.com",
196
+ access_key="...",
197
+ secret_key="...",
198
+ )
199
+ dataset = atdata.Dataset[ImageSample](source)
200
+ ```
201
+
202
+ ### Dataset URLs
203
+
204
+ Dataset URLs use WebDataset brace expansion for sharded datasets (see the sketch after this list):
205
+
206
+ - Single file: `"data/dataset-000000.tar"`
207
+ - Multiple shards: `"data/dataset-{000000..000099}.tar"`
208
+ - Multiple patterns: `"data/{train,val}/dataset-{000000..000009}.tar"`
209
+
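As a sketch of what a brace pattern denotes (the `data/` paths are placeholders), the range expands to consecutive zero-padded shard names, and the whole pattern is passed straight to the loader:

```python
from atdata import load_dataset

pattern = "data/dataset-{000000..000009}.tar"

# The range denotes ten shards: data/dataset-000000.tar ... data/dataset-000009.tar
expected = [f"data/dataset-{i:06d}.tar" for i in range(10)]
print(expected[0], "...", expected[-1])

# Expansion happens inside the loader; no need to list shards yourself.
ds = load_dataset(pattern, split="train")
```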
210
+ ### HuggingFace-style API
211
+
212
+ Load datasets with a familiar interface:
213
+
214
+ ```python
215
+ from atdata import load_dataset
216
+
217
+ # Load without type for exploration (returns Dataset[DictSample])
218
+ ds = load_dataset("./data/train-*.tar", split="train")
219
+
220
+ # Load with explicit type
221
+ ds = load_dataset("./data/train-*.tar", ImageSample, split="train")
222
+
223
+ # Load from S3 with brace notation
224
+ ds = load_dataset("s3://bucket/data-{000000..000099}.tar", ImageSample, split="train")
225
+
226
+ # Load all splits (returns DatasetDict)
227
+ ds_dict = load_dataset("./data", ImageSample)
228
+ train_ds = ds_dict["train"]
229
+ test_ds = ds_dict["test"]
230
+
231
+ # Convert DictSample to typed schema
232
+ ds = load_dataset("./data/train.tar", split="train").as_type(ImageSample)
233
+ ```
234
+
235
+ ## Development
236
+
237
+ ### Setup
238
+
239
+ ```bash
240
+ # Install uv if not already available
241
+ python -m pip install uv
242
+
243
+ # Install dependencies
244
+ uv sync
245
+ ```
246
+
247
+ ### Testing
248
+
249
+ ```bash
250
+ # Run all tests with coverage
251
+ uv run pytest
252
+
253
+ # Run specific test file
254
+ uv run pytest tests/test_dataset.py
255
+
256
+ # Run single test
257
+ uv run pytest tests/test_lens.py::test_lens
258
+ ```
259
+
260
+ ### Building
261
+
262
+ ```bash
263
+ uv build
264
+ ```
265
+
266
+ ## Contributing
267
+
268
+ Contributions are welcome! This project is in beta, so the API may still evolve.
269
+
270
+ ## License
271
+
272
+ This project is licensed under the Mozilla Public License 2.0. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,28 @@
1
+ atdata/__init__.py,sha256=TH-HoeLOdDJL42HqcrcR4pAoizI21CTUffdBlnEQZzM,2451
2
+ atdata/_cid.py,sha256=aLH4Iov6oQsxFt4pj2-0SwcfBMQsIFtBDADQekQRtEw,4083
3
+ atdata/_helpers.py,sha256=RvA-Xlj3AvgSWuiPdS8YTBp8AJT-u32BaLpxsu4PIIA,1564
4
+ atdata/_hf_api.py,sha256=D4-2-3bhO9yO6G_wMBlNHO_Pcj-USsUUErJNgPbdK5w,23113
5
+ atdata/_protocols.py,sha256=qHZDas6jOOWWEJYwZU4zPsB9RcX1WgwnOLNsvMek9rg,16055
6
+ atdata/_schema_codec.py,sha256=5kKCYy2F4Tg_6wpcw2lxvKFUZrMmZsTBAuBqMvuthrw,14430
7
+ atdata/_sources.py,sha256=w3nSqvd74T68vDv_xFZzx4myaFilFWXvFX7OoAT6vBU,16820
8
+ atdata/_stub_manager.py,sha256=rTyxlf5LeEcs4SHdXI-xnthhboOjK9uWIAzyXj5w9MA,19150
9
+ atdata/_type_utils.py,sha256=yKx_imuYpZkJZOQRBuneoVm7HNVIXnLZ3SbLQA7OeFw,2891
10
+ atdata/dataset.py,sha256=HTWKBXc7nKB4iaYBRMzRhxQ3iN7DZGTSPPlpcxPIdoc,36113
11
+ atdata/lens.py,sha256=fKt07rKFgwmtr10ArzhvtxJw8krcs4qJHIMc9Hbe544,9907
12
+ atdata/local.py,sha256=Wg7ynRMmpup83Plh9EgHrshgiJfznzWvt0aXnxT0jfU,57547
13
+ atdata/promote.py,sha256=DdjgrbUSOrtPau_fyGdELMwkVM8kM0bTBH8ojpcJXtc,6385
14
+ atdata/atmosphere/__init__.py,sha256=DgBGgGN6AmRnq6Csm5kbxu24KBmp41QvVOvG0ewyOB8,9863
15
+ atdata/atmosphere/_types.py,sha256=TK08Skpy2mLEgyiQmeYF4Pl_V0YfOiHI2zlClAdFvv4,9604
16
+ atdata/atmosphere/client.py,sha256=cZQMJKJQ49CC99WYFsUw71iZD3tA9cRwGWpvcYjM1p8,16174
17
+ atdata/atmosphere/lens.py,sha256=8papN5AK8Id-1LxNaHLyvpR_YYIkt5ifMd-47-QFJt4,9377
18
+ atdata/atmosphere/records.py,sha256=aC1oaAyIUid56CGkFEiZe5-jGkmkVgcQKzzFauM9yso,16003
19
+ atdata/atmosphere/schema.py,sha256=IBbssMT8mXJbJ7otW5ZJXj9eZUCwI1VsYagjeEQPwCI,7767
20
+ atdata/atmosphere/store.py,sha256=dEVGJtIUfxWZhD4NraUNGjD8Umh3CugrDCZONn-i7r0,6366
21
+ atdata/cli/__init__.py,sha256=fLnG7-Nra0IEOK140c4RoSSRvah1uXIDKpl6uW31acc,5698
22
+ atdata/cli/diagnose.py,sha256=q6wj4Skl-KwEAnwD9Bnhk_Gr8EEnrsNkSxwKtk54LaM,5455
23
+ atdata/cli/local.py,sha256=xNItYhEYOGQd30_eJ39s8IQfIfrozMxblsPc29hY-rE,8046
24
+ atdata-0.2.2b1.dist-info/METADATA,sha256=QrVdfUO-0PrI2UI113lZw16cHh_Bgdqirx-xrSZ9_bg,7270
25
+ atdata-0.2.2b1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
26
+ atdata-0.2.2b1.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
27
+ atdata-0.2.2b1.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
28
+ atdata-0.2.2b1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,172 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: atdata
3
- Version: 0.1.3b4
4
- Summary: A loose federation of distributed, typed datasets
5
- Author-email: Maxine Levesque <hello@maxine.science>
6
- License-File: LICENSE
7
- Requires-Python: >=3.12
8
- Requires-Dist: fastparquet>=2024.11.0
9
- Requires-Dist: msgpack>=1.1.2
10
- Requires-Dist: numpy>=2.3.4
11
- Requires-Dist: ormsgpack>=1.11.0
12
- Requires-Dist: pandas>=2.3.3
13
- Requires-Dist: tqdm>=4.67.1
14
- Requires-Dist: webdataset>=1.0.2
15
- Description-Content-Type: text/markdown
16
-
17
- # atdata
18
-
19
- [![codecov](https://codecov.io/gh/foundation-ac/atdata/branch/main/graph/badge.svg)](https://codecov.io/gh/foundation-ac/atdata)
20
-
21
- A loose federation of distributed, typed datasets built on WebDataset.
22
-
23
- **atdata** provides a type-safe, composable framework for working with large-scale datasets. It combines the efficiency of WebDataset's tar-based storage with Python's type system and functional programming patterns.
24
-
25
- ## Features
26
-
27
- - **Typed Samples** - Define dataset schemas using Python dataclasses with automatic msgpack serialization
28
- - **Lens Transformations** - Bidirectional, composable transformations between different dataset views
29
- - **Automatic Batching** - Smart batch aggregation with numpy array stacking
30
- - **WebDataset Integration** - Efficient storage and streaming for large-scale datasets
31
-
32
- ## Installation
33
-
34
- ```bash
35
- pip install atdata
36
- ```
37
-
38
- Requires Python 3.12 or later.
39
-
40
- ## Quick Start
41
-
42
- ### Defining Sample Types
43
-
44
- Use the `@packable` decorator to create typed dataset samples:
45
-
46
- ```python
47
- import atdata
48
- from numpy.typing import NDArray
49
-
50
- @atdata.packable
51
- class ImageSample:
52
- image: NDArray
53
- label: str
54
- metadata: dict
55
- ```
56
-
57
- ### Creating Datasets
58
-
59
- ```python
60
- # Create a dataset
61
- dataset = atdata.Dataset[ImageSample]("path/to/data-{000000..000009}.tar")
62
-
63
- # Iterate over samples in order
64
- for sample in dataset.ordered(batch_size=None):
65
- print(f"Label: {sample.label}, Image shape: {sample.image.shape}")
66
-
67
- # Iterate with shuffling and batching
68
- for batch in dataset.shuffled(batch_size=32):
69
- # batch.image is automatically stacked into shape (32, ...)
70
- # batch.label is a list of 32 labels
71
- process_batch(batch.image, batch.label)
72
- ```
73
-
74
- ### Lens Transformations
75
-
76
- Define reusable transformations between sample types:
77
-
78
- ```python
79
- @atdata.packable
80
- class ProcessedSample:
81
- features: NDArray
82
- label: str
83
-
84
- @atdata.lens
85
- def preprocess(sample: ImageSample) -> ProcessedSample:
86
- features = extract_features(sample.image)
87
- return ProcessedSample(features=features, label=sample.label)
88
-
89
- # Apply lens to view dataset as ProcessedSample
90
- processed_ds = dataset.as_type(ProcessedSample)
91
-
92
- for sample in processed_ds.ordered(batch_size=None):
93
- # sample is now a ProcessedSample
94
- print(sample.features.shape)
95
- ```
96
-
97
- ## Core Concepts
98
-
99
- ### PackableSample
100
-
101
- Base class for serializable samples. Fields annotated as `NDArray` are automatically handled:
102
-
103
- ```python
104
- @atdata.packable
105
- class MySample:
106
- array_field: NDArray # Automatically serialized
107
- optional_array: NDArray | None
108
- regular_field: str
109
- ```
110
-
111
- ### Lens
112
-
113
- Bidirectional transformations with getter/putter semantics:
114
-
115
- ```python
116
- @atdata.lens
117
- def my_lens(source: SourceType) -> ViewType:
118
- # Transform source -> view
119
- return ViewType(...)
120
-
121
- @my_lens.putter
122
- def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
123
- # Transform view -> source
124
- return SourceType(...)
125
- ```
126
-
127
- ### Dataset URLs
128
-
129
- Uses WebDataset brace expansion for sharded datasets:
130
-
131
- - Single file: `"data/dataset-000000.tar"`
132
- - Multiple shards: `"data/dataset-{000000..000099}.tar"`
133
- - Multiple patterns: `"data/{train,val}/dataset-{000000..000009}.tar"`
134
-
135
- ## Development
136
-
137
- ### Setup
138
-
139
- ```bash
140
- # Install uv if not already available
141
- python -m pip install uv
142
-
143
- # Install dependencies
144
- uv sync
145
- ```
146
-
147
- ### Testing
148
-
149
- ```bash
150
- # Run all tests with coverage
151
- pytest
152
-
153
- # Run specific test file
154
- pytest tests/test_dataset.py
155
-
156
- # Run single test
157
- pytest tests/test_lens.py::test_lens
158
- ```
159
-
160
- ### Building
161
-
162
- ```bash
163
- uv build
164
- ```
165
-
166
- ## Contributing
167
-
168
- Contributions are welcome! This project is in beta, so the API may still evolve.
169
-
170
- ## License
171
-
172
- This project is licensed under the Mozilla Public License 2.0. See [LICENSE](LICENSE) for details.
@@ -1,9 +0,0 @@
1
- atdata/__init__.py,sha256=_363ZuJfwbBQTMYsoKOiyoBe4AHr3iplK-EQyrAeTdg,1545
2
- atdata/_helpers.py,sha256=RvA-Xlj3AvgSWuiPdS8YTBp8AJT-u32BaLpxsu4PIIA,1564
3
- atdata/dataset.py,sha256=O_7b3ub_M4IMRuhv95oz1PVFdsOhNiyXgtY8NphPdBk,27842
4
- atdata/lens.py,sha256=ynn1DQkR89eRL6JV9EsawuPY9JTrZ67pAX4cRvZ6UVk,11157
5
- atdata-0.1.3b4.dist-info/METADATA,sha256=SdZSI_SonE-pt4nhmFh5bz9zKD79wT2CKXKFxrTfvgc,4162
6
- atdata-0.1.3b4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
7
- atdata-0.1.3b4.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
8
- atdata-0.1.3b4.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
9
- atdata-0.1.3b4.dist-info/RECORD,,