atdata 0.1.3b4__py3-none-any.whl → 0.2.2b1__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- atdata/__init__.py +44 -8
- atdata/_cid.py +150 -0
- atdata/_hf_api.py +692 -0
- atdata/_protocols.py +519 -0
- atdata/_schema_codec.py +442 -0
- atdata/_sources.py +515 -0
- atdata/_stub_manager.py +529 -0
- atdata/_type_utils.py +90 -0
- atdata/atmosphere/__init__.py +332 -0
- atdata/atmosphere/_types.py +331 -0
- atdata/atmosphere/client.py +533 -0
- atdata/atmosphere/lens.py +284 -0
- atdata/atmosphere/records.py +509 -0
- atdata/atmosphere/schema.py +239 -0
- atdata/atmosphere/store.py +208 -0
- atdata/cli/__init__.py +213 -0
- atdata/cli/diagnose.py +165 -0
- atdata/cli/local.py +280 -0
- atdata/dataset.py +510 -324
- atdata/lens.py +63 -112
- atdata/local.py +1707 -0
- atdata/promote.py +199 -0
- atdata-0.2.2b1.dist-info/METADATA +272 -0
- atdata-0.2.2b1.dist-info/RECORD +28 -0
- {atdata-0.1.3b4.dist-info → atdata-0.2.2b1.dist-info}/WHEEL +1 -1
- atdata-0.1.3b4.dist-info/METADATA +0 -172
- atdata-0.1.3b4.dist-info/RECORD +0 -9
- {atdata-0.1.3b4.dist-info → atdata-0.2.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.1.3b4.dist-info → atdata-0.2.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/promote.py
ADDED
@@ -0,0 +1,199 @@
+"""Promotion workflow for migrating datasets from local to atmosphere.
+
+This module provides functionality to promote locally-indexed datasets to the
+ATProto atmosphere network. This enables sharing datasets with the broader
+federation while maintaining schema consistency.
+
+Example:
+    ::
+
+        >>> from atdata.local import LocalIndex, Repo
+        >>> from atdata.atmosphere import AtmosphereClient, AtmosphereIndex
+        >>> from atdata.promote import promote_to_atmosphere
+        >>>
+        >>> # Setup
+        >>> local_index = LocalIndex()
+        >>> client = AtmosphereClient()
+        >>> client.login("handle.bsky.social", "app-password")
+        >>>
+        >>> # Promote a dataset
+        >>> entry = local_index.get_dataset("my-dataset")
+        >>> at_uri = promote_to_atmosphere(entry, local_index, client)
+"""
+
+from typing import TYPE_CHECKING, Type
+
+if TYPE_CHECKING:
+    from .local import LocalDatasetEntry, Index as LocalIndex
+    from .atmosphere import AtmosphereClient
+    from ._protocols import AbstractDataStore, Packable
+
+
+def _find_existing_schema(
+    client: "AtmosphereClient",
+    name: str,
+    version: str,
+) -> str | None:
+    """Check if a schema with the given name and version already exists.
+
+    Args:
+        client: Authenticated atmosphere client.
+        name: Schema name to search for.
+        version: Schema version to match.
+
+    Returns:
+        AT URI of existing schema if found, None otherwise.
+    """
+    from .atmosphere import SchemaLoader
+
+    loader = SchemaLoader(client)
+    for record in loader.list_all():
+        rec_value = record.get("value", record)
+        if rec_value.get("name") == name and rec_value.get("version") == version:
+            return record.get("uri", "")
+    return None
+
+
+def _find_or_publish_schema(
+    sample_type: "Type[Packable]",
+    version: str,
+    client: "AtmosphereClient",
+    description: str | None = None,
+) -> str:
+    """Find existing schema or publish a new one.
+
+    Checks if a schema with the same name and version already exists on the
+    user's atmosphere repository. If found, returns the existing URI to avoid
+    duplicates. Otherwise, publishes a new schema record.
+
+    Args:
+        sample_type: The PackableSample subclass to publish.
+        version: Semantic version string.
+        client: Authenticated atmosphere client.
+        description: Optional schema description.
+
+    Returns:
+        AT URI of the schema (existing or newly published).
+    """
+    from .atmosphere import SchemaPublisher
+
+    schema_name = f"{sample_type.__module__}.{sample_type.__name__}"
+
+    # Check for existing schema
+    existing = _find_existing_schema(client, schema_name, version)
+    if existing:
+        return existing
+
+    # Publish new schema
+    publisher = SchemaPublisher(client)
+    uri = publisher.publish(
+        sample_type,
+        version=version,
+        description=description,
+    )
+    return str(uri)
+
+
+def promote_to_atmosphere(
+    local_entry: "LocalDatasetEntry",
+    local_index: "LocalIndex",
+    atmosphere_client: "AtmosphereClient",
+    *,
+    data_store: "AbstractDataStore | None" = None,
+    name: str | None = None,
+    description: str | None = None,
+    tags: list[str] | None = None,
+    license: str | None = None,
+) -> str:
+    """Promote a local dataset to the atmosphere network.
+
+    This function takes a locally-indexed dataset and publishes it to ATProto,
+    making it discoverable on the federated atmosphere network.
+
+    Args:
+        local_entry: The LocalDatasetEntry to promote.
+        local_index: Local index containing the schema for this entry.
+        atmosphere_client: Authenticated AtmosphereClient.
+        data_store: Optional data store for copying data to new location.
+            If None, the existing data_urls are used as-is.
+        name: Override name for the atmosphere record. Defaults to local name.
+        description: Optional description for the dataset.
+        tags: Optional tags for discovery.
+        license: Optional license identifier.
+
+    Returns:
+        AT URI of the created atmosphere dataset record.
+
+    Raises:
+        KeyError: If schema not found in local index.
+        ValueError: If local entry has no data URLs.
+
+    Example:
+        ::
+
+            >>> entry = local_index.get_dataset("mnist-train")
+            >>> uri = promote_to_atmosphere(entry, local_index, client)
+            >>> print(uri)
+            at://did:plc:abc123/ac.foundation.dataset.datasetIndex/...
+    """
+    from .atmosphere import DatasetPublisher
+    from ._schema_codec import schema_to_type
+
+    # Validate entry has data
+    if not local_entry.data_urls:
+        raise ValueError(f"Local entry '{local_entry.name}' has no data URLs")
+
+    # Get schema from local index
+    schema_ref = local_entry.schema_ref
+    schema_record = local_index.get_schema(schema_ref)
+
+    # Reconstruct sample type from schema
+    sample_type = schema_to_type(schema_record)
+    schema_version = schema_record.get("version", "1.0.0")
+
+    # Find or publish schema on atmosphere (deduplication)
+    atmosphere_schema_uri = _find_or_publish_schema(
+        sample_type,
+        schema_version,
+        atmosphere_client,
+        description=schema_record.get("description"),
+    )
+
+    # Determine data URLs
+    if data_store is not None:
+        # Copy data to new storage location
+        # Create a temporary Dataset to write through the data store
+        from .dataset import Dataset
+
+        # Build WDS URL from data_urls
+        if len(local_entry.data_urls) == 1:
+            wds_url = local_entry.data_urls[0]
+        else:
+            # Use brace notation for multiple URLs
+            wds_url = " ".join(local_entry.data_urls)
+
+        ds = Dataset[sample_type](wds_url)
+        prefix = f"promoted/{local_entry.name}"
+        data_urls = data_store.write_shards(ds, prefix=prefix)
+    else:
+        # Use existing URLs as-is
+        data_urls = local_entry.data_urls
+
+    # Publish dataset record to atmosphere
+    publisher = DatasetPublisher(atmosphere_client)
+    uri = publisher.publish_with_urls(
+        urls=data_urls,
+        schema_uri=atmosphere_schema_uri,
+        name=name or local_entry.name,
+        description=description,
+        tags=tags,
+        license=license,
+        metadata=local_entry.metadata,
+    )
+
+    return str(uri)
+
+
+__all__ = [
+    "promote_to_atmosphere",
+]
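For orientation, here is a minimal usage sketch of the new module, assembled from its own docstrings. The dataset name, tags, and license value are hypothetical; `data_store=None` leaves the shard URLs untouched, exactly as the code above does.

```python
from atdata.local import LocalIndex
from atdata.atmosphere import AtmosphereClient
from atdata.promote import promote_to_atmosphere

# Authenticate against ATProto with an app password
local_index = LocalIndex()
client = AtmosphereClient()
client.login("handle.bsky.social", "app-password")

entry = local_index.get_dataset("mnist-train")  # hypothetical dataset name
uri = promote_to_atmosphere(
    entry,
    local_index,
    client,
    data_store=None,           # keep existing data URLs; pass an AbstractDataStore to copy shards
    tags=["vision", "mnist"],  # hypothetical discovery tags
    license="MPL-2.0",         # hypothetical license identifier
)
print(uri)  # at://did:plc:.../ac.foundation.dataset.datasetIndex/...
```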
atdata-0.2.2b1.dist-info/METADATA
ADDED
@@ -0,0 +1,272 @@
+Metadata-Version: 2.4
+Name: atdata
+Version: 0.2.2b1
+Summary: A loose federation of distributed, typed datasets
+Author-email: Maxine Levesque <hello@maxine.science>, "Maxine @ Forecast Bio" <maxine@forecast.bio>
+License-File: LICENSE
+Requires-Python: >=3.12
+Requires-Dist: atproto>=0.0.65
+Requires-Dist: fastparquet>=2024.11.0
+Requires-Dist: libipld>=3.3.2
+Requires-Dist: msgpack>=1.1.2
+Requires-Dist: numpy>=2.3.4
+Requires-Dist: ormsgpack>=1.11.0
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: pydantic>=2.12.5
+Requires-Dist: python-dotenv>=1.2.1
+Requires-Dist: redis-om>=0.3.5
+Requires-Dist: requests>=2.32.5
+Requires-Dist: s3fs>=2025.12.0
+Requires-Dist: schemamodels>=0.9.1
+Requires-Dist: tqdm>=4.67.1
+Requires-Dist: webdataset>=1.0.2
+Provides-Extra: atmosphere
+Requires-Dist: atproto>=0.0.55; extra == 'atmosphere'
+Description-Content-Type: text/markdown
+
+# atdata
+
+[](https://codecov.io/gh/foundation-ac/atdata)
+
+A loose federation of distributed, typed datasets built on WebDataset.
+
+**atdata** provides a type-safe, composable framework for working with large-scale datasets. It combines the efficiency of WebDataset's tar-based storage with Python's type system and functional programming patterns.
+
+## Features
+
+- **Typed Samples** - Define dataset schemas using Python dataclasses with automatic msgpack serialization
+- **Schema-free Exploration** - Load datasets without defining a schema first using `DictSample`
+- **Lens Transformations** - Bidirectional, composable transformations between different dataset views
+- **Automatic Batching** - Smart batch aggregation with numpy array stacking
+- **WebDataset Integration** - Efficient storage and streaming for large-scale datasets
+- **Flexible Data Sources** - Stream from local files, HTTP URLs, or S3-compatible storage
+- **HuggingFace-style API** - `load_dataset()` with path resolution and split handling
+- **Local & Atmosphere Storage** - Index datasets locally with Redis or publish to ATProto network
+
+## Installation
+
+```bash
+pip install atdata
+```
+
+Requires Python 3.12 or later.
+
+## Quick Start
+
+### Loading Datasets
+
+The primary way to load datasets is with `load_dataset()`:
+
+```python
+from atdata import load_dataset
+
+# Load without specifying a type - returns Dataset[DictSample]
+ds = load_dataset("path/to/data.tar", split="train")
+
+# Explore the data
+for sample in ds.ordered():
+    print(sample.keys())   # See available fields
+    print(sample["text"])  # Dict-style access
+    print(sample.label)    # Attribute access
+    break
+```
+
+### Defining Typed Schemas
+
+Once you understand your data, define a typed schema with `@packable`:
+
+```python
+import atdata
+from numpy.typing import NDArray
+
+@atdata.packable
+class ImageSample:
+    image: NDArray
+    label: str
+    metadata: dict
+```
+
+### Loading with Types
+
+```python
+# Load with explicit type
+ds = load_dataset("path/to/data-{000000..000009}.tar", ImageSample, split="train")
+
+# Or convert from DictSample
+ds = load_dataset("path/to/data.tar", split="train").as_type(ImageSample)
+
+# Iterate over samples
+for sample in ds.ordered():
+    print(f"Label: {sample.label}, Image shape: {sample.image.shape}")
+
+# Iterate with shuffling and batching
+for batch in ds.shuffled(batch_size=32):
+    # batch.image is automatically stacked into shape (32, ...)
+    # batch.label is a list of 32 labels
+    process_batch(batch.image, batch.label)
+```
+
+### Lens Transformations
+
+Define reusable transformations between sample types:
+
+```python
+@atdata.packable
+class ProcessedSample:
+    features: NDArray
+    label: str
+
+@atdata.lens
+def preprocess(sample: ImageSample) -> ProcessedSample:
+    features = extract_features(sample.image)
+    return ProcessedSample(features=features, label=sample.label)
+
+# Apply lens to view dataset as ProcessedSample
+processed_ds = dataset.as_type(ProcessedSample)
+
+for sample in processed_ds.ordered(batch_size=None):
+    # sample is now a ProcessedSample
+    print(sample.features.shape)
+```
+
+## Core Concepts
+
+### DictSample
+
+The default sample type for schema-free exploration. Provides both attribute and dict-style access:
+
+```python
+ds = load_dataset("data.tar", split="train")
+
+for sample in ds.ordered():
+    # Dict-style access
+    print(sample["field_name"])
+
+    # Attribute access
+    print(sample.field_name)
+
+    # Introspection
+    print(sample.keys())
+    print(sample.to_dict())
+```
+
+### PackableSample
+
+Base class for typed, serializable samples. Fields annotated as `NDArray` are automatically handled:
+
+```python
+@atdata.packable
+class MySample:
+    array_field: NDArray  # Automatically serialized
+    optional_array: NDArray | None
+    regular_field: str
+```
+
+Every `@packable` class automatically registers a lens from `DictSample`, enabling seamless conversion via `.as_type()`.
+
+### Lens
+
+Bidirectional transformations with getter/putter semantics:
+
+```python
+@atdata.lens
+def my_lens(source: SourceType) -> ViewType:
+    # Transform source -> view
+    return ViewType(...)
+
+@my_lens.putter
+def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
+    # Transform view -> source
+    return SourceType(...)
+```
+
+### Data Sources
+
+Datasets support multiple backends via the `DataSource` protocol:
+
+```python
+# String URLs (most common) - automatically wrapped in URLSource
+dataset = atdata.Dataset[ImageSample]("data-{000000..000009}.tar")
+
+# S3 with authentication (private buckets, Cloudflare R2, MinIO)
+source = atdata.S3Source(
+    bucket="my-bucket",
+    keys=["data-000000.tar", "data-000001.tar"],
+    endpoint="https://my-account.r2.cloudflarestorage.com",
+    access_key="...",
+    secret_key="...",
+)
+dataset = atdata.Dataset[ImageSample](source)
+```
+
+### Dataset URLs
+
+Uses WebDataset brace expansion for sharded datasets:
+
+- Single file: `"data/dataset-000000.tar"`
+- Multiple shards: `"data/dataset-{000000..000099}.tar"`
+- Multiple patterns: `"data/{train,val}/dataset-{000000..000009}.tar"`
+
+### HuggingFace-style API
+
+Load datasets with a familiar interface:
+
+```python
+from atdata import load_dataset
+
+# Load without type for exploration (returns Dataset[DictSample])
+ds = load_dataset("./data/train-*.tar", split="train")
+
+# Load with explicit type
+ds = load_dataset("./data/train-*.tar", ImageSample, split="train")
+
+# Load from S3 with brace notation
+ds = load_dataset("s3://bucket/data-{000000..000099}.tar", ImageSample, split="train")
+
+# Load all splits (returns DatasetDict)
+ds_dict = load_dataset("./data", ImageSample)
+train_ds = ds_dict["train"]
+test_ds = ds_dict["test"]
+
+# Convert DictSample to typed schema
+ds = load_dataset("./data/train.tar", split="train").as_type(ImageSample)
+```
+
+## Development
+
+### Setup
+
+```bash
+# Install uv if not already available
+python -m pip install uv
+
+# Install dependencies
+uv sync
+```
+
+### Testing
+
+```bash
+# Run all tests with coverage
+uv run pytest
+
+# Run specific test file
+uv run pytest tests/test_dataset.py
+
+# Run single test
+uv run pytest tests/test_lens.py::test_lens
+```
+
+### Building
+
+```bash
+uv build
+```
+
+## Contributing
+
+Contributions are welcome! This project is in beta, so the API may still evolve.
+
+## License
+
+This project is licensed under the Mozilla Public License 2.0. See [LICENSE](LICENSE) for details.
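One gap in the README above: the "Dataset URLs" section lists brace-expansion patterns but never loads one. A minimal sketch using only the README's own API (shard paths hypothetical):

```python
from atdata import load_dataset

# Hypothetical layout: 100 shards named data/train-000000.tar .. data/train-000099.tar.
# WebDataset brace expansion enumerates all of them from a single URL string.
ds = load_dataset("data/train-{000000..000099}.tar", split="train")

for sample in ds.ordered():
    print(sample.keys())  # DictSample: inspect fields before committing to a schema
    break
```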
atdata-0.2.2b1.dist-info/RECORD
ADDED
@@ -0,0 +1,28 @@
+atdata/__init__.py,sha256=TH-HoeLOdDJL42HqcrcR4pAoizI21CTUffdBlnEQZzM,2451
+atdata/_cid.py,sha256=aLH4Iov6oQsxFt4pj2-0SwcfBMQsIFtBDADQekQRtEw,4083
+atdata/_helpers.py,sha256=RvA-Xlj3AvgSWuiPdS8YTBp8AJT-u32BaLpxsu4PIIA,1564
+atdata/_hf_api.py,sha256=D4-2-3bhO9yO6G_wMBlNHO_Pcj-USsUUErJNgPbdK5w,23113
+atdata/_protocols.py,sha256=qHZDas6jOOWWEJYwZU4zPsB9RcX1WgwnOLNsvMek9rg,16055
+atdata/_schema_codec.py,sha256=5kKCYy2F4Tg_6wpcw2lxvKFUZrMmZsTBAuBqMvuthrw,14430
+atdata/_sources.py,sha256=w3nSqvd74T68vDv_xFZzx4myaFilFWXvFX7OoAT6vBU,16820
+atdata/_stub_manager.py,sha256=rTyxlf5LeEcs4SHdXI-xnthhboOjK9uWIAzyXj5w9MA,19150
+atdata/_type_utils.py,sha256=yKx_imuYpZkJZOQRBuneoVm7HNVIXnLZ3SbLQA7OeFw,2891
+atdata/dataset.py,sha256=HTWKBXc7nKB4iaYBRMzRhxQ3iN7DZGTSPPlpcxPIdoc,36113
+atdata/lens.py,sha256=fKt07rKFgwmtr10ArzhvtxJw8krcs4qJHIMc9Hbe544,9907
+atdata/local.py,sha256=Wg7ynRMmpup83Plh9EgHrshgiJfznzWvt0aXnxT0jfU,57547
+atdata/promote.py,sha256=DdjgrbUSOrtPau_fyGdELMwkVM8kM0bTBH8ojpcJXtc,6385
+atdata/atmosphere/__init__.py,sha256=DgBGgGN6AmRnq6Csm5kbxu24KBmp41QvVOvG0ewyOB8,9863
+atdata/atmosphere/_types.py,sha256=TK08Skpy2mLEgyiQmeYF4Pl_V0YfOiHI2zlClAdFvv4,9604
+atdata/atmosphere/client.py,sha256=cZQMJKJQ49CC99WYFsUw71iZD3tA9cRwGWpvcYjM1p8,16174
+atdata/atmosphere/lens.py,sha256=8papN5AK8Id-1LxNaHLyvpR_YYIkt5ifMd-47-QFJt4,9377
+atdata/atmosphere/records.py,sha256=aC1oaAyIUid56CGkFEiZe5-jGkmkVgcQKzzFauM9yso,16003
+atdata/atmosphere/schema.py,sha256=IBbssMT8mXJbJ7otW5ZJXj9eZUCwI1VsYagjeEQPwCI,7767
+atdata/atmosphere/store.py,sha256=dEVGJtIUfxWZhD4NraUNGjD8Umh3CugrDCZONn-i7r0,6366
+atdata/cli/__init__.py,sha256=fLnG7-Nra0IEOK140c4RoSSRvah1uXIDKpl6uW31acc,5698
+atdata/cli/diagnose.py,sha256=q6wj4Skl-KwEAnwD9Bnhk_Gr8EEnrsNkSxwKtk54LaM,5455
+atdata/cli/local.py,sha256=xNItYhEYOGQd30_eJ39s8IQfIfrozMxblsPc29hY-rE,8046
+atdata-0.2.2b1.dist-info/METADATA,sha256=QrVdfUO-0PrI2UI113lZw16cHh_Bgdqirx-xrSZ9_bg,7270
+atdata-0.2.2b1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+atdata-0.2.2b1.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
+atdata-0.2.2b1.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
+atdata-0.2.2b1.dist-info/RECORD,,
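Each RECORD entry above is `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64 SHA-256 of the file with padding stripped, per the wheel RECORD format. A sketch of how one could recompute an entry's hash locally (the file path assumes an unpacked wheel):

```python
import base64
import hashlib
from pathlib import Path

def record_hash(path: str) -> str:
    """Wheel-RECORD style hash: urlsafe base64 of the SHA-256 digest, no padding."""
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Hypothetical check against the entry for atdata/promote.py above:
print(record_hash("atdata/promote.py") == "sha256=DdjgrbUSOrtPau_fyGdELMwkVM8kM0bTBH8ojpcJXtc")
```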
atdata-0.1.3b4.dist-info/METADATA
DELETED
@@ -1,172 +0,0 @@
-Metadata-Version: 2.4
-Name: atdata
-Version: 0.1.3b4
-Summary: A loose federation of distributed, typed datasets
-Author-email: Maxine Levesque <hello@maxine.science>
-License-File: LICENSE
-Requires-Python: >=3.12
-Requires-Dist: fastparquet>=2024.11.0
-Requires-Dist: msgpack>=1.1.2
-Requires-Dist: numpy>=2.3.4
-Requires-Dist: ormsgpack>=1.11.0
-Requires-Dist: pandas>=2.3.3
-Requires-Dist: tqdm>=4.67.1
-Requires-Dist: webdataset>=1.0.2
-Description-Content-Type: text/markdown
-
-# atdata
-
-[](https://codecov.io/gh/foundation-ac/atdata)
-
-A loose federation of distributed, typed datasets built on WebDataset.
-
-**atdata** provides a type-safe, composable framework for working with large-scale datasets. It combines the efficiency of WebDataset's tar-based storage with Python's type system and functional programming patterns.
-
-## Features
-
-- **Typed Samples** - Define dataset schemas using Python dataclasses with automatic msgpack serialization
-- **Lens Transformations** - Bidirectional, composable transformations between different dataset views
-- **Automatic Batching** - Smart batch aggregation with numpy array stacking
-- **WebDataset Integration** - Efficient storage and streaming for large-scale datasets
-
-## Installation
-
-```bash
-pip install atdata
-```
-
-Requires Python 3.12 or later.
-
-## Quick Start
-
-### Defining Sample Types
-
-Use the `@packable` decorator to create typed dataset samples:
-
-```python
-import atdata
-from numpy.typing import NDArray
-
-@atdata.packable
-class ImageSample:
-    image: NDArray
-    label: str
-    metadata: dict
-```
-
-### Creating Datasets
-
-```python
-# Create a dataset
-dataset = atdata.Dataset[ImageSample]("path/to/data-{000000..000009}.tar")
-
-# Iterate over samples in order
-for sample in dataset.ordered(batch_size=None):
-    print(f"Label: {sample.label}, Image shape: {sample.image.shape}")
-
-# Iterate with shuffling and batching
-for batch in dataset.shuffled(batch_size=32):
-    # batch.image is automatically stacked into shape (32, ...)
-    # batch.label is a list of 32 labels
-    process_batch(batch.image, batch.label)
-```
-
-### Lens Transformations
-
-Define reusable transformations between sample types:
-
-```python
-@atdata.packable
-class ProcessedSample:
-    features: NDArray
-    label: str
-
-@atdata.lens
-def preprocess(sample: ImageSample) -> ProcessedSample:
-    features = extract_features(sample.image)
-    return ProcessedSample(features=features, label=sample.label)
-
-# Apply lens to view dataset as ProcessedSample
-processed_ds = dataset.as_type(ProcessedSample)
-
-for sample in processed_ds.ordered(batch_size=None):
-    # sample is now a ProcessedSample
-    print(sample.features.shape)
-```
-
-## Core Concepts
-
-### PackableSample
-
-Base class for serializable samples. Fields annotated as `NDArray` are automatically handled:
-
-```python
-@atdata.packable
-class MySample:
-    array_field: NDArray  # Automatically serialized
-    optional_array: NDArray | None
-    regular_field: str
-```
-
-### Lens
-
-Bidirectional transformations with getter/putter semantics:
-
-```python
-@atdata.lens
-def my_lens(source: SourceType) -> ViewType:
-    # Transform source -> view
-    return ViewType(...)
-
-@my_lens.putter
-def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
-    # Transform view -> source
-    return SourceType(...)
-```
-
-### Dataset URLs
-
-Uses WebDataset brace expansion for sharded datasets:
-
-- Single file: `"data/dataset-000000.tar"`
-- Multiple shards: `"data/dataset-{000000..000099}.tar"`
-- Multiple patterns: `"data/{train,val}/dataset-{000000..000009}.tar"`
-
-## Development
-
-### Setup
-
-```bash
-# Install uv if not already available
-python -m pip install uv
-
-# Install dependencies
-uv sync
-```
-
-### Testing
-
-```bash
-# Run all tests with coverage
-pytest
-
-# Run specific test file
-pytest tests/test_dataset.py
-
-# Run single test
-pytest tests/test_lens.py::test_lens
-```
-
-### Building
-
-```bash
-uv build
-```
-
-## Contributing
-
-Contributions are welcome! This project is in beta, so the API may still evolve.
-
-## License
-
-This project is licensed under the Mozilla Public License 2.0. See [LICENSE](LICENSE) for details.
atdata-0.1.3b4.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-atdata/__init__.py,sha256=_363ZuJfwbBQTMYsoKOiyoBe4AHr3iplK-EQyrAeTdg,1545
-atdata/_helpers.py,sha256=RvA-Xlj3AvgSWuiPdS8YTBp8AJT-u32BaLpxsu4PIIA,1564
-atdata/dataset.py,sha256=O_7b3ub_M4IMRuhv95oz1PVFdsOhNiyXgtY8NphPdBk,27842
-atdata/lens.py,sha256=ynn1DQkR89eRL6JV9EsawuPY9JTrZ67pAX4cRvZ6UVk,11157
-atdata-0.1.3b4.dist-info/METADATA,sha256=SdZSI_SonE-pt4nhmFh5bz9zKD79wT2CKXKFxrTfvgc,4162
-atdata-0.1.3b4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-atdata-0.1.3b4.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
-atdata-0.1.3b4.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
-atdata-0.1.3b4.dist-info/RECORD,,
{atdata-0.1.3b4.dist-info → atdata-0.2.2b1.dist-info}/entry_points.txt
File without changes

{atdata-0.1.3b4.dist-info → atdata-0.2.2b1.dist-info}/licenses/LICENSE
File without changes