atdata 0.2.0a1__py3-none-any.whl → 0.2.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/promote.py ADDED
@@ -0,0 +1,199 @@
+ """Promotion workflow for migrating datasets from local to atmosphere.
+
+ This module provides functionality to promote locally-indexed datasets to the
+ ATProto atmosphere network. This enables sharing datasets with the broader
+ federation while maintaining schema consistency.
+
+ Example:
+     ::
+
+         >>> from atdata.local import LocalIndex
+         >>> from atdata.atmosphere import AtmosphereClient
+         >>> from atdata.promote import promote_to_atmosphere
+         >>>
+         >>> # Setup
+         >>> local_index = LocalIndex()
+         >>> client = AtmosphereClient()
+         >>> client.login("handle.bsky.social", "app-password")
+         >>>
+         >>> # Promote a dataset
+         >>> entry = local_index.get_dataset("my-dataset")
+         >>> at_uri = promote_to_atmosphere(entry, local_index, client)
+ """
+
+ from typing import TYPE_CHECKING, Type
+
+ if TYPE_CHECKING:
+     from .local import LocalDatasetEntry, Index as LocalIndex
+     from .atmosphere import AtmosphereClient
+     from ._protocols import AbstractDataStore, Packable
+
+
+ def _find_existing_schema(
+     client: "AtmosphereClient",
+     name: str,
+     version: str,
+ ) -> str | None:
+     """Check whether a schema with the given name and version already exists.
+
+     Args:
+         client: Authenticated atmosphere client.
+         name: Schema name to search for.
+         version: Schema version to match.
+
+     Returns:
+         AT URI of the existing schema if found, None otherwise.
+     """
+     from .atmosphere import SchemaLoader
+
+     loader = SchemaLoader(client)
+     for record in loader.list_all():
+         rec_value = record.get("value", record)
+         if rec_value.get("name") == name and rec_value.get("version") == version:
+             return record.get("uri", "")
+     return None
+
+
+ def _find_or_publish_schema(
+     sample_type: "Type[Packable]",
+     version: str,
+     client: "AtmosphereClient",
+     description: str | None = None,
+ ) -> str:
+     """Find an existing schema or publish a new one.
+
+     Checks whether a schema with the same name and version already exists on
+     the user's atmosphere repository. If found, returns the existing URI to
+     avoid duplicates. Otherwise, publishes a new schema record.
+
+     Args:
+         sample_type: The PackableSample subclass to publish.
+         version: Semantic version string.
+         client: Authenticated atmosphere client.
+         description: Optional schema description.
+
+     Returns:
+         AT URI of the schema (existing or newly published).
+     """
+     from .atmosphere import SchemaPublisher
+
+     schema_name = f"{sample_type.__module__}.{sample_type.__name__}"
+
+     # Check for an existing schema
+     existing = _find_existing_schema(client, schema_name, version)
+     if existing:
+         return existing
+
+     # Publish a new schema
+     publisher = SchemaPublisher(client)
+     uri = publisher.publish(
+         sample_type,
+         version=version,
+         description=description,
+     )
+     return str(uri)
+
+
+ def promote_to_atmosphere(
+     local_entry: "LocalDatasetEntry",
+     local_index: "LocalIndex",
+     atmosphere_client: "AtmosphereClient",
+     *,
+     data_store: "AbstractDataStore | None" = None,
+     name: str | None = None,
+     description: str | None = None,
+     tags: list[str] | None = None,
+     license: str | None = None,
+ ) -> str:
+     """Promote a local dataset to the atmosphere network.
+
+     Takes a locally-indexed dataset and publishes it to ATProto, making it
+     discoverable on the federated atmosphere network.
+
+     Args:
+         local_entry: The LocalDatasetEntry to promote.
+         local_index: Local index containing the schema for this entry.
+         atmosphere_client: Authenticated AtmosphereClient.
+         data_store: Optional data store for copying data to a new location.
+             If None, the existing data_urls are used as-is.
+         name: Override name for the atmosphere record. Defaults to the local name.
+         description: Optional description for the dataset.
+         tags: Optional tags for discovery.
+         license: Optional license identifier.
+
+     Returns:
+         AT URI of the created atmosphere dataset record.
+
+     Raises:
+         KeyError: If the schema is not found in the local index.
+         ValueError: If the local entry has no data URLs.
+
+     Example:
+         ::
+
+             >>> entry = local_index.get_dataset("mnist-train")
+             >>> uri = promote_to_atmosphere(entry, local_index, client)
+             >>> print(uri)
+             at://did:plc:abc123/ac.foundation.dataset.datasetIndex/...
+     """
+     from .atmosphere import DatasetPublisher
+     from ._schema_codec import schema_to_type
+
+     # Validate that the entry has data
+     if not local_entry.data_urls:
+         raise ValueError(f"Local entry '{local_entry.name}' has no data URLs")
+
+     # Get the schema from the local index
+     schema_ref = local_entry.schema_ref
+     schema_record = local_index.get_schema(schema_ref)
+
+     # Reconstruct the sample type from the schema
+     sample_type = schema_to_type(schema_record)
+     schema_version = schema_record.get("version", "1.0.0")
+
+     # Find or publish the schema on atmosphere (deduplicated)
+     atmosphere_schema_uri = _find_or_publish_schema(
+         sample_type,
+         schema_version,
+         atmosphere_client,
+         description=schema_record.get("description"),
+     )
+
+     # Determine the data URLs
+     if data_store is not None:
+         # Copy data to the new storage location by creating a temporary
+         # Dataset that writes through the data store
+         from .dataset import Dataset
+
+         # Build a WebDataset URL from data_urls
+         if len(local_entry.data_urls) == 1:
+             wds_url = local_entry.data_urls[0]
+         else:
+             # Join multiple shard URLs into a space-separated list
+             wds_url = " ".join(local_entry.data_urls)
+
+         ds = Dataset[sample_type](wds_url)
+         prefix = f"promoted/{local_entry.name}"
+         data_urls = data_store.write_shards(ds, prefix=prefix)
+     else:
+         # Use the existing URLs as-is
+         data_urls = local_entry.data_urls
+
+     # Publish the dataset record to atmosphere
+     publisher = DatasetPublisher(atmosphere_client)
+     uri = publisher.publish_with_urls(
+         urls=data_urls,
+         schema_uri=atmosphere_schema_uri,
+         name=name or local_entry.name,
+         description=description,
+         tags=tags,
+         license=license,
+         metadata=local_entry.metadata,
+     )
+
+     return str(uri)
+
+
+ __all__ = [
+     "promote_to_atmosphere",
+ ]
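A usage sketch for reviewers, exercising only the API defined above (`LocalIndex.get_dataset`, `AtmosphereClient.login`, and the keyword-only `description`, `tags`, and `license` parameters of `promote_to_atmosphere`); the handle, dataset name, and metadata values are placeholders:

```python
from atdata.local import LocalIndex
from atdata.atmosphere import AtmosphereClient
from atdata.promote import promote_to_atmosphere

# Authenticate against the atmosphere network
client = AtmosphereClient()
client.login("handle.bsky.social", "app-password")

# Look up a locally indexed dataset and promote it; with no data_store,
# the entry's existing data_urls are published as-is
local_index = LocalIndex()
entry = local_index.get_dataset("mnist-train")
uri = promote_to_atmosphere(
    entry,
    local_index,
    client,
    description="MNIST training split",
    tags=["mnist", "vision"],
    license="CC-BY-4.0",
)
print(uri)  # at://did:plc:.../ac.foundation.dataset.datasetIndex/...

# Passing an AbstractDataStore implementation instead copies the shards
# under "promoted/<name>" before publishing (hypothetical store object):
# uri = promote_to_atmosphere(entry, local_index, client, data_store=my_store)
```

Because `_find_or_publish_schema` checks the repository before publishing, running the promotion twice should reuse the existing schema record rather than create a duplicate.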
{atdata-0.2.0a1.dist-info → atdata-0.2.2b1.dist-info}/METADATA RENAMED
@@ -1,12 +1,13 @@
  Metadata-Version: 2.4
  Name: atdata
- Version: 0.2.0a1
+ Version: 0.2.2b1
  Summary: A loose federation of distributed, typed datasets
- Author-email: Maxine Levesque <hello@maxine.science>
+ Author-email: Maxine Levesque <hello@maxine.science>, "Maxine @ Forecast Bio" <maxine@forecast.bio>
  License-File: LICENSE
  Requires-Python: >=3.12
  Requires-Dist: atproto>=0.0.65
  Requires-Dist: fastparquet>=2024.11.0
+ Requires-Dist: libipld>=3.3.2
  Requires-Dist: msgpack>=1.1.2
  Requires-Dist: numpy>=2.3.4
  Requires-Dist: ormsgpack>=1.11.0
@@ -34,9 +35,13 @@ A loose federation of distributed, typed datasets built on WebDataset.
  ## Features

  - **Typed Samples** - Define dataset schemas using Python dataclasses with automatic msgpack serialization
+ - **Schema-free Exploration** - Load datasets without defining a schema first using `DictSample`
  - **Lens Transformations** - Bidirectional, composable transformations between different dataset views
  - **Automatic Batching** - Smart batch aggregation with numpy array stacking
  - **WebDataset Integration** - Efficient storage and streaming for large-scale datasets
+ - **Flexible Data Sources** - Stream from local files, HTTP URLs, or S3-compatible storage
+ - **HuggingFace-style API** - `load_dataset()` with path resolution and split handling
+ - **Local & Atmosphere Storage** - Index datasets locally with Redis or publish to ATProto network

  ## Installation

@@ -48,9 +53,27 @@ Requires Python 3.12 or later.

  ## Quick Start

- ### Defining Sample Types
+ ### Loading Datasets

- Use the `@packable` decorator to create typed dataset samples:
+ The primary way to load datasets is with `load_dataset()`:
+
+ ```python
+ from atdata import load_dataset
+
+ # Load without specifying a type - returns Dataset[DictSample]
+ ds = load_dataset("path/to/data.tar", split="train")
+
+ # Explore the data
+ for sample in ds.ordered():
+     print(sample.keys())   # See available fields
+     print(sample["text"])  # Dict-style access
+     print(sample.label)    # Attribute access
+     break
+ ```
+
+ ### Defining Typed Schemas
+
+ Once you understand your data, define a typed schema with `@packable`:

  ```python
  import atdata
@@ -63,18 +86,21 @@ class ImageSample:
      metadata: dict
  ```

- ### Creating Datasets
+ ### Loading with Types

  ```python
- # Create a dataset
- dataset = atdata.Dataset[ImageSample]("path/to/data-{000000..000009}.tar")
+ # Load with explicit type
+ ds = load_dataset("path/to/data-{000000..000009}.tar", ImageSample, split="train")
+
+ # Or convert from DictSample
+ ds = load_dataset("path/to/data.tar", split="train").as_type(ImageSample)

- # Iterate over samples in order
- for sample in dataset.ordered(batch_size=None):
+ # Iterate over samples
+ for sample in ds.ordered():
      print(f"Label: {sample.label}, Image shape: {sample.image.shape}")

  # Iterate with shuffling and batching
- for batch in dataset.shuffled(batch_size=32):
+ for batch in ds.shuffled(batch_size=32):
      # batch.image is automatically stacked into shape (32, ...)
      # batch.label is a list of 32 labels
      process_batch(batch.image, batch.label)
@@ -105,9 +131,28 @@ for sample in processed_ds.ordered(batch_size=None):

  ## Core Concepts

+ ### DictSample
+
+ The default sample type for schema-free exploration. Provides both attribute and dict-style access:
+
+ ```python
+ ds = load_dataset("data.tar", split="train")
+
+ for sample in ds.ordered():
+     # Dict-style access
+     print(sample["field_name"])
+
+     # Attribute access
+     print(sample.field_name)
+
+     # Introspection
+     print(sample.keys())
+     print(sample.to_dict())
+ ```
+
  ### PackableSample

- Base class for serializable samples. Fields annotated as `NDArray` are automatically handled:
+ Base class for typed, serializable samples. Fields annotated as `NDArray` are automatically handled:

  ```python
  @atdata.packable
@@ -117,6 +162,8 @@ class MySample:
      regular_field: str
  ```

+ Every `@packable` class automatically registers a lens from `DictSample`, enabling seamless conversion via `.as_type()`.
+
  ### Lens

  Bidirectional transformations with getter/putter semantics:
@@ -133,6 +180,25 @@ def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
      return SourceType(...)
  ```

+ ### Data Sources
+
+ Datasets support multiple backends via the `DataSource` protocol:
+
+ ```python
+ # String URLs (most common) - automatically wrapped in URLSource
+ dataset = atdata.Dataset[ImageSample]("data-{000000..000009}.tar")
+
+ # S3 with authentication (private buckets, Cloudflare R2, MinIO)
+ source = atdata.S3Source(
+     bucket="my-bucket",
+     keys=["data-000000.tar", "data-000001.tar"],
+     endpoint="https://my-account.r2.cloudflarestorage.com",
+     access_key="...",
+     secret_key="...",
+ )
+ dataset = atdata.Dataset[ImageSample](source)
+ ```
+
  ### Dataset URLs

  Uses WebDataset brace expansion for sharded datasets:
@@ -141,6 +207,31 @@ Uses WebDataset brace expansion for sharded datasets:
  - Multiple shards: `"data/dataset-{000000..000099}.tar"`
  - Multiple patterns: `"data/{train,val}/dataset-{000000..000009}.tar"`

+ ### HuggingFace-style API
+
+ Load datasets with a familiar interface:
+
+ ```python
+ from atdata import load_dataset
+
+ # Load without type for exploration (returns Dataset[DictSample])
+ ds = load_dataset("./data/train-*.tar", split="train")
+
+ # Load with explicit type
+ ds = load_dataset("./data/train-*.tar", ImageSample, split="train")
+
+ # Load from S3 with brace notation
+ ds = load_dataset("s3://bucket/data-{000000..000099}.tar", ImageSample, split="train")
+
+ # Load all splits (returns DatasetDict)
+ ds_dict = load_dataset("./data", ImageSample)
+ train_ds = ds_dict["train"]
+ test_ds = ds_dict["test"]
+
+ # Convert DictSample to typed schema
+ ds = load_dataset("./data/train.tar", split="train").as_type(ImageSample)
+ ```
+
  ## Development

  ### Setup
@@ -157,13 +248,13 @@ uv sync

  ```bash
  # Run all tests with coverage
- pytest
+ uv run pytest

  # Run specific test file
- pytest tests/test_dataset.py
+ uv run pytest tests/test_dataset.py

  # Run single test
- pytest tests/test_lens.py::test_lens
+ uv run pytest tests/test_lens.py::test_lens
  ```

  ### Building
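Taken together, the README additions above describe a single exploration-to-training workflow. A condensed sketch that chains them, using only calls shown in the README (the shard pattern and field names are placeholders, and the `NDArray` import path is an assumption):

```python
import atdata
from atdata import load_dataset
from numpy.typing import NDArray  # assumed import path for the NDArray annotation

# 1. Explore schema-free: with no type argument, load_dataset
#    returns Dataset[DictSample]
ds = load_dataset("./data/train-{000000..000009}.tar", split="train")
for sample in ds.ordered():
    print(sample.keys())  # discover the available fields
    break

# 2. Pin the discovered fields down as a typed schema
@atdata.packable
class ImageSample:
    image: NDArray
    label: str

# 3. Convert through the lens auto-registered from DictSample,
#    then batch: arrays stack, other fields collect into lists
typed_ds = ds.as_type(ImageSample)
for batch in typed_ds.shuffled(batch_size=32):
    print(batch.image.shape, len(batch.label))
    break
```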
atdata-0.2.2b1.dist-info/RECORD ADDED
@@ -0,0 +1,28 @@
+ atdata/__init__.py,sha256=TH-HoeLOdDJL42HqcrcR4pAoizI21CTUffdBlnEQZzM,2451
+ atdata/_cid.py,sha256=aLH4Iov6oQsxFt4pj2-0SwcfBMQsIFtBDADQekQRtEw,4083
+ atdata/_helpers.py,sha256=RvA-Xlj3AvgSWuiPdS8YTBp8AJT-u32BaLpxsu4PIIA,1564
+ atdata/_hf_api.py,sha256=D4-2-3bhO9yO6G_wMBlNHO_Pcj-USsUUErJNgPbdK5w,23113
+ atdata/_protocols.py,sha256=qHZDas6jOOWWEJYwZU4zPsB9RcX1WgwnOLNsvMek9rg,16055
+ atdata/_schema_codec.py,sha256=5kKCYy2F4Tg_6wpcw2lxvKFUZrMmZsTBAuBqMvuthrw,14430
+ atdata/_sources.py,sha256=w3nSqvd74T68vDv_xFZzx4myaFilFWXvFX7OoAT6vBU,16820
+ atdata/_stub_manager.py,sha256=rTyxlf5LeEcs4SHdXI-xnthhboOjK9uWIAzyXj5w9MA,19150
+ atdata/_type_utils.py,sha256=yKx_imuYpZkJZOQRBuneoVm7HNVIXnLZ3SbLQA7OeFw,2891
+ atdata/dataset.py,sha256=HTWKBXc7nKB4iaYBRMzRhxQ3iN7DZGTSPPlpcxPIdoc,36113
+ atdata/lens.py,sha256=fKt07rKFgwmtr10ArzhvtxJw8krcs4qJHIMc9Hbe544,9907
+ atdata/local.py,sha256=Wg7ynRMmpup83Plh9EgHrshgiJfznzWvt0aXnxT0jfU,57547
+ atdata/promote.py,sha256=DdjgrbUSOrtPau_fyGdELMwkVM8kM0bTBH8ojpcJXtc,6385
+ atdata/atmosphere/__init__.py,sha256=DgBGgGN6AmRnq6Csm5kbxu24KBmp41QvVOvG0ewyOB8,9863
+ atdata/atmosphere/_types.py,sha256=TK08Skpy2mLEgyiQmeYF4Pl_V0YfOiHI2zlClAdFvv4,9604
+ atdata/atmosphere/client.py,sha256=cZQMJKJQ49CC99WYFsUw71iZD3tA9cRwGWpvcYjM1p8,16174
+ atdata/atmosphere/lens.py,sha256=8papN5AK8Id-1LxNaHLyvpR_YYIkt5ifMd-47-QFJt4,9377
+ atdata/atmosphere/records.py,sha256=aC1oaAyIUid56CGkFEiZe5-jGkmkVgcQKzzFauM9yso,16003
+ atdata/atmosphere/schema.py,sha256=IBbssMT8mXJbJ7otW5ZJXj9eZUCwI1VsYagjeEQPwCI,7767
+ atdata/atmosphere/store.py,sha256=dEVGJtIUfxWZhD4NraUNGjD8Umh3CugrDCZONn-i7r0,6366
+ atdata/cli/__init__.py,sha256=fLnG7-Nra0IEOK140c4RoSSRvah1uXIDKpl6uW31acc,5698
+ atdata/cli/diagnose.py,sha256=q6wj4Skl-KwEAnwD9Bnhk_Gr8EEnrsNkSxwKtk54LaM,5455
+ atdata/cli/local.py,sha256=xNItYhEYOGQd30_eJ39s8IQfIfrozMxblsPc29hY-rE,8046
+ atdata-0.2.2b1.dist-info/METADATA,sha256=QrVdfUO-0PrI2UI113lZw16cHh_Bgdqirx-xrSZ9_bg,7270
+ atdata-0.2.2b1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ atdata-0.2.2b1.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
+ atdata-0.2.2b1.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
+ atdata-0.2.2b1.dist-info/RECORD,,
atdata-0.2.0a1.dist-info/RECORD DELETED
@@ -1,16 +0,0 @@
- atdata/__init__.py,sha256=6RYvy9GJwqtSQbCS81HaQyOyAVgLxm63kBt0SH5Qapo,1642
- atdata/_helpers.py,sha256=RvA-Xlj3AvgSWuiPdS8YTBp8AJT-u32BaLpxsu4PIIA,1564
- atdata/dataset.py,sha256=O2j1_ABvTFcs83_y-GGDRROD9zRe-237O2OiI1NhySg,24173
- atdata/lens.py,sha256=lFFVeuKXa17KYjfz3VFqE9Xf0vy3C6puSiF78hyIaAI,9673
- atdata/local.py,sha256=IdNOTA0nvszG-XRkRMkT_zkMivIx93WKh3bpgIx_u_o,15458
- atdata/atmosphere/__init__.py,sha256=8tPDziazrQWdyvetWTVV1eWRt6JBy86WfnvAeyh8iJE,1743
- atdata/atmosphere/_types.py,sha256=0606wb2c8Ty7cmZWTh5mb_qwJmAwYf5oaJU_wk9moa8,9564
- atdata/atmosphere/client.py,sha256=tihVBlhPCz3TZBHs_Ce7uYwE70IzKyeXNpDKsN_qc5U,11358
- atdata/atmosphere/lens.py,sha256=BzUdagItYsyzYHtK1jqppJJ1VUHJVQRw0hi7LuvJG5Q,9267
- atdata/atmosphere/records.py,sha256=-9hhSLsr6sDHkzCVWDudZtxTMHXcVyUHeVojlNcGdL4,10672
- atdata/atmosphere/schema.py,sha256=6gQMGSRjgESaXZzBYMfO51qL9JMiyNGrqJe4iWarO7w,9872
- atdata-0.2.0a1.dist-info/METADATA,sha256=EBwfarL5lmzP2lMdn7Z9yfZBjP6TwTalnTDC8cc7cdY,4471
- atdata-0.2.0a1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- atdata-0.2.0a1.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
- atdata-0.2.0a1.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
- atdata-0.2.0a1.dist-info/RECORD,,