atdata 0.2.0a1__py3-none-any.whl → 0.2.3b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/promote.py ADDED
@@ -0,0 +1,195 @@
+ """Promotion workflow for migrating datasets from local to atmosphere.
+
+ This module provides functionality to promote locally-indexed datasets to the
+ ATProto atmosphere network. This enables sharing datasets with the broader
+ federation while maintaining schema consistency.
+
+ Examples:
+     >>> from atdata.local import LocalIndex, Repo
+     >>> from atdata.atmosphere import AtmosphereClient, AtmosphereIndex
+     >>> from atdata.promote import promote_to_atmosphere
+     >>>
+     >>> # Setup
+     >>> local_index = LocalIndex()
+     >>> client = AtmosphereClient()
+     >>> client.login("handle.bsky.social", "app-password")
+     >>>
+     >>> # Promote a dataset
+     >>> entry = local_index.get_dataset("my-dataset")
+     >>> at_uri = promote_to_atmosphere(entry, local_index, client)
+ """
+
+ from typing import TYPE_CHECKING, Type
+
+ if TYPE_CHECKING:
+     from .local import LocalDatasetEntry, Index as LocalIndex
+     from .atmosphere import AtmosphereClient
+     from ._protocols import AbstractDataStore, Packable
+
+
+ def _find_existing_schema(
+     client: "AtmosphereClient",
+     name: str,
+     version: str,
+ ) -> str | None:
+     """Check if a schema with the given name and version already exists.
+
+     Args:
+         client: Authenticated atmosphere client.
+         name: Schema name to search for.
+         version: Schema version to match.
+
+     Returns:
+         AT URI of existing schema if found, None otherwise.
+     """
+     from .atmosphere import SchemaLoader
+
+     loader = SchemaLoader(client)
+     for record in loader.list_all():
+         rec_value = record.get("value", record)
+         if rec_value.get("name") == name and rec_value.get("version") == version:
+             return record.get("uri", "")
+     return None
+
+
+ def _find_or_publish_schema(
+     sample_type: "Type[Packable]",
+     version: str,
+     client: "AtmosphereClient",
+     description: str | None = None,
+ ) -> str:
+     """Find existing schema or publish a new one.
+
+     Checks if a schema with the same name and version already exists on the
+     user's atmosphere repository. If found, returns the existing URI to avoid
+     duplicates. Otherwise, publishes a new schema record.
+
+     Args:
+         sample_type: The PackableSample subclass to publish.
+         version: Semantic version string.
+         client: Authenticated atmosphere client.
+         description: Optional schema description.
+
+     Returns:
+         AT URI of the schema (existing or newly published).
+     """
+     from .atmosphere import SchemaPublisher
+
+     schema_name = f"{sample_type.__module__}.{sample_type.__name__}"
+
+     # Check for existing schema
+     existing = _find_existing_schema(client, schema_name, version)
+     if existing:
+         return existing
+
+     # Publish new schema
+     publisher = SchemaPublisher(client)
+     uri = publisher.publish(
+         sample_type,
+         version=version,
+         description=description,
+     )
+     return str(uri)
+
+
+ def promote_to_atmosphere(
+     local_entry: "LocalDatasetEntry",
+     local_index: "LocalIndex",
+     atmosphere_client: "AtmosphereClient",
+     *,
+     data_store: "AbstractDataStore | None" = None,
+     name: str | None = None,
+     description: str | None = None,
+     tags: list[str] | None = None,
+     license: str | None = None,
+ ) -> str:
+     """Promote a local dataset to the atmosphere network.
+
+     This function takes a locally-indexed dataset and publishes it to ATProto,
+     making it discoverable on the federated atmosphere network.
+
+     Args:
+         local_entry: The LocalDatasetEntry to promote.
+         local_index: Local index containing the schema for this entry.
+         atmosphere_client: Authenticated AtmosphereClient.
+         data_store: Optional data store for copying data to a new location.
+             If None, the existing data_urls are used as-is.
+         name: Override name for the atmosphere record. Defaults to local name.
+         description: Optional description for the dataset.
+         tags: Optional tags for discovery.
+         license: Optional license identifier.
+
+     Returns:
+         AT URI of the created atmosphere dataset record.
+
+     Raises:
+         KeyError: If schema not found in local index.
+         ValueError: If local entry has no data URLs.
+
+     Examples:
+         >>> entry = local_index.get_dataset("mnist-train")
+         >>> uri = promote_to_atmosphere(entry, local_index, client)
+         >>> print(uri)
+         at://did:plc:abc123/ac.foundation.dataset.datasetIndex/...
+     """
+     from .atmosphere import DatasetPublisher
+     from ._schema_codec import schema_to_type
+
+     # Validate entry has data
+     if not local_entry.data_urls:
+         raise ValueError(f"Local entry '{local_entry.name}' has no data URLs")
+
+     # Get schema from local index
+     schema_ref = local_entry.schema_ref
+     schema_record = local_index.get_schema(schema_ref)
+
+     # Reconstruct sample type from schema
+     sample_type = schema_to_type(schema_record)
+     schema_version = schema_record.get("version", "1.0.0")
+
+     # Find or publish schema on atmosphere (deduplication)
+     atmosphere_schema_uri = _find_or_publish_schema(
+         sample_type,
+         schema_version,
+         atmosphere_client,
+         description=schema_record.get("description"),
+     )
+
+     # Determine data URLs
+     if data_store is not None:
+         # Copy data to new storage location
+         # Create a temporary Dataset to write through the data store
+         from .dataset import Dataset
+
+         # Build WDS URL from data_urls
+         if len(local_entry.data_urls) == 1:
+             wds_url = local_entry.data_urls[0]
+         else:
+             # Join multiple URLs into a space-separated list
+             wds_url = " ".join(local_entry.data_urls)
+
+         ds = Dataset[sample_type](wds_url)
+         prefix = f"promoted/{local_entry.name}"
+         data_urls = data_store.write_shards(ds, prefix=prefix)
+     else:
+         # Use existing URLs as-is
+         data_urls = local_entry.data_urls
+
+     # Publish dataset record to atmosphere
+     publisher = DatasetPublisher(atmosphere_client)
+     uri = publisher.publish_with_urls(
+         urls=data_urls,
+         schema_uri=atmosphere_schema_uri,
+         name=name or local_entry.name,
+         description=description,
+         tags=tags,
+         license=license,
+         metadata=local_entry.metadata,
+     )
+
+     return str(uri)
+
+
+ __all__ = [
+     "promote_to_atmosphere",
+ ]
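
A minimal end-to-end sketch of the new workflow, assembled from the docstrings above; the handle, dataset name, description, and tags are illustrative, not values from this diff:

```python
from atdata.local import LocalIndex
from atdata.atmosphere import AtmosphereClient
from atdata.promote import promote_to_atmosphere

# Look up a locally-indexed dataset
local_index = LocalIndex()
entry = local_index.get_dataset("my-dataset")

# Authenticate against the atmosphere network
client = AtmosphereClient()
client.login("handle.bsky.social", "app-password")

# Publish: the schema is deduplicated by (name, version) on the user's
# repository, then a dataset record pointing at the entry's existing
# data URLs is created (no data_store passed, so nothing is copied)
at_uri = promote_to_atmosphere(
    entry,
    local_index,
    client,
    description="Promoted from the local index",
    tags=["example"],
)
print(at_uri)  # at://did:plc:.../ac.foundation.dataset.datasetIndex/...
```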
atdata-0.2.3b1.dist-info/METADATA CHANGED
@@ -1,12 +1,14 @@
  Metadata-Version: 2.4
  Name: atdata
- Version: 0.2.0a1
+ Version: 0.2.3b1
  Summary: A loose federation of distributed, typed datasets
- Author-email: Maxine Levesque <hello@maxine.science>
+ Author-email: Maxine Levesque <hello@maxine.science>, "Maxine @ Forecast Bio" <maxine@forecast.bio>
  License-File: LICENSE
  Requires-Python: >=3.12
  Requires-Dist: atproto>=0.0.65
+ Requires-Dist: boto3>=1.41.5
  Requires-Dist: fastparquet>=2024.11.0
+ Requires-Dist: libipld>=3.3.2
  Requires-Dist: msgpack>=1.1.2
  Requires-Dist: numpy>=2.3.4
  Requires-Dist: ormsgpack>=1.11.0
@@ -34,9 +36,13 @@ A loose federation of distributed, typed datasets built on WebDataset.
  ## Features
 
  - **Typed Samples** - Define dataset schemas using Python dataclasses with automatic msgpack serialization
+ - **Schema-free Exploration** - Load datasets without defining a schema first using `DictSample`
  - **Lens Transformations** - Bidirectional, composable transformations between different dataset views
  - **Automatic Batching** - Smart batch aggregation with numpy array stacking
  - **WebDataset Integration** - Efficient storage and streaming for large-scale datasets
+ - **Flexible Data Sources** - Stream from local files, HTTP URLs, or S3-compatible storage
+ - **HuggingFace-style API** - `load_dataset()` with path resolution and split handling
+ - **Local & Atmosphere Storage** - Index datasets locally with Redis or publish to ATProto network
 
  ## Installation
 
@@ -48,9 +54,27 @@ Requires Python 3.12 or later.
 
  ## Quick Start
 
- ### Defining Sample Types
+ ### Loading Datasets
 
- Use the `@packable` decorator to create typed dataset samples:
+ The primary way to load datasets is with `load_dataset()`:
+
+ ```python
+ from atdata import load_dataset
+
+ # Load without specifying a type - returns Dataset[DictSample]
+ ds = load_dataset("path/to/data.tar", split="train")
+
+ # Explore the data
+ for sample in ds.ordered():
+     print(sample.keys())   # See available fields
+     print(sample["text"])  # Dict-style access
+     print(sample.label)    # Attribute access
+     break
+ ```
+
+ ### Defining Typed Schemas
+
+ Once you understand your data, define a typed schema with `@packable`:
 
  ```python
  import atdata
@@ -63,18 +87,21 @@ class ImageSample:
      metadata: dict
  ```
 
- ### Creating Datasets
+ ### Loading with Types
 
  ```python
- # Create a dataset
- dataset = atdata.Dataset[ImageSample]("path/to/data-{000000..000009}.tar")
+ # Load with explicit type
+ ds = load_dataset("path/to/data-{000000..000009}.tar", ImageSample, split="train")
+
+ # Or convert from DictSample
+ ds = load_dataset("path/to/data.tar", split="train").as_type(ImageSample)
 
- # Iterate over samples in order
- for sample in dataset.ordered(batch_size=None):
+ # Iterate over samples
+ for sample in ds.ordered():
      print(f"Label: {sample.label}, Image shape: {sample.image.shape}")
 
  # Iterate with shuffling and batching
- for batch in dataset.shuffled(batch_size=32):
+ for batch in ds.shuffled(batch_size=32):
      # batch.image is automatically stacked into shape (32, ...)
      # batch.label is a list of 32 labels
      process_batch(batch.image, batch.label)
@@ -105,9 +132,28 @@ for sample in processed_ds.ordered(batch_size=None):
 
  ## Core Concepts
 
+ ### DictSample
+
+ The default sample type for schema-free exploration. Provides both attribute and dict-style access:
+
+ ```python
+ ds = load_dataset("data.tar", split="train")
+
+ for sample in ds.ordered():
+     # Dict-style access
+     print(sample["field_name"])
+
+     # Attribute access
+     print(sample.field_name)
+
+     # Introspection
+     print(sample.keys())
+     print(sample.to_dict())
+ ```
+
  ### PackableSample
 
- Base class for serializable samples. Fields annotated as `NDArray` are automatically handled:
+ Base class for typed, serializable samples. Fields annotated as `NDArray` are automatically handled:
 
  ```python
  @atdata.packable
@@ -117,6 +163,8 @@ class MySample:
      regular_field: str
  ```
 
+ Every `@packable` class automatically registers a lens from `DictSample`, enabling seamless conversion via `.as_type()`.
+
  ### Lens
 
  Bidirectional transformations with getter/putter semantics:
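
Only the closing lines of this section's code block appear as context in the next hunk. For orientation, a sketch of the full getter/putter pair in the README's own placeholder notation (`my_lens_get` is assumed; only `my_lens_put` is visible in this diff):

```python
# Getter: project a source sample into the view type (assumed counterpart)
def my_lens_get(source: SourceType) -> ViewType:
    return ViewType(...)

# Putter: write an edited view back into the source (visible as context below)
def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
    return SourceType(...)
```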
@@ -133,6 +181,25 @@ def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
      return SourceType(...)
  ```
 
+ ### Data Sources
+
+ Datasets support multiple backends via the `DataSource` protocol:
+
+ ```python
+ # String URLs (most common) - automatically wrapped in URLSource
+ dataset = atdata.Dataset[ImageSample]("data-{000000..000009}.tar")
+
+ # S3 with authentication (private buckets, Cloudflare R2, MinIO)
+ source = atdata.S3Source(
+     bucket="my-bucket",
+     keys=["data-000000.tar", "data-000001.tar"],
+     endpoint="https://my-account.r2.cloudflarestorage.com",
+     access_key="...",
+     secret_key="...",
+ )
+ dataset = atdata.Dataset[ImageSample](source)
+ ```
+
  ### Dataset URLs
 
  Uses WebDataset brace expansion for sharded datasets:
@@ -141,6 +208,31 @@ Uses WebDataset brace expansion for sharded datasets:
  - Multiple shards: `"data/dataset-{000000..000099}.tar"`
  - Multiple patterns: `"data/{train,val}/dataset-{000000..000009}.tar"`
 
+ ### HuggingFace-style API
+
+ Load datasets with a familiar interface:
+
+ ```python
+ from atdata import load_dataset
+
+ # Load without type for exploration (returns Dataset[DictSample])
+ ds = load_dataset("./data/train-*.tar", split="train")
+
+ # Load with explicit type
+ ds = load_dataset("./data/train-*.tar", ImageSample, split="train")
+
+ # Load from S3 with brace notation
+ ds = load_dataset("s3://bucket/data-{000000..000099}.tar", ImageSample, split="train")
+
+ # Load all splits (returns DatasetDict)
+ ds_dict = load_dataset("./data", ImageSample)
+ train_ds = ds_dict["train"]
+ test_ds = ds_dict["test"]
+
+ # Convert DictSample to typed schema
+ ds = load_dataset("./data/train.tar", split="train").as_type(ImageSample)
+ ```
+
  ## Development
 
  ### Setup
@@ -157,13 +249,13 @@ uv sync
 
  ```bash
  # Run all tests with coverage
- pytest
+ uv run pytest
 
  # Run specific test file
- pytest tests/test_dataset.py
+ uv run pytest tests/test_dataset.py
 
  # Run single test
- pytest tests/test_lens.py::test_lens
+ uv run pytest tests/test_lens.py::test_lens
  ```
 
  ### Building
atdata-0.2.3b1.dist-info/RECORD ADDED
@@ -0,0 +1,28 @@
+ atdata/__init__.py,sha256=yMp3NFDIerlv0U0ltXnTg2CvbUY-9C_etNYA1JAaf88,2452
+ atdata/_cid.py,sha256=6wLV_dcQJy5Eb-wld7_h7Kcp7QoVixIqUDIIoSwpQms,3992
+ atdata/_helpers.py,sha256=zoo9tKs_soM9n_gTQ_DRgA3iPi8i8W01L819UmzVcwo,1553
+ atdata/_hf_api.py,sha256=cG8JIZAOcKEVCS0XGthe-5YPMUsxHnOFpe_HeYN-WEs,22948
+ atdata/_protocols.py,sha256=TkBnA4mosvelsxSGnzUUf8DAcspt-zwmSLbJPGuxkRE,15764
+ atdata/_schema_codec.py,sha256=I2cjXuICpdP1cMsG7Vpj6T7Kz0zEwYO780pAmnpjGj8,14352
+ atdata/_sources.py,sha256=A7HMkS_dqN5Sx7rG1nZsO1Laxozt3C65_P2Hiv41VXk,16624
+ atdata/_stub_manager.py,sha256=Heh0HAYjVjnkUQcPWAEOrkkEKN3Mi9vTJPA-ZRseFw8,19141
+ atdata/_type_utils.py,sha256=p8pdo_Ujtds1F_G816DsPKPY9JxI8Aha6iFruvn11ro,2947
+ atdata/dataset.py,sha256=VTdK6rssSIHJH9GzDGLJYO8PJKNkxWW8g-U2ZQTxB_U,36773
+ atdata/lens.py,sha256=vyoSRMEyqk9npKmm8vfhMsO-TOfpakDNJyD_GqnqmDM,9670
+ atdata/local.py,sha256=S9uAsxrTm8kBwWF7VjrNaSMJvIllyFYP6a75oEyJljA,57352
+ atdata/promote.py,sha256=fPLVNkwukX5rjvR4z24K4kQbXlWRtHzHbQYQ3P8dcy8,6303
+ atdata/atmosphere/__init__.py,sha256=pm6nskOZguhnFiDbKK99uHHQW3c7v3Qe2OJmDfFSjaY,9778
+ atdata/atmosphere/_types.py,sha256=MRhXnmAuQLJPdoq1skrBGXCsaQYdtKG_nA3YlSjwJXY,9595
+ atdata/atmosphere/client.py,sha256=acw82w3_cxWbWDtIRvH1VDHGJSroGqhSenFFostXTXo,16210
+ atdata/atmosphere/lens.py,sha256=EnrddTD-SAnyxU0XF__QkePLUhb4lPwfFLaseL_STDc,9260
+ atdata/atmosphere/records.py,sha256=esEm8Lz2zUi4CS9udHTAeNLCHwilHM3VljhY202fdMk,15844
+ atdata/atmosphere/schema.py,sha256=6V_lL-aFtgt56cbJioYygOrUFdzn6Hj5gqgBch__HMw,7767
+ atdata/atmosphere/store.py,sha256=NR4tGS9u3_ogvnyyOHDVF0tRKChruj_NE9Df4qrZiDU,6324
+ atdata/cli/__init__.py,sha256=R8GvGfbLhdGTStBgaD4nUGkInNE_pY60z_hA-rKPWH4,5728
+ atdata/cli/diagnose.py,sha256=Det9ozOvxXKd8Abu-xEsMjaXR34H_cuSX9MJJIlhnsA,5483
+ atdata/cli/local.py,sha256=7yatEQ61ipdtWtlMcIeMgcQLfu3ysCqOrcYBCyG3ivA,8077
+ atdata-0.2.3b1.dist-info/METADATA,sha256=beXT0CgUFSG9heYgNS92KnvaGYimC4T7ZtspWuIzYl8,7299
+ atdata-0.2.3b1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ atdata-0.2.3b1.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
+ atdata-0.2.3b1.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
+ atdata-0.2.3b1.dist-info/RECORD,,
atdata-0.2.0a1.dist-info/RECORD REMOVED
@@ -1,16 +0,0 @@
- atdata/__init__.py,sha256=6RYvy9GJwqtSQbCS81HaQyOyAVgLxm63kBt0SH5Qapo,1642
- atdata/_helpers.py,sha256=RvA-Xlj3AvgSWuiPdS8YTBp8AJT-u32BaLpxsu4PIIA,1564
- atdata/dataset.py,sha256=O2j1_ABvTFcs83_y-GGDRROD9zRe-237O2OiI1NhySg,24173
- atdata/lens.py,sha256=lFFVeuKXa17KYjfz3VFqE9Xf0vy3C6puSiF78hyIaAI,9673
- atdata/local.py,sha256=IdNOTA0nvszG-XRkRMkT_zkMivIx93WKh3bpgIx_u_o,15458
- atdata/atmosphere/__init__.py,sha256=8tPDziazrQWdyvetWTVV1eWRt6JBy86WfnvAeyh8iJE,1743
- atdata/atmosphere/_types.py,sha256=0606wb2c8Ty7cmZWTh5mb_qwJmAwYf5oaJU_wk9moa8,9564
- atdata/atmosphere/client.py,sha256=tihVBlhPCz3TZBHs_Ce7uYwE70IzKyeXNpDKsN_qc5U,11358
- atdata/atmosphere/lens.py,sha256=BzUdagItYsyzYHtK1jqppJJ1VUHJVQRw0hi7LuvJG5Q,9267
- atdata/atmosphere/records.py,sha256=-9hhSLsr6sDHkzCVWDudZtxTMHXcVyUHeVojlNcGdL4,10672
- atdata/atmosphere/schema.py,sha256=6gQMGSRjgESaXZzBYMfO51qL9JMiyNGrqJe4iWarO7w,9872
- atdata-0.2.0a1.dist-info/METADATA,sha256=EBwfarL5lmzP2lMdn7Z9yfZBjP6TwTalnTDC8cc7cdY,4471
- atdata-0.2.0a1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- atdata-0.2.0a1.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
- atdata-0.2.0a1.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
- atdata-0.2.0a1.dist-info/RECORD,,