atdata 0.1.3b4__py3-none-any.whl → 0.2.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,332 @@
+ """ATProto integration for distributed dataset federation.
+
+ This module provides ATProto publishing and discovery capabilities for atdata,
+ enabling a loose federation of distributed, typed datasets on the AT Protocol
+ network.
+
+ Key components:
+
+ - ``AtmosphereClient``: Authentication and session management for ATProto
+ - ``SchemaPublisher``: Publish PackableSample schemas as ATProto records
+ - ``DatasetPublisher``: Publish dataset index records with WebDataset URLs
+ - ``LensPublisher``: Publish lens transformation records
+
+ The ATProto integration is additive: existing atdata functionality continues
+ to work unchanged. These features are opt-in for users who want to publish
+ or discover datasets on the ATProto network.
+
+ Example:
+     ::
+
+         >>> from atdata.atmosphere import AtmosphereClient, SchemaPublisher
+         >>>
+         >>> client = AtmosphereClient()
+         >>> client.login("handle.bsky.social", "app-password")
+         >>>
+         >>> publisher = SchemaPublisher(client)
+         >>> schema_uri = publisher.publish(MySampleType, version="1.0.0")
+
+ Note:
+     This module requires the ``atproto`` package to be installed::
+
+         pip install atproto
+ """
+
+ from typing import Iterator, Optional, Type, TYPE_CHECKING
+
+ from .client import AtmosphereClient
+ from .schema import SchemaPublisher, SchemaLoader
+ from .records import DatasetPublisher, DatasetLoader
+ from .lens import LensPublisher, LensLoader
+ from .store import PDSBlobStore
+ from ._types import (
+     AtUri,
+     SchemaRecord,
+     DatasetRecord,
+     LensRecord,
+ )
+
+ if TYPE_CHECKING:
+     from ..dataset import Dataset
+     from .._protocols import Packable
+
+
+ class AtmosphereIndexEntry:
+     """Entry wrapper for ATProto dataset records implementing the IndexEntry protocol.
+
+     Attributes:
+         _uri: AT URI of the record.
+         _record: Raw record dictionary.
+     """
+
+     def __init__(self, uri: str, record: dict):
+         self._uri = uri
+         self._record = record
+
+     @property
+     def name(self) -> str:
+         """Human-readable dataset name."""
+         return self._record.get("name", "")
+
+     @property
+     def schema_ref(self) -> str:
+         """AT URI of the schema record."""
+         return self._record.get("schemaRef", "")
+
+     @property
+     def data_urls(self) -> list[str]:
+         """WebDataset URLs from external storage."""
+         storage = self._record.get("storage", {})
+         storage_type = storage.get("$type", "")
+         if "storageExternal" in storage_type:
+             return storage.get("urls", [])
+         return []
+
+     @property
+     def metadata(self) -> Optional[dict]:
+         """Metadata from the record, if any."""
+         import msgpack
+         metadata_bytes = self._record.get("metadata")
+         if metadata_bytes is None:
+             return None
+         return msgpack.unpackb(metadata_bytes, raw=False)
+
+     @property
+     def uri(self) -> str:
+         """AT URI of this record."""
+         return self._uri
+
+
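To make the wrapper concrete, a minimal sketch of an entry wrapping a raw record dict. The record values here are hypothetical, but the keys ("name", "schemaRef", "storage") are exactly what the properties above read:

    from atdata.atmosphere import AtmosphereIndexEntry

    # A hypothetical dataset record, shaped like the dicts the properties read.
    record = {
        "name": "my-data",
        "schemaRef": "at://did:plc:abc123/ac.foundation.dataset.sampleSchema/xyz",
        "storage": {
            "$type": "ac.foundation.dataset.storageExternal",
            "urls": ["https://example.com/shards/data-{000000..000009}.tar"],
        },
    }

    entry = AtmosphereIndexEntry("at://did:plc:abc123/ac.foundation.dataset.record/abc", record)
    assert entry.name == "my-data"
    assert entry.data_urls == ["https://example.com/shards/data-{000000..000009}.tar"]
    assert entry.metadata is None  # no msgpack-encoded metadata in this record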
+ class AtmosphereIndex:
+     """ATProto index implementing the AbstractIndex protocol.
+
+     Wraps SchemaPublisher/Loader and DatasetPublisher/Loader to provide
+     a unified interface compatible with LocalIndex.
+
+     Optionally accepts a ``PDSBlobStore`` for writing dataset shards as
+     ATProto blobs, enabling fully decentralized dataset storage.
+
+     Example:
+         ::
+
+             >>> client = AtmosphereClient()
+             >>> client.login("handle.bsky.social", "app-password")
+             >>>
+             >>> # Without blob storage (external URLs only)
+             >>> index = AtmosphereIndex(client)
+             >>>
+             >>> # With PDS blob storage
+             >>> store = PDSBlobStore(client)
+             >>> index = AtmosphereIndex(client, data_store=store)
+             >>> entry = index.insert_dataset(dataset, name="my-data")
+     """
+
+     def __init__(
+         self,
+         client: AtmosphereClient,
+         *,
+         data_store: Optional[PDSBlobStore] = None,
+     ):
+         """Initialize the atmosphere index.
+
+         Args:
+             client: Authenticated AtmosphereClient instance.
+             data_store: Optional PDSBlobStore for writing shards as blobs.
+                 If provided, insert_dataset will upload shards to PDS.
+         """
+         self.client = client
+         self._schema_publisher = SchemaPublisher(client)
+         self._schema_loader = SchemaLoader(client)
+         self._dataset_publisher = DatasetPublisher(client)
+         self._dataset_loader = DatasetLoader(client)
+         self._data_store = data_store
+
+     @property
+     def data_store(self) -> Optional[PDSBlobStore]:
+         """The PDS blob store for writing shards, or None if not configured."""
+         return self._data_store
+
+     # Dataset operations
+
+     def insert_dataset(
+         self,
+         ds: "Dataset",
+         *,
+         name: str,
+         schema_ref: Optional[str] = None,
+         **kwargs,
+     ) -> AtmosphereIndexEntry:
+         """Insert a dataset into ATProto.
+
+         Args:
+             ds: The Dataset to publish.
+             name: Human-readable name.
+             schema_ref: Optional schema AT URI. If None, auto-publishes schema.
+             **kwargs: Additional options (description, tags, license).
+
+         Returns:
+             AtmosphereIndexEntry for the inserted dataset.
+         """
+         uri = self._dataset_publisher.publish(
+             ds,
+             name=name,
+             schema_uri=schema_ref,
+             description=kwargs.get("description"),
+             tags=kwargs.get("tags"),
+             license=kwargs.get("license"),
+             auto_publish_schema=(schema_ref is None),
+         )
+         record = self._dataset_loader.get(uri)
+         return AtmosphereIndexEntry(str(uri), record)
+
+     def get_dataset(self, ref: str) -> AtmosphereIndexEntry:
+         """Get a dataset by AT URI.
+
+         Args:
+             ref: AT URI of the dataset record.
+
+         Returns:
+             AtmosphereIndexEntry for the dataset.
+
+         Raises:
+             ValueError: If the record is not a dataset.
+         """
+         record = self._dataset_loader.get(ref)
+         return AtmosphereIndexEntry(ref, record)
+
+     @property
+     def datasets(self) -> Iterator[AtmosphereIndexEntry]:
+         """Lazily iterate over all dataset entries (AbstractIndex protocol).
+
+         Uses the authenticated user's repository.
+
+         Yields:
+             AtmosphereIndexEntry for each dataset.
+         """
+         records = self._dataset_loader.list_all()
+         for rec in records:
+             uri = rec.get("uri", "")
+             yield AtmosphereIndexEntry(uri, rec.get("value", rec))
+
+     def list_datasets(self, repo: Optional[str] = None) -> list[AtmosphereIndexEntry]:
+         """Get all dataset entries as a materialized list (AbstractIndex protocol).
+
+         Args:
+             repo: DID of the repository. Defaults to the authenticated user.
+
+         Returns:
+             List of AtmosphereIndexEntry for each dataset.
+         """
+         records = self._dataset_loader.list_all(repo=repo)
+         return [
+             AtmosphereIndexEntry(rec.get("uri", ""), rec.get("value", rec))
+             for rec in records
+         ]
+
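Taken together, the dataset operations support a publish-then-discover flow. A sketch, assuming an authenticated account and an existing Dataset instance ``ds`` (both hypothetical here):

    from atdata.atmosphere import AtmosphereClient, AtmosphereIndex

    client = AtmosphereClient()
    client.login("handle.bsky.social", "app-password")
    index = AtmosphereIndex(client)

    # Publish: schema_ref is None, so the schema record is auto-published.
    # `ds` stands in for a previously constructed atdata Dataset.
    entry = index.insert_dataset(ds, name="my-data", tags=["images"], license="CC-BY-4.0")
    print(entry.uri, entry.schema_ref)

    # Discover: lazily iterate the authenticated user's dataset records.
    for e in index.datasets:
        print(e.name, e.data_urls)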
226
+ # Schema operations
227
+
228
+ def publish_schema(
229
+ self,
230
+ sample_type: "Type[Packable]",
231
+ *,
232
+ version: str = "1.0.0",
233
+ **kwargs,
234
+ ) -> str:
235
+ """Publish a schema to ATProto.
236
+
237
+ Args:
238
+ sample_type: A Packable type (PackableSample subclass or @packable-decorated).
239
+ version: Semantic version string.
240
+ **kwargs: Additional options (description, metadata).
241
+
242
+ Returns:
243
+ AT URI of the schema record.
244
+ """
245
+ uri = self._schema_publisher.publish(
246
+ sample_type,
247
+ version=version,
248
+ description=kwargs.get("description"),
249
+ metadata=kwargs.get("metadata"),
250
+ )
251
+ return str(uri)
252
+
253
+ def get_schema(self, ref: str) -> dict:
254
+ """Get a schema record by AT URI.
255
+
256
+ Args:
257
+ ref: AT URI of the schema record.
258
+
259
+ Returns:
260
+ Schema record dictionary.
261
+
262
+ Raises:
263
+ ValueError: If record is not a schema.
264
+ """
265
+ return self._schema_loader.get(ref)
266
+
267
+ @property
268
+ def schemas(self) -> Iterator[dict]:
269
+ """Lazily iterate over all schema records (AbstractIndex protocol).
270
+
271
+ Uses the authenticated user's repository.
272
+
273
+ Yields:
274
+ Schema records as dictionaries.
275
+ """
276
+ records = self._schema_loader.list_all()
277
+ for rec in records:
278
+ yield rec.get("value", rec)
279
+
280
+ def list_schemas(self, repo: Optional[str] = None) -> list[dict]:
281
+ """Get all schema records as a materialized list (AbstractIndex protocol).
282
+
283
+ Args:
284
+ repo: DID of repository. Defaults to authenticated user.
285
+
286
+ Returns:
287
+ List of schema records as dictionaries.
288
+ """
289
+ records = self._schema_loader.list_all(repo=repo)
290
+ return [rec.get("value", rec) for rec in records]
291
+
292
+ def decode_schema(self, ref: str) -> "Type[Packable]":
293
+ """Reconstruct a Python type from a schema record.
294
+
295
+ Args:
296
+ ref: AT URI of the schema record.
297
+
298
+ Returns:
299
+ Dynamically generated Packable type.
300
+
301
+ Raises:
302
+ ValueError: If schema cannot be decoded.
303
+ """
304
+ from .._schema_codec import schema_to_type
305
+
306
+ schema = self.get_schema(ref)
307
+ return schema_to_type(schema)
308
+
309
+
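Reusing the ``index`` from the dataset sketch above, the schema methods round-trip: a type published with ``publish_schema`` can be rebuilt with ``decode_schema``. Here ``MySample`` is a placeholder for any Packable type:

    # MySample stands in for a PackableSample subclass or @packable-decorated type.
    schema_uri = index.publish_schema(MySample, version="1.0.0",
                                      description="Example schema")

    # Later, possibly on another machine, rebuild the type from the record.
    Rebuilt = index.decode_schema(schema_uri)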
+ __all__ = [
+     # Client
+     "AtmosphereClient",
+     # Storage
+     "PDSBlobStore",
+     # Unified index (AbstractIndex protocol)
+     "AtmosphereIndex",
+     "AtmosphereIndexEntry",
+     # Schema operations
+     "SchemaPublisher",
+     "SchemaLoader",
+     # Dataset operations
+     "DatasetPublisher",
+     "DatasetLoader",
+     # Lens operations
+     "LensPublisher",
+     "LensLoader",
+     # Types
+     "AtUri",
+     "SchemaRecord",
+     "DatasetRecord",
+     "LensRecord",
+ ]
@@ -0,0 +1,331 @@
+ """Type definitions for ATProto record structures.
+
+ This module defines the data structures used to represent ATProto records
+ for schemas, datasets, and lenses. These types map to the Lexicon definitions
+ in the ``ac.foundation.dataset.*`` namespace.
+ """
+
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from typing import Optional, Literal, Any
+
+ # Lexicon namespace for atdata records
+ LEXICON_NAMESPACE = "ac.foundation.dataset"
+
+
+ @dataclass
+ class AtUri:
+     """Parsed AT Protocol URI.
+
+     AT URIs follow the format ``at://<authority>/<collection>/<rkey>``.
+
+     Example:
+         ::
+
+             >>> uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.sampleSchema/xyz")
+             >>> uri.authority
+             'did:plc:abc123'
+             >>> uri.collection
+             'ac.foundation.dataset.sampleSchema'
+             >>> uri.rkey
+             'xyz'
+     """
+
+     authority: str
+     """The DID or handle of the repository owner."""
+
+     collection: str
+     """The NSID of the record collection."""
+
+     rkey: str
+     """The record key within the collection."""
+
+     @classmethod
+     def parse(cls, uri: str) -> "AtUri":
+         """Parse an AT URI string into components.
+
+         Args:
+             uri: AT URI string in the format ``at://<authority>/<collection>/<rkey>``.
+
+         Returns:
+             Parsed AtUri instance.
+
+         Raises:
+             ValueError: If the URI format is invalid.
+         """
+         if not uri.startswith("at://"):
+             raise ValueError(f"Invalid AT URI: must start with 'at://': {uri}")
+
+         parts = uri[5:].split("/")
+         if len(parts) < 3:
+             raise ValueError(f"Invalid AT URI: expected authority/collection/rkey: {uri}")
+
+         return cls(
+             authority=parts[0],
+             collection=parts[1],
+             rkey="/".join(parts[2:]),  # rkey may contain slashes
+         )
+
+     def __str__(self) -> str:
+         """Format as an AT URI string."""
+         return f"at://{self.authority}/{self.collection}/{self.rkey}"
+
+
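One subtlety worth showing: ``parse`` keeps any extra path segments inside ``rkey``, and ``str()`` reverses the parse. A quick sketch using only the behavior defined above:

    uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.record/some/nested/key")
    assert uri.rkey == "some/nested/key"  # rkey may contain slashes
    assert str(uri) == "at://did:plc:abc123/ac.foundation.dataset.record/some/nested/key"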
+ @dataclass
+ class FieldType:
+     """Schema field type definition.
+
+     Represents a type in the schema type system, supporting primitives,
+     ndarrays, and references to other schemas.
+     """
+
+     kind: Literal["primitive", "ndarray", "ref", "array"]
+     """The category of type."""
+
+     primitive: Optional[str] = None
+     """For kind='primitive': one of 'str', 'int', 'float', 'bool', 'bytes'."""
+
+     dtype: Optional[str] = None
+     """For kind='ndarray': numpy dtype string (e.g., 'float32')."""
+
+     shape: Optional[list[int | None]] = None
+     """For kind='ndarray': shape constraints (None for any dimension)."""
+
+     ref: Optional[str] = None
+     """For kind='ref': AT URI of referenced schema."""
+
+     items: Optional["FieldType"] = None
+     """For kind='array': type of array elements."""
+
+
+ @dataclass
+ class FieldDef:
+     """Schema field definition."""
+
+     name: str
+     """Field name."""
+
+     field_type: FieldType
+     """Type of this field."""
+
+     optional: bool = False
+     """Whether this field can be None."""
+
+     description: Optional[str] = None
+     """Human-readable description."""
+
+
+ @dataclass
+ class SchemaRecord:
+     """ATProto record for a PackableSample schema.
+
+     Maps to the ``ac.foundation.dataset.sampleSchema`` Lexicon.
+     """
+
+     name: str
+     """Human-readable schema name."""
+
+     version: str
+     """Semantic version string (e.g., '1.0.0')."""
+
+     fields: list[FieldDef]
+     """List of field definitions."""
+
+     description: Optional[str] = None
+     """Human-readable description."""
+
+     created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     """When this record was created."""
+
+     metadata: Optional[dict] = None
+     """Arbitrary metadata dictionary."""
+
+     def to_record(self) -> dict:
+         """Convert to ATProto record dict for publishing."""
+         record = {
+             "$type": f"{LEXICON_NAMESPACE}.sampleSchema",
+             "name": self.name,
+             "version": self.version,
+             "fields": [self._field_to_dict(f) for f in self.fields],
+             "createdAt": self.created_at.isoformat(),
+         }
+         if self.description:
+             record["description"] = self.description
+         if self.metadata:
+             record["metadata"] = self.metadata
+         return record
+
+     def _field_to_dict(self, field_def: FieldDef) -> dict:
+         """Convert a field definition to a dict."""
+         result = {
+             "name": field_def.name,
+             "fieldType": self._type_to_dict(field_def.field_type),
+             "optional": field_def.optional,
+         }
+         if field_def.description:
+             result["description"] = field_def.description
+         return result
+
+     def _type_to_dict(self, field_type: FieldType) -> dict:
+         """Convert a field type to a dict."""
+         result: dict = {"$type": f"{LEXICON_NAMESPACE}.schemaType#{field_type.kind}"}
+
+         if field_type.kind == "primitive":
+             result["primitive"] = field_type.primitive
+         elif field_type.kind == "ndarray":
+             result["dtype"] = field_type.dtype
+             if field_type.shape:
+                 result["shape"] = field_type.shape
+         elif field_type.kind == "ref":
+             result["ref"] = field_type.ref
+         elif field_type.kind == "array":
+             if field_type.items:
+                 result["items"] = self._type_to_dict(field_type.items)
+
+         return result
+
+
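For a feel of the wire format, a sketch of a two-field schema and the record dict that ``to_record`` produces; the field choices are illustrative, using only the types defined above:

    caption_field = FieldDef(
        name="caption",
        field_type=FieldType(kind="primitive", primitive="str"),
    )
    image_field = FieldDef(
        name="image",
        field_type=FieldType(kind="ndarray", dtype="float32", shape=[None, None, 3]),
        description="HWC image array",
    )
    schema = SchemaRecord(name="ImageCaption", version="1.0.0",
                          fields=[caption_field, image_field])

    record = schema.to_record()
    # record["$type"] == "ac.foundation.dataset.sampleSchema"
    # record["fields"][1]["fieldType"]["$type"] ends with "schemaType#ndarray"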
+ @dataclass
+ class StorageLocation:
+     """Dataset storage location specification."""
+
+     kind: Literal["external", "blobs"]
+     """Storage type: external URLs or ATProto blobs."""
+
+     urls: Optional[list[str]] = None
+     """For kind='external': WebDataset URLs with brace notation."""
+
+     blob_refs: Optional[list[dict]] = None
+     """For kind='blobs': ATProto blob references."""
+
+
+ @dataclass
+ class DatasetRecord:
+     """ATProto record for a dataset index.
+
+     Maps to the ``ac.foundation.dataset.record`` Lexicon.
+     """
+
+     name: str
+     """Human-readable dataset name."""
+
+     schema_ref: str
+     """AT URI of the schema record."""
+
+     storage: StorageLocation
+     """Where the dataset data is stored."""
+
+     description: Optional[str] = None
+     """Human-readable description."""
+
+     tags: list[str] = field(default_factory=list)
+     """Searchable tags."""
+
+     license: Optional[str] = None
+     """SPDX license identifier."""
+
+     created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     """When this record was created."""
+
+     metadata: Optional[bytes] = None
+     """Arbitrary metadata as msgpack-encoded bytes."""
+
+     def to_record(self) -> dict:
+         """Convert to ATProto record dict for publishing."""
+         record = {
+             "$type": f"{LEXICON_NAMESPACE}.record",
+             "name": self.name,
+             "schemaRef": self.schema_ref,
+             "storage": self._storage_to_dict(),
+             "createdAt": self.created_at.isoformat(),
+         }
+         if self.description:
+             record["description"] = self.description
+         if self.tags:
+             record["tags"] = self.tags
+         if self.license:
+             record["license"] = self.license
+         if self.metadata:
+             record["metadata"] = self.metadata
+         return record
+
+     def _storage_to_dict(self) -> dict:
+         """Convert the storage location to a dict."""
+         if self.storage.kind == "external":
+             return {
+                 "$type": f"{LEXICON_NAMESPACE}.storageExternal",
+                 "urls": self.storage.urls or [],
+             }
+         else:
+             return {
+                 "$type": f"{LEXICON_NAMESPACE}.storageBlobs",
+                 "blobs": self.storage.blob_refs or [],
+             }
+
+
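And the corresponding sketch for a dataset record pointing at externally hosted WebDataset shards (the URL and AT URI are hypothetical):

    dataset = DatasetRecord(
        name="my-data",
        schema_ref="at://did:plc:abc123/ac.foundation.dataset.sampleSchema/xyz",
        storage=StorageLocation(
            kind="external",
            urls=["https://example.com/shards/data-{000000..000009}.tar"],
        ),
        tags=["images", "captions"],
        license="CC-BY-4.0",
    )

    record = dataset.to_record()
    # record["storage"]["$type"] == "ac.foundation.dataset.storageExternal"
    # record["storage"]["urls"] holds the brace-notation shard URLs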
+ @dataclass
+ class CodeReference:
+     """Reference to lens code in a git repository."""
+
+     repository: str
+     """Git repository URL."""
+
+     commit: str
+     """Git commit hash."""
+
+     path: str
+     """Path to the code file/function."""
+
+
+ @dataclass
+ class LensRecord:
+     """ATProto record for a lens transformation.
+
+     Maps to the ``ac.foundation.dataset.lens`` Lexicon.
+     """
+
+     name: str
+     """Human-readable lens name."""
+
+     source_schema: str
+     """AT URI of the source schema."""
+
+     target_schema: str
+     """AT URI of the target schema."""
+
+     description: Optional[str] = None
+     """What this transformation does."""
+
+     getter_code: Optional[CodeReference] = None
+     """Reference to getter function code."""
+
+     putter_code: Optional[CodeReference] = None
+     """Reference to putter function code."""
+
+     created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+     """When this record was created."""
+
+     def to_record(self) -> dict:
+         """Convert to ATProto record dict for publishing."""
+         record: dict[str, Any] = {
+             "$type": f"{LEXICON_NAMESPACE}.lens",
+             "name": self.name,
+             "sourceSchema": self.source_schema,
+             "targetSchema": self.target_schema,
+             "createdAt": self.created_at.isoformat(),
+         }
+         if self.description:
+             record["description"] = self.description
+         if self.getter_code:
+             record["getterCode"] = {
+                 "repository": self.getter_code.repository,
+                 "commit": self.getter_code.commit,
+                 "path": self.getter_code.path,
+             }
+         if self.putter_code:
+             record["putterCode"] = {
+                 "repository": self.putter_code.repository,
+                 "commit": self.putter_code.commit,
+                 "path": self.putter_code.path,
+             }
+         return record
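
Finally, a sketch of a lens record whose getter code lives in a git repository; the repository, commit, path, and AT URIs are all hypothetical:

    lens = LensRecord(
        name="image-caption-to-caption-only",
        source_schema="at://did:plc:abc123/ac.foundation.dataset.sampleSchema/src",
        target_schema="at://did:plc:abc123/ac.foundation.dataset.sampleSchema/dst",
        getter_code=CodeReference(
            repository="https://github.com/example/lenses",
            commit="0123abc",
            path="lenses/caption_only.py",
        ),
    )

    record = lens.to_record()
    # record["getterCode"]["repository"] == "https://github.com/example/lenses"
    # no putter_code was given, so "putterCode" is omitted from the record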