atdata 0.2.0a1__py3-none-any.whl → 0.2.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,13 +16,15 @@ to work unchanged. These features are opt-in for users who want to publish
16
16
  or discover datasets on the ATProto network.
17
17
 
18
18
  Example:
19
- >>> from atdata.atmosphere import AtmosphereClient, SchemaPublisher
20
- >>>
21
- >>> client = AtmosphereClient()
22
- >>> client.login("handle.bsky.social", "app-password")
23
- >>>
24
- >>> publisher = SchemaPublisher(client)
25
- >>> schema_uri = publisher.publish(MySampleType, version="1.0.0")
19
+ ::
20
+
21
+ >>> from atdata.atmosphere import AtmosphereClient, SchemaPublisher
22
+ >>>
23
+ >>> client = AtmosphereClient()
24
+ >>> client.login("handle.bsky.social", "app-password")
25
+ >>>
26
+ >>> publisher = SchemaPublisher(client)
27
+ >>> schema_uri = publisher.publish(MySampleType, version="1.0.0")
26
28
 
27
29
  Note:
28
30
  This module requires the ``atproto`` package to be installed::
@@ -30,10 +32,13 @@ Note:
30
32
  pip install atproto
31
33
  """
32
34
 
35
+ from typing import Iterator, Optional, Type, TYPE_CHECKING
36
+
33
37
  from .client import AtmosphereClient
34
38
  from .schema import SchemaPublisher, SchemaLoader
35
39
  from .records import DatasetPublisher, DatasetLoader
36
40
  from .lens import LensPublisher, LensLoader
41
+ from .store import PDSBlobStore
37
42
  from ._types import (
38
43
  AtUri,
39
44
  SchemaRecord,
@@ -41,9 +46,275 @@ from ._types import (
41
46
  LensRecord,
42
47
  )
43
48
 
49
+ if TYPE_CHECKING:
50
+ from ..dataset import Dataset
51
+ from .._protocols import Packable
52
+
53
+
54
class AtmosphereIndexEntry:
    """Entry wrapper for ATProto dataset records implementing IndexEntry protocol.

    The wrapper is a read-only view: every property reads straight from the
    record dictionary passed to the constructor, so no copy is made.

    Attributes:
        _uri: AT URI of the record.
        _record: Raw record dictionary.
    """

    def __init__(self, uri: str, record: dict):
        """Store the record's AT URI and its raw dictionary.

        Args:
            uri: AT URI of the record.
            record: Raw record dictionary.
        """
        self._uri = uri
        self._record = record

    def __repr__(self) -> str:
        """Return a short debugging representation (URI and name only)."""
        return f"{type(self).__name__}(uri={self._uri!r}, name={self.name!r})"

    @property
    def name(self) -> str:
        """Human-readable dataset name, or ``""`` when the record has none."""
        return self._record.get("name", "")

    @property
    def schema_ref(self) -> str:
        """AT URI of the schema record, or ``""`` when the record has none."""
        return self._record.get("schemaRef", "")

    @property
    def data_urls(self) -> list[str]:
        """WebDataset URLs from external storage.

        Returns:
            The ``urls`` list of the record's storage entry when its ``$type``
            contains ``"storageExternal"``; an empty list otherwise.
        """
        # ``or`` (rather than a .get default) also covers an explicit
        # ``"storage": None`` / ``"$type": None`` in the record.
        storage = self._record.get("storage") or {}
        storage_type = storage.get("$type") or ""
        if "storageExternal" in storage_type:
            return storage.get("urls", [])
        return []

    @property
    def metadata(self) -> Optional[dict]:
        """Metadata from the record, if any.

        Returns:
            The msgpack-decoded ``metadata`` field, or None when the record
            has no ``metadata`` entry.
        """
        metadata_bytes = self._record.get("metadata")
        if metadata_bytes is None:
            return None
        # Imported only once there is something to decode, so records
        # without metadata do not need msgpack at all.
        import msgpack

        return msgpack.unpackb(metadata_bytes, raw=False)

    @property
    def uri(self) -> str:
        """AT URI of this record."""
        return self._uri
98
+
99
+
100
class AtmosphereIndex:
    """ATProto-backed index exposing the AbstractIndex-style interface.

    Bundles a SchemaPublisher/SchemaLoader pair and a
    DatasetPublisher/DatasetLoader pair behind a single object so it can be
    used where a LocalIndex is expected.

    A ``PDSBlobStore`` may be passed at construction time. Within this class
    the store is only kept and exposed through :attr:`data_store`.

    Example:
        ::

            >>> client = AtmosphereClient()
            >>> client.login("handle.bsky.social", "app-password")
            >>>
            >>> # Without blob storage (external URLs only)
            >>> index = AtmosphereIndex(client)
            >>>
            >>> # With PDS blob storage
            >>> store = PDSBlobStore(client)
            >>> index = AtmosphereIndex(client, data_store=store)
            >>> entry = index.insert_dataset(dataset, name="my-data")
    """

    def __init__(
        self,
        client: AtmosphereClient,
        *,
        data_store: Optional[PDSBlobStore] = None,
    ):
        """Build the publisher/loader helpers around ``client``.

        Args:
            client: AtmosphereClient instance handed to every helper.
            data_store: Optional PDSBlobStore, retained for later retrieval
                via :attr:`data_store`.
        """
        self.client = client
        self._schema_publisher = SchemaPublisher(client)
        self._schema_loader = SchemaLoader(client)
        self._dataset_publisher = DatasetPublisher(client)
        self._dataset_loader = DatasetLoader(client)
        self._data_store = data_store

    @property
    def data_store(self) -> Optional[PDSBlobStore]:
        """The PDSBlobStore given at construction, or None."""
        return self._data_store

    # ------------------------------------------------------------------
    # Dataset operations
    # ------------------------------------------------------------------

    def insert_dataset(
        self,
        ds: "Dataset",
        *,
        name: str,
        schema_ref: Optional[str] = None,
        **kwargs,
    ) -> AtmosphereIndexEntry:
        """Publish a dataset record and return an entry wrapping it.

        Args:
            ds: The Dataset to publish.
            name: Human-readable name.
            schema_ref: Optional schema AT URI. When None, the publisher is
                asked to auto-publish the schema.
            **kwargs: Only ``description``, ``tags`` and ``license`` are
                read; any other keyword is ignored.

        Returns:
            AtmosphereIndexEntry built from the record re-read after
            publishing.
        """
        # NOTE(review): this method does not consult ``self._data_store``.
        optional_fields = {
            key: kwargs.get(key) for key in ("description", "tags", "license")
        }
        published_uri = self._dataset_publisher.publish(
            ds,
            name=name,
            schema_uri=schema_ref,
            **optional_fields,
            auto_publish_schema=schema_ref is None,
        )
        stored = self._dataset_loader.get(published_uri)
        return AtmosphereIndexEntry(str(published_uri), stored)

    def get_dataset(self, ref: str) -> AtmosphereIndexEntry:
        """Fetch one dataset record by AT URI.

        Args:
            ref: AT URI of the dataset record.

        Returns:
            AtmosphereIndexEntry wrapping the loaded record.

        Raises:
            ValueError: If record is not a dataset (presumably raised by the
                underlying loader; not visible in this class).
        """
        return AtmosphereIndexEntry(ref, self._dataset_loader.get(ref))

    @property
    def datasets(self) -> Iterator[AtmosphereIndexEntry]:
        """Lazily iterate over dataset entries (AbstractIndex protocol).

        Calls the dataset loader's ``list_all()`` with no arguments, which
        per the loader's contract uses the authenticated user's repository.

        Yields:
            AtmosphereIndexEntry for each listed dataset.
        """
        for listing in self._dataset_loader.list_all():
            # A listing may wrap the record under "value"; otherwise the
            # listing itself is treated as the record.
            yield AtmosphereIndexEntry(
                listing.get("uri", ""), listing.get("value", listing)
            )

    def list_datasets(self, repo: Optional[str] = None) -> list[AtmosphereIndexEntry]:
        """Return all dataset entries as a list (AbstractIndex protocol).

        Args:
            repo: DID of repository. Defaults to authenticated user.

        Returns:
            List of AtmosphereIndexEntry, one per listed dataset.
        """
        entries = []
        for listing in self._dataset_loader.list_all(repo=repo):
            entries.append(
                AtmosphereIndexEntry(
                    listing.get("uri", ""), listing.get("value", listing)
                )
            )
        return entries

    # ------------------------------------------------------------------
    # Schema operations
    # ------------------------------------------------------------------

    def publish_schema(
        self,
        sample_type: "Type[Packable]",
        *,
        version: str = "1.0.0",
        **kwargs,
    ) -> str:
        """Publish a schema record for ``sample_type``.

        Args:
            sample_type: A Packable type (PackableSample subclass or
                @packable-decorated).
            version: Semantic version string.
            **kwargs: Only ``description`` and ``metadata`` are read; any
                other keyword is ignored.

        Returns:
            AT URI of the schema record, as a string.
        """
        optional_fields = {
            key: kwargs.get(key) for key in ("description", "metadata")
        }
        schema_uri = self._schema_publisher.publish(
            sample_type,
            version=version,
            **optional_fields,
        )
        return str(schema_uri)

    def get_schema(self, ref: str) -> dict:
        """Fetch one schema record by AT URI.

        Args:
            ref: AT URI of the schema record.

        Returns:
            Schema record dictionary.

        Raises:
            ValueError: If record is not a schema (presumably raised by the
                underlying loader; not visible in this class).
        """
        return self._schema_loader.get(ref)

    @property
    def schemas(self) -> Iterator[dict]:
        """Lazily iterate over schema records (AbstractIndex protocol).

        Calls the schema loader's ``list_all()`` with no arguments, which
        per the loader's contract uses the authenticated user's repository.

        Yields:
            Schema records as dictionaries.
        """
        for listing in self._schema_loader.list_all():
            yield listing.get("value", listing)

    def list_schemas(self, repo: Optional[str] = None) -> list[dict]:
        """Return all schema records as a list (AbstractIndex protocol).

        Args:
            repo: DID of repository. Defaults to authenticated user.

        Returns:
            List of schema records as dictionaries.
        """
        schema_records = []
        for listing in self._schema_loader.list_all(repo=repo):
            schema_records.append(listing.get("value", listing))
        return schema_records

    def decode_schema(self, ref: str) -> "Type[Packable]":
        """Rebuild a Python type from the schema record at ``ref``.

        Args:
            ref: AT URI of the schema record.

        Returns:
            Dynamically generated Packable type.

        Raises:
            ValueError: If schema cannot be decoded.
        """
        # Imported inside the method, as in the original implementation.
        from .._schema_codec import schema_to_type

        return schema_to_type(self.get_schema(ref))
308
+
309
+
44
310
  __all__ = [
45
311
  # Client
46
312
  "AtmosphereClient",
313
+ # Storage
314
+ "PDSBlobStore",
315
+ # Unified index (AbstractIndex protocol)
316
+ "AtmosphereIndex",
317
+ "AtmosphereIndexEntry",
47
318
  # Schema operations
48
319
  "SchemaPublisher",
49
320
  "SchemaLoader",
@@ -20,13 +20,15 @@ class AtUri:
20
20
  AT URIs follow the format: at://<authority>/<collection>/<rkey>
21
21
 
22
22
  Example:
23
- >>> uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.sampleSchema/xyz")
24
- >>> uri.authority
25
- 'did:plc:abc123'
26
- >>> uri.collection
27
- 'ac.foundation.dataset.sampleSchema'
28
- >>> uri.rkey
29
- 'xyz'
23
+ ::
24
+
25
+ >>> uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.sampleSchema/xyz")
26
+ >>> uri.authority
27
+ 'did:plc:abc123'
28
+ >>> uri.collection
29
+ 'ac.foundation.dataset.sampleSchema'
30
+ >>> uri.rkey
31
+ 'xyz'
30
32
  """
31
33
 
32
34
  authority: str
@@ -34,10 +34,12 @@ class AtmosphereClient:
34
34
  for working with atdata records (schemas, datasets, lenses).
35
35
 
36
36
  Example:
37
- >>> client = AtmosphereClient()
38
- >>> client.login("alice.bsky.social", "app-password")
39
- >>> print(client.did)
40
- 'did:plc:...'
37
+ ::
38
+
39
+ >>> client = AtmosphereClient()
40
+ >>> client.login("alice.bsky.social", "app-password")
41
+ >>> print(client.did)
42
+ 'did:plc:...'
41
43
 
42
44
  Note:
43
45
  The password should be an app-specific password, not your main account
@@ -254,7 +256,18 @@ class AtmosphereClient:
254
256
  }
255
257
  )
256
258
 
257
- return response.value
259
+ # Convert ATProto model to dict if needed
260
+ value = response.value
261
+ # DotDict and similar ATProto models have to_dict()
262
+ if hasattr(value, "to_dict") and callable(value.to_dict):
263
+ return value.to_dict()
264
+ elif isinstance(value, dict):
265
+ return dict(value)
266
+ elif hasattr(value, "model_dump") and callable(value.model_dump):
267
+ return value.model_dump()
268
+ elif hasattr(value, "__dict__"):
269
+ return dict(value.__dict__)
270
+ return value
258
271
 
259
272
  def delete_record(
260
273
  self,
@@ -287,6 +300,119 @@ class AtmosphereClient:
287
300
 
288
301
  self._client.com.atproto.repo.delete_record(data=data)
289
302
 
303
def upload_blob(
    self,
    data: bytes,
    mime_type: str = "application/octet-stream",
) -> dict:
    """Upload raw bytes to the PDS and describe the stored blob.

    Args:
        data: Binary data to upload.
        mime_type: MIME type of the data. NOTE(review): this value is not
            forwarded to the underlying client call; the ``mimeType`` in the
            result is the one reported in the upload response.

    Returns:
        A blob reference dict with keys: '$type', 'ref', 'mimeType', 'size'.
        This can be embedded directly in record fields.

    Raises:
        ValueError: If not authenticated.
        atproto.exceptions.AtProtocolError: If upload fails.
    """
    self._ensure_authenticated()

    uploaded = self._client.upload_blob(data).blob

    # The reference is either an object exposing ``.link`` or something that
    # stringifies to the link.
    cid_ref = uploaded.ref
    if hasattr(cid_ref, "link"):
        link = cid_ref.link
    else:
        link = str(cid_ref)

    descriptor = {"$type": "blob", "ref": {"$link": link}}
    descriptor["mimeType"] = uploaded.mime_type
    descriptor["size"] = uploaded.size
    return descriptor
334
+
335
def get_blob(
    self,
    did: str,
    cid: str,
    *,
    timeout: float = 30.0,
) -> bytes:
    """Download a blob from a PDS.

    Resolves the PDS endpoint for ``did`` via ``_resolve_pds_endpoint`` and
    fetches the blob from that endpoint's ``com.atproto.sync.getBlob`` route.

    Args:
        did: The DID of the repository containing the blob.
        cid: The CID of the blob.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, an unresponsive PDS would block the caller indefinitely.

    Returns:
        The blob data as bytes.

    Raises:
        ValueError: If PDS endpoint cannot be resolved.
        requests.HTTPError: If the PDS answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # Resolve first: this failure path needs no HTTP client at all.
    pds_endpoint = self._resolve_pds_endpoint(did)
    if not pds_endpoint:
        raise ValueError(f"Could not resolve PDS endpoint for {did}")

    import requests

    # Drop a trailing slash so the path never becomes "//xrpc/...".
    url = f"{pds_endpoint.rstrip('/')}/xrpc/com.atproto.sync.getBlob"
    response = requests.get(url, params={"did": did, "cid": cid}, timeout=timeout)
    response.raise_for_status()
    return response.content
368
+
369
+ def _resolve_pds_endpoint(self, did: str) -> Optional[str]:
370
+ """Resolve the PDS endpoint for a DID.
371
+
372
+ Args:
373
+ did: The DID to resolve.
374
+
375
+ Returns:
376
+ The PDS service endpoint URL, or None if not found.
377
+ """
378
+ import requests
379
+
380
+ # For did:plc, query the PLC directory
381
+ if did.startswith("did:plc:"):
382
+ try:
383
+ response = requests.get(f"https://plc.directory/{did}")
384
+ response.raise_for_status()
385
+ did_doc = response.json()
386
+
387
+ for service in did_doc.get("service", []):
388
+ if service.get("type") == "AtprotoPersonalDataServer":
389
+ return service.get("serviceEndpoint")
390
+ except requests.RequestException:
391
+ return None
392
+
393
+ # For did:web, would need different resolution (not implemented)
394
+ return None
395
+
396
def get_blob_url(self, did: str, cid: str) -> str:
    """Get the direct URL for fetching a blob.

    This is useful for passing to WebDataset or other HTTP clients.

    Args:
        did: The DID of the repository containing the blob.
        cid: The CID of the blob.

    Returns:
        The full ``com.atproto.sync.getBlob`` URL on the resolved PDS, with
        ``did`` and ``cid`` as query parameters.

    Raises:
        ValueError: If PDS endpoint cannot be resolved.
    """
    from urllib.parse import urlencode

    pds_endpoint = self._resolve_pds_endpoint(did)
    if not pds_endpoint:
        raise ValueError(f"Could not resolve PDS endpoint for {did}")

    # Escape the query values so characters such as "%", "&" or "#" cannot
    # corrupt the URL. ":" stays literal, so ordinary DIDs produce the same
    # string as plain interpolation would.
    query = urlencode({"did": did, "cid": cid}, safe=":")
    # Drop a trailing slash so the path never becomes "//xrpc/...".
    return f"{pds_endpoint.rstrip('/')}/xrpc/com.atproto.sync.getBlob?{query}"
415
+
290
416
  def list_records(
291
417
  self,
292
418
  collection: str,
@@ -324,7 +450,21 @@ class AtmosphereClient:
324
450
  }
325
451
  )
326
452
 
327
- records = [r.value for r in response.records]
453
+ # Convert ATProto models to dicts if needed
454
+ records = []
455
+ for r in response.records:
456
+ value = r.value
457
+ # DotDict and similar ATProto models have to_dict()
458
+ if hasattr(value, "to_dict") and callable(value.to_dict):
459
+ records.append(value.to_dict())
460
+ elif isinstance(value, dict):
461
+ records.append(dict(value))
462
+ elif hasattr(value, "model_dump") and callable(value.model_dump):
463
+ records.append(value.model_dump())
464
+ elif hasattr(value, "__dict__"):
465
+ records.append(dict(value.__dict__))
466
+ else:
467
+ records.append(value)
328
468
  return records, response.cursor
329
469
 
330
470
  # Convenience methods for atdata collections
atdata/atmosphere/lens.py CHANGED
@@ -9,7 +9,7 @@ Note:
9
9
  implementations.
10
10
  """
11
11
 
12
- from typing import Optional, Callable
12
+ from typing import Optional
13
13
 
14
14
  from .client import AtmosphereClient
15
15
  from ._types import (
@@ -32,23 +32,25 @@ class LensPublisher:
32
32
  and point to the transformation code in a git repository.
33
33
 
34
34
  Example:
35
- >>> @atdata.lens
36
- ... def my_lens(source: SourceType) -> TargetType:
37
- ... return TargetType(field=source.other_field)
38
- >>>
39
- >>> client = AtmosphereClient()
40
- >>> client.login("handle", "password")
41
- >>>
42
- >>> publisher = LensPublisher(client)
43
- >>> uri = publisher.publish(
44
- ... name="my_lens",
45
- ... source_schema_uri="at://did:plc:abc/ac.foundation.dataset.sampleSchema/source",
46
- ... target_schema_uri="at://did:plc:abc/ac.foundation.dataset.sampleSchema/target",
47
- ... code_repository="https://github.com/user/repo",
48
- ... code_commit="abc123def456",
49
- ... getter_path="mymodule.lenses:my_lens",
50
- ... putter_path="mymodule.lenses:my_lens_putter",
51
- ... )
35
+ ::
36
+
37
+ >>> @atdata.lens
38
+ ... def my_lens(source: SourceType) -> TargetType:
39
+ ... return TargetType(field=source.other_field)
40
+ >>>
41
+ >>> client = AtmosphereClient()
42
+ >>> client.login("handle", "password")
43
+ >>>
44
+ >>> publisher = LensPublisher(client)
45
+ >>> uri = publisher.publish(
46
+ ... name="my_lens",
47
+ ... source_schema_uri="at://did:plc:abc/ac.foundation.dataset.sampleSchema/source",
48
+ ... target_schema_uri="at://did:plc:abc/ac.foundation.dataset.sampleSchema/target",
49
+ ... code_repository="https://github.com/user/repo",
50
+ ... code_commit="abc123def456",
51
+ ... getter_path="mymodule.lenses:my_lens",
52
+ ... putter_path="mymodule.lenses:my_lens_putter",
53
+ ... )
52
54
 
53
55
  Security Note:
54
56
  Lens code is stored as references to git repositories rather than
@@ -194,13 +196,15 @@ class LensLoader:
194
196
  it manually.
195
197
 
196
198
  Example:
197
- >>> client = AtmosphereClient()
198
- >>> loader = LensLoader(client)
199
- >>>
200
- >>> record = loader.get("at://did:plc:abc/ac.foundation.dataset.lens/xyz")
201
- >>> print(record["name"])
202
- >>> print(record["sourceSchema"])
203
- >>> print(record.get("getterCode", {}).get("repository"))
199
+ ::
200
+
201
+ >>> client = AtmosphereClient()
202
+ >>> loader = LensLoader(client)
203
+ >>>
204
+ >>> record = loader.get("at://did:plc:abc/ac.foundation.dataset.lens/xyz")
205
+ >>> print(record["name"])
206
+ >>> print(record["sourceSchema"])
207
+ >>> print(record.get("getterCode", {}).get("repository"))
204
208
  """
205
209
 
206
210
  def __init__(self, client: AtmosphereClient):