atdata 0.1.3b4__py3-none-any.whl → 0.2.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +3 -0
- atdata/atmosphere/__init__.py +61 -0
- atdata/atmosphere/_types.py +329 -0
- atdata/atmosphere/client.py +393 -0
- atdata/atmosphere/lens.py +280 -0
- atdata/atmosphere/records.py +342 -0
- atdata/atmosphere/schema.py +296 -0
- atdata/dataset.py +40 -169
- atdata/lens.py +2 -55
- atdata/local.py +492 -0
- {atdata-0.1.3b4.dist-info → atdata-0.2.0a1.dist-info}/METADATA +10 -1
- atdata-0.2.0a1.dist-info/RECORD +16 -0
- {atdata-0.1.3b4.dist-info → atdata-0.2.0a1.dist-info}/WHEEL +1 -1
- atdata-0.1.3b4.dist-info/RECORD +0 -9
- {atdata-0.1.3b4.dist-info → atdata-0.2.0a1.dist-info}/entry_points.txt +0 -0
- {atdata-0.1.3b4.dist-info → atdata-0.2.0a1.dist-info}/licenses/LICENSE +0 -0
atdata/__init__.py
CHANGED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""ATProto integration for distributed dataset federation.
|
|
2
|
+
|
|
3
|
+
This module provides ATProto publishing and discovery capabilities for atdata,
|
|
4
|
+
enabling a loose federation of distributed, typed datasets on the AT Protocol
|
|
5
|
+
network.
|
|
6
|
+
|
|
7
|
+
Key components:
|
|
8
|
+
|
|
9
|
+
- ``AtmosphereClient``: Authentication and session management for ATProto
|
|
10
|
+
- ``SchemaPublisher``: Publish PackableSample schemas as ATProto records
|
|
11
|
+
- ``DatasetPublisher``: Publish dataset index records with WebDataset URLs
|
|
12
|
+
- ``LensPublisher``: Publish lens transformation records
|
|
13
|
+
|
|
14
|
+
The ATProto integration is additive - existing atdata functionality continues
|
|
15
|
+
to work unchanged. These features are opt-in for users who want to publish
|
|
16
|
+
or discover datasets on the ATProto network.
|
|
17
|
+
|
|
18
|
+
Example:
|
|
19
|
+
>>> from atdata.atmosphere import AtmosphereClient, SchemaPublisher
|
|
20
|
+
>>>
|
|
21
|
+
>>> client = AtmosphereClient()
|
|
22
|
+
>>> client.login("handle.bsky.social", "app-password")
|
|
23
|
+
>>>
|
|
24
|
+
>>> publisher = SchemaPublisher(client)
|
|
25
|
+
>>> schema_uri = publisher.publish(MySampleType, version="1.0.0")
|
|
26
|
+
|
|
27
|
+
Note:
|
|
28
|
+
This module requires the ``atproto`` package to be installed::
|
|
29
|
+
|
|
30
|
+
pip install atproto
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
from .client import AtmosphereClient
|
|
34
|
+
from .schema import SchemaPublisher, SchemaLoader
|
|
35
|
+
from .records import DatasetPublisher, DatasetLoader
|
|
36
|
+
from .lens import LensPublisher, LensLoader
|
|
37
|
+
from ._types import (
|
|
38
|
+
AtUri,
|
|
39
|
+
SchemaRecord,
|
|
40
|
+
DatasetRecord,
|
|
41
|
+
LensRecord,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
# Names exported by ``from <this package> import *``. Each entry is one of the
# names bound by the relative imports earlier in this module, grouped by role.
__all__ = [
    # Client (imported from .client)
    "AtmosphereClient",
    # Schema operations (imported from .schema)
    "SchemaPublisher",
    "SchemaLoader",
    # Dataset operations (imported from .records)
    "DatasetPublisher",
    "DatasetLoader",
    # Lens operations (imported from .lens)
    "LensPublisher",
    "LensLoader",
    # Record and URI types (imported from ._types)
    "AtUri",
    "SchemaRecord",
    "DatasetRecord",
    "LensRecord",
]
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
"""Type definitions for ATProto record structures.
|
|
2
|
+
|
|
3
|
+
This module defines the data structures used to represent ATProto records
|
|
4
|
+
for schemas, datasets, and lenses. These types map to the Lexicon definitions
|
|
5
|
+
in the ``ac.foundation.dataset.*`` namespace.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from typing import Optional, Literal, Any
|
|
11
|
+
|
|
12
|
+
# Lexicon namespace for atdata records. The ``to_record`` helpers in this
# module prepend it to build each record's "$type" value
# (e.g. "<namespace>.sampleSchema", "<namespace>.record", "<namespace>.lens").
LEXICON_NAMESPACE = "ac.foundation.dataset"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class AtUri:
    """Parsed AT Protocol URI.

    AT URIs follow the format: at://<authority>/<collection>/<rkey>

    Example:
        >>> uri = AtUri.parse("at://did:plc:abc123/ac.foundation.dataset.sampleSchema/xyz")
        >>> uri.authority
        'did:plc:abc123'
        >>> uri.collection
        'ac.foundation.dataset.sampleSchema'
        >>> uri.rkey
        'xyz'
    """

    authority: str
    """The DID or handle of the repository owner."""

    collection: str
    """The NSID of the record collection."""

    rkey: str
    """The record key within the collection."""

    @classmethod
    def parse(cls, uri: str) -> "AtUri":
        """Parse an AT URI string into components.

        Args:
            uri: AT URI string in format ``at://<authority>/<collection>/<rkey>``

        Returns:
            Parsed AtUri instance.

        Raises:
            ValueError: If the URI does not start with ``at://``, has fewer
                than three path components, or has an empty authority,
                collection, or rkey.
        """
        prefix = "at://"
        if not uri.startswith(prefix):
            raise ValueError(f"Invalid AT URI: must start with 'at://': {uri}")

        # Split at most twice: anything after the collection (including any
        # further slashes) is kept verbatim as the rkey.
        parts = uri[len(prefix):].split("/", 2)

        # A bare length check would accept inputs such as "at://a//" and
        # produce components that are empty strings, so require every
        # component to be non-empty as well.
        if len(parts) < 3 or not all(parts):
            raise ValueError(f"Invalid AT URI: expected authority/collection/rkey: {uri}")

        authority, collection, rkey = parts
        return cls(authority=authority, collection=collection, rkey=rkey)

    def __str__(self) -> str:
        """Format as AT URI string."""
        return f"at://{self.authority}/{self.collection}/{self.rkey}"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@dataclass
|
|
73
|
+
class FieldType:
|
|
74
|
+
"""Schema field type definition.
|
|
75
|
+
|
|
76
|
+
Represents a type in the schema type system, supporting primitives,
|
|
77
|
+
ndarrays, and references to other schemas.
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
kind: Literal["primitive", "ndarray", "ref", "array"]
|
|
81
|
+
"""The category of type."""
|
|
82
|
+
|
|
83
|
+
primitive: Optional[str] = None
|
|
84
|
+
"""For kind='primitive': one of 'str', 'int', 'float', 'bool', 'bytes'."""
|
|
85
|
+
|
|
86
|
+
dtype: Optional[str] = None
|
|
87
|
+
"""For kind='ndarray': numpy dtype string (e.g., 'float32')."""
|
|
88
|
+
|
|
89
|
+
shape: Optional[list[int | None]] = None
|
|
90
|
+
"""For kind='ndarray': shape constraints (None for any dimension)."""
|
|
91
|
+
|
|
92
|
+
ref: Optional[str] = None
|
|
93
|
+
"""For kind='ref': AT URI of referenced schema."""
|
|
94
|
+
|
|
95
|
+
items: Optional["FieldType"] = None
|
|
96
|
+
"""For kind='array': type of array elements."""
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass
class FieldDef:
    """One named field of a schema."""

    name: str
    """Name of the field."""

    field_type: FieldType
    """The field's type."""

    optional: bool = False
    """True when the field is allowed to be None."""

    description: Optional[str] = None
    """Free-text, human-readable description of the field."""
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@dataclass
class SchemaRecord:
    """ATProto record for a PackableSample schema.

    Maps to the ``ac.foundation.dataset.sampleSchema`` Lexicon.
    """

    name: str
    """Human-readable schema name."""

    version: str
    """Semantic version string (e.g., '1.0.0')."""

    fields: list[FieldDef]
    """List of field definitions."""

    description: Optional[str] = None
    """Human-readable description."""

    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    """When this record was created."""

    metadata: Optional[dict] = None
    """Arbitrary metadata mapping; copied into the record as-is when non-empty."""

    def to_record(self) -> dict:
        """Convert to ATProto record dict for publishing.

        Returns:
            A dict with ``$type``, ``name``, ``version``, ``fields`` and
            ``createdAt`` keys, plus ``description`` and ``metadata`` when
            those attributes are set to a non-empty value.
        """
        record = {
            "$type": f"{LEXICON_NAMESPACE}.sampleSchema",
            "name": self.name,
            "version": self.version,
            "fields": [self._field_to_dict(f) for f in self.fields],
            "createdAt": self.created_at.isoformat(),
        }
        if self.description:
            record["description"] = self.description
        if self.metadata:
            record["metadata"] = self.metadata
        return record

    def _field_to_dict(self, field_def: FieldDef) -> dict:
        """Convert a field definition to dict.

        Args:
            field_def: The field definition to serialize.

        Returns:
            A dict with ``name``, ``fieldType`` and ``optional`` keys, plus
            ``description`` when the field has a non-empty one.
        """
        result = {
            "name": field_def.name,
            "fieldType": self._type_to_dict(field_def.field_type),
            "optional": field_def.optional,
        }
        if field_def.description:
            result["description"] = field_def.description
        return result

    def _type_to_dict(self, field_type: FieldType) -> dict:
        """Convert a field type to dict.

        Args:
            field_type: The type to serialize; array element types are
                serialized recursively.

        Returns:
            A dict whose ``$type`` encodes the kind, plus the attribute(s)
            relevant to that kind.
        """
        result: dict = {"$type": f"{LEXICON_NAMESPACE}.schemaType#{field_type.kind}"}

        if field_type.kind == "primitive":
            result["primitive"] = field_type.primitive
        elif field_type.kind == "ndarray":
            result["dtype"] = field_type.dtype
            # Compare against None rather than relying on truthiness: an
            # explicit empty shape ([]) is a real constraint and must not be
            # dropped as if no shape had been given.
            if field_type.shape is not None:
                result["shape"] = field_type.shape
        elif field_type.kind == "ref":
            result["ref"] = field_type.ref
        elif field_type.kind == "array":
            if field_type.items is not None:
                result["items"] = self._type_to_dict(field_type.items)

        return result
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@dataclass
class StorageLocation:
    """Describes where a dataset's data lives."""

    kind: Literal["external", "blobs"]
    """Either external URLs or ATProto blobs."""

    urls: Optional[list[str]] = None
    """Used when kind='external': WebDataset URLs with brace notation."""

    blob_refs: Optional[list[dict]] = None
    """Used when kind='blobs': ATProto blob references."""
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
@dataclass
class DatasetRecord:
    """Dataset index record for ATProto.

    Maps to the ``ac.foundation.dataset.record`` Lexicon.
    """

    name: str
    """Dataset name, for humans."""

    schema_ref: str
    """AT URI pointing at the schema record."""

    storage: StorageLocation
    """Location of the dataset's data."""

    description: Optional[str] = None
    """Free-text description."""

    tags: list[str] = field(default_factory=list)
    """Tags for search."""

    license: Optional[str] = None
    """SPDX identifier of the license."""

    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    """Creation time of this record."""

    metadata: Optional[bytes] = None
    """Arbitrary metadata, msgpack-encoded."""

    def to_record(self) -> dict:
        """Build the ATProto record dict that gets published.

        Required keys are always present; ``description``, ``tags``,
        ``license`` and ``metadata`` are added only when truthy.
        """
        record = {
            "$type": f"{LEXICON_NAMESPACE}.record",
            "name": self.name,
            "schemaRef": self.schema_ref,
            "storage": self._storage_to_dict(),
            "createdAt": self.created_at.isoformat(),
        }
        # Insertion order below fixes the order of the optional keys.
        extras = {
            "description": self.description,
            "tags": self.tags,
            "license": self.license,
            "metadata": self.metadata,
        }
        record.update({key: value for key, value in extras.items() if value})
        return record

    def _storage_to_dict(self) -> dict:
        """Serialize ``self.storage``; any kind other than 'external' is emitted as blobs."""
        location = self.storage
        if location.kind != "external":
            return {
                "$type": f"{LEXICON_NAMESPACE}.storageBlobs",
                "blobs": location.blob_refs or [],
            }
        return {
            "$type": f"{LEXICON_NAMESPACE}.storageExternal",
            "urls": location.urls or [],
        }
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
@dataclass
class CodeReference:
    """Points at lens code stored in a git repository."""

    repository: str
    """URL of the git repository."""

    commit: str
    """Hash of the git commit."""

    path: str
    """Path to the code file/function."""
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
@dataclass
class LensRecord:
    """Lens transformation record for ATProto.

    Maps to the ``ac.foundation.dataset.lens`` Lexicon.
    """

    name: str
    """Lens name, for humans."""

    source_schema: str
    """AT URI of the schema the lens reads from."""

    target_schema: str
    """AT URI of the schema the lens produces."""

    description: Optional[str] = None
    """Explanation of the transformation."""

    getter_code: Optional[CodeReference] = None
    """Code reference for the getter function."""

    putter_code: Optional[CodeReference] = None
    """Code reference for the putter function."""

    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    """Creation time of this record."""

    def to_record(self) -> dict:
        """Build the ATProto record dict that gets published.

        ``description``, ``getterCode`` and ``putterCode`` are included only
        when the corresponding attribute is truthy.
        """
        record: dict[str, Any] = {
            "$type": f"{LEXICON_NAMESPACE}.lens",
            "name": self.name,
            "sourceSchema": self.source_schema,
            "targetSchema": self.target_schema,
            "createdAt": self.created_at.isoformat(),
        }
        if self.description:
            record["description"] = self.description

        # Getter first, then putter, so key order in the record is stable.
        code_refs = (
            ("getterCode", self.getter_code),
            ("putterCode", self.putter_code),
        )
        for key, code in code_refs:
            if code:
                record[key] = {
                    "repository": code.repository,
                    "commit": code.commit,
                    "path": code.path,
                }
        return record
|