atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- atdata/__init__.py +11 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +46 -1
- atdata/_logging.py +43 -0
- atdata/_protocols.py +81 -182
- atdata/_schema_codec.py +2 -2
- atdata/_sources.py +24 -4
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +60 -21
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +73 -245
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +60 -53
- atdata/atmosphere/records.py +291 -100
- atdata/atmosphere/schema.py +91 -65
- atdata/atmosphere/store.py +68 -66
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +266 -47
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_entry.py +6 -2
- atdata/{local → index}/_index.py +617 -72
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +127 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
- atdata/lexicons/ac.foundation.dataset.record.json +117 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/promote.py +14 -10
- atdata/repository.py +66 -16
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +131 -0
- atdata/{local → stores}/_s3.py +134 -112
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
- atdata-0.3.2b1.dist-info/RECORD +71 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/atmosphere/_lexicon_types.py (new file)
@@ -0,0 +1,595 @@
"""Lexicon-mirror types for the ``ac.foundation.dataset`` namespace.

These dataclasses map 1:1 to the ATProto Lexicon JSON definitions. They are
the canonical Python representation for serializing to and deserializing from
ATProto record dicts. Each class provides ``to_record()`` and ``from_record()``
for round-trip conversion.

Internal/local types (used outside the atmosphere context) live in
``atdata.index._schema`` and ``atdata.index._entry``.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any

LEXICON_NAMESPACE = "ac.foundation.dataset"


# ---------------------------------------------------------------------------
# Shared definitions
# ---------------------------------------------------------------------------

@dataclass
class ShardChecksum:
    """Content hash for shard integrity verification.

    Mirrors ``ac.foundation.dataset.record#shardChecksum``.
    """

    algorithm: str
    """Hash algorithm identifier (e.g., 'sha256', 'blake3')."""

    digest: str
    """Hex-encoded hash digest."""

    def to_record(self) -> dict[str, str]:
        """Serialize to ATProto record dict."""
        return {"algorithm": self.algorithm, "digest": self.digest}

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> ShardChecksum:
        """Deserialize from ATProto record dict."""
        return cls(algorithm=d["algorithm"], digest=d["digest"])


@dataclass
class DatasetSize:
    """Dataset size metadata.

    Mirrors ``ac.foundation.dataset.record#datasetSize``.
    """

    samples: int | None = None
    bytes_: int | None = None
    shards: int | None = None

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        d: dict[str, Any] = {}
        if self.samples is not None:
            d["samples"] = self.samples
        if self.bytes_ is not None:
            d["bytes"] = self.bytes_
        if self.shards is not None:
            d["shards"] = self.shards
        return d

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> DatasetSize:
        """Deserialize from ATProto record dict."""
        return cls(
            samples=d.get("samples"),
            bytes_=d.get("bytes"),
            shards=d.get("shards"),
        )

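For orientation, a minimal round-trip sketch of the shared definitions above: `to_record()` omits unset optional fields, and `from_record()` restores them as `None`. All values are invented for illustration, not data from the package.

```python
from atdata.atmosphere._lexicon_types import DatasetSize, ShardChecksum

# Round-trip sketch; the digest and counts are placeholder values.
checksum = ShardChecksum(algorithm="sha256", digest="ab" * 32)
assert ShardChecksum.from_record(checksum.to_record()) == checksum

size = DatasetSize(samples=10_000, shards=4)    # bytes_ left unset
record = size.to_record()
assert "bytes" not in record                    # unset fields are omitted...
assert DatasetSize.from_record(record) == size  # ...and come back as None
```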
# ---------------------------------------------------------------------------
# Storage types
# ---------------------------------------------------------------------------


@dataclass
class HttpShardEntry:
    """A single HTTP-accessible shard with integrity checksum.

    Mirrors ``ac.foundation.dataset.storageHttp#shardEntry``.
    """

    url: str
    checksum: ShardChecksum

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        return {"url": self.url, "checksum": self.checksum.to_record()}

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> HttpShardEntry:
        """Deserialize from ATProto record dict."""
        return cls(
            url=d["url"],
            checksum=ShardChecksum.from_record(d["checksum"]),
        )


@dataclass
class StorageHttp:
    """HTTP/HTTPS storage for WebDataset tar archives.

    Mirrors ``ac.foundation.dataset.storageHttp``.
    """

    shards: list[HttpShardEntry]

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        return {
            "$type": f"{LEXICON_NAMESPACE}.storageHttp",
            "shards": [s.to_record() for s in self.shards],
        }

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> StorageHttp:
        """Deserialize from ATProto record dict."""
        return cls(
            shards=[HttpShardEntry.from_record(s) for s in d["shards"]],
        )


@dataclass
class S3ShardEntry:
    """A single S3 object shard with integrity checksum.

    Mirrors ``ac.foundation.dataset.storageS3#shardEntry``.
    """

    key: str
    checksum: ShardChecksum

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        return {"key": self.key, "checksum": self.checksum.to_record()}

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> S3ShardEntry:
        """Deserialize from ATProto record dict."""
        return cls(
            key=d["key"],
            checksum=ShardChecksum.from_record(d["checksum"]),
        )


@dataclass
class StorageS3:
    """S3/S3-compatible storage for WebDataset tar archives.

    Mirrors ``ac.foundation.dataset.storageS3``.
    """

    bucket: str
    shards: list[S3ShardEntry]
    region: str | None = None
    endpoint: str | None = None

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        d: dict[str, Any] = {
            "$type": f"{LEXICON_NAMESPACE}.storageS3",
            "bucket": self.bucket,
            "shards": [s.to_record() for s in self.shards],
        }
        if self.region is not None:
            d["region"] = self.region
        if self.endpoint is not None:
            d["endpoint"] = self.endpoint
        return d

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> StorageS3:
        """Deserialize from ATProto record dict."""
        return cls(
            bucket=d["bucket"],
            shards=[S3ShardEntry.from_record(s) for s in d["shards"]],
            region=d.get("region"),
            endpoint=d.get("endpoint"),
        )


@dataclass
class BlobEntry:
    """A single PDS blob shard with optional integrity checksum.

    Mirrors ``ac.foundation.dataset.storageBlobs#blobEntry``.
    """

    blob: dict[str, Any]
    """ATProto blob reference dict."""

    checksum: ShardChecksum | None = None

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        d: dict[str, Any] = {"blob": self.blob}
        if self.checksum is not None:
            d["checksum"] = self.checksum.to_record()
        return d

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> BlobEntry:
        """Deserialize from ATProto record dict."""
        checksum = None
        if "checksum" in d:
            checksum = ShardChecksum.from_record(d["checksum"])
        return cls(blob=d["blob"], checksum=checksum)


@dataclass
class StorageBlobs:
    """ATProto PDS blob storage for WebDataset tar archives.

    Mirrors ``ac.foundation.dataset.storageBlobs``.
    """

    blobs: list[BlobEntry]

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        return {
            "$type": f"{LEXICON_NAMESPACE}.storageBlobs",
            "blobs": [b.to_record() for b in self.blobs],
        }

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> StorageBlobs:
        """Deserialize from ATProto record dict."""
        return cls(
            blobs=[BlobEntry.from_record(b) for b in d["blobs"]],
        )


StorageUnion = StorageHttp | StorageS3 | StorageBlobs
"""Union of all storage types for dataset records."""


_STORAGE_TYPE_MAP: dict[str, type[StorageHttp | StorageS3 | StorageBlobs]] = {
    f"{LEXICON_NAMESPACE}.storageHttp": StorageHttp,
    f"{LEXICON_NAMESPACE}.storageS3": StorageS3,
    f"{LEXICON_NAMESPACE}.storageBlobs": StorageBlobs,
}


def storage_from_record(d: dict[str, Any]) -> StorageUnion:
    """Deserialize a storage union variant from an ATProto record dict.

    Args:
        d: Storage dict with ``$type`` discriminator.

    Returns:
        The appropriate storage type instance.

    Raises:
        ValueError: If the ``$type`` is not recognized.
    """
    type_id = d.get("$type", "")
    # Exact match first
    if type_id in _STORAGE_TYPE_MAP:
        return _STORAGE_TYPE_MAP[type_id].from_record(d)
    # Legacy: storageExternal → treat as HTTP (without checksums)
    if "storageExternal" in type_id:
        urls = d.get("urls", [])
        shards = [
            HttpShardEntry(url=url, checksum=ShardChecksum("none", "")) for url in urls
        ]
        return StorageHttp(shards=shards)
    raise ValueError(f"Unknown storage type: {type_id!r}")

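The `$type` discriminator drives dispatch in `storage_from_record`, with a fallback for the legacy `storageExternal` shape. An illustrative sketch; the bucket, key, URL, and digest values are hypothetical:

```python
from atdata.atmosphere._lexicon_types import (
    S3ShardEntry, ShardChecksum, StorageHttp, StorageS3, storage_from_record,
)

# "$type" selects the union variant on the way back in.
s3 = StorageS3(
    bucket="example-bucket",
    shards=[S3ShardEntry(key="shard-000000.tar",
                         checksum=ShardChecksum("sha256", "00" * 32))],
    region="us-east-1",
)
assert isinstance(storage_from_record(s3.to_record()), StorageS3)

# Legacy storageExternal records degrade to StorageHttp, with a
# placeholder "none" checksum on every shard.
legacy = {
    "$type": "ac.foundation.dataset.storageExternal",
    "urls": ["https://example.com/shard-000000.tar"],
}
http = storage_from_record(legacy)
assert isinstance(http, StorageHttp)
assert http.shards[0].checksum.algorithm == "none"
```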
# ---------------------------------------------------------------------------
# Code references (lens)
# ---------------------------------------------------------------------------


@dataclass
class LexCodeReference:
    """Reference to code in an external repository.

    Mirrors ``ac.foundation.dataset.lens#codeReference``.
    All fields are required per the lexicon.
    """

    repository: str
    """Repository URL."""

    commit: str
    """Git commit hash (ensures immutability)."""

    path: str
    """Path to function within repository."""

    branch: str | None = None
    """Optional branch name (commit hash is authoritative)."""

    def to_record(self) -> dict[str, str]:
        """Serialize to ATProto record dict."""
        d: dict[str, str] = {
            "repository": self.repository,
            "commit": self.commit,
            "path": self.path,
        }
        if self.branch is not None:
            d["branch"] = self.branch
        return d

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> LexCodeReference:
        """Deserialize from ATProto record dict."""
        return cls(
            repository=d["repository"],
            commit=d["commit"],
            path=d["path"],
            branch=d.get("branch"),
        )

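A code reference pins a function to an exact commit, so the referenced code cannot change under the record. A small sketch; the repository URL, commit hash, and path are invented for illustration:

```python
from atdata.atmosphere._lexicon_types import LexCodeReference

ref = LexCodeReference(
    repository="https://github.com/example/lenses",
    commit="0123456789abcdef0123456789abcdef01234567",
    path="lenses/image.py",
    branch="main",  # informational only; the commit hash is authoritative
)
assert LexCodeReference.from_record(ref.to_record()) == ref
```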
# ---------------------------------------------------------------------------
# Schema record
# ---------------------------------------------------------------------------


@dataclass
class JsonSchemaFormat:
    """JSON Schema Draft 7 format for sample type definitions.

    Mirrors ``ac.foundation.dataset.schema#jsonSchemaFormat``.
    """

    schema_body: dict[str, Any]
    """The JSON Schema object (with $schema, type, properties keys)."""

    array_format_versions: dict[str, str] | None = None
    """Mapping from array format identifiers to semver strings."""

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        d: dict[str, Any] = {
            "$type": f"{LEXICON_NAMESPACE}.schema#jsonSchemaFormat",
        }
        # Merge the schema body keys directly into the record
        d.update(self.schema_body)
        if self.array_format_versions:
            d["arrayFormatVersions"] = self.array_format_versions
        return d

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> JsonSchemaFormat:
        """Deserialize from ATProto record dict."""
        afv = d.get("arrayFormatVersions")
        # Extract schema body: everything except $type and arrayFormatVersions
        body = {k: v for k, v in d.items() if k not in ("$type", "arrayFormatVersions")}
        return cls(schema_body=body, array_format_versions=afv)


@dataclass
class LexSchemaRecord:
    """Versioned sample type definition.

    Mirrors ``ac.foundation.dataset.schema`` (main record).
    """

    name: str
    """Human-readable display name."""

    version: str
    """Semantic version string (e.g., '1.0.0')."""

    schema_type: str
    """Schema format identifier (e.g., 'jsonSchema')."""

    schema: JsonSchemaFormat
    """Schema definition (currently only jsonSchemaFormat)."""

    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    """Timestamp when this schema version was created."""

    description: str | None = None
    """Human-readable description."""

    metadata: dict[str, Any] | None = None
    """Optional metadata (license, tags, etc.)."""

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        d: dict[str, Any] = {
            "$type": f"{LEXICON_NAMESPACE}.schema",
            "name": self.name,
            "version": self.version,
            "schemaType": self.schema_type,
            "schema": self.schema.to_record(),
            "createdAt": self.created_at.isoformat(),
        }
        if self.description is not None:
            d["description"] = self.description
        if self.metadata is not None:
            d["metadata"] = self.metadata
        return d

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> LexSchemaRecord:
        """Deserialize from ATProto record dict."""
        return cls(
            name=d["name"],
            version=d["version"],
            schema_type=d["schemaType"],
            schema=JsonSchemaFormat.from_record(d["schema"]),
            created_at=datetime.fromisoformat(d["createdAt"]),
            description=d.get("description"),
            metadata=d.get("metadata"),
        )

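Note the asymmetry in `JsonSchemaFormat`: `to_record()` flattens the schema body into the record next to `$type`, and `from_record()` peels `$type` and `arrayFormatVersions` back out. A round-trip sketch with a toy schema body (all values invented):

```python
from atdata.atmosphere._lexicon_types import JsonSchemaFormat, LexSchemaRecord

fmt = JsonSchemaFormat(
    schema_body={
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {"label": {"type": "string"}},
    },
    array_format_versions={"numpy": "1.0.0"},  # hypothetical format id
)
rec = LexSchemaRecord(
    name="labels", version="1.0.0", schema_type="jsonSchema", schema=fmt,
)
# The schema body keys sit directly inside d["schema"] alongside "$type",
# yet the parsed result still matches the original format object.
d = rec.to_record()
assert LexSchemaRecord.from_record(d).schema == fmt
```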
# ---------------------------------------------------------------------------
# Dataset record
# ---------------------------------------------------------------------------


@dataclass
class LexDatasetRecord:
    """Dataset index record pointing to WebDataset storage.

    Mirrors ``ac.foundation.dataset.record`` (main record).
    """

    name: str
    """Human-readable dataset name."""

    schema_ref: str
    """AT-URI reference to the schema record."""

    storage: StorageUnion
    """Storage location for dataset shards."""

    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    """Timestamp when this record was created."""

    description: str | None = None
    """Human-readable description."""

    metadata: bytes | None = None
    """Msgpack-encoded metadata dict."""

    tags: list[str] | None = None
    """Searchable tags for discovery."""

    size: DatasetSize | None = None
    """Dataset size information."""

    license: str | None = None
    """SPDX license identifier or URL."""

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        d: dict[str, Any] = {
            "$type": f"{LEXICON_NAMESPACE}.record",
            "name": self.name,
            "schemaRef": self.schema_ref,
            "storage": self.storage.to_record(),
            "createdAt": self.created_at.isoformat(),
        }
        if self.description is not None:
            d["description"] = self.description
        if self.metadata is not None:
            d["metadata"] = self.metadata
        if self.tags:
            d["tags"] = self.tags
        if self.size is not None:
            d["size"] = self.size.to_record()
        if self.license is not None:
            d["license"] = self.license
        return d

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> LexDatasetRecord:
        """Deserialize from ATProto record dict."""
        size = None
        if "size" in d:
            size = DatasetSize.from_record(d["size"])
        return cls(
            name=d["name"],
            schema_ref=d["schemaRef"],
            storage=storage_from_record(d["storage"]),
            created_at=datetime.fromisoformat(d["createdAt"]),
            description=d.get("description"),
            metadata=d.get("metadata"),
            tags=d.get("tags"),
            size=size,
            license=d.get("license"),
        )

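A dataset record ties a name and schema reference to a storage variant, and `from_record()` re-dispatches the nested storage dict through `storage_from_record`. An illustrative sketch; the AT-URI, shard URL, and digest are invented:

```python
from atdata.atmosphere._lexicon_types import (
    DatasetSize, HttpShardEntry, LexDatasetRecord, ShardChecksum, StorageHttp,
)

ds = LexDatasetRecord(
    name="my-dataset",
    schema_ref="at://did:plc:example/ac.foundation.dataset.schema/labels",
    storage=StorageHttp(shards=[HttpShardEntry(
        url="https://example.com/shard-000000.tar",
        checksum=ShardChecksum("sha256", "00" * 32),
    )]),
    size=DatasetSize(samples=1_000, shards=1),
    license="CC0-1.0",
)
restored = LexDatasetRecord.from_record(ds.to_record())
assert isinstance(restored.storage, StorageHttp)  # re-dispatched via "$type"
```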
# ---------------------------------------------------------------------------
# Lens record
# ---------------------------------------------------------------------------


@dataclass
class LexLensRecord:
    """Bidirectional transformation between two sample types.

    Mirrors ``ac.foundation.dataset.lens`` (main record).
    ``getter_code`` and ``putter_code`` are required per the lexicon.
    """

    name: str
    """Human-readable lens name."""

    source_schema: str
    """AT-URI reference to source schema."""

    target_schema: str
    """AT-URI reference to target schema."""

    getter_code: LexCodeReference
    """Code reference for getter function (Source -> Target)."""

    putter_code: LexCodeReference
    """Code reference for putter function (Target, Source -> Source)."""

    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    """Timestamp when this lens was created."""

    description: str | None = None
    """What this transformation does."""

    language: str | None = None
    """Programming language (e.g., 'python')."""

    metadata: dict[str, Any] | None = None
    """Arbitrary metadata."""

    def to_record(self) -> dict[str, Any]:
        """Serialize to ATProto record dict."""
        d: dict[str, Any] = {
            "$type": f"{LEXICON_NAMESPACE}.lens",
            "name": self.name,
            "sourceSchema": self.source_schema,
            "targetSchema": self.target_schema,
            "getterCode": self.getter_code.to_record(),
            "putterCode": self.putter_code.to_record(),
            "createdAt": self.created_at.isoformat(),
        }
        if self.description is not None:
            d["description"] = self.description
        if self.language is not None:
            d["language"] = self.language
        if self.metadata is not None:
            d["metadata"] = self.metadata
        return d

    @classmethod
    def from_record(cls, d: dict[str, Any]) -> LexLensRecord:
        """Deserialize from ATProto record dict."""
        return cls(
            name=d["name"],
            source_schema=d["sourceSchema"],
            target_schema=d["targetSchema"],
            getter_code=LexCodeReference.from_record(d["getterCode"]),
            putter_code=LexCodeReference.from_record(d["putterCode"]),
            created_at=datetime.fromisoformat(d["createdAt"]),
            description=d.get("description"),
            language=d.get("language"),
            metadata=d.get("metadata"),
        )


__all__ = [
    "LEXICON_NAMESPACE",
    "ShardChecksum",
    "DatasetSize",
    "HttpShardEntry",
    "StorageHttp",
    "S3ShardEntry",
    "StorageS3",
    "BlobEntry",
    "StorageBlobs",
    "StorageUnion",
    "storage_from_record",
    "LexCodeReference",
    "JsonSchemaFormat",
    "LexSchemaRecord",
    "LexDatasetRecord",
    "LexLensRecord",
]
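Finally, a lens record combines two schema references with pinned getter and putter code. A sketch tying the pieces together; every URI, repository, and commit below is hypothetical:

```python
from atdata.atmosphere._lexicon_types import LexCodeReference, LexLensRecord

_COMMIT = "0123456789abcdef0123456789abcdef01234567"  # placeholder hash

lens = LexLensRecord(
    name="rgb-to-gray",
    source_schema="at://did:plc:example/ac.foundation.dataset.schema/rgb",
    target_schema="at://did:plc:example/ac.foundation.dataset.schema/gray",
    getter_code=LexCodeReference(
        repository="https://github.com/example/lenses",
        commit=_COMMIT, path="lenses/image.py",
    ),
    putter_code=LexCodeReference(
        repository="https://github.com/example/lenses",
        commit=_COMMIT, path="lenses/image.py",
    ),
    language="python",
)
# createdAt survives isoformat()/fromisoformat(), so the full record
# round-trips to an equal dataclass.
assert LexLensRecord.from_record(lens.to_record()) == lens
```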