atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +11 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +46 -1
- atdata/_logging.py +43 -0
- atdata/_protocols.py +81 -182
- atdata/_schema_codec.py +2 -2
- atdata/_sources.py +24 -4
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +60 -21
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +73 -245
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +60 -53
- atdata/atmosphere/records.py +291 -100
- atdata/atmosphere/schema.py +91 -65
- atdata/atmosphere/store.py +68 -66
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +266 -47
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_entry.py +6 -2
- atdata/{local → index}/_index.py +617 -72
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +127 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
- atdata/lexicons/ac.foundation.dataset.record.json +117 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/promote.py +14 -10
- atdata/repository.py +66 -16
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +131 -0
- atdata/{local → stores}/_s3.py +134 -112
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
- atdata-0.3.2b1.dist-info/RECORD +71 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
|
@@ -26,7 +26,7 @@ from typing import (
|
|
|
26
26
|
T = TypeVar("T", bound=Packable)
|
|
27
27
|
|
|
28
28
|
# URI scheme prefixes
|
|
29
|
-
_ATDATA_URI_PREFIX = "atdata://local/
|
|
29
|
+
_ATDATA_URI_PREFIX = "atdata://local/schema/"
|
|
30
30
|
_LEGACY_URI_PREFIX = "local://schemas/"
|
|
31
31
|
|
|
32
32
|
|
|
@@ -37,7 +37,7 @@ class SchemaNamespace:
|
|
|
37
37
|
Supports attribute access, iteration, ``len()``, and ``in`` checks.
|
|
38
38
|
|
|
39
39
|
Examples:
|
|
40
|
-
>>> index.load_schema("atdata://local/
|
|
40
|
+
>>> index.load_schema("atdata://local/schema/MySample@1.0.0")
|
|
41
41
|
>>> MyType = index.types.MySample
|
|
42
42
|
>>> sample = MyType(field1="hello", field2=42)
|
|
43
43
|
|
|
@@ -207,7 +207,7 @@ class LocalSchemaRecord:
|
|
|
207
207
|
"""List of field definitions."""
|
|
208
208
|
|
|
209
209
|
ref: str
|
|
210
|
-
"""Schema reference URI (atdata://local/
|
|
210
|
+
"""Schema reference URI (atdata://local/schema/{name}@{version})."""
|
|
211
211
|
|
|
212
212
|
description: Optional[str] = None
|
|
213
213
|
"""Human-readable description."""
|
|
@@ -259,7 +259,7 @@ def _kind_str_for_sample_type(st: Type[Packable]) -> str:
|
|
|
259
259
|
|
|
260
260
|
|
|
261
261
|
def _schema_ref_from_type(sample_type: Type[Packable], version: str) -> str:
|
|
262
|
-
"""Generate 'atdata://local/
|
|
262
|
+
"""Generate 'atdata://local/schema/{name}@{version}' reference."""
|
|
263
263
|
return _make_schema_ref(sample_type.__name__, version)
|
|
264
264
|
|
|
265
265
|
|
|
@@ -271,7 +271,7 @@ def _make_schema_ref(name: str, version: str) -> str:
|
|
|
271
271
|
def _parse_schema_ref(ref: str) -> tuple[str, str]:
|
|
272
272
|
"""Parse schema reference into (name, version).
|
|
273
273
|
|
|
274
|
-
Supports both new format: 'atdata://local/
|
|
274
|
+
Supports both new format: 'atdata://local/schema/{name}@{version}'
|
|
275
275
|
and legacy format: 'local://schemas/{module.Class}@{version}'
|
|
276
276
|
"""
|
|
277
277
|
if ref.startswith(_ATDATA_URI_PREFIX):
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""ATProto Lexicon definitions for the atdata federation.
|
|
2
|
+
|
|
3
|
+
This package contains the canonical Lexicon JSON files for the
|
|
4
|
+
``ac.foundation.dataset`` namespace. These define the ATProto record
|
|
5
|
+
types used by atdata for publishing schemas, datasets, and lenses
|
|
6
|
+
to the AT Protocol network.
|
|
7
|
+
|
|
8
|
+
Lexicons:
|
|
9
|
+
ac.foundation.dataset.schema
|
|
10
|
+
Versioned sample type definitions (PackableSample schemas).
|
|
11
|
+
ac.foundation.dataset.record
|
|
12
|
+
Dataset index records pointing to WebDataset storage.
|
|
13
|
+
ac.foundation.dataset.lens
|
|
14
|
+
Bidirectional transformations between schemas.
|
|
15
|
+
ac.foundation.dataset.schemaType
|
|
16
|
+
Extensible token for schema format identifiers.
|
|
17
|
+
ac.foundation.dataset.arrayFormat
|
|
18
|
+
Extensible token for array serialization formats.
|
|
19
|
+
ac.foundation.dataset.storageHttp
|
|
20
|
+
HTTP/HTTPS URL-based storage with per-shard checksums.
|
|
21
|
+
ac.foundation.dataset.storageS3
|
|
22
|
+
S3/S3-compatible object storage with per-shard checksums.
|
|
23
|
+
ac.foundation.dataset.storageBlobs
|
|
24
|
+
ATProto PDS blob-based storage.
|
|
25
|
+
ac.foundation.dataset.storageExternal
|
|
26
|
+
(Deprecated) External URL-based storage.
|
|
27
|
+
ac.foundation.dataset.getLatestSchema
|
|
28
|
+
XRPC query for fetching the latest schema version.
|
|
29
|
+
|
|
30
|
+
The ``ndarray_shim.json`` file defines the standard NDArray type
|
|
31
|
+
for use within JSON Schema definitions.
|
|
32
|
+
|
|
33
|
+
Examples:
|
|
34
|
+
>>> from atdata.lexicons import load_lexicon
|
|
35
|
+
>>> schema_lex = load_lexicon("ac.foundation.dataset.schema")
|
|
36
|
+
>>> schema_lex["id"]
|
|
37
|
+
'ac.foundation.dataset.schema'
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
import json
|
|
41
|
+
from importlib import resources
|
|
42
|
+
from functools import lru_cache
|
|
43
|
+
from typing import Any
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
NAMESPACE = "ac.foundation.dataset"
|
|
47
|
+
|
|
48
|
+
LEXICON_IDS = (
|
|
49
|
+
f"{NAMESPACE}.schema",
|
|
50
|
+
f"{NAMESPACE}.record",
|
|
51
|
+
f"{NAMESPACE}.lens",
|
|
52
|
+
f"{NAMESPACE}.schemaType",
|
|
53
|
+
f"{NAMESPACE}.arrayFormat",
|
|
54
|
+
f"{NAMESPACE}.storageHttp",
|
|
55
|
+
f"{NAMESPACE}.storageS3",
|
|
56
|
+
f"{NAMESPACE}.storageBlobs",
|
|
57
|
+
f"{NAMESPACE}.storageExternal", # deprecated
|
|
58
|
+
f"{NAMESPACE}.getLatestSchema",
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@lru_cache(maxsize=16)
|
|
63
|
+
def load_lexicon(lexicon_id: str) -> dict[str, Any]:
|
|
64
|
+
"""Load a lexicon definition by its NSID.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
lexicon_id: The lexicon NSID, e.g. ``"ac.foundation.dataset.schema"``.
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
Parsed JSON dictionary containing the lexicon definition.
|
|
71
|
+
|
|
72
|
+
Raises:
|
|
73
|
+
FileNotFoundError: If no lexicon file exists for the given ID.
|
|
74
|
+
|
|
75
|
+
Examples:
|
|
76
|
+
>>> lex = load_lexicon("ac.foundation.dataset.schema")
|
|
77
|
+
>>> lex["defs"]["main"]["type"]
|
|
78
|
+
'record'
|
|
79
|
+
"""
|
|
80
|
+
filename = f"{lexicon_id}.json"
|
|
81
|
+
ref = resources.files(__package__).joinpath(filename)
|
|
82
|
+
try:
|
|
83
|
+
text = ref.read_text(encoding="utf-8")
|
|
84
|
+
except FileNotFoundError:
|
|
85
|
+
raise FileNotFoundError(
|
|
86
|
+
f"No lexicon file found for '{lexicon_id}'. "
|
|
87
|
+
f"Expected {filename} in {__package__}."
|
|
88
|
+
) from None
|
|
89
|
+
return json.loads(text)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@lru_cache(maxsize=1)
|
|
93
|
+
def load_ndarray_shim() -> dict[str, Any]:
|
|
94
|
+
"""Load the NDArray JSON Schema shim definition.
|
|
95
|
+
|
|
96
|
+
Returns:
|
|
97
|
+
Parsed JSON dictionary containing the NDArray shim schema.
|
|
98
|
+
|
|
99
|
+
Examples:
|
|
100
|
+
>>> shim = load_ndarray_shim()
|
|
101
|
+
>>> shim["$defs"]["ndarray"]["type"]
|
|
102
|
+
'string'
|
|
103
|
+
"""
|
|
104
|
+
ref = resources.files(__package__).joinpath("ndarray_shim.json")
|
|
105
|
+
return json.loads(ref.read_text(encoding="utf-8"))
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def list_lexicons() -> tuple[str, ...]:
|
|
109
|
+
"""Return the tuple of all known lexicon NSIDs.
|
|
110
|
+
|
|
111
|
+
Returns:
|
|
112
|
+
Tuple of lexicon ID strings.
|
|
113
|
+
|
|
114
|
+
Examples:
|
|
115
|
+
>>> "ac.foundation.dataset.schema" in list_lexicons()
|
|
116
|
+
True
|
|
117
|
+
"""
|
|
118
|
+
return LEXICON_IDS
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
__all__ = [
|
|
122
|
+
"NAMESPACE",
|
|
123
|
+
"LEXICON_IDS",
|
|
124
|
+
"load_lexicon",
|
|
125
|
+
"load_ndarray_shim",
|
|
126
|
+
"list_lexicons",
|
|
127
|
+
]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.arrayFormat",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "string",
|
|
7
|
+
"description": "Array serialization format identifier for NDArray fields in sample schemas. Known values correspond to token definitions in this Lexicon. Each format has versioned specifications maintained by foundation.ac at canonical URLs.",
|
|
8
|
+
"knownValues": ["ndarrayBytes"],
|
|
9
|
+
"maxLength": 50
|
|
10
|
+
},
|
|
11
|
+
"ndarrayBytes": {
|
|
12
|
+
"type": "token",
|
|
13
|
+
"description": "Numpy .npy binary format for NDArray serialization. Stores arrays with dtype and shape in binary header. Versions maintained at https://foundation.ac/schemas/atdata-ndarray-bytes/{version}/"
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.getLatestSchema",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "query",
|
|
7
|
+
"description": "Get the latest version of a sample schema by its permanent NSID identifier",
|
|
8
|
+
"parameters": {
|
|
9
|
+
"type": "params",
|
|
10
|
+
"required": [
|
|
11
|
+
"schemaId"
|
|
12
|
+
],
|
|
13
|
+
"properties": {
|
|
14
|
+
"schemaId": {
|
|
15
|
+
"type": "string",
|
|
16
|
+
"description": "The permanent NSID identifier for the schema (the {NSID} part of the rkey {NSID}@{semver})",
|
|
17
|
+
"maxLength": 500
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"output": {
|
|
22
|
+
"encoding": "application/json",
|
|
23
|
+
"schema": {
|
|
24
|
+
"type": "object",
|
|
25
|
+
"required": [
|
|
26
|
+
"uri",
|
|
27
|
+
"version",
|
|
28
|
+
"record"
|
|
29
|
+
],
|
|
30
|
+
"properties": {
|
|
31
|
+
"uri": {
|
|
32
|
+
"type": "string",
|
|
33
|
+
"description": "AT-URI of the latest schema version",
|
|
34
|
+
"maxLength": 500
|
|
35
|
+
},
|
|
36
|
+
"version": {
|
|
37
|
+
"type": "string",
|
|
38
|
+
"description": "Semantic version of the latest schema",
|
|
39
|
+
"maxLength": 20
|
|
40
|
+
},
|
|
41
|
+
"record": {
|
|
42
|
+
"type": "ref",
|
|
43
|
+
"ref": "ac.foundation.dataset.schema",
|
|
44
|
+
"description": "The full schema record"
|
|
45
|
+
},
|
|
46
|
+
"allVersions": {
|
|
47
|
+
"type": "array",
|
|
48
|
+
"description": "All available versions (optional, sorted by semver descending)",
|
|
49
|
+
"items": {
|
|
50
|
+
"type": "object",
|
|
51
|
+
"required": [
|
|
52
|
+
"uri",
|
|
53
|
+
"version"
|
|
54
|
+
],
|
|
55
|
+
"properties": {
|
|
56
|
+
"uri": {
|
|
57
|
+
"type": "string",
|
|
58
|
+
"maxLength": 500
|
|
59
|
+
},
|
|
60
|
+
"version": {
|
|
61
|
+
"type": "string",
|
|
62
|
+
"maxLength": 20
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
},
|
|
70
|
+
"errors": [
|
|
71
|
+
{
|
|
72
|
+
"name": "SchemaNotFound",
|
|
73
|
+
"description": "No schema found with the given NSID"
|
|
74
|
+
}
|
|
75
|
+
]
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.lens",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "record",
|
|
7
|
+
"description": "Bidirectional transformation (Lens) between two sample types, with code stored in external repositories",
|
|
8
|
+
"key": "tid",
|
|
9
|
+
"record": {
|
|
10
|
+
"type": "object",
|
|
11
|
+
"required": [
|
|
12
|
+
"name",
|
|
13
|
+
"sourceSchema",
|
|
14
|
+
"targetSchema",
|
|
15
|
+
"getterCode",
|
|
16
|
+
"putterCode",
|
|
17
|
+
"createdAt"
|
|
18
|
+
],
|
|
19
|
+
"properties": {
|
|
20
|
+
"name": {
|
|
21
|
+
"type": "string",
|
|
22
|
+
"description": "Human-readable lens name",
|
|
23
|
+
"maxLength": 100
|
|
24
|
+
},
|
|
25
|
+
"sourceSchema": {
|
|
26
|
+
"type": "string",
|
|
27
|
+
"format": "at-uri",
|
|
28
|
+
"description": "AT-URI reference to source schema",
|
|
29
|
+
"maxLength": 500
|
|
30
|
+
},
|
|
31
|
+
"targetSchema": {
|
|
32
|
+
"type": "string",
|
|
33
|
+
"format": "at-uri",
|
|
34
|
+
"description": "AT-URI reference to target schema",
|
|
35
|
+
"maxLength": 500
|
|
36
|
+
},
|
|
37
|
+
"description": {
|
|
38
|
+
"type": "string",
|
|
39
|
+
"description": "What this transformation does",
|
|
40
|
+
"maxLength": 1000
|
|
41
|
+
},
|
|
42
|
+
"getterCode": {
|
|
43
|
+
"type": "ref",
|
|
44
|
+
"ref": "#codeReference",
|
|
45
|
+
"description": "Code reference for getter function (Source -> Target)"
|
|
46
|
+
},
|
|
47
|
+
"putterCode": {
|
|
48
|
+
"type": "ref",
|
|
49
|
+
"ref": "#codeReference",
|
|
50
|
+
"description": "Code reference for putter function (Target, Source -> Source)"
|
|
51
|
+
},
|
|
52
|
+
"language": {
|
|
53
|
+
"type": "string",
|
|
54
|
+
"description": "Programming language of the lens implementation (e.g., 'python', 'typescript')",
|
|
55
|
+
"maxLength": 50
|
|
56
|
+
},
|
|
57
|
+
"metadata": {
|
|
58
|
+
"type": "object",
|
|
59
|
+
"description": "Arbitrary metadata (author, performance notes, etc.)"
|
|
60
|
+
},
|
|
61
|
+
"createdAt": {
|
|
62
|
+
"type": "string",
|
|
63
|
+
"format": "datetime",
|
|
64
|
+
"description": "Timestamp when this lens was created"
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
},
|
|
69
|
+
"codeReference": {
|
|
70
|
+
"type": "object",
|
|
71
|
+
"description": "Reference to code in an external repository (GitHub, tangled.org, etc.)",
|
|
72
|
+
"required": [
|
|
73
|
+
"repository",
|
|
74
|
+
"commit",
|
|
75
|
+
"path"
|
|
76
|
+
],
|
|
77
|
+
"properties": {
|
|
78
|
+
"repository": {
|
|
79
|
+
"type": "string",
|
|
80
|
+
"description": "Repository URL (e.g., 'https://github.com/user/repo' or 'at://did/tangled.repo/...')",
|
|
81
|
+
"maxLength": 500
|
|
82
|
+
},
|
|
83
|
+
"commit": {
|
|
84
|
+
"type": "string",
|
|
85
|
+
"description": "Git commit hash (ensures immutability)",
|
|
86
|
+
"maxLength": 40
|
|
87
|
+
},
|
|
88
|
+
"path": {
|
|
89
|
+
"type": "string",
|
|
90
|
+
"description": "Path to function within repository (e.g., 'lenses/vision.py:rgb_to_grayscale')",
|
|
91
|
+
"maxLength": 500
|
|
92
|
+
},
|
|
93
|
+
"branch": {
|
|
94
|
+
"type": "string",
|
|
95
|
+
"description": "Optional branch name (for reference, commit hash is authoritative)",
|
|
96
|
+
"maxLength": 100
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.record",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "record",
|
|
7
|
+
"description": "Index record for a WebDataset-backed dataset with references to storage location and sample schema",
|
|
8
|
+
"key": "tid",
|
|
9
|
+
"record": {
|
|
10
|
+
"type": "object",
|
|
11
|
+
"required": [
|
|
12
|
+
"name",
|
|
13
|
+
"schemaRef",
|
|
14
|
+
"storage",
|
|
15
|
+
"createdAt"
|
|
16
|
+
],
|
|
17
|
+
"properties": {
|
|
18
|
+
"name": {
|
|
19
|
+
"type": "string",
|
|
20
|
+
"description": "Human-readable dataset name",
|
|
21
|
+
"maxLength": 200
|
|
22
|
+
},
|
|
23
|
+
"schemaRef": {
|
|
24
|
+
"type": "string",
|
|
25
|
+
"format": "at-uri",
|
|
26
|
+
"description": "AT-URI reference to the schema record for this dataset's samples",
|
|
27
|
+
"maxLength": 500
|
|
28
|
+
},
|
|
29
|
+
"storage": {
|
|
30
|
+
"type": "union",
|
|
31
|
+
"description": "Storage location for dataset files (WebDataset tar archives)",
|
|
32
|
+
"refs": [
|
|
33
|
+
"ac.foundation.dataset.storageHttp",
|
|
34
|
+
"ac.foundation.dataset.storageS3",
|
|
35
|
+
"ac.foundation.dataset.storageBlobs"
|
|
36
|
+
]
|
|
37
|
+
},
|
|
38
|
+
"description": {
|
|
39
|
+
"type": "string",
|
|
40
|
+
"description": "Human-readable description of the dataset",
|
|
41
|
+
"maxLength": 5000
|
|
42
|
+
},
|
|
43
|
+
"metadata": {
|
|
44
|
+
"type": "bytes",
|
|
45
|
+
"description": "Msgpack-encoded metadata dict for arbitrary extended key-value pairs. Use this for additional metadata beyond the core top-level fields (license, tags, size). Top-level fields are preferred for discoverable/searchable metadata.",
|
|
46
|
+
"maxLength": 100000
|
|
47
|
+
},
|
|
48
|
+
"tags": {
|
|
49
|
+
"type": "array",
|
|
50
|
+
"description": "Searchable tags for dataset discovery. Aligns with Schema.org keywords property.",
|
|
51
|
+
"items": {
|
|
52
|
+
"type": "string",
|
|
53
|
+
"maxLength": 150
|
|
54
|
+
},
|
|
55
|
+
"maxLength": 30
|
|
56
|
+
},
|
|
57
|
+
"size": {
|
|
58
|
+
"type": "ref",
|
|
59
|
+
"ref": "#datasetSize",
|
|
60
|
+
"description": "Dataset size information (optional)"
|
|
61
|
+
},
|
|
62
|
+
"license": {
|
|
63
|
+
"type": "string",
|
|
64
|
+
"description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
|
|
65
|
+
"maxLength": 200
|
|
66
|
+
},
|
|
67
|
+
"createdAt": {
|
|
68
|
+
"type": "string",
|
|
69
|
+
"format": "datetime",
|
|
70
|
+
"description": "Timestamp when this dataset record was created"
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
},
|
|
75
|
+
"shardChecksum": {
|
|
76
|
+
"type": "object",
|
|
77
|
+
"description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.",
|
|
78
|
+
"required": [
|
|
79
|
+
"algorithm",
|
|
80
|
+
"digest"
|
|
81
|
+
],
|
|
82
|
+
"properties": {
|
|
83
|
+
"algorithm": {
|
|
84
|
+
"type": "string",
|
|
85
|
+
"description": "Hash algorithm identifier (e.g., 'sha256', 'blake3')",
|
|
86
|
+
"maxLength": 20
|
|
87
|
+
},
|
|
88
|
+
"digest": {
|
|
89
|
+
"type": "string",
|
|
90
|
+
"description": "Hex-encoded hash digest",
|
|
91
|
+
"maxLength": 128
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
},
|
|
95
|
+
"datasetSize": {
|
|
96
|
+
"type": "object",
|
|
97
|
+
"description": "Information about dataset size",
|
|
98
|
+
"properties": {
|
|
99
|
+
"samples": {
|
|
100
|
+
"type": "integer",
|
|
101
|
+
"description": "Total number of samples in the dataset",
|
|
102
|
+
"minimum": 0
|
|
103
|
+
},
|
|
104
|
+
"bytes": {
|
|
105
|
+
"type": "integer",
|
|
106
|
+
"description": "Total size in bytes",
|
|
107
|
+
"minimum": 0
|
|
108
|
+
},
|
|
109
|
+
"shards": {
|
|
110
|
+
"type": "integer",
|
|
111
|
+
"description": "Number of WebDataset shards",
|
|
112
|
+
"minimum": 1
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.schema",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "record",
|
|
7
|
+
"description": "Definition of a PackableSample-compatible sample type. Supports versioning via rkey format: {NSID}@{semver}. Schema format is extensible via union type.",
|
|
8
|
+
"key": "any",
|
|
9
|
+
"record": {
|
|
10
|
+
"type": "object",
|
|
11
|
+
"required": [
|
|
12
|
+
"name",
|
|
13
|
+
"version",
|
|
14
|
+
"schemaType",
|
|
15
|
+
"schema",
|
|
16
|
+
"createdAt"
|
|
17
|
+
],
|
|
18
|
+
"properties": {
|
|
19
|
+
"name": {
|
|
20
|
+
"type": "string",
|
|
21
|
+
"description": "Human-readable display name for this sample type. Used for documentation and UI. The NSID in the record URI provides unique identification; name collisions across NSIDs are acceptable.",
|
|
22
|
+
"maxLength": 100
|
|
23
|
+
},
|
|
24
|
+
"version": {
|
|
25
|
+
"type": "string",
|
|
26
|
+
"description": "Semantic version (e.g., '1.0.0')",
|
|
27
|
+
"pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$",
|
|
28
|
+
"maxLength": 100
|
|
29
|
+
},
|
|
30
|
+
"schemaType": {
|
|
31
|
+
"type": "ref",
|
|
32
|
+
"ref": "ac.foundation.dataset.schemaType",
|
|
33
|
+
"description": "Type of schema definition. This field indicates which union member is present in the schema field."
|
|
34
|
+
},
|
|
35
|
+
"schema": {
|
|
36
|
+
"type": "union",
|
|
37
|
+
"refs": ["ac.foundation.dataset.schema#jsonSchemaFormat"],
|
|
38
|
+
"closed": false,
|
|
39
|
+
"description": "Schema definition for this sample type. Currently supports JSON Schema Draft 7. Union allows for future schema formats (Avro, Protobuf, etc.) without breaking changes."
|
|
40
|
+
},
|
|
41
|
+
"description": {
|
|
42
|
+
"type": "string",
|
|
43
|
+
"description": "Human-readable description of what this sample type represents",
|
|
44
|
+
"maxLength": 5000
|
|
45
|
+
},
|
|
46
|
+
"metadata": {
|
|
47
|
+
"type": "object",
|
|
48
|
+
"description": "Optional metadata about this schema. Common fields include license and tags, but any additional fields are permitted.",
|
|
49
|
+
"maxProperties": 50,
|
|
50
|
+
"properties": {
|
|
51
|
+
"license": {
|
|
52
|
+
"type": "string",
|
|
53
|
+
"description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
|
|
54
|
+
"maxLength": 200
|
|
55
|
+
},
|
|
56
|
+
"tags": {
|
|
57
|
+
"type": "array",
|
|
58
|
+
"description": "Categorization keywords for discovery. Aligns with Schema.org keywords property.",
|
|
59
|
+
"items": {
|
|
60
|
+
"type": "string",
|
|
61
|
+
"maxLength": 150
|
|
62
|
+
},
|
|
63
|
+
"maxLength": 30
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
"createdAt": {
|
|
68
|
+
"type": "string",
|
|
69
|
+
"format": "datetime",
|
|
70
|
+
"description": "Timestamp when this schema version was created. Immutable once set (ATProto records are permanent)."
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
},
|
|
75
|
+
"jsonSchemaFormat": {
|
|
76
|
+
"type": "object",
|
|
77
|
+
"description": "JSON Schema Draft 7 format for sample type definitions. Used with NDArray shim for array types.",
|
|
78
|
+
"required": ["$type", "$schema", "type", "properties"],
|
|
79
|
+
"properties": {
|
|
80
|
+
"$type": {
|
|
81
|
+
"type": "string",
|
|
82
|
+
"const": "ac.foundation.dataset.schema#jsonSchemaFormat"
|
|
83
|
+
},
|
|
84
|
+
"$schema": {
|
|
85
|
+
"type": "string",
|
|
86
|
+
"const": "http://json-schema.org/draft-07/schema#",
|
|
87
|
+
"description": "JSON Schema version identifier"
|
|
88
|
+
},
|
|
89
|
+
"type": {
|
|
90
|
+
"type": "string",
|
|
91
|
+
"const": "object",
|
|
92
|
+
"description": "Sample types must be objects"
|
|
93
|
+
},
|
|
94
|
+
"properties": {
|
|
95
|
+
"type": "object",
|
|
96
|
+
"description": "Field definitions for the sample type",
|
|
97
|
+
"minProperties": 1
|
|
98
|
+
},
|
|
99
|
+
"arrayFormatVersions": {
|
|
100
|
+
"type": "object",
|
|
101
|
+
"description": "Mapping from array format identifiers to semantic versions. Keys are ac.foundation.dataset.arrayFormat values (e.g., 'ndarrayBytes'), values are semver strings (e.g., '1.0.0'). Foundation.ac maintains canonical shim schemas at https://foundation.ac/schemas/atdata-{format}-bytes/{version}/.",
|
|
102
|
+
"maxProperties": 10
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.schemaType",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "string",
|
|
7
|
+
"description": "Schema type identifier for atdata sample definitions. Known values correspond to token definitions in this Lexicon. New schema types can be added as tokens without breaking changes.",
|
|
8
|
+
"knownValues": ["jsonSchema"],
|
|
9
|
+
"maxLength": 50
|
|
10
|
+
},
|
|
11
|
+
"jsonSchema": {
|
|
12
|
+
"type": "token",
|
|
13
|
+
"description": "JSON Schema Draft 7 format for sample type definitions. When schemaType is 'jsonSchema', the schema field must contain an object conforming to ac.foundation.dataset.schema#jsonSchemaFormat."
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.storageBlobs",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "object",
|
|
7
|
+
"description": "Storage via ATProto PDS blobs for WebDataset tar archives. Used in ac.foundation.dataset.record storage union for maximum decentralization.",
|
|
8
|
+
"required": [
|
|
9
|
+
"blobs"
|
|
10
|
+
],
|
|
11
|
+
"properties": {
|
|
12
|
+
"blobs": {
|
|
13
|
+
"type": "array",
|
|
14
|
+
"description": "Array of blob entries for WebDataset tar files",
|
|
15
|
+
"items": {
|
|
16
|
+
"type": "ref",
|
|
17
|
+
"ref": "#blobEntry"
|
|
18
|
+
},
|
|
19
|
+
"minLength": 1
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
"blobEntry": {
|
|
24
|
+
"type": "object",
|
|
25
|
+
"description": "A single PDS blob shard with optional integrity checksum",
|
|
26
|
+
"required": [
|
|
27
|
+
"blob"
|
|
28
|
+
],
|
|
29
|
+
"properties": {
|
|
30
|
+
"blob": {
|
|
31
|
+
"type": "blob",
|
|
32
|
+
"accept": [
|
|
33
|
+
"application/x-tar"
|
|
34
|
+
],
|
|
35
|
+
"maxSize": 52428800,
|
|
36
|
+
"description": "Blob reference to a WebDataset tar archive"
|
|
37
|
+
},
|
|
38
|
+
"checksum": {
|
|
39
|
+
"type": "ref",
|
|
40
|
+
"ref": "ac.foundation.dataset.record#shardChecksum",
|
|
41
|
+
"description": "Content hash for integrity verification (optional since PDS blobs have built-in CID integrity)"
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.storageExternal",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "object",
|
|
7
|
+
"description": "(Deprecated: use storageHttp or storageS3 instead.) External storage via URLs for WebDataset tar archives. URLs support brace notation for sharding (e.g., 'data-{000000..000099}.tar').",
|
|
8
|
+
"required": [
|
|
9
|
+
"urls"
|
|
10
|
+
],
|
|
11
|
+
"properties": {
|
|
12
|
+
"urls": {
|
|
13
|
+
"type": "array",
|
|
14
|
+
"description": "WebDataset URLs with optional brace notation for sharded tar files",
|
|
15
|
+
"items": {
|
|
16
|
+
"type": "string",
|
|
17
|
+
"format": "uri",
|
|
18
|
+
"maxLength": 1000
|
|
19
|
+
},
|
|
20
|
+
"minLength": 1
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
}
|