atdata 0.3.0b1__py3-none-any.whl → 0.3.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +9 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +33 -1
- atdata/_protocols.py +64 -182
- atdata/_schema_codec.py +2 -2
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +12 -11
- atdata/atmosphere/_types.py +4 -4
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +11 -12
- atdata/atmosphere/records.py +9 -10
- atdata/atmosphere/schema.py +14 -16
- atdata/atmosphere/store.py +6 -7
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +155 -2
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_index.py +322 -64
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +121 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
- atdata/lexicons/ac.foundation.dataset.record.json +96 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/promote.py +14 -10
- atdata/repository.py +7 -7
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +123 -0
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +2 -2
- atdata-0.3.1b1.dist-info/RECORD +67 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- /atdata/{local → index}/_entry.py +0 -0
- /atdata/{local → stores}/_s3.py +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""ATProto Lexicon definitions for the atdata federation.
|
|
2
|
+
|
|
3
|
+
This package contains the canonical Lexicon JSON files for the
|
|
4
|
+
``ac.foundation.dataset`` namespace. These define the ATProto record
|
|
5
|
+
types used by atdata for publishing schemas, datasets, and lenses
|
|
6
|
+
to the AT Protocol network.
|
|
7
|
+
|
|
8
|
+
Lexicons:
|
|
9
|
+
ac.foundation.dataset.schema
|
|
10
|
+
Versioned sample type definitions (PackableSample schemas).
|
|
11
|
+
ac.foundation.dataset.record
|
|
12
|
+
Dataset index records pointing to WebDataset storage.
|
|
13
|
+
ac.foundation.dataset.lens
|
|
14
|
+
Bidirectional transformations between schemas.
|
|
15
|
+
ac.foundation.dataset.schemaType
|
|
16
|
+
Extensible token for schema format identifiers.
|
|
17
|
+
ac.foundation.dataset.arrayFormat
|
|
18
|
+
Extensible token for array serialization formats.
|
|
19
|
+
ac.foundation.dataset.storageExternal
|
|
20
|
+
External URL-based storage (S3, HTTP, IPFS).
|
|
21
|
+
ac.foundation.dataset.storageBlobs
|
|
22
|
+
ATProto PDS blob-based storage.
|
|
23
|
+
ac.foundation.dataset.getLatestSchema
|
|
24
|
+
XRPC query for fetching the latest schema version.
|
|
25
|
+
|
|
26
|
+
The ``ndarray_shim.json`` file defines the standard NDArray type
|
|
27
|
+
for use within JSON Schema definitions.
|
|
28
|
+
|
|
29
|
+
Examples:
|
|
30
|
+
>>> from atdata.lexicons import load_lexicon
|
|
31
|
+
>>> schema_lex = load_lexicon("ac.foundation.dataset.schema")
|
|
32
|
+
>>> schema_lex["id"]
|
|
33
|
+
'ac.foundation.dataset.schema'
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
import json
|
|
37
|
+
from importlib import resources
|
|
38
|
+
from functools import lru_cache
|
|
39
|
+
from typing import Any
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Root NSID under which every lexicon bundled in this package lives.
NAMESPACE = "ac.foundation.dataset"

# Fully qualified NSIDs of the bundled lexicons, built by appending each
# short name to NAMESPACE. The order of this tuple is part of the value
# returned by ``list_lexicons``.
LEXICON_IDS = tuple(
    f"{NAMESPACE}.{suffix}"
    for suffix in (
        "schema",
        "record",
        "lens",
        "schemaType",
        "arrayFormat",
        "storageExternal",
        "storageBlobs",
        "getLatestSchema",
    )
)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@lru_cache(maxsize=16)
def load_lexicon(lexicon_id: str) -> dict[str, Any]:
    """Read and parse the bundled lexicon JSON file for an NSID.

    The file is looked up as ``<lexicon_id>.json`` inside this package.
    Results are memoized with ``lru_cache``, so repeated calls with the
    same ID return the same dictionary object; treat it as read-only.

    Args:
        lexicon_id: The lexicon NSID, e.g. ``"ac.foundation.dataset.schema"``.

    Returns:
        The lexicon definition as a parsed JSON dictionary.

    Raises:
        FileNotFoundError: If the package contains no JSON file for the
            given ID.

    Examples:
        >>> lex = load_lexicon("ac.foundation.dataset.schema")
        >>> lex["defs"]["main"]["type"]
        'record'
    """
    resource_name = f"{lexicon_id}.json"
    resource = resources.files(__package__).joinpath(resource_name)
    try:
        raw = resource.read_text(encoding="utf-8")
    except FileNotFoundError:
        # Re-raise with a message naming the expected file; ``from None``
        # hides the low-level lookup error from the traceback.
        message = (
            f"No lexicon file found for '{lexicon_id}'. "
            f"Expected {resource_name} in {__package__}."
        )
        raise FileNotFoundError(message) from None
    else:
        return json.loads(raw)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@lru_cache(maxsize=1)
def load_ndarray_shim() -> dict[str, Any]:
    """Read and parse the bundled ``ndarray_shim.json`` file.

    The result is memoized with ``lru_cache``, so every call returns the
    same dictionary object; treat it as read-only.

    Returns:
        The NDArray shim schema as a parsed JSON dictionary.

    Examples:
        >>> shim = load_ndarray_shim()
        >>> shim["$defs"]["ndarray"]["type"]
        'string'
    """
    shim_resource = resources.files(__package__).joinpath("ndarray_shim.json")
    shim_text = shim_resource.read_text(encoding="utf-8")
    return json.loads(shim_text)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def list_lexicons() -> tuple[str, ...]:
    """List the NSIDs of every lexicon known to this package.

    This is a function-style accessor for the module-level
    ``LEXICON_IDS`` constant; the very same tuple is returned.

    Returns:
        Tuple of lexicon ID strings.

    Examples:
        >>> "ac.foundation.dataset.schema" in list_lexicons()
        True
    """
    known_ids: tuple[str, ...] = LEXICON_IDS
    return known_ids
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# Names exported by ``from atdata.lexicons import *``: the two constants
# followed by the loader and listing helpers defined in this module.
__all__ = [
    "NAMESPACE",
    "LEXICON_IDS",
    "load_lexicon",
    "load_ndarray_shim",
    "list_lexicons",
]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.arrayFormat",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "string",
|
|
7
|
+
"description": "Array serialization format identifier for NDArray fields in sample schemas. Known values correspond to token definitions in this Lexicon. Each format has versioned specifications maintained by foundation.ac at canonical URLs.",
|
|
8
|
+
"knownValues": ["ndarrayBytes"],
|
|
9
|
+
"maxLength": 50
|
|
10
|
+
},
|
|
11
|
+
"ndarrayBytes": {
|
|
12
|
+
"type": "token",
|
|
13
|
+
"description": "Numpy .npy binary format for NDArray serialization. Stores arrays with dtype and shape in binary header. Versions maintained at https://foundation.ac/schemas/atdata-ndarray-bytes/{version}/"
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.getLatestSchema",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "query",
|
|
7
|
+
"description": "Get the latest version of a sample schema by its permanent NSID identifier",
|
|
8
|
+
"parameters": {
|
|
9
|
+
"type": "params",
|
|
10
|
+
"required": [
|
|
11
|
+
"schemaId"
|
|
12
|
+
],
|
|
13
|
+
"properties": {
|
|
14
|
+
"schemaId": {
|
|
15
|
+
"type": "string",
|
|
16
|
+
"description": "The permanent NSID identifier for the schema (the {NSID} part of the rkey {NSID}@{semver})",
|
|
17
|
+
"maxLength": 500
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"output": {
|
|
22
|
+
"encoding": "application/json",
|
|
23
|
+
"schema": {
|
|
24
|
+
"type": "object",
|
|
25
|
+
"required": [
|
|
26
|
+
"uri",
|
|
27
|
+
"version",
|
|
28
|
+
"record"
|
|
29
|
+
],
|
|
30
|
+
"properties": {
|
|
31
|
+
"uri": {
|
|
32
|
+
"type": "string",
|
|
33
|
+
"description": "AT-URI of the latest schema version",
|
|
34
|
+
"maxLength": 500
|
|
35
|
+
},
|
|
36
|
+
"version": {
|
|
37
|
+
"type": "string",
|
|
38
|
+
"description": "Semantic version of the latest schema",
|
|
39
|
+
"maxLength": 20
|
|
40
|
+
},
|
|
41
|
+
"record": {
|
|
42
|
+
"type": "ref",
|
|
43
|
+
"ref": "ac.foundation.dataset.schema",
|
|
44
|
+
"description": "The full schema record"
|
|
45
|
+
},
|
|
46
|
+
"allVersions": {
|
|
47
|
+
"type": "array",
|
|
48
|
+
"description": "All available versions (optional, sorted by semver descending)",
|
|
49
|
+
"items": {
|
|
50
|
+
"type": "object",
|
|
51
|
+
"required": [
|
|
52
|
+
"uri",
|
|
53
|
+
"version"
|
|
54
|
+
],
|
|
55
|
+
"properties": {
|
|
56
|
+
"uri": {
|
|
57
|
+
"type": "string",
|
|
58
|
+
"maxLength": 500
|
|
59
|
+
},
|
|
60
|
+
"version": {
|
|
61
|
+
"type": "string",
|
|
62
|
+
"maxLength": 20
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
},
|
|
70
|
+
"errors": [
|
|
71
|
+
{
|
|
72
|
+
"name": "SchemaNotFound",
|
|
73
|
+
"description": "No schema found with the given NSID"
|
|
74
|
+
}
|
|
75
|
+
]
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.lens",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "record",
|
|
7
|
+
"description": "Bidirectional transformation (Lens) between two sample types, with code stored in external repositories",
|
|
8
|
+
"key": "tid",
|
|
9
|
+
"record": {
|
|
10
|
+
"type": "object",
|
|
11
|
+
"required": [
|
|
12
|
+
"name",
|
|
13
|
+
"sourceSchema",
|
|
14
|
+
"targetSchema",
|
|
15
|
+
"getterCode",
|
|
16
|
+
"putterCode",
|
|
17
|
+
"createdAt"
|
|
18
|
+
],
|
|
19
|
+
"properties": {
|
|
20
|
+
"name": {
|
|
21
|
+
"type": "string",
|
|
22
|
+
"description": "Human-readable lens name",
|
|
23
|
+
"maxLength": 100
|
|
24
|
+
},
|
|
25
|
+
"sourceSchema": {
|
|
26
|
+
"type": "string",
|
|
27
|
+
"description": "AT-URI reference to source schema",
|
|
28
|
+
"maxLength": 500
|
|
29
|
+
},
|
|
30
|
+
"targetSchema": {
|
|
31
|
+
"type": "string",
|
|
32
|
+
"description": "AT-URI reference to target schema",
|
|
33
|
+
"maxLength": 500
|
|
34
|
+
},
|
|
35
|
+
"description": {
|
|
36
|
+
"type": "string",
|
|
37
|
+
"description": "What this transformation does",
|
|
38
|
+
"maxLength": 1000
|
|
39
|
+
},
|
|
40
|
+
"getterCode": {
|
|
41
|
+
"type": "ref",
|
|
42
|
+
"ref": "#codeReference",
|
|
43
|
+
"description": "Code reference for getter function (Source -> Target)"
|
|
44
|
+
},
|
|
45
|
+
"putterCode": {
|
|
46
|
+
"type": "ref",
|
|
47
|
+
"ref": "#codeReference",
|
|
48
|
+
"description": "Code reference for putter function (Target, Source -> Source)"
|
|
49
|
+
},
|
|
50
|
+
"language": {
|
|
51
|
+
"type": "string",
|
|
52
|
+
"description": "Programming language of the lens implementation (e.g., 'python', 'typescript')",
|
|
53
|
+
"maxLength": 50
|
|
54
|
+
},
|
|
55
|
+
"metadata": {
|
|
56
|
+
"type": "object",
|
|
57
|
+
"description": "Arbitrary metadata (author, performance notes, etc.)"
|
|
58
|
+
},
|
|
59
|
+
"createdAt": {
|
|
60
|
+
"type": "string",
|
|
61
|
+
"format": "datetime",
|
|
62
|
+
"description": "Timestamp when this lens was created"
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
"codeReference": {
|
|
68
|
+
"type": "object",
|
|
69
|
+
"description": "Reference to code in an external repository (GitHub, tangled.org, etc.)",
|
|
70
|
+
"required": [
|
|
71
|
+
"repository",
|
|
72
|
+
"commit",
|
|
73
|
+
"path"
|
|
74
|
+
],
|
|
75
|
+
"properties": {
|
|
76
|
+
"repository": {
|
|
77
|
+
"type": "string",
|
|
78
|
+
"description": "Repository URL (e.g., 'https://github.com/user/repo' or 'at://did/tangled.repo/...')",
|
|
79
|
+
"maxLength": 500
|
|
80
|
+
},
|
|
81
|
+
"commit": {
|
|
82
|
+
"type": "string",
|
|
83
|
+
"description": "Git commit hash (ensures immutability)",
|
|
84
|
+
"maxLength": 40
|
|
85
|
+
},
|
|
86
|
+
"path": {
|
|
87
|
+
"type": "string",
|
|
88
|
+
"description": "Path to function within repository (e.g., 'lenses/vision.py:rgb_to_grayscale')",
|
|
89
|
+
"maxLength": 500
|
|
90
|
+
},
|
|
91
|
+
"branch": {
|
|
92
|
+
"type": "string",
|
|
93
|
+
"description": "Optional branch name (for reference, commit hash is authoritative)",
|
|
94
|
+
"maxLength": 100
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.record",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "record",
|
|
7
|
+
"description": "Index record for a WebDataset-backed dataset with references to storage location and sample schema",
|
|
8
|
+
"key": "tid",
|
|
9
|
+
"record": {
|
|
10
|
+
"type": "object",
|
|
11
|
+
"required": [
|
|
12
|
+
"name",
|
|
13
|
+
"schemaRef",
|
|
14
|
+
"storage",
|
|
15
|
+
"createdAt"
|
|
16
|
+
],
|
|
17
|
+
"properties": {
|
|
18
|
+
"name": {
|
|
19
|
+
"type": "string",
|
|
20
|
+
"description": "Human-readable dataset name",
|
|
21
|
+
"maxLength": 200
|
|
22
|
+
},
|
|
23
|
+
"schemaRef": {
|
|
24
|
+
"type": "string",
|
|
25
|
+
"format": "at-uri",
|
|
26
|
+
"description": "AT-URI reference to the schema record for this dataset's samples",
|
|
27
|
+
"maxLength": 500
|
|
28
|
+
},
|
|
29
|
+
"storage": {
|
|
30
|
+
"type": "union",
|
|
31
|
+
"description": "Storage location for dataset files (WebDataset tar archives)",
|
|
32
|
+
"refs": [
|
|
33
|
+
"ac.foundation.dataset.storageExternal",
|
|
34
|
+
"ac.foundation.dataset.storageBlobs"
|
|
35
|
+
]
|
|
36
|
+
},
|
|
37
|
+
"description": {
|
|
38
|
+
"type": "string",
|
|
39
|
+
"description": "Human-readable description of the dataset",
|
|
40
|
+
"maxLength": 5000
|
|
41
|
+
},
|
|
42
|
+
"metadata": {
|
|
43
|
+
"type": "bytes",
|
|
44
|
+
"description": "Msgpack-encoded metadata dict for arbitrary extended key-value pairs. Use this for additional metadata beyond the core top-level fields (license, tags, size). Top-level fields are preferred for discoverable/searchable metadata.",
|
|
45
|
+
"maxLength": 100000
|
|
46
|
+
},
|
|
47
|
+
"tags": {
|
|
48
|
+
"type": "array",
|
|
49
|
+
"description": "Searchable tags for dataset discovery. Aligns with Schema.org keywords property.",
|
|
50
|
+
"items": {
|
|
51
|
+
"type": "string",
|
|
52
|
+
"maxLength": 150
|
|
53
|
+
},
|
|
54
|
+
"maxLength": 30
|
|
55
|
+
},
|
|
56
|
+
"size": {
|
|
57
|
+
"type": "ref",
|
|
58
|
+
"ref": "#datasetSize",
|
|
59
|
+
"description": "Dataset size information (optional)"
|
|
60
|
+
},
|
|
61
|
+
"license": {
|
|
62
|
+
"type": "string",
|
|
63
|
+
"description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
|
|
64
|
+
"maxLength": 200
|
|
65
|
+
},
|
|
66
|
+
"createdAt": {
|
|
67
|
+
"type": "string",
|
|
68
|
+
"format": "datetime",
|
|
69
|
+
"description": "Timestamp when this dataset record was created"
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
},
|
|
74
|
+
"datasetSize": {
|
|
75
|
+
"type": "object",
|
|
76
|
+
"description": "Information about dataset size",
|
|
77
|
+
"properties": {
|
|
78
|
+
"samples": {
|
|
79
|
+
"type": "integer",
|
|
80
|
+
"description": "Total number of samples in the dataset",
|
|
81
|
+
"minimum": 0
|
|
82
|
+
},
|
|
83
|
+
"bytes": {
|
|
84
|
+
"type": "integer",
|
|
85
|
+
"description": "Total size in bytes",
|
|
86
|
+
"minimum": 0
|
|
87
|
+
},
|
|
88
|
+
"shards": {
|
|
89
|
+
"type": "integer",
|
|
90
|
+
"description": "Number of WebDataset shards",
|
|
91
|
+
"minimum": 1
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.schema",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "record",
|
|
7
|
+
"description": "Definition of a PackableSample-compatible sample type. Supports versioning via rkey format: {NSID}@{semver}. Schema format is extensible via union type.",
|
|
8
|
+
"key": "any",
|
|
9
|
+
"record": {
|
|
10
|
+
"type": "object",
|
|
11
|
+
"required": [
|
|
12
|
+
"name",
|
|
13
|
+
"version",
|
|
14
|
+
"schemaType",
|
|
15
|
+
"schema",
|
|
16
|
+
"createdAt"
|
|
17
|
+
],
|
|
18
|
+
"properties": {
|
|
19
|
+
"name": {
|
|
20
|
+
"type": "string",
|
|
21
|
+
"description": "Human-readable display name for this sample type. Used for documentation and UI. The NSID in the record URI provides unique identification; name collisions across NSIDs are acceptable.",
|
|
22
|
+
"maxLength": 100
|
|
23
|
+
},
|
|
24
|
+
"version": {
|
|
25
|
+
"type": "string",
|
|
26
|
+
"description": "Semantic version (e.g., '1.0.0')",
|
|
27
|
+
"pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$",
|
|
28
|
+
"maxLength": 100
|
|
29
|
+
},
|
|
30
|
+
"schemaType": {
|
|
31
|
+
"type": "ref",
|
|
32
|
+
"ref": "ac.foundation.dataset.schemaType",
|
|
33
|
+
"description": "Type of schema definition. This field indicates which union member is present in the schema field."
|
|
34
|
+
},
|
|
35
|
+
"schema": {
|
|
36
|
+
"type": "union",
|
|
37
|
+
"refs": ["ac.foundation.dataset.schema#jsonSchemaFormat"],
|
|
38
|
+
"closed": false,
|
|
39
|
+
"description": "Schema definition for this sample type. Currently supports JSON Schema Draft 7. Union allows for future schema formats (Avro, Protobuf, etc.) without breaking changes."
|
|
40
|
+
},
|
|
41
|
+
"description": {
|
|
42
|
+
"type": "string",
|
|
43
|
+
"description": "Human-readable description of what this sample type represents",
|
|
44
|
+
"maxLength": 5000
|
|
45
|
+
},
|
|
46
|
+
"metadata": {
|
|
47
|
+
"type": "object",
|
|
48
|
+
"description": "Optional metadata about this schema. Common fields include license and tags, but any additional fields are permitted.",
|
|
49
|
+
"maxProperties": 50,
|
|
50
|
+
"properties": {
|
|
51
|
+
"license": {
|
|
52
|
+
"type": "string",
|
|
53
|
+
"description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
|
|
54
|
+
"maxLength": 200
|
|
55
|
+
},
|
|
56
|
+
"tags": {
|
|
57
|
+
"type": "array",
|
|
58
|
+
"description": "Categorization keywords for discovery. Aligns with Schema.org keywords property.",
|
|
59
|
+
"items": {
|
|
60
|
+
"type": "string",
|
|
61
|
+
"maxLength": 150
|
|
62
|
+
},
|
|
63
|
+
"maxLength": 30
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
"createdAt": {
|
|
68
|
+
"type": "string",
|
|
69
|
+
"format": "datetime",
|
|
70
|
+
"description": "Timestamp when this schema version was created. Immutable once set (ATProto records are permanent)."
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
},
|
|
75
|
+
"jsonSchemaFormat": {
|
|
76
|
+
"type": "object",
|
|
77
|
+
"description": "JSON Schema Draft 7 format for sample type definitions. Used with NDArray shim for array types.",
|
|
78
|
+
"required": ["$type", "$schema", "type", "properties"],
|
|
79
|
+
"properties": {
|
|
80
|
+
"$type": {
|
|
81
|
+
"type": "string",
|
|
82
|
+
"const": "ac.foundation.dataset.schema#jsonSchemaFormat"
|
|
83
|
+
},
|
|
84
|
+
"$schema": {
|
|
85
|
+
"type": "string",
|
|
86
|
+
"const": "http://json-schema.org/draft-07/schema#",
|
|
87
|
+
"description": "JSON Schema version identifier"
|
|
88
|
+
},
|
|
89
|
+
"type": {
|
|
90
|
+
"type": "string",
|
|
91
|
+
"const": "object",
|
|
92
|
+
"description": "Sample types must be objects"
|
|
93
|
+
},
|
|
94
|
+
"properties": {
|
|
95
|
+
"type": "object",
|
|
96
|
+
"description": "Field definitions for the sample type",
|
|
97
|
+
"minProperties": 1
|
|
98
|
+
},
|
|
99
|
+
"arrayFormatVersions": {
|
|
100
|
+
"type": "object",
|
|
101
|
+
"description": "Mapping from array format identifiers to semantic versions. Keys are ac.foundation.dataset.arrayFormat values (e.g., 'ndarrayBytes'), values are semver strings (e.g., '1.0.0'). Foundation.ac maintains canonical shim schemas at https://foundation.ac/schemas/atdata-{format}-bytes/{version}/.",
|
|
102
|
+
"maxProperties": 10
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.schemaType",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "string",
|
|
7
|
+
"description": "Schema type identifier for atdata sample definitions. Known values correspond to token definitions in this Lexicon. New schema types can be added as tokens without breaking changes.",
|
|
8
|
+
"knownValues": ["jsonSchema"],
|
|
9
|
+
"maxLength": 50
|
|
10
|
+
},
|
|
11
|
+
"jsonSchema": {
|
|
12
|
+
"type": "token",
|
|
13
|
+
"description": "JSON Schema Draft 7 format for sample type definitions. When schemaType is 'jsonSchema', the schema field must contain an object conforming to ac.foundation.dataset.schema#jsonSchemaFormat."
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.storageBlobs",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "object",
|
|
7
|
+
"description": "Storage via ATProto PDS blobs for WebDataset tar archives. Each blob contains one or more tar files. Used in ac.foundation.dataset.record storage union for maximum decentralization.",
|
|
8
|
+
"required": [
|
|
9
|
+
"blobs"
|
|
10
|
+
],
|
|
11
|
+
"properties": {
|
|
12
|
+
"blobs": {
|
|
13
|
+
"type": "array",
|
|
14
|
+
"description": "Array of blob references for WebDataset tar files",
|
|
15
|
+
"items": {
|
|
16
|
+
"type": "blob",
|
|
17
|
+
"description": "Blob reference to a WebDataset tar archive"
|
|
18
|
+
},
|
|
19
|
+
"minLength": 1
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"lexicon": 1,
|
|
3
|
+
"id": "ac.foundation.dataset.storageExternal",
|
|
4
|
+
"defs": {
|
|
5
|
+
"main": {
|
|
6
|
+
"type": "object",
|
|
7
|
+
"description": "External storage via URLs (S3, HTTP, IPFS, etc.) for WebDataset tar archives. URLs support brace notation for sharding (e.g., 'data-{000000..000099}.tar'). Used in ac.foundation.dataset.record storage union.",
|
|
8
|
+
"required": [
|
|
9
|
+
"urls"
|
|
10
|
+
],
|
|
11
|
+
"properties": {
|
|
12
|
+
"urls": {
|
|
13
|
+
"type": "array",
|
|
14
|
+
"description": "WebDataset URLs with optional brace notation for sharded tar files",
|
|
15
|
+
"items": {
|
|
16
|
+
"type": "string",
|
|
17
|
+
"format": "uri",
|
|
18
|
+
"maxLength": 1000
|
|
19
|
+
},
|
|
20
|
+
"minLength": 1
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"$id": "https://foundation.ac/schemas/atdata-ndarray-bytes/1.0.0",
|
|
4
|
+
"title": "ATDataNDArrayBytes",
|
|
5
|
+
"description": "Standard definition for numpy NDArray types in JSON Schema, compatible with atdata WebDataset serialization. This type's contents are interpreted as containing the raw bytes data for a serialized numpy NDArray, and serve as a marker for atdata-based code generation to use standard numpy types, rather than generated dataclasses.",
|
|
6
|
+
"version": "1.0.0",
|
|
7
|
+
"$defs": {
|
|
8
|
+
"ndarray": {
|
|
9
|
+
"type": "string",
|
|
10
|
+
"format": "byte",
|
|
11
|
+
"description": "Numpy array serialized using numpy `.npy` format via `np.save` (includes dtype and shape in binary header). When represented in JSON, this is a base64-encoded string. In msgpack, this is raw bytes.",
|
|
12
|
+
"contentEncoding": "base64",
|
|
13
|
+
"contentMediaType": "application/octet-stream"
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
}
|
atdata/local/__init__.py
CHANGED
|
@@ -1,24 +1,22 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Backward-compatibility shim for atdata.local.
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
.. deprecated::
|
|
4
|
+
Import from ``atdata.index`` and ``atdata.stores`` instead::
|
|
4
5
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
- ``LocalDatasetEntry``: Index entry with ATProto-compatible CIDs.
|
|
8
|
-
- ``S3DataStore``: S3-compatible shard storage.
|
|
6
|
+
from atdata.index import Index, LocalDatasetEntry
|
|
7
|
+
from atdata.stores import S3DataStore, LocalDiskStore
|
|
9
8
|
"""
|
|
10
9
|
|
|
11
|
-
from atdata.
|
|
10
|
+
from atdata.index import (
|
|
11
|
+
Index,
|
|
12
12
|
LocalDatasetEntry,
|
|
13
13
|
BasicIndexEntry,
|
|
14
|
-
REDIS_KEY_DATASET_ENTRY,
|
|
15
|
-
REDIS_KEY_SCHEMA,
|
|
16
|
-
)
|
|
17
|
-
from atdata.local._schema import (
|
|
18
14
|
SchemaNamespace,
|
|
19
15
|
SchemaFieldType,
|
|
20
16
|
SchemaField,
|
|
21
17
|
LocalSchemaRecord,
|
|
18
|
+
REDIS_KEY_DATASET_ENTRY,
|
|
19
|
+
REDIS_KEY_SCHEMA,
|
|
22
20
|
_ATDATA_URI_PREFIX,
|
|
23
21
|
_LEGACY_URI_PREFIX,
|
|
24
22
|
_kind_str_for_sample_type,
|
|
@@ -29,8 +27,8 @@ from atdata.local._schema import (
|
|
|
29
27
|
_python_type_to_field_type,
|
|
30
28
|
_build_schema_record,
|
|
31
29
|
)
|
|
32
|
-
from atdata.
|
|
33
|
-
|
|
30
|
+
from atdata.stores import (
|
|
31
|
+
LocalDiskStore,
|
|
34
32
|
S3DataStore,
|
|
35
33
|
_s3_env,
|
|
36
34
|
_s3_from_credentials,
|
|
@@ -44,6 +42,7 @@ from s3fs import S3FileSystem # noqa: F401 — re-exported for backward compat
|
|
|
44
42
|
|
|
45
43
|
__all__ = [
|
|
46
44
|
# Public API
|
|
45
|
+
"LocalDiskStore",
|
|
47
46
|
"Index",
|
|
48
47
|
"LocalDatasetEntry",
|
|
49
48
|
"BasicIndexEntry",
|
atdata/local/_repo_legacy.py
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
from atdata import Dataset
|
|
4
4
|
|
|
5
|
-
from atdata.
|
|
6
|
-
from atdata.
|
|
5
|
+
from atdata.index._entry import LocalDatasetEntry
|
|
6
|
+
from atdata.stores._s3 import _s3_env, _s3_from_credentials, _create_s3_write_callbacks
|
|
7
7
|
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from uuid import uuid4
|
|
@@ -97,7 +97,7 @@ class Repo:
|
|
|
97
97
|
|
|
98
98
|
#
|
|
99
99
|
|
|
100
|
-
from atdata.
|
|
100
|
+
from atdata.index._index import Index
|
|
101
101
|
|
|
102
102
|
self.index = Index(redis=redis)
|
|
103
103
|
|