atdata 0.3.0b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. atdata/__init__.py +11 -0
  2. atdata/_cid.py +0 -21
  3. atdata/_helpers.py +12 -0
  4. atdata/_hf_api.py +46 -1
  5. atdata/_logging.py +43 -0
  6. atdata/_protocols.py +81 -182
  7. atdata/_schema_codec.py +2 -2
  8. atdata/_sources.py +24 -4
  9. atdata/_stub_manager.py +5 -25
  10. atdata/atmosphere/__init__.py +60 -21
  11. atdata/atmosphere/_lexicon_types.py +595 -0
  12. atdata/atmosphere/_types.py +73 -245
  13. atdata/atmosphere/client.py +64 -12
  14. atdata/atmosphere/lens.py +60 -53
  15. atdata/atmosphere/records.py +291 -100
  16. atdata/atmosphere/schema.py +91 -65
  17. atdata/atmosphere/store.py +68 -66
  18. atdata/cli/__init__.py +16 -16
  19. atdata/cli/diagnose.py +2 -2
  20. atdata/cli/{local.py → infra.py} +10 -10
  21. atdata/dataset.py +266 -47
  22. atdata/index/__init__.py +54 -0
  23. atdata/{local → index}/_entry.py +6 -2
  24. atdata/{local → index}/_index.py +617 -72
  25. atdata/{local → index}/_schema.py +5 -5
  26. atdata/lexicons/__init__.py +127 -0
  27. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  28. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  29. atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
  30. atdata/lexicons/ac.foundation.dataset.record.json +117 -0
  31. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  32. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
  34. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  35. atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
  36. atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
  37. atdata/lexicons/ndarray_shim.json +16 -0
  38. atdata/local/__init__.py +12 -13
  39. atdata/local/_repo_legacy.py +3 -3
  40. atdata/manifest/__init__.py +4 -0
  41. atdata/manifest/_proxy.py +321 -0
  42. atdata/promote.py +14 -10
  43. atdata/repository.py +66 -16
  44. atdata/stores/__init__.py +23 -0
  45. atdata/stores/_disk.py +131 -0
  46. atdata/{local → stores}/_s3.py +134 -112
  47. atdata/testing.py +12 -8
  48. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
  49. atdata-0.3.2b1.dist-info/RECORD +71 -0
  50. atdata-0.3.0b1.dist-info/RECORD +0 -54
  51. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
  52. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
  53. {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
@@ -26,7 +26,7 @@ from typing import (
26
26
  T = TypeVar("T", bound=Packable)
27
27
 
28
28
  # URI scheme prefixes
29
- _ATDATA_URI_PREFIX = "atdata://local/sampleSchema/"
29
+ _ATDATA_URI_PREFIX = "atdata://local/schema/"
30
30
  _LEGACY_URI_PREFIX = "local://schemas/"
31
31
 
32
32
 
@@ -37,7 +37,7 @@ class SchemaNamespace:
37
37
  Supports attribute access, iteration, ``len()``, and ``in`` checks.
38
38
 
39
39
  Examples:
40
- >>> index.load_schema("atdata://local/sampleSchema/MySample@1.0.0")
40
+ >>> index.load_schema("atdata://local/schema/MySample@1.0.0")
41
41
  >>> MyType = index.types.MySample
42
42
  >>> sample = MyType(field1="hello", field2=42)
43
43
 
@@ -207,7 +207,7 @@ class LocalSchemaRecord:
207
207
  """List of field definitions."""
208
208
 
209
209
  ref: str
210
- """Schema reference URI (atdata://local/sampleSchema/{name}@{version})."""
210
+ """Schema reference URI (atdata://local/schema/{name}@{version})."""
211
211
 
212
212
  description: Optional[str] = None
213
213
  """Human-readable description."""
@@ -259,7 +259,7 @@ def _kind_str_for_sample_type(st: Type[Packable]) -> str:
259
259
 
260
260
 
261
261
  def _schema_ref_from_type(sample_type: Type[Packable], version: str) -> str:
262
- """Generate 'atdata://local/sampleSchema/{name}@{version}' reference."""
262
+ """Generate 'atdata://local/schema/{name}@{version}' reference."""
263
263
  return _make_schema_ref(sample_type.__name__, version)
264
264
 
265
265
 
@@ -271,7 +271,7 @@ def _make_schema_ref(name: str, version: str) -> str:
271
271
  def _parse_schema_ref(ref: str) -> tuple[str, str]:
272
272
  """Parse schema reference into (name, version).
273
273
 
274
- Supports both new format: 'atdata://local/sampleSchema/{name}@{version}'
274
+ Supports both new format: 'atdata://local/schema/{name}@{version}'
275
275
  and legacy format: 'local://schemas/{module.Class}@{version}'
276
276
  """
277
277
  if ref.startswith(_ATDATA_URI_PREFIX):
@@ -0,0 +1,127 @@
1
+ """ATProto Lexicon definitions for the atdata federation.
2
+
3
+ This package contains the canonical Lexicon JSON files for the
4
+ ``ac.foundation.dataset`` namespace. These define the ATProto record
5
+ types used by atdata for publishing schemas, datasets, and lenses
6
+ to the AT Protocol network.
7
+
8
+ Lexicons:
9
+ ac.foundation.dataset.schema
10
+ Versioned sample type definitions (PackableSample schemas).
11
+ ac.foundation.dataset.record
12
+ Dataset index records pointing to WebDataset storage.
13
+ ac.foundation.dataset.lens
14
+ Bidirectional transformations between schemas.
15
+ ac.foundation.dataset.schemaType
16
+ Extensible token for schema format identifiers.
17
+ ac.foundation.dataset.arrayFormat
18
+ Extensible token for array serialization formats.
19
+ ac.foundation.dataset.storageHttp
20
+ HTTP/HTTPS URL-based storage with per-shard checksums.
21
+ ac.foundation.dataset.storageS3
22
+ S3/S3-compatible object storage with per-shard checksums.
23
+ ac.foundation.dataset.storageBlobs
24
+ ATProto PDS blob-based storage.
25
+ ac.foundation.dataset.storageExternal
26
+ (Deprecated) External URL-based storage.
27
+ ac.foundation.dataset.getLatestSchema
28
+ XRPC query for fetching the latest schema version.
29
+
30
+ The ``ndarray_shim.json`` file defines the standard NDArray type
31
+ for use within JSON Schema definitions.
32
+
33
+ Examples:
34
+ >>> from atdata.lexicons import load_lexicon
35
+ >>> schema_lex = load_lexicon("ac.foundation.dataset.schema")
36
+ >>> schema_lex["id"]
37
+ 'ac.foundation.dataset.schema'
38
+ """
39
+
40
+ import json
41
+ from importlib import resources
42
+ from functools import lru_cache
43
+ from typing import Any
44
+
45
+
46
# Root NSID under which every atdata lexicon is published.
NAMESPACE = "ac.foundation.dataset"

# Fully-qualified NSIDs of the bundled lexicons. Only the final segment is
# listed here; the shared namespace prefix is applied once below.
LEXICON_IDS = tuple(
    f"{NAMESPACE}.{suffix}"
    for suffix in (
        "schema",
        "record",
        "lens",
        "schemaType",
        "arrayFormat",
        "storageHttp",
        "storageS3",
        "storageBlobs",
        "storageExternal",  # deprecated
        "getLatestSchema",
    )
)
60
+
61
+
62
@lru_cache(maxsize=16)
def load_lexicon(lexicon_id: str) -> dict[str, Any]:
    """Load a lexicon definition bundled with this package by its NSID.

    The lexicon is read from ``<lexicon_id>.json`` inside this package and
    parsed as JSON. Results are memoized with ``lru_cache``, so repeated
    calls with the same ID return the *same* dictionary object; treat it
    as read-only, because mutating it changes what later callers receive.

    Args:
        lexicon_id: The lexicon NSID, e.g. ``"ac.foundation.dataset.schema"``.

    Returns:
        Parsed JSON dictionary containing the lexicon definition.

    Raises:
        FileNotFoundError: If no lexicon file exists for the given ID, or
            if the ID contains a path separator and therefore cannot name
            a file that lives directly in this package.

    Examples:
        >>> lex = load_lexicon("ac.foundation.dataset.schema")
        >>> lex["defs"]["main"]["type"]
        'record'
    """
    filename = f"{lexicon_id}.json"
    not_found_message = (
        f"No lexicon file found for '{lexicon_id}'. "
        f"Expected {filename} in {__package__}."
    )

    # NSIDs never contain path separators. Rejecting them up front keeps the
    # lookup confined to the package directory instead of letting an ID such
    # as "../other" resolve to a file outside the bundled lexicons.
    if "/" in filename or "\\" in filename:
        raise FileNotFoundError(not_found_message)

    ref = resources.files(__package__).joinpath(filename)
    try:
        text = ref.read_text(encoding="utf-8")
    except FileNotFoundError:
        # Re-raise with a message that names the lexicon ID; the low-level
        # path error adds nothing useful, so the context is suppressed.
        raise FileNotFoundError(not_found_message) from None
    return json.loads(text)
90
+
91
+
92
@lru_cache(maxsize=1)
def load_ndarray_shim() -> dict[str, Any]:
    """Read and parse the bundled ``ndarray_shim.json`` definition.

    The result is memoized with ``lru_cache``, so every call returns the
    same dictionary object.

    Returns:
        Parsed JSON dictionary containing the NDArray shim schema.

    Examples:
        >>> shim = load_ndarray_shim()
        >>> shim["$defs"]["ndarray"]["type"]
        'string'
    """
    package_root = resources.files(__package__)
    shim_text = package_root.joinpath("ndarray_shim.json").read_text(encoding="utf-8")
    return json.loads(shim_text)
106
+
107
+
108
def list_lexicons() -> tuple[str, ...]:
    """List the NSIDs of every lexicon known to this package.

    Returns:
        The module-level ``LEXICON_IDS`` tuple of lexicon ID strings.

    Examples:
        >>> "ac.foundation.dataset.schema" in list_lexicons()
        True
    """
    known_ids = LEXICON_IDS
    return known_ids
119
+
120
+
121
# Public API: namespace constants first, then the loader helpers.
__all__ = [
    "NAMESPACE", "LEXICON_IDS",
    "load_lexicon", "load_ndarray_shim", "list_lexicons",
]
@@ -0,0 +1,16 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.arrayFormat",
4
+ "defs": {
5
+ "main": {
6
+ "type": "string",
7
+ "description": "Array serialization format identifier for NDArray fields in sample schemas. Known values correspond to token definitions in this Lexicon. Each format has versioned specifications maintained by foundation.ac at canonical URLs.",
8
+ "knownValues": ["ndarrayBytes"],
9
+ "maxLength": 50
10
+ },
11
+ "ndarrayBytes": {
12
+ "type": "token",
13
+ "description": "Numpy .npy binary format for NDArray serialization. Stores arrays with dtype and shape in binary header. Versions maintained at https://foundation.ac/schemas/atdata-ndarray-bytes/{version}/"
14
+ }
15
+ }
16
+ }
@@ -0,0 +1,78 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.getLatestSchema",
4
+ "defs": {
5
+ "main": {
6
+ "type": "query",
7
+ "description": "Get the latest version of a sample schema by its permanent NSID identifier",
8
+ "parameters": {
9
+ "type": "params",
10
+ "required": [
11
+ "schemaId"
12
+ ],
13
+ "properties": {
14
+ "schemaId": {
15
+ "type": "string",
16
+ "description": "The permanent NSID identifier for the schema (the {NSID} part of the rkey {NSID}@{semver})",
17
+ "maxLength": 500
18
+ }
19
+ }
20
+ },
21
+ "output": {
22
+ "encoding": "application/json",
23
+ "schema": {
24
+ "type": "object",
25
+ "required": [
26
+ "uri",
27
+ "version",
28
+ "record"
29
+ ],
30
+ "properties": {
31
+ "uri": {
32
+ "type": "string",
33
+ "description": "AT-URI of the latest schema version",
34
+ "maxLength": 500
35
+ },
36
+ "version": {
37
+ "type": "string",
38
+ "description": "Semantic version of the latest schema",
39
+ "maxLength": 20
40
+ },
41
+ "record": {
42
+ "type": "ref",
43
+ "ref": "ac.foundation.dataset.schema",
44
+ "description": "The full schema record"
45
+ },
46
+ "allVersions": {
47
+ "type": "array",
48
+ "description": "All available versions (optional, sorted by semver descending)",
49
+ "items": {
50
+ "type": "object",
51
+ "required": [
52
+ "uri",
53
+ "version"
54
+ ],
55
+ "properties": {
56
+ "uri": {
57
+ "type": "string",
58
+ "maxLength": 500
59
+ },
60
+ "version": {
61
+ "type": "string",
62
+ "maxLength": 20
63
+ }
64
+ }
65
+ }
66
+ }
67
+ }
68
+ }
69
+ },
70
+ "errors": [
71
+ {
72
+ "name": "SchemaNotFound",
73
+ "description": "No schema found with the given NSID"
74
+ }
75
+ ]
76
+ }
77
+ }
78
+ }
@@ -0,0 +1,101 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.lens",
4
+ "defs": {
5
+ "main": {
6
+ "type": "record",
7
+ "description": "Bidirectional transformation (Lens) between two sample types, with code stored in external repositories",
8
+ "key": "tid",
9
+ "record": {
10
+ "type": "object",
11
+ "required": [
12
+ "name",
13
+ "sourceSchema",
14
+ "targetSchema",
15
+ "getterCode",
16
+ "putterCode",
17
+ "createdAt"
18
+ ],
19
+ "properties": {
20
+ "name": {
21
+ "type": "string",
22
+ "description": "Human-readable lens name",
23
+ "maxLength": 100
24
+ },
25
+ "sourceSchema": {
26
+ "type": "string",
27
+ "format": "at-uri",
28
+ "description": "AT-URI reference to source schema",
29
+ "maxLength": 500
30
+ },
31
+ "targetSchema": {
32
+ "type": "string",
33
+ "format": "at-uri",
34
+ "description": "AT-URI reference to target schema",
35
+ "maxLength": 500
36
+ },
37
+ "description": {
38
+ "type": "string",
39
+ "description": "What this transformation does",
40
+ "maxLength": 1000
41
+ },
42
+ "getterCode": {
43
+ "type": "ref",
44
+ "ref": "#codeReference",
45
+ "description": "Code reference for getter function (Source -> Target)"
46
+ },
47
+ "putterCode": {
48
+ "type": "ref",
49
+ "ref": "#codeReference",
50
+ "description": "Code reference for putter function (Target, Source -> Source)"
51
+ },
52
+ "language": {
53
+ "type": "string",
54
+ "description": "Programming language of the lens implementation (e.g., 'python', 'typescript')",
55
+ "maxLength": 50
56
+ },
57
+ "metadata": {
58
+ "type": "object",
59
+ "description": "Arbitrary metadata (author, performance notes, etc.)"
60
+ },
61
+ "createdAt": {
62
+ "type": "string",
63
+ "format": "datetime",
64
+ "description": "Timestamp when this lens was created"
65
+ }
66
+ }
67
+ }
68
+ },
69
+ "codeReference": {
70
+ "type": "object",
71
+ "description": "Reference to code in an external repository (GitHub, tangled.org, etc.)",
72
+ "required": [
73
+ "repository",
74
+ "commit",
75
+ "path"
76
+ ],
77
+ "properties": {
78
+ "repository": {
79
+ "type": "string",
80
+ "description": "Repository URL (e.g., 'https://github.com/user/repo' or 'at://did/tangled.repo/...')",
81
+ "maxLength": 500
82
+ },
83
+ "commit": {
84
+ "type": "string",
85
+ "description": "Git commit hash (ensures immutability)",
86
+ "maxLength": 40
87
+ },
88
+ "path": {
89
+ "type": "string",
90
+ "description": "Path to function within repository (e.g., 'lenses/vision.py:rgb_to_grayscale')",
91
+ "maxLength": 500
92
+ },
93
+ "branch": {
94
+ "type": "string",
95
+ "description": "Optional branch name (for reference, commit hash is authoritative)",
96
+ "maxLength": 100
97
+ }
98
+ }
99
+ }
100
+ }
101
+ }
@@ -0,0 +1,117 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.record",
4
+ "defs": {
5
+ "main": {
6
+ "type": "record",
7
+ "description": "Index record for a WebDataset-backed dataset with references to storage location and sample schema",
8
+ "key": "tid",
9
+ "record": {
10
+ "type": "object",
11
+ "required": [
12
+ "name",
13
+ "schemaRef",
14
+ "storage",
15
+ "createdAt"
16
+ ],
17
+ "properties": {
18
+ "name": {
19
+ "type": "string",
20
+ "description": "Human-readable dataset name",
21
+ "maxLength": 200
22
+ },
23
+ "schemaRef": {
24
+ "type": "string",
25
+ "format": "at-uri",
26
+ "description": "AT-URI reference to the schema record for this dataset's samples",
27
+ "maxLength": 500
28
+ },
29
+ "storage": {
30
+ "type": "union",
31
+ "description": "Storage location for dataset files (WebDataset tar archives)",
32
+ "refs": [
33
+ "ac.foundation.dataset.storageHttp",
34
+ "ac.foundation.dataset.storageS3",
35
+ "ac.foundation.dataset.storageBlobs"
36
+ ]
37
+ },
38
+ "description": {
39
+ "type": "string",
40
+ "description": "Human-readable description of the dataset",
41
+ "maxLength": 5000
42
+ },
43
+ "metadata": {
44
+ "type": "bytes",
45
+ "description": "Msgpack-encoded metadata dict for arbitrary extended key-value pairs. Use this for additional metadata beyond the core top-level fields (license, tags, size). Top-level fields are preferred for discoverable/searchable metadata.",
46
+ "maxLength": 100000
47
+ },
48
+ "tags": {
49
+ "type": "array",
50
+ "description": "Searchable tags for dataset discovery. Aligns with Schema.org keywords property.",
51
+ "items": {
52
+ "type": "string",
53
+ "maxLength": 150
54
+ },
55
+ "maxLength": 30
56
+ },
57
+ "size": {
58
+ "type": "ref",
59
+ "ref": "#datasetSize",
60
+ "description": "Dataset size information (optional)"
61
+ },
62
+ "license": {
63
+ "type": "string",
64
+ "description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
65
+ "maxLength": 200
66
+ },
67
+ "createdAt": {
68
+ "type": "string",
69
+ "format": "datetime",
70
+ "description": "Timestamp when this dataset record was created"
71
+ }
72
+ }
73
+ }
74
+ },
75
+ "shardChecksum": {
76
+ "type": "object",
77
+ "description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.",
78
+ "required": [
79
+ "algorithm",
80
+ "digest"
81
+ ],
82
+ "properties": {
83
+ "algorithm": {
84
+ "type": "string",
85
+ "description": "Hash algorithm identifier (e.g., 'sha256', 'blake3')",
86
+ "maxLength": 20
87
+ },
88
+ "digest": {
89
+ "type": "string",
90
+ "description": "Hex-encoded hash digest",
91
+ "maxLength": 128
92
+ }
93
+ }
94
+ },
95
+ "datasetSize": {
96
+ "type": "object",
97
+ "description": "Information about dataset size",
98
+ "properties": {
99
+ "samples": {
100
+ "type": "integer",
101
+ "description": "Total number of samples in the dataset",
102
+ "minimum": 0
103
+ },
104
+ "bytes": {
105
+ "type": "integer",
106
+ "description": "Total size in bytes",
107
+ "minimum": 0
108
+ },
109
+ "shards": {
110
+ "type": "integer",
111
+ "description": "Number of WebDataset shards",
112
+ "minimum": 1
113
+ }
114
+ }
115
+ }
116
+ }
117
+ }
@@ -0,0 +1,107 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.schema",
4
+ "defs": {
5
+ "main": {
6
+ "type": "record",
7
+ "description": "Definition of a PackableSample-compatible sample type. Supports versioning via rkey format: {NSID}@{semver}. Schema format is extensible via union type.",
8
+ "key": "any",
9
+ "record": {
10
+ "type": "object",
11
+ "required": [
12
+ "name",
13
+ "version",
14
+ "schemaType",
15
+ "schema",
16
+ "createdAt"
17
+ ],
18
+ "properties": {
19
+ "name": {
20
+ "type": "string",
21
+ "description": "Human-readable display name for this sample type. Used for documentation and UI. The NSID in the record URI provides unique identification; name collisions across NSIDs are acceptable.",
22
+ "maxLength": 100
23
+ },
24
+ "version": {
25
+ "type": "string",
26
+ "description": "Semantic version (e.g., '1.0.0')",
27
+ "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$",
28
+ "maxLength": 100
29
+ },
30
+ "schemaType": {
31
+ "type": "ref",
32
+ "ref": "ac.foundation.dataset.schemaType",
33
+ "description": "Type of schema definition. This field indicates which union member is present in the schema field."
34
+ },
35
+ "schema": {
36
+ "type": "union",
37
+ "refs": ["ac.foundation.dataset.schema#jsonSchemaFormat"],
38
+ "closed": false,
39
+ "description": "Schema definition for this sample type. Currently supports JSON Schema Draft 7. Union allows for future schema formats (Avro, Protobuf, etc.) without breaking changes."
40
+ },
41
+ "description": {
42
+ "type": "string",
43
+ "description": "Human-readable description of what this sample type represents",
44
+ "maxLength": 5000
45
+ },
46
+ "metadata": {
47
+ "type": "object",
48
+ "description": "Optional metadata about this schema. Common fields include license and tags, but any additional fields are permitted.",
49
+ "maxProperties": 50,
50
+ "properties": {
51
+ "license": {
52
+ "type": "string",
53
+ "description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
54
+ "maxLength": 200
55
+ },
56
+ "tags": {
57
+ "type": "array",
58
+ "description": "Categorization keywords for discovery. Aligns with Schema.org keywords property.",
59
+ "items": {
60
+ "type": "string",
61
+ "maxLength": 150
62
+ },
63
+ "maxLength": 30
64
+ }
65
+ }
66
+ },
67
+ "createdAt": {
68
+ "type": "string",
69
+ "format": "datetime",
70
+ "description": "Timestamp when this schema version was created. Immutable once set (ATProto records are permanent)."
71
+ }
72
+ }
73
+ }
74
+ },
75
+ "jsonSchemaFormat": {
76
+ "type": "object",
77
+ "description": "JSON Schema Draft 7 format for sample type definitions. Used with NDArray shim for array types.",
78
+ "required": ["$type", "$schema", "type", "properties"],
79
+ "properties": {
80
+ "$type": {
81
+ "type": "string",
82
+ "const": "ac.foundation.dataset.schema#jsonSchemaFormat"
83
+ },
84
+ "$schema": {
85
+ "type": "string",
86
+ "const": "http://json-schema.org/draft-07/schema#",
87
+ "description": "JSON Schema version identifier"
88
+ },
89
+ "type": {
90
+ "type": "string",
91
+ "const": "object",
92
+ "description": "Sample types must be objects"
93
+ },
94
+ "properties": {
95
+ "type": "object",
96
+ "description": "Field definitions for the sample type",
97
+ "minProperties": 1
98
+ },
99
+ "arrayFormatVersions": {
100
+ "type": "object",
101
+ "description": "Mapping from array format identifiers to semantic versions. Keys are ac.foundation.dataset.arrayFormat values (e.g., 'ndarrayBytes'), values are semver strings (e.g., '1.0.0'). Foundation.ac maintains canonical shim schemas at https://foundation.ac/schemas/atdata-{format}-bytes/{version}/.",
102
+ "maxProperties": 10
103
+ }
104
+ }
105
+ }
106
+ }
107
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.schemaType",
4
+ "defs": {
5
+ "main": {
6
+ "type": "string",
7
+ "description": "Schema type identifier for atdata sample definitions. Known values correspond to token definitions in this Lexicon. New schema types can be added as tokens without breaking changes.",
8
+ "knownValues": ["jsonSchema"],
9
+ "maxLength": 50
10
+ },
11
+ "jsonSchema": {
12
+ "type": "token",
13
+ "description": "JSON Schema Draft 7 format for sample type definitions. When schemaType is 'jsonSchema', the schema field must contain an object conforming to ac.foundation.dataset.schema#jsonSchemaFormat."
14
+ }
15
+ }
16
+ }
@@ -0,0 +1,46 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.storageBlobs",
4
+ "defs": {
5
+ "main": {
6
+ "type": "object",
7
+ "description": "Storage via ATProto PDS blobs for WebDataset tar archives. Used in ac.foundation.dataset.record storage union for maximum decentralization.",
8
+ "required": [
9
+ "blobs"
10
+ ],
11
+ "properties": {
12
+ "blobs": {
13
+ "type": "array",
14
+ "description": "Array of blob entries for WebDataset tar files",
15
+ "items": {
16
+ "type": "ref",
17
+ "ref": "#blobEntry"
18
+ },
19
+ "minLength": 1
20
+ }
21
+ }
22
+ },
23
+ "blobEntry": {
24
+ "type": "object",
25
+ "description": "A single PDS blob shard with optional integrity checksum",
26
+ "required": [
27
+ "blob"
28
+ ],
29
+ "properties": {
30
+ "blob": {
31
+ "type": "blob",
32
+ "accept": [
33
+ "application/x-tar"
34
+ ],
35
+ "maxSize": 52428800,
36
+ "description": "Blob reference to a WebDataset tar archive"
37
+ },
38
+ "checksum": {
39
+ "type": "ref",
40
+ "ref": "ac.foundation.dataset.record#shardChecksum",
41
+ "description": "Content hash for integrity verification (optional since PDS blobs have built-in CID integrity)"
42
+ }
43
+ }
44
+ }
45
+ }
46
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.storageExternal",
4
+ "defs": {
5
+ "main": {
6
+ "type": "object",
7
+ "description": "(Deprecated: use storageHttp or storageS3 instead.) External storage via URLs for WebDataset tar archives. URLs support brace notation for sharding (e.g., 'data-{000000..000099}.tar').",
8
+ "required": [
9
+ "urls"
10
+ ],
11
+ "properties": {
12
+ "urls": {
13
+ "type": "array",
14
+ "description": "WebDataset URLs with optional brace notation for sharded tar files",
15
+ "items": {
16
+ "type": "string",
17
+ "format": "uri",
18
+ "maxLength": 1000
19
+ },
20
+ "minLength": 1
21
+ }
22
+ }
23
+ }
24
+ }
25
+ }