atdata 0.3.0b1__py3-none-any.whl → 0.3.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. atdata/__init__.py +9 -0
  2. atdata/_cid.py +0 -21
  3. atdata/_helpers.py +12 -0
  4. atdata/_hf_api.py +33 -1
  5. atdata/_protocols.py +64 -182
  6. atdata/_schema_codec.py +2 -2
  7. atdata/_stub_manager.py +5 -25
  8. atdata/atmosphere/__init__.py +12 -11
  9. atdata/atmosphere/_types.py +4 -4
  10. atdata/atmosphere/client.py +64 -12
  11. atdata/atmosphere/lens.py +11 -12
  12. atdata/atmosphere/records.py +9 -10
  13. atdata/atmosphere/schema.py +14 -16
  14. atdata/atmosphere/store.py +6 -7
  15. atdata/cli/__init__.py +16 -16
  16. atdata/cli/diagnose.py +2 -2
  17. atdata/cli/{local.py → infra.py} +10 -10
  18. atdata/dataset.py +155 -2
  19. atdata/index/__init__.py +54 -0
  20. atdata/{local → index}/_index.py +322 -64
  21. atdata/{local → index}/_schema.py +5 -5
  22. atdata/lexicons/__init__.py +121 -0
  23. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  24. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  25. atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
  26. atdata/lexicons/ac.foundation.dataset.record.json +96 -0
  27. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  28. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  29. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
  30. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  31. atdata/lexicons/ndarray_shim.json +16 -0
  32. atdata/local/__init__.py +12 -13
  33. atdata/local/_repo_legacy.py +3 -3
  34. atdata/promote.py +14 -10
  35. atdata/repository.py +7 -7
  36. atdata/stores/__init__.py +23 -0
  37. atdata/stores/_disk.py +123 -0
  38. atdata/testing.py +12 -8
  39. {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +2 -2
  40. atdata-0.3.1b1.dist-info/RECORD +67 -0
  41. atdata-0.3.0b1.dist-info/RECORD +0 -54
  42. /atdata/{local → index}/_entry.py +0 -0
  43. /atdata/{local → stores}/_s3.py +0 -0
  44. {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
  45. {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
  46. {atdata-0.3.0b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,121 @@
1
+ """ATProto Lexicon definitions for the atdata federation.
2
+
3
+ This package contains the canonical Lexicon JSON files for the
4
+ ``ac.foundation.dataset`` namespace. These define the ATProto record
5
+ types used by atdata for publishing schemas, datasets, and lenses
6
+ to the AT Protocol network.
7
+
8
+ Lexicons:
9
+ ac.foundation.dataset.schema
10
+ Versioned sample type definitions (PackableSample schemas).
11
+ ac.foundation.dataset.record
12
+ Dataset index records pointing to WebDataset storage.
13
+ ac.foundation.dataset.lens
14
+ Bidirectional transformations between schemas.
15
+ ac.foundation.dataset.schemaType
16
+ Extensible token for schema format identifiers.
17
+ ac.foundation.dataset.arrayFormat
18
+ Extensible token for array serialization formats.
19
+ ac.foundation.dataset.storageExternal
20
+ External URL-based storage (S3, HTTP, IPFS).
21
+ ac.foundation.dataset.storageBlobs
22
+ ATProto PDS blob-based storage.
23
+ ac.foundation.dataset.getLatestSchema
24
+ XRPC query for fetching the latest schema version.
25
+
26
+ The ``ndarray_shim.json`` file defines the standard NDArray type
27
+ for use within JSON Schema definitions.
28
+
29
+ Examples:
30
+ >>> from atdata.lexicons import load_lexicon
31
+ >>> schema_lex = load_lexicon("ac.foundation.dataset.schema")
32
+ >>> schema_lex["id"]
33
+ 'ac.foundation.dataset.schema'
34
+ """
35
+
36
+ import json
37
+ from importlib import resources
38
+ from functools import lru_cache
39
+ from typing import Any
40
+
41
+
42
# Root NSID namespace shared by every atdata lexicon.
NAMESPACE = "ac.foundation.dataset"

# Fully-qualified NSIDs of the bundled lexicons, in declaration order.
# Each entry is NAMESPACE joined to a leaf name with a dot.
LEXICON_IDS = tuple(
    f"{NAMESPACE}.{leaf}"
    for leaf in (
        "schema",
        "record",
        "lens",
        "schemaType",
        "arrayFormat",
        "storageExternal",
        "storageBlobs",
        "getLatestSchema",
    )
)
54
+
55
+
56
@lru_cache(maxsize=16)
def load_lexicon(lexicon_id: str) -> dict[str, Any]:
    """Load a lexicon definition by its NSID.

    The definition is read from ``<lexicon_id>.json`` inside this package
    and parsed as JSON. Results are memoized with ``lru_cache``, so repeated
    calls with the same ID return the *same* dictionary object; treat the
    result as read-only or copy it before mutating.

    Args:
        lexicon_id: The lexicon NSID, e.g. ``"ac.foundation.dataset.schema"``.

    Returns:
        Parsed JSON dictionary containing the lexicon definition.

    Raises:
        FileNotFoundError: If no lexicon file exists for the given ID, or if
            the ID contains a path separator or NUL character (such an ID
            cannot name a bundled lexicon and must never resolve to a file
            outside this package).

    Examples:
        >>> lex = load_lexicon("ac.foundation.dataset.schema")
        >>> lex["defs"]["main"]["type"]
        'record'
    """
    filename = f"{lexicon_id}.json"
    not_found = (
        f"No lexicon file found for '{lexicon_id}'. "
        f"Expected {filename} in {__package__}."
    )

    # Lexicon files live directly in the package directory. A name that
    # contains a separator could make joinpath() walk out of the package
    # (e.g. "../../other"), so refuse it with the same error type callers
    # already handle for unknown IDs.
    if any(ch in filename for ch in ("/", "\\", "\0")):
        raise FileNotFoundError(not_found)

    ref = resources.files(__package__).joinpath(filename)
    try:
        text = ref.read_text(encoding="utf-8")
    except FileNotFoundError:
        # Re-raise with a message that names the lexicon ID; the original
        # traceback only carries an internal path and adds no information.
        raise FileNotFoundError(not_found) from None
    return json.loads(text)
84
+
85
+
86
@lru_cache(maxsize=1)
def load_ndarray_shim() -> dict[str, Any]:
    """Read and parse the bundled ``ndarray_shim.json`` resource.

    The parsed result is memoized, so every call after the first returns
    the same dictionary object without touching the package data again.

    Returns:
        Parsed JSON dictionary containing the NDArray shim schema.

    Examples:
        >>> shim = load_ndarray_shim()
        >>> shim["$defs"]["ndarray"]["type"]
        'string'
    """
    package_root = resources.files(__package__)
    shim_file = package_root.joinpath("ndarray_shim.json")
    raw_json = shim_file.read_text(encoding="utf-8")
    return json.loads(raw_json)
100
+
101
+
102
def list_lexicons() -> tuple[str, ...]:
    """Expose the NSIDs of every lexicon known to this package.

    This is a thin accessor for the module-level ``LEXICON_IDS`` constant;
    the tuple is returned as-is, not copied.

    Returns:
        Tuple of lexicon ID strings.

    Examples:
        >>> "ac.foundation.dataset.schema" in list_lexicons()
        True
    """
    known_ids = LEXICON_IDS
    return known_ids
113
+
114
+
115
+ __all__ = [
116
+ "NAMESPACE",
117
+ "LEXICON_IDS",
118
+ "load_lexicon",
119
+ "load_ndarray_shim",
120
+ "list_lexicons",
121
+ ]
@@ -0,0 +1,16 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.arrayFormat",
4
+ "defs": {
5
+ "main": {
6
+ "type": "string",
7
+ "description": "Array serialization format identifier for NDArray fields in sample schemas. Known values correspond to token definitions in this Lexicon. Each format has versioned specifications maintained by foundation.ac at canonical URLs.",
8
+ "knownValues": ["ndarrayBytes"],
9
+ "maxLength": 50
10
+ },
11
+ "ndarrayBytes": {
12
+ "type": "token",
13
+ "description": "Numpy .npy binary format for NDArray serialization. Stores arrays with dtype and shape in binary header. Versions maintained at https://foundation.ac/schemas/atdata-ndarray-bytes/{version}/"
14
+ }
15
+ }
16
+ }
@@ -0,0 +1,78 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.getLatestSchema",
4
+ "defs": {
5
+ "main": {
6
+ "type": "query",
7
+ "description": "Get the latest version of a sample schema by its permanent NSID identifier",
8
+ "parameters": {
9
+ "type": "params",
10
+ "required": [
11
+ "schemaId"
12
+ ],
13
+ "properties": {
14
+ "schemaId": {
15
+ "type": "string",
16
+ "description": "The permanent NSID identifier for the schema (the {NSID} part of the rkey {NSID}@{semver})",
17
+ "maxLength": 500
18
+ }
19
+ }
20
+ },
21
+ "output": {
22
+ "encoding": "application/json",
23
+ "schema": {
24
+ "type": "object",
25
+ "required": [
26
+ "uri",
27
+ "version",
28
+ "record"
29
+ ],
30
+ "properties": {
31
+ "uri": {
32
+ "type": "string",
33
+ "description": "AT-URI of the latest schema version",
34
+ "maxLength": 500
35
+ },
36
+ "version": {
37
+ "type": "string",
38
+ "description": "Semantic version of the latest schema",
39
+ "maxLength": 20
40
+ },
41
+ "record": {
42
+ "type": "ref",
43
+ "ref": "ac.foundation.dataset.schema",
44
+ "description": "The full schema record"
45
+ },
46
+ "allVersions": {
47
+ "type": "array",
48
+ "description": "All available versions (optional, sorted by semver descending)",
49
+ "items": {
50
+ "type": "object",
51
+ "required": [
52
+ "uri",
53
+ "version"
54
+ ],
55
+ "properties": {
56
+ "uri": {
57
+ "type": "string",
58
+ "maxLength": 500
59
+ },
60
+ "version": {
61
+ "type": "string",
62
+ "maxLength": 20
63
+ }
64
+ }
65
+ }
66
+ }
67
+ }
68
+ }
69
+ },
70
+ "errors": [
71
+ {
72
+ "name": "SchemaNotFound",
73
+ "description": "No schema found with the given NSID"
74
+ }
75
+ ]
76
+ }
77
+ }
78
+ }
@@ -0,0 +1,99 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.lens",
4
+ "defs": {
5
+ "main": {
6
+ "type": "record",
7
+ "description": "Bidirectional transformation (Lens) between two sample types, with code stored in external repositories",
8
+ "key": "tid",
9
+ "record": {
10
+ "type": "object",
11
+ "required": [
12
+ "name",
13
+ "sourceSchema",
14
+ "targetSchema",
15
+ "getterCode",
16
+ "putterCode",
17
+ "createdAt"
18
+ ],
19
+ "properties": {
20
+ "name": {
21
+ "type": "string",
22
+ "description": "Human-readable lens name",
23
+ "maxLength": 100
24
+ },
25
+ "sourceSchema": {
26
+ "type": "string",
27
+ "description": "AT-URI reference to source schema",
28
+ "maxLength": 500
29
+ },
30
+ "targetSchema": {
31
+ "type": "string",
32
+ "description": "AT-URI reference to target schema",
33
+ "maxLength": 500
34
+ },
35
+ "description": {
36
+ "type": "string",
37
+ "description": "What this transformation does",
38
+ "maxLength": 1000
39
+ },
40
+ "getterCode": {
41
+ "type": "ref",
42
+ "ref": "#codeReference",
43
+ "description": "Code reference for getter function (Source -> Target)"
44
+ },
45
+ "putterCode": {
46
+ "type": "ref",
47
+ "ref": "#codeReference",
48
+ "description": "Code reference for putter function (Target, Source -> Source)"
49
+ },
50
+ "language": {
51
+ "type": "string",
52
+ "description": "Programming language of the lens implementation (e.g., 'python', 'typescript')",
53
+ "maxLength": 50
54
+ },
55
+ "metadata": {
56
+ "type": "object",
57
+ "description": "Arbitrary metadata (author, performance notes, etc.)"
58
+ },
59
+ "createdAt": {
60
+ "type": "string",
61
+ "format": "datetime",
62
+ "description": "Timestamp when this lens was created"
63
+ }
64
+ }
65
+ }
66
+ },
67
+ "codeReference": {
68
+ "type": "object",
69
+ "description": "Reference to code in an external repository (GitHub, tangled.org, etc.)",
70
+ "required": [
71
+ "repository",
72
+ "commit",
73
+ "path"
74
+ ],
75
+ "properties": {
76
+ "repository": {
77
+ "type": "string",
78
+ "description": "Repository URL (e.g., 'https://github.com/user/repo' or 'at://did/tangled.repo/...')",
79
+ "maxLength": 500
80
+ },
81
+ "commit": {
82
+ "type": "string",
83
+ "description": "Git commit hash (ensures immutability)",
84
+ "maxLength": 40
85
+ },
86
+ "path": {
87
+ "type": "string",
88
+ "description": "Path to function within repository (e.g., 'lenses/vision.py:rgb_to_grayscale')",
89
+ "maxLength": 500
90
+ },
91
+ "branch": {
92
+ "type": "string",
93
+ "description": "Optional branch name (for reference, commit hash is authoritative)",
94
+ "maxLength": 100
95
+ }
96
+ }
97
+ }
98
+ }
99
+ }
@@ -0,0 +1,96 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.record",
4
+ "defs": {
5
+ "main": {
6
+ "type": "record",
7
+ "description": "Index record for a WebDataset-backed dataset with references to storage location and sample schema",
8
+ "key": "tid",
9
+ "record": {
10
+ "type": "object",
11
+ "required": [
12
+ "name",
13
+ "schemaRef",
14
+ "storage",
15
+ "createdAt"
16
+ ],
17
+ "properties": {
18
+ "name": {
19
+ "type": "string",
20
+ "description": "Human-readable dataset name",
21
+ "maxLength": 200
22
+ },
23
+ "schemaRef": {
24
+ "type": "string",
25
+ "format": "at-uri",
26
+ "description": "AT-URI reference to the schema record for this dataset's samples",
27
+ "maxLength": 500
28
+ },
29
+ "storage": {
30
+ "type": "union",
31
+ "description": "Storage location for dataset files (WebDataset tar archives)",
32
+ "refs": [
33
+ "ac.foundation.dataset.storageExternal",
34
+ "ac.foundation.dataset.storageBlobs"
35
+ ]
36
+ },
37
+ "description": {
38
+ "type": "string",
39
+ "description": "Human-readable description of the dataset",
40
+ "maxLength": 5000
41
+ },
42
+ "metadata": {
43
+ "type": "bytes",
44
+ "description": "Msgpack-encoded metadata dict for arbitrary extended key-value pairs. Use this for additional metadata beyond the core top-level fields (license, tags, size). Top-level fields are preferred for discoverable/searchable metadata.",
45
+ "maxLength": 100000
46
+ },
47
+ "tags": {
48
+ "type": "array",
49
+ "description": "Searchable tags for dataset discovery. Aligns with Schema.org keywords property.",
50
+ "items": {
51
+ "type": "string",
52
+ "maxLength": 150
53
+ },
54
+ "maxLength": 30
55
+ },
56
+ "size": {
57
+ "type": "ref",
58
+ "ref": "#datasetSize",
59
+ "description": "Dataset size information (optional)"
60
+ },
61
+ "license": {
62
+ "type": "string",
63
+ "description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
64
+ "maxLength": 200
65
+ },
66
+ "createdAt": {
67
+ "type": "string",
68
+ "format": "datetime",
69
+ "description": "Timestamp when this dataset record was created"
70
+ }
71
+ }
72
+ }
73
+ },
74
+ "datasetSize": {
75
+ "type": "object",
76
+ "description": "Information about dataset size",
77
+ "properties": {
78
+ "samples": {
79
+ "type": "integer",
80
+ "description": "Total number of samples in the dataset",
81
+ "minimum": 0
82
+ },
83
+ "bytes": {
84
+ "type": "integer",
85
+ "description": "Total size in bytes",
86
+ "minimum": 0
87
+ },
88
+ "shards": {
89
+ "type": "integer",
90
+ "description": "Number of WebDataset shards",
91
+ "minimum": 1
92
+ }
93
+ }
94
+ }
95
+ }
96
+ }
@@ -0,0 +1,107 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.schema",
4
+ "defs": {
5
+ "main": {
6
+ "type": "record",
7
+ "description": "Definition of a PackableSample-compatible sample type. Supports versioning via rkey format: {NSID}@{semver}. Schema format is extensible via union type.",
8
+ "key": "any",
9
+ "record": {
10
+ "type": "object",
11
+ "required": [
12
+ "name",
13
+ "version",
14
+ "schemaType",
15
+ "schema",
16
+ "createdAt"
17
+ ],
18
+ "properties": {
19
+ "name": {
20
+ "type": "string",
21
+ "description": "Human-readable display name for this sample type. Used for documentation and UI. The NSID in the record URI provides unique identification; name collisions across NSIDs are acceptable.",
22
+ "maxLength": 100
23
+ },
24
+ "version": {
25
+ "type": "string",
26
+ "description": "Semantic version (e.g., '1.0.0')",
27
+ "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$",
28
+ "maxLength": 100
29
+ },
30
+ "schemaType": {
31
+ "type": "ref",
32
+ "ref": "ac.foundation.dataset.schemaType",
33
+ "description": "Type of schema definition. This field indicates which union member is present in the schema field."
34
+ },
35
+ "schema": {
36
+ "type": "union",
37
+ "refs": ["ac.foundation.dataset.schema#jsonSchemaFormat"],
38
+ "closed": false,
39
+ "description": "Schema definition for this sample type. Currently supports JSON Schema Draft 7. Union allows for future schema formats (Avro, Protobuf, etc.) without breaking changes."
40
+ },
41
+ "description": {
42
+ "type": "string",
43
+ "description": "Human-readable description of what this sample type represents",
44
+ "maxLength": 5000
45
+ },
46
+ "metadata": {
47
+ "type": "object",
48
+ "description": "Optional metadata about this schema. Common fields include license and tags, but any additional fields are permitted.",
49
+ "maxProperties": 50,
50
+ "properties": {
51
+ "license": {
52
+ "type": "string",
53
+ "description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
54
+ "maxLength": 200
55
+ },
56
+ "tags": {
57
+ "type": "array",
58
+ "description": "Categorization keywords for discovery. Aligns with Schema.org keywords property.",
59
+ "items": {
60
+ "type": "string",
61
+ "maxLength": 150
62
+ },
63
+ "maxLength": 30
64
+ }
65
+ }
66
+ },
67
+ "createdAt": {
68
+ "type": "string",
69
+ "format": "datetime",
70
+ "description": "Timestamp when this schema version was created. Immutable once set (ATProto records are permanent)."
71
+ }
72
+ }
73
+ }
74
+ },
75
+ "jsonSchemaFormat": {
76
+ "type": "object",
77
+ "description": "JSON Schema Draft 7 format for sample type definitions. Used with NDArray shim for array types.",
78
+ "required": ["$type", "$schema", "type", "properties"],
79
+ "properties": {
80
+ "$type": {
81
+ "type": "string",
82
+ "const": "ac.foundation.dataset.schema#jsonSchemaFormat"
83
+ },
84
+ "$schema": {
85
+ "type": "string",
86
+ "const": "http://json-schema.org/draft-07/schema#",
87
+ "description": "JSON Schema version identifier"
88
+ },
89
+ "type": {
90
+ "type": "string",
91
+ "const": "object",
92
+ "description": "Sample types must be objects"
93
+ },
94
+ "properties": {
95
+ "type": "object",
96
+ "description": "Field definitions for the sample type",
97
+ "minProperties": 1
98
+ },
99
+ "arrayFormatVersions": {
100
+ "type": "object",
101
+ "description": "Mapping from array format identifiers to semantic versions. Keys are ac.foundation.dataset.arrayFormat values (e.g., 'ndarrayBytes'), values are semver strings (e.g., '1.0.0'). Foundation.ac maintains canonical shim schemas at https://foundation.ac/schemas/atdata-{format}-bytes/{version}/.",
102
+ "maxProperties": 10
103
+ }
104
+ }
105
+ }
106
+ }
107
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.schemaType",
4
+ "defs": {
5
+ "main": {
6
+ "type": "string",
7
+ "description": "Schema type identifier for atdata sample definitions. Known values correspond to token definitions in this Lexicon. New schema types can be added as tokens without breaking changes.",
8
+ "knownValues": ["jsonSchema"],
9
+ "maxLength": 50
10
+ },
11
+ "jsonSchema": {
12
+ "type": "token",
13
+ "description": "JSON Schema Draft 7 format for sample type definitions. When schemaType is 'jsonSchema', the schema field must contain an object conforming to ac.foundation.dataset.schema#jsonSchemaFormat."
14
+ }
15
+ }
16
+ }
@@ -0,0 +1,24 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.storageBlobs",
4
+ "defs": {
5
+ "main": {
6
+ "type": "object",
7
+ "description": "Storage via ATProto PDS blobs for WebDataset tar archives. Each blob contains one or more tar files. Used in ac.foundation.dataset.record storage union for maximum decentralization.",
8
+ "required": [
9
+ "blobs"
10
+ ],
11
+ "properties": {
12
+ "blobs": {
13
+ "type": "array",
14
+ "description": "Array of blob references for WebDataset tar files",
15
+ "items": {
16
+ "type": "blob",
17
+ "description": "Blob reference to a WebDataset tar archive"
18
+ },
19
+ "minLength": 1
20
+ }
21
+ }
22
+ }
23
+ }
24
+ }
@@ -0,0 +1,25 @@
1
+ {
2
+ "lexicon": 1,
3
+ "id": "ac.foundation.dataset.storageExternal",
4
+ "defs": {
5
+ "main": {
6
+ "type": "object",
7
+ "description": "External storage via URLs (S3, HTTP, IPFS, etc.) for WebDataset tar archives. URLs support brace notation for sharding (e.g., 'data-{000000..000099}.tar'). Used in ac.foundation.dataset.record storage union.",
8
+ "required": [
9
+ "urls"
10
+ ],
11
+ "properties": {
12
+ "urls": {
13
+ "type": "array",
14
+ "description": "WebDataset URLs with optional brace notation for sharded tar files",
15
+ "items": {
16
+ "type": "string",
17
+ "format": "uri",
18
+ "maxLength": 1000
19
+ },
20
+ "minLength": 1
21
+ }
22
+ }
23
+ }
24
+ }
25
+ }
@@ -0,0 +1,16 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "$id": "https://foundation.ac/schemas/atdata-ndarray-bytes/1.0.0",
4
+ "title": "ATDataNDArrayBytes",
5
+ "description": "Standard definition for numpy NDArray types in JSON Schema, compatible with atdata WebDataset serialization. This type's contents are interpreted as containing the raw bytes data for a serialized numpy NDArray, and serve as a marker for atdata-based code generation to use standard numpy types, rather than generated dataclasses.",
6
+ "version": "1.0.0",
7
+ "$defs": {
8
+ "ndarray": {
9
+ "type": "string",
10
+ "format": "byte",
11
+ "description": "Numpy array serialized using numpy `.npy` format via `np.save` (includes dtype and shape in binary header). When represented in JSON, this is a base64-encoded string. In msgpack, this is raw bytes.",
12
+ "contentEncoding": "base64",
13
+ "contentMediaType": "application/octet-stream"
14
+ }
15
+ }
16
+ }
atdata/local/__init__.py CHANGED
@@ -1,24 +1,22 @@
1
- """Local storage backend for atdata datasets.
1
+ """Backward-compatibility shim for atdata.local.
2
2
 
3
- Key classes:
3
+ .. deprecated::
4
+ Import from ``atdata.index`` and ``atdata.stores`` instead::
4
5
 
5
- - ``Index``: Unified index with pluggable providers (SQLite default),
6
- named repositories, and optional atmosphere backend.
7
- - ``LocalDatasetEntry``: Index entry with ATProto-compatible CIDs.
8
- - ``S3DataStore``: S3-compatible shard storage.
6
+ from atdata.index import Index, LocalDatasetEntry
7
+ from atdata.stores import S3DataStore, LocalDiskStore
9
8
  """
10
9
 
11
- from atdata.local._entry import (
10
+ from atdata.index import (
11
+ Index,
12
12
  LocalDatasetEntry,
13
13
  BasicIndexEntry,
14
- REDIS_KEY_DATASET_ENTRY,
15
- REDIS_KEY_SCHEMA,
16
- )
17
- from atdata.local._schema import (
18
14
  SchemaNamespace,
19
15
  SchemaFieldType,
20
16
  SchemaField,
21
17
  LocalSchemaRecord,
18
+ REDIS_KEY_DATASET_ENTRY,
19
+ REDIS_KEY_SCHEMA,
22
20
  _ATDATA_URI_PREFIX,
23
21
  _LEGACY_URI_PREFIX,
24
22
  _kind_str_for_sample_type,
@@ -29,8 +27,8 @@ from atdata.local._schema import (
29
27
  _python_type_to_field_type,
30
28
  _build_schema_record,
31
29
  )
32
- from atdata.local._index import Index
33
- from atdata.local._s3 import (
30
+ from atdata.stores import (
31
+ LocalDiskStore,
34
32
  S3DataStore,
35
33
  _s3_env,
36
34
  _s3_from_credentials,
@@ -44,6 +42,7 @@ from s3fs import S3FileSystem # noqa: F401 — re-exported for backward compat
44
42
 
45
43
  __all__ = [
46
44
  # Public API
45
+ "LocalDiskStore",
47
46
  "Index",
48
47
  "LocalDatasetEntry",
49
48
  "BasicIndexEntry",
@@ -2,8 +2,8 @@
2
2
 
3
3
  from atdata import Dataset
4
4
 
5
- from atdata.local._entry import LocalDatasetEntry
6
- from atdata.local._s3 import _s3_env, _s3_from_credentials, _create_s3_write_callbacks
5
+ from atdata.index._entry import LocalDatasetEntry
6
+ from atdata.stores._s3 import _s3_env, _s3_from_credentials, _create_s3_write_callbacks
7
7
 
8
8
  from pathlib import Path
9
9
  from uuid import uuid4
@@ -97,7 +97,7 @@ class Repo:
97
97
 
98
98
  #
99
99
 
100
- from atdata.local._index import Index
100
+ from atdata.index._index import Index
101
101
 
102
102
  self.index = Index(redis=redis)
103
103