atdata 0.2.3b1-py3-none-any.whl → 0.3.1b1-py3-none-any.whl
- atdata/.gitignore +1 -0
- atdata/__init__.py +39 -0
- atdata/_cid.py +0 -21
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +41 -15
- atdata/_hf_api.py +95 -11
- atdata/_logging.py +70 -0
- atdata/_protocols.py +77 -238
- atdata/_schema_codec.py +7 -6
- atdata/_stub_manager.py +5 -25
- atdata/_type_utils.py +28 -2
- atdata/atmosphere/__init__.py +31 -20
- atdata/atmosphere/_types.py +4 -4
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +11 -12
- atdata/atmosphere/records.py +12 -12
- atdata/atmosphere/schema.py +16 -18
- atdata/atmosphere/store.py +6 -7
- atdata/cli/__init__.py +161 -175
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +11 -11
- atdata/cli/inspect.py +69 -0
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +583 -328
- atdata/index/__init__.py +54 -0
- atdata/index/_entry.py +157 -0
- atdata/index/_index.py +1198 -0
- atdata/index/_schema.py +380 -0
- atdata/lens.py +9 -2
- atdata/lexicons/__init__.py +121 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
- atdata/lexicons/ac.foundation.dataset.record.json +96 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +70 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +18 -14
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +123 -0
- atdata/stores/_s3.py +349 -0
- atdata/testing.py +341 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +5 -2
- atdata-0.3.1b1.dist-info/RECORD +67 -0
- atdata/local.py +0 -1720
- atdata-0.2.3b1.dist-info/RECORD +0 -28
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0

atdata/lexicons/ac.foundation.dataset.lens.json
ADDED
@@ -0,0 +1,99 @@
+{
+  "lexicon": 1,
+  "id": "ac.foundation.dataset.lens",
+  "defs": {
+    "main": {
+      "type": "record",
+      "description": "Bidirectional transformation (Lens) between two sample types, with code stored in external repositories",
+      "key": "tid",
+      "record": {
+        "type": "object",
+        "required": [
+          "name",
+          "sourceSchema",
+          "targetSchema",
+          "getterCode",
+          "putterCode",
+          "createdAt"
+        ],
+        "properties": {
+          "name": {
+            "type": "string",
+            "description": "Human-readable lens name",
+            "maxLength": 100
+          },
+          "sourceSchema": {
+            "type": "string",
+            "description": "AT-URI reference to source schema",
+            "maxLength": 500
+          },
+          "targetSchema": {
+            "type": "string",
+            "description": "AT-URI reference to target schema",
+            "maxLength": 500
+          },
+          "description": {
+            "type": "string",
+            "description": "What this transformation does",
+            "maxLength": 1000
+          },
+          "getterCode": {
+            "type": "ref",
+            "ref": "#codeReference",
+            "description": "Code reference for getter function (Source -> Target)"
+          },
+          "putterCode": {
+            "type": "ref",
+            "ref": "#codeReference",
+            "description": "Code reference for putter function (Target, Source -> Source)"
+          },
+          "language": {
+            "type": "string",
+            "description": "Programming language of the lens implementation (e.g., 'python', 'typescript')",
+            "maxLength": 50
+          },
+          "metadata": {
+            "type": "object",
+            "description": "Arbitrary metadata (author, performance notes, etc.)"
+          },
+          "createdAt": {
+            "type": "string",
+            "format": "datetime",
+            "description": "Timestamp when this lens was created"
+          }
+        }
+      }
+    },
+    "codeReference": {
+      "type": "object",
+      "description": "Reference to code in an external repository (GitHub, tangled.org, etc.)",
+      "required": [
+        "repository",
+        "commit",
+        "path"
+      ],
+      "properties": {
+        "repository": {
+          "type": "string",
+          "description": "Repository URL (e.g., 'https://github.com/user/repo' or 'at://did/tangled.repo/...')",
+          "maxLength": 500
+        },
+        "commit": {
+          "type": "string",
+          "description": "Git commit hash (ensures immutability)",
+          "maxLength": 40
+        },
+        "path": {
+          "type": "string",
+          "description": "Path to function within repository (e.g., 'lenses/vision.py:rgb_to_grayscale')",
+          "maxLength": 500
+        },
+        "branch": {
+          "type": "string",
+          "description": "Optional branch name (for reference, commit hash is authoritative)",
+          "maxLength": 100
+        }
+      }
+    }
+  }
+}
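
For orientation, a minimal sketch of a record value that would satisfy this lexicon. All concrete values (the DID, repository URL, commit hash, and function paths) are illustrative, not taken from the package:

    from datetime import datetime, timezone

    lens_record = {
        "$type": "ac.foundation.dataset.lens",
        "name": "rgb-to-grayscale",
        "sourceSchema": "at://did:plc:example/ac.foundation.dataset.schema/com.example.rgb@1.0.0",
        "targetSchema": "at://did:plc:example/ac.foundation.dataset.schema/com.example.gray@1.0.0",
        "getterCode": {  # Source -> Target
            "repository": "https://github.com/user/repo",
            "commit": "0123456789abcdef0123456789abcdef01234567",  # full hash pins the code
            "path": "lenses/vision.py:rgb_to_grayscale",
        },
        "putterCode": {  # (Target, Source) -> Source
            "repository": "https://github.com/user/repo",
            "commit": "0123456789abcdef0123456789abcdef01234567",
            "path": "lenses/vision.py:grayscale_to_rgb",
        },
        "language": "python",
        "createdAt": datetime.now(timezone.utc).isoformat(),
    }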

atdata/lexicons/ac.foundation.dataset.record.json
ADDED
@@ -0,0 +1,96 @@
+{
+  "lexicon": 1,
+  "id": "ac.foundation.dataset.record",
+  "defs": {
+    "main": {
+      "type": "record",
+      "description": "Index record for a WebDataset-backed dataset with references to storage location and sample schema",
+      "key": "tid",
+      "record": {
+        "type": "object",
+        "required": [
+          "name",
+          "schemaRef",
+          "storage",
+          "createdAt"
+        ],
+        "properties": {
+          "name": {
+            "type": "string",
+            "description": "Human-readable dataset name",
+            "maxLength": 200
+          },
+          "schemaRef": {
+            "type": "string",
+            "format": "at-uri",
+            "description": "AT-URI reference to the schema record for this dataset's samples",
+            "maxLength": 500
+          },
+          "storage": {
+            "type": "union",
+            "description": "Storage location for dataset files (WebDataset tar archives)",
+            "refs": [
+              "ac.foundation.dataset.storageExternal",
+              "ac.foundation.dataset.storageBlobs"
+            ]
+          },
+          "description": {
+            "type": "string",
+            "description": "Human-readable description of the dataset",
+            "maxLength": 5000
+          },
+          "metadata": {
+            "type": "bytes",
+            "description": "Msgpack-encoded metadata dict for arbitrary extended key-value pairs. Use this for additional metadata beyond the core top-level fields (license, tags, size). Top-level fields are preferred for discoverable/searchable metadata.",
+            "maxLength": 100000
+          },
+          "tags": {
+            "type": "array",
+            "description": "Searchable tags for dataset discovery. Aligns with Schema.org keywords property.",
+            "items": {
+              "type": "string",
+              "maxLength": 150
+            },
+            "maxLength": 30
+          },
+          "size": {
+            "type": "ref",
+            "ref": "#datasetSize",
+            "description": "Dataset size information (optional)"
+          },
+          "license": {
+            "type": "string",
+            "description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
+            "maxLength": 200
+          },
+          "createdAt": {
+            "type": "string",
+            "format": "datetime",
+            "description": "Timestamp when this dataset record was created"
+          }
+        }
+      }
+    },
+    "datasetSize": {
+      "type": "object",
+      "description": "Information about dataset size",
+      "properties": {
+        "samples": {
+          "type": "integer",
+          "description": "Total number of samples in the dataset",
+          "minimum": 0
+        },
+        "bytes": {
+          "type": "integer",
+          "description": "Total size in bytes",
+          "minimum": 0
+        },
+        "shards": {
+          "type": "integer",
+          "description": "Number of WebDataset shards",
+          "minimum": 1
+        }
+      }
+    }
+  }
+}
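
A sketch of a record value under this lexicon, using the storageExternal union member; names and URLs are illustrative. Note that metadata is msgpack-encoded bytes, while tags, license, and size stay top-level for discoverability:

    from datetime import datetime, timezone

    import msgpack

    dataset_record = {
        "$type": "ac.foundation.dataset.record",
        "name": "my-dataset",
        "schemaRef": "at://did:plc:example/ac.foundation.dataset.schema/com.example.sample@1.0.0",
        "storage": {
            "$type": "ac.foundation.dataset.storageExternal",
            "urls": ["https://example.com/data-{000000..000099}.tar"],
        },
        "tags": ["vision", "demo"],
        "license": "CC-BY-4.0",
        "size": {"samples": 100_000, "bytes": 12_345_678, "shards": 100},
        # Extended key-value pairs beyond the core fields go in msgpack bytes.
        "metadata": msgpack.packb({"author": "someone@example.com"}),
        "createdAt": datetime.now(timezone.utc).isoformat(),
    }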

atdata/lexicons/ac.foundation.dataset.schema.json
ADDED
@@ -0,0 +1,107 @@
+{
+  "lexicon": 1,
+  "id": "ac.foundation.dataset.schema",
+  "defs": {
+    "main": {
+      "type": "record",
+      "description": "Definition of a PackableSample-compatible sample type. Supports versioning via rkey format: {NSID}@{semver}. Schema format is extensible via union type.",
+      "key": "any",
+      "record": {
+        "type": "object",
+        "required": [
+          "name",
+          "version",
+          "schemaType",
+          "schema",
+          "createdAt"
+        ],
+        "properties": {
+          "name": {
+            "type": "string",
+            "description": "Human-readable display name for this sample type. Used for documentation and UI. The NSID in the record URI provides unique identification; name collisions across NSIDs are acceptable.",
+            "maxLength": 100
+          },
+          "version": {
+            "type": "string",
+            "description": "Semantic version (e.g., '1.0.0')",
+            "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$",
+            "maxLength": 100
+          },
+          "schemaType": {
+            "type": "ref",
+            "ref": "ac.foundation.dataset.schemaType",
+            "description": "Type of schema definition. This field indicates which union member is present in the schema field."
+          },
+          "schema": {
+            "type": "union",
+            "refs": ["ac.foundation.dataset.schema#jsonSchemaFormat"],
+            "closed": false,
+            "description": "Schema definition for this sample type. Currently supports JSON Schema Draft 7. Union allows for future schema formats (Avro, Protobuf, etc.) without breaking changes."
+          },
+          "description": {
+            "type": "string",
+            "description": "Human-readable description of what this sample type represents",
+            "maxLength": 5000
+          },
+          "metadata": {
+            "type": "object",
+            "description": "Optional metadata about this schema. Common fields include license and tags, but any additional fields are permitted.",
+            "maxProperties": 50,
+            "properties": {
+              "license": {
+                "type": "string",
+                "description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
+                "maxLength": 200
+              },
+              "tags": {
+                "type": "array",
+                "description": "Categorization keywords for discovery. Aligns with Schema.org keywords property.",
+                "items": {
+                  "type": "string",
+                  "maxLength": 150
+                },
+                "maxLength": 30
+              }
+            }
+          },
+          "createdAt": {
+            "type": "string",
+            "format": "datetime",
+            "description": "Timestamp when this schema version was created. Immutable once set (ATProto records are permanent)."
+          }
+        }
+      }
+    },
+    "jsonSchemaFormat": {
+      "type": "object",
+      "description": "JSON Schema Draft 7 format for sample type definitions. Used with NDArray shim for array types.",
+      "required": ["$type", "$schema", "type", "properties"],
+      "properties": {
+        "$type": {
+          "type": "string",
+          "const": "ac.foundation.dataset.schema#jsonSchemaFormat"
+        },
+        "$schema": {
+          "type": "string",
+          "const": "http://json-schema.org/draft-07/schema#",
+          "description": "JSON Schema version identifier"
+        },
+        "type": {
+          "type": "string",
+          "const": "object",
+          "description": "Sample types must be objects"
+        },
+        "properties": {
+          "type": "object",
+          "description": "Field definitions for the sample type",
+          "minProperties": 1
+        },
+        "arrayFormatVersions": {
+          "type": "object",
+          "description": "Mapping from array format identifiers to semantic versions. Keys are ac.foundation.dataset.arrayFormat values (e.g., 'ndarrayBytes'), values are semver strings (e.g., '1.0.0'). Foundation.ac maintains canonical shim schemas at https://foundation.ac/schemas/atdata-{format}-bytes/{version}/.",
+          "maxProperties": 10
+        }
+      }
+    }
+  }
+}
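
A sketch of a schema record as this lexicon describes it: the rkey would take the {NSID}@{semver} form (e.g. com.example.imageSample@1.0.0), and the schema field carries the jsonSchemaFormat union member. The $ref target follows the foundation.ac shim URL pattern named above; everything beyond the lexicon's field names is illustrative:

    from datetime import datetime, timezone

    schema_record = {
        "$type": "ac.foundation.dataset.schema",
        "name": "ImageSample",
        "version": "1.0.0",
        "schemaType": "jsonSchema",
        "schema": {
            "$type": "ac.foundation.dataset.schema#jsonSchemaFormat",
            "$schema": "http://json-schema.org/draft-07/schema#",
            "type": "object",
            "properties": {
                "image": {"$ref": "https://foundation.ac/schemas/atdata-ndarray-bytes/1.0.0#/$defs/ndarray"},
                "label": {"type": "integer"},
            },
            "arrayFormatVersions": {"ndarrayBytes": "1.0.0"},
        },
        "createdAt": datetime.now(timezone.utc).isoformat(),
    }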

atdata/lexicons/ac.foundation.dataset.schemaType.json
ADDED
@@ -0,0 +1,16 @@
+{
+  "lexicon": 1,
+  "id": "ac.foundation.dataset.schemaType",
+  "defs": {
+    "main": {
+      "type": "string",
+      "description": "Schema type identifier for atdata sample definitions. Known values correspond to token definitions in this Lexicon. New schema types can be added as tokens without breaking changes.",
+      "knownValues": ["jsonSchema"],
+      "maxLength": 50
+    },
+    "jsonSchema": {
+      "type": "token",
+      "description": "JSON Schema Draft 7 format for sample type definitions. When schemaType is 'jsonSchema', the schema field must contain an object conforming to ac.foundation.dataset.schema#jsonSchemaFormat."
+    }
+  }
+}
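
knownValues makes this an open enum: validators accept strings beyond the listed tokens, so consumers should dispatch on the value rather than hard-fail at validation time. A minimal sketch of that pattern (decode_schema is a hypothetical helper, not part of the package):

    def decode_schema(record: dict) -> dict:
        """Dispatch on schemaType; tolerate tokens this client does not know yet."""
        schema_type = record["schemaType"]
        if schema_type == "jsonSchema":
            # Union member: ac.foundation.dataset.schema#jsonSchemaFormat
            return record["schema"]
        # A future token added without a breaking change; fail at use, not at parse.
        raise NotImplementedError(f"unsupported schemaType: {schema_type!r}")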

atdata/lexicons/ac.foundation.dataset.storageBlobs.json
ADDED
@@ -0,0 +1,24 @@
+{
+  "lexicon": 1,
+  "id": "ac.foundation.dataset.storageBlobs",
+  "defs": {
+    "main": {
+      "type": "object",
+      "description": "Storage via ATProto PDS blobs for WebDataset tar archives. Each blob contains one or more tar files. Used in ac.foundation.dataset.record storage union for maximum decentralization.",
+      "required": [
+        "blobs"
+      ],
+      "properties": {
+        "blobs": {
+          "type": "array",
+          "description": "Array of blob references for WebDataset tar files",
+          "items": {
+            "type": "blob",
+            "description": "Blob reference to a WebDataset tar archive"
+          },
+          "minLength": 1
+        }
+      }
+    }
+  }
+}
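
A sketch of a storageBlobs value. In ATProto JSON, each item would take the standard blob-reference form; the CID, MIME type, and size here are placeholders:

    storage_blobs = {
        "$type": "ac.foundation.dataset.storageBlobs",
        "blobs": [
            {
                "$type": "blob",
                "ref": {"$link": "bafyrei…"},  # placeholder CID of the uploaded tar
                "mimeType": "application/x-tar",
                "size": 104857600,
            },
        ],
    }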

atdata/lexicons/ac.foundation.dataset.storageExternal.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "lexicon": 1,
+  "id": "ac.foundation.dataset.storageExternal",
+  "defs": {
+    "main": {
+      "type": "object",
+      "description": "External storage via URLs (S3, HTTP, IPFS, etc.) for WebDataset tar archives. URLs support brace notation for sharding (e.g., 'data-{000000..000099}.tar'). Used in ac.foundation.dataset.record storage union.",
+      "required": [
+        "urls"
+      ],
+      "properties": {
+        "urls": {
+          "type": "array",
+          "description": "WebDataset URLs with optional brace notation for sharded tar files",
+          "items": {
+            "type": "string",
+            "format": "uri",
+            "maxLength": 1000
+          },
+          "minLength": 1
+        }
+      }
+    }
+  }
+}
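
The brace notation mentioned in the description expands to one URL per shard. A quick sketch using the braceexpand package (a webdataset dependency); the URL is illustrative:

    from braceexpand import braceexpand

    urls = list(braceexpand("https://example.com/data-{000000..000002}.tar"))
    # ['https://example.com/data-000000.tar',
    #  'https://example.com/data-000001.tar',
    #  'https://example.com/data-000002.tar']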

atdata/lexicons/ndarray_shim.json
ADDED
@@ -0,0 +1,16 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://foundation.ac/schemas/atdata-ndarray-bytes/1.0.0",
+  "title": "ATDataNDArrayBytes",
+  "description": "Standard definition for numpy NDArray types in JSON Schema, compatible with atdata WebDataset serialization. This type's contents are interpreted as containing the raw bytes data for a serialized numpy NDArray, and serve as a marker for atdata-based code generation to use standard numpy types, rather than generated dataclasses.",
+  "version": "1.0.0",
+  "$defs": {
+    "ndarray": {
+      "type": "string",
+      "format": "byte",
+      "description": "Numpy array serialized using numpy `.npy` format via `np.save` (includes dtype and shape in binary header). When represented in JSON, this is a base64-encoded string. In msgpack, this is raw bytes.",
+      "contentEncoding": "base64",
+      "contentMediaType": "application/octet-stream"
+    }
+  }
+}
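
A round-trip matching this shim's description: the array is serialized to .npy bytes with np.save (dtype and shape travel in the binary header), carried as raw bytes in msgpack or as base64 text in JSON:

    import base64
    import io

    import numpy as np

    arr = np.arange(6, dtype=np.float32).reshape(2, 3)

    buf = io.BytesIO()
    np.save(buf, arr)                          # .npy format: header + raw data
    raw = buf.getvalue()                       # msgpack carries these raw bytes
    encoded = base64.b64encode(raw).decode()   # JSON carries this base64 string

    restored = np.load(io.BytesIO(base64.b64decode(encoded)))
    assert np.array_equal(restored, arr) and restored.dtype == arr.dtype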

atdata/local/__init__.py
ADDED
@@ -0,0 +1,70 @@
+"""Backward-compatibility shim for atdata.local.
+
+.. deprecated::
+    Import from ``atdata.index`` and ``atdata.stores`` instead::
+
+        from atdata.index import Index, LocalDatasetEntry
+        from atdata.stores import S3DataStore, LocalDiskStore
+"""
+
+from atdata.index import (
+    Index,
+    LocalDatasetEntry,
+    BasicIndexEntry,
+    SchemaNamespace,
+    SchemaFieldType,
+    SchemaField,
+    LocalSchemaRecord,
+    REDIS_KEY_DATASET_ENTRY,
+    REDIS_KEY_SCHEMA,
+    _ATDATA_URI_PREFIX,
+    _LEGACY_URI_PREFIX,
+    _kind_str_for_sample_type,
+    _schema_ref_from_type,
+    _make_schema_ref,
+    _parse_schema_ref,
+    _increment_patch,
+    _python_type_to_field_type,
+    _build_schema_record,
+)
+from atdata.stores import (
+    LocalDiskStore,
+    S3DataStore,
+    _s3_env,
+    _s3_from_credentials,
+    _create_s3_write_callbacks,
+)
+from atdata.local._repo_legacy import Repo
+
+# Re-export third-party types that were previously importable from the
+# monolithic local.py (tests reference atdata.local.S3FileSystem, etc.)
+from s3fs import S3FileSystem  # noqa: F401 — re-exported for backward compat
+
+__all__ = [
+    # Public API
+    "LocalDiskStore",
+    "Index",
+    "LocalDatasetEntry",
+    "BasicIndexEntry",
+    "S3DataStore",
+    "Repo",
+    "SchemaNamespace",
+    "SchemaFieldType",
+    "SchemaField",
+    "LocalSchemaRecord",
+    "REDIS_KEY_DATASET_ENTRY",
+    "REDIS_KEY_SCHEMA",
+    # Internal helpers (re-exported for backward compatibility)
+    "_ATDATA_URI_PREFIX",
+    "_LEGACY_URI_PREFIX",
+    "_kind_str_for_sample_type",
+    "_schema_ref_from_type",
+    "_make_schema_ref",
+    "_parse_schema_ref",
+    "_increment_patch",
+    "_python_type_to_field_type",
+    "_build_schema_record",
+    "_s3_env",
+    "_s3_from_credentials",
+    "_create_s3_write_callbacks",
+]
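
The shim keeps the old flat imports resolvable; new code should target the split modules directly, per the module docstring:

    # Old import path (still resolvable through the shim):
    #     from atdata.local import Index, S3DataStore, LocalDiskStore
    # Preferred import paths after the split:
    from atdata.index import Index
    from atdata.stores import S3DataStore, LocalDiskStore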

atdata/local/_repo_legacy.py
ADDED
@@ -0,0 +1,218 @@
+"""Deprecated Repo class for legacy S3 repository operations."""
+
+from atdata import Dataset
+
+from atdata.index._entry import LocalDatasetEntry
+from atdata.stores._s3 import _s3_env, _s3_from_credentials, _create_s3_write_callbacks
+
+from pathlib import Path
+from uuid import uuid4
+from tempfile import TemporaryDirectory
+from typing import Any, BinaryIO, TypeVar, cast
+
+from redis import Redis
+import msgpack
+import webdataset as wds
+import warnings
+
+from atdata._protocols import Packable
+
+T = TypeVar("T", bound=Packable)
+
+
+class Repo:
+    """Repository for storing and managing atdata datasets.
+
+    .. deprecated::
+        Use :class:`Index` with :class:`S3DataStore` instead::
+
+            store = S3DataStore(credentials, bucket="my-bucket")
+            index = Index(redis=redis, data_store=store)
+            entry = index.insert_dataset(ds, name="my-dataset")
+
+    Provides storage of datasets in S3-compatible object storage with Redis-based
+    indexing. Datasets are stored as WebDataset tar files with optional metadata.
+
+    Attributes:
+        s3_credentials: S3 credentials dictionary or None.
+        bucket_fs: S3FileSystem instance or None.
+        hive_path: Path within S3 bucket for storing datasets.
+        hive_bucket: Name of the S3 bucket.
+        index: Index instance for tracking datasets.
+    """
+
+    ##
+
+    def __init__(
+        self,
+        s3_credentials: str | Path | dict[str, Any] | None = None,
+        hive_path: str | Path | None = None,
+        redis: Redis | None = None,
+    ) -> None:
+        """Initialize a repository.
+
+        .. deprecated::
+            Use Index with S3DataStore instead.
+
+        Args:
+            s3_credentials: Path to .env file with S3 credentials, or dict with
+                AWS_ENDPOINT, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY.
+                If None, S3 functionality will be disabled.
+            hive_path: Path within the S3 bucket to store datasets.
+                Required if s3_credentials is provided.
+            redis: Redis connection for indexing. If None, creates a new connection.
+
+        Raises:
+            ValueError: If hive_path is not provided when s3_credentials is set.
+        """
+        warnings.warn(
+            "Repo is deprecated. Use Index with S3DataStore instead:\n"
+            "  store = S3DataStore(credentials, bucket='my-bucket')\n"
+            "  index = Index(redis=redis, data_store=store)\n"
+            "  entry = index.insert_dataset(ds, name='my-dataset')",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+        if s3_credentials is None:
+            self.s3_credentials = None
+        elif isinstance(s3_credentials, dict):
+            self.s3_credentials = s3_credentials
+        else:
+            self.s3_credentials = _s3_env(s3_credentials)
+
+        if self.s3_credentials is None:
+            self.bucket_fs = None
+        else:
+            self.bucket_fs = _s3_from_credentials(self.s3_credentials)
+
+        if self.bucket_fs is not None:
+            if hive_path is None:
+                raise ValueError("Must specify hive path within bucket")
+            self.hive_path = Path(hive_path)
+            self.hive_bucket = self.hive_path.parts[0]
+        else:
+            self.hive_path = None
+            self.hive_bucket = None
+
+        #
+
+        from atdata.index._index import Index
+
+        self.index = Index(redis=redis)
+
+    ##
+
+    def insert(
+        self,
+        ds: Dataset[T],
+        *,
+        name: str,
+        cache_local: bool = False,
+        schema_ref: str | None = None,
+        **kwargs,
+    ) -> tuple[LocalDatasetEntry, Dataset[T]]:
+        """Insert a dataset into the repository.
+
+        Writes the dataset to S3 as WebDataset tar files, stores metadata,
+        and creates an index entry in Redis.
+
+        Args:
+            ds: The dataset to insert.
+            name: Human-readable name for the dataset.
+            cache_local: If True, write to local temporary storage first, then
+                copy to S3. This can be faster for some workloads.
+            schema_ref: Optional schema reference. If None, generates from sample type.
+            **kwargs: Additional arguments passed to wds.ShardWriter.
+
+        Returns:
+            A tuple of (index_entry, new_dataset) where:
+            - index_entry: LocalDatasetEntry for the stored dataset
+            - new_dataset: Dataset object pointing to the stored copy
+
+        Raises:
+            ValueError: If S3 credentials or hive_path are not configured.
+            RuntimeError: If no shards were written.
+        """
+        if self.s3_credentials is None:
+            raise ValueError(
+                "S3 credentials required for insert(). Initialize Repo with s3_credentials."
+            )
+        if self.hive_bucket is None or self.hive_path is None:
+            raise ValueError(
+                "hive_path required for insert(). Initialize Repo with hive_path."
+            )
+
+        new_uuid = str(uuid4())
+
+        hive_fs = _s3_from_credentials(self.s3_credentials)
+
+        # Write metadata
+        metadata_path = (
+            self.hive_path / "metadata" / f"atdata-metadata--{new_uuid}.msgpack"
+        )
+        # Note: S3 doesn't need directories created beforehand - s3fs handles this
+
+        if ds.metadata is not None:
+            # Use s3:// prefix to ensure s3fs treats this as an S3 path
+            with cast(
+                BinaryIO, hive_fs.open(f"s3://{metadata_path.as_posix()}", "wb")
+            ) as f:
+                meta_packed = msgpack.packb(ds.metadata)
+                f.write(cast(bytes, meta_packed))
+
+        # Write data
+        shard_pattern = (self.hive_path / f"atdata--{new_uuid}--%06d.tar").as_posix()
+
+        written_shards: list[str] = []
+        with TemporaryDirectory() as temp_dir:
+            writer_opener, writer_post = _create_s3_write_callbacks(
+                credentials=self.s3_credentials,
+                temp_dir=temp_dir,
+                written_shards=written_shards,
+                fs=hive_fs,
+                cache_local=cache_local,
+                add_s3_prefix=False,
+            )
+
+            with wds.writer.ShardWriter(
+                shard_pattern,
+                opener=writer_opener,
+                post=writer_post,
+                **kwargs,
+            ) as sink:
+                for sample in ds.ordered(batch_size=None):
+                    sink.write(sample.as_wds)
+
+        # Make a new Dataset object for the written dataset copy
+        if len(written_shards) == 0:
+            raise RuntimeError(
+                "Cannot form new dataset entry -- did not write any shards"
+            )
+
+        elif len(written_shards) < 2:
+            new_dataset_url = (
+                self.hive_path / (Path(written_shards[0]).name)
+            ).as_posix()
+
+        else:
+            shard_s3_format = (
+                (self.hive_path / f"atdata--{new_uuid}").as_posix()
+            ) + "--{shard_id}.tar"
+            shard_id_braced = "{" + f"{0:06d}..{len(written_shards) - 1:06d}" + "}"
+            new_dataset_url = shard_s3_format.format(shard_id=shard_id_braced)
+
+        new_dataset = Dataset[ds.sample_type](
+            url=new_dataset_url,
+            metadata_url=metadata_path.as_posix(),
+        )
+
+        # Add to index (use ds._metadata to avoid network requests)
+        new_entry = self.index.add_entry(
+            new_dataset,
+            name=name,
+            schema_ref=schema_ref,
+            metadata=ds._metadata,
+        )
+
+        return new_entry, new_dataset
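
Migration sketch from the deprecated Repo to the replacement named in its warning. The Index/S3DataStore lines are quoted from the DeprecationWarning message; the credentials dict shape follows the __init__ docstring, Redis() stands in for an existing connection, and ds stands for an existing Dataset:

    from redis import Redis

    from atdata.index import Index
    from atdata.stores import S3DataStore

    credentials = {
        "AWS_ENDPOINT": "https://s3.example.com",   # placeholder values
        "AWS_ACCESS_KEY_ID": "<key-id>",
        "AWS_SECRET_ACCESS_KEY": "<secret>",
    }

    # Deprecated:
    #     repo = Repo(s3_credentials=credentials, hive_path="my-bucket/datasets")
    #     entry, stored = repo.insert(ds, name="my-dataset")

    # Replacement, as given in the DeprecationWarning:
    store = S3DataStore(credentials, bucket="my-bucket")
    index = Index(redis=Redis(), data_store=store)
    entry = index.insert_dataset(ds, name="my-dataset")  # ds: an existing atdata Dataset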