atdata-0.2.3b1-py3-none-any.whl → atdata-0.3.1b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +39 -0
  3. atdata/_cid.py +0 -21
  4. atdata/_exceptions.py +168 -0
  5. atdata/_helpers.py +41 -15
  6. atdata/_hf_api.py +95 -11
  7. atdata/_logging.py +70 -0
  8. atdata/_protocols.py +77 -238
  9. atdata/_schema_codec.py +7 -6
  10. atdata/_stub_manager.py +5 -25
  11. atdata/_type_utils.py +28 -2
  12. atdata/atmosphere/__init__.py +31 -20
  13. atdata/atmosphere/_types.py +4 -4
  14. atdata/atmosphere/client.py +64 -12
  15. atdata/atmosphere/lens.py +11 -12
  16. atdata/atmosphere/records.py +12 -12
  17. atdata/atmosphere/schema.py +16 -18
  18. atdata/atmosphere/store.py +6 -7
  19. atdata/cli/__init__.py +161 -175
  20. atdata/cli/diagnose.py +2 -2
  21. atdata/cli/{local.py → infra.py} +11 -11
  22. atdata/cli/inspect.py +69 -0
  23. atdata/cli/preview.py +63 -0
  24. atdata/cli/schema.py +109 -0
  25. atdata/dataset.py +583 -328
  26. atdata/index/__init__.py +54 -0
  27. atdata/index/_entry.py +157 -0
  28. atdata/index/_index.py +1198 -0
  29. atdata/index/_schema.py +380 -0
  30. atdata/lens.py +9 -2
  31. atdata/lexicons/__init__.py +121 -0
  32. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  34. atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
  35. atdata/lexicons/ac.foundation.dataset.record.json +96 -0
  36. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  37. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  38. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
  39. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  40. atdata/lexicons/ndarray_shim.json +16 -0
  41. atdata/local/__init__.py +70 -0
  42. atdata/local/_repo_legacy.py +218 -0
  43. atdata/manifest/__init__.py +28 -0
  44. atdata/manifest/_aggregates.py +156 -0
  45. atdata/manifest/_builder.py +163 -0
  46. atdata/manifest/_fields.py +154 -0
  47. atdata/manifest/_manifest.py +146 -0
  48. atdata/manifest/_query.py +150 -0
  49. atdata/manifest/_writer.py +74 -0
  50. atdata/promote.py +18 -14
  51. atdata/providers/__init__.py +25 -0
  52. atdata/providers/_base.py +140 -0
  53. atdata/providers/_factory.py +69 -0
  54. atdata/providers/_postgres.py +214 -0
  55. atdata/providers/_redis.py +171 -0
  56. atdata/providers/_sqlite.py +191 -0
  57. atdata/repository.py +323 -0
  58. atdata/stores/__init__.py +23 -0
  59. atdata/stores/_disk.py +123 -0
  60. atdata/stores/_s3.py +349 -0
  61. atdata/testing.py +341 -0
  62. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +5 -2
  63. atdata-0.3.1b1.dist-info/RECORD +67 -0
  64. atdata/local.py +0 -1720
  65. atdata-0.2.3b1.dist-info/RECORD +0 -28
  66. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
  67. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
  68. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
atdata/lexicons/ac.foundation.dataset.lens.json
@@ -0,0 +1,99 @@
+ {
+   "lexicon": 1,
+   "id": "ac.foundation.dataset.lens",
+   "defs": {
+     "main": {
+       "type": "record",
+       "description": "Bidirectional transformation (Lens) between two sample types, with code stored in external repositories",
+       "key": "tid",
+       "record": {
+         "type": "object",
+         "required": [
+           "name",
+           "sourceSchema",
+           "targetSchema",
+           "getterCode",
+           "putterCode",
+           "createdAt"
+         ],
+         "properties": {
+           "name": {
+             "type": "string",
+             "description": "Human-readable lens name",
+             "maxLength": 100
+           },
+           "sourceSchema": {
+             "type": "string",
+             "description": "AT-URI reference to source schema",
+             "maxLength": 500
+           },
+           "targetSchema": {
+             "type": "string",
+             "description": "AT-URI reference to target schema",
+             "maxLength": 500
+           },
+           "description": {
+             "type": "string",
+             "description": "What this transformation does",
+             "maxLength": 1000
+           },
+           "getterCode": {
+             "type": "ref",
+             "ref": "#codeReference",
+             "description": "Code reference for getter function (Source -> Target)"
+           },
+           "putterCode": {
+             "type": "ref",
+             "ref": "#codeReference",
+             "description": "Code reference for putter function (Target, Source -> Source)"
+           },
+           "language": {
+             "type": "string",
+             "description": "Programming language of the lens implementation (e.g., 'python', 'typescript')",
+             "maxLength": 50
+           },
+           "metadata": {
+             "type": "object",
+             "description": "Arbitrary metadata (author, performance notes, etc.)"
+           },
+           "createdAt": {
+             "type": "string",
+             "format": "datetime",
+             "description": "Timestamp when this lens was created"
+           }
+         }
+       }
+     },
+     "codeReference": {
+       "type": "object",
+       "description": "Reference to code in an external repository (GitHub, tangled.org, etc.)",
+       "required": [
+         "repository",
+         "commit",
+         "path"
+       ],
+       "properties": {
+         "repository": {
+           "type": "string",
+           "description": "Repository URL (e.g., 'https://github.com/user/repo' or 'at://did/tangled.repo/...')",
+           "maxLength": 500
+         },
+         "commit": {
+           "type": "string",
+           "description": "Git commit hash (ensures immutability)",
+           "maxLength": 40
+         },
+         "path": {
+           "type": "string",
+           "description": "Path to function within repository (e.g., 'lenses/vision.py:rgb_to_grayscale')",
+           "maxLength": 500
+         },
+         "branch": {
+           "type": "string",
+           "description": "Optional branch name (for reference, commit hash is authoritative)",
+           "maxLength": 100
+         }
+       }
+     }
+   }
+ }
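To make the lens lexicon concrete, here is a minimal sketch of a record value that would satisfy it. All names, URIs, commit hashes, and paths below are illustrative placeholders, not values taken from this release.

# Hypothetical ac.foundation.dataset.lens record (illustrative values only).
lens_record = {
    "$type": "ac.foundation.dataset.lens",
    "name": "rgb-to-grayscale",
    "sourceSchema": "at://did:plc:example/ac.foundation.dataset.schema/com.example.rgbImage@1.0.0",
    "targetSchema": "at://did:plc:example/ac.foundation.dataset.schema/com.example.grayImage@1.0.0",
    "getterCode": {  # codeReference: Source -> Target
        "repository": "https://github.com/example/lenses",
        "commit": "0123456789abcdef0123456789abcdef01234567",
        "path": "lenses/vision.py:rgb_to_grayscale",
    },
    "putterCode": {  # codeReference: (Target, Source) -> Source
        "repository": "https://github.com/example/lenses",
        "commit": "0123456789abcdef0123456789abcdef01234567",
        "path": "lenses/vision.py:grayscale_to_rgb",
    },
    "language": "python",
    "createdAt": "2025-01-01T00:00:00Z",
}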
atdata/lexicons/ac.foundation.dataset.record.json
@@ -0,0 +1,96 @@
+ {
+   "lexicon": 1,
+   "id": "ac.foundation.dataset.record",
+   "defs": {
+     "main": {
+       "type": "record",
+       "description": "Index record for a WebDataset-backed dataset with references to storage location and sample schema",
+       "key": "tid",
+       "record": {
+         "type": "object",
+         "required": [
+           "name",
+           "schemaRef",
+           "storage",
+           "createdAt"
+         ],
+         "properties": {
+           "name": {
+             "type": "string",
+             "description": "Human-readable dataset name",
+             "maxLength": 200
+           },
+           "schemaRef": {
+             "type": "string",
+             "format": "at-uri",
+             "description": "AT-URI reference to the schema record for this dataset's samples",
+             "maxLength": 500
+           },
+           "storage": {
+             "type": "union",
+             "description": "Storage location for dataset files (WebDataset tar archives)",
+             "refs": [
+               "ac.foundation.dataset.storageExternal",
+               "ac.foundation.dataset.storageBlobs"
+             ]
+           },
+           "description": {
+             "type": "string",
+             "description": "Human-readable description of the dataset",
+             "maxLength": 5000
+           },
+           "metadata": {
+             "type": "bytes",
+             "description": "Msgpack-encoded metadata dict for arbitrary extended key-value pairs. Use this for additional metadata beyond the core top-level fields (license, tags, size). Top-level fields are preferred for discoverable/searchable metadata.",
+             "maxLength": 100000
+           },
+           "tags": {
+             "type": "array",
+             "description": "Searchable tags for dataset discovery. Aligns with Schema.org keywords property.",
+             "items": {
+               "type": "string",
+               "maxLength": 150
+             },
+             "maxLength": 30
+           },
+           "size": {
+             "type": "ref",
+             "ref": "#datasetSize",
+             "description": "Dataset size information (optional)"
+           },
+           "license": {
+             "type": "string",
+             "description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
+             "maxLength": 200
+           },
+           "createdAt": {
+             "type": "string",
+             "format": "datetime",
+             "description": "Timestamp when this dataset record was created"
+           }
+         }
+       }
+     },
+     "datasetSize": {
+       "type": "object",
+       "description": "Information about dataset size",
+       "properties": {
+         "samples": {
+           "type": "integer",
+           "description": "Total number of samples in the dataset",
+           "minimum": 0
+         },
+         "bytes": {
+           "type": "integer",
+           "description": "Total size in bytes",
+           "minimum": 0
+         },
+         "shards": {
+           "type": "integer",
+           "description": "Number of WebDataset shards",
+           "minimum": 1
+         }
+       }
+     }
+   }
+ }
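A comparable sketch for the dataset index record, using the external-URL storage variant of the union. All values are illustrative; the $type discriminators follow the usual ATProto union encoding.

# Hypothetical ac.foundation.dataset.record value (illustrative values only).
dataset_record = {
    "$type": "ac.foundation.dataset.record",
    "name": "example-images",
    "schemaRef": "at://did:plc:example/ac.foundation.dataset.schema/com.example.imageSample@1.0.0",
    "storage": {
        "$type": "ac.foundation.dataset.storageExternal",
        "urls": ["https://example.com/data/atdata--{000000..000009}.tar"],
    },
    "tags": ["vision", "classification"],
    "license": "CC-BY-4.0",
    "size": {"samples": 10000, "shards": 10},
    "createdAt": "2025-01-01T00:00:00Z",
}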
atdata/lexicons/ac.foundation.dataset.schema.json
@@ -0,0 +1,107 @@
+ {
+   "lexicon": 1,
+   "id": "ac.foundation.dataset.schema",
+   "defs": {
+     "main": {
+       "type": "record",
+       "description": "Definition of a PackableSample-compatible sample type. Supports versioning via rkey format: {NSID}@{semver}. Schema format is extensible via union type.",
+       "key": "any",
+       "record": {
+         "type": "object",
+         "required": [
+           "name",
+           "version",
+           "schemaType",
+           "schema",
+           "createdAt"
+         ],
+         "properties": {
+           "name": {
+             "type": "string",
+             "description": "Human-readable display name for this sample type. Used for documentation and UI. The NSID in the record URI provides unique identification; name collisions across NSIDs are acceptable.",
+             "maxLength": 100
+           },
+           "version": {
+             "type": "string",
+             "description": "Semantic version (e.g., '1.0.0')",
+             "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$",
+             "maxLength": 100
+           },
+           "schemaType": {
+             "type": "ref",
+             "ref": "ac.foundation.dataset.schemaType",
+             "description": "Type of schema definition. This field indicates which union member is present in the schema field."
+           },
+           "schema": {
+             "type": "union",
+             "refs": ["ac.foundation.dataset.schema#jsonSchemaFormat"],
+             "closed": false,
+             "description": "Schema definition for this sample type. Currently supports JSON Schema Draft 7. Union allows for future schema formats (Avro, Protobuf, etc.) without breaking changes."
+           },
+           "description": {
+             "type": "string",
+             "description": "Human-readable description of what this sample type represents",
+             "maxLength": 5000
+           },
+           "metadata": {
+             "type": "object",
+             "description": "Optional metadata about this schema. Common fields include license and tags, but any additional fields are permitted.",
+             "maxProperties": 50,
+             "properties": {
+               "license": {
+                 "type": "string",
+                 "description": "License identifier or URL. SPDX identifiers recommended (e.g., MIT, Apache-2.0, CC-BY-4.0) or full SPDX URLs (e.g., http://spdx.org/licenses/MIT). Aligns with Schema.org license property.",
+                 "maxLength": 200
+               },
+               "tags": {
+                 "type": "array",
+                 "description": "Categorization keywords for discovery. Aligns with Schema.org keywords property.",
+                 "items": {
+                   "type": "string",
+                   "maxLength": 150
+                 },
+                 "maxLength": 30
+               }
+             }
+           },
+           "createdAt": {
+             "type": "string",
+             "format": "datetime",
+             "description": "Timestamp when this schema version was created. Immutable once set (ATProto records are permanent)."
+           }
+         }
+       }
+     },
+     "jsonSchemaFormat": {
+       "type": "object",
+       "description": "JSON Schema Draft 7 format for sample type definitions. Used with NDArray shim for array types.",
+       "required": ["$type", "$schema", "type", "properties"],
+       "properties": {
+         "$type": {
+           "type": "string",
+           "const": "ac.foundation.dataset.schema#jsonSchemaFormat"
+         },
+         "$schema": {
+           "type": "string",
+           "const": "http://json-schema.org/draft-07/schema#",
+           "description": "JSON Schema version identifier"
+         },
+         "type": {
+           "type": "string",
+           "const": "object",
+           "description": "Sample types must be objects"
+         },
+         "properties": {
+           "type": "object",
+           "description": "Field definitions for the sample type",
+           "minProperties": 1
+         },
+         "arrayFormatVersions": {
+           "type": "object",
+           "description": "Mapping from array format identifiers to semantic versions. Keys are ac.foundation.dataset.arrayFormat values (e.g., 'ndarrayBytes'), values are semver strings (e.g., '1.0.0'). Foundation.ac maintains canonical shim schemas at https://foundation.ac/schemas/atdata-{format}-bytes/{version}/.",
+           "maxProperties": 10
+         }
+       }
+     }
+   }
+ }
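And a sketch of a schema record using the one currently supported union member, jsonSchemaFormat. Per the description, the rkey for such a record would follow the {NSID}@{semver} convention (e.g., com.example.imageSample@1.0.0); the field definitions below are illustrative, not from this release.

# Hypothetical ac.foundation.dataset.schema record (illustrative values only).
schema_record = {
    "$type": "ac.foundation.dataset.schema",
    "name": "ImageSample",
    "version": "1.0.0",
    "schemaType": "jsonSchema",
    "schema": {
        "$type": "ac.foundation.dataset.schema#jsonSchemaFormat",
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": {
            # Array field encoded per the ndarray shim (raw .npy bytes).
            "image": {"type": "string", "format": "byte"},
            "label": {"type": "integer"},
        },
        "arrayFormatVersions": {"ndarrayBytes": "1.0.0"},
    },
    "createdAt": "2025-01-01T00:00:00Z",
}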
atdata/lexicons/ac.foundation.dataset.schemaType.json
@@ -0,0 +1,16 @@
+ {
+   "lexicon": 1,
+   "id": "ac.foundation.dataset.schemaType",
+   "defs": {
+     "main": {
+       "type": "string",
+       "description": "Schema type identifier for atdata sample definitions. Known values correspond to token definitions in this Lexicon. New schema types can be added as tokens without breaking changes.",
+       "knownValues": ["jsonSchema"],
+       "maxLength": 50
+     },
+     "jsonSchema": {
+       "type": "token",
+       "description": "JSON Schema Draft 7 format for sample type definitions. When schemaType is 'jsonSchema', the schema field must contain an object conforming to ac.foundation.dataset.schema#jsonSchemaFormat."
+     }
+   }
+ }
atdata/lexicons/ac.foundation.dataset.storageBlobs.json
@@ -0,0 +1,24 @@
+ {
+   "lexicon": 1,
+   "id": "ac.foundation.dataset.storageBlobs",
+   "defs": {
+     "main": {
+       "type": "object",
+       "description": "Storage via ATProto PDS blobs for WebDataset tar archives. Each blob contains one or more tar files. Used in ac.foundation.dataset.record storage union for maximum decentralization.",
+       "required": [
+         "blobs"
+       ],
+       "properties": {
+         "blobs": {
+           "type": "array",
+           "description": "Array of blob references for WebDataset tar files",
+           "items": {
+             "type": "blob",
+             "description": "Blob reference to a WebDataset tar archive"
+           },
+           "minLength": 1
+         }
+       }
+     }
+   }
+ }
atdata/lexicons/ac.foundation.dataset.storageExternal.json
@@ -0,0 +1,25 @@
+ {
+   "lexicon": 1,
+   "id": "ac.foundation.dataset.storageExternal",
+   "defs": {
+     "main": {
+       "type": "object",
+       "description": "External storage via URLs (S3, HTTP, IPFS, etc.) for WebDataset tar archives. URLs support brace notation for sharding (e.g., 'data-{000000..000099}.tar'). Used in ac.foundation.dataset.record storage union.",
+       "required": [
+         "urls"
+       ],
+       "properties": {
+         "urls": {
+           "type": "array",
+           "description": "WebDataset URLs with optional brace notation for sharded tar files",
+           "items": {
+             "type": "string",
+             "format": "uri",
+             "maxLength": 1000
+           },
+           "minLength": 1
+         }
+       }
+     }
+   }
+ }
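The brace notation mentioned in the description is the standard WebDataset shard convention. A quick sketch of how one such URL expands, using the braceexpand package that WebDataset itself depends on (illustrative, not part of this diff):

from braceexpand import braceexpand

# Expand a sharded WebDataset URL into its individual tar files.
urls = list(braceexpand("https://example.com/data-{000000..000003}.tar"))
# ['https://example.com/data-000000.tar', 'https://example.com/data-000001.tar',
#  'https://example.com/data-000002.tar', 'https://example.com/data-000003.tar']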
atdata/lexicons/ndarray_shim.json
@@ -0,0 +1,16 @@
+ {
+   "$schema": "http://json-schema.org/draft-07/schema#",
+   "$id": "https://foundation.ac/schemas/atdata-ndarray-bytes/1.0.0",
+   "title": "ATDataNDArrayBytes",
+   "description": "Standard definition for numpy NDArray types in JSON Schema, compatible with atdata WebDataset serialization. This type's contents are interpreted as containing the raw bytes data for a serialized numpy NDArray, and serve as a marker for atdata-based code generation to use standard numpy types, rather than generated dataclasses.",
+   "version": "1.0.0",
+   "$defs": {
+     "ndarray": {
+       "type": "string",
+       "format": "byte",
+       "description": "Numpy array serialized using numpy `.npy` format via `np.save` (includes dtype and shape in binary header). When represented in JSON, this is a base64-encoded string. In msgpack, this is raw bytes.",
+       "contentEncoding": "base64",
+       "contentMediaType": "application/octet-stream"
+     }
+   }
+ }
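The ndarray shim's description pins down the wire format: .npy bytes produced by np.save, carried as raw bytes in msgpack and as a base64 string in JSON. A minimal round-trip sketch of that encoding:

import base64
import io

import numpy as np

arr = np.arange(6, dtype=np.float32).reshape(2, 3)

# Serialize to .npy bytes; dtype and shape travel in the binary header.
buf = io.BytesIO()
np.save(buf, arr)
raw = buf.getvalue()                               # msgpack form: raw bytes
encoded = base64.b64encode(raw).decode("ascii")    # JSON form: base64 string

# Deserialize from the JSON (base64) form.
restored = np.load(io.BytesIO(base64.b64decode(encoded)))
assert np.array_equal(arr, restored)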
atdata/local/__init__.py
@@ -0,0 +1,70 @@
+ """Backward-compatibility shim for atdata.local.
+
+ .. deprecated::
+     Import from ``atdata.index`` and ``atdata.stores`` instead::
+
+         from atdata.index import Index, LocalDatasetEntry
+         from atdata.stores import S3DataStore, LocalDiskStore
+ """
+
+ from atdata.index import (
+     Index,
+     LocalDatasetEntry,
+     BasicIndexEntry,
+     SchemaNamespace,
+     SchemaFieldType,
+     SchemaField,
+     LocalSchemaRecord,
+     REDIS_KEY_DATASET_ENTRY,
+     REDIS_KEY_SCHEMA,
+     _ATDATA_URI_PREFIX,
+     _LEGACY_URI_PREFIX,
+     _kind_str_for_sample_type,
+     _schema_ref_from_type,
+     _make_schema_ref,
+     _parse_schema_ref,
+     _increment_patch,
+     _python_type_to_field_type,
+     _build_schema_record,
+ )
+ from atdata.stores import (
+     LocalDiskStore,
+     S3DataStore,
+     _s3_env,
+     _s3_from_credentials,
+     _create_s3_write_callbacks,
+ )
+ from atdata.local._repo_legacy import Repo
+
+ # Re-export third-party types that were previously importable from the
+ # monolithic local.py (tests reference atdata.local.S3FileSystem, etc.)
+ from s3fs import S3FileSystem  # noqa: F401 — re-exported for backward compat
+
+ __all__ = [
+     # Public API
+     "LocalDiskStore",
+     "Index",
+     "LocalDatasetEntry",
+     "BasicIndexEntry",
+     "S3DataStore",
+     "Repo",
+     "SchemaNamespace",
+     "SchemaFieldType",
+     "SchemaField",
+     "LocalSchemaRecord",
+     "REDIS_KEY_DATASET_ENTRY",
+     "REDIS_KEY_SCHEMA",
+     # Internal helpers (re-exported for backward compatibility)
+     "_ATDATA_URI_PREFIX",
+     "_LEGACY_URI_PREFIX",
+     "_kind_str_for_sample_type",
+     "_schema_ref_from_type",
+     "_make_schema_ref",
+     "_parse_schema_ref",
+     "_increment_patch",
+     "_python_type_to_field_type",
+     "_build_schema_record",
+     "_s3_env",
+     "_s3_from_credentials",
+     "_create_s3_write_callbacks",
+ ]
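The shim keeps existing atdata.local imports working unchanged; new code should import from the split packages, as the module docstring recommends. The two equivalent forms:

# Old import path (still works via the shim):
from atdata.local import Index, S3DataStore, LocalDiskStore

# Preferred going forward:
from atdata.index import Index, LocalDatasetEntry
from atdata.stores import S3DataStore, LocalDiskStore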
atdata/local/_repo_legacy.py
@@ -0,0 +1,218 @@
+ """Deprecated Repo class for legacy S3 repository operations."""
+
+ from atdata import Dataset
+
+ from atdata.index._entry import LocalDatasetEntry
+ from atdata.stores._s3 import _s3_env, _s3_from_credentials, _create_s3_write_callbacks
+
+ from pathlib import Path
+ from uuid import uuid4
+ from tempfile import TemporaryDirectory
+ from typing import Any, BinaryIO, TypeVar, cast
+
+ from redis import Redis
+ import msgpack
+ import webdataset as wds
+ import warnings
+
+ from atdata._protocols import Packable
+
+ T = TypeVar("T", bound=Packable)
+
+
+ class Repo:
+     """Repository for storing and managing atdata datasets.
+
+     .. deprecated::
+         Use :class:`Index` with :class:`S3DataStore` instead::
+
+             store = S3DataStore(credentials, bucket="my-bucket")
+             index = Index(redis=redis, data_store=store)
+             entry = index.insert_dataset(ds, name="my-dataset")
+
+     Provides storage of datasets in S3-compatible object storage with Redis-based
+     indexing. Datasets are stored as WebDataset tar files with optional metadata.
+
+     Attributes:
+         s3_credentials: S3 credentials dictionary or None.
+         bucket_fs: S3FileSystem instance or None.
+         hive_path: Path within S3 bucket for storing datasets.
+         hive_bucket: Name of the S3 bucket.
+         index: Index instance for tracking datasets.
+     """
+
+     ##
+
+     def __init__(
+         self,
+         s3_credentials: str | Path | dict[str, Any] | None = None,
+         hive_path: str | Path | None = None,
+         redis: Redis | None = None,
+     ) -> None:
+         """Initialize a repository.
+
+         .. deprecated::
+             Use Index with S3DataStore instead.
+
+         Args:
+             s3_credentials: Path to .env file with S3 credentials, or dict with
+                 AWS_ENDPOINT, AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY.
+                 If None, S3 functionality will be disabled.
+             hive_path: Path within the S3 bucket to store datasets.
+                 Required if s3_credentials is provided.
+             redis: Redis connection for indexing. If None, creates a new connection.
+
+         Raises:
+             ValueError: If hive_path is not provided when s3_credentials is set.
+         """
+         warnings.warn(
+             "Repo is deprecated. Use Index with S3DataStore instead:\n"
+             "    store = S3DataStore(credentials, bucket='my-bucket')\n"
+             "    index = Index(redis=redis, data_store=store)\n"
+             "    entry = index.insert_dataset(ds, name='my-dataset')",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+
+         if s3_credentials is None:
+             self.s3_credentials = None
+         elif isinstance(s3_credentials, dict):
+             self.s3_credentials = s3_credentials
+         else:
+             self.s3_credentials = _s3_env(s3_credentials)
+
+         if self.s3_credentials is None:
+             self.bucket_fs = None
+         else:
+             self.bucket_fs = _s3_from_credentials(self.s3_credentials)
+
+         if self.bucket_fs is not None:
+             if hive_path is None:
+                 raise ValueError("Must specify hive path within bucket")
+             self.hive_path = Path(hive_path)
+             self.hive_bucket = self.hive_path.parts[0]
+         else:
+             self.hive_path = None
+             self.hive_bucket = None
+
+         #
+
+         from atdata.index._index import Index
+
+         self.index = Index(redis=redis)
+
+     ##
+
+     def insert(
+         self,
+         ds: Dataset[T],
+         *,
+         name: str,
+         cache_local: bool = False,
+         schema_ref: str | None = None,
+         **kwargs,
+     ) -> tuple[LocalDatasetEntry, Dataset[T]]:
+         """Insert a dataset into the repository.
+
+         Writes the dataset to S3 as WebDataset tar files, stores metadata,
+         and creates an index entry in Redis.
+
+         Args:
+             ds: The dataset to insert.
+             name: Human-readable name for the dataset.
+             cache_local: If True, write to local temporary storage first, then
+                 copy to S3. This can be faster for some workloads.
+             schema_ref: Optional schema reference. If None, generates from sample type.
+             **kwargs: Additional arguments passed to wds.ShardWriter.
+
+         Returns:
+             A tuple of (index_entry, new_dataset) where:
+                 - index_entry: LocalDatasetEntry for the stored dataset
+                 - new_dataset: Dataset object pointing to the stored copy
+
+         Raises:
+             ValueError: If S3 credentials or hive_path are not configured.
+             RuntimeError: If no shards were written.
+         """
+         if self.s3_credentials is None:
+             raise ValueError(
+                 "S3 credentials required for insert(). Initialize Repo with s3_credentials."
+             )
+         if self.hive_bucket is None or self.hive_path is None:
+             raise ValueError(
+                 "hive_path required for insert(). Initialize Repo with hive_path."
+             )
+
+         new_uuid = str(uuid4())
+
+         hive_fs = _s3_from_credentials(self.s3_credentials)
+
+         # Write metadata
+         metadata_path = (
+             self.hive_path / "metadata" / f"atdata-metadata--{new_uuid}.msgpack"
+         )
+         # Note: S3 doesn't need directories created beforehand - s3fs handles this
+
+         if ds.metadata is not None:
+             # Use s3:// prefix to ensure s3fs treats this as an S3 path
+             with cast(
+                 BinaryIO, hive_fs.open(f"s3://{metadata_path.as_posix()}", "wb")
+             ) as f:
+                 meta_packed = msgpack.packb(ds.metadata)
+                 f.write(cast(bytes, meta_packed))
+
+         # Write data
+         shard_pattern = (self.hive_path / f"atdata--{new_uuid}--%06d.tar").as_posix()
+
+         written_shards: list[str] = []
+         with TemporaryDirectory() as temp_dir:
+             writer_opener, writer_post = _create_s3_write_callbacks(
+                 credentials=self.s3_credentials,
+                 temp_dir=temp_dir,
+                 written_shards=written_shards,
+                 fs=hive_fs,
+                 cache_local=cache_local,
+                 add_s3_prefix=False,
+             )
+
+             with wds.writer.ShardWriter(
+                 shard_pattern,
+                 opener=writer_opener,
+                 post=writer_post,
+                 **kwargs,
+             ) as sink:
+                 for sample in ds.ordered(batch_size=None):
+                     sink.write(sample.as_wds)
+
+         # Make a new Dataset object for the written dataset copy
+         if len(written_shards) == 0:
+             raise RuntimeError(
+                 "Cannot form new dataset entry -- did not write any shards"
+             )
+
+         elif len(written_shards) < 2:
+             new_dataset_url = (
+                 self.hive_path / (Path(written_shards[0]).name)
+             ).as_posix()
+
+         else:
+             shard_s3_format = (
+                 (self.hive_path / f"atdata--{new_uuid}").as_posix()
+             ) + "--{shard_id}.tar"
+             shard_id_braced = "{" + f"{0:06d}..{len(written_shards) - 1:06d}" + "}"
+             new_dataset_url = shard_s3_format.format(shard_id=shard_id_braced)
+
+         new_dataset = Dataset[ds.sample_type](
+             url=new_dataset_url,
+             metadata_url=metadata_path.as_posix(),
+         )
+
+         # Add to index (use ds._metadata to avoid network requests)
+         new_entry = self.index.add_entry(
+             new_dataset,
+             name=name,
+             schema_ref=schema_ref,
+             metadata=ds._metadata,
+         )
+
+         return new_entry, new_dataset
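For reference, the deprecation notice above maps the legacy Repo.insert workflow onto the new Index/S3DataStore API. A rough before/after sketch based only on that docstring; the ds dataset, redis connection, and credentials dict are assumed to exist, and exact constructor signatures live in atdata.stores and atdata.index.

# Before (deprecated; emits a DeprecationWarning on construction):
repo = Repo(s3_credentials="s3.env", hive_path="my-bucket/datasets", redis=redis)
entry, stored_ds = repo.insert(ds, name="my-dataset")

# After, per the deprecation notice:
from atdata.index import Index
from atdata.stores import S3DataStore

store = S3DataStore(credentials, bucket="my-bucket")
index = Index(redis=redis, data_store=store)
entry = index.insert_dataset(ds, name="my-dataset")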