atdata 0.2.0a1__py3-none-any.whl → 0.2.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +43 -10
- atdata/_cid.py +150 -0
- atdata/_hf_api.py +692 -0
- atdata/_protocols.py +519 -0
- atdata/_schema_codec.py +442 -0
- atdata/_sources.py +515 -0
- atdata/_stub_manager.py +529 -0
- atdata/_type_utils.py +90 -0
- atdata/atmosphere/__init__.py +278 -7
- atdata/atmosphere/_types.py +9 -7
- atdata/atmosphere/client.py +146 -6
- atdata/atmosphere/lens.py +29 -25
- atdata/atmosphere/records.py +197 -30
- atdata/atmosphere/schema.py +41 -98
- atdata/atmosphere/store.py +208 -0
- atdata/cli/__init__.py +213 -0
- atdata/cli/diagnose.py +165 -0
- atdata/cli/local.py +280 -0
- atdata/dataset.py +482 -167
- atdata/lens.py +61 -57
- atdata/local.py +1400 -185
- atdata/promote.py +199 -0
- {atdata-0.2.0a1.dist-info → atdata-0.2.2b1.dist-info}/METADATA +105 -14
- atdata-0.2.2b1.dist-info/RECORD +28 -0
- atdata-0.2.0a1.dist-info/RECORD +0 -16
- {atdata-0.2.0a1.dist-info → atdata-0.2.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.0a1.dist-info → atdata-0.2.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.0a1.dist-info → atdata-0.2.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/__init__.py
CHANGED
|
@@ -39,20 +39,53 @@ Main Components:
|
|
|
39
39
|
# Expose components
|
|
40
40
|
|
|
41
41
|
from .dataset import (
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
42
|
+
DictSample as DictSample,
|
|
43
|
+
PackableSample as PackableSample,
|
|
44
|
+
SampleBatch as SampleBatch,
|
|
45
|
+
Dataset as Dataset,
|
|
46
|
+
packable as packable,
|
|
46
47
|
)
|
|
47
48
|
|
|
48
49
|
from .lens import (
|
|
49
|
-
Lens,
|
|
50
|
-
LensNetwork,
|
|
51
|
-
lens,
|
|
50
|
+
Lens as Lens,
|
|
51
|
+
LensNetwork as LensNetwork,
|
|
52
|
+
lens as lens,
|
|
52
53
|
)
|
|
53
54
|
|
|
54
|
-
|
|
55
|
-
|
|
55
|
+
from ._hf_api import (
|
|
56
|
+
load_dataset as load_dataset,
|
|
57
|
+
DatasetDict as DatasetDict,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
from ._protocols import (
|
|
61
|
+
Packable as Packable,
|
|
62
|
+
IndexEntry as IndexEntry,
|
|
63
|
+
AbstractIndex as AbstractIndex,
|
|
64
|
+
AbstractDataStore as AbstractDataStore,
|
|
65
|
+
DataSource as DataSource,
|
|
66
|
+
)
|
|
56
67
|
|
|
68
|
+
from ._sources import (
|
|
69
|
+
URLSource as URLSource,
|
|
70
|
+
S3Source as S3Source,
|
|
71
|
+
BlobSource as BlobSource,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
from ._schema_codec import (
|
|
75
|
+
schema_to_type as schema_to_type,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
from ._cid import (
|
|
79
|
+
generate_cid as generate_cid,
|
|
80
|
+
verify_cid as verify_cid,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
from .promote import (
|
|
84
|
+
promote_to_atmosphere as promote_to_atmosphere,
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
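# ATProto integration. The atmosphere subpackage itself is imported eagerly here; NOTE(review): any deferral of the optional atproto dependency presumably lives inside atdata.atmosphere - confirm.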
|
|
88
|
+
from . import atmosphere as atmosphere
|
|
57
89
|
|
|
58
|
-
#
|
|
90
|
+
# CLI entry point
|
|
91
|
+
from .cli import main as main
|
atdata/_cid.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""CID (Content Identifier) utilities for atdata.
|
|
2
|
+
|
|
3
|
+
This module provides utilities for generating ATProto-compatible CIDs from
|
|
4
|
+
data. CIDs are content-addressable identifiers that can be used to uniquely
|
|
5
|
+
identify schemas, datasets, and other records.
|
|
6
|
+
|
|
7
|
+
The CIDs generated here use:
|
|
8
|
+
- CIDv1 format
|
|
9
|
+
- dag-cbor codec (0x71)
|
|
10
|
+
- SHA-256 hash (0x12)
|
|
11
|
+
|
|
12
|
+
This ensures compatibility with ATProto's CID requirements and enables
|
|
13
|
+
seamless promotion from local storage to atmosphere (ATProto network).
|
|
14
|
+
|
|
15
|
+
Example:
|
|
16
|
+
::
|
|
17
|
+
|
|
18
|
+
>>> schema = {"name": "ImageSample", "version": "1.0.0", "fields": [...]}
|
|
19
|
+
>>> cid = generate_cid(schema)
|
|
20
|
+
>>> print(cid)
|
|
21
|
+
bafyreihffx5a2e7k6r5zqgp5iwpjqr2gfyheqhzqtlxagvqjqyxzqpzqaa
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import hashlib
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
import libipld
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# CID constants
# Each value is written as one byte of the binary CID prefix by the
# CID-building code in this module (see the bytes([...]) construction).
CID_VERSION_1 = 0x01  # CID version field: CIDv1
CODEC_DAG_CBOR = 0x71  # content codec field: dag-cbor
HASH_SHA256 = 0x12  # multihash function code: SHA-256
SHA256_SIZE = 0x20  # multihash digest length in bytes (32)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def generate_cid(data: Any) -> str:
    """Generate an ATProto-compatible CID from arbitrary data.

    The data is first encoded as DAG-CBOR; the encoded bytes are then handed
    to ``generate_cid_from_bytes``, which hashes them with SHA-256 and formats
    the result as a CIDv1 string (base32 multibase). Delegating keeps the
    binary CID layout defined in a single place instead of being duplicated
    here.

    Args:
        data: Any data structure that can be encoded as DAG-CBOR.
            This includes dicts, lists, strings, numbers, bytes, etc.

    Returns:
        CIDv1 string in base32 multibase format (starts with 'bafy').

    Raises:
        ValueError: If the data cannot be encoded as DAG-CBOR.

    Example:
        ::

            >>> generate_cid({"name": "test", "value": 42})
            'bafyrei...'
    """
    # Only the encoding step is guarded: encoder failures are normalised to
    # ValueError so callers have a single exception type to handle, while the
    # original exception stays attached as the cause.
    try:
        cbor_bytes = libipld.encode_dag_cbor(data)
    except (TypeError, ValueError, OverflowError) as e:
        raise ValueError(f"Failed to encode data as DAG-CBOR: {e}") from e

    # Hashing and CID assembly are shared with the pre-encoded-bytes path.
    return generate_cid_from_bytes(cbor_bytes)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def generate_cid_from_bytes(data_bytes: bytes) -> str:
    """Build a CIDv1 string for bytes that are already encoded.

    Intended for payloads that were serialised elsewhere (e.g. DAG-CBOR bytes
    from another source), so the CID can be computed without re-encoding.
    The input is hashed as-is; note that the resulting CID always carries the
    dag-cbor codec marker, whatever the bytes actually contain.

    Args:
        data_bytes: Raw bytes to hash (treated as opaque blob).

    Returns:
        CIDv1 string in base32 multibase format.

    Example:
        ::

            >>> cbor_bytes = libipld.encode_dag_cbor({"key": "value"})
            >>> cid = generate_cid_from_bytes(cbor_bytes)
    """
    # Binary CID layout: version, codec, then the multihash
    # (hash function code, digest length, digest).
    header = bytes((CID_VERSION_1, CODEC_DAG_CBOR, HASH_SHA256, SHA256_SIZE))
    digest = hashlib.sha256(data_bytes).digest()

    # libipld renders the binary CID as its multibase string form.
    return libipld.encode_cid(header + digest)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def verify_cid(cid: str, data: Any) -> bool:
    """Check whether a CID string corresponds to the given data.

    A fresh CID is computed from ``data`` with ``generate_cid`` and compared
    to ``cid`` by exact string equality.

    Args:
        cid: CID string to verify.
        data: Data that should correspond to the CID.

    Returns:
        True if the CID matches the data, False otherwise.

    Raises:
        ValueError: Propagated from ``generate_cid`` when ``data`` cannot be
            encoded as DAG-CBOR.

    Example:
        ::

            >>> cid = generate_cid({"name": "test"})
            >>> verify_cid(cid, {"name": "test"})
            True
            >>> verify_cid(cid, {"name": "different"})
            False
    """
    return cid == generate_cid(data)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def parse_cid(cid: str) -> dict:
    """Decode a CID string into its components.

    Thin wrapper around ``libipld.decode_cid``; the decoded structure is
    returned unchanged.

    Args:
        cid: CID string to parse.

    Returns:
        Dictionary with 'version', 'codec', and 'hash' keys.
        The 'hash' value is itself a dict with 'code', 'size', and 'digest'.

    Example:
        ::

            >>> info = parse_cid('bafyrei...')
            >>> info['version']
            1
            >>> info['codec']
            113  # 0x71 = dag-cbor
    """
    components = libipld.decode_cid(cid)
    return components
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Names exported by `from atdata._cid import *`.
__all__ = [
    "generate_cid",
    "generate_cid_from_bytes",
    "verify_cid",
    "parse_cid",
]
|