atdata 0.2.0a1__py3-none-any.whl → 0.2.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/__init__.py CHANGED
@@ -39,20 +39,53 @@ Main Components:
39
39
  # Expose components
40
40
 
41
41
  from .dataset import (
42
- PackableSample,
43
- SampleBatch,
44
- Dataset,
45
- packable,
42
+ DictSample as DictSample,
43
+ PackableSample as PackableSample,
44
+ SampleBatch as SampleBatch,
45
+ Dataset as Dataset,
46
+ packable as packable,
46
47
  )
47
48
 
48
49
  from .lens import (
49
- Lens,
50
- LensNetwork,
51
- lens,
50
+ Lens as Lens,
51
+ LensNetwork as LensNetwork,
52
+ lens as lens,
52
53
  )
53
54
 
54
- # ATProto integration (lazy import to avoid requiring atproto package)
55
- from . import atmosphere
55
+ from ._hf_api import (
56
+ load_dataset as load_dataset,
57
+ DatasetDict as DatasetDict,
58
+ )
59
+
60
+ from ._protocols import (
61
+ Packable as Packable,
62
+ IndexEntry as IndexEntry,
63
+ AbstractIndex as AbstractIndex,
64
+ AbstractDataStore as AbstractDataStore,
65
+ DataSource as DataSource,
66
+ )
56
67
 
68
+ from ._sources import (
69
+ URLSource as URLSource,
70
+ S3Source as S3Source,
71
+ BlobSource as BlobSource,
72
+ )
73
+
74
+ from ._schema_codec import (
75
+ schema_to_type as schema_to_type,
76
+ )
77
+
78
+ from ._cid import (
79
+ generate_cid as generate_cid,
80
+ verify_cid as verify_cid,
81
+ )
82
+
83
+ from .promote import (
84
+ promote_to_atmosphere as promote_to_atmosphere,
85
+ )
86
+
87
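+ # ATProto integration (NOTE: this submodule import runs eagerly at package import; any deferral of the optional atproto dependency has to happen inside atmosphere itself)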
88
+ from . import atmosphere as atmosphere
57
89
 
58
- #
90
+ # CLI entry point
91
+ from .cli import main as main
atdata/_cid.py ADDED
@@ -0,0 +1,150 @@
1
+ """CID (Content Identifier) utilities for atdata.
2
+
3
+ This module provides utilities for generating ATProto-compatible CIDs from
4
+ data. CIDs are content-addressable identifiers that can be used to uniquely
5
+ identify schemas, datasets, and other records.
6
+
7
+ The CIDs generated here use:
8
+ - CIDv1 format
9
+ - dag-cbor codec (0x71)
10
+ - SHA-256 hash (0x12)
11
+
12
+ This ensures compatibility with ATProto's CID requirements and enables
13
+ seamless promotion from local storage to atmosphere (ATProto network).
14
+
15
+ Example:
16
+ ::
17
+
18
+ >>> schema = {"name": "ImageSample", "version": "1.0.0", "fields": [...]}
19
+ >>> cid = generate_cid(schema)
20
+ >>> print(cid)
21
+ bafyreihffx5a2e7k6r5zqgp5iwpjqr2gfyheqhzqtlxagvqjqyxzqpzqaa
22
+ """
23
+
24
+ import hashlib
25
+ from typing import Any
26
+
27
+ import libipld
28
+
29
+
30
+ # CID constants: the four header bytes placed in front of the digest of every CID built in this module
31
+ CID_VERSION_1 = 0x01  # CID version byte (CIDv1)
32
+ CODEC_DAG_CBOR = 0x71  # content codec byte: dag-cbor
33
+ HASH_SHA256 = 0x12  # multihash function code: SHA-256
34
+ SHA256_SIZE = 0x20  # multihash digest length in bytes (0x20 == 32)
35
+
36
+
37
def generate_cid(data: Any) -> str:
    """Generate an ATProto-compatible CID from arbitrary data.

    The data is first encoded as DAG-CBOR. The encoded bytes are then passed
    to ``generate_cid_from_bytes``, which hashes them with SHA-256 and
    formats the result as a CIDv1 string (base32 multibase).

    Args:
        data: Any data structure that can be encoded as DAG-CBOR.
            This includes dicts, lists, strings, numbers, bytes, etc.

    Returns:
        CIDv1 string in base32 multibase format (starts with 'bafy').

    Raises:
        ValueError: If the data cannot be encoded as DAG-CBOR. The
            underlying encoder error is attached as ``__cause__``.

    Example:
        ::

            >>> generate_cid({"name": "test", "value": 42})
            'bafyrei...'
    """
    # Only the encoding step is guarded: TypeError/ValueError/OverflowError
    # from the encoder are normalised into a single ValueError for callers.
    try:
        cbor_bytes = libipld.encode_dag_cbor(data)
    except (TypeError, ValueError, OverflowError) as e:
        raise ValueError(f"Failed to encode data as DAG-CBOR: {e}") from e

    # Hashing and CID assembly live in one place so that this function and
    # generate_cid_from_bytes can never drift apart in the bytes they emit.
    return generate_cid_from_bytes(cbor_bytes)
75
+
76
+
77
def generate_cid_from_bytes(data_bytes: bytes) -> str:
    """Build a CID directly from bytes that are already encoded.

    Intended for callers that hold pre-encoded content (for instance
    DAG-CBOR bytes obtained elsewhere) and want its CID without encoding
    the data a second time. The bytes are hashed as-is; they are not
    inspected or validated, and the resulting CID is always tagged with the
    dag-cbor codec.

    Args:
        data_bytes: Raw bytes to hash (treated as an opaque blob).

    Returns:
        CIDv1 string in base32 multibase format.

    Example:
        ::

            >>> cbor_bytes = libipld.encode_dag_cbor({"key": "value"})
            >>> cid = generate_cid_from_bytes(cbor_bytes)
    """
    digest = hashlib.sha256(data_bytes).digest()

    # Raw CIDv1 layout: version, content codec, then the multihash
    # (hash function code, digest size, digest).
    header = bytes((CID_VERSION_1, CODEC_DAG_CBOR, HASH_SHA256, SHA256_SIZE))

    return libipld.encode_cid(header + digest)
98
+
99
+
100
def verify_cid(cid: str, data: Any) -> bool:
    """Check whether ``cid`` is the CID that ``generate_cid`` yields for ``data``.

    The CID is recomputed from ``data`` and compared to ``cid`` with plain
    equality, so the given string must match the generated one exactly.

    Args:
        cid: CID string to verify.
        data: Data that should correspond to the CID.

    Returns:
        True if the recomputed CID equals ``cid``, False otherwise.

    Raises:
        ValueError: Propagated from ``generate_cid`` when ``data`` cannot be
            encoded as DAG-CBOR.

    Example:
        ::

            >>> cid = generate_cid({"name": "test"})
            >>> verify_cid(cid, {"name": "test"})
            True
            >>> verify_cid(cid, {"name": "different"})
            False
    """
    return cid == generate_cid(data)
121
+
122
+
123
def parse_cid(cid: str) -> dict:
    """Decode a CID string into its component fields.

    This is a thin wrapper: the result is whatever ``libipld.decode_cid``
    returns for the given string.

    Args:
        cid: CID string to parse.

    Returns:
        Dictionary with 'version', 'codec', and 'hash' keys.
        The 'hash' value is itself a dict with 'code', 'size', and 'digest'.

    Example:
        ::

            >>> info = parse_cid('bafyrei...')
            >>> info['version']
            1
            >>> info['codec']
            113  # 0x71 = dag-cbor
    """
    components = libipld.decode_cid(cid)
    return components
143
+
144
+
145
+ __all__ = [  # names exported by a star-import of this module
146
+ "generate_cid",
147
+ "generate_cid_from_bytes",
148
+ "verify_cid",
149
+ "parse_cid",
150
+ ]