atdata 0.2.0a1__py3-none-any.whl → 0.2.3b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/__init__.py CHANGED
@@ -39,20 +39,53 @@ Main Components:
39
39
  # Expose components
40
40
 
41
41
  from .dataset import (
42
- PackableSample,
43
- SampleBatch,
44
- Dataset,
45
- packable,
42
+ DictSample as DictSample,
43
+ PackableSample as PackableSample,
44
+ SampleBatch as SampleBatch,
45
+ Dataset as Dataset,
46
+ packable as packable,
46
47
  )
47
48
 
48
49
  from .lens import (
49
- Lens,
50
- LensNetwork,
51
- lens,
50
+ Lens as Lens,
51
+ LensNetwork as LensNetwork,
52
+ lens as lens,
52
53
  )
53
54
 
54
- # ATProto integration (lazy import to avoid requiring atproto package)
55
- from . import atmosphere
55
+ from ._hf_api import (
56
+ load_dataset as load_dataset,
57
+ DatasetDict as DatasetDict,
58
+ )
59
+
60
+ from ._protocols import (
61
+ Packable as Packable,
62
+ IndexEntry as IndexEntry,
63
+ AbstractIndex as AbstractIndex,
64
+ AbstractDataStore as AbstractDataStore,
65
+ DataSource as DataSource,
66
+ )
56
67
 
68
+ from ._sources import (
69
+ URLSource as URLSource,
70
+ S3Source as S3Source,
71
+ BlobSource as BlobSource,
72
+ )
73
+
74
+ from ._schema_codec import (
75
+ schema_to_type as schema_to_type,
76
+ )
77
+
78
+ from ._cid import (
79
+ generate_cid as generate_cid,
80
+ verify_cid as verify_cid,
81
+ )
82
+
83
+ from .promote import (
84
+ promote_to_atmosphere as promote_to_atmosphere,
85
+ )
86
+
87
+ # ATProto integration (lazy import to avoid requiring atproto package)
88
+ from . import atmosphere as atmosphere
57
89
 
58
- #
90
+ # CLI entry point
91
+ from .cli import main as main
atdata/_cid.py ADDED
@@ -0,0 +1,144 @@
1
+ """CID (Content Identifier) utilities for atdata.
2
+
3
+ This module provides utilities for generating ATProto-compatible CIDs from
4
+ data. CIDs are content-addressable identifiers that can be used to uniquely
5
+ identify schemas, datasets, and other records.
6
+
7
+ The CIDs generated here use:
8
+ - CIDv1 format
9
+ - dag-cbor codec (0x71)
10
+ - SHA-256 hash (0x12)
11
+
12
+ This ensures compatibility with ATProto's CID requirements and enables
13
+ seamless promotion from local storage to atmosphere (ATProto network).
14
+
15
+ Examples:
16
+ >>> schema = {"name": "ImageSample", "version": "1.0.0", "fields": [...]}
17
+ >>> cid = generate_cid(schema)
18
+ >>> print(cid)
19
+ bafyreihffx5a2e7k6r5zqgp5iwpjqr2gfyheqhzqtlxagvqjqyxzqpzqaa
20
+ """
21
+
22
+ import hashlib
23
+ from typing import Any
24
+
25
+ import libipld
26
+
27
+
28
# CID constants
# Header values placed, in this order, in front of the SHA-256 digest when
# the raw CID bytes are assembled in this module.
CID_VERSION_1 = 0x01  # CID version (CIDv1)
CODEC_DAG_CBOR = 0x71  # content codec: dag-cbor
HASH_SHA256 = 0x12  # multihash function code: SHA-256
SHA256_SIZE = 0x20  # multihash digest length in bytes (32)
33
+
34
+
35
def generate_cid(data: Any) -> str:
    """Generate an ATProto-compatible CID from arbitrary data.

    The data is serialized to DAG-CBOR and the resulting bytes are passed to
    ``generate_cid_from_bytes``, which hashes them with SHA-256 and prefixes
    the CIDv1 header. Delegating keeps the CID byte layout defined in exactly
    one place instead of being duplicated in both functions.

    Args:
        data: Any value that ``libipld.encode_dag_cbor`` can serialize
            (dicts, lists, strings, numbers, bytes, etc.).

    Returns:
        The CID string produced by ``libipld.encode_cid`` (CIDv1, dag-cbor
        codec, SHA-256 multihash).

    Raises:
        ValueError: If the data cannot be encoded as DAG-CBOR. The encoder's
            original exception is chained as ``__cause__``.

    Examples:
        >>> generate_cid({"name": "test", "value": 42})
        'bafyrei...'
    """
    # Only the encode call sits inside the ``try`` so that a failure while
    # hashing or formatting the CID is never reported as an encoding error.
    try:
        cbor_bytes = libipld.encode_dag_cbor(data)
    except (TypeError, ValueError, OverflowError) as e:
        raise ValueError(f"Failed to encode data as DAG-CBOR: {e}") from e

    return generate_cid_from_bytes(cbor_bytes)
73
+
74
+
75
def generate_cid_from_bytes(data_bytes: bytes) -> str:
    """Compute the CID of bytes that have already been serialized.

    Use this when the payload is pre-encoded (for example DAG-CBOR bytes
    obtained elsewhere) and must be addressed without re-encoding it.

    The bytes are hashed as-is and are never inspected or validated. The
    resulting CID is always tagged with the dag-cbor codec, so it only
    describes the content correctly when ``data_bytes`` really is DAG-CBOR.

    Args:
        data_bytes: Raw bytes to hash.

    Returns:
        The CID string produced by ``libipld.encode_cid`` (CIDv1, dag-cbor
        codec, SHA-256 multihash).

    Examples:
        >>> cbor_bytes = libipld.encode_dag_cbor({"key": "value"})
        >>> cid = generate_cid_from_bytes(cbor_bytes)
    """
    # Raw CID layout: version, content codec, multihash code, digest length,
    # then the digest itself.
    header = bytes((CID_VERSION_1, CODEC_DAG_CBOR, HASH_SHA256, SHA256_SIZE))
    digest = hashlib.sha256(data_bytes).digest()
    return libipld.encode_cid(header + digest)
96
+
97
+
98
def verify_cid(cid: str, data: Any) -> bool:
    """Check whether ``cid`` is the CID that ``generate_cid`` yields for ``data``.

    The CID is recomputed from ``data`` and compared with ``cid`` using exact
    string equality. A CID for the same content written in a different
    textual form than ``generate_cid`` emits will therefore not match.

    Args:
        cid: CID string to verify.
        data: Data that should correspond to the CID.

    Returns:
        True if the recomputed CID string equals ``cid``, False otherwise.

    Raises:
        ValueError: Propagated from ``generate_cid`` when ``data`` cannot be
            encoded as DAG-CBOR.

    Examples:
        >>> cid = generate_cid({"name": "test"})
        >>> verify_cid(cid, {"name": "test"})
        True
        >>> verify_cid(cid, {"name": "different"})
        False
    """
    return cid == generate_cid(data)
117
+
118
+
119
def parse_cid(cid: str) -> dict:
    """Decode a CID string into its component fields.

    This is a thin wrapper: the value returned is exactly what
    ``libipld.decode_cid`` produces, and any exception it raises for a
    malformed CID propagates unchanged.

    Args:
        cid: CID string to parse.

    Returns:
        Dictionary with 'version', 'codec', and 'hash' keys.
        The 'hash' value is itself a dict with 'code', 'size', and 'digest'.

    Examples:
        >>> info = parse_cid('bafyrei...')
        >>> info['version']
        1
        >>> info['codec']
        113  # 0x71 = dag-cbor
    """
    components = libipld.decode_cid(cid)
    return components
137
+
138
+
139
# Public names of this module (what ``from ._cid import *`` exposes).
__all__ = [
    "generate_cid",
    "generate_cid_from_bytes",
    "verify_cid",
    "parse_cid",
]
atdata/_helpers.py CHANGED
@@ -22,7 +22,8 @@ import numpy as np
22
22
 
23
23
  ##
24
24
 
25
- def array_to_bytes( x: np.ndarray ) -> bytes:
25
+
26
+ def array_to_bytes(x: np.ndarray) -> bytes:
26
27
  """Convert a numpy array to bytes for msgpack serialization.
27
28
 
28
29
  Uses numpy's native ``save()`` format to preserve array dtype and shape.
@@ -37,10 +38,11 @@ def array_to_bytes( x: np.ndarray ) -> bytes:
37
38
  Uses ``allow_pickle=True`` to support object dtypes.
38
39
  """
39
40
  np_bytes = BytesIO()
40
- np.save( np_bytes, x, allow_pickle = True )
41
+ np.save(np_bytes, x, allow_pickle=True)
41
42
  return np_bytes.getvalue()
42
43
 
43
- def bytes_to_array( b: bytes ) -> np.ndarray:
44
+
45
+ def bytes_to_array(b: bytes) -> np.ndarray:
44
46
  """Convert serialized bytes back to a numpy array.
45
47
 
46
48
  Reverses the serialization performed by ``array_to_bytes()``.
@@ -54,5 +56,5 @@ def bytes_to_array( b: bytes ) -> np.ndarray:
54
56
  Note:
55
57
  Uses ``allow_pickle=True`` to support object dtypes.
56
58
  """
57
- np_bytes = BytesIO( b )
58
- return np.load( np_bytes, allow_pickle = True )
59
+ np_bytes = BytesIO(b)
60
+ return np.load(np_bytes, allow_pickle=True)