vocker-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
+ import attr
+
+
+ class InvalidPathError(ValueError):
+     pass
+
+
+ @attr.s(eq=False, hash=False, kw_only=True)
+ class IntegerToPath:
+     """
+     Turn an integer into a structured file path.
+     """
+
+     entries_per_directory: int = attr.ib(default=256)
+     format_code: str = attr.ib(default=None)
+     file_suffix: str = attr.ib(default=".bin")
+
+     def __attrs_post_init__(self):
+         if self.format_code is None:
+             self.format_code = self._make_default_format_code()
+
+     def _make_default_format_code(self):
+         bits = (self.entries_per_directory - 1).bit_length()
+         hexdigits = (bits + 3) // 4
+         return f"{{:0{hexdigits:d}x}}"
+
+     def __call__(self, integer: int) -> str:
+         assert integer >= 0
+         fmt = self.format_code
+         r = []
+         div = integer
+         ext = self.file_suffix
+         n = self.entries_per_directory
+         while True:
+             div, rem = divmod(div, n)
+             r.append(fmt.format(rem) + ext)
+             ext = ""  # only add the extension on the first iteration
+             if not div:
+                 break
+         r.reverse()
+         return "/".join(r)
+
+     def invert(self, path: str) -> int:
+         if not path.endswith(ext := self.file_suffix):
+             raise InvalidPathError
+
+         number_hex = path[: -len(ext)].replace("/", "")
+         try:
+             return int(number_hex, 16)
+         except Exception as exc:
+             raise InvalidPathError from exc
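Usage sketch for IntegerToPath (illustrative only, not part of the package; the module's file name is not shown in this diff):

    # With the defaults (256 entries per directory, ".bin" suffix), the
    # generated format code is "{:02x}", so each path component is one hex byte.
    itp = IntegerToPath()
    assert itp(300) == "01/2c.bin"        # divmod(300, 256) == (1, 44) == (0x01, 0x2c)
    assert itp.invert("01/2c.bin") == 300
    assert itp(5) == "05.bin"             # small integers produce a single component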
vocker/multihash.py ADDED
@@ -0,0 +1,299 @@
+ from __future__ import annotations
+
+ import base64
+ import dataclasses
+ from functools import cached_property
+ import hashlib
+ import re
+ import typing as ty
+
+ from sansio_tools.parser import BinaryParser
+
+ __all__ = [
+     "HashFunction",
+     "Hasher",
+     "Digest",
+     "HashFunctionRegistry",
+     "registry",
+     "multihash_varint_encode",
+     "multihash_varint_decode",
+     "multibase_encode_base64url",
+     "multibase_decode_base64url",
+     "BadHashSpecError",
+     "InvalidHashError",
+ ]
+
+
+ def multihash_varint_encode(n: int) -> bytes:
+     assert n >= 0, "n must be nonnegative"
+     output = []
+     while n or not output:
+         byte = n & 127
+         n >>= 7
+         if n:
+             byte |= 128
+         output.append(byte)
+     return bytes(output)
+
+
+ def multihash_varint_decode(p: BinaryParser):
+     """
+     Decode a multihash varint from a :class:`BinaryParser`.
+
+     Only canonical encodings are accepted; a non-canonical (overlong)
+     encoding raises ValueError.
+     """
+     return p.read_variable_length_int_7bit(
+         9, byteorder="big", continuation_bit_value=True, require_canonical=True
+     )
+
+
+ _base64url_re = re.compile(b"[a-zA-Z0-9_-]*")
+
+
+ def multibase_decode_base64url(data: str | bytes) -> bytes:
+     """
+     Decode multibase base64url data.
+
+     https://github.com/multiformats/multibase
+     """
+     if type(data) is str:
+         data = data.encode("ascii")
+
+     if data[:1] != b"u":
+         raise AssertionError("only base64url is supported")
+
+     b64data = data[1:]
+
+     # Python's base64.b64decode implementation simply ignores trailing data
+     # after the padding, so check that the data is made up entirely of
+     # valid characters.
+     if not _base64url_re.fullmatch(b64data):
+         raise ValueError("base64url-encoded data contains invalid characters")
+
+     return base64.urlsafe_b64decode(b64data + b"==")
+
+
+ def multibase_encode_base64url(data: bytes) -> str:
+     return "u" + base64.urlsafe_b64encode(data).rstrip(b"=").decode("ascii")
+
+
+ class BadHashSpecError(Exception):
+     pass
+
+
+ class InvalidHashError(KeyError, BadHashSpecError):
+     pass
+
+
+ @dataclasses.dataclass(eq=False)
+ class HashFunction:
+     """
+     Describes a hash function.
+
+     Parameters
+     ----------
+     name: str
+         User-friendly name.
+     function_code: int
+         [Multihash](https://github.com/multiformats/multihash) function code
+         used as a prefix to the multihash.
+     digest_size: int
+         Multihash output digest size in bytes, used as the second prefix.
+     hashlib_name: str
+         Name used to instantiate a hash function using :func:`hashlib.new`.
+     hashlib_needs_digest_size: bool
+         Whether the digest size is variable (as for SHAKE-256) and must be
+         passed to ``digest()`` explicitly.
+     """
+
+     name: str
+     function_code: int
+     digest_size: int
+     hashlib_name: str
+     hashlib_needs_digest_size: bool
+
+     def __str__(self):
+         return f"registry.name_to_hash[{self.name!r}]"
+
+     @cached_property
+     def multihash_prefix(self) -> bytes:
+         e = multihash_varint_encode
+         return e(self.function_code) + e(self.digest_size)
+
+     def digest_from_bytes(self, data: bytes | bytearray | memoryview):
+         data = bytes(data)
+         if len(data) != self.digest_size:
+             raise ValueError("incorrect digest length")
+         return Digest(self, data)
+
+     def __call__(self):
+         wrapped = hashlib.new(self.hashlib_name)
+         if self.hashlib_needs_digest_size:
+             cls = ExplicitLengthHasher
+         else:
+             cls = ImplicitLengthHasher
+         return cls(function=self, wrapped=wrapped)
+
+
+ @dataclasses.dataclass
+ class Hasher:
+     """
+     An instantiated hash function. Can be used to hash data via :meth:`update`
+     or to produce a digest using :meth:`digest`.
+     """
+
+     function: HashFunction
+     wrapped: ty.Any
+
+     @property
+     def digest_size(self):
+         return self.function.digest_size
+
+     def update(self, data):
+         self.wrapped.update(data)
+         return self
+
+     def update_iter(self, data):
+         for x in data:
+             self.update(x)
+         return self
+
+     def digest(self) -> Digest:
+         return Digest(self.function, self.digest_bytes())
+
+     def digest_bytes(self) -> bytes:
+         raise NotImplementedError
+
+     def copy(self):
+         return dataclasses.replace(self, wrapped=self.wrapped.copy())
+
+
+ @dataclasses.dataclass
+ class MultiHasher:
+     hashers: dict[ty.Any, Hasher]
+     size: int = 0
+
+     def update(self, data):
+         for h in self.hashers.values():
+             h.update(data)
+         self.size += len(data)
+         return self
+
+     def digest(self):
+         return {k: v.digest() for k, v in self.hashers.items()}
+
+     def copy(self):
+         return dataclasses.replace(self, hashers={k: v.copy() for k, v in self.hashers.items()})
+
+
+ @dataclasses.dataclass
+ class ImplicitLengthHasher(Hasher):
+     def digest_bytes(self):
+         return self.wrapped.digest()
+
+
+ @dataclasses.dataclass
+ class ExplicitLengthHasher(Hasher):
+     def digest_bytes(self):
+         return self.wrapped.digest(self.function.digest_size)
+
+
+ @dataclasses.dataclass(frozen=True, repr=False)
+ class Digest:
+     function: HashFunction
+     digest: bytes
+
+     def __repr__(self):
+         return f"<Digest {self.function.name} {self.digest.hex()}>"
+
+     def to_multihash_bytes(self) -> bytes:
+         """Output a multihash digest as a bytestring."""
+         return self.function.multihash_prefix + self.digest
+
+     def to_multihash_base64url(self) -> str:
+         """Output a multihash digest using multibase base64url encoding."""
+         return multibase_encode_base64url(self.to_multihash_bytes())
+
+
+ @dataclasses.dataclass(eq=False)
+ class HashFunctionRegistry:
+     HASHES = (
+         HashFunction("sha2-256", 0x12, 32, "sha256", False),
+         HashFunction("sha2-512", 0x13, 64, "sha512", False),
+         HashFunction("sha3-512", 0x14, 64, "sha3_512", False),
+         HashFunction("shake256-512", 0x19, 64, "shake_256", True),
+         HashFunction("blake2s", 0xB260, 32, "blake2s", False),
+         HashFunction("blake2b", 0xB240, 64, "blake2b", False),
+     )
+     hashes: list[HashFunction] | None = dataclasses.field(default=None)
+     name_to_hash: dict[str, HashFunction] = dataclasses.field(init=False, default_factory=dict)
+     code_and_size_to_hash: dict[tuple[int, int], HashFunction] = dataclasses.field(
+         init=False, default_factory=dict
+     )
+
+     def __post_init__(self):
+         if self.hashes is None:
+             # Copy into a list so that register() can append to it.
+             self.hashes = list(self.HASHES)
+
+         for h in self.hashes:
+             self._register(h)
+
+     def _register(self, desc: HashFunction):
+         self.name_to_hash[desc.name] = desc
+         self.code_and_size_to_hash[desc.function_code, desc.digest_size] = desc
+
+     def register(self, desc: HashFunction):
+         self.hashes.append(desc)
+         self._register(desc)
+
+     def decode(self, multihash: str | bytes | bytearray | memoryview) -> Digest:
+         if isinstance(multihash, str):
+             multihash = multibase_decode_base64url(multihash)
+         code, size, out = self.static_decode(multihash)
+         try:
+             function = self.code_and_size_to_hash[code, size]
+         except KeyError:
+             raise InvalidHashError((code, size)) from None
+         return Digest(function, out)
+
+     def decode_from_code_and_digest(self, function_code: int, digest: bytes) -> Digest:
+         try:
+             function = self.code_and_size_to_hash[function_code, len(digest)]
+         except KeyError:
+             raise InvalidHashError((function_code, len(digest))) from None
+         return Digest(function, digest)
+
+     @staticmethod
+     def static_decode(data: bytes | bytearray | memoryview, check=True) -> tuple[int, int, bytes]:
+         """
+         Decode a multihash (https://github.com/multiformats/multihash) into a
+         tuple ``(function_code, digest_size, hash_function_output)``.
+
+         If ``check`` is True, check that
+         ``len(hash_function_output) == digest_size`` and raise ValueError
+         otherwise.
+         """
+         result = None
+
+         def _gen(p):
+             nonlocal result
+             function_code = yield from multihash_varint_decode(p)
+             digest_size = yield from multihash_varint_decode(p)
+             result = function_code, digest_size
+
+         p = BinaryParser(trailing_data_raises=False)
+         p.generator = _gen(p)
+         p.feed(data)
+         p.feed(b"")
+         function_code, digest_size = result
+         hash_function_output = bytes(p.queue)
+
+         if check and len(hash_function_output) != digest_size:
+             raise ValueError("multihash output does not match digest length")
+
+         return (function_code, digest_size, hash_function_output)
+
+
+ registry = HashFunctionRegistry()
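Usage sketch for vocker/multihash.py (illustrative only, not part of the package; decoding goes through the sansio_tools dependency):

    fn = registry.name_to_hash["sha2-256"]
    digest = fn().update(b"hello").digest()
    assert digest.to_multihash_bytes()[:2] == b"\x12\x20"   # function code 0x12, size 32
    encoded = digest.to_multihash_base64url()               # "u..."-prefixed multibase string
    assert registry.decode(encoded) == digest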
vocker/py.typed ADDED
File without changes
@@ -0,0 +1,248 @@
+ from __future__ import annotations
+
+ import abc
+ import contextlib
+ import io
+ import typing as ty
+
+ import attr
+ from sansio_tools.queue import BytesQueue, FileAdapterFromGeneratorBytes
+
+
+ _bytes = bytes | bytearray | memoryview
+
+
+ class CompressorInterface(abc.ABC):
+     @abc.abstractmethod
+     def compress(self, data: _bytes | None) -> ty.Iterator[bytes]: ...
+
+
+ class DecompressorInterface(abc.ABC):
+     eof: bool
+     unused_data: bytes
+
+     @abc.abstractmethod
+     def feed(self, data: _bytes | None) -> None: ...
+
+     @abc.abstractmethod
+     def read(self, max_length: int) -> _bytes: ...
+
+
+ @attr.s(eq=False, hash=False)
+ class CompressorStdlibAdapter(CompressorInterface):
+     """Adapter for stdlib-style compressors whose compress() and flush() return bytes."""
+
+     object = attr.ib()
+
+     def compress(self, data) -> ty.Iterator[_bytes]:
+         if data is None:
+             return iter((self.object.flush(),))
+         else:
+             return iter((self.object.compress(data),))
+
+
+ @attr.s(eq=False, hash=False)
+ class CompressorZstdAdapter(CompressorInterface):
+     """Adapter for compressors whose compress() and flush() return iterables of chunks."""
+
+     object = attr.ib()
+
+     def compress(self, data) -> ty.Iterator[_bytes]:
+         if data is None:
+             return self.object.flush()
+         else:
+             return self.object.compress(data)
+
+
+ @attr.s(eq=False, hash=False)
+ class DecompressorStdlibAdapter(DecompressorInterface):
+     """Adapter for zlib-style decompressors (unconsumed_tail and flush())."""
+
+     object = attr.ib()
+     _input: BytesQueue = attr.ib(factory=BytesQueue, init=False)
+     _input_eof = False
+     _flushed = False
+
+     @property
+     def eof(self):
+         return self.object.eof
+
+     @property
+     def unused_data(self):
+         return self.object.unused_data
+
+     def feed(self, data):
+         if data is None:
+             self._input_eof = True
+         elif self._input_eof:
+             raise AssertionError("cannot feed data after eof")
+         else:
+             self._input.append(data)
+
+     def read(self, max_length: int):
+         obj = self.object
+
+         if (b := self._input.popleft_any()) is None:
+             # Input queue is empty. Perform a flush if the input is finished.
+             if self._input_eof and not self._flushed:
+                 self._flushed = True
+                 return obj.flush()
+
+             # The input stream isn't done yet, so we must wait for more data.
+             return b""
+         else:
+             result = obj.decompress(b, max_length)
+
+             # Put back whatever input wasn't consumed. We will need to feed it back in.
+             self._input.appendleft(obj.unconsumed_tail)
+
+             return result
+
+
+ @attr.s(eq=False, hash=False)
+ class DecompressorStdlibNeedsInputAdapter(DecompressorStdlibAdapter):
+     """Adapter for lzma-, bz2- and pyzstd-style decompressors (needs_input, no flush())."""
+
+     def read(self, max_length: int):
+         obj = self.object
+
+         if (b := self._input.popleft_any()) is None and obj.needs_input:
+             # Input queue is empty and the decompressor wants more data, so
+             # nothing can be produced until feed() is called again. These
+             # decompressor objects have no flush(); truncated input is
+             # detected by the caller through the eof attribute.
+             return b""
+
+         try:
+             return obj.decompress(b or b"", max_length)
+         except EOFError:
+             return b""
+
+
+ @attr.s(eq=False, hash=False)
+ class CompressIO(io.RawIOBase):
+     file: ty.BinaryIO = attr.ib()
+     compressor: CompressorInterface = attr.ib()
+     _closed = False
+
+     def readinto(self, buffer):
+         raise NotImplementedError
+
+     def write(self, buffer):
+         for x in self.compressor.compress(buffer):
+             self.file.write(x)
+         return len(buffer)
+
+     def close(self):
+         if not self._closed:
+             self._closed = True
+             # Flush the compressor and write out the trailing data.
+             for x in self.compressor.compress(None):
+                 self.file.write(x)
+         super().close()
+
+     def readable(self):
+         return False
+
+     def writable(self):
+         return True
+
+     def seekable(self):
+         return False
+
+
+ class DecompressIOError(ValueError):
+     pass
+
+
+ @attr.s(eq=False, hash=False)
+ class DecompressIO(io.RawIOBase):
+     file: ty.BinaryIO = attr.ib()
+     _decompressor: DecompressorInterface = attr.ib()
+     _buffer_size: int = attr.ib(default=65536)
+     _strict: bool = attr.ib(default=True)
+     _closed = False
+     _input_eof = False
+     _position = 0
+
+     def tell(self):
+         return self._position
+
+     def readable(self):
+         return True
+
+     def writable(self):
+         return False
+
+     def seekable(self):
+         return False
+
+     def readinto(self, buffer):
+         if not buffer:
+             return 0
+
+         dec = self._decompressor
+         while True:
+             b = dec.read(len(buffer))
+             if b:
+                 buffer[: (n := len(b))] = b
+                 self._position += n
+                 return n
+
+             if self._input_eof:
+                 if self._strict:
+                     if not dec.eof:
+                         raise DecompressIOError("truncated input")
+                     if dec.unused_data:
+                         raise DecompressIOError("unused data after end of compressed stream")
+                 return 0
+
+             if c := self.file.read(self._buffer_size):
+                 dec.feed(c)
+             else:
+                 dec.feed(None)
+                 self._input_eof = True
+
+
+ def make_xz_compressor(preset: int = 6) -> CompressorInterface:
+     import lzma
+
+     return CompressorStdlibAdapter(
+         lzma.LZMACompressor(format=lzma.FORMAT_XZ, check=lzma.CHECK_CRC32, preset=preset)
+     )
+
+
+ def make_xz_decompressor() -> DecompressorInterface:
+     import lzma
+
+     return DecompressorStdlibNeedsInputAdapter(lzma.LZMADecompressor(format=lzma.FORMAT_XZ))
+
+
+ def make_zstd_compressor(level: int = 3) -> CompressorInterface:
+     import pyzstd
+
+     return CompressorStdlibAdapter(pyzstd.ZstdCompressor(level))
+
+
+ def make_zstd_decompressor() -> DecompressorInterface:
+     import pyzstd
+
+     return DecompressorStdlibNeedsInputAdapter(pyzstd.ZstdDecompressor())
+
+
+ compressors = {"zst": make_zstd_compressor, "xz": make_xz_compressor}
+ decompressors = {"zst": make_zstd_decompressor, "xz": make_xz_decompressor}
+
+
+ def open_compressor(file, compressor: str | CompressorInterface) -> ty.ContextManager[CompressIO]:
+     if not isinstance(compressor, CompressorInterface):
+         compressor = compressors[compressor]()
+     return contextlib.closing(CompressIO(file, compressor))
+
+
+ def open_decompressor(
+     file, decompressor: str | DecompressorInterface
+ ) -> ty.ContextManager[DecompressIO]:
+     if not isinstance(decompressor, DecompressorInterface):
+         decompressor = decompressors[decompressor]()
+     return contextlib.closing(DecompressIO(file, decompressor))
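Round-trip usage sketch for the compression helpers above (illustrative only, not part of the package; the module's file name is not shown in this diff, and "zst" support assumes the pyzstd package is installed):

    import io

    raw = io.BytesIO()
    with open_compressor(raw, "zst") as f:
        f.write(b"hello world")          # CompressIO flushes the compressor on close()

    raw.seek(0)
    with open_decompressor(raw, "zst") as f:
        assert f.read() == b"hello world"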