vocker-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vocker/__init__.py +0 -0
- vocker/__main__.py +3 -0
- vocker/cli.py +384 -0
- vocker/dedup.py +1676 -0
- vocker/dedup_models.py +174 -0
- vocker/image.py +870 -0
- vocker/integer_to_path.py +51 -0
- vocker/multihash.py +302 -0
- vocker/py.typed +0 -0
- vocker/repo/__init__.py +0 -0
- vocker/repo/compression.py +239 -0
- vocker/repo/io.py +711 -0
- vocker/system.py +681 -0
- vocker/util.py +120 -0
- vocker/util_models.py +13 -0
- vocker-0.1.0.dist-info/METADATA +56 -0
- vocker-0.1.0.dist-info/RECORD +19 -0
- vocker-0.1.0.dist-info/WHEEL +5 -0
- vocker-0.1.0.dist-info/top_level.txt +1 -0
vocker/repo/io.py
ADDED
@@ -0,0 +1,711 @@
from __future__ import annotations

import abc
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
import contextlib
import enum
import io
import itertools
from math import log2
from pathlib import Path, PurePosixPath
import shutil
import struct
import typing as ty

import attr
import cbor2
from sansio_tools import parser as sansio_parser
from sansio_tools.queue import BytesQueue, FileAdapterFromGeneratorBytes
import structlog

from ..integer_to_path import IntegerToPath
from ..util import assert_
from .. import image as im, multihash as mh, dedup as de, util as ut
from . import compression as cx

logger = structlog.get_logger(__name__)


CBOR_HEADER = b"\xd9\xd9\xf7"
STRUCT_ARCHIVE_SIZE = struct.Struct("!Q")
STRUCT_ESTIMATED_SIZE = struct.Struct("!H")


def cbor_dumps(obj) -> bytes:
    return cbor2.dumps(obj, datetime_as_timestamp=True, canonical=True)


def cbor_loads(data: bytes):
    return cbor2.loads(data)


def cbor_load(fp: ty.BinaryIO, max_size: int):
    left = max_size + 1
    q = BytesQueue()
    while buf := fp.read(left):
        left -= len(buf)
        q.append(buf)
        if not left:
            raise ValueError(f"input exceeded maximum size {max_size}")
    return cbor_loads(bytes(q))


def cbor_dump(obj, fp):
    fp.write(b"\xd9\xd9\xf7")
    cbor2.dump(obj, fp, datetime_as_timestamp=True, canonical=True)


@attr.s(eq=False, hash=False)
class ShardPathsWriter:
    file = attr.ib()

    def write_all(self, files: ty.Iterable[im.SingleFileImageMetadata]):
        cbor_dump([x.to_shard_entry() for x in files], self.file)


@attr.s(eq=False, hash=False)
class ArchiveDataWriter:
    file_archive: ty.BinaryIO = attr.ib()
    file_sizes: ty.BinaryIO = attr.ib()

    def begin_file(self, size: int, digest: mh.Digest):
        self.current_hasher = digest.function()
        self.current_size = 0
        self.expected_digest = digest
        self.expected_size = size

    def write_file_data(self, data: bytes):
        self.current_size += len(data)
        self.current_hasher.update(data)
        self.file_archive.write(data)

    def end_file(self):
        h = self.current_hasher.digest()
        h0 = self.expected_digest
        if (size := self.current_size) != (s0 := self.expected_size) or h != h0:
            raise AssertionError(
                f"written file did not match expected info ({size} != {s0}, {h} != {h0})"
            )
        self.file_sizes.write(STRUCT_ARCHIVE_SIZE.pack(size))


@attr.s(eq=False, hash=False)
class HashesWriter:
    file: ty.BinaryIO = attr.ib()

    def write_all(self, iterable: ty.Iterable[mh.Digest]):
        w = self.file.write
        for x in iterable:
            w(x.digest)


def estimated_archive_sizes_encode(sizes: ty.Iterable[int]):
    _s = STRUCT_ESTIMATED_SIZE
    result = []
    for size in sizes:
        if size <= 0:
            size = 1
        result.append(_s.pack(round(log2(size) * 1024)))
    return b"".join(result)


def estimated_archive_sizes_decode(data: bytes) -> list[int]:
    _s = STRUCT_ESTIMATED_SIZE
    data = memoryview(data)
    result = []
    for i in range(0, len(data), 2):
        [x] = _s.unpack(data[i : i + 2])
        result.append(round(2.0 ** (x / 1024)))
    return result


class MapXToYOperatorEnum(enum.Enum):
    OUT = 1
    AND = 2
    OR = 3


@attr.s(eq=False, hash=False)
class MapShardToArchiveWriterTrivial:
    file: ty.BinaryIO = attr.ib()

    def write_all(self, *, shard_id: int, archive_id: int, archive_size: int):
        data = [
            [archive_id],
            [[MapXToYOperatorEnum.OUT.value, 0, shard_id]],
            estimated_archive_sizes_encode([archive_size]),
        ]
        cbor_dump(data, self.file)


@attr.s(eq=False, hash=False)
class MapImageToShardWriterTrivial:
    file: ty.BinaryIO = attr.ib()

    def write_all(self, *, image_id: int, shard_ids: ty.Iterable[int]):
        shard_ids = list(shard_ids)
        data = [
            shard_ids,
            [[MapXToYOperatorEnum.OUT.value, 0] + list(range(len(shard_ids)))],
        ]
        cbor_dump(data, self.file)


def image_file_entries_for_hashing_iter(
    image_user_data_hash: mh.Digest, entries: ty.Iterable[im.SingleFileImageMetadata]
):
    yield image_user_data_hash.to_multihash_bytes()

    d = {}
    keys = []
    for e in entries:
        keys.append(k := e.to_image_hash_sort_key())
        d[k] = e

    keys.sort()

    for k in keys:
        yield cbor_dumps(d[k].to_data_for_image_hash())


@attr.s(auto_exc=True, str=False)
class RepoFileNotFoundError(Exception):
    message = attr.ib(default="repository file not found")
    filename = attr.ib(default=None)
    remote_accessor = attr.ib(default=None)
    local_base_path = attr.ib(default=None)


@attr.s(auto_exc=True, str=False)
class ImageNotFoundError(Exception):
    message = attr.ib(default="image not found")
    image_id = attr.ib(default=None)


@attr.s(auto_exc=True, str=False)
class BadHashError(Exception):
    message = attr.ib(
        default="""Manifest hash does not match parent. Either the repository is corrupt or in \
the middle of an upload, in which case you should retry the operation."""
    )
    path = attr.ib(default=None)
    digest_expected = attr.ib(default=None)
    digest_observed = attr.ib(default=None)


@attr.s(eq=False, hash=False)
class RepoTransfer:
    path_local: Path = attr.ib()
    dedup: de.Dedup = attr.ib()
    accessor: RemoteRepoAccessor | None = attr.ib()
    _cached_manifests = attr.ib(factory=set, init=False)

    def _download_manifest(
        self, remote_path: PurePosixPath, destination: Path
    ) -> tuple[mh.Digest, ManifestNode]:
        with self.accessor.download_open(remote_path) as file:
            _feed = (reader := ManifestNodeReader()).parser.feed
            q = BytesQueue()

            def feed(b):
                if b:
                    q.append(b)
                _feed(b)

            def _download_remaining_data():
                yield from q.data
                while block := file.read(65536):
                    feed(block)
                    yield block
                feed(None)

            # first we download the header to check the digest and maybe avoid downloading the rest
            block = True
            while block:
                block = file.read(256)
                feed(block if block else None)
                if (digest := reader.out_claimed_digest) is not None:
                    # OK, we have our digest
                    break
            else:
                raise AssertionError("no digest available but no error from parser?")

            _open = lambda: FileAdapterFromGeneratorBytes(_download_remaining_data())
            destination.parent.mkdir(exist_ok=True, parents=True)
            req = self.make_manifest_link_request(digest, destination, dict(open_file_once=_open))
            self.dedup.run_batch([req])

        return digest, reader.out_verified_data

    @staticmethod
    def make_manifest_link_request(manifest_digest, destination, kwargs):
        return de.DedupLinkRequest(
            hash_function=manifest_digest.function,
            link_path=destination,
            file_metadata=de.DedupFileMetadata.make_plain(),
            file_contents_hash=None,
            tags={b"vmf:" + manifest_digest.to_multihash_bytes()},
            **kwargs,
        )

    @staticmethod
    def make_hash_32_path(digest: mh.Digest):
        return PurePosixPath(digest.digest[:4].hex("/")) / "i.cbor"

    def _upload_file(self, local_path, remote_path):
        self.accessor.upload(local_path, remote_path)

    def _local(self, p: PurePosixPath) -> Path:
        return self.path_local / p

    def upload_full(self):
        stack = [(False, PurePosixPath("."))]
        while stack:
            is_exit, path = stack.pop()
            local_path = self._local(path)
            if is_exit:
                self._upload_file(local_path / "manifest.bin", path / "manifest.bin")
            else:
                new_reader = ManifestNodeReader.from_bytes(
                    (local_path / "manifest.bin").read_bytes()
                )
                new_node = new_reader.out_verified_data
                new_digest = new_reader.out_claimed_digest
                with self.dedup.temporary_directory() as tmp:
                    old_digest = old_node = None
                    try:
                        old_digest, old_node = self._download_manifest(
                            path / "manifest.bin", (tmp_mf_path := tmp / "m.bin")
                        )
                    except FileNotFoundError:
                        pass
                    except Exception:
                        logger.warning(
                            "error downloading existing manifest",
                            exc_info=True,
                            data_path=str(path / "manifest.bin"),
                        )

                if old_digest == new_digest:
                    continue  # manifest is identical, carry on

                if old_digest is None:
                    # empty
                    old_node = None
                else:
                    # old manifest found
                    old_node = ManifestNodeReader.from_bytes(old_node).out_verified_data

                # we write the new manifest to /new/
                self._upload_file(local_path / "manifest.bin", "new" / path / "manifest.bin")

                # onto the stack we push a reminder to upload the final version to the right place,
                # but only AFTER all the children have been completed
                stack.append((True, path))

                # we now perform all the file operations for the current directory and recurse for
                # child directories
                for k, v in new_node.children.items():
                    if old_node is None or old_node.children.get(k) != v:
                        # this file or directory is different, so we will need to recurse into it
                        is_dir, digest = v
                        if is_dir:
                            # push it onto the stack
                            stack.append((False, path / k))
                        else:
                            # upload file now
                            self._upload_file(local_path / k, path / k)

    def download_full(self, archives: bool = True, manifest_only: bool = False):
        def _should_download(path: PurePosixPath) -> bool:
            if manifest_only:
                return False
            return not (not archives and path.name.startswith("a."))

        self._cached_manifests.clear()
        loc = self._local
        todo = [(PurePosixPath("."), None)]
        while todo:
            path, digest = todo.pop()

            with self.open(path / "manifest.bin") as mf:
                node = ManifestNodeReader.from_bytes(mf.read())

            if digest is not None:
                # avoid checking the top-level hash
                if node.out_claimed_digest != digest:
                    raise BadHashError(
                        path=path, digest_expected=digest, digest_observed=node.out_claimed_digest
                    )

            for item_name, (is_dir, item_digest) in node.out_verified_data.children.items():
                item_path = path / item_name
                if is_dir:
                    todo.append((item_path, item_digest))
                else:
                    if _should_download(item_path):
                        with self.open(item_path):
                            pass

    def _integer_to_path(self, i: int):
        return PurePosixPath(IntegerToPath(file_suffix="_d")(i))

    DEFAULT_CBOR_MAX_SIZE = 2**24

    def open(self, path: PurePosixPath):
        loc = self._local
        mf_path = path.parent / "manifest.bin"
        if (loc_path := loc(path)).exists():
            # don't even check the manifest for existing local files
            return loc_path.open("rb")

        def _not_found():
            raise RepoFileNotFoundError(
                filename=str(path), remote_accessor=self.accessor, local_base_path=self.path_local
            ) from None

        try:
            if (acc := self.accessor) is not None and mf_path not in self._cached_manifests:
                h, node = self._download_manifest(mf_path, loc(mf_path))
                self._cached_manifests.add(mf_path)
            else:
                x = loc(mf_path).read_bytes()
                reader = ManifestNodeReader.from_bytes(loc(mf_path).read_bytes())
                h, node = reader.out_claimed_digest, reader.out_verified_data
        except FileNotFoundError:
            _not_found()

        if mf_path == path:
            # goofy - caller is trying to open the manifest itself
            return loc(path).open("rb")

        try:
            is_dir, digest = node.children[path.name]
        except KeyError:
            _not_found()

        assert_(not is_dir)

        _open = None if acc is None else (lambda: acc.download_open(path))

        req = de.DedupLinkRequest(
            hash_function=digest.function,
            link_path=(loc_path := loc(path)),
            file_metadata=de.DedupFileMetadata.make_plain(),
            file_contents_hash=digest,
            open_file_once=_open,
        )
        # TODO: handle de.MissingContentError
        try:
            self.dedup.run_batch([req])
        except de.BatchError as exc:
            raise exc.requests[0].exc from None
        return loc_path.open("rb")

    @contextlib.contextmanager
    def open_compressed(self, path: PurePosixPath):
        # TODO: use gz if zstd not available
        p = path.parent / (path.name + ".xz")
        with self.open(p) as f1:
            with cx.open_decompressor(f1, "xz") as f:
                yield f

    def download_shard(self, shard_digest: mh.Digest) -> int | None:
        shard_hashes_path = "shard-by-hash-32" / self.make_hash_32_path(shard_digest)
        try:
            shard_hash_to_id = self._read_cbor(shard_hashes_path)
        except RepoFileNotFoundError:
            return None
        try:
            shard_id = shard_hash_to_id[shard_digest.digest]
        except KeyError:
            return None
        assert_(type(shard_id) is int)
        return shard_id

    def _read_cbor(self, path):
        max_size = self.DEFAULT_CBOR_MAX_SIZE
        with self.open(path) as f:
            return cbor_load(f, max_size=max_size)

    def download_image(self, image_id: str, download_archives: bool = True):
        # download image index to locate image ID by hash
        # download image metadata cbor and ID of latest image-to-shard mapping
        # download image-to-shard mapping
        # select shard set
        # for each shard, download metadata + ID of latest shard-to-archive mapping
        # download shard-to-archive mapping
        # select archive set
        digest = mh.registry.decode(image_id)
        hf = digest.function

        def _read_cbor_int(path) -> int:
            with self.open(path) as f:
                value: int = cbor_load(f, max_size=1024)
            assert_(type(value) is int)
            return value

        _read_cbor = self._read_cbor

        def _read_compressed_cbor(path, max_size=None):
            if max_size is None:
                max_size = self.DEFAULT_CBOR_MAX_SIZE
            with self.open_compressed(path) as f:
                return cbor_load(f, max_size=max_size)

        try:
            image_hashes_path = "image-by-hash-32" / self.make_hash_32_path(digest)
            image_hash_to_id = _read_cbor(image_hashes_path)
            img_id = image_hash_to_id[digest.digest]
        except Exception as exc:
            raise ImageNotFoundError(image_id=image_id) from exc
        assert_(type(img_id) is int)

        image_path = "image" / self._integer_to_path(img_id)

        with self.open_compressed(image_path / "u") as f:
            image_meta_hash = hf().update(f.read()).digest()

        i2s_path = "is" / self._integer_to_path(_read_cbor_int(image_path / "is.cbor"))

        with self.open_compressed(i2s_path / "m") as f:
            # HACK: gathering all shards instead of being smart
            shard_ids: list[int] = cbor_loads(f.read())[0]
        assert_(type(shard_ids) is list)
        assert_(all(type(x) is int for x in shard_ids))

        digest_size = hf.digest_size
        shard_entries: dict[str, im.SingleFileImageMetadata] = {}
        for shard_id in shard_ids:
            shard_path = "shard" / self._integer_to_path(shard_id)

            shard_entry_data = _read_compressed_cbor(shard_path / "p")

            with self.open(shard_path / "h.bin") as f:
                for data in shard_entry_data:
                    entry = im.SingleFileImageMetadata.from_shard_entry(
                        data, hf.digest_from_bytes(f.read(digest_size))
                    )
                    shard_entries[entry.path] = entry

        computed_image_hash = (
            hf()
            .update_iter(
                image_file_entries_for_hashing_iter(image_meta_hash, shard_entries.values())
            )
            .digest()
        )

        if (s := computed_image_hash.to_multihash_base64url()) != image_id:
            raise ValueError(f"image hash does not match, expected {image_id}, calculated {s}")

        digest_to_shard_entries = defaultdict(list)
        for entry in shard_entries.values():
            digest_to_shard_entries[entry.digest.digest].append(entry)

        logger.info("finished metadata")

        for shard_id in shard_ids:
            shard_path = "shard" / self._integer_to_path(shard_id)
            s2a_path = "sa" / self._integer_to_path(_read_cbor_int(shard_path / "sa.cbor"))
            archive_ids = _read_compressed_cbor(s2a_path / "m")[0]
            assert_(type(archive_ids) is list)
            assert_(all(type(x) is int for x in archive_ids))

            with self.open_compressed(s2a_path / "m") as f:
                # HACK: gathering all archives instead of being smart
                archive_ids: list[int] = cbor_loads(f.read())[0]

            _size_struct_size = STRUCT_ARCHIVE_SIZE.size
            for archive_id in archive_ids:
                export_inputs = []
                archive_path = "archive" / self._integer_to_path(archive_id)
                offset = 0
                with self.open(archive_path / "h.bin") as f_h, self.open_compressed(
                    archive_path / "s"
                ) as f_s:
                    while sz_bytes := f_s.read(_size_struct_size):
                        # archive file hashes and sizes
                        [size] = STRUCT_ARCHIVE_SIZE.unpack(sz_bytes)
                        digest = f_h.read(digest_size)
                        this_digest_shard_entries = digest_to_shard_entries.pop(digest, None)
                        if this_digest_shard_entries is not None:
                            export_inputs.append(
                                im.SolidArchiveFileInfo(
                                    files=this_digest_shard_entries, offset=offset, size=size
                                )
                            )
                        offset += size

                if download_archives:
                    with self.open_compressed(archive_path / "a") as f:
                        # ensure the contents are available
                        pass

                yield ArchiveFilesExportInfo(archive_path=archive_path / "a", files=export_inputs)

        if digest_to_shard_entries:
            raise ValueError(f"failed to find data for items: {digest_to_shard_entries}")

    def export(
        self,
        exporter: im.VenvExporter,
        iterable: ty.Iterator[ArchiveFilesExportInfo],
        max_workers: int = None,
    ):
        def _process(archive_infos):
            with contextlib.ExitStack() as ex:
                files = []
                for archive_info in archive_infos:
                    archive_io = ex.enter_context(self.open_compressed(archive_info.archive_path))
                    files += (A(archive_io, info) for info in archive_info.files)
                exporter.provide_files(files)

        def _group_archive_infos(iterable):
            n = 0
            lst = []
            for a in iterable:
                if ((n1 := len(a.files)) + n < 5000) and len(lst) < 50:
                    lst.append(a)
                    n += n1
                else:
                    yield lst
                    n = n1
                    lst = [a]
            if lst:
                yield lst

        exporter.begin_session()
        A = im.VenvExportInputFromSolidArchive
        with ThreadPoolExecutor(max_workers=max_workers) as exe, ut.cancel_futures_on_error(exe):
            ut.raise_as_completed(
                exe.submit(_process, a_info_group)
                for a_info_group in _group_archive_infos(iterable)
            )
        exporter.end_session()


@attr.s(frozen=True)
class ArchiveFilesExportInfo:
    archive_path: PurePosixPath = attr.ib()
    files: tuple[im.SolidArchiveFileInfo] = attr.ib(converter=tuple)


def read_multihash(p: sansio_parser.BinaryParser, maximum_digest_size: int):
    function_code = yield from mh.multihash_varint_decode(p)
    digest_size = yield from mh.multihash_varint_decode(p)
    if digest_size > maximum_digest_size:
        raise ValueError("digest size exceeds maximum")
    digest_bytes = yield from p.read_bytes(digest_size)
    return mh.registry.decode_from_code_and_digest(function_code, digest_bytes)


@attr.s(eq=False, hash=False)
class ManifestNode:
    hash_function: mh.HashFunction = attr.ib()
    children: dict[str, tuple[bool, mh.Digest]] = attr.ib()

    @classmethod
    def from_cbor_decoded(cls, hash_function: mh.HashFunction, data):
        is_dir_dict = {b"d": True, b"f": False}
        H = hash_function.digest_from_bytes
        return cls(
            hash_function,
            {k: (is_dir_dict[v[:1]], H(v[1:])) for k, v in data.items()},
        )

    def to_bytes(self) -> tuple[bytes, mh.Digest]:
        hf = self.hash_function
        for is_dir, digest in self.children.values():
            if digest.function != hf:
                raise AssertionError("child and parent must use the same hash function")
        d = {
            name: (b"d" if is_dir else b"f") + digest.digest
            for name, (is_dir, digest) in self.children.items()
        }
        b = cbor_dumps(d)
        h = hf().update(b).digest()
        return h.to_multihash_bytes() + b, h


@attr.s(eq=False, hash=False)
class ManifestNodeReader:
    maximum_digest_size: int = attr.ib(default=1024)
    parser = attr.ib(default=None, init=False)

    out_claimed_digest: mh.Digest = attr.ib(init=False, default=None)
    out_verified_data: ManifestNode = attr.ib(init=False, default=None)

    def __attrs_post_init__(self):
        self.parser = sansio_parser.BinaryParser(self._parse)

    def _parse(self, p: sansio_parser.BinaryParser):
        digest_top = yield from read_multihash(p, self.maximum_digest_size)
        self.out_claimed_digest = digest_top
        hf = digest_top.function
        hasher_top = hf()

        q2 = BytesQueue()
        while not p.eof:
            while p.queue:
                hasher_top.update(p.queue.popleft_any_to(q2))
            yield

        if digest_top != hasher_top.digest():
            raise ValueError("content does not match top-level hash")

        self.out_verified_data = ManifestNode.from_cbor_decoded(hf, cbor2.loads(bytes(q2)))

    @classmethod
    def from_bytes(cls, data):
        (self := cls()).parser.feed(data).feed(None)
        return self

    @classmethod
    def parse_bytes(cls, data: bytes | memoryview):
        return cls.from_data(data).out_verified_data


class RemoteRepoAccessor(abc.ABC):
    def download(self, path: Path, remote_path: PurePosixPath):
        raise NotImplementedError
        if path.is_file():
            offset = path.stat().st_size
        with self.download_open_iter(remote_path=remote_path, offset=offset) as xs, path.open(
            "w+b"
        ) as fw:
            fw.seek(offset)
            for block in xs:
                fw.write(block)

    @abc.abstractmethod
    def download_open(
        self, remote_path: PurePosixPath, offset: int = 0
    ) -> ty.ContextManager[ty.BinaryIO]: ...

    @abc.abstractmethod
    def upload(self, path: Path, remote_path: PurePosixPath): ...


@attr.s(eq=False, hash=False)
class RemoteRepoAccessorFilesystem(RemoteRepoAccessor):
    base_path: Path = attr.ib()

    @contextlib.contextmanager
    def download_open(self, remote_path: PurePosixPath, offset: int = 0):
        with (self.base_path / remote_path).open("rb") as f:
            if offset:
                f.seek(offset)
            yield f

    def upload(self, path, remote_path):
        dst = self.base_path / remote_path
        if dst.exists():
            try:
                dst.unlink()
            except OSError:
                shutil.rmtree(dst)

        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(str(path), str(dst), follow_symlinks=False)
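
The estimated_archive_sizes_encode / estimated_archive_sizes_decode pair above stores each archive size as round(log2(size) * 1024) packed into a big-endian unsigned 16-bit integer: two bytes per entry, sizes up to just under 2**64, and a relative quantization error of roughly 0.03% on decode. A minimal standalone round-trip sketch (the logic is re-implemented here for illustration rather than imported from vocker):

import struct
from math import log2

STRUCT_ESTIMATED_SIZE = struct.Struct("!H")  # big-endian unsigned 16-bit, as in io.py

def encode_sizes(sizes):
    # mirrors estimated_archive_sizes_encode: fixed-point log2 with a 1/1024 step
    out = []
    for size in sizes:
        if size <= 0:
            size = 1
        out.append(STRUCT_ESTIMATED_SIZE.pack(round(log2(size) * 1024)))
    return b"".join(out)

def decode_sizes(data):
    # mirrors estimated_archive_sizes_decode
    return [
        round(2.0 ** (STRUCT_ESTIMATED_SIZE.unpack(data[i : i + 2])[0] / 1024))
        for i in range(0, len(data), 2)
    ]

encoded = encode_sizes([1, 4096, 10_000_000, 2**40])
print(len(encoded))           # 8 -- two bytes per size
print(decode_sizes(encoded))  # powers of two decode exactly; other sizes come back within ~0.03%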
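
make_hash_32_path shards the per-hash index files ("image-by-hash-32", "shard-by-hash-32") across a four-level directory tree built from the first four digest bytes (32 bits of fan-out), with the hash-to-id map stored in an i.cbor leaf. A small illustration of the resulting path shape (the digest prefix below is made up, not a real image hash):

from pathlib import PurePosixPath

digest_prefix = bytes.fromhex("a1b2c3d4")  # illustrative first four digest bytes
print(PurePosixPath(digest_prefix[:4].hex("/")) / "i.cbor")  # a1/b2/c3/d4/i.cbor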
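
manifest.bin, as written by ManifestNode.to_bytes and checked by read_multihash / ManifestNodeReader, is a multihash header (varint hash-function code, varint digest size, digest bytes) followed by a canonical CBOR map from child name to a one-byte kind marker (b"d" for directories, b"f" for files) plus the child's raw digest; the reader recomputes the hash of the CBOR body and rejects the manifest if it does not match the claimed digest. A standalone sketch of that layout, using hashlib.sha256 and the standard multihash code 0x12 as stand-ins for vocker's mh machinery (an assumption made for illustration, not the package's own API):

import hashlib
import cbor2

def build_manifest(children):
    # children: {name: (is_dir, raw_digest_bytes)}, serialized as in ManifestNode.to_bytes
    body = cbor2.dumps(
        {name: (b"d" if is_dir else b"f") + digest for name, (is_dir, digest) in children.items()},
        canonical=True,
    )
    digest = hashlib.sha256(body).digest()
    # multihash header: function code 0x12 (sha2-256) and digest length 32, each one varint byte here
    return bytes([0x12, len(digest)]) + digest + body

def read_manifest(data):
    code, size = data[0], data[1]
    if code != 0x12 or size != 32:
        raise ValueError("this sketch only handles single-byte sha2-256 multihash headers")
    claimed, body = data[2 : 2 + size], data[2 + size :]
    if hashlib.sha256(body).digest() != claimed:
        raise ValueError("content does not match top-level hash")
    return {k: (v[:1] == b"d", bytes(v[1:])) for k, v in cbor2.loads(body).items()}

child_digest = hashlib.sha256(b"example child contents").digest()
blob = build_manifest({"a.xz": (False, child_digest), "sub": (True, child_digest)})
print(read_manifest(blob))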