vocker-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vocker/repo/io.py ADDED
@@ -0,0 +1,711 @@
1
+ from __future__ import annotations
2
+
3
+ import abc
4
+ from collections import defaultdict
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ import contextlib
7
+ import enum
8
+ import io
9
+ import itertools
10
+ from math import log2
11
+ from pathlib import Path, PurePosixPath
12
+ import shutil
13
+ import struct
14
+ import typing as ty
15
+
16
+ import attr
17
+ import cbor2
18
+ from sansio_tools import parser as sansio_parser
19
+ from sansio_tools.queue import BytesQueue, FileAdapterFromGeneratorBytes
20
+ import structlog
21
+
22
+ from ..integer_to_path import IntegerToPath
23
+ from ..util import assert_
24
+ from .. import image as im, multihash as mh, dedup as de, util as ut
25
+ from . import compression as cx
26
+
27
+ logger = structlog.get_logger(__name__)
28
+
29
+
30
+ CBOR_HEADER = b"\xd9\xd9\xf7"
31
+ STRUCT_ARCHIVE_SIZE = struct.Struct("!Q")
32
+ STRUCT_ESTIMATED_SIZE = struct.Struct("!H")
33
+
34
+
35
+ def cbor_dumps(obj) -> bytes:
36
+ return cbor2.dumps(obj, datetime_as_timestamp=True, canonical=True)
37
+
38
+
39
+ def cbor_loads(data: bytes):
40
+ return cbor2.loads(data)
41
+
42
+
43
+ def cbor_load(fp: ty.BinaryIO, max_size: int):
44
+ left = max_size + 1
45
+ q = BytesQueue()
46
+ while buf := fp.read(left):
47
+ left -= len(buf)
48
+ q.append(buf)
49
+ if not left:
50
+ raise ValueError(f"input exceeded maximum size {max_size}")
51
+ return cbor_loads(bytes(q))
52
+
53
+
54
+ def cbor_dump(obj, fp):
55
+ fp.write(b"\xd9\xd9\xf7")
56
+ cbor2.dump(obj, fp, datetime_as_timestamp=True, canonical=True)
57
+
58
+
59
+ @attr.s(eq=False, hash=False)
60
+ class ShardPathsWriter:
61
+ file = attr.ib()
62
+
63
+ def write_all(self, files: ty.Iterable[im.SingleFileImageMetadata]):
64
+ cbor_dump([x.to_shard_entry() for x in files], self.file)
65
+
66
+
67
+ @attr.s(eq=False, hash=False)
68
+ class ArchiveDataWriter:
69
+ file_archive: ty.BinaryIO = attr.ib()
70
+ file_sizes: ty.BinaryIO = attr.ib()
71
+
72
+ def begin_file(self, size: int, digest: mh.Digest):
73
+ self.current_hasher = digest.function()
74
+ self.current_size = 0
75
+ self.expected_digest = digest
76
+ self.expected_size = size
77
+
78
+ def write_file_data(self, data: bytes):
79
+ self.current_size += len(data)
80
+ self.current_hasher.update(data)
81
+ self.file_archive.write(data)
82
+
83
+ def end_file(self):
84
+ h = self.current_hasher.digest()
85
+ h0 = self.expected_digest
86
+ if (size := self.current_size) != (s0 := self.expected_size) or h != h0:
87
+ raise AssertionError(
88
+ f"written file did not match expected info ({size} != {s0}, {h} != {h0})"
89
+ )
90
+ self.file_sizes.write(STRUCT_ARCHIVE_SIZE.pack(size))
91
+
92
+
93
+ @attr.s(eq=False, hash=False)
94
+ class HashesWriter:
95
+ file: ty.BinaryIO = attr.ib()
96
+
97
+ def write_all(self, iterable: ty.Iterable[mh.Digest]):
98
+ w = self.file.write
99
+ for x in iterable:
100
+ w(x.digest)
101
+
102
+
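+ # Archive sizes are stored on a log scale: each size is encoded as
+ # round(log2(size) * 1024) in an unsigned 16-bit big-endian field, which
+ # covers sizes up to roughly 2**64 bytes with well under 0.1% relative error.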
103
+ def estimated_archive_sizes_encode(sizes: ty.Iterable[int]):
104
+ _s = STRUCT_ESTIMATED_SIZE
105
+ result = []
106
+ for size in sizes:
107
+ if size <= 0:
108
+ size = 1
109
+ result.append(_s.pack(round(log2(size) * 1024)))
110
+ return b"".join(result)
111
+
112
+
113
+ def estimated_archive_sizes_decode(data: bytes) -> list[int]:
114
+ _s = STRUCT_ESTIMATED_SIZE
115
+ data = memoryview(data)
116
+ result = []
117
+ for i in range(0, len(data), 2):
118
+ [x] = _s.unpack(data[i : i + 2])
119
+ result.append(round(2.0 ** (x / 1024)))
120
+ return result
121
+
122
+
123
+ class MapXToYOperatorEnum(enum.Enum):
124
+ OUT = 1
125
+ AND = 2
126
+ OR = 3
127
+
128
+
129
+ @attr.s(eq=False, hash=False)
130
+ class MapShardToArchiveWriterTrivial:
131
+ file: ty.BinaryIO = attr.ib()
132
+
133
+ def write_all(self, *, shard_id: int, archive_id: int, archive_size: int):
134
+ data = [
135
+ [archive_id],
136
+ [[MapXToYOperatorEnum.OUT.value, 0, shard_id]],
137
+ estimated_archive_sizes_encode([archive_size]),
138
+ ]
139
+ cbor_dump(data, self.file)
140
+
141
+
142
+ @attr.s(eq=False, hash=False)
143
+ class MapImageToShardWriterTrivial:
144
+ file: ty.BinaryIO = attr.ib()
145
+
146
+ def write_all(self, *, image_id: int, shard_ids: ty.Iterable[int]):
147
+ shard_ids = list(shard_ids)
148
+ data = [
149
+ shard_ids,
150
+ [[MapXToYOperatorEnum.OUT.value, 0] + list(range(len(shard_ids)))],
151
+ ]
152
+ cbor_dump(data, self.file)
153
+
154
+
155
+ def image_file_entries_for_hashing_iter(
156
+ image_user_data_hash: mh.Digest, entries: ty.Iterable[im.SingleFileImageMetadata]
157
+ ):
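+ # The image hash covers the user-data hash followed by every file entry,
+ # CBOR-encoded and sorted by a canonical key, so the result is independent
+ # of the order in which entries are supplied.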
158
+ yield image_user_data_hash.to_multihash_bytes()
159
+
160
+ d = {}
161
+ keys = []
162
+ for e in entries:
163
+ keys.append(k := e.to_image_hash_sort_key())
164
+ d[k] = e
165
+
166
+ keys.sort()
167
+
168
+ for k in keys:
169
+ yield cbor_dumps(d[k].to_data_for_image_hash())
170
+
171
+
172
+ @attr.s(auto_exc=True, str=False)
173
+ class RepoFileNotFoundError(Exception):
174
+ message = attr.ib(default="repository file not found")
175
+ filename = attr.ib(default=None)
176
+ remote_accessor = attr.ib(default=None)
177
+ local_base_path = attr.ib(default=None)
178
+
179
+
180
+ @attr.s(auto_exc=True, str=False)
181
+ class ImageNotFoundError(Exception):
182
+ message = attr.ib(default="image not found")
183
+ image_id = attr.ib(default=None)
184
+
185
+
186
+ @attr.s(auto_exc=True, str=False)
187
+ class BadHashError(Exception):
188
+ message = attr.ib(
189
+ default="""Manifest hash does not match parent. Either the repository is corrupt or in \
190
+ the middle of an upload, in which case you should retry the operation."""
191
+ )
192
+ path = attr.ib(default=None)
193
+ digest_expected = attr.ib(default=None)
194
+ digest_observed = attr.ib(default=None)
195
+
196
+
197
+ @attr.s(eq=False, hash=False)
198
+ class RepoTransfer:
199
+ path_local: Path = attr.ib()
200
+ dedup: de.Dedup = attr.ib()
201
+ accessor: RemoteRepoAccessor | None = attr.ib()
202
+ _cached_manifests = attr.ib(factory=set, init=False)
203
+
204
+ def _download_manifest(
205
+ self, remote_path: PurePosixPath, destination: Path
206
+ ) -> tuple[mh.Digest, ManifestNode]:
207
+ with self.accessor.download_open(remote_path) as file:
208
+ _feed = (reader := ManifestNodeReader()).parser.feed
209
+ q = BytesQueue()
210
+
211
+ def feed(b):
212
+ if b:
213
+ q.append(b)
214
+ _feed(b)
215
+
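+ # Replays the header bytes already buffered in q, then streams the rest of
+ # the file while still feeding the parser, so the manifest is verified as
+ # it is downloaded.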
216
+ def _download_remaining_data():
217
+ yield from q.data
218
+ while block := file.read(65536):
219
+ feed(block)
220
+ yield block
221
+ feed(None)
222
+
223
+ # first we download the header to check the digest and maybe avoid downloading the rest
224
+ block = True
225
+ while block:
226
+ block = file.read(256)
227
+ feed(block if block else None)
228
+ if (digest := reader.out_claimed_digest) is not None:
229
+ # OK, we have our digest
230
+ break
231
+ else:
232
+ raise AssertionError("no digest available but no error from parser?")
233
+
234
+ _open = lambda: FileAdapterFromGeneratorBytes(_download_remaining_data())
235
+ destination.parent.mkdir(exist_ok=True, parents=True)
236
+ req = self.make_manifest_link_request(digest, destination, dict(open_file_once=_open))
237
+ self.dedup.run_batch([req])
238
+
239
+ return digest, reader.out_verified_data
240
+
241
+ @staticmethod
242
+ def make_manifest_link_request(manifest_digest, destination, kwargs):
243
+ return de.DedupLinkRequest(
244
+ hash_function=manifest_digest.function,
245
+ link_path=destination,
246
+ file_metadata=de.DedupFileMetadata.make_plain(),
247
+ file_contents_hash=None,
248
+ tags={b"vmf:" + manifest_digest.to_multihash_bytes()},
249
+ **kwargs,
250
+ )
251
+
252
+ @staticmethod
253
+ def make_hash_32_path(digest: mh.Digest):
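+ # Fan out by the first four digest bytes, e.g. "ab/cd/ef/01/i.cbor".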
254
+ return PurePosixPath(digest.digest[:4].hex("/")) / "i.cbor"
255
+
256
+ def _upload_file(self, local_path, remote_path):
257
+ self.accessor.upload(local_path, remote_path)
258
+
259
+ def _local(self, p: PurePosixPath) -> Path:
260
+ return self.path_local / p
261
+
262
+ def upload_full(self):
263
+ stack = [(False, PurePosixPath("."))]
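+ # Depth-first traversal with explicit exit markers: (False, path) means
+ # "visit this directory now"; (True, path) means "all children are done,
+ # upload this directory's own manifest".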
264
+ while stack:
265
+ is_exit, path = stack.pop()
266
+ local_path = self._local(path)
267
+ if is_exit:
268
+ self._upload_file(local_path / "manifest.bin", path / "manifest.bin")
269
+ else:
270
+ new_reader = ManifestNodeReader.from_bytes(
271
+ (local_path / "manifest.bin").read_bytes()
272
+ )
273
+ new_node = new_reader.out_verified_data
274
+ new_digest = new_reader.out_claimed_digest
275
+ with self.dedup.temporary_directory() as tmp:
276
+ old_digest = old_node = None
277
+ try:
278
+ old_digest, old_node = self._download_manifest(
279
+ path / "manifest.bin", (tmp_mf_path := tmp / "m.bin")
280
+ )
281
+ except FileNotFoundError:
282
+ pass
283
+ except Exception:
284
+ logger.warning(
285
+ "error downloading existing manifest",
286
+ exc_info=True,
287
+ data_path=str(path / "manifest.bin"),
288
+ )
289
+
290
+ if old_digest == new_digest:
291
+ continue # manifest is identical, carry on
292
+
293
+ if old_digest is None:
+ # no existing remote manifest (or it could not be read); treat as empty
+ old_node = None
+ # else: old_node already holds the ManifestNode parsed by _download_manifest()
299
+
300
+ # we write the new manifest to /new/
301
+ self._upload_file(local_path / "manifest.bin", "new" / path / "manifest.bin")
302
+
303
+ # onto the stack we push a reminder to upload the final version to the right place,
304
+ # but only AFTER all the children have been completed
305
+ stack.append((True, path))
306
+
307
+ # we now perform all the file operations for the current directory and recurse for
308
+ # child directories
309
+ for k, v in new_node.children.items():
310
+ if old_node is None or old_node.children.get(k) != v:
311
+ # this file or directory is different, so we will need to recurse into it
312
+ is_dir, digest = v
313
+ if is_dir:
314
+ # push it onto the stack
315
+ stack.append((False, path / k))
316
+ else:
317
+ # upload file now
318
+ self._upload_file(local_path / k, path / k)
319
+
320
+ def download_full(self, archives: bool = True, manifest_only: bool = False):
321
+ def _should_download(path: PurePosixPath) -> bool:
322
+ if manifest_only:
323
+ return False
324
+ return archives or not path.name.startswith("a.")
325
+
326
+ self._cached_manifests.clear()
328
+ todo = [(PurePosixPath("."), None)]
329
+ while todo:
330
+ path, digest = todo.pop()
331
+
332
+ with self.open(path / "manifest.bin") as mf:
333
+ node = ManifestNodeReader.from_bytes(mf.read())
334
+
335
+ if digest is not None:
336
+ # digest is None only for the top-level manifest, which has no parent hash to check against
337
+ if node.out_claimed_digest != digest:
338
+ raise BadHashError(
339
+ path=path, digest_expected=digest, digest_observed=node.out_claimed_digest
340
+ )
341
+
342
+ for item_name, (is_dir, item_digest) in node.out_verified_data.children.items():
343
+ item_path = path / item_name
344
+ if is_dir:
345
+ todo.append((item_path, item_digest))
346
+ else:
347
+ if _should_download(item_path):
348
+ with self.open(item_path):
349
+ pass
350
+
351
+ def _integer_to_path(self, i: int):
352
+ return PurePosixPath(IntegerToPath(file_suffix="_d")(i))
353
+
354
+ DEFAULT_CBOR_MAX_SIZE = 2**24
355
+
356
+ def open(self, path: PurePosixPath):
357
+ loc = self._local
358
+ mf_path = path.parent / "manifest.bin"
359
+ if (loc_path := loc(path)).exists():
360
+ # don't even check the manifest for existing local files
361
+ return loc_path.open("rb")
362
+
363
+ def _not_found():
364
+ raise RepoFileNotFoundError(
365
+ filename=str(path), remote_accessor=self.accessor, local_base_path=self.path_local
366
+ ) from None
367
+
368
+ try:
369
+ if (acc := self.accessor) is not None and mf_path not in self._cached_manifests:
370
+ h, node = self._download_manifest(mf_path, loc(mf_path))
371
+ self._cached_manifests.add(mf_path)
372
+ else:
373
+ reader = ManifestNodeReader.from_bytes(loc(mf_path).read_bytes())
375
+ h, node = reader.out_claimed_digest, reader.out_verified_data
376
+ except FileNotFoundError:
377
+ _not_found()
378
+
379
+ if mf_path == path:
380
+ # goofy - caller is trying to open the manifest itself
381
+ return loc(path).open("rb")
382
+
383
+ try:
384
+ is_dir, digest = node.children[path.name]
385
+ except KeyError:
386
+ _not_found()
387
+
388
+ assert_(not is_dir)
389
+
390
+ _open = None if acc is None else (lambda: acc.download_open(path))
391
+
392
+ req = de.DedupLinkRequest(
393
+ hash_function=digest.function,
394
+ link_path=(loc_path := loc(path)),
395
+ file_metadata=de.DedupFileMetadata.make_plain(),
396
+ file_contents_hash=digest,
397
+ open_file_once=_open,
398
+ )
399
+ # TODO: handle de.MissingContentError
400
+ try:
401
+ self.dedup.run_batch([req])
402
+ except de.BatchError as exc:
403
+ raise exc.requests[0].exc from None
404
+ return loc_path.open("rb")
405
+
406
+ @contextlib.contextmanager
407
+ def open_compressed(self, path: PurePosixPath):
408
+ # TODO: use gz if zstd not available
409
+ p = path.parent / (path.name + ".xz")
410
+ with self.open(p) as f1:
411
+ with cx.open_decompressor(f1, "xz") as f:
412
+ yield f
413
+
414
+ def download_shard(self, shard_digest: mh.Digest) -> int | None:
415
+ shard_hashes_path = "shard-by-hash-32" / self.make_hash_32_path(shard_digest)
416
+ try:
417
+ shard_hash_to_id = self._read_cbor(shard_hashes_path)
418
+ except RepoFileNotFoundError:
419
+ return None
420
+ try:
421
+ shard_id = shard_hash_to_id[shard_digest.digest]
422
+ except KeyError:
423
+ return None
424
+ assert_(type(shard_id) is int)
425
+ return shard_id
426
+
427
+ def _read_cbor(self, path):
428
+ max_size = self.DEFAULT_CBOR_MAX_SIZE
429
+ with self.open(path) as f:
430
+ return cbor_load(f, max_size=max_size)
431
+
432
+ def download_image(self, image_id: str, download_archives: bool = True):
433
+ # download image index to locate image ID by hash
434
+ # download image metadata cbor and ID of latest image-to-shard mapping
435
+ # download image-to-shard mapping
436
+ # select shard set
437
+ # for each shard, download metadata + ID of latest shard-to-archive mapping
438
+ # download shard-to-archive mapping
439
+ # select archive set
440
+ digest = mh.registry.decode(image_id)
441
+ hf = digest.function
442
+
443
+ def _read_cbor_int(path) -> int:
444
+ with self.open(path) as f:
445
+ value: int = cbor_load(f, max_size=1024)
446
+ assert_(type(value) is int)
447
+ return value
448
+
449
+ _read_cbor = self._read_cbor
450
+
451
+ def _read_compressed_cbor(path, max_size=None):
452
+ if max_size is None:
453
+ max_size = self.DEFAULT_CBOR_MAX_SIZE
454
+ with self.open_compressed(path) as f:
455
+ return cbor_load(f, max_size=max_size)
456
+
457
+ try:
458
+ image_hashes_path = "image-by-hash-32" / self.make_hash_32_path(digest)
459
+ image_hash_to_id = _read_cbor(image_hashes_path)
460
+ img_id = image_hash_to_id[digest.digest]
461
+ except Exception as exc:
462
+ raise ImageNotFoundError(image_id=image_id) from exc
463
+ assert_(type(img_id) is int)
464
+
465
+ image_path = "image" / self._integer_to_path(img_id)
466
+
467
+ with self.open_compressed(image_path / "u") as f:
468
+ image_meta_hash = hf().update(f.read()).digest()
469
+
470
+ i2s_path = "is" / self._integer_to_path(_read_cbor_int(image_path / "is.cbor"))
471
+
472
+ with self.open_compressed(i2s_path / "m") as f:
473
+ # HACK: gathering all shards instead of being smart
474
+ shard_ids: list[int] = cbor_loads(f.read())[0]
475
+ assert_(type(shard_ids) is list)
476
+ assert_(all(type(x) is int for x in shard_ids))
477
+
478
+ digest_size = hf.digest_size
479
+ shard_entries: dict[str, im.SingleFileImageMetadata] = {}
480
+ for shard_id in shard_ids:
481
+ shard_path = "shard" / self._integer_to_path(shard_id)
482
+
483
+ shard_entry_data = _read_compressed_cbor(shard_path / "p")
484
+
485
+ with self.open(shard_path / "h.bin") as f:
486
+ for data in shard_entry_data:
487
+ entry = im.SingleFileImageMetadata.from_shard_entry(
488
+ data, hf.digest_from_bytes(f.read(digest_size))
489
+ )
490
+ shard_entries[entry.path] = entry
491
+
492
+ computed_image_hash = (
493
+ hf()
494
+ .update_iter(
495
+ image_file_entries_for_hashing_iter(image_meta_hash, shard_entries.values())
496
+ )
497
+ .digest()
498
+ )
499
+
500
+ if (s := computed_image_hash.to_multihash_base64url()) != image_id:
501
+ raise ValueError(f"image hash does not match, expected {image_id}, calculated {s}")
502
+
503
+ digest_to_shard_entries = defaultdict(list)
504
+ for entry in shard_entries.values():
505
+ digest_to_shard_entries[entry.digest.digest].append(entry)
506
+
507
+ logger.info("finished metadata")
508
+
509
+ for shard_id in shard_ids:
510
+ shard_path = "shard" / self._integer_to_path(shard_id)
511
+ s2a_path = "sa" / self._integer_to_path(_read_cbor_int(shard_path / "sa.cbor"))
512
+ archive_ids = _read_compressed_cbor(s2a_path / "m")[0]
513
+ assert_(type(archive_ids) is list)
514
+ assert_(all(type(x) is int for x in archive_ids))
515
+
516
+ with self.open_compressed(s2a_path / "m") as f:
517
+ # HACK: gathering all archives instead of being smart
518
+ archive_ids: list[int] = cbor_loads(f.read())[0]
519
+
520
+ _size_struct_size = STRUCT_ARCHIVE_SIZE.size
521
+ for archive_id in archive_ids:
522
+ export_inputs = []
523
+ archive_path = "archive" / self._integer_to_path(archive_id)
524
+ offset = 0
525
+ with self.open(archive_path / "h.bin") as f_h, self.open_compressed(
526
+ archive_path / "s"
527
+ ) as f_s:
528
+ while sz_bytes := f_s.read(_size_struct_size):
529
+ # archive file hashes and sizes
530
+ [size] = STRUCT_ARCHIVE_SIZE.unpack(sz_bytes)
531
+ digest = f_h.read(digest_size)
532
+ this_digest_shard_entries = digest_to_shard_entries.pop(digest, None)
533
+ if this_digest_shard_entries is not None:
534
+ export_inputs.append(
535
+ im.SolidArchiveFileInfo(
536
+ files=this_digest_shard_entries, offset=offset, size=size
537
+ )
538
+ )
539
+ offset += size
540
+
541
+ if download_archives:
542
+ with self.open_compressed(archive_path / "a") as f:
543
+ # ensure the contents are available
544
+ pass
545
+
546
+ yield ArchiveFilesExportInfo(archive_path=archive_path / "a", files=export_inputs)
547
+
548
+ if digest_to_shard_entries:
549
+ raise ValueError(f"failed to find data for items: {digest_to_shard_entries}")
550
+
551
+ def export(
552
+ self,
553
+ exporter: im.VenvExporter,
554
+ iterable: ty.Iterator[ArchiveFilesExportInfo],
555
+ max_workers: int | None = None,
556
+ ):
557
+ def _process(archive_infos):
558
+ with contextlib.ExitStack() as ex:
559
+ files = []
560
+ for archive_info in archive_infos:
561
+ archive_io = ex.enter_context(self.open_compressed(archive_info.archive_path))
562
+ files += (A(archive_io, info) for info in archive_info.files)
563
+ exporter.provide_files(files)
564
+
565
+ def _group_archive_infos(iterable):
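+ # Group archives so each worker task handles at most 50 archives and
+ # stays below ~5000 files.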
566
+ n = 0
567
+ lst = []
568
+ for a in iterable:
569
+ if ((n1 := len(a.files)) + n < 5000) and len(lst) < 50:
570
+ lst.append(a)
571
+ n += n1
572
+ else:
+ # start a new group; skip the yield if the very first archive already
+ # exceeded the threshold and lst is still empty
+ if lst:
+ yield lst
+ n = n1
+ lst = [a]
576
+ if lst:
577
+ yield lst
578
+
579
+ exporter.begin_session()
580
+ A = im.VenvExportInputFromSolidArchive
581
+ with ThreadPoolExecutor(max_workers=max_workers) as exe, ut.cancel_futures_on_error(exe):
582
+ ut.raise_as_completed(
583
+ exe.submit(_process, a_info_group)
584
+ for a_info_group in _group_archive_infos(iterable)
585
+ )
586
+ exporter.end_session()
587
+
588
+
589
+ @attr.s(frozen=True)
590
+ class ArchiveFilesExportInfo:
591
+ archive_path: PurePosixPath = attr.ib()
592
+ files: tuple[im.SolidArchiveFileInfo, ...] = attr.ib(converter=tuple)
593
+
594
+
595
+ def read_multihash(p: sansio_parser.BinaryParser, maximum_digest_size: int):
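+ # Multihash wire format: varint hash-function code, varint digest length,
+ # then the raw digest bytes.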
596
+ function_code = yield from mh.multihash_varint_decode(p)
597
+ digest_size = yield from mh.multihash_varint_decode(p)
598
+ if digest_size > maximum_digest_size:
599
+ raise ValueError("digest size exceeds maximum")
600
+ digest_bytes = yield from p.read_bytes(digest_size)
601
+ return mh.registry.decode_from_code_and_digest(function_code, digest_bytes)
602
+
603
+
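+ # On disk a manifest is the multihash of its CBOR body followed by the body
+ # itself: a CBOR map of child name -> (b"d" for directory or b"f" for file)
+ # plus the child's raw digest. See to_bytes() and ManifestNodeReader below.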
604
+ @attr.s(eq=False, hash=False)
605
+ class ManifestNode:
606
+ hash_function: mh.HashFunction = attr.ib()
607
+ children: dict[str, tuple[bool, mh.Digest]] = attr.ib()
608
+
609
+ @classmethod
610
+ def from_cbor_decoded(cls, hash_function: mh.HashFunction, data):
611
+ is_dir_dict = {b"d": True, b"f": False}
612
+ H = hash_function.digest_from_bytes
613
+ return cls(
614
+ hash_function,
615
+ {k: (is_dir_dict[v[:1]], H(v[1:])) for k, v in data.items()},
616
+ )
617
+
618
+ def to_bytes(self) -> tuple[bytes, mh.Digest]:
619
+ hf = self.hash_function
620
+ for is_dir, digest in self.children.values():
621
+ if digest.function != hf:
622
+ raise AssertionError("child and parent must use the same hash function")
623
+ d = {
624
+ name: (b"d" if is_dir else b"f") + digest.digest
625
+ for name, (is_dir, digest) in self.children.items()
626
+ }
627
+ b = cbor_dumps(d)
628
+ h = hf().update(b).digest()
629
+ return h.to_multihash_bytes() + b, h
630
+
631
+
632
+ @attr.s(eq=False, hash=False)
633
+ class ManifestNodeReader:
634
+ maximum_digest_size: int = attr.ib(default=1024)
635
+ parser = attr.ib(default=None, init=False)
636
+
637
+ out_claimed_digest: mh.Digest = attr.ib(init=False, default=None)
638
+ out_verified_data: ManifestNode = attr.ib(init=False, default=None)
639
+
640
+ def __attrs_post_init__(self):
641
+ self.parser = sansio_parser.BinaryParser(self._parse)
642
+
643
+ def _parse(self, p: sansio_parser.BinaryParser):
644
+ digest_top = yield from read_multihash(p, self.maximum_digest_size)
645
+ self.out_claimed_digest = digest_top
646
+ hf = digest_top.function
647
+ hasher_top = hf()
648
+
649
+ q2 = BytesQueue()
650
+ while not p.eof:
651
+ while p.queue:
652
+ hasher_top.update(p.queue.popleft_any_to(q2))
653
+ yield
654
+
655
+ if digest_top != hasher_top.digest():
656
+ raise ValueError("content does not match top-level hash")
657
+
658
+ self.out_verified_data = ManifestNode.from_cbor_decoded(hf, cbor2.loads(bytes(q2)))
659
+
660
+ @classmethod
661
+ def from_bytes(cls, data):
662
+ (self := cls()).parser.feed(data).feed(None)
663
+ return self
664
+
665
+ @classmethod
666
+ def parse_bytes(cls, data: bytes | memoryview):
667
+ return cls.from_bytes(data).out_verified_data
668
+
669
+
670
+ class RemoteRepoAccessor(abc.ABC):
671
+ def download(self, path: Path, remote_path: PurePosixPath):
672
+ raise NotImplementedError
673
+ if path.is_file():
674
+ offset = path.stat().st_size
675
+ with self.download_open_iter(remote_path=remote_path, offset=offset) as xs, path.open(
676
+ "w+b"
677
+ ) as fw:
678
+ fw.seek(offset)
679
+ for block in xs:
680
+ fw.write(block)
681
+
682
+ @abc.abstractmethod
683
+ def download_open(
684
+ self, remote_path: PurePosixPath, offset: int = 0
685
+ ) -> ty.ContextManager[ty.BinaryIO]: ...
686
+
687
+ @abc.abstractmethod
688
+ def upload(self, path: Path, remote_path: PurePosixPath): ...
689
+
690
+
691
+ @attr.s(eq=False, hash=False)
692
+ class RemoteRepoAccessorFilesystem(RemoteRepoAccessor):
693
+ base_path: Path = attr.ib()
694
+
695
+ @contextlib.contextmanager
696
+ def download_open(self, remote_path: PurePosixPath, offset: int = 0):
697
+ with (self.base_path / remote_path).open("rb") as f:
698
+ if offset:
699
+ f.seek(offset)
700
+ yield f
701
+
702
+ def upload(self, path, remote_path):
703
+ dst = self.base_path / remote_path
704
+ if dst.exists():
705
+ try:
706
+ dst.unlink()
707
+ except OSError:
708
+ shutil.rmtree(dst)
709
+
710
+ dst.parent.mkdir(parents=True, exist_ok=True)
711
+ shutil.copyfile(str(path), str(dst), follow_symlinks=False)