vocker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vocker/image.py ADDED
@@ -0,0 +1,870 @@
1
+ from __future__ import annotations
2
+
3
+ import abc
4
+ from collections import defaultdict
5
+ import contextlib
6
+ import enum
7
+ import functools
8
+ import io
9
+ import os
10
+ import re
11
+ import shutil
12
+ import subprocess as sbp
13
+ import tempfile
14
+ import typing as ty
15
+ import zipfile
16
+ from pathlib import Path, PurePosixPath
17
+ import struct
18
+ import sys
19
+ import itertools
20
+
21
+ import attr
22
+ from cached_property import cached_property
23
+ from sansio_tools.queue import BytesQueue
24
+
25
+ from . import dedup as de, multihash as mh
26
+ from .util import assert_, pathwalk
27
+
28
+
29
def get_start_of_zipfile(zfile) -> int | None:
    """
    Return the byte offset at which the zip archive contained in *zfile* begins.

    :param zfile: Anything accepted by :class:`zipfile.ZipFile` (the caller in this file passes
        an open binary file object).
    :return: The offset of the first byte belonging to the zip archive, or ``None`` when *zfile*
        is not a valid zip file.
    """
    try:
        with zipfile.ZipFile(zfile) as z:
            # header_offset is the number of bytes from essentially the beginning of the file to
            # a local file header. The smaller the offset, the earlier in the file the local
            # header is.
            local_offsets = (info.header_offset for info in z.infolist())

            # start_dir is a pretty internal attribute of the ZipFile object. It initially starts
            # at 0 (beginning of file) but the first local file header may not be there, as is the
            # case with exe files, where there's a zip file at the end of the executable. The
            # minimum of start_dir and the smallest local offset is taken as the start of the zip
            # file (start_dir alone covers archives with no members).
            return min(itertools.chain([z.start_dir], local_offsets))
    except zipfile.BadZipFile:  # "BadZipfile" is only a deprecated alias of this class
        return None
45
+
46
+
47
class Command(enum.Enum):
    """
    Single-byte opcodes of the command sequence encoding.

    Each member's value is the exact byte written to (and read back from) the encoded stream.
    """

    # Introduces a run of literal bytes; a length field follows the opcode.
    LITERAL = b"\x80"
    # Placeholder substituted with the virtualenv base path on export.
    VENV_BASE_PATH = b"\x81"
    # Placeholder substituted with the virtualenv name on export.
    VENV_NAME = b"\x82"
51
+
52
+
53
@attr.s(eq=False, hash=False)
class CommandSequenceWriter:
    """
    Command sequence encoding. There are currently three types of commands:

    - ``0x80 [8 byte "length"] [...length bytes...]``

      This means copy "length" bytes to the output stream. The length is a big-endian unsigned
      64-bit integer.

    - ``0x81``

      This means insert the virtualenv base path here.

    - ``0x82``

      This means copy the virtualenv name here.

    Data fed through :meth:`feed_data` is buffered and emitted as literal commands of at most
    :attr:`max_literal_length` bytes each. Encoded bytes accumulate in :attr:`output`.
    """

    # Upper bound on the payload size of a single literal command.
    max_literal_length: int = attr.ib(default=10 * 1024 * 1024)
    # Queue of already-encoded bytes, ready to be consumed by the caller.
    output: BytesQueue = attr.ib(init=False, factory=BytesQueue)
    # Bytes of the literal currently being accumulated (not yet written to ``output``).
    _current_literal: BytesQueue = attr.ib(init=False, factory=BytesQueue, repr=False)

    def _end_literal(self):
        """Flush the pending literal (if it is non-empty) to :attr:`output` as one command."""
        if n := len(lit := self._current_literal):
            out = self.output
            out.append(Command.LITERAL.value)
            out.append(struct.pack(">Q", n))
            lit.popleft_all_to(out)
            lit.clear()

    def feed(self, b: bytes | memoryview | Command):
        """Feed either a chunk of literal data or a :class:`Command`."""
        if isinstance(b, Command):
            self.feed_command(b)
        else:
            self.feed_data(b)

    def feed_data(self, b: bytes | memoryview):
        """
        Append literal data, flushing a literal command each time the buffer fills up.

        :raises ValueError: if :attr:`max_literal_length` is not positive.
        """
        lit = self._current_literal
        n_max = self.max_literal_length
        if n_max <= 0:
            # With a non-positive limit no byte could ever be consumed and the loop below would
            # never terminate.
            raise ValueError("max_literal_length must be positive")
        while len(b):
            allowed = n_max - len(lit)
            current_b = b[:allowed]
            lit.append(current_b)
            if len(lit) == n_max:
                self._end_literal()
            b = b[allowed:]

    def feed_command(self, c):
        """Flush any pending literal, then emit the opcode of command *c*."""
        self._end_literal()
        self.output.append(c.value)

    def close(self):
        """Flush the pending literal. Must be called once all input has been fed."""
        self._end_literal()

    def generator_pipe(
        self, gen: ty.Iterable[bytes | memoryview | Command]
    ) -> ty.Iterator[bytes | memoryview]:
        """
        Encode every item of *gen* and yield the encoded output as it becomes available.

        The writer is closed after *gen* is exhausted, so the final partial literal is emitted.
        """
        out = self.output
        for item in gen:
            self.feed(item)
            while out:
                yield out.popleft_any()
        self.close()
        while out:
            yield out.popleft_any()
118
+
119
+
120
@attr.s(eq=False, hash=False)
class CommandSequenceReader:
    """
    Decoder for the command sequence encoding: reads literal data and commands from :attr:`f`.
    """

    # Binary stream containing the encoded command sequence.
    f: ty.IO = attr.ib()
    # Number of bytes of the current literal not yet returned. This is a plain class-level
    # default (not an attrs field); it becomes an instance attribute on first assignment.
    literal_length_left = 0

    def _read_from_literal(self, n: int):
        """Return up to *n* bytes of the current literal, or ``None`` if no literal is open."""
        if left := self.literal_length_left:
            output = self.f.read(min(n, left))
            self.literal_length_left -= len(output)
            return output
        return None

    def read(self, n: int) -> bytes | Command:
        """
        Read at most *n* literal bytes OR a command. EOF when an empty bytes is returned.

        A literal command with a zero length carries no data; it is skipped rather than being
        reported, so that callers never receive ``None`` and never mistake it for EOF.
        """
        while True:
            if r := self._read_from_literal(n):
                return r

            if not (c := self.f.read(1)):
                return b""  # EOF

            c = Command(c)
            if c != Command.LITERAL:
                return c

            # Literal header: 8-byte big-endian length. Loop around to read its payload.
            [self.literal_length_left] = struct.unpack(">Q", self.f.read(8))
148
+
149
+
150
+ def _Path(p: Path | str) -> Path:
151
+ if not hasattr(p, "is_absolute"):
152
+ p = Path(p)
153
+ return p if p.is_absolute() else Path.cwd() / p
154
+
155
+
156
def pyenv_split(root_path: Path):
    """
    Walk *root_path* and group its files into buckets.

    Each bucket key is ``"bin:" + key`` or ``"pure:" + key``. A file counts as "bin" when its
    lowercased name matches the binary/pickle extension pattern below; ``.pyc`` files are
    dropped entirely. ``key`` is the ``/``-joined path (relative to *root_path*, ``"."`` for the
    root itself) of the governing directory: directories up to two levels deep define their own
    key, and every direct child of a ``site-packages`` / ``dist-packages`` directory gets a key
    of its own.

    :return: ``defaultdict(list)`` mapping bucket key to the list of file paths.
    """
    ext_pattern = re.compile(r"(\.pyc$)|(\.(?:pkl|pickle|dll|pyd|lib|exe|lib|dylib|so(\.\d+)*)$)")
    per_child_dirs = {"site-packages", "dist-packages"}
    buckets = defaultdict(list)

    def key_of(path: Path):
        return "/".join(path.relative_to(root_path).parts) or "."

    @functools.cache
    def bucket_name(*pieces):
        # Cached so that identical bucket names are built only once.
        return "".join(pieces)

    def visit(path: Path, key: str):
        if path.is_dir():
            if len(path.relative_to(root_path).parts) <= 2:
                key = key_of(path)

            split_children = path.name in per_child_dirs
            for child in path.iterdir():
                visit(child, key_of(child) if split_children else key)
            return

        match = ext_pattern.search(path.name.lower())
        if match is None:
            prefix = "pure:"
        elif match.lastindex == 1:
            # it's a pyc file, ignore it
            return
        elif match.lastindex == 2:
            prefix = "bin:"
        buckets[bucket_name(prefix, key)].append(path)

    visit(root_path, "")
    return buckets
191
+
192
+
193
class VenvImporterReceiver:
    """
    Interface with a single :meth:`call` hook.

    NOTE(review): the class does not derive from :class:`abc.ABC`, so the ``abstractmethod``
    marker is informational only and instantiation is not blocked.
    """

    @abc.abstractmethod
    def call(
        self,
        input_path: Path,
        output_path: PurePosixPath,
        contents: Path | ty.Generator[bytes | memoryview],
        executable: bool,
        template_mode: bool,
    ):
        """
        Receive one file; to be overridden by implementations.

        Presumably *contents* is either a path to copy from or a generator of data blocks, in
        line with how contents are represented elsewhere in this module -- TODO confirm.
        """
203
+
204
+
205
def file_block_iter(path: Path, block_size: int = 65536):
    """
    Yield the contents of the file at *path* as successive binary blocks.

    Every block holds at most *block_size* bytes; iteration ends at end of file.
    """
    with path.open("rb") as stream:
        # The two-argument form of iter() stops at the first empty read, i.e. at EOF.
        yield from iter(functools.partial(stream.read, block_size), b"")
209
+
210
+
211
@attr.s(frozen=True)
class ImageFileMetadata:
    """
    Immutable per-file metadata stored in an image.
    """

    # True when the file carries the "x" flag in its shard entry.
    executable: bool = attr.ib()
214
+
215
+
216
@attr.s
class SingleFileImageMetadata:
    """
    Path, metadata and content digest of a single file inside an image.
    """

    # Location of the file inside the image.
    path: PurePosixPath = attr.ib()
    # File flags (currently only the executable bit).
    metadata: ImageFileMetadata = attr.ib()
    # Digest of the file contents.
    digest: mh.Digest = attr.ib()

    @classmethod
    def from_shard_entry(cls, data: tuple[str, str], digest: mh.Digest):
        """
        Build an instance from a shard entry.

        :param data: ``(path, flags)`` pair as produced by :meth:`to_shard_entry`; *flags* is
            ``""`` for a plain file or ``"x"`` for an executable one.
        :param digest: Digest of the file contents.
        :raises ValueError: if *flags* is neither ``""`` nor ``"x"``.
        """
        p, m = data
        path = PurePosixPath(p)
        if m == "":
            metadata = ImageFileMetadata(executable=False)
        elif m == "x":
            metadata = ImageFileMetadata(executable=True)
        else:
            raise ValueError(f"value: {m!r}")
        return cls(path=path, metadata=metadata, digest=digest)

    def to_shard_entry(self):
        """Return the ``[path, flags]`` list form; inverse of :meth:`from_shard_entry`."""
        return [str(self.path), "x" if self.metadata.executable else ""]

    def to_data_for_image_hash(self):
        """Return ``[digest multihash bytes, path, flags]``."""
        return [self.digest.to_multihash_bytes()] + self.to_shard_entry()

    def to_image_hash_sort_key(self):
        """Return a sort key ordering by UTF-8 path length first, then by the path bytes."""
        s = str(self.path).encode("utf-8")
        return len(s), s
243
+
244
+
245
@attr.s
class VenvImporterFileOutput:
    """
    Result of hashing one imported file: its size plus its image metadata.
    """

    # Total number of content bytes.
    size: int = attr.ib()
    # Path, flags and digest of the file.
    rest: SingleFileImageMetadata = attr.ib()
249
+
250
+
251
@attr.s(eq=False, hash=False)
class VenvImporterToImageMetadata:
    """
    Callable that turns an :class:`ImporterOutput` into a :class:`VenvImporterFileOutput` by
    determining the size and digest of its contents.
    """

    # Hash function used for content digests.
    hash_function: mh.HashFunction = attr.ib()
    # Dedup store consulted for the hash of on-disk files.
    dedup: de.Dedup = attr.ib()

    def __call__(self, output: ImporterOutput) -> VenvImporterFileOutput:
        """
        Compute size and digest for *output*.

        When the contents are a file on disk the dedup store is asked first; otherwise, or when
        it has no answer, the contents are streamed through the hash function.
        """
        digest = None
        source = output.contents
        if isinstance(source, Path):
            known = self.dedup.get_file_hash(self.hash_function, source, check_link=False)
            if known is not None:
                size, digest = known

        if digest is None:
            hasher = self.hash_function()
            size = 0
            for block in output.contents_iter():
                hasher.update(block)
                size += len(block)
            digest = hasher.digest()

        # NOTE(review): the executable flag is always recorded as False here, regardless of
        # ``output.is_executable`` -- confirm this is intended.
        metadata = ImageFileMetadata(executable=False)
        entry = SingleFileImageMetadata(path=output.path, digest=digest, metadata=metadata)
        return VenvImporterFileOutput(size=size, rest=entry)
276
+
277
+
278
+ _PurePathBase = object
279
+
280
+
281
@attr.s(eq=False, hash=False)
class VenvImporter:
    """
    Converts the files of a virtualenv into image entries, replacing occurrences of the venv
    location and name by template commands where necessary.

    Make sure the last component of :attr:`input` is a long random string.
    """

    # Path of the venv as it appears inside the files (launchers, activate scripts).
    input: _PurePathBase | Path = attr.ib(converter=_Path)
    # Path where the venv files can actually be read; defaults to ``input``.
    input_real: Path = attr.ib(default=None)

    def __attrs_post_init__(self):
        self.forbidden_string = self.input.name.encode("utf-8")
        if self.input_real is None:
            self.input_real = self.input

    def _make_output_path(self, p: Path) -> PurePosixPath:
        """Return *p* relative to :attr:`input_real` as a POSIX-style path."""
        return PurePosixPath(*p.relative_to(self.input_real).parts)

    def _handle_exe(self, p: Path):
        """
        Handle an ``.exe`` from the scripts directory.

        If the file has a zip archive appended and a ``#!`` interpreter line right before it,
        yield ``True`` followed by the template pieces in which the venv base path is replaced
        by :attr:`Command.VENV_BASE_PATH`. Otherwise yield *p* itself (copy verbatim).

        :raises AssertionError: if a zip archive is present but the interpreter line cannot be
            located, is badly quoted, or does not point inside the venv.
        """
        with p.open(mode="rb") as f:
            contents = f.read()
            f.seek(0)
            zip_position = get_start_of_zipfile(zfile=f)

        if zip_position is not None:
            # Search only the 1024 bytes preceding the archive. The start is clamped at zero
            # because a negative start would be interpreted as an index from the end of the
            # buffer and make the search window empty or wrong.
            search_start = max(0, zip_position - 1024)
            if (i := contents.rfind(b"\0#!", search_start, zip_position)) < 0:
                raise AssertionError("failed to find #!")
            i += 3  # actual start of path

            path_data = contents[i:zip_position].rstrip(b"\r\n")
            if path_data.startswith(b'"'):
                if not path_data.endswith(b'"'):
                    raise AssertionError("not matching doublequotes??")
                path_data = path_data[1:-1]

            current_path = type(self.input)(path_data.decode("utf-8"))
            if not current_path.is_relative_to(self.input):
                raise AssertionError(f"path {path_data!r} is not under venv base {self.input!r}")

            rel_path = current_path.relative_to(self.input)

            # now let's make it a suffix: joining with a dummy one-character component and
            # dropping that character leaves "<separator><relative path>"
            suffix_path = str("x" / rel_path)[1:]

            yield True  # template_mode
            yield contents[:i]
            yield b'"'
            yield Command.VENV_BASE_PATH
            yield suffix_path.encode("utf-8")
            yield b'"\n\r\n'
            yield contents[zip_position:]
        else:
            yield p  # template_mode

    def _handle_activate_script(self, p: Path):
        """
        Handle an ``activate*`` script.

        Yields the template mode first (``True`` if the venv location or name occurs in the
        text, ``False`` otherwise), then the encoded text interleaved with
        :attr:`Command.VENV_BASE_PATH` / :attr:`Command.VENV_NAME` at each occurrence.
        """
        enc = "utf-8"
        with p.open(mode="rt", encoding=enc) as f:
            text = f.read()

        env_loc = re.escape(str(self.input))
        env_name = re.escape(self.input.name)

        # The full location is tried first so that it wins over the bare name it contains.
        rx = re.compile(f"({env_loc})|({env_name})")
        last_output_pos = 0
        first = True
        for m in rx.finditer(text):
            if first:
                yield True  # template_mode
                first = False
            yield text[last_output_pos : m.start()].encode(enc)

            if m.lastindex == 1:  # env_loc
                yield Command.VENV_BASE_PATH
            else:  # env_name
                yield Command.VENV_NAME

            last_output_pos = m.end()

        if first:
            # this means we found no matches
            yield False  # template_mode
        yield text[last_output_pos:].encode(enc)

    def _handle_dist_info_record(self, p: Path):
        """
        Handle a ``*.dist-info/RECORD`` file: keep each path but blank out the hash and length
        columns. Whitespace-only lines are passed through unchanged.
        """
        yield False  # template_mode
        enc = "utf-8"
        with p.open(mode="rt", encoding=enc) as f:
            for line in f:
                if line.rstrip():
                    path, _hash, _length = line.rsplit(",", maxsplit=2)
                    yield path.encode(enc)
                    yield b",,\n"
                else:
                    yield line.encode(enc)

    def _handle_simple_copy(self, p: Path):
        """Default handler: yield *p* itself, meaning the file is copied verbatim."""
        yield p  # template_mode

    @cached_property
    def _scripts_dir(self):
        return self.input_real / "Scripts"

    @cached_property
    def _excluded(self):
        return {self.input_real / "pyvenv.cfg"}

    def run(
        self, file_path: Path
    ) -> ty.Iterator[ty.Callable[[], ty.ContextManager[ImporterOutput]]]:
        """
        Process one file of the venv.

        Yields nothing for skipped files (``pyvenv.cfg``, ``.pyc`` files and
        ``direct_url.json`` inside a ``.dist-info`` directory). Otherwise yields a single
        zero-argument callable; calling it returns a context manager producing the
        :class:`ImporterOutput` for the file and discarding it on exit.
        """
        p = file_path
        if p in self._excluded:
            return

        if p.suffix.lower() == ".pyc":
            return

        f = None
        if p.parent == self._scripts_dir:
            if p.suffix == ".exe":
                f = self._handle_exe
            elif p.stem.lower() == "activate":
                f = self._handle_activate_script

        if f is None:
            f = self._handle_simple_copy
            if p.parent.name.endswith(".dist-info"):
                if p.name == "RECORD":
                    f = self._handle_dist_info_record
                elif p.name == "direct_url.json":
                    return

        @contextlib.contextmanager
        def make():
            gen = f(p)
            # The first item of every handler is either the template mode or a Path.
            template_mode = ty.cast("bool | Path", next(gen))

            if isinstance(template_mode, Path):
                sum(0 for _ in gen)  # exhaust generator
                gen = template_mode
                template_mode = False
            elif template_mode:
                gen = CommandSequenceWriter().generator_pipe(gen)

            path = ("template" if template_mode else "literal") / self._make_output_path(p)

            out = ImporterOutputVenv(
                path=path, template_mode=template_mode, is_executable=False, contents=gen
            )
            try:
                yield out
            finally:
                out.discard()

        yield make
432
+
433
+
434
+ class _Cancel(Exception):
435
+ pass
436
+
437
+
438
class ImporterOutput:
    """
    One file produced by an importer: where it goes and what it contains.
    """

    # Destination path inside the image.
    path: PurePosixPath
    is_executable: bool
    # Either a file on disk, or a generator producing the data blocks.
    contents: Path | ty.Generator[bytes | memoryview]

    def contents_iter(self) -> ty.Generator[bytes | memoryview]:
        """Return an iterator over the content blocks, reading from disk when needed."""
        source = self.contents
        if isinstance(source, Path):
            return file_block_iter(source)
        return source

    @abc.abstractmethod
    def discard(self):
        """
        Free up memory taken up by :attr:`contents`, if any.
        """
        if isinstance(self.contents, Path):
            return
        # Throwing into the generator terminates it; the exception coming back out is expected.
        with contextlib.suppress(_Cancel):
            self.contents.throw(_Cancel())
459
+
460
+
461
@attr.s(eq=False, hash=False)
class ImporterOutputVenv(ImporterOutput):
    """
    Concrete :class:`ImporterOutput` carrying an additional template flag.

    This object is NOT thread-safe.
    """

    # Destination path inside the image.
    path: PurePosixPath = attr.ib()
    # True when ``contents`` is a command sequence rather than plain data.
    template_mode: bool = attr.ib()
    is_executable: bool = attr.ib()
    # Either a file on disk, or a generator producing the data blocks.
    contents: Path | ty.Generator[bytes | memoryview] = attr.ib()
471
+
472
+
473
class VenvFile:
    """
    Interface of a file that can be materialized on disk or opened for reading.
    """

    # Path of the file inside the image.
    path: PurePosixPath

    @abc.abstractmethod
    def write_to(self, p: Path, will_never_modify: bool) -> None:
        """Write the file contents to *p*."""

    @abc.abstractmethod
    def open_readonly(self) -> ty.IO:
        """Open the file contents for reading."""
481
+
482
+
483
@attr.s(eq=False, hash=False)
class VenvFileForTesting(VenvFile):
    """
    :class:`VenvFile` backed by an ordinary file on the local filesystem.
    """

    # Path of the file inside the image.
    path: PurePosixPath = attr.ib()
    # Where the contents live on the local filesystem.
    local_fs_path: Path = attr.ib()

    def write_to(self, p: Path, will_never_modify: bool) -> None:
        """Copy the backing file to *p*; *will_never_modify* is ignored."""
        source, target = str(self.local_fs_path), str(p)
        shutil.copyfile(source, target)

    def open_readonly(self):
        """Open the backing file in binary read mode."""
        return self.local_fs_path.open("rb")

    @classmethod
    def generate_from_path(cls, base: Path):
        """
        Build a dict mapping the POSIX-style path relative to *base* to an instance, for every
        file reported by ``pathwalk(base)``.
        """
        found = {}
        for root, _dirs, names in pathwalk(base):
            for name in names:
                fs_path = root / name
                rel = PurePosixPath(*fs_path.relative_to(base).parts)
                found[rel] = cls(path=rel, local_fs_path=fs_path)
        return found
503
+
504
+
505
@attr.s(eq=False, hash=False)
class _PycEntry:
    """
    Pair of a Python source path and a pyc path.
    """

    path_py = attr.ib()
    path_pyc = attr.ib()
509
+
510
+
511
def check_process(process: sbp.Popen):
    """
    Raise if *process* has already terminated with a non-zero return code.

    Does nothing when the process is still running or has exited with code zero.

    :raises subprocess.CalledProcessError: carrying the return code and ``process.args``.
    """
    retcode = process.poll()
    if retcode:
        raise sbp.CalledProcessError(retcode, process.args)
514
+
515
+
516
@attr.s(eq=False, hash=False)
class PycGenerator:
    """
    Generates pyc files by running ``compileall`` with a given Python executable.
    """

    # Interpreter used to run ``compileall``.
    python_exe: Path = attr.ib()
    # Value passed to the ``-o`` option of ``compileall``.
    optimization_level: int = attr.ib()
    # Value passed to the ``-j`` option of ``compileall``.
    max_threads: int = attr.ib(default=0)
    # First four bytes of a pyc file produced by ``python_exe``; filled in by ``_analyze``.
    magic: bytes = attr.ib(init=False, default=None)
    # Everything following the module name in a pyc file name; filled in by ``_analyze``.
    suffix: str = attr.ib(init=False, default=None)

    def get_data_to_hash(self):
        """Yield byte strings identifying this generator's optimization level and pyc magic."""
        self._ensure_analyze()
        yield str(self.optimization_level).encode("ascii")
        yield b","
        yield self.magic

    def _compileall(self, base: Path, files: ty.Iterable[Path]) -> None:
        """
        Compile *files* using ``python -m compileall``, passing *base* to its ``-s`` option.

        The process return code is deliberately not checked (see the comment at the end).
        """
        # Materialize once: the iterable is consumed to build the stdin payload and its length
        # is needed again for the progress output, which would fail for a one-shot iterator.
        files = list(files)

        cmd = [str(self.python_exe.absolute())]
        cmd += "-B", "-m", "compileall", "-l", "-s", str(base)
        cmd += "--invalidation-mode", "unchecked-hash"  # disable timestamp and hash checking
        cmd += "-o", str(self.optimization_level)
        cmd += "-j", str(self.max_threads)
        cmd += "-i", "-"  # input from stdin
        cmd_input = b"".join(x for f in files for x in (str(f).encode("utf-8"), b"\n"))

        # The context manager closes the pipes and waits for the process even if an exception
        # is raised while reading its output.
        with sbp.Popen(
            cmd, stdin=sbp.PIPE, stdout=sbp.PIPE, env={"PYTHONUTF8": "1"} | os.environ
        ) as proc:
            # This is not a race condition because compileall does not start until it has
            # received all of its input.
            proc.stdin.write(cmd_input)
            proc.stdin.close()

            i = 0
            for line in proc.stdout:
                # TODO: progress bar
                if line.startswith(b"Compiling "):
                    i += 1
                    if i % 100 == 0:
                        print(f"{i:>5d}/{len(files):<5d}")

            proc.wait()
        # We do not check the return code as some of the source files may have failed to compile.
        # This happens surprisingly often with test cases for syntax errors.

    def _ensure_analyze(self):
        """Run :meth:`_analyze` once, on first use."""
        if self.magic is None:
            self._analyze()

    def _analyze(self):
        """
        Figure out the pyc file naming scheme and magic.
        """
        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            basename = "xyzzy"
            (py_path := tmp / f"{basename}.py").write_bytes(b"")
            self._compileall(tmp, (py_path,))
            py_path.unlink()
            # After removing the source, the only file left must be the generated pyc.
            [pyc_path] = (p for p in tmp.rglob("*") if p.is_file())
            pyc_path: Path
            assert_(pyc_path.name.startswith(basename))
            assert_(pyc_path.name.endswith(".pyc"))
            self.suffix = pyc_path.name[len(basename) :]
            with pyc_path.open("rb") as f:
                self.magic = f.read(4)

    def map_path(self, p: Path):
        """
        Map a source file path to the corresponding pyc file path.
        """
        assert_(p.name.endswith(".py"))
        return p.parent / "__pycache__" / (p.name[:-3] + self.suffix)

    def __call__(self, base: Path, paths: ty.Iterable[Path]):
        """Compile *paths*; see :meth:`_compileall`."""
        self._ensure_analyze()
        self._compileall(base, paths)
590
+
591
+
592
class PycGeneratorMockUseSystemPython(PycGenerator):
    """
    :class:`PycGenerator` variant that always uses the interpreter running this code.
    """

    @property
    def python_exe(self):
        """Path of the running interpreter (``sys.executable``)."""
        return Path(sys.executable)

    @python_exe.setter
    def python_exe(self, value):
        # Assignments (including the one made by the generated __init__) are discarded.
        pass
600
+
601
+
602
@attr.s(eq=False, hash=False)
class _VenvExporterPycFile:
    """
    Bookkeeping record for one source file whose pyc is linked or generated during export.
    """

    # Python source file.
    path_py: Path = attr.ib()
    # Where the corresponding pyc file belongs.
    path_pyc: Path = attr.ib()
    # Dedup tags identifying the expected pyc contents.
    pyc_tags: frozenset[bytes] = attr.ib(default=None)
    # Request used to try linking an already-known pyc from the dedup store.
    link_request: de.DedupLinkRequest = attr.ib(default=None)
608
+
609
+
610
@attr.s(eq=False, hash=False)
class VenvExporter:
    """
    Materializes the files of an image into a virtualenv directory, expanding template files
    and generating pyc files at the end of the session.
    """

    # Hash function used for pyc cache keys.
    hash_function: mh.HashFunction = attr.ib()
    # Dedup store used to link, adopt and hash files.
    dedup: de.Dedup = attr.ib()
    # Path substituted for ``Command.VENV_BASE_PATH``; defaults to ``output_real``.
    output: _PurePathBase | None = attr.ib()
    # Directory the files are actually written to.
    output_real: Path = attr.ib(converter=_Path)
    # Name substituted for ``Command.VENV_NAME``; derived from ``output`` when omitted.
    venv_name: str = attr.ib(default=None)
    # Use the running interpreter instead of the exported one to build pyc files.
    mock_use_system_python: bool = attr.ib(default=False)

    def __attrs_post_init__(self):
        if self.output is None:
            self.output = self.output_real
        if self.venv_name is None:
            # Replace anything other than ASCII letters, digits, "_" and "-" by "_".
            self.venv_name = re.compile("[^0-9a-zA-Z_-]").sub("_", self.output.name)

    def _handle_command(self, c: Command) -> bytes:
        """Return the bytes a template command expands to."""
        if c == Command.VENV_BASE_PATH:
            return str(self.output).encode("utf-8")
        else:  # Can only be Command.VENV_NAME
            return self.venv_name.encode("utf-8")

    def _process_file(self, file: VenvFile):
        """Write *file* to its destination, expanding it first unless it is a literal file."""
        # get rid of the "literal"/"template" at the start, then join the path with the output path
        output = self._map_path(file.path)
        output.parent.mkdir(exist_ok=True, parents=True)
        if file.path.parts[0] == "literal":
            file.write_to(output, will_never_modify=True)
        else:
            with file.open_readonly() as f, output.open("wb") as f_w:
                self._process_template_file(f, f_w)

    def begin_session(self):
        pass

    def end_session(self):
        self._generate_pyc(0)

    def _generate_pyc(self, optimization_level: int):
        """
        1. Find source python files that don't have a pyc file.
        2. Query hashes from dedup.
        3. Attempt to link dedup'd pyc files.
        4. Generate pyc files using compileall.
        5. Adopt newly-generated pyc files into the dedup system.
        """
        exe = self._find_python()
        hf = self.hash_function

        if self.mock_use_system_python:
            PycGenerator_ = PycGeneratorMockUseSystemPython
        else:
            PycGenerator_ = PycGenerator

        pg = PycGenerator_(python_exe=exe, optimization_level=optimization_level)
        # The cache key covers the interpreter binary, the optimization level and the pyc magic.
        h_exe: mh.Digest = self.dedup.get_or_compute_file_hash(hf, exe, check_link=False)[1]
        h_key = hf().update(h_exe.to_multihash_bytes()).update_iter(pg.get_data_to_hash()).digest()
        h_key_bytes = h_key.to_multihash_bytes()

        def _pyc_tag(py_path: Path) -> bytes:
            h_py: mh.Digest = self.dedup.get_or_compute_file_hash(hf, py_path, check_link=False)[1]
            h_pyc = hf().update(h_key_bytes).update(h_py.to_multihash_bytes()).digest()
            return b"pyc:" + h_pyc.to_multihash_bytes()

        py_paths = self.output_real.rglob("*.py")

        todo = [
            _VenvExporterPycFile(
                path_py=path_py,
                path_pyc=pg.map_path(path_py),
                pyc_tags=frozenset((_pyc_tag(path_py),)),
            )
            for path_py in py_paths
        ]

        for x in todo:
            x.link_request = de.DedupLinkRequest(
                hash_function=hf,
                link_path=x.path_pyc,
                file_metadata=de.DedupFileMetadata.make_plain(),
                file_contents_hash=None,
                open_file_once=None,
                tags=x.pyc_tags,
            )
            x.path_pyc.parent.mkdir(parents=True, exist_ok=True)

        try:
            self.dedup.run_batch([x.link_request for x in todo])
        except de.BatchError as exc:
            # Missing content is expected (the pyc simply has to be generated); anything else
            # is a real failure.
            for r in exc.requests:
                if not isinstance(r.exc, de.MissingContentError):
                    raise r.exc

        todo = [x for x in todo if not x.link_request.success]

        if not todo:
            # early exit if there are no files to compile
            return

        # Now we compile the pyc files.
        pg(self.output_real, [x.path_py for x in todo])

        # Some pyc files may fail to appear (syntax errors for example) so we must skip them.
        self.dedup.adopt_files(
            hf,
            (
                de.AdoptRequest(x.path_pyc, tags=x.pyc_tags)
                for x in todo
                if x.path_pyc.exists() and x.path_pyc.stat().st_size >= 256
            ),
        )

    def _find_python(self):
        """
        Return the Python executable located directly inside :attr:`output_real`.

        :raises ValueError: if neither ``python.exe`` nor ``python`` is present (names are
            compared case-insensitively).
        """
        files = {p.name.lower(): p for p in self.output_real.iterdir()}
        for name in ("python.exe", "python"):
            if path := files.get(name):
                return path
        raise ValueError("could not find python executable")

    def _map_path(self, path: PurePosixPath):
        """Drop the leading ``literal``/``template`` component and anchor at the output dir."""
        return self.output_real / Path(*path.parts[1:])

    def _dedup_file_metadata(self, x: ImageFileMetadata):
        return de.DedupFileMetadata(executable=x.executable)

    def _process_template_file(self, f: ty.BinaryIO, f_w: ty.BinaryIO):
        """Decode the command sequence in *f*, writing the expanded result to *f_w*."""
        reader = CommandSequenceReader(f)
        while item := reader.read(65536):
            if isinstance(item, bytes):
                f_w.write(item)
            else:
                f_w.write(self._handle_command(item))

    def provide_files(self, inputs: ty.Iterable[VenvExportInput]):
        """
        Provide the files and their contents.
        """

        counter = 0
        later: list[tuple[Path, VenvExportInput]] = []
        with self.dedup.temporary_directory() as tmp:
            batch = []

            for x in inputs:
                digest = x.info.files[0].digest
                d = {f.metadata for f in x.info.files}
                kw = dict(
                    hash_function=digest.function,
                    file_contents_hash=digest,
                    open_file_once=x.contents_open,
                    file_not_needed=x.contents_reject,
                )

                # HACK: We simply copy files smaller than 256 bytes. This is to avoid bumping into
                # the limit on the number of hardlinks for a single file, which is 1023 on Windows.
                # This limit would be exceeded by the numerous zero-length "__init__.py" files.
                if (
                    len(d) > 1
                    or any(f.path.parts[0] != "literal" for f in x.info.files)
                    or x.info.size < 256
                ):
                    # slow path - we need to write the file to a temporary location first
                    batch.append(
                        de.DedupLinkRequest(
                            link_path=(tmp_file := tmp / f"c{counter}.bin"),
                            file_metadata=de.DedupFileMetadata.make_plain(),
                            **kw,
                        )
                    )
                    counter += 1
                    later.append((tmp_file, x))
                else:
                    # all files are literal
                    for f in x.info.files:
                        (dst := self._map_path(f.path)).parent.mkdir(exist_ok=True, parents=True)
                        batch.append(
                            de.DedupLinkRequest(
                                link_path=dst,
                                file_metadata=self._dedup_file_metadata(f.metadata),
                                **kw,
                            )
                        )
                        # Only the first request for this content gets to open the source.
                        kw["open_file_once"] = None

            self.dedup.run_batch(batch)

            # Copy or expand the temporary files into every destination that needs them.
            for tmp_file, x in later:
                for f in x.info.files:
                    (dst := self._map_path(f.path)).parent.mkdir(exist_ok=True, parents=True)
                    with tmp_file.open("rb") as f_r, dst.open("wb") as f_w:
                        if f.path.parts[0] == "literal":
                            shutil.copyfileobj(f_r, f_w)
                        else:
                            self._process_template_file(f_r, f_w)
                    self.dedup.apply_metadata_to_file(dst, self._dedup_file_metadata(f.metadata))
809
+
810
+
811
class VenvExportInput(abc.ABC):
    """
    Abstract source of the contents for one or more exported files.
    """

    # Describes the files that share these contents.
    info: SolidArchiveFileInfo

    @abc.abstractmethod
    def contents_open(self) -> ty.BinaryIO:
        """Return a binary stream with the contents."""

    @abc.abstractmethod
    def contents_reject(self) -> None:
        """Signal that the contents are not needed."""
819
+
820
+
821
@attr.s(eq=False, hash=False)
class SolidArchiveFileInfo:
    """
    Location of one blob of contents inside a solid archive, and the files that use it.
    """

    # Image files sharing these contents.
    files: list[SingleFileImageMetadata] = attr.ib()
    # Position of the contents within the archive stream.
    offset: int = attr.ib()
    # Length of the contents in bytes.
    size: int = attr.ib()
826
+
827
+
828
@attr.s(eq=False, hash=False)
class VenvExportInputFromSolidArchive(VenvExportInput):
    """
    :class:`VenvExportInput` reading its contents from a forward-only solid archive stream.
    """

    # Stream positioned somewhere at or before the member's offset.
    archive_io: ty.BinaryIO = attr.ib()
    info: SolidArchiveFileInfo = attr.ib()

    def _skip_to(self, offset: int):
        """Advance :attr:`archive_io` to *offset* by reading and discarding data."""
        stream = self.archive_io
        chunk_size = 2**17
        remaining = offset - stream.tell()
        assert_(remaining >= 0, "solid archive members must be read in the order they appear in")
        while remaining:
            got = len(stream.read(min(remaining, chunk_size)))
            remaining -= got
            assert_(got, "attempted to skip past the end of the solid archive")

    def contents_open(self):
        """Return a stream limited to this member's bytes."""
        self._skip_to(self.info.offset)
        # NOTE: we don't check the hash here because the dedup code does it
        return LimitIO(self.archive_io, self.info.size)

    def contents_reject(self):
        # With some luck, we might not need to decompress this archive at all if we don't
        # need any of the contents.
        pass
850
+
851
+
852
@attr.s(eq=False, hash=False)
class LimitIO(io.RawIOBase):
    """
    Read-only, non-seekable raw stream exposing at most a fixed number of bytes of another
    stream.
    """

    # Underlying stream; must support ``readinto``.
    _raw: ty.BinaryIO = attr.ib()
    # Number of bytes that may still be read from ``_raw``.
    _bytes_left: int = attr.ib()

    def readable(self):
        # RawIOBase reports False by default, which makes wrappers that check readable()
        # refuse this stream even though readinto() is implemented.
        return True

    def writable(self):
        return False

    def seekable(self):
        return False

    def readinto(self, b):
        """
        Read into *b* without going past the limit.

        :return: The number of bytes read (``0`` once the limit is reached), or ``None`` if the
            underlying stream returned ``None`` (no data available right now).
        """
        if len(b) > (n := self._bytes_left):
            if n == 0:
                return 0
            # Shrink the target buffer so that the underlying read cannot exceed the limit.
            b = memoryview(b)[:n]
        read_count = self._raw.readinto(b)
        if read_count is None:
            # Nothing was consumed, so the remaining byte count stays unchanged.
            return None
        self._bytes_left -= read_count
        return read_count