vocker 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vocker/__init__.py +0 -0
- vocker/__main__.py +3 -0
- vocker/cli.py +384 -0
- vocker/dedup.py +1676 -0
- vocker/dedup_models.py +174 -0
- vocker/image.py +870 -0
- vocker/integer_to_path.py +51 -0
- vocker/multihash.py +302 -0
- vocker/py.typed +0 -0
- vocker/repo/__init__.py +0 -0
- vocker/repo/compression.py +239 -0
- vocker/repo/io.py +711 -0
- vocker/system.py +681 -0
- vocker/util.py +120 -0
- vocker/util_models.py +13 -0
- vocker-0.1.0.dist-info/METADATA +56 -0
- vocker-0.1.0.dist-info/RECORD +19 -0
- vocker-0.1.0.dist-info/WHEEL +5 -0
- vocker-0.1.0.dist-info/top_level.txt +1 -0
vocker/image.py
ADDED
|
@@ -0,0 +1,870 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import abc
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
import contextlib
|
|
6
|
+
import enum
|
|
7
|
+
import functools
|
|
8
|
+
import io
|
|
9
|
+
import os
|
|
10
|
+
import re
|
|
11
|
+
import shutil
|
|
12
|
+
import subprocess as sbp
|
|
13
|
+
import tempfile
|
|
14
|
+
import typing as ty
|
|
15
|
+
import zipfile
|
|
16
|
+
from pathlib import Path, PurePosixPath
|
|
17
|
+
import struct
|
|
18
|
+
import sys
|
|
19
|
+
import itertools
|
|
20
|
+
|
|
21
|
+
import attr
|
|
22
|
+
from cached_property import cached_property
|
|
23
|
+
from sansio_tools.queue import BytesQueue
|
|
24
|
+
|
|
25
|
+
from . import dedup as de, multihash as mh
|
|
26
|
+
from .util import assert_, pathwalk
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_start_of_zipfile(zfile) -> int | None:
    """
    Return the byte offset at which the zip archive inside *zfile* begins.

    *zfile* is anything :class:`zipfile.ZipFile` accepts (a path or a seekable
    binary file object). Returns ``None`` when *zfile* is not a valid zip file.
    """
    try:
        with zipfile.ZipFile(zfile) as z:
            # header_offset is the number of bytes from essentially the beginning of the file
            # to a local file header. The smaller the offset, the earlier in the file the
            # local header is.
            local_offsets = (info.header_offset for info in z.infolist())

            # start_dir is a pretty internal attribute of the ZipFile object. It initially
            # starts at 0 (beginning of file) but the first local file header may not be
            # there, as is the case with exe files, where there's a zip file at the end of
            # the executable. The minimum of start_dir and the smallest local offset is
            # taken as the location of the start of the zip file. Including start_dir also
            # keeps min() from failing on an archive with no members.
            return min(itertools.chain([z.start_dir], local_offsets))
    except zipfile.BadZipFile:  # "BadZipfile" is only a deprecated alias of this class
        return None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Command(enum.Enum):
    """
    Single-byte opcodes of the command sequence encoding.

    Each member's value is the one-byte marker that appears in the encoded stream.
    """

    # Followed by an 8-byte length and that many bytes of literal data.
    LITERAL = b"\x80"

    # Placeholder for the virtualenv base path.
    VENV_BASE_PATH = b"\x81"

    # Placeholder for the virtualenv name.
    VENV_NAME = b"\x82"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@attr.s(eq=False, hash=False)
class CommandSequenceWriter:
    """
    Command sequence encoding. There are currently three types of commands:

    - ``0x80 [8 byte "length"] [...length bytes...]``

      This means copy "length" bytes to the output stream. The length is a
      big-endian unsigned 64-bit integer.

    - ``0x81``

      This means insert the virtualenv base path here (:attr:`Command.VENV_BASE_PATH`).

    - ``0x82``

      This means insert the virtualenv name here (:attr:`Command.VENV_NAME`).

    Data fed to the writer is buffered into literals of at most
    :attr:`max_literal_length` bytes; encoded bytes accumulate in :attr:`output`.
    """

    # Upper bound on the payload size of a single LITERAL command.
    max_literal_length: int = attr.ib(default=10 * 1024 * 1024)
    # Queue of fully-encoded bytes ready to be consumed by the caller.
    output: BytesQueue = attr.ib(init=False, factory=BytesQueue)
    # Bytes of the literal currently being accumulated (not yet framed).
    _current_literal: BytesQueue = attr.ib(init=False, factory=BytesQueue, repr=False)

    def _end_literal(self):
        """Frame the pending literal (if non-empty) and move it to :attr:`output`."""
        if n := len(lit := self._current_literal):
            out = self.output
            out.append(Command.LITERAL.value)
            out.append(struct.pack(">Q", n))
            lit.popleft_all_to(out)
            lit.clear()

    def feed(self, b: bytes | memoryview | Command):
        """Feed either a chunk of literal data or a :class:`Command`."""
        if isinstance(b, Command):
            self.feed_command(b)
        else:
            self.feed_data(b)

    def feed_data(self, b: bytes | memoryview):
        """Append literal data, splitting it so no literal exceeds the maximum length."""
        lit = self._current_literal
        n_max = self.max_literal_length
        while len(b):
            # Take only as much as still fits into the current literal.
            allowed = n_max - len(lit)
            current_b = b[:allowed]
            lit.append(current_b)
            if len(lit) == n_max:
                self._end_literal()
            b = b[allowed:]

    def feed_command(self, c):
        """Flush the pending literal, then emit the one-byte command *c*."""
        self._end_literal()
        self.output.append(c.value)

    def close(self):
        """Flush any pending literal; call once after the last :meth:`feed`."""
        self._end_literal()

    def generator_pipe(
        self, gen: ty.Iterable[bytes | memoryview | Command]
    ) -> ty.Iterator[bytes | memoryview]:
        """
        Encode every item of *gen* and yield the encoded chunks as they become available.

        The writer is closed after *gen* is exhausted, so the trailing literal is
        emitted as well.
        """
        out = self.output
        for item in gen:
            self.feed(item)
            while out:
                yield out.popleft_any()
        self.close()
        while out:
            yield out.popleft_any()
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@attr.s(eq=False, hash=False)
class CommandSequenceReader:
    """
    Decoder for the stream produced by :class:`CommandSequenceWriter`.

    Reads from the binary file object :attr:`f` and hands back either chunks of
    literal data or :class:`Command` placeholders.
    """

    # Binary file object positioned at the start of a command sequence.
    f: ty.IO = attr.ib()
    # Number of bytes still unread in the literal currently being consumed.
    # Plain class attribute (not an attrs field) acting as the per-instance default.
    literal_length_left = 0

    def _read_from_literal(self, n: int):
        """Return up to *n* bytes of the current literal, or ``None`` if none is pending."""
        if left := self.literal_length_left:
            output = self.f.read(min(n, left))
            self.literal_length_left -= len(output)
            return output
        return None

    def read(self, n: int) -> bytes | Command:
        """
        Read at most *n* literal bytes OR a command. EOF when an empty bytes is returned.

        Raises :class:`ValueError` if an unknown command byte is encountered and
        :class:`struct.error` if a literal's length field is truncated.
        """
        while True:
            if r := self._read_from_literal(n):
                return r

            if not (c := self.f.read(1)):
                return b""  # EOF

            c = Command(c)
            if c != Command.LITERAL:
                return c

            [self.literal_length_left] = struct.unpack(">Q", self.f.read(8))
            # Loop instead of returning directly: a zero-length literal carries no data,
            # and returning ``None`` for it would look like EOF to ``while item := read()``
            # callers and silently truncate the output.
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _Path(p: Path | str) -> Path:
    """
    Coerce *p* into an absolute path.

    Objects that already expose ``is_absolute`` are used as they are; anything else
    is wrapped in :class:`Path`. Relative paths are anchored at the current working
    directory.
    """
    candidate = p if hasattr(p, "is_absolute") else Path(p)
    if candidate.is_absolute():
        return candidate
    return Path.cwd() / candidate
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def pyenv_split(root_path: Path):
    """
    Group the files below *root_path* into buckets.

    Returns a ``defaultdict(list)`` mapping ``"pure:<key>"`` / ``"bin:<key>"`` to the
    list of file paths in that bucket. ``<key>`` is the "/"-joined path (relative to
    *root_path*, ``"."`` for the root itself) of the directory that owns the file:
    directories at most two levels deep get their own key, and every direct child of
    a ``site-packages`` / ``dist-packages`` directory gets its own key as well.
    ``.pyc`` files are skipped; files with a binary-looking extension go to ``bin:``.
    """
    # Group 1 matches byte-code files, group 2 matches binary artifacts.
    binary_or_pyc = re.compile(r"(\.pyc$)|(\.(?:pkl|pickle|dll|pyd|lib|exe|lib|dylib|so(\.\d+)*)$)")
    package_roots = {"site-packages", "dist-packages"}
    groups = defaultdict(list)

    def key_of(path: Path):
        return "/".join(path.relative_to(root_path).parts) or "."

    @functools.cache
    def prefixed(*pieces):
        # Cached so that identical bucket names share a single string object.
        return "".join(pieces)

    def visit(path: Path, key: str):
        if path.is_dir():
            # Shallow directories start a bucket of their own.
            if len(path.relative_to(root_path).parts) <= 2:
                key = key_of(path)

            for child in path.iterdir():
                if path.name in package_roots:
                    # Every installed package gets a separate bucket.
                    visit(child, key_of(child))
                else:
                    visit(child, key)
            return

        found = binary_or_pyc.search(path.name.lower())
        if found is None:
            prefix = "pure:"
        elif found.lastindex == 1:
            # it's a pyc file, ignore it
            return
        elif found.lastindex == 2:
            prefix = "bin:"
        groups[prefixed(prefix, key)].append(path)

    visit(root_path, "")
    return groups
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class VenvImporterReceiver:
    """Interface for objects that receive the files produced by a venv import."""

    @abc.abstractmethod
    def call(
        self,
        input_path: Path,
        output_path: PurePosixPath,
        contents: Path | ty.Generator[bytes | memoryview],
        executable: bool,
        template_mode: bool,
    ):
        """
        Receive one imported file.

        *contents* is either a path to read the data from or a generator of data
        chunks. This base implementation does nothing.
        """
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def file_block_iter(path: Path, block_size: int = 65536):
    """
    Yield the contents of the file at *path* in chunks of at most *block_size* bytes.

    The file is opened in binary mode and closed when the generator finishes.
    """
    with path.open("rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF).
        yield from iter(functools.partial(stream.read, block_size), b"")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@attr.s(frozen=True)
class ImageFileMetadata:
    """Immutable per-file metadata stored alongside a file in an image."""

    # Whether the file is marked as executable.
    executable: bool = attr.ib()
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
@attr.s
class SingleFileImageMetadata:
    """Path, metadata and content digest of one file inside an image."""

    # Path of the file inside the image.
    path: PurePosixPath = attr.ib()
    # Metadata flags of the file.
    metadata: ImageFileMetadata = attr.ib()
    # Digest of the file contents.
    digest: mh.Digest = attr.ib()

    @classmethod
    def from_shard_entry(cls, data: tuple[str, str], digest: mh.Digest):
        """
        Build an instance from a ``(path, flags)`` pair as produced by :meth:`to_shard_entry`.

        *flags* is ``""`` for a plain file and ``"x"`` for an executable one; any other
        value raises :class:`ValueError`.
        """
        p, m = data
        path = PurePosixPath(p)
        if m == "":
            metadata = ImageFileMetadata(executable=False)
        elif m == "x":
            metadata = ImageFileMetadata(executable=True)
        else:
            raise ValueError(f"value: {m!r}")
        return cls(path=path, metadata=metadata, digest=digest)

    def to_shard_entry(self):
        """Return ``[path, flags]``, the inverse of :meth:`from_shard_entry` (minus the digest)."""
        return [str(self.path), "x" if self.metadata.executable else ""]

    def to_data_for_image_hash(self):
        """Return ``[digest_bytes, path, flags]``."""
        return [self.digest.to_multihash_bytes()] + self.to_shard_entry()

    def to_image_hash_sort_key(self):
        """Return a sort key ordering entries by UTF-8 path length, then by the path bytes."""
        s = str(self.path).encode("utf-8")
        return len(s), s
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@attr.s
class VenvImporterFileOutput:
    """Result record for one imported file: its size plus its image metadata."""

    # Size of the file contents in bytes.
    size: int = attr.ib()
    # Path, metadata and digest of the file.
    rest: SingleFileImageMetadata = attr.ib()
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
@attr.s(eq=False, hash=False)
class VenvImporterToImageMetadata:
    """Callable that turns an :class:`ImporterOutput` into a :class:`VenvImporterFileOutput`."""

    # Hash function used to digest file contents.
    hash_function: mh.HashFunction = attr.ib()
    # Dedup store queried for an already-known hash of on-disk files.
    dedup: de.Dedup = attr.ib()

    def __call__(self, output: ImporterOutput) -> VenvImporterFileOutput:
        """
        Compute size and digest of *output*.

        When the contents are a path, the dedup store is asked for a known hash first;
        otherwise (or if it has none) the contents are streamed through the hasher.
        The resulting metadata is always marked as non-executable.
        """
        digest = None
        source = output.contents
        if isinstance(source, Path):
            known = self.dedup.get_file_hash(self.hash_function, source, check_link=False)
            if known is not None:
                size, digest = known

        if digest is None:
            # No cached hash available: hash the contents chunk by chunk.
            size = 0
            hasher = self.hash_function()
            for chunk in output.contents_iter():
                hasher.update(chunk)
                size += len(chunk)
            digest = hasher.digest()

        file_meta = SingleFileImageMetadata(
            path=output.path, digest=digest, metadata=ImageFileMetadata(executable=False)
        )
        return VenvImporterFileOutput(size=size, rest=file_meta)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
# Placeholder used only in annotations below for "some pure-path-like object".
# NOTE(review): presumably stands in for a private pathlib base class - confirm.
_PurePathBase = object
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
@attr.s(eq=False, hash=False)
class VenvImporter:
    """
    Convert the files of a virtualenv into location-independent importer outputs.

    Make sure the last component of :attr:`input` is a long random string.
    Occurrences of the venv path and of that last component inside activate scripts
    are replaced by placeholder commands, so the name must not appear by accident.
    """

    # Path the venv believes it lives at (this is what appears inside its files).
    input: _PurePathBase | Path = attr.ib(converter=_Path)
    # Path where the venv files can actually be read; defaults to :attr:`input`.
    input_real: Path = attr.ib(default=None)

    def __attrs_post_init__(self):
        self.forbidden_string = self.input.name.encode("utf-8")
        if self.input_real is None:
            self.input_real = self.input

    def _make_output_path(self, p: Path) -> PurePosixPath:
        """Return *p* relative to :attr:`input_real` as a POSIX-style path."""
        return PurePosixPath(*p.relative_to(self.input_real).parts)

    def _handle_exe(self, p: Path):
        """
        Handler for ``.exe`` files in the scripts directory.

        If the file has a zip archive appended and a ``#!`` interpreter line right
        before it, yield ``True`` (template mode) followed by the file contents with
        the interpreter path replaced by ``VENV_BASE_PATH`` + suffix. Otherwise
        yield the path itself, meaning "copy verbatim".

        Raises :class:`AssertionError` if the launcher does not have the expected layout.
        """
        with p.open(mode="rb") as f:
            contents = f.read()
            f.seek(0)
            zip_position = get_start_of_zipfile(zfile=f)

        if zip_position is not None:
            # Look for the shebang in the (up to) 1024 bytes preceding the archive. The start
            # must be clamped at 0: a negative start would be interpreted by rfind() as an
            # offset from the END of the buffer, which would search the wrong region whenever
            # the archive begins within the first 1024 bytes.
            search_start = max(zip_position - 1024, 0)
            if (i := contents.rfind(b"\0#!", search_start, zip_position)) < 0:
                raise AssertionError("failed to find #!")
            i += 3  # actual start of path

            path_data = contents[i:zip_position].rstrip(b"\r\n")
            if path_data.startswith(b'"'):
                if not path_data.endswith(b'"'):
                    raise AssertionError("not matching doublequotes??")
                path_data = path_data[1:-1]

            current_path = type(self.input)(path_data.decode("utf-8"))
            if not current_path.is_relative_to(self.input):
                raise AssertionError(f"path {path_data!r} is not under venv base {self.input!r}")

            rel_path = current_path.relative_to(self.input)

            # now let's make it a suffix: join with a dummy one-character component and
            # strip that character, leaving "<separator><relative path>"
            suffix_path = str("x" / rel_path)[1:]

            yield True  # template_mode
            yield contents[:i]
            yield b'"'
            yield Command.VENV_BASE_PATH
            yield suffix_path.encode("utf-8")
            yield b'"\n\r\n'
            yield contents[zip_position:]
        else:
            yield p  # template_mode

    def _handle_activate_script(self, p: Path):
        """
        Handler for activate scripts.

        Yields the template-mode flag first (``True`` if the venv path or name occurs
        in the text), then the text with every occurrence of the venv path replaced by
        ``VENV_BASE_PATH`` and every occurrence of the venv name by ``VENV_NAME``.
        """
        enc = "utf-8"
        with p.open(mode="rt", encoding=enc) as f:
            text = f.read()

        env_loc = re.escape(str(self.input))
        env_name = re.escape(self.input.name)

        # The full location is tried first so it wins over the bare name it contains.
        rx = re.compile(f"({env_loc})|({env_name})")
        last_output_pos = 0
        first = True
        for m in rx.finditer(text):
            if first:
                yield True  # template_mode
                first = False
            yield text[last_output_pos : m.start()].encode(enc)

            if m.lastindex == 1:  # env_loc
                yield Command.VENV_BASE_PATH
            else:  # env_name
                yield Command.VENV_NAME

            last_output_pos = m.end()

        if first:
            # this means we found no matches
            yield False  # template_mode
        yield text[last_output_pos:].encode(enc)

    def _handle_dist_info_record(self, p: Path):
        """
        Handler for ``*.dist-info/RECORD`` files.

        Yields ``False`` (not a template), then every non-blank line with its hash and
        length columns blanked out; blank lines are passed through unchanged.
        """
        yield False  # template_mode
        enc = "utf-8"
        with p.open(mode="rt", encoding=enc) as f:
            for line in f:
                if line.rstrip():
                    # rsplit so that commas inside the path column are left alone
                    path, _hash, _length = line.rsplit(",", maxsplit=2)
                    yield path.encode(enc)
                    yield b",,\n"
                else:
                    yield line.encode(enc)

    def _handle_simple_copy(self, p: Path):
        """Handler for files copied verbatim: yields the path itself."""
        yield p  # template_mode

    @cached_property
    def _scripts_dir(self):
        return self.input_real / "Scripts"

    @cached_property
    def _excluded(self):
        return {self.input_real / "pyvenv.cfg"}

    def run(self, file_path: Path) -> ty.Iterator[ty.ContextManager[ImporterOutput]]:
        """
        Yield zero or one context-manager factories producing the output for *file_path*.

        Nothing is yielded for excluded files (``pyvenv.cfg``, ``*.pyc`` and
        ``direct_url.json`` inside a ``.dist-info`` directory). Entering the yielded
        context manager gives an :class:`ImporterOutputVenv`, which is discarded on exit.
        """
        p = file_path
        if p in self._excluded:
            return

        if p.suffix.lower() == ".pyc":
            return

        f = None
        if p.parent == self._scripts_dir:
            if p.suffix == ".exe":
                f = self._handle_exe
            elif p.stem.lower() == "activate":
                f = self._handle_activate_script

        if f is None:
            f = self._handle_simple_copy
            if p.parent.name.endswith(".dist-info"):
                if p.name == "RECORD":
                    f = self._handle_dist_info_record
                elif p.name == "direct_url.json":
                    return

        @contextlib.contextmanager
        def make():
            gen = f(p)
            # The first item of every handler is either the template-mode flag or a Path
            # meaning "use this file verbatim".
            template_mode = ty.cast("bool | Path", next(gen))

            if isinstance(template_mode, Path):
                sum(0 for _ in gen)  # exhaust generator
                gen = template_mode
                template_mode = False
            elif template_mode:
                gen = CommandSequenceWriter().generator_pipe(gen)

            path = ("template" if template_mode else "literal") / self._make_output_path(p)

            out = ImporterOutputVenv(
                path=path, template_mode=template_mode, is_executable=False, contents=gen
            )
            try:
                yield out
            finally:
                out.discard()

        yield make
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
class _Cancel(Exception):
    """Private exception thrown into a contents generator to make it stop."""
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
class ImporterOutput:
    """Base class describing one file produced by an importer."""

    path: PurePosixPath
    is_executable: bool
    contents: Path | ty.Generator[bytes | memoryview]

    def contents_iter(self) -> ty.Generator[bytes | memoryview]:
        """Return an iterator over the data chunks, reading from disk if contents is a path."""
        source = self.contents
        if not isinstance(source, Path):
            return source
        return file_block_iter(source)

    @abc.abstractmethod
    def discard(self):
        """
        Free up memory taken up by :attr:`contents`, if any.
        """
        if isinstance(self.contents, Path):
            return
        # Throwing into the generator terminates it; the exception comes straight back
        # out of throw() and is swallowed here.
        with contextlib.suppress(_Cancel):
            self.contents.throw(_Cancel())
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
@attr.s(eq=False, hash=False)
class ImporterOutputVenv(ImporterOutput):
    """
    Concrete :class:`ImporterOutput` produced by the venv importer.

    This object is NOT thread-safe.
    """

    # Output path of the file.
    path: PurePosixPath = attr.ib()
    # Whether the contents are an encoded command sequence rather than the literal file data.
    template_mode: bool = attr.ib()
    # Whether the file is executable.
    is_executable: bool = attr.ib()
    # Either a path to read the data from, or a generator yielding the data chunks.
    contents: Path | ty.Generator[bytes | memoryview] = attr.ib()
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
class VenvFile:
    """Interface for a file that can be materialised into an exported venv."""

    path: PurePosixPath

    @abc.abstractmethod
    def write_to(self, p: Path, will_never_modify: bool) -> None:
        """Write the file contents to *p*. This base implementation does nothing."""

    @abc.abstractmethod
    def open_readonly(self) -> ty.IO:
        """Open the file contents for reading. This base implementation does nothing."""
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
@attr.s(eq=False, hash=False)
class VenvFileForTesting(VenvFile):
    """:class:`VenvFile` backed by a plain file on the local file system."""

    # Path of the file inside the image.
    path: PurePosixPath = attr.ib()
    # Where the file contents actually live on disk.
    local_fs_path: Path = attr.ib()

    def write_to(self, p: Path, will_never_modify: bool) -> None:
        """Copy the backing file to *p*; *will_never_modify* is ignored."""
        source, destination = str(self.local_fs_path), str(p)
        shutil.copyfile(source, destination)

    def open_readonly(self):
        """Open the backing file in binary read mode."""
        return self.local_fs_path.open("rb")

    @classmethod
    def generate_from_path(cls, base: Path):
        """
        Walk *base* and return a dict mapping each file's POSIX-style relative path
        to an instance wrapping that file.
        """
        mapping = {}
        for directory, _subdirs, filenames in pathwalk(base):
            for filename in filenames:
                absolute = directory / filename
                relative = PurePosixPath(*absolute.relative_to(base).parts)
                mapping[relative] = cls(path=relative, local_fs_path=absolute)
        return mapping
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
@attr.s(eq=False, hash=False)
class _PycEntry:
    """Pair of a Python source path and a pyc path."""

    # Path of the ``.py`` source file.
    path_py = attr.ib()
    # Path of the ``.pyc`` file.
    path_pyc = attr.ib()
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def check_process(process: sbp.Popen):
    """
    Raise :class:`subprocess.CalledProcessError` if *process* has exited with a
    non-zero return code. A still-running or successfully finished process passes.
    """
    status = process.poll()
    if status:
        raise sbp.CalledProcessError(status, process.args)
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
@attr.s(eq=False, hash=False)
class PycGenerator:
    """
    Compile Python sources to pyc files by running ``compileall`` in a given interpreter.

    Calling the instance with a base directory and a list of source paths compiles them.
    The pyc naming suffix and magic number of the interpreter are discovered lazily by
    compiling an empty module in a temporary directory.
    """

    # Interpreter used to run ``python -m compileall``.
    python_exe: Path = attr.ib()
    # Value passed to compileall's ``-o`` option.
    optimization_level: int = attr.ib()
    # Value passed to compileall's ``-j`` option.
    max_threads: int = attr.ib(default=0)
    # First four bytes of a pyc file produced by the interpreter; set by _analyze().
    magic: bytes = attr.ib(init=False, default=None)
    # Part of the pyc file name that follows the module name; set by _analyze().
    suffix: str = attr.ib(init=False, default=None)

    def get_data_to_hash(self):
        """Yield the bytes identifying this configuration: ``<optimization_level>,<magic>``."""
        self._ensure_analyze()
        yield str(self.optimization_level).encode("ascii")
        yield b","
        yield self.magic

    def _compileall(self, base: Path, files: ty.Iterable[Path]) -> None:
        """
        Run compileall on *files*, stripping *base* from the recorded source paths.

        Progress is printed every 100 compiled files. The exit status of compileall
        is deliberately not checked.
        """
        # The list is traversed once to build stdin and also passed to len() below, so a
        # one-shot iterable has to be materialised first.
        files = list(files)

        cmd = [str(self.python_exe.absolute())]
        cmd += "-B", "-m", "compileall", "-l", "-s", str(base)
        cmd += "--invalidation-mode", "unchecked-hash"  # disable timestamp and hash checking
        cmd += "-o", str(self.optimization_level)
        cmd += "-j", str(self.max_threads)
        cmd += "-i", "-"  # input from stdin
        cmd_input = b"".join(x for f in files for x in (str(f).encode("utf-8"), b"\n"))

        # Using Popen as a context manager guarantees that the stdout pipe is closed and the
        # child is waited for, even if reading its output raises.
        with sbp.Popen(
            cmd, stdin=sbp.PIPE, stdout=sbp.PIPE, env={"PYTHONUTF8": "1"} | os.environ
        ) as proc:
            # This is not a race condition because compileall does not start until it has
            # received all of its input.
            proc.stdin.write(cmd_input)
            proc.stdin.close()

            i = 0
            for line in proc.stdout:
                # TODO: progress bar
                if line.startswith(b"Compiling "):
                    i += 1
                    if i % 100 == 0:
                        print(f"{i:>5d}/{len(files):<5d}")

            proc.wait()
        # We do not check the return code as some of the source files may have failed to
        # compile. This happens surprisingly often with test cases for syntax errors.

    def _ensure_analyze(self):
        """Run :meth:`_analyze` once, the first time it is needed."""
        if self.magic is None:
            self._analyze()

    def _analyze(self):
        """
        Figure out the pyc file naming scheme and magic.
        """
        with tempfile.TemporaryDirectory() as tmp:
            tmp = Path(tmp)
            basename = "xyzzy"
            (py_path := tmp / f"{basename}.py").write_bytes(b"")
            self._compileall(tmp, (py_path,))
            py_path.unlink()
            # With the source removed, the only remaining file must be the pyc.
            [pyc_path] = (p for p in tmp.rglob("*") if p.is_file())
            pyc_path: Path
            assert_(pyc_path.name.startswith(basename))
            assert_(pyc_path.name.endswith(".pyc"))
            self.suffix = pyc_path.name[len(basename) :]
            with pyc_path.open("rb") as f:
                self.magic = f.read(4)

    def map_path(self, p: Path):
        """
        Map a source file path to the corresponding pyc file path.
        """
        assert_(p.name.endswith(".py"))
        return p.parent / "__pycache__" / (p.name[:-3] + self.suffix)

    def __call__(self, base: Path, paths: ty.Iterable[Path]):
        """Compile *paths* (source files located under *base*)."""
        self._ensure_analyze()
        self._compileall(base, paths)
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
class PycGeneratorMockUseSystemPython(PycGenerator):
    """
    :class:`PycGenerator` variant that always uses the interpreter running this code.
    """

    def _get_python_exe(self):
        return Path(sys.executable)

    def _set_python_exe(self, value):
        # Assignments (including the one made by the generated __init__) are ignored.
        pass

    python_exe = property(_get_python_exe, _set_python_exe)
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
@attr.s(eq=False, hash=False)
class _VenvExporterPycFile:
    """Bookkeeping record for one pyc file the exporter wants to obtain."""

    # Source file to compile.
    path_py: Path = attr.ib()
    # Where the compiled pyc file belongs.
    path_pyc: Path = attr.ib()
    # Dedup tags identifying the expected pyc contents.
    pyc_tags: frozenset[bytes] = attr.ib(default=None)
    # Request used to try to link an existing pyc from the dedup store.
    link_request: de.DedupLinkRequest = attr.ib(default=None)
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
@attr.s(eq=False, hash=False)
class VenvExporter:
    """
    Materialise an image as a virtualenv directory.

    Files whose image path starts with ``literal`` are written as they are; all others
    are treated as command sequences, with their placeholders replaced by the venv base
    path / venv name. After all files are provided, :meth:`end_session` generates the
    pyc files.
    """

    # Hash function used for file and pyc-tag digests.
    hash_function: mh.HashFunction = attr.ib()
    # Dedup store used to link, hash and adopt files.
    dedup: de.Dedup = attr.ib()
    # Path substituted for VENV_BASE_PATH placeholders; defaults to :attr:`output_real`.
    output: _PurePathBase | None = attr.ib()
    # Directory the files are actually written to.
    output_real: Path = attr.ib(converter=_Path)
    # Name substituted for VENV_NAME placeholders; derived from :attr:`output` if omitted.
    venv_name: str = attr.ib(default=None)
    # When true, pyc files are compiled with the interpreter running this code.
    mock_use_system_python: bool = attr.ib(default=False)

    def __attrs_post_init__(self):
        if self.output is None:
            self.output = self.output_real
        if self.venv_name is None:
            # Replace everything outside [0-9a-zA-Z_-] to get a safe name.
            self.venv_name = re.compile("[^0-9a-zA-Z_-]").sub("_", self.output.name)

    def _handle_command(self, c: Command) -> bytes:
        """Return the bytes a placeholder command expands to."""
        if c == Command.VENV_BASE_PATH:
            return str(self.output).encode("utf-8")
        else:  # Can only be Command.VENV_NAME
            return self.venv_name.encode("utf-8")

    def _process_file(self, file: VenvFile):
        """Write a single :class:`VenvFile` to its place inside the output directory."""
        # get rid of the "literal"/"template" at the start, then join the path with the output path
        output = self._map_path(file.path)
        output.parent.mkdir(exist_ok=True, parents=True)
        if file.path.parts[0] == "literal":
            file.write_to(output, will_never_modify=True)
        else:
            with file.open_readonly() as f, output.open("wb") as f_w:
                self._process_template_file(f, f_w)

    def begin_session(self):
        """Hook called before files are provided; currently does nothing."""
        pass

    def end_session(self):
        """Hook called after all files were provided; generates the pyc files."""
        self._generate_pyc(0)

    def _generate_pyc(self, optimization_level: int):
        """
        1. Find source python files that don't have a pyc file.
        2. Query hashes from dedup.
        3. Attempt to link dedup'd pyc files.
        4. Generate pyc files using compileall.
        5. Adopt newly-generated pyc files into the dedup system.
        """
        exe = self._find_python()
        hf = self.hash_function

        if self.mock_use_system_python:
            PycGenerator_ = PycGeneratorMockUseSystemPython
        else:
            PycGenerator_ = PycGenerator

        pg = PycGenerator_(python_exe=exe, optimization_level=optimization_level)
        # The pyc tag depends on the interpreter binary, the optimization level and the
        # pyc magic, combined into one key digest.
        h_exe: mh.Digest = self.dedup.get_or_compute_file_hash(hf, exe, check_link=False)[1]
        h_key = hf().update(h_exe.to_multihash_bytes()).update_iter(pg.get_data_to_hash()).digest()
        h_key_bytes = h_key.to_multihash_bytes()

        def _pyc_tag(py_path: Path) -> bytes:
            h_py: mh.Digest = self.dedup.get_or_compute_file_hash(hf, py_path, check_link=False)[1]
            h_pyc = hf().update(h_key_bytes).update(h_py.to_multihash_bytes()).digest()
            return b"pyc:" + h_pyc.to_multihash_bytes()

        py_paths = self.output_real.rglob("*.py")

        todo = [
            _VenvExporterPycFile(
                path_py=path_py,
                path_pyc=pg.map_path(path_py),
                pyc_tags=frozenset((_pyc_tag(path_py),)),
            )
            for path_py in py_paths
        ]

        for x in todo:
            x.link_request = de.DedupLinkRequest(
                hash_function=hf,
                link_path=x.path_pyc,
                file_metadata=de.DedupFileMetadata.make_plain(),
                file_contents_hash=None,
                open_file_once=None,
                tags=x.pyc_tags,
            )
            x.path_pyc.parent.mkdir(parents=True, exist_ok=True)

        try:
            self.dedup.run_batch([x.link_request for x in todo])
        except de.BatchError as exc:
            # Missing content just means the pyc has to be compiled; anything else is fatal.
            for r in exc.requests:
                if not isinstance(r.exc, de.MissingContentError):
                    raise r.exc

        todo = [x for x in todo if not x.link_request.success]

        if not todo:
            # early exit if there are no files to compile
            return

        # Now we compile the pyc files.
        pg(self.output_real, [x.path_py for x in todo])

        # Some pyc files may fail to appear (syntax errors for example) so we must skip them.
        self.dedup.adopt_files(
            hf,
            (
                de.AdoptRequest(x.path_pyc, tags=x.pyc_tags)
                for x in todo
                if x.path_pyc.exists() and x.path_pyc.stat().st_size >= 256
            ),
        )

    def _find_python(self):
        """
        Return the Python executable located directly inside :attr:`output_real`.

        Raises :class:`ValueError` if neither ``python.exe`` nor ``python`` is found
        (names are compared case-insensitively).
        """
        files = {p.name.lower(): p for p in self.output_real.iterdir()}
        for name in ("python.exe", "python"):
            if path := files.get(name):
                return path
        raise ValueError("could not find python executable")

    def _map_path(self, path: PurePosixPath):
        """Drop the first component of the image path and anchor the rest at the output dir."""
        return self.output_real / Path(*path.parts[1:])

    def _dedup_file_metadata(self, x: ImageFileMetadata):
        """Convert image file metadata to the dedup store's metadata type."""
        return de.DedupFileMetadata(executable=x.executable)

    def _process_template_file(self, f: ty.BinaryIO, f_w: ty.BinaryIO):
        """Decode the command sequence read from *f* and write the expanded data to *f_w*."""
        reader = CommandSequenceReader(f)
        while item := reader.read(65536):
            if isinstance(item, bytes):
                f_w.write(item)
            else:
                f_w.write(self._handle_command(item))

    def provide_files(self, inputs: ty.Iterable[VenvExportInput]):
        """
        Provide the files and their contents.

        Each input describes one blob of content and the image files that share it.
        Content that can be linked directly into place takes the fast path; everything
        else is first linked to a temporary file and then copied or template-expanded.
        """

        counter = 0
        later: list[tuple[Path, VenvExportInput]] = []
        with self.dedup.temporary_directory() as tmp:
            batch = []

            for x in inputs:
                digest = x.info.files[0].digest
                d = {f.metadata for f in x.info.files}
                kw = dict(
                    hash_function=digest.function,
                    file_contents_hash=digest,
                    open_file_once=x.contents_open,
                    file_not_needed=x.contents_reject,
                )

                # HACK: We simply copy files smaller than 256 bytes. This is to avoid bumping into
                # the limit on the number of hardlinks for a single file, which is 1023 on Windows.
                # This limit would be exceeded by the numerous zero-length "__init__.py" files.
                if (
                    len(d) > 1
                    or any(f.path.parts[0] != "literal" for f in x.info.files)
                    or x.info.size < 256
                ):
                    # slow path - we need to write the file to a temporary location first
                    batch.append(
                        de.DedupLinkRequest(
                            link_path=(tmp_file := tmp / f"c{counter}.bin"),
                            file_metadata=de.DedupFileMetadata.make_plain(),
                            **kw,
                        )
                    )
                    counter += 1
                    later.append((tmp_file, x))
                else:
                    # all files are literal
                    for f in x.info.files:
                        (dst := self._map_path(f.path)).parent.mkdir(exist_ok=True, parents=True)
                        batch.append(
                            de.DedupLinkRequest(
                                link_path=dst,
                                file_metadata=self._dedup_file_metadata(f.metadata),
                                **kw,
                            )
                        )
                        # Only the first request for this content may open the source.
                        kw["open_file_once"] = None

            self.dedup.run_batch(batch)

            for tmp_file, x in later:
                for f in x.info.files:
                    (dst := self._map_path(f.path)).parent.mkdir(exist_ok=True, parents=True)
                    with tmp_file.open("rb") as f_r, dst.open("wb") as f_w:
                        if f.path.parts[0] == "literal":
                            shutil.copyfileobj(f_r, f_w)
                        else:
                            self._process_template_file(f_r, f_w)
                    self.dedup.apply_metadata_to_file(dst, self._dedup_file_metadata(f.metadata))
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
class VenvExportInput(abc.ABC):
    """Abstract source of one blob of content handed to the venv exporter."""

    info: SolidArchiveFileInfo

    @abc.abstractmethod
    def contents_open(self) -> ty.BinaryIO:
        """Open the content for reading."""

    @abc.abstractmethod
    def contents_reject(self) -> None:
        """Signal that the content is not going to be read."""
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
@attr.s(eq=False, hash=False)
class SolidArchiveFileInfo:
    """Location of one blob of content inside a solid archive and the files that use it."""

    # Image files sharing this content.
    files: list[SingleFileImageMetadata] = attr.ib()
    # Position of the content within the archive stream.
    offset: int = attr.ib()
    # Length of the content in bytes.
    size: int = attr.ib()
|
|
826
|
+
|
|
827
|
+
|
|
828
|
+
@attr.s(eq=False, hash=False)
class VenvExportInputFromSolidArchive(VenvExportInput):
    """:class:`VenvExportInput` that reads its content from a sequential archive stream."""

    # Stream holding the concatenated contents; only ever read forwards.
    archive_io: ty.BinaryIO = attr.ib()
    # Offset, size and file list of this member.
    info: SolidArchiveFileInfo = attr.ib()

    def _skip_to(self, offset: int):
        """Advance :attr:`archive_io` to *offset* by reading and discarding data."""
        stream = self.archive_io
        remaining = offset - stream.tell()
        assert_(remaining >= 0, "solid archive members must be read in the order they appear in")
        chunk_size = 2**17
        while remaining:
            discarded = len(stream.read(min(remaining, chunk_size)))
            remaining -= discarded
            assert_(discarded, "attempted to skip past the end of the solid archive")

    def contents_open(self):
        """Position the stream at this member and return a reader limited to its size."""
        self._skip_to(self.info.offset)
        # NOTE: we don't check the hash here because the dedup code does it
        return LimitIO(self.archive_io, self.info.size)

    def contents_reject(self):
        """Nothing to do when the content is not needed."""
        # With some luck, we might not need to decompress this archive at all if we don't
        # need any of the contents.
        pass
|
|
850
|
+
|
|
851
|
+
|
|
852
|
+
@attr.s(eq=False, hash=False)
class LimitIO(io.RawIOBase):
    """
    Read-only, non-seekable view of at most ``bytes_left`` bytes of another stream.

    Reading starts at the current position of the underlying stream and reports EOF
    once the byte budget is used up.
    """

    # Underlying stream; must provide ``readinto``.
    _raw: ty.BinaryIO = attr.ib()
    # Number of bytes that may still be read through this object.
    _bytes_left: int = attr.ib()

    def readable(self):
        # RawIOBase defaults to False, which would make wrappers such as
        # io.BufferedReader refuse this stream even though it exists only for reading.
        return True

    def writable(self):
        return False

    def seekable(self):
        return False

    def readinto(self, b):
        """Read into *b*, never handing out more than the remaining byte budget."""
        if len(b) > (n := self._bytes_left):
            if n == 0:
                return 0
            # Shrink the target buffer so the underlying stream cannot over-read.
            b = memoryview(b)[:n]
        read_count = self._raw.readinto(b)
        if read_count is None:
            # A non-blocking raw stream has no data right now; nothing was consumed.
            return None
        self._bytes_left -= read_count
        return read_count
|