vocker-0.1.0-py3-none-any.whl
- vocker/__init__.py +0 -0
- vocker/__main__.py +3 -0
- vocker/cli.py +384 -0
- vocker/dedup.py +1676 -0
- vocker/dedup_models.py +174 -0
- vocker/image.py +870 -0
- vocker/integer_to_path.py +51 -0
- vocker/multihash.py +302 -0
- vocker/py.typed +0 -0
- vocker/repo/__init__.py +0 -0
- vocker/repo/compression.py +239 -0
- vocker/repo/io.py +711 -0
- vocker/system.py +681 -0
- vocker/util.py +120 -0
- vocker/util_models.py +13 -0
- vocker-0.1.0.dist-info/METADATA +56 -0
- vocker-0.1.0.dist-info/RECORD +19 -0
- vocker-0.1.0.dist-info/WHEEL +5 -0
- vocker-0.1.0.dist-info/top_level.txt +1 -0
vocker/system.py
ADDED
@@ -0,0 +1,681 @@
from __future__ import annotations

import cbor2
from collections import defaultdict
from collections.abc import MutableMapping
import contextlib
import enum
from functools import cached_property
import io
import json
import os
from pathlib import Path, PurePath, PurePosixPath
import re
import shutil
import typing as ty

import atomicwrites
import attr
import platformdirs
import strictyaml as sy
import structlog

from . import dedup as de, multihash as mh, image as im
from .repo import io as rio, compression as cx
from .util import PurePathBase
from .integer_to_path import IntegerToPath


logger = structlog.get_logger(__name__)


def tqdm():
    import tqdm

    return tqdm


def validate_local_repo_name(name: str) -> None:
    if not re.search(r"^(\w|-)*$", name):
        raise ValueError(f"invalid repo name: {name!r}")


def cget(x, *args):
    return x.value.get(*args)


@attr.s(eq=False, hash=False)
class RemoteRepository:
    uri: str = attr.ib()

    def as_dict(self):
        return attr.asdict(self, recurse=False)


@attr.s(auto_exc=True, hash=False, str=True)
class LocalRepositoryExistsError(ValueError):
    message: str = attr.ib(default="local repository already exists")
    repo_path: Path = attr.ib(default=None)


@attr.s(auto_exc=True, hash=False, str=True)
class LocalRepositoryInvalidError(ValueError):
    message: str = attr.ib(default="local repository does not exist or is corrupted")
    repo_path: Path = attr.ib(default=None)


@attr.s(eq=False, hash=False)
class _Remotes(MutableMapping[str, RemoteRepository]):
    system: System = attr.ib()

    @property
    def _data(self):
        return self.system._config["remote_repositories"]

    def __getitem__(self, k):
        d = self._data[k].data
        return RemoteRepository(**d)

    def __setitem__(self, k, v: RemoteRepository | None):
        if v is None:
            try:
                del self._data[k]
            except KeyError:
                pass
        else:
            self._data[k] = v.as_dict()
        self.system._config_write()

    def __delitem__(self, k):
        self[k] = None

    def __iter__(self):
        return iter(x.value for x in self._data.value)

    def __len__(self):
        return len(self._data.value)


config_schema = sy.EmptyDict() | sy.MapCombined(
    {
        sy.Optional("remote_repositories"): (
            sy.EmptyDict() | sy.MapPattern(sy.Str(), sy.Map({"uri": sy.Str()}))
        ),
    },
    sy.Str(),
    sy.Any(),
)


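# Example, not part of this file: a config.yaml that config_schema above accepts
# (either an empty file or a mapping; extra top-level keys pass through the
# Str()/Any() combined pattern). The remote name and URI below are made up.
#
#     example = "remote_repositories:\n  origin:\n    uri: file:///srv/vocker-remote\n"
#     doc = sy.load(example, config_schema)
#     assert doc["remote_repositories"]["origin"]["uri"].data == "file:///srv/vocker-remote"
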
class ImageType(enum.Enum):
    PYENV_V1 = "pyenv1"


@attr.s(eq=False, hash=False)
class StrictYamlFileWithCaching:
    path: Path = attr.ib()
    schema = attr.ib(default=None)
    _mtime = None
    _document = None

    @property
    def document(self):
        if (mtime := (p := self.path).stat().st_mtime_ns) != self._mtime:
            self._document = doc = sy.load(
                p.read_bytes().decode("utf-8"), schema=self.schema, label=str(self.path)
            )
            self._mtime = mtime
        else:
            doc = self._document
        return doc

    @document.setter
    def document(self, new_value):
        with atomicwrites.atomic_write(
            str(self.path), mode="wt", overwrite=True, encoding="utf-8", newline="\n"
        ) as fp:
            fp.write(new_value.as_yaml())
        self._document = new_value


@attr.s(eq=False, hash=False)
class UpdatingLocalRepository:
    parent: LocalRepository = attr.ib()
    workspace_path: Path = attr.ib()
    hash_function = attr.ib()
    updated_paths: set[PurePath] = attr.ib(init=False, factory=set)

    def get_path_for_open(self, path: PurePath | str, mode: str):
        path = PurePath(path)
        if mode == "rb":
            if path in self.updated_paths:
                return self.workspace_path / path
            else:
                return self.parent._path_base / path
        elif mode == "wb":
            self.updated_paths.add(path)
            (p := self.workspace_path / path).parent.mkdir(exist_ok=True, parents=True)
            return p
        else:
            raise ValueError(f"mode={mode!r}")

    def open(self, path: PurePath | str, mode: str):
        return self.get_path_for_open(path, mode).open(mode)

    @contextlib.contextmanager
    def open_for_write_multi_compressed(self, path: PurePath | str):
        path = PurePath(path)

        def p(suffix):
            return path.with_name(path.name + suffix)

        # compress contents directly to zstandard
        with self.open(p(".zst"), "wb") as f1, cx.open_compressor(f1, "zst") as f:
            yield f

        # compress to xz as well
        with self.open(p(".zst"), "rb") as fr1, cx.open_decompressor(fr1, "zst") as fr, self.open(
            p(".xz"), "wb"
        ) as fw1, cx.open_compressor(fw1, "xz") as fw:
            shutil.copyfileobj(fr, fw)

    def iterdir(self, path: PurePath):
        raise NotImplementedError("not needed yet")

    def unlink(self, path: PurePath):
        self.updated_paths.add(path)
        (self.workspace_path / path).unlink(missing_ok=True)

    def id_to_path(self, name: str, value: int):
        return PurePath(name) / self.parent.integer_to_path(value)

    def allocate_id(self, name: str) -> int:
        with self.open("counters.json", "rb") as f:
            counters: dict[str, int] = json.loads(f.read())
        counters[name] = counter = counters.get(name, 0) + 1
        with self.open("counters.json", "wb") as f:
            f.write(json.dumps(counters, sort_keys=True, separators=(",", ":")).encode("utf-8"))
        return counter

    def add_image_to_index(self, digest: mh.Digest, image_id: int | None):
        self._add_id_to_hash32_index("image-by-hash-32", digest, image_id)

    def add_shard_to_index(self, digest: mh.Digest, shard_id: int | None):
        self._add_id_to_hash32_index("shard-by-hash-32", digest, shard_id)

    def _add_id_to_hash32_index(self, dir_name: str, digest: mh.Digest, object_id: int | None):
        path = dir_name / rio.RepoTransfer.make_hash_32_path(digest)

        try:
            with self.open(path, "rb") as f:
                d = rio.cbor_load(f, 1024 * 1024)
        except FileNotFoundError:
            d = {}

        if object_id is None:
            d.pop(digest.digest, None)
        else:
            d[digest.digest] = object_id

        with self.open(path, "wb") as f:
            rio.cbor_dump(d, f)


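# Example, not part of this file: allocate_id above is a plain read-modify-write
# of counters.json, and id_to_path turns the returned counter into a
# repository-relative path via IntegerToPath. A standalone sketch of the counter
# step (the starting file contents are made up):
#
#     import json
#     counters = json.loads('{"archive":2}')   # current counters.json
#     name = "archive"
#     counters[name] = counter = counters.get(name, 0) + 1
#     assert counter == 3
#     assert json.dumps(counters, sort_keys=True, separators=(",", ":")) == '{"archive":3}'
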
@attr.s
class LocalRepository:
    system: System = attr.ib()
    path: Path = attr.ib()
    integer_to_path: IntegerToPath = attr.ib(factory=lambda: IntegerToPath(file_suffix="_d"))

    @property
    def name(self):
        return self.path.name

    @property
    def _path_ok(self):
        return self.path / "ok"

    @property
    def _path_base(self):
        return self.path / "b"

    def init_new(self, hash_function: mh.HashFunction):
        self.delete()
        with self.updating(init_hash_function=hash_function) as u:
            with u.open("version.txt", "wb") as f:
                f.write(b"1")
            with u.open("counters.json", "wb") as f:
                f.write(b"{}")

    def get_hash_function(self):
        return self.manifest_read_toplevel_hash_only(self._path_base / "manifest.bin").function

    @contextlib.contextmanager
    def updating(self, *, init_hash_function=None):
        with self.system.repo_dedup.temporary_directory() as tmp_path:
            if init_hash_function is None:
                hf = self.get_hash_function()
            else:
                hf = init_hash_function
            u = UpdatingLocalRepository(parent=self, workspace_path=tmp_path, hash_function=hf)
            yield u

            # Adopt the files.
            base = self._path_base
            reqs_adopt = []
            reqs_copy = []
            for p_rel in u.updated_paths:
                p = u.workspace_path / p_rel
                p_rel_str = "/".join(p_rel.parts)
                if p.exists():
                    if not p.is_file():
                        raise ValueError("only regular files are supported")
                    reqs_adopt.append(adopt_req := de.AdoptRequest(p))
                    reqs_copy.append(de.DedupCopyLinkRequest(src=p, dst=(dst := base / p_rel)))
                    dst.parent.mkdir(exist_ok=True, parents=True)

            dedup = self.system.repo_dedup
            dedup.adopt_files(hf, reqs_adopt)

            # Now we gather the hashes for all the files so we can update the manifest nodes. We
            # need to do this after the `adopt_files` above because that operation computes the
            # hashes and stores them in the dedup db.
            dirs: dict[int, dict[PurePosixPath, dict[str, tuple[bool, mh.Digest] | None]]] = (
                defaultdict(lambda: defaultdict(dict))
            )
            for p_rel in u.updated_paths:
                if (p := u.workspace_path / p_rel).exists():
                    r = dedup.get_file_hash(hf, u.workspace_path / p_rel, check_link=True)
                    assert r is not None
                    value = False, r[1]
                else:
                    value = None
                dirs[len(p_rel.parts) - 1][p_rel.parent][p_rel.name] = value

            # Here begins the critical section. If this part fails, the local repository will be broken.
            self._path_ok.unlink(missing_ok=True)

            for p_rel in u.updated_paths:
                dedup.delete_tree(base / p_rel)

            # Copy the links from `u.workspace` to `base`.
            dedup.run_batch(reqs_copy)

            # Recursively update the manifest nodes.
            max_depth = max(dirs)
            for i in range(max_depth, -1, -1):
                for dir_path, children in dirs[i].items():
                    mf_path = dir_path / "manifest.bin"
                    if (dst := base / mf_path).exists():
                        node = rio.ManifestNodeReader.from_bytes(dst.read_bytes()).out_verified_data
                    else:
                        node = rio.ManifestNode(hash_function=hf, children={})

                    for child_name, child in children.items():
                        if child is None:
                            node.children.pop(child_name, None)
                        else:
                            node.children[child_name] = child

                    if node.children or i == 0:
                        node_bytes, node_hash = node.to_bytes()
                        kw = dict(open_file_once=lambda: io.BytesIO(node_bytes))
                        req = rio.RepoTransfer.make_manifest_link_request(node_hash, dst, kw)
                        dedup.run_batch([req])
                        del node_bytes, kw, req
                    else:
                        # delete empty manifest
                        dedup.delete_file(dst)
                        node_hash = None

                    if i > 0:
                        parent_value = None if node_hash is None else (True, node_hash)
                        dirs[i - 1][dir_path.parent][dir_path.name] = parent_value

            self._path_ok.write_bytes(b"")
            # Here ends the critical section.

    def delete(self):
        if self.path.exists():
            self._path_ok.unlink(missing_ok=True)
            self.system.repo_dedup.delete_tree(self.path)

    def check(self) -> bool:
        return self._path_ok.exists()

    def raise_if_not_valid(self) -> None:
        if not self.check():
            raise LocalRepositoryInvalidError(repo_path=self.path)

    def ensure_deleted_and_raise_if_exists(self) -> None:
        if self.path.exists():
            if self.check():
                self._raise_exists_error()
            else:
                logger.warning(
                    "deleting corrupted or incomplete repository", data_path=str(self.path)
                )
                self.delete()

    def _raise_exists_error(self):
        raise LocalRepositoryExistsError(repo_path=self.path)

    @staticmethod
    def manifest_read_toplevel_hash_only(p: Path):
        reader = rio.ManifestNodeReader()
        feed = reader.parser.feed
        with p.open("rb") as f:
            while b := f.read(4096):
                feed(b)
                if (h := reader.out_claimed_digest) is not None:
                    return h
            feed(None)
        raise AssertionError


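# Example, not part of this file: how LocalRepository.updating() above is meant
# to be driven (init_new is the in-tree reference). Files written through the
# yielded UpdatingLocalRepository land in a temporary workspace; when the block
# exits they are adopted into the dedup store, linked into the repository, and
# the manifest.bin nodes are rebuilt bottom-up. The repository name and the
# hash-function name are assumptions; System is defined below.
#
#     system = System()
#     system.repo_init_new("scratch", "sha256")
#     repo = system.repo_get("scratch")
#     with repo.updating() as u:
#         with u.open("notes/readme.txt", "wb") as f:
#             f.write(b"hello")
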
@attr.s(eq=False, hash=False, kw_only=True)
class System:
    path_base: Path = attr.ib(default=None)
    path_dedup: Path = attr.ib(default=None)
    path_repo_base: Path = attr.ib(default=None)
    path_repo_dedup: Path = attr.ib(default=None)
    path_repo_local: Path = attr.ib(default=None)
    dedup: de.Dedup = attr.ib(default=None)
    repo_dedup: de.Dedup = attr.ib(default=None)

    def __attrs_post_init__(self):
        self._init()

    def _get_default_dedup_path(self):
        if (p := os.environ.get("VOCKER_BASE", None)) is not None:
            return Path(p)
        return platformdirs.user_data_path("vocker", False)

    def _init(self):
        if self.path_base is None:
            self.path_base = self._get_default_dedup_path()

        if self.path_dedup is None:
            self.path_dedup = self.path_base / "dup"

        if self.path_repo_base is None:
            self.path_repo_base = self.path_base / "repo"

        if self.path_repo_dedup is None:
            self.path_repo_dedup = self.path_repo_base / "dup"

        if self.path_repo_local is None:
            self.path_repo_local = self.path_repo_base / "local"

        # FIXME: support other backends
        if self.dedup is None:
            self.dedup = de.DedupBackendHardlink(self.path_dedup)
            self.dedup.garbage_collect_deleted()

        if self.repo_dedup is None:
            self.repo_dedup = de.DedupBackendHardlink(self.path_repo_dedup)
            self.repo_dedup.garbage_collect_deleted()

        self.path_repo_local.mkdir(exist_ok=True, parents=True)

        config_path = self.path_base / "config.yaml"
        cfg = StrictYamlFileWithCaching(config_path, schema=config_schema)
        try:
            cfg.document
        except FileNotFoundError:
            config_path.parent.mkdir(exist_ok=True, parents=True)
            config_path.write_bytes(b"")
            cfg.document
        self._config_file = cfg
        self._init_config()

    def _init_config(self):
        c = self._config
        modified = False

        if cget(c, k := "remote_repositories") is None:
            c[k] = {}
            modified = True

        if modified:
            self._config_write(c)

    @property
    def _config(self):
        return self._config_file.document

    def _config_write(self, value=None) -> None:
        if value is None:
            value = self._config_file.document
        self._config_file.document = value

    def repo_init_new(self, repo_name: str, hash_function_name: str):
        self.repo_get(repo_name).init_new(mh.registry.name_to_hash[hash_function_name])

    def repo_get(self, repo_name: str):
        validate_local_repo_name(repo_name)
        return LocalRepository(self, self.path_repo_local / repo_name)

    def repo_list(self):
        return [LocalRepository(self, p) for p in self.path_repo_local.iterdir() if p.is_dir()]

    def repo_add_image(
        self,
        repo_name: str,
        image_path: Path,
        image_type: str | None,
        mock_image_path: PurePathBase = None,
    ) -> dict[str, str]:
        """
        Return a mapping with the new image ID (a base64url-encoded multihash).
        """
        # 1. split the image into shards based on the paths
        # 2. send each shard into VenvImporter - if it gets too big, then split it into multiple
        #    shards of a more manageable size
        # 3. check whether there already exists a shard with the exact same contents inside the
        #    repo, in which case just re-use it
        (repo := self.repo_get(repo_name)).raise_if_not_valid()

        if image_type is None:
            # TODO: proper autodetection
            image_type = "pyenv1"

        image_type = ImageType(image_type)

        if mock_image_path is None:
            kw = dict(input=image_path)
        else:
            kw = dict(input=mock_image_path, input_real=image_path)

        hf = repo.get_hash_function()
        importer = im.VenvImporter(**kw)
        d = im.pyenv_split(image_path)

        transfer = self.get_repo_transfer(repo, None)

        shard_ids = []

        with repo.updating() as u:
            all_shard_entries = []

            for key, paths in tqdm().tqdm(d.items()):
                outs = [x for p in paths for x in importer.run(p)]
                make_file_meta = im.VenvImporterToImageMetadata(
                    hash_function=hf, dedup=self.repo_dedup
                )
                archive_digests_and_sizes = {}
                archive_digest_to_output = {}
                shard_entries = []
                for out in outs:
                    with out() as o:
                        entry = make_file_meta(o)
                    shard_entries.append(entry)
                    all_shard_entries.append(entry.rest)
                    archive_digests_and_sizes[entry.rest.digest] = entry.size
                    archive_digest_to_output[entry.rest.digest] = out
                archive_digests = list(archive_digests_and_sizes)
                archive_digests.sort(key=lambda x: x.digest)
                archive_digest_to_index = {k: i for i, k in enumerate(archive_digests)}
                archive_sizes = tuple(archive_digests_and_sizes[k] for k in archive_digests)

                shard_digest = (
                    hf()
                    .update_iter(
                        rio.image_file_entries_for_hashing_iter(
                            hf().digest(), (x.rest for x in shard_entries)
                        )
                    )
                    .digest()
                )

                shard_id = transfer.download_shard(shard_digest)
                if shard_id is not None:
                    shard_ids.append(shard_id)
                    continue

                archive_path = u.id_to_path("archive", (archive_id := u.allocate_id("archive")))

                with u.open_for_write_multi_compressed(
                    archive_path / "s"
                ) as f_s, u.open_for_write_multi_compressed(archive_path / "a") as f_a:
                    writer = rio.ArchiveDataWriter(file_archive=f_a, file_sizes=f_s)
                    for entry_size, entry_digest in zip(archive_sizes, archive_digests):
                        out = archive_digest_to_output[entry_digest]
                        writer.begin_file(size=entry_size, digest=entry_digest)
                        with out() as o:
                            for block in o.contents_iter():
                                writer.write_file_data(block)
                        writer.end_file()
                with u.open(archive_path / "h.bin", "wb") as f:
                    rio.HashesWriter(f).write_all(h for h in archive_digests)

                archive_size = u.get_path_for_open(archive_path / "a.zst", "wb").stat().st_size

                shard_path = u.id_to_path("shard", (shard_id := u.allocate_id("shard")))
                with u.open_for_write_multi_compressed(shard_path / "p") as f:
                    rio.ShardPathsWriter(f).write_all(e.rest for e in shard_entries)

                with u.open(shard_path / "h.bin", "wb") as f:
                    rio.HashesWriter(f).write_all(e.rest.digest for e in shard_entries)

                # allocate shard-to-archive mapping
                s2a_path = u.id_to_path("sa", (s2a_id := u.allocate_id("sa")))

                with u.open(shard_path / "sa.cbor", "wb") as f:
                    rio.cbor_dump(s2a_id, f)

                with u.open_for_write_multi_compressed(s2a_path / "m") as f:
                    rio.MapShardToArchiveWriterTrivial(f).write_all(
                        shard_id=shard_id, archive_id=archive_id, archive_size=archive_size
                    )

                u.add_shard_to_index(shard_digest, shard_id)
                shard_ids.append(shard_id)

            img_path = u.id_to_path("image", (img_id := u.allocate_id("image")))

            # image user data
            with u.open_for_write_multi_compressed(img_path / "u") as f:
                rio.cbor_dump({"image_type": image_type.value}, f)

            with u.open(img_path / "u.zst", "rb") as f1, cx.open_decompressor(f1, "zst") as f:
                hasher = hf()
                while b := f.read(65536):
                    hasher.update(b)
                image_meta_hash = hasher.digest()

            # allocate image-to-shard mapping
            i2s_path = u.id_to_path("is", (i2s_id := u.allocate_id("is")))

            with u.open(img_path / "is.cbor", "wb") as f:
                rio.cbor_dump(i2s_id, f)

            with u.open_for_write_multi_compressed(i2s_path / "m") as f:
                rio.MapImageToShardWriterTrivial(f).write_all(shard_ids=shard_ids, image_id=img_id)

            computed_image_hash = (
                hf()
                .update_iter(
                    rio.image_file_entries_for_hashing_iter(image_meta_hash, all_shard_entries)
                )
                .digest()
            )

            u.add_image_to_index(computed_image_hash, img_id)

        return {"image_id": computed_image_hash.to_multihash_base64url()}

    def export_image(
        self,
        *,
        repo_name: str = None,
        remote_name: str = None,
        image_id: str,
        target: Path,
        mock_use_system_python: bool,
        mock_target: PurePathBase | None = None,
    ):
        """
        Write image contents to *target*.
        """
        assert (repo_name is not None) + (remote_name is not None) == 1

        with contextlib.ExitStack() as ex:
            if repo_name is None:
                tmp_repo_path = ex.enter_context(self.repo_dedup.temporary_directory())
                repo = LocalRepository(self, tmp_repo_path)
            else:
                repo = self.repo_get(repo_name)

            # TODO: support partial local clones

            transfer = self.get_repo_transfer(local_repo=repo, remote_name=remote_name)
            with transfer.open(PurePosixPath("manifest.bin")):
                pass  # ensure the top-level manifest is available
            hf = repo.get_hash_function()
            exporter = im.VenvExporter(
                hash_function=hf,
                dedup=self.dedup,
                output=mock_target,
                output_real=target,
                mock_use_system_python=mock_use_system_python,
            )
            transfer.export(exporter, transfer.download_image(image_id), max_workers=3)

    def repo_upload(self, repo_name: str, remote_name: str, force: str | None = None) -> None:
        """
        Upload the local repository ``repo_name`` to ``remote_name``. If the remote manifest hash
        has changed, raise an exception unless ``force`` is not None; in that case ``force`` must
        contain the current remote manifest hash.
        """
        (src := self.repo_get(repo_name)).raise_if_not_valid()
        self.get_repo_transfer(src, remote_name).upload_full()
        # FIXME: implement 'force' argument

    def get_remote_repo_accessor(self, remote_name: str) -> rio.RemoteRepoAccessor:
        base = Path.from_uri(self.remotes[remote_name].uri)
        return rio.RemoteRepoAccessorFilesystem(base)

    def get_repo_transfer(self, local_repo: LocalRepository, remote_name: str | None):
        return rio.RepoTransfer(
            path_local=local_repo._path_base,
            dedup=self.repo_dedup,
            accessor=None if remote_name is None else self.get_remote_repo_accessor(remote_name),
        )

    def repo_download(self, remote_name: str, repo_name: str) -> None:
        """
        Download ``remote_name`` to ``repo_name``.
        """
        (dst := self.repo_get(repo_name)).ensure_deleted_and_raise_if_exists()
        transfer = self.get_repo_transfer(dst, remote_name)
        transfer.download_full()

    def repo_copy(self, src: str, dst: str) -> None:
        (src := self.repo_get(src)).raise_if_not_valid()
        (dst := self.repo_get(dst)).ensure_deleted_and_raise_if_exists()
        self.dedup.copy_tree(src, dst)

    @cached_property
    def remotes(self):
        return _Remotes(self)
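Taken together, System exposes the full local workflow: create a repository, import a Python environment as an image, register a remote, and push, pull, or export. A rough end-to-end sketch under assumed names (the repository names, remote URI, hash-function name, and filesystem paths are made up; image_id comes from repo_add_image's return value):

    system = System()
    system.repo_init_new("work", "sha256")
    result = system.repo_add_image("work", Path("/tmp/my-venv"), image_type=None)
    system.remotes["origin"] = RemoteRepository(uri="file:///srv/vocker-remote")
    system.repo_upload("work", "origin")
    system.repo_download("origin", "mirror")
    system.export_image(
        repo_name="mirror",
        image_id=result["image_id"],
        target=Path("/tmp/restored-venv"),
        mock_use_system_python=False,
    )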