vocker 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ # Changelog
2
+
3
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
4
+
5
+ ## [Unreleased]
6
+
7
+ ## [0.1.0] - 2025-12-17
8
+
9
+ ### Added
10
+
11
+ - Initial version.
vocker-0.1.0/DESIGN.md ADDED
@@ -0,0 +1,476 @@
1
+ # vocker
2
+
3
+ ## Why?
4
+
5
+ See README.
6
+
7
+ ## Terminology
8
+
9
+ - image
10
+ - A file tree representing a clean virtualenv.
11
+ - This file tree excludes pyc files. Paths that are embedded inside files have been removed.
12
+ - File metadata is excluded with the exception of the execute bit (which is relevant on UNIX).
13
+ - The file tree is processed using a hash tree.
14
+ - An image is identified by the top-level hash. This hash therefore allows someone to authenticate the entire file tree.
15
+ - container
16
+ - An unpacked image inside a directory, ready to be used as a Python virtualenv. A container is identified by the path to its directory.
17
+ - The same image can be unpacked into multiple containers. It's totally fine to have multiple copies of the same image for testing/development purposes.
18
+ - image tag, container tag
19
+ - A locally-assigned name for an image or for a container. It's just a sort of alias to make it easier to refer to an image or container.
20
+
21
+ ## Goals
22
+
23
+ See README.
24
+
25
+ ## File locations
26
+
27
+ - IMAGES - where downloaded images/archives are stored
28
+ - CONTAINERS - where images are unpacked into directories called containers
29
+
30
+ ## Example command usage
31
+
32
+ - `vocker image download REPO IMAGE-ID...`
33
+ - Download files from `REPO` such that every `IMAGE-ID` is available locally.
34
+
35
+ - `vocker image upload REPO IMAGE-ID...`
36
+ - Upload files to `REPO` such that each `IMAGE-ID` is available remotely to other users.
37
+
38
+ - `vocker repo repack REPO (IMAGE-ID... | all)`
39
+ - Repack the files on `REPO` such that each of `IMAGE-ID` can be downloaded independently without wasting "too much" bandwidth. If "all" is specified, then ensure that every image can be downloaded independently.
40
+
41
+ - `vocker image import --type=TYPE PATH`
42
+ - Create a new image from the file tree at `PATH`. Return image ID of the newly-created image. The `TYPE` is either "venv" or "plain".
43
+
44
+ - `vocker image (ls | list)`
45
+ - List the available images and their creation timestamps. Also show the current containers for each image (if any).
46
+
47
+ - `vocker container (ls | list)`
48
+ - Does the same thing as `vocker image ls`, actually (lol).
49
+
50
+ - `vocker container create IMAGE-ID PATH`
51
+ - Extract a fresh clean copy of the image inside `PATH`. The path is assumed to be relative to the CONTAINERS path specified in the configuration.
52
+ - For a virtualenv this operation involves fixing some scripts which need the absolute path of the container inside of them, as well as generating or linking pyc files.
53
+
54
+ - `vocker container commit PATH`
55
+ - Create a new image from the container at `PATH`. The type of the container is assumed to be the same as the parent container.
56
+ - This is for lazy people who don't want to use `vocker image import --type=venv PATH`.
57
+
58
+ - `vocker container delete PATH`
59
+ - Delete the container at `PATH`.
60
+
61
+ - `vocker fsck`
62
+ - Check shared/deduplicated file integrity. If a file is found to have been modified (corrupted), then list every location where it exists (inside every container).
63
+
64
+ - `vocker gc`
65
+ - Check for deleted containers and delete associated data. This includes shared pyc files that are no longer used by any container.
66
+
67
+ `vocker import -t pyenv path/`
68
+
69
+ ## Concerns
70
+
71
+ ### File deduplication backends
72
+
73
+ There are multiple ways of achieving file content deduplication (that is, having the exact same file contents available at multiple paths without paying multiple times the file size in storage).
74
+
75
+ This feature is needed both for deduplicating the same image file across different containers, as well as sharing pyc files across containers that have the same Python version.
76
+
77
+ Relevant filesystem features are:
78
+
79
+ - Copy-on-write file copies (reflinks)
80
+ - This means that the file contents are initially shared between the two file "copies". If one of the copies is modified, only that section of the file is copied and modified without affecting the other copy.
81
+ - The file metadata (owning user, permissions, extended attributes) is not shared between the different copies, and can be modified independently.
82
+ - Because this data sharing is very filesystem-specific, reflinks only work within the same filesystem. You can never make a reflink copy across two different filesystems (like a hard drive and a USB drive).
83
+ - This sort of copy requires special filesystem support.
84
+ - On GNU+Linux, that's bcachefs, btrfs, and XFS [(ref)](https://unix.stackexchange.com/a/631238).
85
+ - On Windows, it's only supported on ReFS v2, which is available [only](https://github.com/0xbadfca11/reflink) on Windows Server 2016 and Windows 10 version 1703 (build 15063) or later.
86
+ - There are no other operating systems relevant today. But if one existed, it would support it only on [APFS](https://unix.stackexchange.com/a/550932).
87
+ - For example, let's say you're ripping your favourite BluRay movie to a high-quality mkv file, and you've produced a 20 GB file after several hours. You now want to embed a subtitle translation inside the mkv file, but you're not familiar with the tooling and you're afraid of corrupting your precious mkv file. You could make a backup copy, but that would take a long time (it's 20 GB after all). What you can do instead is `cp --reflink=always movie.mkv backup.mkv` which works instantly (since it doesn't make an actual copy), and then you can run `mkv-embed-subtitle movie.mkv my_subtitle.sub` which modifies a small part of `movie.mkv` without affecting the contents of `backup.mkv` and while still sharing most of the data between the two files.
88
+ - Symbolic links (also called "soft links" and "symlinks")
89
+ - These are a UNIX thing, so they are natively supported on UNIX-likes like GNU+Linux and OS X.
90
+ - A symbolic link is kind of like a shortcut to a file. It is NOT a separate copy of the file. When you open the file, the operating system automatically follows the link and opens the actual file that it points to. Opening and modifying the symbolic link actually just modifies the original file.
91
+ - Because a symbolic link is just a shortcut containing an arbitrary path, a symbolic link can point anywhere, including locations on another filesystem.
92
+ - Also requires special filesystem support. Symbolic links are a native UNIX feature, so they are natively supported on GNU+Linux and OS X.
93
+ - GNU+Linux and OSX support them on all native filesystems (ext2/ext3/ext4/btrfs/etc).
94
+ - Windows sort of supports symbolic links but may require admin privileges to enable them. They are supported on NTFS and ReFS but not on FAT16/FAT32.
95
+ - Hard links
96
+ - Hard links are a bit weirder. Both the file contents and metadata are shared across files that are hard linked together. Metadata includes ownership, permissions, and extended attributes. Unlike symbolic links, there is no concept of link vs original file. All hard links are completely equivalent to each other.
97
+ - Also requires special filesystem support. Hard links are a native UNIX feature, so they are natively supported on GNU+Linux and OS X.
98
+ - On GNU+Linux and OS X, they are supported on roughly the same filesystems that also support symbolic links, which means all of the native filesystems.
99
+ - On Windows, they are supported AND allowed by default on NTFS and probably ReFS!
100
+ - There are some funny issues arising with hard links on Windows. For instance, Windows has extensive file locking (you can't edit a file while it's in use by another application). If a file is opened through a hard link, then it cannot be edited through another hard link, which makes sense. However, if a DLL file is hard linked and in use by a program, then you cannot delete another hard link of that file, because that counts as "editing"! The best you can do is to move the file to another location (on the same filesystem) and delete it later when it's not in use.
101
+
102
+ This [link](https://superuser.com/a/1340149) summarizes the state of affairs on Windows.
103
+
104
+ No one solution fits all platforms, and different users may want different things.
105
+
106
+ | Backend | Windows support | Linux support | Prevent accidental corruption? |
107
+ | :---------------- | :-------------- | :------------ | :----------------------------- |
108
+ | Reflinks | very limited | limited | yes |
109
+ | Symbolic links | no - admin only | yes | no |
110
+ | Hard links | yes | yes | no |
111
+
112
+ The item "Prevent accidental corruption" refers to whether modifying a deduplicated file also affects other deduplicated links of the same file.
113
+
114
+ A full check for corruption involves re-reading all the file data and checking the hash. However, a much faster way to check for modifications is to just look at the file [modification time](https://linux.die.net/man/2/stat) and see whether it has been modified recently. The file mtime can be set to a date far into the past (year 1970), and any change from that would indicate an accidental modification. (This quick method is not reliable against intentional modifications, because a program could also overwrite the mtime and reset it to its initial value after modifying a file.)
115
+
116
+ ### pyc files
117
+
118
+ pyc files are specific to the Python version (3.9, 3.12, etc) and implementation (CPython vs PyPy).
119
+
120
+ They should NOT be stored inside the images, and instead re-created when a container is created. This can be done by running `python -m compileall --invalidation-mode unchecked-hash <DIRECTORY>`.
121
+
122
+ #### pyc file sharing
123
+
124
+ pyc files can be pretty large overall. They should therefore be shared between containers that have the same Python version. For example, all Python 3.9 containers that have the same ".py" file in some package should share the same pyc file (using hardlinks or filesystem reflinks).
125
+
126
+ This implies a database table mapping `(python_version, source_file_hash)` to `pyc_file_hash`. You may notice that this also creates an additional annoying coupling between the image storage component and the Python containerizer.
127
+
128
+ ### activation scripts
129
+
130
+ The scripts like "activate.bat" and "activate.ps1" and "activate.sh" usually embed the path to the container inside themselves. This is bad. During image creation, the paths must be removed from these files. During container creation (from an image), the paths must be put back inside the script OR an alternative relocatable script must be written instead (relocatable = you can move it elsewhere without breaking it).
131
+
132
+ ### command executables
133
+
134
+ This is a unique thing on Windows, and only exists because Windows doesn't have a shebang mechanism for running scripts using an interpreter. Each of these executables is a true Windows executable with a zip archive appended at the end. The zip archive contains a script which does have a shebang, and that shebang has the container path explicit in there.
135
+
136
+ ## Design ideas
137
+
138
+ This project can be largely split into two components. So much that they could arguably be put in separate packages.
139
+
140
+ ### Image storage component
141
+
142
+ This manages the file tree, does hashing, implements pull/push/repack, implements hard links / CoW reflinks to save space on identical files.
143
+
144
+ ### Python containerizer
145
+
146
+ This component handles two important operations:
147
+
148
+ - Turning a (venv) directory into an image. This involves removing the pyc files and removing the absolute paths inside various files.
149
+ - Turning an image into a container (venv directory). This involves re-generating the pyc files and putting back the paths inside various files.
150
+
151
+ These operations are both very Python-specific, and have not much to do with the image storage component.
152
+
153
+ ## Module organization
154
+
155
+ - `dedup`: Deduplicated file storage.
156
+ - Uses: n/a.
157
+ - `image`: Image creation and storage.
158
+ - Provides:
159
+ - Image creation from a venv directory.
160
+ - Image repository format.
161
+ - To make things very easy, the repository doesn't need to be served by a special server. Just plain static HTTP server is fine.
162
+ - Must compress related files together (like tar.xz). This can be done simply by sorting the file list.
163
+ - Must allow retrieving any image and its files without wasting "too much" bandwidth by downloading unwanted content. The maximum wasted bandwidth could be specified as a fixed amount when downloading an image, or as a percentage of the image content size, or both. If a "similar" image is already locally available, then this module must be able to reuse as much of the content as possible to minimize the download. This is a nontrivial CS problem, so have fun! It's a tradeoff, so let's consider a few shitty solutions:
164
+ - Compress each content file individually.
165
+ - ✅ You can download any subset of the files without any "waste".
166
+ - ❌ The compression won't be very good because you didn't pack together related files.
167
+ - Compress ALL of the content files together.
168
+ - ❌ You can't download any subset of the files without a lot of "waste".
169
+ - ✅ The compression will be very good (because related files got compressed together).
170
+ - ❌ You can't download any subset of the content efficiently.
171
+ - Compress together the files for a given image. One image = one big compressed archive.
172
+ - ✅ You can download any subset of the files without any "waste".
173
+ - ✅ The compression will be very good (because related files got compressed together).
174
+ - ✅ You can download one full image without wasting any bandwidth.
175
+ - ❌ If you previously downloaded a nearly identical image, you still won't be able to download "just the difference".
176
+ - Compress together the files for a given Python package by guessing from the file path. For example, all files under an image's `VENV/lib/python3.11/site-packages/waitress` are grouped together for compression.
177
+ - ✅ You can download any subset of the files without any "waste".
178
+ - ✅ The compression will be very good (because files within the same package are related and they got compressed together).
179
+ - ✅ You can download one full image without wasting any bandwidth.
180
+ - ✅ If you previously downloaded a nearly identical image, only different packages need to be downloaded.
181
+ - ❌ If a previously-downloaded package is really large but the new version only has a few changed files, that's still a large download.
182
+ - Same as above, but split each group into subsets with a maximum size (e.g., 2 megabytes).
183
+ - ✅ You can download any subset of the files without any "waste".
184
+ - ✅ The compression will be very good (because files within the same package are related and they got compressed together).
185
+ - ✅ You can download one full image without wasting any bandwidth.
186
+ - ✅ If you previously downloaded a nearly identical image, only different packages need to be downloaded.
187
+ - ✅ If a previously-downloaded package is really large but the new version only has a few changed files, you can probably only download the affected subsets.
188
+ - Uses:
189
+ - `dedup` to unpack an image's files into a container directory.
190
+ - `venv` to "fix" a newly-unpacked image so that it works correctly as a Python virtualenv, and conversely to "generalize" an existing venv so that it can be reproducibly turned into an image.
191
+ - `plugin.venv`: Virtualenv-specific stuff.
192
+ - Uses:
193
+ - `dedup` to efficiently store pyc files that are shared across multiple containers.
194
+
195
+ ## Implementation starter notes
196
+
197
+ ### Dependencies
198
+
199
+ - attrs
200
+ - marshmallow, probably
201
+
202
+ You will most likely need to keep around content indexes (which image archive file contains what data), as well as a record of which hardlink file contains which hash. The easiest way for both of these is to use a sqlite file. Either use `sqlalchemy` (a 1.4MB dependency!) as an ORM layer, or just use raw sqlite (the standard library Python module). Honestly raw sqlite isn't that bad. Using `sqlalchemy` would maybe also be a good experience. Idk.
203
+
204
+ ### Virtualenv creation
205
+
206
+ To create a venv manually inside directory `./venvy/`:
207
+
208
+ python -m venv --copies venvy
209
+
210
+ You can now "enter" the virtual environment using:
211
+
212
+ source bin/activate # on OSX/Linux
213
+ Scripts\activate.bat # on Windows
214
+
215
+ The shell will remain "activated" until it is closed. It does not affect other shells. The activation actually just modifies the "$PATH" environment variable.
216
+
217
+ You can check that `python` is now referring to the virtual environment:
218
+
219
+ which python # on OSX/Linux
220
+ where python # on Windows
221
+
222
+ You should see a path with `venvy` in it.
223
+
224
+ Note that `pip install` will, by default, download code from the internet and execute it! That's why I always use the `--no-index` flag. If you care about security and you don't want to use `--no-index`, please use a virtual machine or a docker/podman container during development, or idk use WSL2 on a Windows install you don't care about. I will be providing commands using `--no-index` because I am paranoid about security. Even the `pip download` command isn't safe, it will still sometimes execute code from the internet!
225
+
226
+ Let's add some packages to venvy! First download the latest wheel ".whl" files from:
227
+
228
+ - https://pypi.org/project/attrs/#files
229
+ - https://pypi.org/project/pure-radix/#files
230
+ - https://pypi.org/project/waitress/#files
231
+
232
+ (Just a few example packages.)
233
+
234
+ Feel free to open their contents and look inside. They're just zip archives! Place them in a directory called `./wheelhouse/`.
235
+
236
+ Now you can do something like:
237
+
238
+ python -m pip install --no-index --find-links /path/to/wheelhouse pure-radix waitress
239
+
240
+ which will install those packages. Notice now that there is a new command `waitress-serve` available inside the virtual environment. This command is located inside the `./venvy/bin/` (UNIX) or `./venvy/Scripts/` (Windows) directory. Look at its contents.
241
+
242
+ On UNIX, it will just be a script with a shebang like "#!/path/to/venvy/bin/python".
243
+ On Windows, it will be an exe file with a zip archive appended at the end which will contain a similar shebang with double quotes around the path.
244
+
245
+ You will see a whole bunch of pyc files inside the virtualenv. You can list them using
246
+
247
+ find -name '*.pyc'
248
+
249
+ You can get a list of all files that contain the virtualenv path explicitly using:
250
+
251
+ find -type f -not -name '*.pyc' -print0 | xargs -0 grep -H venvy
252
+
253
+ (We're excluding the pyc files because those always contain the path.)
254
+
255
+ To turn a virtualenv into an image, all instances of the virtualenv path MUST be removed from all of the files, and all pyc files must be removed.
256
+
257
+ To turn an image back into a virtualenv, the new virtualenv root path must be embedded back into the files, and the pyc files must be regenerated.
258
+
259
+ ## Repository format
260
+
261
+ To be quantitative, our goal is to efficiently deal with a 2GB virtualenv containing 100k files, and many variations of it (with slightly different package versions). Most images will share the same files.
262
+
263
+ The content hash for each file is 32-64 bytes, and the average path length seems to be about 50 characters. Maybe add one bit of metadata for the executable bit. A full index for such a virtualenv would be 10 MB in size. This is acceptable for the initial download, but not for the download of mostly-identical images.
264
+
265
+ I would prefer to avoid explicit delta compression between index files because then I have to decide how long to keep around the deltas and base files.
266
+
267
+ Maybe we can use HTTP request ranges?
268
+
269
+ ### Assumptions
270
+
271
+ From image to image, most files don't change. Only some related files change together, for example when a Python package is updated.
272
+
273
+ ### Idea
274
+
275
+ Given an image ID, assemble the image metadata which hashes to this image ID *and* determine which archives to fetch.
276
+
277
+ #### Image metadata resolution
278
+
279
+ The image ID is the hash of the image metadata, which contains:
280
+
281
+ - An arbitrary user-data block (probably JSON or CBOR format)
282
+ - A file dictionary {file_path: (file_metadata, file_content_hash)}
283
+
284
+ The file dictionary is very large (could be 100k entries) and only some parts of it change.
285
+
286
+ ##### Image metadata compression
287
+
288
+ The image metadata is split into two files:
289
+
290
+ 1. The paths and file metadata come first and are compressed together. The paths contain lots of repeated strings, so compression should be highly effective.
291
+ 2. The content hash of every file, in the same order as in the previous file. Hashes are incompressible, so there's no point in even trying to compress them.
292
+
293
+ ##### Variant 1: Image metadata sharding (good)
294
+
295
+ Split image metadata into multiple shards. An (overly) simple design would be that each shard gets its own compressed archive file with file contents.
296
+
297
+ But then what about deltas between archives? Maybe the client has an older archive and very little has changed since then. Each shard lists the relevant archive files, then efficiently states which archive contains which file contents. This could even be a compressed matrix bitfield. In other words, given archive index i and file content j, the entry `m[i,j]` encodes whether archive i contains file j.
298
+
299
+ ##### Variant 2: Image metadata delta encoding (bad)
300
+
301
+ Compute deltas between image metadatas, and have an index file listing the available images and deltas and their sizes. The client can simply pick a favourable path through the deltas.
302
+
303
+ The delta stuff adds a lot of complexity however. It also doesn't work that well if the images are mixing and matching among package versions.
304
+
305
+ ##### Hash digest swizzling
306
+
307
+ The image metadata shards contain lists of file content hashes. We can swizzle the hashes such that first we have the first byte of every content hash, then the second byte of every content hash, and so on. This allows a client to download the first few bytes of the hash for every file.
308
+
309
+ #### Server sync
310
+
311
+ When uploading a new image, a client should be aware of all of the existing images and archives on the server. This is so that it can create archives efficiently.
312
+
313
+ The easiest way is probably to just download all of the image metadata from the server. Wait but there could be files inside archives that aren't inside any image, and that are being re-added. I guess we can have a list of orphaned files in one of those image metadata shards using empty filenames.
314
+
315
+ Basically make it an invariant that we never remove image metadata shards if it would result in archive files not being listed inside any image metadata shard.
316
+
317
+ ##### Sync algorithm
318
+
319
+ 1. Pull all image metadata. This includes everything except archive contents.
320
+ 2. Lock the remote repository by writing to "lock.txt".
321
+ 3. Pull all image metadata again, in case new content was pushed before the lock was acquired.
322
+ 4. Create a new catalog with an updated image list (images added or removed).
323
+ 5. Create meta shards for each image, or reuse existing meta shards.
324
+ 6. Create archives to contain new files that aren't in any archive yet or which would incur too large of a download cost (pulling a 5MB archive for a 10KB file) or too large of a decompression cost (1 MB archive containing 1000 versions of a library expanding to 1 GB of source code)
325
+
326
+ ##### File creation algorithm
327
+
328
+ There's **a lot** of flexibility in choosing what shards and archives to create. Here's a crappy provisional draft.
329
+
330
+ - Partition the image file dictionary based off paths. Label each subset with a stable string called the "shard key".
331
+ - For each subset of the partition:
332
+ - If there is already an existing shard with exactly the same files, then use that one and continue to the next subset in the loop.
333
+ - Try to find a similar shard - if one exists, also create a "diff shard" which has file entries only for the files that are actually different between the older shard and the file subset.
334
+ - (We now know what files we will have in the new shard, so what's left is archive creation.)
335
+ - Identify archives that are quasi-subsets of this shard's files. In other words, find archives whose contents are mostly subsets of this shard's files.
336
+ - Create a new archive containing the files that are not in any quasi-subset archive.
337
+ - If the estimated wasted bandwidth exceeds the allowed limit, then create new archives. The new archives must support fetching any two images in succession without excessive wasted bandwidth.
338
+
339
+ ###### Required queries
340
+
341
+ - Given a content hash and a repository, find all archives that contain that content.
342
+ - Given a content hash and a repository and a shard key, find all images that reference that content hash.
343
+
344
+ #### Example
345
+
346
+ ```
347
+ /vocker.cbor # Contains the vocker version, and the "project-code" and "server-code". Also contains the cryptographic hash function used for all hashes.
348
+ /current-catalog.cbor # Current catalog index.
349
+ /manifest-history.cbor # Top-level hash and timestamp of current and several past manifest files.
350
+ /manifest/current.bin # List of every file path inside the repository and its hash. This is only used by the repository management stuff, not by clients that only download images.
351
+ /manifest/backup.bin # Old manifest.
352
+ /manifest/lock.cbor # Lock file to prevent concurrent writers from corrupting the repository. Shows who locked the repository and the lock expiration time.
353
+ /catalog/13/h.bin # Lists the image index and image ID for all available images. Also lists the "orphan file" image metadata entries.
354
+ /image/1/is.cbor # ID of the latest image-to-shard mapping. For example, it contains the integer 77.
355
+ /image/1/u.bin # Contains the image "user-data".
356
+ /image/2/a.bin
357
+ ...
358
+ /shard/1/p.bin # Compressed paths and file metadata. For example, "acme/foo.txt: not executable" and "acme/bar.exe: executable"
359
+ /shard/1/h.bin # Hash of each file content, in the same order as the file above.
360
+ /shard/1/sa.cbor # ID of the latest shard-to-archive mapping. For example, it contains the integer 67.
361
+ /shard/5/p.zst
362
+ ...
363
+ /shard/6/p.zst
364
+ ...
365
+ /shard/7/p.zst
366
+ ...
367
+ /sa/67/m.zst # Contains the mapping of shard to archive for one or more shards. For example, `shard[1] = archive[23] & archive[57]`. Also contains the approximate size of each archive.
368
+ /is/15/m.zst # Contains the mapping of image to shard for one or more images. For example, `image[1] = shard[1] & shard[5]`.
369
+ /archive/23/a.zst # Compressed contents of multiple files. Among them is the contents of "acme/foo.txt".
370
+ /archive/23/s.zst # Size of each of the compressed files. Compressed array of int64.
371
+ /archive/23/h.bin # Hash of each of the compressed files.
372
+ /archive/57/a.bin # Compressed contents of multiple files. Among them is the contents of "acme/bar.exe".
373
+ ```
374
+
375
+ #### Manifest file vs hash tree
376
+
377
+ Problem: there is no way for the client to check the integrity of their download. They cannot detect a truncated or corrupted download.
378
+
379
+ One way to prevent this is to have the client download checksums. A single manifest file that contains all the checksums for all the files in a repository could end up being very large.
380
+
381
+ Another way is to arrange the file hashes in a hash tree. The client normally only downloads the leaves of the hash tree to verify its downloaded content. A developer would download the entire hash tree instead.
382
+
383
+ Either way, there's a problem with a mismatch between content and checksum when updating the content.
384
+
385
+ ##### Checksum draft then content then checksum
386
+
387
+ Write procedure:
388
+
389
+ - Write all modified checksum nodes to a parallel file tree, suggestively named "/new/".
390
+ - Recursively traverse all modified directories, in a depth-first fashion:
391
+ - Perform all content updates (generally only additions and removals, very rarely modifications of existing files).
392
+ - Write the modified checksum nodes to the normal file tree.
393
+
394
+ Main advantage is that in case of an interrupted upload, it is easy to detect and skip directories that are finished by just reading the corresponding checksum node.
395
+
396
+ Read procedure:
397
+
398
+ - Read checksum node.
399
+ - Read file contents.
400
+ - If checksum matches, exit successfully.
401
+ - Read checksum node again, and read checksum from "new" parallel file tree. If the file contents match neither, then exit with an error.
402
+
403
+ A great advantage is that a client knows what hash to expect and can reuse previously downloaded content.
404
+
405
+ #### Other notes
406
+
407
+ We can use the deduplication to store repository files. That way, two repositories that have (largely) the same contents won't incur twice the cost.
408
+
409
+ We should use a separate deduplication store, not together with the vocker file contents.
410
+
411
+ For cleanliness, we can keep a separate cache directory for every repository location ("project-code" + "server-code").
412
+
413
+ #### Meta-to-archive map
414
+
415
+ A meta shard has files that can be assembled from many archives. We precompute the possible ways that they can be assembled, as in:
416
+
417
+ shard1 = ((archive1 | archive2) & (archive3 | archive4 | archive5)) | archive6
418
+
419
+ If a client already has archive1 then it can pick the smallest among archive{3,4,5} and have the full file set. No need to muck about with a compressed matrix bitfield.
420
+
421
+ The shard-to-archive map must also contain the size of each of the archives to allow the client to choose the optimal archive-set to satisfy the shard file-set requirements.
422
+
423
+ Once the client chooses particular archives, then it can download the corresponding archive hash list and size list.
424
+
425
+ MVP: exactly one archive per shard.
426
+
427
+ Format:
428
+
429
+ ```
430
+ [ARCHIVE_INDEX_LIST, ARCHIVE_SIZES_UINT16_BE_ARRAY, RULES_LIST]
431
+
432
+ Each rule is one of:
433
+
434
+ - ["OR", OPERANDS...]
435
+ - ["AND", OPERANDS...]
436
+ - ["OUT", OPERAND, SHARD1, SHARD2, ...]
437
+
438
+ Let n be the number of archives referenced. Then len(ARCHIVE_INDEX_LIST) = n.
439
+ Each OPERAND above is an integer k. If k>=0, then it refers to an archive by its position in the ARCHIVE_INDEX_LIST.
440
+ If k<0, then it refers to the output of the rule at index current_index+k.
441
+ ```
442
+
443
+ ### MVP
444
+
445
+ #### Image creation/deletion workflow
446
+
447
+ ```sh
448
+ # Download the current repository state into directory "backup1". It will be good to have in case
449
+ # things go awfully wrong.
450
+ vocker repo download @my-official-repository backup1
451
+
452
+ # Make a shallow copy of "backup1" that we will make edits to.
453
+ vocker repo copy backup1 edit1
454
+
455
+ # Add the image to local repository "edit1". This reuses pre-existing metas and archives from
456
+ # "edit1" as much as possible.
457
+ vocker image import -R edit1 --type=pyenv1 /path/to/my/python/environment/
458
+
459
+ # Delete another image by ID.
460
+ vocker image delete -R edit1 01234567
461
+
462
+ # Export an image out of a local repository for testing.
463
+ vocker image export -R edit1 /path/to/new/python/env/
464
+
465
+ # Export an image out of a remote repository for testing.
466
+ vocker image export -R @my-other-official-repository /path/to/new/python/env/
467
+
468
+ # Repack repository and delete archives.
469
+ vocker repo repack edit1
470
+
471
+ # Push the changes to the remote repository.
472
+ # This is fast because it simply compares the local "manifest.bin" with the remote "manifest.bin".
473
+ # If the remote repository has changed since we downloaded it, show an error and tell the user to
474
+ # use the --force flag if they really want to squash whatever remote changes have occurred.
475
+ vocker repo upload edit1 @my-official-repository
476
+ ```
@@ -0,0 +1,3 @@
1
+ include CHANGELOG.md
2
+ include DESIGN.md
3
+ include tests/*
vocker-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.4
2
+ Name: vocker
3
+ Version: 0.1.0
4
+ Summary: Docker-like manager for virtualenvs
5
+ Author-email: Eduard Christian Dumitrescu <eduard.c.dumitrescu@gmail.com>
6
+ License: General Public License v3
7
+ Project-URL: Homepage, https://hydra.ecd.space/deaduard/vocker/
8
+ Project-URL: Changelog, https://hydra.ecd.space/deaduard/vocker/file?name=CHANGELOG.md&ci=trunk
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: atomicwrites
11
+ Requires-Dist: attrs
12
+ Requires-Dist: boltons
13
+ Requires-Dist: cached_property
14
+ Requires-Dist: filelock
15
+ Requires-Dist: immutabledict
16
+ Requires-Dist: marshmallow
17
+ Requires-Dist: platformdirs
18
+ Requires-Dist: sansio_tools>=1.0.0
19
+ Requires-Dist: sqlalchemy_boltons>=2.4.0
20
+ Requires-Dist: SQLAlchemy
21
+ Requires-Dist: strictyaml
22
+ Requires-Dist: structlog
23
+ Requires-Dist: cbor2
24
+ Provides-Extra: zstandard
25
+ Requires-Dist: pyzstd; extra == "zstandard"
26
+ Provides-Extra: tests
27
+ Requires-Dist: pytest; extra == "tests"
28
+
29
+ # vocker
30
+
31
+ Manager for complete Python environments written with security in mind. Mostly for Windows.
32
+
33
+ ## Why
34
+
35
+ OK so here's a typical experience. You're working on different Python projects which require incompatible versions of dependencies. For example, one of them needs `libfoo==1.0.0` and the other needs `libfoo>3.0.0`. There's just no way to satisfy both. Python people encourage you to create different virtualenvs ("venvs") for different purposes. Sometimes a user reports a bug that they experience with some very specific version of a dependency, so you need to create yet another venv just to investigate that.
36
+
37
+ Here's a problem: every venv you install takes up a few hundred megabytes of disk space, and a lot of it is for completely redundant files. You were conned into buying an overpriced non-modular computer, so now your tiny non-upgradeable SSD is filled with many copies of the same files. You regret your life choices. Wouldn't it be nice if the duplicate files across different venvs didn't take up any additional space?
38
+
39
+ Users often report bugs against very specific versions of your software, and the café you work at has pretty slow WiFi. Installing hundreds of megabytes of the same packages over and over quickly grows tiresome. Wouldn't it be nice if you could just copy an existing venv and just tweak it a bit, for example replace the few packages that are actually different?
40
+
41
+ Finally, some of your nontechnical users refuse to compile and install their own software, but they do want to sometimes have multiple versions installed for testing purposes. However, they also bought non-upgradeable hardware so they don't want multiple copies of the same files that are identical across different versions of the software. Wouldn't it be nice if installing a new venv somehow recycled the existing files from the currently-installed venvs?
42
+
43
+ Some of your users are paranoid about security. Wouldn't it be nice if the software integrity of the venv-based software package were guaranteed through hashing and Merkle trees?
44
+
45
+ That's why.
46
+
47
+ ## Goals
48
+
49
+ - Developers can easily create images, and then distribute them to users who use them to run applications. The users don't necessarily use vocker directly to create containers, they may use some extra layer on top of it (like an installer that provides a GUI and maybe digital signature verification).
50
+ - Developers can easily create images from existing images by tweaking whatever needs to be different. For example, installing new software or modifying files.
51
+ - Image creation should be reproducible. That is, creating a Python environment and then turning it into an image should give you exactly the same image if you do that a second time. The resulting image hash should be identical.
52
+ - Developers can easily audit existing images by just rebuilding them from scratch and checking whether the final result is the same.
53
+
54
+ ## Non-goals
55
+
56
+ - Digital signature verification.
vocker-0.1.0/README.md ADDED
@@ -0,0 +1,28 @@
1
+ # vocker
2
+
3
+ Manager for complete Python environments written with security in mind. Mostly for Windows.
4
+
5
+ ## Why
6
+
7
+ OK so here's a typical experience. You're working on different Python projects which require incompatible versions of dependencies. For example, one of them needs `libfoo==1.0.0` and the other needs `libfoo>3.0.0`. There's just no way to satisfy both. Python people encourage you to create different virtualenvs ("venvs") for different purposes. Sometimes a user reports a bug that they experience with some very specific version of a dependency, so you need to create yet another venv just to investigate that.
8
+
9
+ Here's a problem: every venv you install takes up a few hundred megabytes of disk space, and a lot of it is for completely redundant files. You were conned into buying an overpriced non-modular computer, so now your tiny non-upgradeable SSD is filled with many copies of the same files. You regret your life choices. Wouldn't it be nice if the duplicate files across different venvs didn't take up any additional space?
10
+
11
+ Users often report bugs against very specific versions of your software, and the café you work at has pretty slow WiFi. Installing hundreds of megabytes of the same packages over and over quickly grows tiresome. Wouldn't it be nice if you could just copy an existing venv and just tweak it a bit, for example replace the few packages that are actually different?
12
+
13
+ Finally, some of your nontechnical users refuse to compile and install their own software, but they do want to sometimes have multiple versions installed for testing purposes. However, they also bought non-upgradeable hardware so they don't want multiple copies of the same files that are identical across different versions of the software. Wouldn't it be nice if installing a new venv somehow recycled the existing files from the currently-installed venvs?
14
+
15
+ Some of your users are paranoid about security. Wouldn't it be nice if the software integrity of the venv-based software package were guaranteed through hashing and Merkle trees?
16
+
17
+ That's why.
18
+
19
+ ## Goals
20
+
21
+ - Developers can easily create images, and then distribute them to users who use them to run applications. The users don't necessarily use vocker directly to create containers, they may use some extra layer on top of it (like an installer that provides a GUI and maybe digital signature verification).
22
+ - Developers can easily create images from existing images by tweaking whatever needs to be different. For example, installing new software or modifying files.
23
+ - Image creation should be reproducible. That is, creating a Python environment and then turning it into an image should give you exactly the same image if you do that a second time. The resulting image hash should be identical.
24
+ - Developers can easily audit existing images by just rebuilding them from scratch and checking whether the final result is the same.
25
+
26
+ ## Non-goals
27
+
28
+ - Digital signature verification.
@@ -0,0 +1,43 @@
1
+ [tool.black]
2
+ line-length = 100
3
+
4
+ [project]
5
+ name = "vocker"
6
+ version = "0.1.0"
7
+ description = "Docker-like manager for virtualenvs"
8
+ readme = "README.md"
9
+ license = { text = "General Public License v3" }
10
+ authors = [
11
+ { name = "Eduard Christian Dumitrescu", email = "eduard.c.dumitrescu@gmail.com" },
12
+ ]
13
+ dependencies = [
14
+ "atomicwrites",
15
+ "attrs",
16
+ "boltons",
17
+ "cached_property",
18
+ "filelock",
19
+ "immutabledict",
20
+ "marshmallow",
21
+ "platformdirs",
22
+ "sansio_tools>=1.0.0",
23
+ "sqlalchemy_boltons>=2.4.0",
24
+ "SQLAlchemy",
25
+ "strictyaml",
26
+ "structlog",
27
+ "cbor2",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ zstandard = [
32
+ "pyzstd",
33
+ ]
34
+ tests = [
35
+ "pytest",
36
+ ]
37
+
38
+ [project.urls]
39
+ Homepage = "https://hydra.ecd.space/deaduard/vocker/"
40
+ Changelog = "https://hydra.ecd.space/deaduard/vocker/file?name=CHANGELOG.md&ci=trunk"
41
+
42
+ [tool.setuptools.package-data]
43
+ "vocker" = ["py.typed"]
vocker-0.1.0/setup.cfg ADDED
@@ -0,0 +1,9 @@
1
+ [flake8]
2
+ ignore = E203,E302,E305,E704,E711,E712,E731,E741,W,C901,MC0001
3
+ max-line-length = 100
4
+ max-complexity = 99
5
+
6
+ [egg_info]
7
+ tag_build =
8
+ tag_date = 0
9
+
vocker-0.1.0/setup.py ADDED
@@ -0,0 +1,3 @@
1
+ from setuptools import setup
2
+
3
+ setup(name="vocker", version="0.1.0")
File without changes
@@ -0,0 +1,3 @@
1
+ from .cli import Main
2
+
3
+ Main.main()