canonzip 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- canonzip-1.0.0/LICENSE +21 -0
- canonzip-1.0.0/PKG-INFO +161 -0
- canonzip-1.0.0/README.md +137 -0
- canonzip-1.0.0/pyproject.toml +71 -0
- canonzip-1.0.0/src/canonzip/__init__.py +31 -0
- canonzip-1.0.0/src/canonzip/__main__.py +5 -0
- canonzip-1.0.0/src/canonzip/cli.py +111 -0
- canonzip-1.0.0/src/canonzip/exceptions.py +73 -0
- canonzip-1.0.0/src/canonzip/hashing.py +92 -0
- canonzip-1.0.0/src/canonzip/manifest.py +258 -0
- canonzip-1.0.0/src/canonzip/py.typed +0 -0
- canonzip-1.0.0/src/canonzip/zipping.py +112 -0
canonzip-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Tyler Coles
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
canonzip-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: canonzip
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Produce canonical zips and hashes.
|
|
5
|
+
Author: Tyler Coles
|
|
6
|
+
Author-email: Tyler Coles <tylercoles@javadocmd.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Topic :: Utilities
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Requires-Dist: pygit2==1.19.1
|
|
20
|
+
Requires-Dist: typer==0.24.1
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Project-URL: Repository, https://github.com/JavadocMD/canonzip.git
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# canonzip
|
|
26
|
+
|
|
27
|
+
Produce canonical zips and hashes from directory contents.
|
|
28
|
+
|
|
29
|
+
A canonical zip produces the exact same file for the same inputs,
|
|
30
|
+
regardless of when it was made or what machine made it.
|
|
31
|
+
|
|
32
|
+
A canonical hash produces the exact same hash for the same inputs,
|
|
33
|
+
regardless of when it was made or what machine made it.
|
|
34
|
+
|
|
35
|
+
This is particularly useful when zipping things like code for
|
|
36
|
+
AWS Lambda Functions, where you want to upload a new zip if and
|
|
37
|
+
only if the code has truly changed.
|
|
38
|
+
|
|
39
|
+
canonzip supports two usage modes: as a CLI or as an API.
|
|
40
|
+
|
|
41
|
+
Check out [`examples/terraform-aws-lambda`](./examples/terraform-aws-lambda/)
|
|
42
|
+
for an example use-case.
|
|
43
|
+
|
|
44
|
+
## Command Line Interface (CLI)
|
|
45
|
+
|
|
46
|
+
### `canonzip hash [OPTIONS] TARGET`
|
|
47
|
+
|
|
48
|
+
Print a canonical SHA-1 hash of `TARGET` to stdout.
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
$ canonzip hash path/to/target
|
|
52
|
+
4959e4b9a1812e511570eee14fe65b90098a0db6
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### `canonzip zip [OPTIONS] OUTPUT_PATH TARGET`
|
|
56
|
+
|
|
57
|
+
Write a canonical zip archive of `TARGET` to `OUTPUT_PATH`.
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
$ canonzip zip path/to/output.zip path/to/target
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
NOTE: the output of `hash` is *NOT* the same as the SHA-1 hash of the output
|
|
64
|
+
from `zip`. `hash` is specifically designed to avoid the extra overhead of
|
|
65
|
+
writing a zip file while fulfilling a similar use-case — detecting
|
|
66
|
+
changes in the files.
|
|
67
|
+
|
|
68
|
+
### CLI options
|
|
69
|
+
|
|
70
|
+
Both commands accept:
|
|
71
|
+
|
|
72
|
+
| Option | Description |
|
|
73
|
+
|---|---|
|
|
74
|
+
| `--exclude TEXT, -e TEXT` | Glob pattern to exclude (repeatable) |
|
|
75
|
+
| `--gitignore` | Exclude files based on `.gitignore` rules from the target's git repository |
|
|
76
|
+
| `--follow-symlinks` | Follow symbolic links; otherwise symlinks are ignored |
|
|
77
|
+
| `--verbose, -v` | Print included file paths (relative to target) to stderr |
|
|
78
|
+
| `--json` | Output result as JSON (e.g. `{"hash": "..."}`) |
|
|
79
|
+
|
|
80
|
+
If you specify both `exclude` and `gitignore`, files will be excluded as long
|
|
81
|
+
as they match at least one rule (logical or).
|
|
82
|
+
|
|
83
|
+
NOTE: double-star globs (`**`) in `exclude` patterns match one or more path segments,
|
|
84
|
+
unlike gitignore syntax, where they match zero or more.
|
|
85
|
+
|
|
86
|
+
## Programmatic Interface (API)
|
|
87
|
+
|
|
88
|
+
### `canonzip.hash(target, *, exclude, gitignore, follow_symlinks) -> str`
|
|
89
|
+
|
|
90
|
+
Compute a canonical SHA-1 hash of a directory.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import canonzip
|
|
94
|
+
|
|
95
|
+
digest = canonzip.hash("path/to/target")
|
|
96
|
+
#> "4959e4b9a1812e511570eee14fe65b90098a0db6"
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### `canonzip.zip(output_path, target, *, exclude, gitignore, follow_symlinks) -> None`
|
|
100
|
+
|
|
101
|
+
Create a canonical zip archive of a directory.
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
canonzip.zip("path/to/output.zip", "path/to/target")
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Shared options
|
|
108
|
+
|
|
109
|
+
Both functions accept:
|
|
110
|
+
|
|
111
|
+
| Parameter | Type | Default | Description |
|
|
112
|
+
|---|---|---|---|
|
|
113
|
+
| `exclude` | `list[str] \| None` | `None` | Glob patterns to exclude |
|
|
114
|
+
| `gitignore` | `bool` | `False` | Exclude files based on `.gitignore` rules from the target's git repository |
|
|
115
|
+
| `follow_symlinks` | `bool` | `False` | Follow symbolic links; if `False`, symlinks are ignored |
|
|
116
|
+
|
|
117
|
+
If you specify both `exclude` and `gitignore`, files will be excluded as long
|
|
118
|
+
as they match at least one rule (logical or).
|
|
119
|
+
|
|
120
|
+
NOTE: double-star globs (`**`) in `exclude` patterns match one or more path segments,
|
|
121
|
+
unlike gitignore syntax, where they match zero or more.
|
|
122
|
+
|
|
123
|
+
### Exceptions
|
|
124
|
+
|
|
125
|
+
canonzip will raise standard errors if it cannot read or write files,
|
|
126
|
+
typically inheriting from `OSError`.
|
|
127
|
+
|
|
128
|
+
Additionally, some special cases raise errors that inherit
|
|
129
|
+
from `canonzip.CanonzipError`:
|
|
130
|
+
|
|
131
|
+
| Exception | Raised when |
|
|
132
|
+
|---|---|
|
|
133
|
+
| `OutputPathError` | `output_path` is inside `target` |
|
|
134
|
+
| `GitRepositoryError` | `gitignore=True` but target is not in a git repo |
|
|
135
|
+
| `BrokenSymlinkError` | A broken symlink is encountered with `follow_symlinks=True` |
|
|
136
|
+
| `SymlinkCycleError` | A symlink cycle is detected with `follow_symlinks=True` |
|
|
137
|
+
|
|
138
|
+
### Advanced: build manifests explicitly
|
|
139
|
+
|
|
140
|
+
If you need direct access to the list of files that *would* be included in the
|
|
141
|
+
canonical hash or zip, you can use `build_manifest` to read the target
|
|
142
|
+
directory and return a `Manifest` object containing the list of files.
|
|
143
|
+
To save yourself from having to generate the manifest twice, you can then pass
|
|
144
|
+
it directly to `hash_from_manifest` or `zip_from_manifest` to complete the
|
|
145
|
+
operation.
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from canonzip import build_manifest, hash_from_manifest, zip_from_manifest
|
|
149
|
+
|
|
150
|
+
manifest = build_manifest("path/to/target", exclude=[".venv"])
|
|
151
|
+
|
|
152
|
+
# Do something interesting with the manifest...
|
|
153
|
+
print(manifest.target.as_posix())
|
|
154
|
+
|
|
155
|
+
for entry in manifest.entries:
|
|
156
|
+
print(entry.path.as_posix())
|
|
157
|
+
|
|
158
|
+
# Then compute the hash or zip
|
|
159
|
+
digest = hash_from_manifest(manifest)
|
|
160
|
+
zip_from_manifest("path/to/output.zip", manifest)
|
|
161
|
+
```
|
canonzip-1.0.0/README.md
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# canonzip
|
|
2
|
+
|
|
3
|
+
Produce canonical zips and hashes from directory contents.
|
|
4
|
+
|
|
5
|
+
A canonical zip produces the exact same file for the same inputs,
|
|
6
|
+
regardless of when it was made or what machine made it.
|
|
7
|
+
|
|
8
|
+
A canonical hash produces the exact same hash for the same inputs,
|
|
9
|
+
regardless of when it was made or what machine made it.
|
|
10
|
+
|
|
11
|
+
This is particularly useful when zipping things like code for
|
|
12
|
+
AWS Lambda Functions, where you want to upload a new zip if and
|
|
13
|
+
only if the code has truly changed.
|
|
14
|
+
|
|
15
|
+
canonzip supports two usage modes: as a CLI or as an API.
|
|
16
|
+
|
|
17
|
+
Check out [`examples/terraform-aws-lambda`](./examples/terraform-aws-lambda/)
|
|
18
|
+
for an example use-case.
|
|
19
|
+
|
|
20
|
+
## Command Line Interface (CLI)
|
|
21
|
+
|
|
22
|
+
### `canonzip hash [OPTIONS] TARGET`
|
|
23
|
+
|
|
24
|
+
Print a canonical SHA-1 hash of `TARGET` to stdout.
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
$ canonzip hash path/to/target
|
|
28
|
+
4959e4b9a1812e511570eee14fe65b90098a0db6
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### `canonzip zip [OPTIONS] OUTPUT_PATH TARGET`
|
|
32
|
+
|
|
33
|
+
Write a canonical zip archive of `TARGET` to `OUTPUT_PATH`.
|
|
34
|
+
|
|
35
|
+
```
|
|
36
|
+
$ canonzip zip path/to/output.zip path/to/target
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
NOTE: the output of `hash` is *NOT* the same as the SHA-1 hash of the output
|
|
40
|
+
from `zip`. `hash` is specifically designed to avoid the extra overhead of
|
|
41
|
+
writing a zip file while fulfilling a similar use-case — detecting
|
|
42
|
+
changes in the files.
|
|
43
|
+
|
|
44
|
+
### CLI options
|
|
45
|
+
|
|
46
|
+
Both commands accept:
|
|
47
|
+
|
|
48
|
+
| Option | Description |
|
|
49
|
+
|---|---|
|
|
50
|
+
| `--exclude TEXT, -e TEXT` | Glob pattern to exclude (repeatable) |
|
|
51
|
+
| `--gitignore` | Exclude files based on `.gitignore` rules from the target's git repository |
|
|
52
|
+
| `--follow-symlinks` | Follow symbolic links; otherwise symlinks are ignored |
|
|
53
|
+
| `--verbose, -v` | Print included file paths (relative to target) to stderr |
|
|
54
|
+
| `--json` | Output result as JSON (e.g. `{"hash": "..."}`) |
|
|
55
|
+
|
|
56
|
+
If you specify both `exclude` and `gitignore`, files will be excluded as long
|
|
57
|
+
as they match at least one rule (logical or).
|
|
58
|
+
|
|
59
|
+
NOTE: double-star globs (`**`) in `exclude` patterns match one or more path segments,
|
|
60
|
+
unlike gitignore syntax, where they match zero or more.
|
|
61
|
+
|
|
62
|
+
## Programmatic Interface (API)
|
|
63
|
+
|
|
64
|
+
### `canonzip.hash(target, *, exclude, gitignore, follow_symlinks) -> str`
|
|
65
|
+
|
|
66
|
+
Compute a canonical SHA-1 hash of a directory.
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import canonzip
|
|
70
|
+
|
|
71
|
+
digest = canonzip.hash("path/to/target")
|
|
72
|
+
#> "4959e4b9a1812e511570eee14fe65b90098a0db6"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### `canonzip.zip(output_path, target, *, exclude, gitignore, follow_symlinks) -> None`
|
|
76
|
+
|
|
77
|
+
Create a canonical zip archive of a directory.
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
canonzip.zip("path/to/output.zip", "path/to/target")
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Shared options
|
|
84
|
+
|
|
85
|
+
Both functions accept:
|
|
86
|
+
|
|
87
|
+
| Parameter | Type | Default | Description |
|
|
88
|
+
|---|---|---|---|
|
|
89
|
+
| `exclude` | `list[str] \| None` | `None` | Glob patterns to exclude |
|
|
90
|
+
| `gitignore` | `bool` | `False` | Exclude files based on `.gitignore` rules from the target's git repository |
|
|
91
|
+
| `follow_symlinks` | `bool` | `False` | Follow symbolic links; if `False`, symlinks are ignored |
|
|
92
|
+
|
|
93
|
+
If you specify both `exclude` and `gitignore`, files will be excluded as long
|
|
94
|
+
as they match at least one rule (logical or).
|
|
95
|
+
|
|
96
|
+
NOTE: double-star globs (`**`) in `exclude` patterns match one or more path segments,
|
|
97
|
+
unlike gitignore syntax, where they match zero or more.
|
|
98
|
+
|
|
99
|
+
### Exceptions
|
|
100
|
+
|
|
101
|
+
canonzip will raise standard errors if it cannot read or write files,
|
|
102
|
+
typically inheriting from `OSError`.
|
|
103
|
+
|
|
104
|
+
Additionally, some special cases raise errors that inherit
|
|
105
|
+
from `canonzip.CanonzipError`:
|
|
106
|
+
|
|
107
|
+
| Exception | Raised when |
|
|
108
|
+
|---|---|
|
|
109
|
+
| `OutputPathError` | `output_path` is inside `target` |
|
|
110
|
+
| `GitRepositoryError` | `gitignore=True` but target is not in a git repo |
|
|
111
|
+
| `BrokenSymlinkError` | A broken symlink is encountered with `follow_symlinks=True` |
|
|
112
|
+
| `SymlinkCycleError` | A symlink cycle is detected with `follow_symlinks=True` |
|
|
113
|
+
|
|
114
|
+
### Advanced: build manifests explicitly
|
|
115
|
+
|
|
116
|
+
If you need direct access to the list of files that *would* be included in the
|
|
117
|
+
canonical hash or zip, you can use `build_manifest` to read the target
|
|
118
|
+
directory and return a `Manifest` object containing the list of files.
|
|
119
|
+
To save yourself from having to generate the manifest twice, you can then pass
|
|
120
|
+
it directly to `hash_from_manifest` or `zip_from_manifest` to complete the
|
|
121
|
+
operation.
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from canonzip import build_manifest, hash_from_manifest, zip_from_manifest
|
|
125
|
+
|
|
126
|
+
manifest = build_manifest("path/to/target", exclude=[".venv"])
|
|
127
|
+
|
|
128
|
+
# Do something interesting with the manifest...
|
|
129
|
+
print(manifest.target.as_posix())
|
|
130
|
+
|
|
131
|
+
for entry in manifest.entries:
|
|
132
|
+
print(entry.path.as_posix())
|
|
133
|
+
|
|
134
|
+
# Then compute the hash or zip
|
|
135
|
+
digest = hash_from_manifest(manifest)
|
|
136
|
+
zip_from_manifest("path/to/output.zip", manifest)
|
|
137
|
+
```
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "canonzip"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
requires-python = ">=3.11"
|
|
5
|
+
|
|
6
|
+
authors = [{ name = "Tyler Coles", email = "tylercoles@javadocmd.com" }]
|
|
7
|
+
description = "Produce canonical zips and hashes."
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
license = "MIT"
|
|
10
|
+
license-files = ["LICENSE"]
|
|
11
|
+
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 5 - Production/Stable",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"Operating System :: OS Independent",
|
|
16
|
+
"Topic :: Utilities",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Programming Language :: Python :: 3.13",
|
|
21
|
+
"Programming Language :: Python :: 3.14",
|
|
22
|
+
"Typing :: Typed",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
dependencies = [
|
|
26
|
+
"pygit2==1.19.1",
|
|
27
|
+
"typer==0.24.1",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Repository = "https://github.com/JavadocMD/canonzip.git"
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
canonzip = "canonzip.cli:app"
|
|
35
|
+
|
|
36
|
+
[build-system]
|
|
37
|
+
requires = ["uv_build>=0.11.0,<0.12.0"]
|
|
38
|
+
build-backend = "uv_build"
|
|
39
|
+
|
|
40
|
+
[dependency-groups]
|
|
41
|
+
dev = [
|
|
42
|
+
"pre-commit==4.5.1",
|
|
43
|
+
"pytest==9.0.2",
|
|
44
|
+
"ruff==0.15.7",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
[tool.ruff]
|
|
48
|
+
extend-exclude = ["examples"]
|
|
49
|
+
|
|
50
|
+
[tool.ruff.lint]
|
|
51
|
+
preview = true
|
|
52
|
+
select = ["ALL"]
|
|
53
|
+
|
|
54
|
+
# Ruff ignores:
|
|
55
|
+
# - CPY: no need for copyright notice at the top of every file
|
|
56
|
+
# - COM812: Ruff recommends disabling this when using formatter
|
|
57
|
+
ignore = ["CPY", "COM812"]
|
|
58
|
+
|
|
59
|
+
[tool.ruff.lint.extend-per-file-ignores]
|
|
60
|
+
# - S101: we are allowed to use `assert` in pytest
|
|
61
|
+
# - D103: no docstrings required
|
|
62
|
+
"tests/**/*.py" = ["S101", "D103"]
|
|
63
|
+
|
|
64
|
+
[tool.ruff.lint.flake8-builtins]
|
|
65
|
+
ignorelist = ["hash", "zip"]
|
|
66
|
+
|
|
67
|
+
[tool.ruff.lint.pydocstyle]
|
|
68
|
+
convention = "google"
|
|
69
|
+
|
|
70
|
+
[tool.ruff.format]
|
|
71
|
+
preview = true
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""canonzip is a library for producing canonical zips and hashes."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import version
|
|
4
|
+
|
|
5
|
+
from canonzip.exceptions import (
|
|
6
|
+
BrokenSymlinkError,
|
|
7
|
+
CanonzipError,
|
|
8
|
+
GitRepositoryError,
|
|
9
|
+
OutputPathError,
|
|
10
|
+
SymlinkCycleError,
|
|
11
|
+
)
|
|
12
|
+
from canonzip.hashing import hash, hash_from_manifest
|
|
13
|
+
from canonzip.manifest import FileEntry, Manifest, build_manifest
|
|
14
|
+
from canonzip.zipping import zip, zip_from_manifest
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"BrokenSymlinkError",
|
|
18
|
+
"CanonzipError",
|
|
19
|
+
"FileEntry",
|
|
20
|
+
"GitRepositoryError",
|
|
21
|
+
"Manifest",
|
|
22
|
+
"OutputPathError",
|
|
23
|
+
"SymlinkCycleError",
|
|
24
|
+
"build_manifest",
|
|
25
|
+
"hash",
|
|
26
|
+
"hash_from_manifest",
|
|
27
|
+
"zip",
|
|
28
|
+
"zip_from_manifest",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
__version__: str = version("canonzip")
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Command-line interface for canonzip."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Annotated
|
|
8
|
+
|
|
9
|
+
import typer
|
|
10
|
+
|
|
11
|
+
from canonzip.exceptions import CanonzipError
|
|
12
|
+
from canonzip.hashing import hash_from_manifest
|
|
13
|
+
from canonzip.manifest import build_manifest
|
|
14
|
+
from canonzip.zipping import zip_from_manifest
|
|
15
|
+
|
|
16
|
+
app = typer.Typer(no_args_is_help=True, help="Produce canonical zips and hashes.")
|
|
17
|
+
|
|
18
|
+
ExcludeOption = Annotated[
|
|
19
|
+
list[str] | None,
|
|
20
|
+
typer.Option("--exclude", "-e", help="Glob pattern to exclude (repeatable)."),
|
|
21
|
+
]
|
|
22
|
+
GitignoreOption = Annotated[
|
|
23
|
+
bool,
|
|
24
|
+
typer.Option("--gitignore", help="Exclude files matching .gitignore patterns."),
|
|
25
|
+
]
|
|
26
|
+
FollowSymlinksOption = Annotated[
|
|
27
|
+
bool,
|
|
28
|
+
typer.Option("--follow-symlinks", help="Follow symbolic links."),
|
|
29
|
+
]
|
|
30
|
+
VerboseOption = Annotated[
|
|
31
|
+
bool,
|
|
32
|
+
typer.Option("--verbose", "-v", help="Print included file paths to stderr."),
|
|
33
|
+
]
|
|
34
|
+
JsonOption = Annotated[
|
|
35
|
+
bool,
|
|
36
|
+
typer.Option("--json", help="Output result as JSON."),
|
|
37
|
+
]
|
|
38
|
+
TargetArgument = Annotated[
|
|
39
|
+
Path,
|
|
40
|
+
typer.Argument(
|
|
41
|
+
exists=True,
|
|
42
|
+
file_okay=False,
|
|
43
|
+
dir_okay=True,
|
|
44
|
+
resolve_path=True,
|
|
45
|
+
help="Target directory.",
|
|
46
|
+
),
|
|
47
|
+
]
|
|
48
|
+
OutputArgument = Annotated[
|
|
49
|
+
Path,
|
|
50
|
+
typer.Argument(dir_okay=False, help="Output zip file path."),
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@app.command("hash")
def hash_command(  # noqa: PLR0913
    target: TargetArgument,
    *,
    exclude: ExcludeOption = None,
    gitignore: GitignoreOption = False,
    follow_symlinks: FollowSymlinksOption = False,
    verbose: VerboseOption = False,
    output_json: JsonOption = False,
) -> None:
    """Compute a canonical hash of a directory."""  # noqa: DOC501
    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept short; implementation notes live in comments instead.
    try:
        # Walk the target once; the same manifest feeds both the verbose
        # listing and the digest.
        manifest = build_manifest(
            target,
            exclude=exclude,
            gitignore=gitignore,
            follow_symlinks=follow_symlinks,
        )
        if verbose:
            # Included paths go to stderr so stdout carries only the result.
            for included in manifest.relative_paths:
                typer.echo(included, err=True)
        digest = hash_from_manifest(manifest)
        if output_json:
            typer.echo(json.dumps({"hash": digest}))
        else:
            typer.echo(digest)
    except CanonzipError as error:
        # Report canonzip's own errors as a plain message plus exit code 1,
        # without a traceback.
        typer.echo(str(error), err=True)
        raise typer.Exit(code=1) from None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@app.command("zip")
def zip_command(  # noqa: PLR0913
    output_path: OutputArgument,
    target: TargetArgument,
    *,
    exclude: ExcludeOption = None,
    gitignore: GitignoreOption = False,
    follow_symlinks: FollowSymlinksOption = False,
    verbose: VerboseOption = False,
    output_json: JsonOption = False,
) -> None:
    """Create a canonical zip archive of a directory."""  # noqa: DOC501
    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept short; implementation notes live in comments instead.
    try:
        # Walk the target once; the same manifest feeds the verbose listing,
        # the archive, and (optionally) the digest.
        manifest = build_manifest(
            target,
            exclude=exclude,
            gitignore=gitignore,
            follow_symlinks=follow_symlinks,
        )
        if verbose:
            # Included paths go to stderr so stdout carries only the result.
            for included in manifest.relative_paths:
                typer.echo(included, err=True)
        zip_from_manifest(output_path, manifest)
        if output_json:
            # The JSON result reports the canonical hash of the manifest,
            # computed only when JSON output was requested.
            typer.echo(json.dumps({"hash": hash_from_manifest(manifest)}))
    except CanonzipError as error:
        # Report canonzip's own errors as a plain message plus exit code 1,
        # without a traceback.
        typer.echo(str(error), err=True)
        raise typer.Exit(code=1) from None
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Custom exception types for canonzip."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"BrokenSymlinkError",
|
|
12
|
+
"CanonzipError",
|
|
13
|
+
"GitRepositoryError",
|
|
14
|
+
"OutputPathError",
|
|
15
|
+
"SymlinkCycleError",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CanonzipError(Exception):
    """Base exception for all canonzip errors."""


class SymlinkCycleError(CanonzipError):
    """A symlink cycle was detected during directory traversal."""

    path: Path
    """The path where the cycle was detected."""

    def __init__(self, path: Path) -> None:
        """Initialize with the path where the cycle was detected."""
        super().__init__(f"Symlink cycle detected at {path}")
        self.path = path

    def __reduce__(self) -> tuple[object, ...]:
        """Support pickling and copying by replaying the constructor argument.

        The default reduction replays ``self.args`` (the formatted message)
        into ``__init__``, which would format the message a second time.

        Returns:
            The class, its constructor arguments, and the instance state.
        """
        return (type(self), (self.path,), self.__dict__)


class BrokenSymlinkError(CanonzipError):
    """A broken symlink was found during directory traversal."""

    path: Path
    """The path of the broken symlink."""

    def __init__(self, path: Path) -> None:
        """Initialize with the path of the broken symlink."""
        super().__init__(f"Broken symlink found at {path}")
        self.path = path

    def __reduce__(self) -> tuple[object, ...]:
        """Support pickling and copying by replaying the constructor argument.

        The default reduction replays ``self.args`` (the formatted message)
        into ``__init__``, which would format the message a second time.

        Returns:
            The class, its constructor arguments, and the instance state.
        """
        return (type(self), (self.path,), self.__dict__)


class GitRepositoryError(CanonzipError):
    """The target is not in a valid (non-bare) git repository."""

    path: Path
    """The path that is not in a valid git repository."""

    def __init__(self, path: Path) -> None:
        """Initialize with the path that is not in a valid git repository."""
        super().__init__(f"Not a valid (non-bare) git repository: {path}")
        self.path = path

    def __reduce__(self) -> tuple[object, ...]:
        """Support pickling and copying by replaying the constructor argument.

        The default reduction replays ``self.args`` (the formatted message)
        into ``__init__``, which would format the message a second time.

        Returns:
            The class, its constructor arguments, and the instance state.
        """
        return (type(self), (self.path,), self.__dict__)


class OutputPathError(CanonzipError):
    """The output path is inside the target directory."""

    output_path: Path
    """The (invalid) output path attempted."""
    target: Path
    """The target directory that contains the output path."""

    def __init__(self, output_path: Path, target: Path) -> None:
        """Initialize with the output path and target directory."""
        super().__init__(
            f"Output path ({output_path}) cannot be inside target directory ({target})",
        )
        self.output_path = output_path
        self.target = target

    def __reduce__(self) -> tuple[object, ...]:
        """Support pickling and copying by replaying both constructor arguments.

        The default reduction replays ``self.args`` (a single formatted
        message) into ``__init__``, which requires two arguments and would
        therefore fail with ``TypeError`` when unpickling or copying.

        Returns:
            The class, its constructor arguments, and the instance state.
        """
        return (type(self), (self.output_path, self.target), self.__dict__)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Implements canonical hashing of directories."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from canonzip.manifest import build_manifest
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Sequence
|
|
13
|
+
from os import PathLike
|
|
14
|
+
|
|
15
|
+
from canonzip.manifest import Manifest
|
|
16
|
+
|
|
17
|
+
__all__ = ["hash", "hash_from_manifest"]
|
|
18
|
+
|
|
19
|
+
CHUNK_SIZE = 1024 * 1024
|
|
20
|
+
"""The chunk size to use when reading files for hashing, in bytes.
|
|
21
|
+
|
|
22
|
+
1 MiB is a good balance between performance and memory usage.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def hash(
    target: str | PathLike[str],
    *,
    exclude: Sequence[str] | None = None,
    gitignore: bool = False,
    follow_symlinks: bool = False,
) -> str:
    """Return the canonical hash of the directory found at ``target``.

    The result is intended to be stable across platforms and runs for as long
    as the directory's structure and file contents stay the same.

    Args:
        target: Path of the directory to hash; it is resolved before use.
        exclude: Optional glob patterns naming paths to leave out of the hash.
            Patterns follow pathlib's ``match()`` semantics, so "**" does not
            carry its gitignore-style recursive meaning.
        gitignore: If True, also leave out files matched by .gitignore rules;
            the target must then be inside a valid git repository.
        follow_symlinks: If True, symbolic links are followed; if False, they
            are ignored.

    Returns:
        A stable SHA-1 hex digest covering the files' relative paths and contents.
    """
    resolved_target = Path(target).resolve()
    return hash_from_manifest(
        build_manifest(
            resolved_target,
            exclude=exclude,
            gitignore=gitignore,
            follow_symlinks=follow_symlinks,
        ),
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def hash_from_manifest(manifest: Manifest) -> str:
    """Return the canonical hash for an already-built manifest.

    Handy when the manifest is inspected first (for example to print the
    included paths), since the directory then only has to be walked once.

    Args:
        manifest: The pre-built manifest whose entries are hashed in order.

    Returns:
        A stable SHA-1 hex digest covering the files' relative paths and contents.
    """
    hasher = hashlib.sha1()  # noqa: S324 (SHA-1 is not used for security here)
    for entry in manifest.entries:
        # Each file contributes two length-prefixed fields: its relative
        # POSIX path, then its content. The 8-byte big-endian lengths act as
        # separators; without them "foobar" containing "baz" and "foo"
        # containing "barbaz" would feed identical bytes to the hasher.
        name = entry.path.relative_to(manifest.target).as_posix().encode("utf-8")
        hasher.update(len(name).to_bytes(8, "big"))
        hasher.update(name)
        hasher.update(entry.size.to_bytes(8, "big"))
        with entry.path.open("rb") as stream:
            # Stream the file in fixed-size pieces to bound memory usage.
            while data := stream.read(CHUNK_SIZE):
                hasher.update(data)

    return hasher.hexdigest()
|
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
"""Implements canonical ordering of files in directories.
|
|
2
|
+
|
|
3
|
+
Hashing and zipping use this manifest to ensure consistent behavior.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import stat
|
|
9
|
+
from collections.abc import Callable, Generator, Iterable, Sequence
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path, PurePosixPath
|
|
12
|
+
|
|
13
|
+
import pygit2
|
|
14
|
+
|
|
15
|
+
from canonzip.exceptions import (
|
|
16
|
+
BrokenSymlinkError,
|
|
17
|
+
GitRepositoryError,
|
|
18
|
+
SymlinkCycleError,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
__all__ = ["FileEntry", "Manifest", "build_manifest"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True, slots=True)
|
|
25
|
+
class FileEntry:
|
|
26
|
+
"""Represents a file or directory entry in the manifest."""
|
|
27
|
+
|
|
28
|
+
path: Path
|
|
29
|
+
"""The absolute path to the file or directory."""
|
|
30
|
+
path_relative: Path
|
|
31
|
+
"""The path relative to the manifest target directory."""
|
|
32
|
+
mode: int
|
|
33
|
+
"""The file mode (permissions and type) as returned by os.stat()."""
|
|
34
|
+
is_dir: bool
|
|
35
|
+
"""Indicates if the entry is a directory."""
|
|
36
|
+
is_file: bool
|
|
37
|
+
"""Indicates if the entry is a file."""
|
|
38
|
+
is_symlink: bool
|
|
39
|
+
"""Indicates if the entry is a symbolic link."""
|
|
40
|
+
is_broken: bool
|
|
41
|
+
"""Indicates if the entry is a broken symbolic link."""
|
|
42
|
+
size: int
|
|
43
|
+
"""The size of the file in bytes."""
|
|
44
|
+
|
|
45
|
+
@staticmethod
|
|
46
|
+
def from_path(path: Path, path_relative: Path) -> FileEntry:
|
|
47
|
+
"""Create a FileEntry from a given path, gathering necessary metadata.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
path: The absolute path to a file or directory.
|
|
51
|
+
path_relative: The path relative to the manifest target directory.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
The FileEntry object.
|
|
55
|
+
"""
|
|
56
|
+
# Take care to handle broken symlinks correctly.
|
|
57
|
+
# First stat without following symlinks to determine if it's a symlink.
|
|
58
|
+
# If so, stat again with following enabled to determine if it's broken.
|
|
59
|
+
stat_nofollow = path.stat(follow_symlinks=False)
|
|
60
|
+
is_symlink = stat.S_ISLNK(stat_nofollow.st_mode)
|
|
61
|
+
if not is_symlink:
|
|
62
|
+
stat_result = stat_nofollow
|
|
63
|
+
is_broken = False
|
|
64
|
+
else:
|
|
65
|
+
try:
|
|
66
|
+
stat_result = path.stat(follow_symlinks=True)
|
|
67
|
+
is_broken = False
|
|
68
|
+
except OSError:
|
|
69
|
+
stat_result = stat_nofollow
|
|
70
|
+
is_broken = True
|
|
71
|
+
mode = stat_result.st_mode
|
|
72
|
+
return FileEntry(
|
|
73
|
+
path=path,
|
|
74
|
+
path_relative=path_relative,
|
|
75
|
+
mode=mode,
|
|
76
|
+
is_dir=stat.S_ISDIR(mode),
|
|
77
|
+
is_file=stat.S_ISREG(mode),
|
|
78
|
+
is_symlink=is_symlink,
|
|
79
|
+
is_broken=is_broken,
|
|
80
|
+
size=stat_result.st_size,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass(frozen=True, slots=True)
|
|
85
|
+
class Manifest:
|
|
86
|
+
"""A canonical ordered collection of files for a given target directory."""
|
|
87
|
+
|
|
88
|
+
target: Path
|
|
89
|
+
"""The resolved path of the target directory."""
|
|
90
|
+
entries: tuple[FileEntry, ...]
|
|
91
|
+
"""The canonical ordered file entries."""
|
|
92
|
+
|
|
93
|
+
@property
|
|
94
|
+
def relative_paths(self) -> Iterable[str]:
|
|
95
|
+
"""The relative paths of the manifest entries (posix-formatted strings)."""
|
|
96
|
+
return (entry.path_relative.as_posix() for entry in self.entries)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# Returning True drops the entry; the exclude_* factories below each wrap a
# previous predicate, so a chain of them acts as a logical "or".
ExcludePredicate = Callable[[FileEntry], bool]
|
|
100
|
+
"""A predicate function: should a FileEntry be excluded from the manifest?"""
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def exclude_none(_file_entry: FileEntry) -> bool:
    """Serve as the base exclusion predicate, which rejects nothing.

    Returns:
        Always False, whatever entry is passed in.
    """
    return False
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def exclude_symlinks(prev: ExcludePredicate) -> ExcludePredicate:
    """Extend an exclusion predicate so that it also rejects symbolic links.

    Args:
        prev: The predicate consulted for entries that are not symlinks.

    Returns:
        A predicate that is truthy for symlinks and otherwise gives prev's verdict.
    """

    def _symlink_or_prev(file_entry: FileEntry) -> bool:
        linked = file_entry.is_symlink
        # prev is only consulted when the entry is not a symlink.
        return linked if linked else prev(file_entry)

    return _symlink_or_prev
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def exclude_by_patterns(
|
|
118
|
+
patterns: Sequence[str],
|
|
119
|
+
prev: ExcludePredicate,
|
|
120
|
+
) -> ExcludePredicate:
|
|
121
|
+
"""Exclude files matching any of the given glob patterns.""" # noqa: DOC201
|
|
122
|
+
patterns_list = list(patterns)
|
|
123
|
+
|
|
124
|
+
def exclude(file_entry: FileEntry) -> bool:
|
|
125
|
+
path = PurePosixPath(file_entry.path_relative)
|
|
126
|
+
match = any(
|
|
127
|
+
any(candidate.match(p) for p in patterns_list) # check each pattern
|
|
128
|
+
for candidate in [path, *path.parents[:-1]] # check each path and parents
|
|
129
|
+
)
|
|
130
|
+
return match or prev(file_entry)
|
|
131
|
+
|
|
132
|
+
return exclude
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def exclude_gitignored(path: Path, prev: ExcludePredicate) -> ExcludePredicate:
    """Extend an exclusion predicate with the ignore rules of path's git repository.

    Args:
        path: The location used to open the git repository.
        prev: The predicate consulted when an entry is not git-ignored.

    Returns:
        A predicate that is truthy for ignored entries and otherwise gives
        prev's verdict.

    Raises:
        GitRepositoryError: If opening the repository raises a GitError, or the
            repository is bare.
    """
    try:
        repo = pygit2.Repository(path)
    except pygit2.GitError:
        raise GitRepositoryError(path) from None
    if repo.is_bare:
        raise GitRepositoryError(path) from None
    worktree_root = Path(repo.workdir).resolve()

    def _ignored_or_prev(file_entry: FileEntry) -> bool:
        repo_relative = file_entry.path.relative_to(worktree_root).as_posix()
        # Directories are queried with a trailing slash
        # (presumably so directory-only ignore patterns apply; confirm).
        query = f"{repo_relative}/" if file_entry.is_dir else repo_relative
        return repo.path_is_ignored(query) or prev(file_entry)

    return _ignored_or_prev
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def build_manifest(
    target: Path,
    *,
    exclude: Sequence[str] | None = None,
    gitignore: bool = False,
    follow_symlinks: bool = False,
) -> Manifest:
    """Build the canonical manifest for the given target directory.

    Args:
        target: The target directory to build the manifest for.
        exclude: Optional sequence of glob patterns to exclude from the manifest.
            Uses pathlib.Path.match() syntax, so "**" globs are not supported.
        gitignore: If True, exclude files matching .gitignore patterns.
            If True, the target directory must be in a valid git repository.
        follow_symlinks: If True, follow symbolic links; if False, ignore them.

    Returns:
        A Manifest containing the resolved target path and its canonical file entries.

    Raises:
        FileNotFoundError: If the target directory does not exist.
        NotADirectoryError: If the target path is not a directory.
        SymlinkCycleError: If a symlink cycle is detected.
        GitRepositoryError: If gitignore=True but the target is not in a valid
            git repository.
        BrokenSymlinkError: If a broken symlink is found when follow_symlinks=True.
    """  # noqa: DOC502
    if not target.exists():
        raise FileNotFoundError(target)
    if not target.is_dir():
        raise NotADirectoryError(target)
    resolved = target.resolve()

    # Each selected option wraps the predicate built so far, so the final
    # predicate excludes an entry as soon as any one criterion matches.
    predicate: ExcludePredicate = exclude_none
    if not follow_symlinks:
        predicate = exclude_symlinks(predicate)
    if exclude:
        predicate = exclude_by_patterns(exclude, predicate)
    if gitignore:
        predicate = exclude_gitignored(resolved, predicate)

    entries = tuple(
        walk_directory(
            resolved,
            resolved,
            walked_dirs=set(),
            exclude=predicate,
            follow_symlinks=follow_symlinks,
        )
    )
    return Manifest(target=resolved, entries=entries)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def walk_directory(
    root: Path,
    path: Path,
    *,
    walked_dirs: set[Path],
    exclude: ExcludePredicate,
    follow_symlinks: bool,
) -> Generator[FileEntry, None, None]:
    """Recursively yield FileEntry objects for non-excluded files in a directory.

    Entries that are neither regular files nor directories are skipped.

    Args:
        root: The root directory for relative path calculations.
        path: The directory to walk.
        walked_dirs: The directories currently being walked, i.e. the chain of
            ancestors leading to `path`. Pass an empty set for a top-level walk.
            The set is mutated while the walk is in progress; each directory is
            removed again once it has been fully walked.
        exclude: A predicate that returns True for files that should be excluded.
        follow_symlinks: If True, a broken symlink raises BrokenSymlinkError.
            Filtering of symlinks themselves is left to `exclude`.

    Yields:
        FileEntry objects for files that are not excluded.

    Raises:
        SymlinkCycleError: If a symlink cycle is detected.
        BrokenSymlinkError: If a broken symlink is found when follow_symlinks=True.
    """
    # A cycle means this directory is one of its own ancestors. Only the active
    # ancestor chain is compared, so a directory that is merely reachable twice
    # (e.g. a real directory and a sibling symlink to it) is not reported.
    if any(path.samefile(ancestor) for ancestor in walked_dirs):
        raise SymlinkCycleError(path)
    walked_dirs.add(path)

    try:
        # Sorting entries by name ensures a stable order regardless of
        # filesystem behavior.
        for entry in sorted(path.iterdir(), key=lambda p: p.name):
            file_entry = FileEntry.from_path(entry, entry.relative_to(root))
            if follow_symlinks and file_entry.is_broken:
                raise BrokenSymlinkError(entry)
            if exclude(file_entry):
                continue

            if file_entry.is_file:
                yield file_entry
            elif file_entry.is_dir:
                yield from walk_directory(
                    root,
                    entry,
                    walked_dirs=walked_dirs,
                    exclude=exclude,
                    follow_symlinks=follow_symlinks,
                )
    finally:
        # Leaving this directory: it is no longer an ancestor of anything that
        # is walked afterwards.
        walked_dirs.discard(path)
|
|
File without changes
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Implements canonical zipping of directories."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import shutil
|
|
6
|
+
import stat
|
|
7
|
+
import zipfile
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
from canonzip.exceptions import OutputPathError
|
|
12
|
+
from canonzip.manifest import build_manifest
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from collections.abc import Sequence
|
|
16
|
+
from os import PathLike
|
|
17
|
+
|
|
18
|
+
from canonzip.manifest import Manifest
|
|
19
|
+
|
|
20
|
+
__all__ = ["zip", "zip_from_manifest"]
|
|
21
|
+
|
|
22
|
+
# Passed as the buffer length to shutil.copyfileobj when streaming files into the zip.
CHUNK_SIZE = 1024 * 1024
|
|
23
|
+
"""The chunk size to use when copying files into the zip, in bytes.
|
|
24
|
+
|
|
25
|
+
1 MiB is a good balance between performance and memory usage.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def zip(
    output_path: str | PathLike[str],
    target: str | PathLike[str],
    *,
    exclude: Sequence[str] | None = None,
    gitignore: bool = False,
    follow_symlinks: bool = False,
) -> None:
    """Write a canonical zip file of the directory at the given target path.

    The manifest is built from the resolved target and then handed to
    zip_from_manifest().

    Args:
        output_path: The path to write the zip file to.
            Must not be inside the target directory.
        target: The target directory to zip.
        exclude: Optional sequence of glob patterns to exclude from the zip.
            Uses pathlib.Path.match() syntax, so "**" globs are not supported.
        gitignore: If True, exclude files matching .gitignore patterns.
            If True, the target directory must be in a valid git repository.
        follow_symlinks: If True, follow symbolic links; if False, ignore them.
    """
    source_dir = Path(target).resolve()
    manifest = build_manifest(
        source_dir,
        exclude=exclude,
        gitignore=gitignore,
        follow_symlinks=follow_symlinks,
    )
    zip_from_manifest(output_path, manifest)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def zip_from_manifest(output_path: str | PathLike[str], manifest: Manifest) -> None:
    """Create a canonical zip file from a pre-built manifest.

    This is useful when you want to inspect the manifest (e.g. to print
    included paths) before zipping, without walking the directory twice.

    Every member is written with a fixed timestamp, UNIX attributes and a
    normalized mode; missing parent directories of the output path are created.

    Args:
        output_path: The path to write the zip file to.
            Must not be inside the target directory.
        manifest: The pre-built manifest whose entries to include.

    Raises:
        OutputPathError: If the output path is inside the target directory.
    """
    destination = Path(output_path).resolve()
    target_path = manifest.target

    # Writing the archive into the directory being archived is ruled out up
    # front rather than relying on it happening to work.
    if destination.is_relative_to(target_path):
        raise OutputPathError(destination, target_path)

    destination.parent.mkdir(parents=True, exist_ok=True)

    fixed_timestamp = (1980, 1, 1, 0, 0, 0)
    compression = zipfile.ZIP_DEFLATED
    # NOTE(review): when ZipFile.open() is given a ZipInfo, compression settings
    # appear to come from that ZipInfo, so compresslevel=9 may not apply to
    # these members. Confirm before changing: it would alter the output bytes.
    with zipfile.ZipFile(
        destination,
        mode="w",
        compression=compression,
        compresslevel=9,
    ) as archive:
        for entry in manifest.entries:
            rel_path = entry.path.relative_to(target_path)
            # This seems like a decent reference on zip file format:
            # https://pkwaredownloads.blob.core.windows.net/pkware-general/Documentation/APPNOTE-6.3.9.TXT
            # create_system=3 indicates file attributes are UNIX compatible
            # attr is shifted because left two bytes are for UNIX, right two for Windows
            info = zipfile.ZipInfo(rel_path.as_posix(), fixed_timestamp)
            info.compress_type = compression
            info.create_system = 3
            info.external_attr = normalized_mode(entry.mode) << 16
            # Declare the expected size so zipfile switches to ZIP64 headers for
            # members over 2 GiB instead of failing part-way through the write.
            # The real size is recorded when the member is closed.
            info.file_size = entry.size
            with entry.path.open("rb") as src, archive.open(info, "w") as dst:
                shutil.copyfileobj(src, dst, length=CHUNK_SIZE)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def normalized_mode(mode: int) -> int:
    """Collapse a file mode to one of two fixed permission sets.

    This keeps the modes of zipped files stable across platforms.

    Args:
        mode: A file mode, e.g. as reported by os.stat().

    Returns:
        0o755 if any execute bit (user, group or other) is set, otherwise 0o644.
    """
    any_execute = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    return 0o755 if mode & any_execute else 0o644
|