canonzip 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
canonzip-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Tyler Coles
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,161 @@
1
+ Metadata-Version: 2.4
2
+ Name: canonzip
3
+ Version: 1.0.0
4
+ Summary: Produce canonical zips and hashes.
5
+ Author: Tyler Coles
6
+ Author-email: Tyler Coles <tylercoles@javadocmd.com>
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Classifier: Development Status :: 5 - Production/Stable
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Topic :: Utilities
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Programming Language :: Python :: 3.14
18
+ Classifier: Typing :: Typed
19
+ Requires-Dist: pygit2==1.19.1
20
+ Requires-Dist: typer==0.24.1
21
+ Requires-Python: >=3.11
22
+ Project-URL: Repository, https://github.com/JavadocMD/canonzip.git
23
+ Description-Content-Type: text/markdown
24
+
25
+ # canonzip
26
+
27
+ Produce canonical zips and hashes from directory contents.
28
+
29
+ A canonical zip produces the exact same file for the same inputs,
30
+ regardless of when it was made or what machine made it.
31
+
32
+ A canonical hash produces the exact same hash for the same inputs,
33
+ regardless of when it was made or what machine made it.
34
+
35
+ This is particularly useful when zipping things like code for
36
+ AWS Lambda Functions, where you want to upload a new zip if and
37
+ only if the code has truly changed.
38
+
39
+ canonzip supports two usage modes: as a CLI or as an API.
40
+
41
+ Check out [`examples/terraform-aws-lambda`](./examples/terraform-aws-lambda/)
42
+ for an example use-case.
43
+
44
+ ## Command Line Interface (CLI)
45
+
46
+ ### `canonzip hash [OPTIONS] TARGET`
47
+
48
+ Print a canonical SHA-1 hash of `TARGET` to stdout.
49
+
50
+ ```
51
+ $ canonzip hash path/to/target
52
+ 4959e4b9a1812e511570eee14fe65b90098a0db6
53
+ ```
54
+
55
+ ### `canonzip zip [OPTIONS] OUTPUT_PATH TARGET`
56
+
57
+ Write a canonical zip archive of `TARGET` to `OUTPUT_PATH`.
58
+
59
+ ```
60
+ $ canonzip zip path/to/output.zip path/to/target
61
+ ```
62
+
63
+ NOTE: the output of `hash` is *NOT* the same as the SHA-1 hash of the output
64
+ from `zip`. `hash` is specifically designed to avoid the extra overhead of
65
+ writing a zip file while fulfilling a similar use-case &mdash; detecting
66
+ changes in the files.
67
+
68
+ ### CLI options
69
+
70
+ Both commands accept:
71
+
72
+ | Option | Description |
73
+ |---|---|
74
+ | `--exclude TEXT, -e TEXT` | Glob pattern to exclude (repeatable) |
75
+ | `--gitignore` | Exclude files based on `.gitignore` rules from the target's git repository |
76
+ | `--follow-symlinks` | Follow symbolic links; otherwise symlinks are ignored |
77
+ | `--verbose, -v` | Print included file paths (relative to target) to stderr |
78
+ | `--json` | Output result as JSON (e.g. `{"hash": "..."}`) |
79
+
80
+ If you specify both `exclude` and `gitignore`, files will be excluded as long
81
+ as they match at least one rule (logical or).
82
+
83
+ NOTE: in `exclude` patterns, double-star globs (`**`) match one-or-more path
84
+ segments, contrary to gitignore syntax, where they match zero-or-more.
85
+
86
+ ## Programmatic Interface (API)
87
+
88
+ ### `canonzip.hash(target, *, exclude, gitignore, follow_symlinks) -> str`
89
+
90
+ Compute a canonical SHA-1 hash of a directory.
91
+
92
+ ```python
93
+ import canonzip
94
+
95
+ digest = canonzip.hash("path/to/target")
96
+ #> "4959e4b9a1812e511570eee14fe65b90098a0db6"
97
+ ```
98
+
99
+ ### `canonzip.zip(output_path, target, *, exclude, gitignore, follow_symlinks) -> None`
100
+
101
+ Create a canonical zip archive of a directory.
102
+
103
+ ```python
104
+ canonzip.zip("path/to/output.zip", "path/to/target")
105
+ ```
106
+
107
+ ### Shared options
108
+
109
+ Both functions accept:
110
+
111
+ | Parameter | Type | Default | Description |
112
+ |---|---|---|---|
113
+ | `exclude` | `list[str] \| None` | `None` | Glob patterns to exclude |
114
+ | `gitignore` | `bool` | `False` | Exclude files based on `.gitignore` rules from the target's git repository |
115
+ | `follow_symlinks` | `bool` | `False` | Follow symbolic links; if `False`, symlinks are ignored |
116
+
117
+ If you specify both `exclude` and `gitignore`, files will be excluded as long
118
+ as they match at least one rule (logical or).
119
+
120
+ NOTE: in `exclude` patterns, double-star globs (`**`) match one-or-more path
121
+ segments, contrary to gitignore syntax, where they match zero-or-more.
122
+
123
+ ### Exceptions
124
+
125
+ canonzip will raise standard errors if it cannot read or write files,
126
+ typically inheriting from `OSError`.
127
+
128
+ Additionally, a few special cases raise errors that inherit
129
+ from `canonzip.CanonzipError`:
130
+
131
+ | Exception | Raised when |
132
+ |---|---|
133
+ | `OutputPathError` | `output_path` is inside `target` |
134
+ | `GitRepositoryError` | `gitignore=True` but target is not in a git repo |
135
+ | `BrokenSymlinkError` | A broken symlink is encountered with `follow_symlinks=True` |
136
+ | `SymlinkCycleError` | A symlink cycle is detected with `follow_symlinks=True` |
137
+
138
+ ### Advanced: build manifests explicitly
139
+
140
+ If you need direct access to the list of files that *would* be included in the
141
+ canonical hash or zip, you can use `build_manifest` to read the target
142
+ directory and return a `Manifest` object containing the list of files.
143
+ To save yourself from having to generate the manifest twice, you can then pass
144
+ it directly to `hash_from_manifest` or `zip_from_manifest` to complete the
145
+ operation.
146
+
147
+ ```python
148
+ from canonzip import build_manifest, hash_from_manifest, zip_from_manifest
149
+
150
+ manifest = build_manifest("path/to/target", exclude=[".venv"])
151
+
152
+ # Do something interesting with the manifest...
153
+ print(manifest.target.as_posix())
154
+
155
+ for entry in manifest.entries:
156
+ print(entry.path.as_posix())
157
+
158
+ # Then compute the hash or zip
159
+ digest = hash_from_manifest(manifest)
160
+ zip_from_manifest("path/to/output.zip", manifest)
161
+ ```
@@ -0,0 +1,137 @@
1
+ # canonzip
2
+
3
+ Produce canonical zips and hashes from directory contents.
4
+
5
+ A canonical zip produces the exact same file for the same inputs,
6
+ regardless of when it was made or what machine made it.
7
+
8
+ A canonical hash produces the exact same hash for the same inputs,
9
+ regardless of when it was made or what machine made it.
10
+
11
+ This is particularly useful when zipping things like code for
12
+ AWS Lambda Functions, where you want to upload a new zip if and
13
+ only if the code has truly changed.
14
+
15
+ canonzip supports two usage modes: as a CLI or as an API.
16
+
17
+ Check out [`examples/terraform-aws-lambda`](./examples/terraform-aws-lambda/)
18
+ for an example use-case.
19
+
20
+ ## Command Line Interface (CLI)
21
+
22
+ ### `canonzip hash [OPTIONS] TARGET`
23
+
24
+ Print a canonical SHA-1 hash of `TARGET` to stdout.
25
+
26
+ ```
27
+ $ canonzip hash path/to/target
28
+ 4959e4b9a1812e511570eee14fe65b90098a0db6
29
+ ```
30
+
31
+ ### `canonzip zip [OPTIONS] OUTPUT_PATH TARGET`
32
+
33
+ Write a canonical zip archive of `TARGET` to `OUTPUT_PATH`.
34
+
35
+ ```
36
+ $ canonzip zip path/to/output.zip path/to/target
37
+ ```
38
+
39
+ NOTE: the output of `hash` is *NOT* the same as the SHA-1 hash of the output
40
+ from `zip`. `hash` is specifically designed to avoid the extra overhead of
41
+ writing a zip file while fulfilling a similar use-case &mdash; detecting
42
+ changes in the files.
43
+
44
+ ### CLI options
45
+
46
+ Both commands accept:
47
+
48
+ | Option | Description |
49
+ |---|---|
50
+ | `--exclude TEXT, -e TEXT` | Glob pattern to exclude (repeatable) |
51
+ | `--gitignore` | Exclude files based on `.gitignore` rules from the target's git repository |
52
+ | `--follow-symlinks` | Follow symbolic links; otherwise symlinks are ignored |
53
+ | `--verbose, -v` | Print included file paths (relative to target) to stderr |
54
+ | `--json` | Output result as JSON (e.g. `{"hash": "..."}`) |
55
+
56
+ If you specify both `exclude` and `gitignore`, files will be excluded as long
57
+ as they match at least one rule (logical or).
58
+
59
+ NOTE: in `exclude` patterns, double-star globs (`**`) match one-or-more path
60
+ segments, contrary to gitignore syntax, where they match zero-or-more.
61
+
62
+ ## Programmatic Interface (API)
63
+
64
+ ### `canonzip.hash(target, *, exclude, gitignore, follow_symlinks) -> str`
65
+
66
+ Compute a canonical SHA-1 hash of a directory.
67
+
68
+ ```python
69
+ import canonzip
70
+
71
+ digest = canonzip.hash("path/to/target")
72
+ #> "4959e4b9a1812e511570eee14fe65b90098a0db6"
73
+ ```
74
+
75
+ ### `canonzip.zip(output_path, target, *, exclude, gitignore, follow_symlinks) -> None`
76
+
77
+ Create a canonical zip archive of a directory.
78
+
79
+ ```python
80
+ canonzip.zip("path/to/output.zip", "path/to/target")
81
+ ```
82
+
83
+ ### Shared options
84
+
85
+ Both functions accept:
86
+
87
+ | Parameter | Type | Default | Description |
88
+ |---|---|---|---|
89
+ | `exclude` | `list[str] \| None` | `None` | Glob patterns to exclude |
90
+ | `gitignore` | `bool` | `False` | Exclude files based on `.gitignore` rules from the target's git repository |
91
+ | `follow_symlinks` | `bool` | `False` | Follow symbolic links; if `False`, symlinks are ignored |
92
+
93
+ If you specify both `exclude` and `gitignore`, files will be excluded as long
94
+ as they match at least one rule (logical or).
95
+
96
+ NOTE: in `exclude` patterns, double-star globs (`**`) match one-or-more path
97
+ segments, contrary to gitignore syntax, where they match zero-or-more.
98
+
99
+ ### Exceptions
100
+
101
+ canonzip will raise standard errors if it cannot read or write files,
102
+ typically inheriting from `OSError`.
103
+
104
+ Additionally, a few special cases raise errors that inherit
105
+ from `canonzip.CanonzipError`:
106
+
107
+ | Exception | Raised when |
108
+ |---|---|
109
+ | `OutputPathError` | `output_path` is inside `target` |
110
+ | `GitRepositoryError` | `gitignore=True` but target is not in a git repo |
111
+ | `BrokenSymlinkError` | A broken symlink is encountered with `follow_symlinks=True` |
112
+ | `SymlinkCycleError` | A symlink cycle is detected with `follow_symlinks=True` |
113
+
114
+ ### Advanced: build manifests explicitly
115
+
116
+ If you need direct access to the list of files that *would* be included in the
117
+ canonical hash or zip, you can use `build_manifest` to read the target
118
+ directory and return a `Manifest` object containing the list of files.
119
+ To save yourself from having to generate the manifest twice, you can then pass
120
+ it directly to `hash_from_manifest` or `zip_from_manifest` to complete the
121
+ operation.
122
+
123
+ ```python
124
+ from canonzip import build_manifest, hash_from_manifest, zip_from_manifest
125
+
126
+ manifest = build_manifest("path/to/target", exclude=[".venv"])
127
+
128
+ # Do something interesting with the manifest...
129
+ print(manifest.target.as_posix())
130
+
131
+ for entry in manifest.entries:
132
+ print(entry.path.as_posix())
133
+
134
+ # Then compute the hash or zip
135
+ digest = hash_from_manifest(manifest)
136
+ zip_from_manifest("path/to/output.zip", manifest)
137
+ ```
@@ -0,0 +1,71 @@
1
+ [project]
2
+ name = "canonzip"
3
+ version = "1.0.0"
4
+ requires-python = ">=3.11"
5
+
6
+ authors = [{ name = "Tyler Coles", email = "tylercoles@javadocmd.com" }]
7
+ description = "Produce canonical zips and hashes."
8
+ readme = "README.md"
9
+ license = "MIT"
10
+ license-files = ["LICENSE"]
11
+
12
+ classifiers = [
13
+ "Development Status :: 5 - Production/Stable",
14
+ "Intended Audience :: Developers",
15
+ "Operating System :: OS Independent",
16
+ "Topic :: Utilities",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "Programming Language :: Python :: 3.14",
22
+ "Typing :: Typed",
23
+ ]
24
+
25
+ dependencies = [
26
+ "pygit2==1.19.1",
27
+ "typer==0.24.1",
28
+ ]
29
+
30
+ [project.urls]
31
+ Repository = "https://github.com/JavadocMD/canonzip.git"
32
+
33
+ [project.scripts]
34
+ canonzip = "canonzip.cli:app"
35
+
36
+ [build-system]
37
+ requires = ["uv_build>=0.11.0,<0.12.0"]
38
+ build-backend = "uv_build"
39
+
40
+ [dependency-groups]
41
+ dev = [
42
+ "pre-commit==4.5.1",
43
+ "pytest==9.0.2",
44
+ "ruff==0.15.7",
45
+ ]
46
+
47
+ [tool.ruff]
48
+ extend-exclude = ["examples"]
49
+
50
+ [tool.ruff.lint]
51
+ preview = true
52
+ select = ["ALL"]
53
+
54
+ # Ruff ignores:
55
+ # - CPY: no need for copyright notice at the top of every file
56
+ # - COM812: Ruff recommends disabling this when using formatter
57
+ ignore = ["CPY", "COM812"]
58
+
59
+ [tool.ruff.lint.extend-per-file-ignores]
60
+ # - S101: we are allowed to use `assert` in pytest
61
+ # - D103: no docstrings required
62
+ "tests/**/*.py" = ["S101", "D103"]
63
+
64
+ [tool.ruff.lint.flake8-builtins]
65
+ ignorelist = ["hash", "zip"]
66
+
67
+ [tool.ruff.lint.pydocstyle]
68
+ convention = "google"
69
+
70
+ [tool.ruff.format]
71
+ preview = true
@@ -0,0 +1,31 @@
1
+ """canonzip is a library for producing canonical zips and hashes."""
2
+
3
+ from importlib.metadata import version
4
+
5
+ from canonzip.exceptions import (
6
+ BrokenSymlinkError,
7
+ CanonzipError,
8
+ GitRepositoryError,
9
+ OutputPathError,
10
+ SymlinkCycleError,
11
+ )
12
+ from canonzip.hashing import hash, hash_from_manifest
13
+ from canonzip.manifest import FileEntry, Manifest, build_manifest
14
+ from canonzip.zipping import zip, zip_from_manifest
15
+
16
+ __all__ = [
17
+ "BrokenSymlinkError",
18
+ "CanonzipError",
19
+ "FileEntry",
20
+ "GitRepositoryError",
21
+ "Manifest",
22
+ "OutputPathError",
23
+ "SymlinkCycleError",
24
+ "build_manifest",
25
+ "hash",
26
+ "hash_from_manifest",
27
+ "zip",
28
+ "zip_from_manifest",
29
+ ]
30
+
31
+ __version__: str = version("canonzip")
@@ -0,0 +1,5 @@
1
+ """Allow running canonzip as `python -m canonzip`."""
2
+
3
+ from canonzip.cli import app
4
+
5
+ app()
@@ -0,0 +1,111 @@
1
+ """Command-line interface for canonzip."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Annotated
8
+
9
+ import typer
10
+
11
+ from canonzip.exceptions import CanonzipError
12
+ from canonzip.hashing import hash_from_manifest
13
+ from canonzip.manifest import build_manifest
14
+ from canonzip.zipping import zip_from_manifest
15
+
16
+ app = typer.Typer(no_args_is_help=True, help="Produce canonical zips and hashes.")
17
+
18
+ ExcludeOption = Annotated[
19
+ list[str] | None,
20
+ typer.Option("--exclude", "-e", help="Glob pattern to exclude (repeatable)."),
21
+ ]
22
+ GitignoreOption = Annotated[
23
+ bool,
24
+ typer.Option("--gitignore", help="Exclude files matching .gitignore patterns."),
25
+ ]
26
+ FollowSymlinksOption = Annotated[
27
+ bool,
28
+ typer.Option("--follow-symlinks", help="Follow symbolic links."),
29
+ ]
30
+ VerboseOption = Annotated[
31
+ bool,
32
+ typer.Option("--verbose", "-v", help="Print included file paths to stderr."),
33
+ ]
34
+ JsonOption = Annotated[
35
+ bool,
36
+ typer.Option("--json", help="Output result as JSON."),
37
+ ]
38
+ TargetArgument = Annotated[
39
+ Path,
40
+ typer.Argument(
41
+ exists=True,
42
+ file_okay=False,
43
+ dir_okay=True,
44
+ resolve_path=True,
45
+ help="Target directory.",
46
+ ),
47
+ ]
48
+ OutputArgument = Annotated[
49
+ Path,
50
+ typer.Argument(dir_okay=False, help="Output zip file path."),
51
+ ]
52
+
53
+
54
@app.command("hash")
def hash_command(  # noqa: PLR0913
    target: TargetArgument,
    *,
    exclude: ExcludeOption = None,
    gitignore: GitignoreOption = False,
    follow_symlinks: FollowSymlinksOption = False,
    verbose: VerboseOption = False,
    output_json: JsonOption = False,
) -> None:
    """Compute a canonical hash of a directory."""  # noqa: DOC501
    # The docstring above is kept verbatim: Typer surfaces it as the command help.
    #
    # Flow: build the manifest once, optionally list the included paths on
    # stderr, then print the digest (plain or as {"hash": ...}) on stdout.
    # Any CanonzipError is reported on stderr and turned into exit code 1.
    try:
        listing = build_manifest(
            target,
            exclude=exclude,
            gitignore=gitignore,
            follow_symlinks=follow_symlinks,
        )
        if verbose:
            # Diagnostics go to stderr so stdout carries only the result.
            for included in listing.relative_paths:
                typer.echo(included, err=True)
        result = hash_from_manifest(listing)
        if output_json:
            result = json.dumps({"hash": result})
        typer.echo(result)
    except CanonzipError as error:
        typer.echo(str(error), err=True)
        raise typer.Exit(code=1) from None
81
+
82
+
83
@app.command("zip")
def zip_command(  # noqa: PLR0913
    output_path: OutputArgument,
    target: TargetArgument,
    *,
    exclude: ExcludeOption = None,
    gitignore: GitignoreOption = False,
    follow_symlinks: FollowSymlinksOption = False,
    verbose: VerboseOption = False,
    output_json: JsonOption = False,
) -> None:
    """Create a canonical zip archive of a directory."""  # noqa: DOC501
    # The docstring above is kept verbatim: Typer surfaces it as the command help.
    #
    # Flow: build the manifest once, optionally list the included paths on
    # stderr, write the archive, and - only with --json - print the canonical
    # hash of the same manifest as {"hash": ...} on stdout.
    # Any CanonzipError is reported on stderr and turned into exit code 1.
    try:
        listing = build_manifest(
            target,
            exclude=exclude,
            gitignore=gitignore,
            follow_symlinks=follow_symlinks,
        )
        if verbose:
            for included in listing.relative_paths:
                typer.echo(included, err=True)
        zip_from_manifest(output_path, listing)
        if output_json:
            # Hashing happens after the archive is written, reusing the manifest.
            typer.echo(json.dumps({"hash": hash_from_manifest(listing)}))
    except CanonzipError as error:
        typer.echo(str(error), err=True)
        raise typer.Exit(code=1) from None
@@ -0,0 +1,73 @@
1
+ """Custom exception types for canonzip."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from pathlib import Path
9
+
10
+ __all__ = [
11
+ "BrokenSymlinkError",
12
+ "CanonzipError",
13
+ "GitRepositoryError",
14
+ "OutputPathError",
15
+ "SymlinkCycleError",
16
+ ]
17
+
18
+
19
class CanonzipError(Exception):
    """Common base class of the canonzip-specific exceptions.

    Catch this type to handle every error canonzip defines itself.
    """
21
+
22
+
23
class SymlinkCycleError(CanonzipError):
    """Raised when directory traversal runs into a symlink cycle."""

    path: Path
    """Location at which the cycle was noticed."""

    def __init__(self, path: Path) -> None:
        """Store the offending path and build the error message.

        Args:
            path: The path where the cycle was detected.
        """
        message = f"Symlink cycle detected at {path}"
        super().__init__(message)
        self.path = path
33
+
34
+
35
class BrokenSymlinkError(CanonzipError):
    """Raised when directory traversal encounters a broken symlink."""

    path: Path
    """Location of the symlink that could not be followed."""

    def __init__(self, path: Path) -> None:
        """Store the offending path and build the error message.

        Args:
            path: The path of the broken symlink.
        """
        message = f"Broken symlink found at {path}"
        super().__init__(message)
        self.path = path
45
+
46
+
47
class GitRepositoryError(CanonzipError):
    """Raised when the target is not inside a usable (non-bare) git repository."""

    path: Path
    """The path for which no valid git repository was found."""

    def __init__(self, path: Path) -> None:
        """Store the offending path and build the error message.

        Args:
            path: The path that is not in a valid git repository.
        """
        message = f"Not a valid (non-bare) git repository: {path}"
        super().__init__(message)
        self.path = path
57
+
58
+
59
class OutputPathError(CanonzipError):
    """Raised when the requested output path lies inside the target directory."""

    output_path: Path
    """The rejected output path."""
    target: Path
    """The target directory that would have contained the output."""

    def __init__(self, output_path: Path, target: Path) -> None:
        """Store both paths and build the error message.

        Args:
            output_path: The (invalid) output path that was requested.
            target: The target directory that contains ``output_path``.
        """
        message = (
            f"Output path ({output_path}) cannot be inside target directory ({target})"
        )
        super().__init__(message)
        self.output_path = output_path
        self.target = target
@@ -0,0 +1,92 @@
1
+ """Implements canonical hashing of directories."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING
8
+
9
+ from canonzip.manifest import build_manifest
10
+
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Sequence
13
+ from os import PathLike
14
+
15
+ from canonzip.manifest import Manifest
16
+
17
+ __all__ = ["hash", "hash_from_manifest"]
18
+
19
+ CHUNK_SIZE = 1024 * 1024
20
+ """The chunk size to use when reading files for hashing, in bytes.
21
+
22
+ 1 MiB is a good balance between performance and memory usage.
23
+ """
24
+
25
+
26
def hash(
    target: str | PathLike[str],
    *,
    exclude: Sequence[str] | None = None,
    gitignore: bool = False,
    follow_symlinks: bool = False,
) -> str:
    """Return the canonical hash of the directory found at ``target``.

    The target is resolved to an absolute path, a manifest is built for it, and
    that manifest is hashed. The digest is intended to be identical across
    machines and runs whenever the directory's files and structure are the same.

    Args:
        target: Path of the directory to hash.
        exclude: Optional glob patterns for entries to leave out.
            Matched with pathlib's ``PurePath.match()``, so recursive "**"
            globs are not supported.
        gitignore: If True, also leave out entries ignored by git. The target
            must then be inside a valid git repository.
        follow_symlinks: If True, follow symbolic links; if False, ignore them.

    Returns:
        A stable SHA-1 hex digest covering the files' relative paths and contents.
    """
    resolved_target = Path(target).resolve()
    return hash_from_manifest(
        build_manifest(
            resolved_target,
            exclude=exclude,
            gitignore=gitignore,
            follow_symlinks=follow_symlinks,
        ),
    )
56
+
57
+
58
def hash_from_manifest(manifest: Manifest) -> str:
    """Return the canonical hash for an already-built manifest.

    Handy when the manifest is also needed for something else (for example
    printing the included paths), because the directory is walked only once.

    Args:
        manifest: The manifest whose entries are hashed, in manifest order.

    Returns:
        A stable SHA-1 hex digest covering the files' relative paths and contents.
    """
    hasher = hashlib.sha1()  # noqa: S324 (SHA-1 is not used for security here)
    root = manifest.target
    for item in manifest.entries:
        # Each file contributes two length-prefixed fields:
        #   [8-byte path length][posix relative path][8-byte file size][content]
        # The length prefixes delimit the fields; without them the file sets
        #   foobar (content: "baz")   and   foo (content: "barbaz")
        # would feed identical bytes into the digest.
        name = item.path.relative_to(root).as_posix().encode("utf-8")
        hasher.update(len(name).to_bytes(8))
        hasher.update(name)
        # The size comes from the manifest entry, not from the bytes read below.
        hasher.update(item.size.to_bytes(8))
        with item.path.open("rb") as stream:
            # Stream in CHUNK_SIZE pieces so large files are never fully in memory.
            while block := stream.read(CHUNK_SIZE):
                hasher.update(block)

    return hasher.hexdigest()
@@ -0,0 +1,258 @@
1
+ """Implements canonical ordering of files in directories.
2
+
3
+ Hashing and zipping use this manifest to ensure consistent behavior.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import stat
9
+ from collections.abc import Callable, Generator, Iterable, Sequence
10
+ from dataclasses import dataclass
11
+ from pathlib import Path, PurePosixPath
12
+
13
+ import pygit2
14
+
15
+ from canonzip.exceptions import (
16
+ BrokenSymlinkError,
17
+ GitRepositoryError,
18
+ SymlinkCycleError,
19
+ )
20
+
21
+ __all__ = ["FileEntry", "Manifest", "build_manifest"]
22
+
23
+
24
+ @dataclass(frozen=True, slots=True)
25
+ class FileEntry:
26
+ """Represents a file or directory entry in the manifest."""
27
+
28
+ path: Path
29
+ """The absolute path to the file or directory."""
30
+ path_relative: Path
31
+ """The path relative to the manifest target directory."""
32
+ mode: int
33
+ """The file mode (permissions and type) as returned by os.stat()."""
34
+ is_dir: bool
35
+ """Indicates if the entry is a directory."""
36
+ is_file: bool
37
+ """Indicates if the entry is a file."""
38
+ is_symlink: bool
39
+ """Indicates if the entry is a symbolic link."""
40
+ is_broken: bool
41
+ """Indicates if the entry is a broken symbolic link."""
42
+ size: int
43
+ """The size of the file in bytes."""
44
+
45
+ @staticmethod
46
+ def from_path(path: Path, path_relative: Path) -> FileEntry:
47
+ """Create a FileEntry from a given path, gathering necessary metadata.
48
+
49
+ Args:
50
+ path: The absolute path to a file or directory.
51
+ path_relative: The path relative to the manifest target directory.
52
+
53
+ Returns:
54
+ The FileEntry object.
55
+ """
56
+ # Take care to handle broken symlinks correctly.
57
+ # First stat without following symlinks to determine if it's a symlink.
58
+ # If so, stat again with following enabled to determine if it's broken.
59
+ stat_nofollow = path.stat(follow_symlinks=False)
60
+ is_symlink = stat.S_ISLNK(stat_nofollow.st_mode)
61
+ if not is_symlink:
62
+ stat_result = stat_nofollow
63
+ is_broken = False
64
+ else:
65
+ try:
66
+ stat_result = path.stat(follow_symlinks=True)
67
+ is_broken = False
68
+ except OSError:
69
+ stat_result = stat_nofollow
70
+ is_broken = True
71
+ mode = stat_result.st_mode
72
+ return FileEntry(
73
+ path=path,
74
+ path_relative=path_relative,
75
+ mode=mode,
76
+ is_dir=stat.S_ISDIR(mode),
77
+ is_file=stat.S_ISREG(mode),
78
+ is_symlink=is_symlink,
79
+ is_broken=is_broken,
80
+ size=stat_result.st_size,
81
+ )
82
+
83
+
84
+ @dataclass(frozen=True, slots=True)
85
+ class Manifest:
86
+ """A canonical ordered collection of files for a given target directory."""
87
+
88
+ target: Path
89
+ """The resolved path of the target directory."""
90
+ entries: tuple[FileEntry, ...]
91
+ """The canonical ordered file entries."""
92
+
93
+ @property
94
+ def relative_paths(self) -> Iterable[str]:
95
+ """The relative paths of the manifest entries (posix-formatted strings)."""
96
+ return (entry.path_relative.as_posix() for entry in self.entries)
97
+
98
+
99
+ ExcludePredicate = Callable[[FileEntry], bool]
100
+ """A predicate function: should a FileEntry be excluded from the manifest?"""
101
+
102
+
103
def exclude_none(_file_entry: FileEntry) -> bool:
    """Base exclusion predicate that keeps every entry.

    Args:
        _file_entry: The entry under consideration (ignored).

    Returns:
        Always False, i.e. nothing is excluded.
    """
    return False
106
+
107
+
108
def exclude_symlinks(prev: ExcludePredicate) -> ExcludePredicate:
    """Extend ``prev`` so that symbolic links are excluded as well.

    Args:
        prev: The predicate consulted for entries that are not symlinks.

    Returns:
        A predicate that is True for symlinks and otherwise defers to ``prev``.
    """

    def exclude(file_entry: FileEntry) -> bool:
        if file_entry.is_symlink:
            return True
        return prev(file_entry)

    return exclude
115
+
116
+
117
+ def exclude_by_patterns(
118
+ patterns: Sequence[str],
119
+ prev: ExcludePredicate,
120
+ ) -> ExcludePredicate:
121
+ """Exclude files matching any of the given glob patterns.""" # noqa: DOC201
122
+ patterns_list = list(patterns)
123
+
124
+ def exclude(file_entry: FileEntry) -> bool:
125
+ path = PurePosixPath(file_entry.path_relative)
126
+ match = any(
127
+ any(candidate.match(p) for p in patterns_list) # check each pattern
128
+ for candidate in [path, *path.parents[:-1]] # check each path and parents
129
+ )
130
+ return match or prev(file_entry)
131
+
132
+ return exclude
133
+
134
+
135
def exclude_gitignored(path: Path, prev: ExcludePredicate) -> ExcludePredicate:
    """Extend ``prev`` so that entries ignored by git are excluded as well.

    The repository is opened from ``path`` via pygit2 and each entry is checked
    with ``Repository.path_is_ignored`` using its path relative to the
    repository's working directory.

    Args:
        path: A path used to open the git repository.
        prev: The predicate consulted for entries git does not ignore.

    Returns:
        A predicate that is True for git-ignored entries and otherwise defers
        to ``prev``.

    Raises:
        GitRepositoryError: If no repository can be opened at ``path`` or the
            repository is bare.
    """
    try:
        repo = pygit2.Repository(path)
    except pygit2.GitError:
        raise GitRepositoryError(path) from None
    if repo.is_bare:
        raise GitRepositoryError(path) from None
    worktree_root = Path(repo.workdir).resolve()

    def exclude(file_entry: FileEntry) -> bool:
        git_path = file_entry.path.relative_to(worktree_root).as_posix()
        if file_entry.is_dir:
            # Directories are queried with a trailing slash - presumably so
            # directory-only ignore rules can match; confirm against pygit2.
            git_path = f"{git_path}/"
        return repo.path_is_ignored(git_path) or prev(file_entry)

    return exclude
155
+
156
+
157
def build_manifest(
    target: str | Path,
    *,
    exclude: Sequence[str] | None = None,
    gitignore: bool = False,
    follow_symlinks: bool = False,
) -> Manifest:
    """Build the canonical manifest for the given target directory.

    Args:
        target: The target directory to build the manifest for. Accepts a
            ``Path`` or anything ``Path()`` accepts, such as a plain string.
        exclude: Optional sequence of glob patterns to exclude from the manifest.
            Uses pathlib.Path.match() syntax, so "**" globs are not supported.
        gitignore: If True, exclude files matching .gitignore patterns.
            If True, the target directory must be in a valid git repository.
        follow_symlinks: If True, follow symbolic links; if False, ignore them.

    Returns:
        A Manifest containing the resolved target path and its canonical file entries.

    Raises:
        FileNotFoundError: If the target directory does not exist.
        NotADirectoryError: If the target path is not a directory.
        SymlinkCycleError: If a symlink cycle is detected.
        GitRepositoryError: If gitignore=True but the target is not in a valid
            git repository.
        BrokenSymlinkError: If a broken symlink is found when follow_symlinks=True.
    """  # noqa: DOC502
    # Coerce first so callers may pass a plain string path; without this a
    # string target fails on the attribute access below.
    target = Path(target)
    if not target.exists():
        raise FileNotFoundError(target)
    if not target.is_dir():
        raise NotADirectoryError(target)
    target = target.resolve()

    # Build up the exclude predicate based on the options.
    # Chained function application implements a logical or of the selected conditions,
    # so a file is excluded if it matches any of the criteria.
    exclude_fn = exclude_none
    if not follow_symlinks:
        exclude_fn = exclude_symlinks(exclude_fn)
    if exclude:
        exclude_fn = exclude_by_patterns(exclude, exclude_fn)
    if gitignore:
        exclude_fn = exclude_gitignored(target, exclude_fn)

    walk = walk_directory(
        target,
        target,
        walked_dirs=set(),
        exclude=exclude_fn,
        follow_symlinks=follow_symlinks,
    )

    # Consuming the generator here performs the whole directory walk.
    return Manifest(target=target, entries=tuple(walk))
211
+
212
+
213
def walk_directory(
    root: Path,
    path: Path,
    *,
    walked_dirs: set[Path],
    exclude: ExcludePredicate,
    follow_symlinks: bool,
) -> Generator[FileEntry, None, None]:
    """Recursively yield FileEntry objects for non-excluded files in a directory.

    Only regular files are yielded; directories are descended into in place.
    Entries that are neither regular files nor directories are skipped.

    Args:
        root: The root directory for relative path calculations.
        path: The directory to walk.
        walked_dirs: The directories currently being walked, i.e. the chain of
            ancestors leading to ``path``. Pass an empty set for the top-level
            call. ``path`` is added while it is walked and removed afterwards.
        exclude: A predicate that returns True for files that should be excluded.
        follow_symlinks: If True, follow symbolic links; if False, ignore them.

    Yields:
        FileEntry objects for files that are not excluded.

    Raises:
        SymlinkCycleError: If a symlink cycle is detected.
        BrokenSymlinkError: If a broken symlink is found when follow_symlinks=True.
    """
    # A cycle exists only when this directory is already one of its own
    # ancestors. Comparing against every directory ever visited would also
    # reject harmless layouts such as two symlinks pointing at one directory.
    if any(path.samefile(ancestor) for ancestor in walked_dirs):
        raise SymlinkCycleError(path)
    walked_dirs.add(path)

    try:
        # Sorting entries by name ensures a stable order regardless of
        # filesystem behavior.
        for entry in sorted(path.iterdir(), key=lambda p: p.name):
            file_entry = FileEntry.from_path(entry, entry.relative_to(root))
            # Broken links are reported before the exclude predicate is consulted.
            if follow_symlinks and file_entry.is_broken:
                raise BrokenSymlinkError(entry)
            if exclude(file_entry):
                continue

            if file_entry.is_file:
                yield file_entry
            elif file_entry.is_dir:
                yield from walk_directory(
                    root,
                    entry,
                    walked_dirs=walked_dirs,
                    exclude=exclude,
                    follow_symlinks=follow_symlinks,
                )
    finally:
        # Leaving this directory: it is no longer an ancestor of what follows.
        walked_dirs.discard(path)
File without changes
@@ -0,0 +1,112 @@
1
+ """Implements canonical zipping of directories."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import shutil
6
+ import stat
7
+ import zipfile
8
+ from pathlib import Path
9
+ from typing import TYPE_CHECKING
10
+
11
+ from canonzip.exceptions import OutputPathError
12
+ from canonzip.manifest import build_manifest
13
+
14
+ if TYPE_CHECKING:
15
+ from collections.abc import Sequence
16
+ from os import PathLike
17
+
18
+ from canonzip.manifest import Manifest
19
+
20
__all__ = ["zip", "zip_from_manifest"]

# Passed as the buffer length to shutil.copyfileobj when streaming each file
# into the archive, so memory use per copy is bounded by this value.
CHUNK_SIZE = 1024 * 1024
"""The chunk size to use when copying files into the zip, in bytes.

1 MiB is a good balance between performance and memory usage.
"""
27
+
28
+
29
def zip(
    output_path: str | PathLike[str],
    target: str | PathLike[str],
    *,
    exclude: Sequence[str] | None = None,
    gitignore: bool = False,
    follow_symlinks: bool = False,
) -> None:
    """Write a canonical zip archive of the directory tree at `target`.

    The target is resolved to an absolute path, a manifest of its files is
    built with the given filters, and the manifest is then written out by
    `zip_from_manifest`.

    Args:
        output_path: Where the zip file is written.
            Must not be inside the target directory.
        target: The directory whose contents are zipped.
        exclude: Optional sequence of glob patterns to leave out of the zip.
            Uses pathlib.Path.match() syntax, so "**" globs are not supported.
        gitignore: If True, exclude files matching .gitignore patterns.
            If True, the target directory must be in a valid git repository.
        follow_symlinks: If True, follow symbolic links; if False, ignore them.
    """
    source_dir = Path(target).resolve()
    zip_from_manifest(
        output_path,
        build_manifest(
            source_dir,
            exclude=exclude,
            gitignore=gitignore,
            follow_symlinks=follow_symlinks,
        ),
    )
56
+
57
+
58
def zip_from_manifest(output_path: str | PathLike[str], manifest: Manifest) -> None:
    """Create a canonical zip file from a pre-built manifest.

    This is useful when you want to inspect the manifest (e.g. to print
    included paths) before zipping, without walking the directory twice.

    Every entry is stored with a fixed timestamp, DEFLATE level 9, a UNIX
    "create system" marker and a normalized permission mode, so the archive
    metadata does not depend on the files' own timestamps or exact modes.

    Args:
        output_path: The path to write the zip file to.
            Must not be inside the target directory.
        manifest: The pre-built manifest whose entries to include.

    Raises:
        OutputPathError: If the output path is inside the target directory.
    """
    destination = Path(output_path).resolve()
    target_path = manifest.target

    # Ensure the output path is not inside the target directory;
    # I'm not sure this would cause problems, but it seems better to rule this case out.
    # `destination` is fully resolved, so the target is resolved as well for this
    # comparison: a relative or symlinked target would otherwise never match and
    # the guard would silently pass.
    try:
        destination.relative_to(Path(target_path).resolve())
    except ValueError:
        pass
    else:
        raise OutputPathError(destination, target_path)

    destination.parent.mkdir(parents=True, exist_ok=True)

    # 1980-01-01 is the earliest timestamp zipfile.ZipInfo accepts.
    fixed_timestamp = (1980, 1, 1, 0, 0, 0)
    compression = zipfile.ZIP_DEFLATED
    # The handle is named `archive` so it does not shadow this module's public
    # `zip` function (or the builtin of the same name).
    with zipfile.ZipFile(
        destination,
        mode="w",
        compression=compression,
        compresslevel=9,
    ) as archive:
        for entry in manifest.entries:
            # Entry paths are made relative to the manifest's own target (not the
            # resolved one) so they match however the manifest was built.
            rel_path = entry.path.relative_to(target_path)
            # This seems like a decent reference on zip file format:
            # https://pkwaredownloads.blob.core.windows.net/pkware-general/Documentation/APPNOTE-6.3.9.TXT
            # create_system=3 indicates file attributes are UNIX compatible
            # attr is shifted because the high two bytes are for UNIX, the low two for Windows
            info = zipfile.ZipInfo(rel_path.as_posix(), fixed_timestamp)
            info.compress_type = compression
            info.create_system = 3
            info.external_attr = normalized_mode(entry.mode) << 16
            # NOTE(review): info.file_size is left unset, so zipfile cannot
            # pre-select ZIP64 for an entry larger than 2 GiB — confirm whether
            # such inputs need to be supported.
            with entry.path.open("rb") as src, archive.open(info, "w") as dst:
                shutil.copyfileobj(src, dst, length=CHUNK_SIZE)
106
+
107
+
108
def normalized_mode(mode: int) -> int:
    """Normalize the mode of zipped files (for cross-platform stability).

    Args:
        mode: A file mode as a bit field of `stat` permission flags.

    Returns:
        0o755 if any execute bit (user, group or other) is set in `mode`,
        otherwise 0o644.
    """
    any_execute_bit = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    is_executable = (mode & any_execute_bit) != 0
    return 0o755 if is_executable else 0o644