git-undigest 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- git_undigest-0.2.0/.gitignore +45 -0
- git_undigest-0.2.0/LICENSE +21 -0
- git_undigest-0.2.0/PKG-INFO +331 -0
- git_undigest-0.2.0/README.md +268 -0
- git_undigest-0.2.0/benchmarks/bench_streaming.py +105 -0
- git_undigest-0.2.0/pyproject.toml +98 -0
- git_undigest-0.2.0/src/git_undigest/__init__.py +290 -0
- git_undigest-0.2.0/src/git_undigest/checksum.py +72 -0
- git_undigest-0.2.0/src/git_undigest/cli.py +251 -0
- git_undigest-0.2.0/src/git_undigest/exceptions.py +114 -0
- git_undigest-0.2.0/src/git_undigest/formats/__init__.py +351 -0
- git_undigest-0.2.0/src/git_undigest/formats/gitingest.py +339 -0
- git_undigest-0.2.0/src/git_undigest/models.py +289 -0
- git_undigest-0.2.0/src/git_undigest/parser.py +290 -0
- git_undigest-0.2.0/src/git_undigest/utils.py +196 -0
- git_undigest-0.2.0/src/git_undigest/validator.py +244 -0
- git_undigest-0.2.0/src/git_undigest/writer.py +371 -0
- git_undigest-0.2.0/tests/conftest.py +39 -0
- git_undigest-0.2.0/tests/test_api.py +99 -0
- git_undigest-0.2.0/tests/test_cli.py +120 -0
- git_undigest-0.2.0/tests/test_formats_and_placeholders.py +99 -0
- git_undigest-0.2.0/tests/test_fuzz.py +183 -0
- git_undigest-0.2.0/tests/test_parser.py +117 -0
- git_undigest-0.2.0/tests/test_phase1.py +353 -0
- git_undigest-0.2.0/tests/test_validator.py +106 -0
- git_undigest-0.2.0/tests/test_writer.py +138 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.python-version
|
|
7
|
+
|
|
8
|
+
# Virtual environments
|
|
9
|
+
.venv/
|
|
10
|
+
.venv-audit/
|
|
11
|
+
venv/
|
|
12
|
+
.env
|
|
13
|
+
|
|
14
|
+
# Testing
|
|
15
|
+
.pytest_cache/
|
|
16
|
+
.ruff_cache/
|
|
17
|
+
.mypy_cache/
|
|
18
|
+
.coverage
|
|
19
|
+
htmlcov/
|
|
20
|
+
.coverage.*
|
|
21
|
+
|
|
22
|
+
# Build
|
|
23
|
+
build/
|
|
24
|
+
dist/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.eggs/
|
|
27
|
+
|
|
28
|
+
# IDE
|
|
29
|
+
.idea/
|
|
30
|
+
.vscode/
|
|
31
|
+
*.swp
|
|
32
|
+
*.swo
|
|
33
|
+
*~
|
|
34
|
+
|
|
35
|
+
# OS
|
|
36
|
+
.DS_Store
|
|
37
|
+
Thumbs.db
|
|
38
|
+
|
|
39
|
+
# Project specific
|
|
40
|
+
*.bak
|
|
41
|
+
output/
|
|
42
|
+
repo/
|
|
43
|
+
|
|
44
|
+
digest.txt
|
|
45
|
+
docs/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024-2026 git-undigest contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: git-undigest
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Reconstruct a full repository from a GitIngest-style digest file.
|
|
5
|
+
Project-URL: Homepage, https://github.com/vnparmane/git-undigest
|
|
6
|
+
Project-URL: Repository, https://github.com/vnparmane/git-undigest
|
|
7
|
+
Project-URL: Issues, https://github.com/vnparmane/git-undigest/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/vnparmane/git-undigest#readme
|
|
9
|
+
Project-URL: Changelog, https://github.com/vnparmane/git-undigest/blob/main/CHANGELOG.md
|
|
10
|
+
Author-email: Vedant Chainani <16848858+anomalyco@users.noreply.github.com>
|
|
11
|
+
Maintainer: git-undigest contributors
|
|
12
|
+
License: MIT License
|
|
13
|
+
|
|
14
|
+
Copyright (c) 2024-2026 git-undigest contributors
|
|
15
|
+
|
|
16
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
17
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
18
|
+
in the Software without restriction, including without limitation the rights
|
|
19
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
20
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
21
|
+
furnished to do so, subject to the following conditions:
|
|
22
|
+
|
|
23
|
+
The above copyright notice and this permission notice shall be included in all
|
|
24
|
+
copies or substantial portions of the Software.
|
|
25
|
+
|
|
26
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
27
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
28
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
29
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
30
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
31
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
32
|
+
SOFTWARE.
|
|
33
|
+
License-File: LICENSE
|
|
34
|
+
Keywords: cli,code,development,digest,gitingest,gitingest-reverse,llm,reconstruct,repository
|
|
35
|
+
Classifier: Development Status :: 4 - Beta
|
|
36
|
+
Classifier: Environment :: Console
|
|
37
|
+
Classifier: Intended Audience :: Developers
|
|
38
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
39
|
+
Classifier: Natural Language :: English
|
|
40
|
+
Classifier: Operating System :: OS Independent
|
|
41
|
+
Classifier: Programming Language :: Python :: 3
|
|
42
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
43
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
44
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
45
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
46
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
47
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
48
|
+
Classifier: Topic :: Software Development :: Build Tools
|
|
49
|
+
Classifier: Topic :: Utilities
|
|
50
|
+
Classifier: Typing :: Typed
|
|
51
|
+
Requires-Python: >=3.10
|
|
52
|
+
Provides-Extra: dev
|
|
53
|
+
Requires-Dist: black>=24.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
55
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
56
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
57
|
+
Requires-Dist: pytest>=7.4; extra == 'dev'
|
|
58
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
59
|
+
Requires-Dist: twine>=4.0; extra == 'dev'
|
|
60
|
+
Provides-Extra: zstd
|
|
61
|
+
Requires-Dist: zstandard>=0.22; extra == 'zstd'
|
|
62
|
+
Description-Content-Type: text/markdown
|
|
63
|
+
|
|
64
|
+
# git-undigest
|
|
65
|
+
|
|
66
|
+
[PyPI version](https://pypi.org/project/git-undigest/)
|
|
67
|
+
[Python versions](https://pypi.org/project/git-undigest/)
|
|
68
|
+
[License: MIT](LICENSE)
|
|
69
|
+
[CI](https://github.com/vnparmane/git-undigest/actions/workflows/ci.yml)
|
|
70
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
71
|
+
[Code style: black](https://github.com/psf/black)
|
|
72
|
+
[Checked with mypy](https://mypy-lang.org/)
|
|
73
|
+
|
|
74
|
+
Reconstruct a full repository — folder structure and all — from a
|
|
75
|
+
[GitIngest](https://gitingest.com)-style digest file.
|
|
76
|
+
|
|
77
|
+
GitIngest turns a repository into a single flat text digest for feeding to
|
|
78
|
+
an LLM. `git-undigest` does the reverse: it parses that digest and rebuilds
|
|
79
|
+
the original directory tree and files on disk, safely and deterministically.
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Features
|
|
84
|
+
|
|
85
|
+
- **Streaming parser** — parses multi-GB digests with bounded memory
|
|
86
|
+
(proportional to the largest single file, not total size).
|
|
87
|
+
- **Pluggable formats** — add support for Repomix, Repopack, or custom digest
|
|
88
|
+
formats without modifying core code.
|
|
89
|
+
- **Security-first** — path traversal protection, absolute path rejection,
|
|
90
|
+
Windows reserved name detection, atomic writes.
|
|
91
|
+
- **Conflict handling** — `--overwrite`, `--skip-existing`, `--backup` policies.
|
|
92
|
+
- **Dry-run mode** — preview what would happen without touching the filesystem.
|
|
93
|
+
- **Compressed digests** — transparent `.gz`, `.xz` support built-in;
|
|
94
|
+
`.zst` via `pip install git-undigest[zstd]`.
|
|
95
|
+
- **Plugin discovery** — third-party format packages auto-discovered via
|
|
96
|
+
entry points.
|
|
97
|
+
- **No runtime dependencies** — pure Python, zero required installs beyond
|
|
98
|
+
the standard library.
|
|
99
|
+
|
|
100
|
+
## Installation
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
pip install git-undigest
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Requires Python 3.10+.
|
|
107
|
+
|
|
108
|
+
Optional compression support:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
pip install "git-undigest[zstd]" # for .zst (zstandard) files
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Quick Start
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
# Reconstruct a repository from a digest file
|
|
118
|
+
git-undigest digest.txt
|
|
119
|
+
|
|
120
|
+
# Reconstruct into a specific directory
|
|
121
|
+
git-undigest digest.txt output/
|
|
122
|
+
|
|
123
|
+
# Validate without writing
|
|
124
|
+
git-undigest validate digest.txt
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## CLI Examples
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
# Reconstruct with overwrite policy
|
|
131
|
+
git-undigest digest.txt --overwrite
|
|
132
|
+
|
|
133
|
+
# Dry-run preview
|
|
134
|
+
git-undigest digest.txt output/ --dry-run
|
|
135
|
+
|
|
136
|
+
# Skip files that already exist
|
|
137
|
+
git-undigest digest.txt output/ --skip-existing
|
|
138
|
+
|
|
139
|
+
# Back up existing files before overwriting
|
|
140
|
+
git-undigest digest.txt output/ --backup
|
|
141
|
+
|
|
142
|
+
# Verbose output (one line per file action)
|
|
143
|
+
git-undigest digest.txt --verbose
|
|
144
|
+
|
|
145
|
+
# Inspect repository metadata
|
|
146
|
+
git-undigest inspect digest.txt
|
|
147
|
+
|
|
148
|
+
# List all files in the digest
|
|
149
|
+
git-undigest list digest.txt
|
|
150
|
+
|
|
151
|
+
# Get statistics
|
|
152
|
+
git-undigest stats digest.txt
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Conflict Flags
|
|
156
|
+
|
|
157
|
+
| Flag | Behavior |
|
|
158
|
+
|------|----------|
|
|
159
|
+
| `--overwrite` | Overwrite existing files instead of erroring |
|
|
160
|
+
| `--skip-existing` | Leave existing files untouched |
|
|
161
|
+
| `--backup` | Rename existing files to `name.bak` before writing |
|
|
162
|
+
| `--dry-run` | Show what would happen without touching the filesystem |
|
|
163
|
+
| `--verbose` | Print a line for every file action taken |
|
|
164
|
+
| `--quiet` | Suppress summary output |
|
|
165
|
+
|
|
166
|
+
## Python API
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
from git_undigest import reconstruct, validate, inspect, stats, list_files
|
|
170
|
+
|
|
171
|
+
# Reconstruct a repository
|
|
172
|
+
result = reconstruct("digest.txt", output="repo")
|
|
173
|
+
print(f"{len(result.created)} files created in {result.output_dir}")
|
|
174
|
+
|
|
175
|
+
# Validate without writing
|
|
176
|
+
summary = validate("digest.txt")
|
|
177
|
+
print(f"Repository: {summary.repo_name}, {summary.file_count} files")
|
|
178
|
+
|
|
179
|
+
# Inspect
|
|
180
|
+
info = inspect("digest.txt")
|
|
181
|
+
print("Languages:", info["languages"])
|
|
182
|
+
print("Directory tree:\n", info["tree"])
|
|
183
|
+
|
|
184
|
+
# Get statistics
|
|
185
|
+
s = stats("digest.txt")
|
|
186
|
+
print(f"Total: {s.total_bytes} bytes, ~{s.estimated_tokens} tokens")
|
|
187
|
+
|
|
188
|
+
# List all files
|
|
189
|
+
for path in list_files("digest.txt"):
|
|
190
|
+
    print(path)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Streaming API
|
|
194
|
+
|
|
195
|
+
For large digests, use the streaming parser directly:
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
from git_undigest import parse_stream, reconstruct_files_stream
|
|
199
|
+
|
|
200
|
+
entries = parse_stream("large_digest.txt")
|
|
201
|
+
result = reconstruct_files_stream(entries, "output", overwrite=True)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
This keeps peak memory proportional to the largest single file, regardless of total digest size.
|
|
205
|
+
|
|
206
|
+
## Supported Digest Formats
|
|
207
|
+
|
|
208
|
+
| Format | Status | Notes |
|
|
209
|
+
|--------|--------|-------|
|
|
210
|
+
| GitIngest | Stable | Default format |
|
|
211
|
+
| Custom | Pluggable | Subclass `DigestFormat` |
|
|
212
|
+
|
|
213
|
+
To add support for a new format, create a subclass of `DigestFormat`,
|
|
214
|
+
implement `sniff()`, `parse_stream()`, and `serialize()`, then register it:
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
from git_undigest.formats import DigestFormat, register_format_class
|
|
218
|
+
from git_undigest.models import FileEntry
|
|
219
|
+
|
|
220
|
+
class MyFormat(DigestFormat):
|
|
221
|
+
name = "myformat"
|
|
222
|
+
|
|
223
|
+
@classmethod
|
|
224
|
+
def sniff(cls, prefix: str) -> bool:
|
|
225
|
+
return prefix.startswith("MAGIC")
|
|
226
|
+
|
|
227
|
+
def parse_stream(self, stream):
|
|
228
|
+
... # yield FileEntry instances
|
|
229
|
+
|
|
230
|
+
def serialize(self, repo) -> str:
|
|
231
|
+
...
|
|
232
|
+
|
|
233
|
+
register_format_class(MyFormat)
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
Third-party packages are auto-discovered via the `git_undigest.formats`
|
|
237
|
+
entry point group.
|
|
238
|
+
|
|
239
|
+
## Security
|
|
240
|
+
|
|
241
|
+
Every path in the digest is validated before anything is written:
|
|
242
|
+
|
|
243
|
+
- **No path traversal.** `../../../etc/passwd`, `../secret.txt`, and any
|
|
244
|
+
path containing a `..` segment that would escape the output directory
|
|
245
|
+
are rejected with `PathTraversalError`.
|
|
246
|
+
- **No absolute paths.** POSIX absolute paths (`/etc/shadow`), Windows
|
|
247
|
+
drive-qualified paths (`C:\Windows\System32`), and UNC paths
|
|
248
|
+
(`\\server\share`) are all rejected.
|
|
249
|
+
- **No Windows reserved device names.** `CON`, `PRN`, `AUX`, `NUL`,
|
|
250
|
+
`COM1`–`COM9`, `LPT1`–`LPT9` are rejected as path components.
|
|
251
|
+
- **No null bytes** are permitted in paths.
|
|
252
|
+
- **Final containment check.** Every resolved path is confirmed, via
|
|
253
|
+
`Path.relative_to`, to be a real descendant of the output directory
|
|
254
|
+
after full filesystem resolution.
|
|
255
|
+
- **Atomic writes.** Files are written to a temporary file in the same
|
|
256
|
+
directory and then renamed into place, so a crash or interruption never
|
|
257
|
+
leaves a partially-written file at the destination.
|
|
258
|
+
|
|
259
|
+
## Architecture
|
|
260
|
+
|
|
261
|
+
```
|
|
262
|
+
src/git_undigest/
|
|
263
|
+
├── __init__.py # Public API: reconstruct, validate, inspect, stats
|
|
264
|
+
├── cli.py # argparse CLI entry point
|
|
265
|
+
├── parser.py # Streaming and bulk digest parsing
|
|
266
|
+
├── formats/
|
|
267
|
+
│ ├── __init__.py # DigestFormat ABC, registry, plugin discovery
|
|
268
|
+
│ └── gitingest.py # GitIngest format implementation
|
|
269
|
+
├── validator.py # Path safety + structural validation
|
|
270
|
+
├── writer.py # Streaming filesystem reconstruction
|
|
271
|
+
├── checksum.py # SHA-256 checksum utilities
|
|
272
|
+
├── models.py # Dataclasses (FileEntry, Repository, results)
|
|
273
|
+
├── exceptions.py # Exception hierarchy
|
|
274
|
+
└── utils.py # Shared helpers
|
|
275
|
+
|
|
276
|
+
benchmarks/
|
|
277
|
+
└── bench_streaming.py
|
|
278
|
+
tests/
|
|
279
|
+
├── test_api.py
|
|
280
|
+
├── test_cli.py
|
|
281
|
+
├── test_parser.py
|
|
282
|
+
├── test_writer.py
|
|
283
|
+
├── test_validator.py
|
|
284
|
+
├── test_fuzz.py
|
|
285
|
+
├── test_phase1.py
|
|
286
|
+
├── test_formats_and_placeholders.py
|
|
287
|
+
└── ...
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
### Design Principles
|
|
291
|
+
|
|
292
|
+
- **Parser only parses.** It never touches the filesystem or makes security
|
|
293
|
+
decisions.
|
|
294
|
+
- **Writer only writes.** It assumes the digest has been validated, but
|
|
295
|
+
re-validates every path as defense-in-depth.
|
|
296
|
+
- **Validator owns all validation.** No duplicated logic.
|
|
297
|
+
- **Formats are pluggable.** Adding a new format means a new module in
|
|
298
|
+
`formats/` — no changes to `parser.py`, `validator.py`, or `writer.py`.
|
|
299
|
+
- **Streaming by default.** All public APIs use constant-memory streaming
|
|
300
|
+
internally.
|
|
301
|
+
|
|
302
|
+
## Performance
|
|
303
|
+
|
|
304
|
+
The streaming parser is 2–30x faster than bulk parsing for typical digest
|
|
305
|
+
sizes because it avoids allocating a single large string for the entire
|
|
306
|
+
digest:
|
|
307
|
+
|
|
308
|
+
| Files | Digest Size | Bulk (s) | Stream (s) | Speedup |
|
|
309
|
+
|-------|-------------|----------|------------|---------|
|
|
310
|
+
| 100 | 12 KB | 0.036 | 0.001 | 32x |
|
|
311
|
+
| 1,000 | 120 KB | 0.040 | 0.007 | 6x |
|
|
312
|
+
| 10,000 | 1.2 MB | 0.124 | 0.072 | 1.7x |
|
|
313
|
+
|
|
314
|
+
Memory usage is O(largest file) for streaming vs O(total digest) for bulk.
|
|
315
|
+
|
|
316
|
+
## Roadmap
|
|
317
|
+
|
|
318
|
+
- [ ] SHA-256 checksum manifest verification
|
|
319
|
+
- [ ] Binary file reconstruction (base64-embedded digests)
|
|
320
|
+
- [ ] Parallel reconstruction for very large digests
|
|
321
|
+
- [ ] Resumable reconstruction
|
|
322
|
+
- [ ] Plugin distribution guide for third-party format packages
|
|
323
|
+
|
|
324
|
+
## Contributing
|
|
325
|
+
|
|
326
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, testing
|
|
327
|
+
instructions, and pull request guidelines.
|
|
328
|
+
|
|
329
|
+
## License
|
|
330
|
+
|
|
331
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# git-undigest
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/git-undigest/)
|
|
4
|
+
[Python versions](https://pypi.org/project/git-undigest/)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
[CI](https://github.com/vnparmane/git-undigest/actions/workflows/ci.yml)
|
|
7
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
8
|
+
[Code style: black](https://github.com/psf/black)
|
|
9
|
+
[Checked with mypy](https://mypy-lang.org/)
|
|
10
|
+
|
|
11
|
+
Reconstruct a full repository — folder structure and all — from a
|
|
12
|
+
[GitIngest](https://gitingest.com)-style digest file.
|
|
13
|
+
|
|
14
|
+
GitIngest turns a repository into a single flat text digest for feeding to
|
|
15
|
+
an LLM. `git-undigest` does the reverse: it parses that digest and rebuilds
|
|
16
|
+
the original directory tree and files on disk, safely and deterministically.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
|
|
22
|
+
- **Streaming parser** — parses multi-GB digests with bounded memory
|
|
23
|
+
(proportional to the largest single file, not total size).
|
|
24
|
+
- **Pluggable formats** — add support for Repomix, Repopack, or custom digest
|
|
25
|
+
formats without modifying core code.
|
|
26
|
+
- **Security-first** — path traversal protection, absolute path rejection,
|
|
27
|
+
Windows reserved name detection, atomic writes.
|
|
28
|
+
- **Conflict handling** — `--overwrite`, `--skip-existing`, `--backup` policies.
|
|
29
|
+
- **Dry-run mode** — preview what would happen without touching the filesystem.
|
|
30
|
+
- **Compressed digests** — transparent `.gz`, `.xz` support built-in;
|
|
31
|
+
`.zst` via `pip install git-undigest[zstd]`.
|
|
32
|
+
- **Plugin discovery** — third-party format packages auto-discovered via
|
|
33
|
+
entry points.
|
|
34
|
+
- **No runtime dependencies** — pure Python, zero required installs beyond
|
|
35
|
+
the standard library.
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install git-undigest
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Requires Python 3.10+.
|
|
44
|
+
|
|
45
|
+
Optional compression support:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install "git-undigest[zstd]" # for .zst (zstandard) files
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Quick Start
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# Reconstruct a repository from a digest file
|
|
55
|
+
git-undigest digest.txt
|
|
56
|
+
|
|
57
|
+
# Reconstruct into a specific directory
|
|
58
|
+
git-undigest digest.txt output/
|
|
59
|
+
|
|
60
|
+
# Validate without writing
|
|
61
|
+
git-undigest validate digest.txt
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## CLI Examples
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Reconstruct with overwrite policy
|
|
68
|
+
git-undigest digest.txt --overwrite
|
|
69
|
+
|
|
70
|
+
# Dry-run preview
|
|
71
|
+
git-undigest digest.txt output/ --dry-run
|
|
72
|
+
|
|
73
|
+
# Skip files that already exist
|
|
74
|
+
git-undigest digest.txt output/ --skip-existing
|
|
75
|
+
|
|
76
|
+
# Back up existing files before overwriting
|
|
77
|
+
git-undigest digest.txt output/ --backup
|
|
78
|
+
|
|
79
|
+
# Verbose output (one line per file action)
|
|
80
|
+
git-undigest digest.txt --verbose
|
|
81
|
+
|
|
82
|
+
# Inspect repository metadata
|
|
83
|
+
git-undigest inspect digest.txt
|
|
84
|
+
|
|
85
|
+
# List all files in the digest
|
|
86
|
+
git-undigest list digest.txt
|
|
87
|
+
|
|
88
|
+
# Get statistics
|
|
89
|
+
git-undigest stats digest.txt
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Conflict Flags
|
|
93
|
+
|
|
94
|
+
| Flag | Behavior |
|
|
95
|
+
|------|----------|
|
|
96
|
+
| `--overwrite` | Overwrite existing files instead of erroring |
|
|
97
|
+
| `--skip-existing` | Leave existing files untouched |
|
|
98
|
+
| `--backup` | Rename existing files to `name.bak` before writing |
|
|
99
|
+
| `--dry-run` | Show what would happen without touching the filesystem |
|
|
100
|
+
| `--verbose` | Print a line for every file action taken |
|
|
101
|
+
| `--quiet` | Suppress summary output |
|
|
102
|
+
|
|
103
|
+
## Python API
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from git_undigest import reconstruct, validate, inspect, stats, list_files
|
|
107
|
+
|
|
108
|
+
# Reconstruct a repository
|
|
109
|
+
result = reconstruct("digest.txt", output="repo")
|
|
110
|
+
print(f"{len(result.created)} files created in {result.output_dir}")
|
|
111
|
+
|
|
112
|
+
# Validate without writing
|
|
113
|
+
summary = validate("digest.txt")
|
|
114
|
+
print(f"Repository: {summary.repo_name}, {summary.file_count} files")
|
|
115
|
+
|
|
116
|
+
# Inspect
|
|
117
|
+
info = inspect("digest.txt")
|
|
118
|
+
print("Languages:", info["languages"])
|
|
119
|
+
print("Directory tree:\n", info["tree"])
|
|
120
|
+
|
|
121
|
+
# Get statistics
|
|
122
|
+
s = stats("digest.txt")
|
|
123
|
+
print(f"Total: {s.total_bytes} bytes, ~{s.estimated_tokens} tokens")
|
|
124
|
+
|
|
125
|
+
# List all files
|
|
126
|
+
for path in list_files("digest.txt"):
|
|
127
|
+
    print(path)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Streaming API
|
|
131
|
+
|
|
132
|
+
For large digests, use the streaming parser directly:
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from git_undigest import parse_stream, reconstruct_files_stream
|
|
136
|
+
|
|
137
|
+
entries = parse_stream("large_digest.txt")
|
|
138
|
+
result = reconstruct_files_stream(entries, "output", overwrite=True)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
This keeps peak memory proportional to the largest single file, regardless of total digest size.
|
|
142
|
+
|
|
143
|
+
## Supported Digest Formats
|
|
144
|
+
|
|
145
|
+
| Format | Status | Notes |
|
|
146
|
+
|--------|--------|-------|
|
|
147
|
+
| GitIngest | Stable | Default format |
|
|
148
|
+
| Custom | Pluggable | Subclass `DigestFormat` |
|
|
149
|
+
|
|
150
|
+
To add support for a new format, create a subclass of `DigestFormat`,
|
|
151
|
+
implement `sniff()`, `parse_stream()`, and `serialize()`, then register it:
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
from git_undigest.formats import DigestFormat, register_format_class
|
|
155
|
+
from git_undigest.models import FileEntry
|
|
156
|
+
|
|
157
|
+
class MyFormat(DigestFormat):
|
|
158
|
+
name = "myformat"
|
|
159
|
+
|
|
160
|
+
@classmethod
|
|
161
|
+
def sniff(cls, prefix: str) -> bool:
|
|
162
|
+
return prefix.startswith("MAGIC")
|
|
163
|
+
|
|
164
|
+
def parse_stream(self, stream):
|
|
165
|
+
... # yield FileEntry instances
|
|
166
|
+
|
|
167
|
+
def serialize(self, repo) -> str:
|
|
168
|
+
...
|
|
169
|
+
|
|
170
|
+
register_format_class(MyFormat)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Third-party packages are auto-discovered via the `git_undigest.formats`
|
|
174
|
+
entry point group.
|
|
175
|
+
|
|
176
|
+
## Security
|
|
177
|
+
|
|
178
|
+
Every path in the digest is validated before anything is written:
|
|
179
|
+
|
|
180
|
+
- **No path traversal.** `../../../etc/passwd`, `../secret.txt`, and any
|
|
181
|
+
path containing a `..` segment that would escape the output directory
|
|
182
|
+
are rejected with `PathTraversalError`.
|
|
183
|
+
- **No absolute paths.** POSIX absolute paths (`/etc/shadow`), Windows
|
|
184
|
+
drive-qualified paths (`C:\Windows\System32`), and UNC paths
|
|
185
|
+
(`\\server\share`) are all rejected.
|
|
186
|
+
- **No Windows reserved device names.** `CON`, `PRN`, `AUX`, `NUL`,
|
|
187
|
+
`COM1`–`COM9`, `LPT1`–`LPT9` are rejected as path components.
|
|
188
|
+
- **No null bytes** are permitted in paths.
|
|
189
|
+
- **Final containment check.** Every resolved path is confirmed, via
|
|
190
|
+
`Path.relative_to`, to be a real descendant of the output directory
|
|
191
|
+
after full filesystem resolution.
|
|
192
|
+
- **Atomic writes.** Files are written to a temporary file in the same
|
|
193
|
+
directory and then renamed into place, so a crash or interruption never
|
|
194
|
+
leaves a partially-written file at the destination.
|
|
195
|
+
|
|
196
|
+
## Architecture
|
|
197
|
+
|
|
198
|
+
```
|
|
199
|
+
src/git_undigest/
|
|
200
|
+
├── __init__.py # Public API: reconstruct, validate, inspect, stats
|
|
201
|
+
├── cli.py # argparse CLI entry point
|
|
202
|
+
├── parser.py # Streaming and bulk digest parsing
|
|
203
|
+
├── formats/
|
|
204
|
+
│ ├── __init__.py # DigestFormat ABC, registry, plugin discovery
|
|
205
|
+
│ └── gitingest.py # GitIngest format implementation
|
|
206
|
+
├── validator.py # Path safety + structural validation
|
|
207
|
+
├── writer.py # Streaming filesystem reconstruction
|
|
208
|
+
├── checksum.py # SHA-256 checksum utilities
|
|
209
|
+
├── models.py # Dataclasses (FileEntry, Repository, results)
|
|
210
|
+
├── exceptions.py # Exception hierarchy
|
|
211
|
+
└── utils.py # Shared helpers
|
|
212
|
+
|
|
213
|
+
benchmarks/
|
|
214
|
+
└── bench_streaming.py
|
|
215
|
+
tests/
|
|
216
|
+
├── test_api.py
|
|
217
|
+
├── test_cli.py
|
|
218
|
+
├── test_parser.py
|
|
219
|
+
├── test_writer.py
|
|
220
|
+
├── test_validator.py
|
|
221
|
+
├── test_fuzz.py
|
|
222
|
+
├── test_phase1.py
|
|
223
|
+
├── test_formats_and_placeholders.py
|
|
224
|
+
└── ...
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### Design Principles
|
|
228
|
+
|
|
229
|
+
- **Parser only parses.** It never touches the filesystem or makes security
|
|
230
|
+
decisions.
|
|
231
|
+
- **Writer only writes.** It assumes the digest has been validated, but
|
|
232
|
+
re-validates every path as defense-in-depth.
|
|
233
|
+
- **Validator owns all validation.** No duplicated logic.
|
|
234
|
+
- **Formats are pluggable.** Adding a new format means a new module in
|
|
235
|
+
`formats/` — no changes to `parser.py`, `validator.py`, or `writer.py`.
|
|
236
|
+
- **Streaming by default.** All public APIs use constant-memory streaming
|
|
237
|
+
internally.
|
|
238
|
+
|
|
239
|
+
## Performance
|
|
240
|
+
|
|
241
|
+
The streaming parser is 2–30x faster than bulk parsing for typical digest
|
|
242
|
+
sizes because it avoids allocating a single large string for the entire
|
|
243
|
+
digest:
|
|
244
|
+
|
|
245
|
+
| Files | Digest Size | Bulk (s) | Stream (s) | Speedup |
|
|
246
|
+
|-------|-------------|----------|------------|---------|
|
|
247
|
+
| 100 | 12 KB | 0.036 | 0.001 | 32x |
|
|
248
|
+
| 1,000 | 120 KB | 0.040 | 0.007 | 6x |
|
|
249
|
+
| 10,000 | 1.2 MB | 0.124 | 0.072 | 1.7x |
|
|
250
|
+
|
|
251
|
+
Memory usage is O(largest file) for streaming vs O(total digest) for bulk.
|
|
252
|
+
|
|
253
|
+
## Roadmap
|
|
254
|
+
|
|
255
|
+
- [ ] SHA-256 checksum manifest verification
|
|
256
|
+
- [ ] Binary file reconstruction (base64-embedded digests)
|
|
257
|
+
- [ ] Parallel reconstruction for very large digests
|
|
258
|
+
- [ ] Resumable reconstruction
|
|
259
|
+
- [ ] Plugin distribution guide for third-party format packages
|
|
260
|
+
|
|
261
|
+
## Contributing
|
|
262
|
+
|
|
263
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup, testing
|
|
264
|
+
instructions, and pull request guidelines.
|
|
265
|
+
|
|
266
|
+
## License
|
|
267
|
+
|
|
268
|
+
MIT — see [LICENSE](LICENSE).
|