git-undigest 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,290 @@
1
+ """git-undigest: reconstruct a full repository from a GitIngest-style digest.
2
+
3
+ Public API:
4
+ reconstruct: Parse a digest and write the resulting files to disk.
5
+ validate: Validate a digest without writing anything.
6
+ inspect: Produce a human-oriented summary of a digest's contents.
7
+ stats: Produce numeric statistics about a digest.
8
+ list_files: List every file path contained in a digest.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from pathlib import Path
14
+ from typing import Any
15
+
16
+ from . import utils
17
+ from .exceptions import (
18
+ BinaryDecodeError,
19
+ ChecksumMismatchError,
20
+ DigestIntegrityError,
21
+ DigestParseError,
22
+ DuplicateFileError,
23
+ FileExistsConflictError,
24
+ FileSizeLimitError,
25
+ FileWriteError,
26
+ GitUndigestError,
27
+ InvalidDigestError,
28
+ NotImplementedFeatureError,
29
+ PathTraversalError,
30
+ UnsupportedFormatError,
31
+ )
32
+ from .models import (
33
+ DigestFile,
34
+ DigestSummary,
35
+ FileEntry,
36
+ ReconstructionResult,
37
+ Repository,
38
+ StatsResult,
39
+ WriteResult,
40
+ )
41
+ from .parser import iter_files, parse_digest, parse_digest_text, parse_stream
42
+ from .validator import validate_summary
43
+ from .writer import reconstruct_files, reconstruct_files_stream
44
+
45
# Package version string; the CLI's ``--version`` output is built from it.
__version__ = "0.2.0"

# Names re-exported as the package's public API.
__all__ = [
    # High-level entry points defined in this module
    "reconstruct",
    "validate",
    "inspect",
    "stats",
    "list_files",
    # Parser and writer primitives re-exported from submodules
    "parse_digest",
    "parse_digest_text",
    "parse_stream",
    "iter_files",
    "reconstruct_files",
    "reconstruct_files_stream",
    # Package metadata
    "__version__",
    # Models
    "DigestFile",
    "DigestSummary",
    "FileEntry",
    "ReconstructionResult",
    "Repository",
    "StatsResult",
    "WriteResult",
    # Exceptions
    "GitUndigestError",
    "DigestParseError",
    "InvalidDigestError",
    "PathTraversalError",
    "DuplicateFileError",
    "ChecksumMismatchError",
    "FileWriteError",
    "FileExistsConflictError",
    "FileSizeLimitError",
    "BinaryDecodeError",
    "DigestIntegrityError",
    "UnsupportedFormatError",
    "NotImplementedFeatureError",
]
83
+
84
+
85
def reconstruct(
    digest_path: str | Path,
    output: str | Path = "output",
    *,
    overwrite: bool = False,
    skip_existing: bool = False,
    backup: bool = False,
    dry_run: bool = False,
    format_name: str | None = None,
) -> ReconstructionResult:
    """Rebuild a repository on disk from a GitIngest-style digest file.

    The digest is handed to :func:`parse_stream` and the entries it yields
    are forwarded unchanged to :func:`reconstruct_files_stream`. This
    wrapper adds no logic of its own; the streaming behaviour (parsing and
    writing without loading the whole digest into memory) is the contract
    of those two helpers.

    Args:
        digest_path: Path to the GitIngest-style digest file.
        output: Directory the repository is reconstructed into.
        overwrite: Overwrite existing files instead of erroring.
        skip_existing: Skip existing files instead of erroring.
        backup: Back up existing files to ``*.bak`` before overwriting.
        dry_run: Report what would happen without writing anything.
        format_name: Optional explicit digest format name. Auto-detected
            if omitted.

    Returns:
        A :class:`ReconstructionResult` describing what was written (or
        would be written, in dry-run mode).

    Raises:
        InvalidDigestError: If the digest is empty, unreadable, or
            structurally invalid.
        DigestParseError: If the digest format is recognized but a section
            is malformed.
        DuplicateFileError: If the digest contains duplicate file paths.
        PathTraversalError: If any file path is unsafe.
        FileExistsConflictError: If a target file exists and no overwrite
            policy was given.
        FileWriteError: If a filesystem operation fails.

        All of the above are propagated from the parser and writer helpers.

    Example:
        >>> from git_undigest import reconstruct
        >>> result = reconstruct("digest.txt", output="repo")
        >>> result.output_dir
        'repo'
    """
    # Collect the write-policy switches once so they are forwarded as a unit.
    write_policy = {
        "overwrite": overwrite,
        "skip_existing": skip_existing,
        "backup": backup,
        "dry_run": dry_run,
    }
    parsed_entries = parse_stream(digest_path, format_name=format_name)
    return reconstruct_files_stream(parsed_entries, output, **write_policy)
140
+
141
+
142
def validate(
    digest_path: str | Path, *, format_name: str | None = None
) -> DigestSummary:
    """Parse a digest and run the summary validator; nothing is written.

    Args:
        digest_path: Path to the digest file.
        format_name: Optional explicit digest format name.

    Returns:
        The :class:`DigestSummary` produced by :func:`parse_digest`, once
        :func:`validate_summary` has accepted it.

    Raises:
        InvalidDigestError: If the digest is empty or structurally invalid.
        DigestParseError: If a section is malformed.
        DuplicateFileError: If duplicate file paths are present.
        PathTraversalError: If any file path is empty or unsafe.
    """
    parsed = parse_digest(digest_path, format_name=format_name)
    # validate_summary signals problems by raising; its return value is unused.
    validate_summary(parsed)
    return parsed
163
+
164
+
165
def inspect(
    digest_path: str | Path, *, format_name: str | None = None
) -> dict[str, Any]:
    """Summarise a digest for human consumption.

    The digest is validated first, so any error :func:`validate` raises
    propagates from here as well.

    Args:
        digest_path: Path to the digest file.
        format_name: Optional explicit digest format name.

    Returns:
        A dictionary with keys ``repo_name``, ``file_count``,
        ``languages`` (language -> file count, most frequent first),
        ``largest_files`` (up to ten ``(path, size)`` tuples, largest
        first), and ``tree`` (a rendered directory tree string).
    """
    summary = validate(digest_path, format_name=format_name)

    # Tally files per detected language; files with no detected language
    # are left out of the tally.
    language_counts: dict[str, int] = {}
    for entry in summary.files:
        language = utils.detect_language(entry.extension)
        if not language:
            continue
        language_counts[language] = language_counts.get(language, 0) + 1

    # Stable descending sort: ties keep first-seen order.
    ranked_languages = sorted(
        language_counts.items(), key=lambda item: item[1], reverse=True
    )

    by_size = list(summary.files)
    by_size.sort(key=lambda entry: entry.size, reverse=True)
    top_files = by_size[:10]

    return {
        "repo_name": summary.repo_name,
        "file_count": summary.file_count,
        "languages": dict(ranked_languages),
        "largest_files": [(entry.path, entry.size) for entry in top_files],
        "tree": render_tree(summary),
    }
197
+
198
+
199
def stats(digest_path: str | Path, *, format_name: str | None = None) -> StatsResult:
    """Compute numeric statistics for a digest.

    The digest is validated first, so any error :func:`validate` raises
    propagates from here as well.

    Args:
        digest_path: Path to the digest file.
        format_name: Optional explicit digest format name.

    Returns:
        A :class:`StatsResult` holding file and folder counts, total bytes,
        the largest file and its size, the average file size, per-extension
        counts (most frequent first) and a token estimate.
    """
    summary = validate(digest_path, format_name=format_name)

    # Files without an extension are grouped under a readable placeholder.
    extension_tally: dict[str, int] = {}
    for entry in summary.files:
        label = entry.extension or "(no extension)"
        extension_tally[label] = extension_tally.get(label, 0) + 1

    char_total = sum(len(entry.content) for entry in summary.files)

    # Defaults describe an empty digest; overwritten when files exist.
    largest_path: str | None = None
    largest_size = 0
    if summary.files:
        biggest = max(summary.files, key=lambda entry: entry.size)
        largest_path = biggest.path
        largest_size = biggest.size

    file_count = summary.file_count
    # Guard against division by zero for an empty digest.
    average = (summary.total_bytes / file_count) if file_count else 0.0

    ranked_extensions = sorted(
        extension_tally.items(), key=lambda item: item[1], reverse=True
    )

    return StatsResult(
        file_count=file_count,
        folder_count=len(summary.directories),
        total_bytes=summary.total_bytes,
        largest_file=largest_path,
        largest_file_size=largest_size,
        average_file_size=average,
        extension_counts=dict(ranked_extensions),
        estimated_tokens=utils.estimate_tokens(char_total),
    )
239
+
240
+
241
def list_files(digest_path: str | Path, *, format_name: str | None = None) -> list[str]:
    """Return the path of every file in a digest, sorted.

    The digest is validated first, so any error :func:`validate` raises
    propagates from here as well.

    Args:
        digest_path: Path to the digest file.
        format_name: Optional explicit digest format name.

    Returns:
        A sorted list of file paths.
    """
    summary = validate(digest_path, format_name=format_name)
    paths = [entry.path for entry in summary.files]
    paths.sort()
    return paths
253
+
254
+
255
def render_tree(summary: DigestSummary) -> str:
    """Render a directory tree string for a digest summary.

    File paths are split on ``/`` and merged into a nested mapping, which
    is then rendered with box-drawing connectors. Within each directory,
    sub-directories are listed before files, each group alphabetically.

    Args:
        summary: The parsed digest. Only ``summary.repo_name`` and the
            ``path`` of each item in ``summary.files`` are read.

    Returns:
        A multi-line string depicting the directory/file hierarchy. The
        first line is ``"<repo_name>/"``.
    """
    # A directory is a nested dict; a file is marked with ``None``.
    root: dict[str, dict[str, Any] | None] = {}
    for f in summary.files:
        *dir_parts, file_name = f.path.split("/")
        node = root
        for part in dir_parts:
            child = node.get(part)
            if child is None:
                # Either the directory is new, or a same-named file marker
                # is being promoted to a directory.
                child = {}
                node[part] = child
            node = child
        # Never let a file entry clobber a directory of the same name:
        # that would silently drop the directory's whole subtree.
        if not isinstance(node.get(file_name), dict):
            node[file_name] = None

    lines = [f"{summary.repo_name}/"]

    def _walk(node: dict[str, dict[str, Any] | None], prefix: str) -> None:
        """Append one line per entry of *node*, recursing into directories."""
        # (is_file, name) sorts directories first, then alphabetically.
        entries = sorted(node.items(), key=lambda kv: (kv[1] is None, kv[0]))
        last_index = len(entries) - 1
        for idx, (name, child) in enumerate(entries):
            is_last = idx == last_index
            connector = "└── " if is_last else "├── "
            if child is None:
                lines.append(f"{prefix}{connector}{name}")
                continue
            lines.append(f"{prefix}{connector}{name}/")
            # The continuation must be as wide as the connector (4 columns)
            # so that nested levels line up under their parent.
            _walk(child, prefix + ("    " if is_last else "│   "))

    _walk(root, "")
    return "\n".join(lines)
@@ -0,0 +1,72 @@
1
+ """Checksum utilities.
2
+
3
+ Provides SHA-256 hashing helpers used for integrity verification. Full
4
+ digest-embedded checksum verification is a designed-but-not-yet-implemented
5
+ feature (see :func:`verify_checksum`), since the GitIngest format itself
6
+ does not currently embed per-file checksums.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+
13
+ from .exceptions import ChecksumMismatchError, NotImplementedFeatureError
14
+
15
+
16
def sha256_text(content: str) -> str:
    """Return the SHA-256 hex digest of *content* encoded as UTF-8.

    Args:
        content: The text content to hash.

    Returns:
        The hex-encoded SHA-256 digest.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return hasher.hexdigest()
26
+
27
+
28
def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 hex digest of a raw byte string.

    Args:
        data: The raw bytes to hash.

    Returns:
        The hex-encoded SHA-256 digest.
    """
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
38
+
39
+
40
def verify_checksum(path: str, content: str, expected_checksum: str) -> None:
    """Verify file content against an expected SHA-256 checksum.

    The content is hashed with :func:`sha256_text` (UTF-8) and compared
    with ``expected_checksum``. The expected value is compared
    case-insensitively and with surrounding whitespace ignored, so
    checksums written in uppercase hex or carrying a trailing newline
    still match.

    The current GitIngest text format has no per-file checksum field, so
    callers should only invoke this when they have an external checksum
    manifest.

    Args:
        path: The file path, used for error reporting.
        content: The file's text content.
        expected_checksum: The expected SHA-256 hex digest.

    Raises:
        ChecksumMismatchError: If the computed checksum does not match. It
            is raised with ``path``, the expected value exactly as given,
            and the computed digest.
    """
    actual = sha256_text(content)
    # hexdigest() yields lowercase hex; normalise the caller's value so a
    # purely cosmetic difference is not reported as corruption.
    if actual != expected_checksum.strip().lower():
        raise ChecksumMismatchError(path, expected_checksum, actual)
59
+
60
+
61
def verify_checksum_manifest(*_args: object, **_kwargs: object) -> None:
    """Placeholder for verifying a reconstruction against a checksum manifest.

    Reserved interface for a future feature in which a digest ships with a
    manifest mapping paths to expected SHA-256 checksums. All arguments are
    accepted and ignored.

    Raises:
        NotImplementedFeatureError: Always, until implemented.
    """
    message = "Checksum manifest verification is not yet implemented."
    raise NotImplementedFeatureError(message)
git_undigest/cli.py ADDED
@@ -0,0 +1,251 @@
1
+ """Command-line interface for git-undigest.
2
+
3
+ Commands:
4
+ git-undigest DIGEST [OUTPUT] Reconstruct a repository.
5
+ git-undigest validate DIGEST Validate a digest without writing.
6
+ git-undigest inspect DIGEST Show a human summary of the digest.
7
+ git-undigest list DIGEST List all files in the digest.
8
+ git-undigest stats DIGEST Show numeric statistics.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import sys
15
+
16
+ from . import inspect as inspect_api
17
+ from . import list_files as list_files_api
18
+ from . import reconstruct as reconstruct_api
19
+ from . import stats as stats_api
20
+ from . import validate as validate_api
21
+ from .exceptions import GitUndigestError
22
+ from .models import ReconstructionResult
23
+ from .utils import human_size
24
+
25
+
26
def _build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``git-undigest`` parser and its subcommands.

    Returns:
        A parser with a ``--version`` flag and the subcommands
        ``reconstruct``, ``validate``, ``inspect``, ``list`` and ``stats``.
        The chosen subcommand is stored in ``args.command``.
    """
    parser = argparse.ArgumentParser(
        prog="git-undigest",
        description="Reconstruct a full repository from a GitIngest-style digest file.",
    )
    parser.add_argument("--version", action="version", version=_version_string())

    subparsers = parser.add_subparsers(dest="command")

    def add_common_overwrite_flags(target: argparse.ArgumentParser) -> None:
        """Attach the write-policy and verbosity flags to *target*."""
        # --overwrite and --skip-existing cannot be combined.
        conflict_policy = target.add_mutually_exclusive_group()
        conflict_policy.add_argument(
            "--overwrite",
            action="store_true",
            help="Overwrite existing files instead of erroring.",
        )
        conflict_policy.add_argument(
            "--skip-existing",
            action="store_true",
            help="Skip files that already exist instead of erroring.",
        )

        # NOTE(review): --force is documented as an alias for --overwrite but
        # is registered outside the exclusive group above, so argparse accepts
        # "--force --skip-existing" together. Confirm whether that is intended.
        for flag, help_text in (
            ("--backup", "Back up existing files to '<name>.bak' before writing."),
            ("--dry-run", "Show what would happen without writing anything."),
            ("--force", "Alias for --overwrite."),
        ):
            target.add_argument(flag, action="store_true", help=help_text)

        # --verbose and --quiet cannot be combined.
        output_level = target.add_mutually_exclusive_group()
        output_level.add_argument(
            "--verbose", action="store_true", help="Print detailed per-file output."
        )
        output_level.add_argument(
            "--quiet", action="store_true", help="Suppress non-essential output."
        )

    # Default reconstruction: `git-undigest digest.txt [output]`
    reconstruct_parser = subparsers.add_parser(
        "reconstruct", help="Reconstruct a repository from a digest (default command)."
    )
    reconstruct_parser.add_argument("digest", help="Path to the digest file.")
    reconstruct_parser.add_argument(
        "output",
        nargs="?",
        default="output",
        help="Output directory (default: output).",
    )
    add_common_overwrite_flags(reconstruct_parser)

    # The read-only subcommands all take a single positional digest path.
    for command_name, command_help in (
        ("validate", "Validate a digest file."),
        ("inspect", "Show repository name, languages, tree, and largest files."),
        ("list", "List every file in the digest."),
        ("stats", "Show numeric statistics about a digest."),
    ):
        read_only_parser = subparsers.add_parser(command_name, help=command_help)
        read_only_parser.add_argument("digest", help="Path to the digest file.")

    return parser
102
+
103
+
104
def _version_string() -> str:
    """Return the text printed by ``--version``: ``git-undigest <version>``."""
    # Imported inside the function rather than at module top level.
    from . import __version__ as package_version

    return f"git-undigest {package_version}"
108
+
109
+
110
def _print_reconstruction_result(
    result: ReconstructionResult, *, verbose: bool, quiet: bool
) -> None:
    """Print a reconstruction report.

    Args:
        result: The outcome returned by the reconstruct API.
        verbose: Also print one line per entry in ``result.results``.
        quiet: Print nothing at all, warnings included.
    """
    if quiet:
        return

    if verbose:
        # Display tags for known actions; anything else is upper-cased.
        action_labels = {
            "created": "CREATE",
            "would_create": "WOULD CREATE",
            "overwritten": "OVERWRITE",
            "would_overwrite": "WOULD OVERWRITE",
            "skipped": "SKIP",
            "backed_up": "BACKUP",
        }
        for item in result.results:
            tag = action_labels.get(item.action, item.action.upper())
            print(f" [{tag}] {item.path}")

    # Warnings go to stderr so they stay separate from the stdout report.
    for warning in result.warnings:
        print(f"warning: {warning}", file=sys.stderr)

    verb = "Would write" if result.dry_run else "Wrote"
    tallies = ", ".join(
        [
            f"{len(result.created)} created",
            f"{len(result.overwritten)} overwritten",
            f"{len(result.skipped)} skipped",
            f"{len(result.backed_up)} backed up",
        ]
    )
    written = human_size(result.total_bytes_written)
    print(f"{verb} {tallies} ({written}) to {result.output_dir}")
139
+
140
+
141
def _cmd_reconstruct(args: argparse.Namespace) -> int:
    """Run the ``reconstruct`` subcommand and print its report.

    Args:
        args: Parsed arguments of the ``reconstruct`` subparser.

    Returns:
        ``0``; failures surface as exceptions from the reconstruct API.
    """
    # --force is an alias for --overwrite, so either flag enables it.
    should_overwrite = args.overwrite or args.force
    outcome = reconstruct_api(
        args.digest,
        args.output,
        overwrite=should_overwrite,
        skip_existing=args.skip_existing,
        backup=args.backup,
        dry_run=args.dry_run,
    )
    _print_reconstruction_result(outcome, verbose=args.verbose, quiet=args.quiet)
    return 0
152
+
153
+
154
def _cmd_validate(args: argparse.Namespace) -> int:
    """Run the ``validate`` subcommand and print a one-line confirmation.

    Args:
        args: Parsed arguments; only ``args.digest`` is read.

    Returns:
        ``0``; an invalid digest surfaces as an exception from the API.
    """
    validated = validate_api(args.digest)
    print(f"OK: digest is valid ({validated.file_count} files).")
    return 0
158
+
159
+
160
def _cmd_inspect(args: argparse.Namespace) -> int:
    """Run the ``inspect`` subcommand and print the digest summary.

    Prints the repository name and file count, then the language and
    largest-file sections when they are non-empty, then the directory tree.

    Args:
        args: Parsed arguments; only ``args.digest`` is read.

    Returns:
        ``0``; failures surface as exceptions from the inspect API.
    """
    report = inspect_api(args.digest)
    print(f"Repository: {report['repo_name']}")
    print(f"Files: {report['file_count']}")

    language_counts = report["languages"]
    if language_counts:
        print("Languages:")
        for language, file_total in language_counts.items():
            print(f" {language}: {file_total}")

    biggest_files = report["largest_files"]
    if biggest_files:
        print("Largest files:")
        for file_path, byte_size in biggest_files:
            # Right-align the size in a 10-character column.
            print(f" {human_size(byte_size):>10} {file_path}")

    print("Directory tree:")
    print(report["tree"])
    return 0
175
+
176
+
177
def _cmd_list(args: argparse.Namespace) -> int:
    """Run the ``list`` subcommand, printing one file path per line.

    Args:
        args: Parsed arguments; only ``args.digest`` is read.

    Returns:
        ``0``; failures surface as exceptions from the list API.
    """
    listed_paths = list_files_api(args.digest)
    for file_path in listed_paths:
        print(file_path)
    return 0
181
+
182
+
183
def _cmd_stats(args: argparse.Namespace) -> int:
    """Run the ``stats`` subcommand and print numeric statistics.

    Args:
        args: Parsed arguments; only ``args.digest`` is read.

    Returns:
        ``0``; failures surface as exceptions from the stats API.
    """
    s = stats_api(args.digest)
    print(f"Files: {s.file_count}")
    print(f"Folders: {s.folder_count}")
    print(f"Total size: {human_size(s.total_bytes)}")
    # largest_file is falsy for a digest without files; skip the line then.
    if s.largest_file:
        print(f"Largest file: {s.largest_file} ({human_size(s.largest_file_size)})")
    print(f"Average size: {human_size(s.average_file_size)}")
    print(f"Estimated tokens: {s.estimated_tokens:,}")
    if s.extension_counts:
        print("Extensions:")
        for ext, count in s.extension_counts.items():
            # The stats API groups extension-less files under the literal key
            # "(no extension)"; prefixing it with a dot would print the
            # nonsensical ".(no extension)". Keys that already start with a
            # dot are likewise left alone to avoid "..py".
            if ext == "(no extension)" or ext.startswith("."):
                label = ext
            else:
                label = f".{ext}"
            print(f" {label}: {count}")
    return 0
197
+
198
+
199
def main(argv: list[str] | None = None) -> int:
    """Entry point for the ``git-undigest`` console script.

    Args:
        argv: Optional argument list (defaults to ``sys.argv[1:]``).

    Returns:
        Process exit code: 0 on success, 1 on a handled error
        (:class:`GitUndigestError` or any :class:`OSError`, such as a
        missing or unreadable digest file). Argument-parsing errors,
        ``--help`` and ``--version`` exit through argparse itself by
        raising :class:`SystemExit`.
    """
    # Single source of truth for subcommands: the keys double as the set of
    # known command names used for the implicit-"reconstruct" rewrite below.
    handlers = {
        "reconstruct": _cmd_reconstruct,
        "validate": _cmd_validate,
        "inspect": _cmd_inspect,
        "list": _cmd_list,
        "stats": _cmd_stats,
    }

    args_to_parse = list(sys.argv[1:] if argv is None else argv)
    if not args_to_parse:
        # No arguments at all: show the help text.
        args_to_parse = ["--help"]
    elif args_to_parse[0] not in handlers and not args_to_parse[0].startswith("-"):
        # Support the bare `git-undigest digest.txt [output]` form by
        # injecting the implicit "reconstruct" subcommand when the first
        # token isn't a known subcommand or flag.
        args_to_parse = ["reconstruct", *args_to_parse]

    parser = _build_parser()
    args = parser.parse_args(args_to_parse)

    if args.command is None:
        parser.print_help()
        return 0

    try:
        return handlers[args.command](args)
    except (GitUndigestError, OSError) as exc:
        # OSError covers FileNotFoundError as well as PermissionError,
        # IsADirectoryError and similar, so filesystem problems produce a
        # one-line message and exit code 1 instead of a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1
248
+
249
+
250
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())