git-reaper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
git_reaper/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """git-reaper: reap structured knowledge from repositories."""
2
+
3
# Take the version from the generated git_reaper._version module when it exists.
try:
    from git_reaper._version import __version__
except ImportError:  # no VCS metadata and no build hook output; should not happen in installs
    # Placeholder so that importing the package never fails on a missing version file.
    __version__ = "0.0.0"

# The version string is the only public name of the package root.
__all__ = ["__version__"]
git_reaper/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Allow `python -m git_reaper` to invoke the CLI."""
2
+
3
+ from git_reaper.cli import app
4
+
5
# Start the Typer app only when this module is executed (python -m git_reaper),
# not when it is merely imported.
if __name__ == "__main__":
    app()
git_reaper/_version.py ADDED
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
# Names exported by this generated module.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Type declarations for the values assigned below.
version: str
__version__: str
__version_tuple__: tuple[int | str, ...]
version_tuple: tuple[int | str, ...]
commit_id: str | None
__commit_id__: str | None

# Each value is bound under both a dunder name and a plain alias.
__version__ = version = '0.1.0'
__version_tuple__ = version_tuple = (0, 1, 0)

# No commit id is recorded in this build.
__commit_id__ = commit_id = None
git_reaper/art.py ADDED
@@ -0,0 +1,60 @@
1
+ """The skull gallery: banners, tombstones, and spinner frames.
2
+
3
+ Everything here is decoration. Every caller must honor --plain / NO_COLOR
4
+ by simply not calling into this module (see theme.theme_enabled).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import random
10
+
11
+ HERO_SKULL = r"""
12
+ ______
13
+ .-" "-.
14
+ / \
15
+ |, .-. .-. ,|
16
+ | )(_o/ \o_)( |
17
+ |/ /\ \|
18
+ (_ ^^ _)
19
+ \__|IIIIII|__/
20
+ | \IIIIII/ |
21
+ \ /
22
+ `--------`
23
+ g i t - r e a p e r
24
+ """
25
+
26
+ # Chosen automatically for skinny terminals.
27
+ NARROW_SKULL = r"""
28
+ .-.
29
+ (o.o)
30
+ |=|
31
+ git-reaper
32
+ """
33
+
34
+ MINI_SKULL = ".-.\n|x|\n'-'"
35
+
36
+ SCYTHE_FRAMES = ["/", "-", "\\", "|"]
37
+
38
+ TOMBSTONE_DIVIDER = " _______\n | RIP |\n_|_______|_"
39
+
40
+
41
def banner(version: str, width: int = 80) -> str:
    """Return the CLI banner: a skull picked by terminal width, then the version.

    Terminals at least 40 columns wide get ``HERO_SKULL``; narrower ones get
    ``NARROW_SKULL``. Trailing whitespace of the art is dropped before the
    version line is appended.
    """
    if width >= 40:
        artwork = HERO_SKULL
    else:
        artwork = NARROW_SKULL
    trimmed = artwork.rstrip()
    return f"{trimmed}\n v{version}\n"
45
+
46
+
47
def tombstone(lines: list[str]) -> str:
    """Render lines of text inside ASCII tombstone art.

    Args:
        lines: Text rows to engrave below the fixed "R I P" header. May be empty.

    Returns:
        The multi-line tombstone as one string (no trailing newline).
    """
    longest = max((len(line) for line in lines), default=0)
    # Every row surrounds its text with one space on each side, so the interior
    # must be two columns wider than the longest line; otherwise a long line
    # overflows its row and no longer lines up with the frame. 11 is the minimum.
    inner = max(longest + 2, 11)
    top = " " + "_" * inner
    body = [f" /{' ' * inner}\\"]
    body.extend(f" | {line.center(inner - 2)} |" for line in ["R I P", "", *lines])
    body.append(" ___|" + "_" * inner + "|___")
    return "\n".join([top, *body])
56
+
57
+
58
def boo() -> str:
    """Pick one gallery piece at random, for the hidden `reaper boo`."""
    gallery = [HERO_SKULL, NARROW_SKULL, MINI_SKULL, TOMBSTONE_DIVIDER]
    return random.choice(gallery)
git_reaper/cache.py ADDED
@@ -0,0 +1,131 @@
1
+ """The catacombs: the clone cache.
2
+
3
+ Remote clones land in a content-addressed cache under
4
+ ``~/.cache/git-reaper/catacombs/<host>/<owner>/<repo>``, shallow by default,
5
+ reused across runs, and cleared by ``banish``. Local ``file://`` sources are
6
+ buried flat as ``localhost/<name>-<digest>`` to stay inside Windows path
7
+ limits.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import os
14
+ import re
15
+ import time
16
+ from pathlib import Path
17
+ from urllib.parse import urlparse
18
+
19
+ from git_reaper import fsutil
20
+ from git_reaper.models import BanishResult, CacheEntry
21
+
22
# scp-style remote: optional "user@", then a "host" group, a colon, and a "path" group.
_SCP_RE = re.compile(r"^(?:\w+@)?(?P<host>[\w.-]+):(?P<path>.+)$")
23
+
24
+
25
def catacombs_root() -> Path:
    """Return the cache root directory.

    A non-empty ``GIT_REAPER_CACHE`` wins outright (handy for tests and CI).
    Otherwise the root is ``git-reaper/catacombs`` under ``XDG_CACHE_HOME``,
    or under ``~/.cache`` when that variable is unset or empty.
    """
    env = os.environ
    explicit = env.get("GIT_REAPER_CACHE")
    if explicit:
        return Path(explicit)
    cache_home = env.get("XDG_CACHE_HOME")
    if not cache_home:
        cache_home = str(Path.home() / ".cache")
    return Path(cache_home, "git-reaper", "catacombs")
32
+
33
+
34
+ def _sanitize(part: str) -> str:
35
+ part = part.strip("/").removesuffix(".git")
36
+ return re.sub(r"[^\w.-]", "_", part) or "_"
37
+
38
+
39
def grave_path(url: str) -> Path:
    """Map a remote URL to its plot in the catacombs.

    Args:
        url: A ``file://`` URL, any other scheme URL, or an scp-style
            ``[user@]host:path`` remote.

    Returns:
        ``<root>/localhost/<name>-<digest>`` for ``file://`` sources, otherwise
        ``<root>/<host>/<path segments...>``.

    Raises:
        ValueError: If the URL cannot be parsed or carries no repository path.
    """
    parsed = urlparse(url)
    if parsed.scheme == "file":
        # Tolerate Windows spellings (file://C:\repos\x): backslashes never
        # delimit for urlparse, so the drive letter lands in netloc.
        path = parsed.path.replace("\\", "/")
        if re.match(r"^[A-Za-z]:", parsed.netloc):
            path = parsed.netloc.replace("\\", "/") + path
        path = path.strip("/")
        if not path:
            raise ValueError(f"URL has no repository path: {url!r}")
        # Local paths can be arbitrarily deep; mirroring them under the
        # catacombs would breach Windows' 260-char path limit. Bury them
        # flat: basename plus a short digest of the full path.
        digest = hashlib.sha256(path.encode("utf-8")).hexdigest()[:12]
        name = _sanitize(path.rsplit("/", 1)[-1])
        return catacombs_root() / "localhost" / f"{name}-{digest}"
    if parsed.scheme:
        # Drop any "user[:password]@" prefix: credentials must not end up in a
        # directory name, and the scp-style branch below ignores the user too,
        # so both spellings of one remote share a plot.
        host = parsed.netloc.rpartition("@")[2] or "localhost"
        path = parsed.path
    else:
        scp = _SCP_RE.match(url)
        if not scp:
            raise ValueError(f"cannot read this incantation as a repo URL: {url!r}")
        host, path = scp.group("host"), scp.group("path")
    segments = [_sanitize(seg) for seg in path.strip("/").split("/") if seg]
    if not segments:
        raise ValueError(f"URL has no repository path: {url!r}")
    return catacombs_root() / _sanitize(host) / Path(*segments)
68
+
69
+
70
+ def _dir_size(path: Path) -> int:
71
+ return sum(f.stat().st_size for f in path.rglob("*") if f.is_file())
72
+
73
+
74
def list_graves() -> list[CacheEntry]:
    """List every cached repo, ordered by ascending last-used time.

    A repo is any directory under the cache root that contains a ``.git``
    entry. Its URL comes from the ``.git-reaper-url`` marker (empty string when
    the marker is missing) and its last-used time is the directory's mtime.
    """
    root = catacombs_root()
    if not root.is_dir():
        return []

    graves: list[CacheEntry] = []
    for dot_git in sorted(root.rglob(".git")):
        plot = dot_git.parent
        url_file = plot / ".git-reaper-url"
        if url_file.is_file():
            origin = url_file.read_text(encoding="utf-8").strip()
        else:
            origin = ""
        grave = CacheEntry(
            path=str(plot),
            url=origin,
            size_bytes=_dir_size(plot),
            last_used=plot.stat().st_mtime,
        )
        graves.append(grave)

    # Stable sort: entries with equal timestamps keep their path order.
    graves.sort(key=lambda grave: grave.last_used)
    return graves
94
+
95
+
96
def mark_grave(repo_path: Path, url: str) -> None:
    """Write the source URL marker into the repo and touch the repo directory."""
    marker = repo_path / ".git-reaper-url"
    marker.write_text(url + "\n", encoding="utf-8")
    # Bump the directory timestamps to "now".
    os.utime(repo_path)
100
+
101
+
102
def banish(older_than_seconds: float | None = None) -> BanishResult:
    """Delete cached repos and report what happened.

    Args:
        older_than_seconds: When given, only entries whose last-used time is
            at or before ``now - older_than_seconds`` are removed; newer ones
            are kept. When ``None``, every entry is a candidate.

    Returns:
        A ``BanishResult`` listing removed and kept entries and the bytes
        reclaimed. Entries whose removal raises ``OSError`` count as kept.
    """
    outcome = BanishResult()
    if older_than_seconds is None:
        cutoff = None
    else:
        cutoff = time.time() - older_than_seconds

    for grave in list_graves():
        too_fresh = cutoff is not None and grave.last_used > cutoff
        if too_fresh:
            outcome.kept.append(grave)
            continue
        try:
            fsutil.force_rmtree(grave.path)
        except OSError:
            # A grave we cannot dig up (locked file?) is kept, not "removed".
            outcome.kept.append(grave)
        else:
            outcome.removed.append(grave)
            outcome.reclaimed_bytes += grave.size_bytes
    return outcome
119
+
120
+
121
# "<digits><unit>" with optional surrounding/inner whitespace; unit is one of s, m, h, d, w.
_AGE_RE = re.compile(r"^\s*(\d+)\s*([smhdw])\s*$", re.IGNORECASE)
# Seconds represented by each unit letter.
_AGE_UNITS = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800}


def parse_age(text: str) -> float:
    """Convert an age such as '7d', '12h' or '90m' into seconds.

    Raises:
        ValueError: If ``text`` is not a whole number followed by a unit letter.
    """
    parsed = _AGE_RE.match(text)
    if parsed is None:
        raise ValueError(f"unreadable age: {text!r} (try '7d', '12h', '30m')")
    count = int(parsed.group(1))
    seconds_per_unit = _AGE_UNITS[parsed.group(2).lower()]
    return count * seconds_per_unit
git_reaper/cli.py ADDED
@@ -0,0 +1,340 @@
1
+ """The CLI face of the reaper: a thin Typer adapter over git_reaper.core.
2
+
3
+ Rules of the house:
4
+ - Artifacts go to --out or stdout. Narration goes to stderr, always.
5
+ - Every themed message still carries the plain cause and a next step.
6
+ - Exit codes: 0 rest in peace, 1 the ritual failed, 2 bad incantation.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import shlex
13
+ import sys
14
+ from dataclasses import dataclass
15
+ from pathlib import Path
16
+
17
+ import typer
18
+ from rich.console import Console
19
+ from rich.markup import escape
20
+ from rich.table import Table
21
+
22
+ from git_reaper import __version__, art, cache, fsutil, schemas
23
+ from git_reaper.core import harvest as harvest_core
24
+ from git_reaper.core import pulse as pulse_core
25
+ from git_reaper.core import tree as tree_core
26
+ from git_reaper.core.source import resolve_source
27
+ from git_reaper.formatters import jsonfmt, markdown
28
+ from git_reaper.gitio import GitError
29
+ from git_reaper.theme import make_console, theme_enabled
30
+
31
# Root Typer application; the commands below register on it via @app.command.
app = typer.Typer(
    name="reaper",
    help="A spooky utility for data mining git repositories.",
    no_args_is_help=True,
    # Accept -h as well as --help.
    context_settings={"help_option_names": ["-h", "--help"]},
)
37
+
38
+
39
@dataclass
class State:
    """Global CLI settings, assigned by the `main` callback."""

    # True when --plain / --no-theme was passed.
    plain: bool = False
    verbosity: int = 0  # -1 whisper, 0 default, 1 moan, 2 shriek
    # Placeholder until `main` installs the real console built by make_console.
    console: Console = None  # type: ignore[assignment]


# Single module-level instance read by the helpers and commands below.
state = State()
47
+
48
+
49
def _say(style: str, message: str, level: int = 0) -> None:
    """Print a tagged narration line on the shared console.

    The line is dropped when the current verbosity is below ``level``. The
    message is escaped so that it is shown as text, not parsed as Rich markup;
    the style name itself is printed as a literal ``[style]`` tag in that style.
    """
    if state.verbosity < level:
        return
    tag = f"[{style}]\\[{style}][/{style}]"
    state.console.print(f"{tag} {escape(message)}")
53
+
54
+
55
def _die(message: str, hint: str | None = None) -> typer.Exit:
    """Narrate a failure (and an optional next step) and build the exit.

    The exception is returned, not raised, so callers write ``raise _die(...)``.
    The exit code is always 1.
    """
    narration = [("blood", f"the ritual failed: {message}")]
    if hint:
        narration.append(("ash", f"next step: {hint}"))
    for style, line in narration:
        _say(style, line)
    return typer.Exit(code=1)
60
+
61
+
62
+ def _invocation() -> str:
63
+ return "reaper " + " ".join(shlex.quote(a) for a in sys.argv[1:])
64
+
65
+
66
+ def _emit(text: str, out: Path | None) -> None:
67
+ """Write an artifact to --out or stdout. Chatter never comes here."""
68
+ if out:
69
+ out.parent.mkdir(parents=True, exist_ok=True)
70
+ out.write_text(text, encoding="utf-8")
71
+ else:
72
+ sys.stdout.write(text)
73
+
74
+
75
def _print_schema(command: str) -> None:
    """Write the JSON schema registered for ``command`` to stdout, indented by 2."""
    model = schemas.COMMAND_MODELS[command]
    rendered = json.dumps(schemas.schema_for(model), indent=2)
    sys.stdout.write(rendered + "\n")
78
+
79
+
80
def _validate_format(fmt: str) -> None:
    """Accept only 'md' or 'json'; anything else ends the run through `_die`."""
    if fmt in ("md", "json"):
        return
    raise _die(f"unknown format {fmt!r}", "use --format md or --format json")
83
+
84
+
85
def _version_callback(value: bool) -> None:
    """Eager --version handler: print the version to stdout and exit."""
    if not value:
        return
    sys.stdout.write(f"git-reaper {__version__}\n")
    raise typer.Exit()
89
+
90
+
91
@app.callback()
def main(
    plain: bool = typer.Option(
        False, "--plain", "--no-theme", help="Clean ASCII output; no color, no art."
    ),
    whisper: bool = typer.Option(False, "-q", "--whisper", help="Only errors."),
    # NOTE(review): the help text mentions --shriek, but no such option name is declared here.
    verbose: int = typer.Option(
        0, "-v", "--moan", count=True, help="More narration; -vv (--shriek) for debug."
    ),
    # Handled entirely by _version_callback; the value is not used in the body.
    version: bool = typer.Option(
        False, "--version", callback=_version_callback, is_eager=True, help="Print version."
    ),
) -> None:
    # App-level callback: copy the global options into the shared `state`.
    state.plain = plain
    # --whisper overrides any -v count and forces verbosity to -1.
    state.verbosity = -1 if whisper else verbose
    state.console = make_console(plain=plain, quiet=False)
107
+
108
+
109
def _banner() -> None:
    """Print the mini skull, unless theming is off or verbosity is below 0."""
    themed = theme_enabled(state.plain)
    if not themed or state.verbosity < 0:
        return
    state.console.print(f"[eldritch]{art.MINI_SKULL}[/eldritch]", highlight=False)
112
+
113
+
114
+ # --------------------------------------------------------------------------
115
+ # harvest (reap)
116
+ # --------------------------------------------------------------------------
117
+
118
+
119
def _harvest_impl(
    source: str,
    pattern: list[str],
    exclude: list[str],
    out: Path | None,
    ref: str | None,
    depth: int,
    max_file_size: str | None,
    max_total_size: str | None,
    include_binary: bool,
) -> None:
    """Gather files matching a pattern and concatenate them into one artifact.

    Steps: parse the size caps, resolve the source, run the harvest, narrate
    the totals and every skipped file, then write the Markdown artifact to
    ``out`` or stdout.

    Args:
        source: Local path or repo URL.
        pattern: Globs to gather; empty means ``harvest_core.DEFAULT_PATTERNS``.
        exclude: Globs to skip.
        out: Output file, or ``None`` for stdout.
        ref: Ref passed through to ``resolve_source``.
        depth: Clone depth passed through to ``resolve_source``.
        max_file_size: Per-file cap as text (parsed by ``fsutil.parse_size``).
        max_total_size: Total cap as text (parsed by ``fsutil.parse_size``).
        include_binary: When True, binary files are not skipped.

    Raises:
        typer.Exit: With code 1 (via ``_die``) on an unreadable size, an
            unresolvable source, or an exceeded total-size cap.
    """
    _banner()
    try:
        file_cap = fsutil.parse_size(max_file_size) if max_file_size else None
        total_cap = fsutil.parse_size(max_total_size) if max_total_size else None
    except ValueError as exc:
        raise _die(str(exc)) from exc

    try:
        resolved = resolve_source(source, ref=ref, depth=depth)
    except (FileNotFoundError, ValueError, GitError) as exc:
        raise _die(str(exc), "check the path or URL; `reaper pulse` checks your setup") from exc
    if resolved.cached:
        _say("necro", f"catacombs hit: {resolved.repo.source} already interred, reusing")

    patterns = tuple(pattern) if pattern else harvest_core.DEFAULT_PATTERNS
    try:
        result = harvest_core.harvest(
            resolved.repo,
            patterns=patterns,
            excludes=exclude,
            max_file_size=file_cap,
            max_total_size=total_cap,
            include_binary=include_binary,
            invoked=_invocation(),
        )
    except harvest_core.CapExceeded as exc:
        raise _die(str(exc)) from exc

    _say(
        "necro",
        f"gathered {len(result.files)} souls ({', '.join(patterns)}) ... "
        f"{result.total_lines:,} lines, {fsutil.human_size(result.total_bytes)}",
    )
    for skipped in result.skipped:
        _say("ember", f"skipped {skipped.path}: {skipped.skip_reason}", level=0)

    if out:
        # Create missing parent directories first, as _emit does for the other
        # commands, so `--out new/dir/file.md` works here too.
        out.parent.mkdir(parents=True, exist_ok=True)
        with out.open("w", encoding="utf-8") as fh:
            markdown.write_harvest(result, fh)
        _say(
            "bone",
            f"wrote {out} ({len(result.files)} files, ~{result.token_estimate:,} tokens)",
        )
    else:
        markdown.write_harvest(result, sys.stdout)
    _say("ash", "the reaping is complete.")
177
+
178
+
179
@app.command("harvest")
def harvest_cmd(
    source: str | None = typer.Argument(None, help="Local path or repo URL."),
    pattern: list[str] = typer.Option(
        [], "--pattern", "--glob", "-p", help="Glob(s) to gather (default: *.md)."
    ),
    exclude: list[str] = typer.Option([], "--exclude", "-x", help="Glob(s) to skip."),
    out: Path | None = typer.Option(None, "--out", "-o", help="Output file (default stdout)."),
    ref: str | None = typer.Option(None, "--ref", help="Branch, tag, or sha (remote sources)."),
    depth: int = typer.Option(1, "--depth", help="Clone depth for remote sources."),
    max_file_size: str | None = typer.Option(
        None, "--max-file-size", help="Skip files larger than this (e.g. 1MB)."
    ),
    max_total_size: str | None = typer.Option(
        None, "--max-total-size", help="Abort past this total (e.g. 100MB)."
    ),
    include_binary: bool = typer.Option(False, "--include-binary", help="Do not skip binaries."),
    schema: bool = typer.Option(False, "--schema", help="Print the JSON schema and exit."),
) -> None:
    """Gather files matching a pattern into one flat artifact."""
    # --schema short-circuits everything else, which is why `source` is optional.
    if schema:
        _print_schema("harvest")
        return
    if source is None:
        raise _die("no source given", "pass a local path or a repo URL")
    # Hand the validated options over to the implementation by name.
    _harvest_impl(
        source=source,
        pattern=pattern,
        exclude=exclude,
        out=out,
        ref=ref,
        depth=depth,
        max_file_size=max_file_size,
        max_total_size=max_total_size,
        include_binary=include_binary,
    )
215
+
216
+
217
+ # --------------------------------------------------------------------------
218
+ # tree (map)
219
+ # --------------------------------------------------------------------------
220
+
221
+
222
@app.command("tree")
def tree_cmd(
    source: str = typer.Argument(".", help="Local path or repo URL."),
    depth: int | None = typer.Option(None, "--depth", "-d", help="Max depth."),
    dirs_only: bool = typer.Option(False, "--dirs-only", help="Directories only."),
    sizes: bool = typer.Option(False, "--sizes", help="Show file sizes."),
    lines: bool = typer.Option(False, "--lines", help="Show line counts."),
    exclude: list[str] = typer.Option([], "--exclude", "-x", help="Glob(s) to skip."),
    fmt: str = typer.Option("md", "--format", "-f", help="md or json."),
    out: Path | None = typer.Option(None, "--out", "-o", help="Output file (default stdout)."),
    ref: str | None = typer.Option(None, "--ref", help="Branch, tag, or sha (remote sources)."),
    schema: bool = typer.Option(False, "--schema", help="Print the JSON schema and exit."),
) -> None:
    """Emit a hierarchical file listing."""
    if schema:
        _print_schema("tree")
        return
    _validate_format(fmt)
    _banner()
    try:
        resolved = resolve_source(source, ref=ref)
    except (FileNotFoundError, ValueError, GitError) as exc:
        raise _die(str(exc), "check the path or URL; `reaper pulse` checks your setup") from exc

    # Here --depth limits the listing; it is not passed to resolve_source.
    listing = tree_core.tree(
        resolved.repo,
        max_depth=depth,
        dirs_only=dirs_only,
        with_sizes=sizes,
        with_lines=lines,
        excludes=exclude,
        invoked=_invocation(),
    )
    _say("necro", f"mapped {listing.dir_count} crypts, {listing.file_count} souls")

    # Render first, then deliver through the single artifact sink.
    if fmt == "json":
        rendered = jsonfmt.render(listing)
    else:
        rendered = markdown.render_tree(listing, with_sizes=sizes, with_lines=lines)
    _emit(rendered, out)
260
+
261
+
262
+ # --------------------------------------------------------------------------
263
+ # pulse (doctor)
264
+ # --------------------------------------------------------------------------
265
+
266
+
267
@app.command("pulse")
def pulse_cmd(
    fmt: str = typer.Option("md", "--format", "-f", help="md or json."),
    schema: bool = typer.Option(False, "--schema", help="Print the JSON schema and exit."),
) -> None:
    """Signs-of-life check: git, extras, cache health."""
    if schema:
        _print_schema("pulse")
        return
    _validate_format(fmt)
    report = pulse_core.pulse()

    if fmt == "json":
        _emit(jsonfmt.render(report), None)
    else:
        # One table row per check: name, ok/DEAD mark, detail.
        table = Table(title="signs of life", title_style="eldritch", border_style="grave")
        table.add_column("check", style="bone")
        table.add_column("", justify="center")
        table.add_column("detail", style="ash")
        for check in report.checks:
            if check.ok:
                verdict = "[necro]ok[/necro]"
            else:
                verdict = "[blood]DEAD[/blood]"
            # escape: details like "[git] extra" are text, not Rich markup
            table.add_row(check.name, verdict, escape(check.detail))
        state.console.print(table)

    # Any failing check turns into exit code 1, whatever the output format.
    if not report.ok:
        _say("blood", "the patient is unwell; fix the DEAD rows above")
        raise typer.Exit(code=1)
    _say("necro", "there is a pulse. faint, but there.")
294
+
295
+
296
+ # --------------------------------------------------------------------------
297
+ # banish (purge)
298
+ # --------------------------------------------------------------------------
299
+
300
+
301
@app.command("banish")
def banish_cmd(
    older_than: str | None = typer.Option(
        None, "--older-than", help="Only clear graves older than this (e.g. 7d, 12h)."
    ),
    schema: bool = typer.Option(False, "--schema", help="Print the JSON schema and exit."),
) -> None:
    """Clear the catacombs (the clone cache)."""
    if schema:
        _print_schema("banish")
        return
    _banner()

    # No --older-than means no age limit: every cached repo is a candidate.
    age = None
    if older_than:
        try:
            age = cache.parse_age(older_than)
        except ValueError as exc:
            raise _die(str(exc)) from exc

    outcome = cache.banish(older_than_seconds=age)
    # Per-entry lines only appear at verbosity 1 and above.
    for grave in outcome.removed:
        _say("ember", f"banished {grave.url or grave.path}", level=1)
    _say(
        "necro",
        f"banished {len(outcome.removed)} graves, kept {len(outcome.kept)}, "
        f"reclaimed {fsutil.human_size(outcome.reclaimed_bytes)}",
    )
325
+
326
+
327
+ # --------------------------------------------------------------------------
328
+ # easter egg
329
+ # --------------------------------------------------------------------------
330
+
331
+
332
@app.command("boo", hidden=True)
def boo_cmd() -> None:
    """A random piece from the gallery."""
    # Use the same gate as _banner rather than state.plain alone: the art
    # module asks every caller to honor --plain / NO_COLOR through
    # theme.theme_enabled. NOTE(review): theme_enabled is defined outside this
    # file view; confirm it covers NO_COLOR as that docstring says.
    if theme_enabled(state.plain):
        state.console.print(f"[eldritch]{art.boo()}[/eldritch]", highlight=False)
337
+
338
+
339
def run() -> None:  # pragma: no cover - console-script shim
    """Invoke the Typer app; exists so a console script has a plain function to call."""
    app()
@@ -0,0 +1 @@
1
+ """The engine. Everything here returns models, never formatted strings."""
@@ -0,0 +1,81 @@
1
+ """The flagship: gather files matching patterns and prepare them for
2
+ concatenation. Returns a HarvestResult; rendering lives in formatters/."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import fnmatch
7
+ from pathlib import Path
8
+
9
+ from git_reaper import fsutil, schemas
10
+ from git_reaper.core.provenance import make_provenance
11
+ from git_reaper.ignore import IgnoreMatcher, walk_files
12
+ from git_reaper.models import FileEntry, HarvestResult, RepoRef
13
+
14
+ DEFAULT_PATTERNS = ("*.md",)
15
+
16
+
17
# Raised by harvest() when adding the next file would push the running total
# past max_total_size; the message names the cap and the offending file.
class CapExceeded(RuntimeError):
    """The total size cap was hit. The message says exactly where."""
19
+
20
+
21
+ def _matches(rel_path: str, patterns: tuple[str, ...]) -> bool:
22
+ name = rel_path.rsplit("/", 1)[-1]
23
+ return any(fnmatch.fnmatch(name, pat) or fnmatch.fnmatch(rel_path, pat) for pat in patterns)
24
+
25
+
26
def harvest(
    repo: RepoRef,
    patterns: tuple[str, ...] = DEFAULT_PATTERNS,
    excludes: list[str] | None = None,
    max_file_size: int | None = None,
    max_total_size: int | None = None,
    include_binary: bool = False,
    invoked: str = "reaper harvest",
    generated: str | None = None,
) -> HarvestResult:
    """Collect every file under the source that matches ``patterns``.

    Files over ``max_file_size`` and (unless ``include_binary``) binary files
    are not dropped silently: each is recorded in ``result.skipped`` with its
    reason. Accepted files add to the byte and line totals.

    Raises:
        CapExceeded: When accepting the next file would exceed ``max_total_size``.
    """
    root = Path(repo.path)
    matcher = IgnoreMatcher(root, extra_excludes=excludes)
    stamp = make_provenance(schemas.artifact_schema("harvest"), repo, invoked, generated)
    result = HarvestResult(provenance=stamp, root=str(root))

    for path in walk_files(root, matcher):
        rel = path.relative_to(root).as_posix()
        if not _matches(rel, patterns):
            continue
        size = path.stat().st_size

        # Decide whether the file is skipped; the size cap is checked before
        # the binary probe, so oversized files are never probed.
        skip_reason = None
        if max_file_size is not None and size > max_file_size:
            skip_reason = f"over size cap ({fsutil.human_size(size)})"
        elif not include_binary and fsutil.is_binary(path):
            skip_reason = "binary"
        if skip_reason is not None:
            result.skipped.append(
                FileEntry(path=rel, size_bytes=size, skipped=True, skip_reason=skip_reason)
            )
            continue

        # The total cap is enforced before the file is counted.
        if max_total_size is not None and result.total_bytes + size > max_total_size:
            raise CapExceeded(
                f"total size cap {fsutil.human_size(max_total_size)} reached at {rel}; "
                "raise --max-total-size or narrow the pattern"
            )

        accepted = FileEntry(path=rel, size_bytes=size, line_count=fsutil.count_lines(path))
        result.files.append(accepted)
        result.total_bytes += size
        result.total_lines += accepted.line_count

    # Copy the final counts into the provenance stamp.
    result.token_estimate = fsutil.estimate_tokens(result.total_bytes)
    result.provenance.files = len(result.files)
    result.provenance.token_estimate = result.token_estimate
    return result
@@ -0,0 +1,27 @@
1
+ """Provenance stamps: every combined artifact says where it came from."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime, timezone
6
+
7
+ from git_reaper import __version__
8
+ from git_reaper.models import Provenance, RepoRef
9
+
10
+
11
def make_provenance(
    schema: str,
    repo: RepoRef,
    invoked: str,
    generated: str | None = None,
) -> Provenance:
    """Assemble the provenance stamp for an artifact.

    ``generated`` may be injected (tests use this to stay deterministic). When
    it is missing or empty, the current UTC time is used, formatted as
    ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    if generated:
        timestamp = generated
    else:
        timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    return Provenance(
        schema=schema,
        source=repo.source,
        ref=repo.ref,
        sha=repo.sha,
        generated=timestamp,
        tool_version=__version__,
        invoked=invoked,
    )