whycode-cli 0.4.2__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {whycode_cli-0.4.2/src/whycode_cli.egg-info → whycode_cli-0.5.2}/PKG-INFO +27 -1
  2. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/README.md +26 -0
  3. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/pyproject.toml +1 -1
  4. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/__init__.py +1 -1
  5. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/cli.py +100 -13
  6. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/git_facts.py +236 -0
  7. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/risk_card.py +139 -16
  8. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/signals.py +215 -1
  9. {whycode_cli-0.4.2 → whycode_cli-0.5.2/src/whycode_cli.egg-info}/PKG-INFO +27 -1
  10. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/tests/test_cli.py +95 -0
  11. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/tests/test_git_facts.py +89 -0
  12. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/tests/test_signals.py +175 -0
  13. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/LICENSE +0 -0
  14. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/setup.cfg +0 -0
  15. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/__main__.py +0 -0
  16. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/cache.py +0 -0
  17. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/decisions.py +0 -0
  18. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/ignore.py +0 -0
  19. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/llm.py +0 -0
  20. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/mcp_server.py +0 -0
  21. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/scorer.py +0 -0
  22. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/suppressions.py +0 -0
  23. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/templates/__init__.py +0 -0
  24. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/templates/github-workflow.yml +0 -0
  25. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/templates/pre-commit +0 -0
  26. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode_cli.egg-info/SOURCES.txt +0 -0
  27. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode_cli.egg-info/dependency_links.txt +0 -0
  28. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode_cli.egg-info/entry_points.txt +0 -0
  29. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode_cli.egg-info/requires.txt +0 -0
  30. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode_cli.egg-info/top_level.txt +0 -0
  31. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/tests/test_cache.py +0 -0
  32. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/tests/test_decisions.py +0 -0
  33. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/tests/test_ignore.py +0 -0
  34. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/tests/test_mcp_prompts.py +0 -0
  35. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/tests/test_scorer.py +0 -0
  36. {whycode_cli-0.4.2 → whycode_cli-0.5.2}/tests/test_suppressions.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: whycode-cli
3
- Version: 0.4.2
3
+ Version: 0.5.2
4
4
  Summary: Tells you what to be afraid of before you touch a file.
5
5
  Author: Kevin
6
6
  License-Expression: MIT
@@ -140,6 +140,32 @@ Score interpretation:
140
140
  | 25–49 | WORTH A LOOK | One thing might bite you. Glance. |
141
141
  | 0–24 | NO FLAGS | Quiet history — but read the diff anyway. |
142
142
 
143
+ ### "But why exactly did this fire?" — `--explain`
144
+
145
+ When a signal looks wrong (or you just want to understand the reasoning
146
+ before trusting the tool), pass `--explain`. Each fired signal grows a
147
+ small block naming the precise rule that produced it, the literal evidence
148
+ the rule looked at, and the source location of the ladder branch:
149
+
150
+ ```
151
+ $ whycode why src/payment/refund.py --explain
152
+
153
+ MED 1 incident-flagged change in history
154
+ 1 commit matched incident keywords (latest 12 days ago:
155
+ 'hotfix: idempotency token regression').
156
+ evidence: a3f4b2c
157
+ ─ rule: incident_subject_keyword src/whycode/git_facts.py:find_incidents
158
+ fired because: subject 'hotfix: idempotency token regression'
159
+ matched the literal token 'hotfix'
160
+ evidence: hotfix
161
+ ```
162
+
163
+ Without `--explain`, output is exactly as before — this is purely an
164
+ opt-in transparency surface. `--explain --json` adds an `explanation`
165
+ key per signal in the JSON output, with the same fields. The flag covers
166
+ L1+L2 detectors only; if you also pass `--llm`, the L3 decision block is
167
+ unaffected.
168
+
143
169
  ## The killer use case: hand it to your AI editor
144
170
 
145
171
  WhyCode is also an MCP server. Configure it in any MCP-aware editor or
@@ -110,6 +110,32 @@ Score interpretation:
110
110
  | 25–49 | WORTH A LOOK | One thing might bite you. Glance. |
111
111
  | 0–24 | NO FLAGS | Quiet history — but read the diff anyway. |
112
112
 
113
+ ### "But why exactly did this fire?" — `--explain`
114
+
115
+ When a signal looks wrong (or you just want to understand the reasoning
116
+ before trusting the tool), pass `--explain`. Each fired signal grows a
117
+ small block naming the precise rule that produced it, the literal evidence
118
+ the rule looked at, and the source location of the ladder branch:
119
+
120
+ ```
121
+ $ whycode why src/payment/refund.py --explain
122
+
123
+ MED 1 incident-flagged change in history
124
+ 1 commit matched incident keywords (latest 12 days ago:
125
+ 'hotfix: idempotency token regression').
126
+ evidence: a3f4b2c
127
+ ─ rule: incident_subject_keyword src/whycode/git_facts.py:find_incidents
128
+ fired because: subject 'hotfix: idempotency token regression'
129
+ matched the literal token 'hotfix'
130
+ evidence: hotfix
131
+ ```
132
+
133
+ Without `--explain`, output is exactly as before — this is purely an
134
+ opt-in transparency surface. `--explain --json` adds an `explanation`
135
+ key per signal in the JSON output, with the same fields. The flag covers
136
+ L1+L2 detectors only; if you also pass `--llm`, the L3 decision block is
137
+ unaffected.
138
+
113
139
  ## The killer use case: hand it to your AI editor
114
140
 
115
141
  WhyCode is also an MCP server. Configure it in any MCP-aware editor or
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "whycode-cli"
7
- version = "0.4.2"
7
+ version = "0.5.2"
8
8
  description = "Tells you what to be afraid of before you touch a file."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -1,3 +1,3 @@
1
1
  """WhyCode — tells you what to be afraid of before touching a file."""
2
2
 
3
- __version__ = "0.4.2"
3
+ __version__ = "0.5.2"
@@ -20,10 +20,11 @@ Commands
20
20
 
21
21
  from __future__ import annotations
22
22
 
23
+ import contextlib
23
24
  import functools
24
25
  import json
25
26
  import sys
26
- from collections.abc import Callable
27
+ from collections.abc import Callable, Iterator
27
28
  from pathlib import Path
28
29
  from typing import Any, TypeVar
29
30
 
@@ -126,6 +127,42 @@ def _require_tracked(path_arg: str) -> tuple[Path, str]:
126
127
  return repo_root, rel
127
128
 
128
129
 
130
@contextlib.contextmanager
def _memoised_is_ignored(repo_root: Path) -> Iterator[None]:
    """Temporarily replace ``ign.is_ignored`` with a per-path memoising wrapper.

    On entry the repo's effective ignore patterns are resolved once via
    ``ign.effective_patterns(repo_root)`` and ``ign.is_ignored`` is rebound
    to a wrapper that remembers each path's verdict in a dict. The diff
    command applies the same ignore test to the same candidate paths many
    times over, so repeat questions become dict lookups.

    Only calls made with the very pattern object resolved at entry (checked
    by identity, and also the wrapper's default) are memoised; a call that
    passes any other pattern object is forwarded to the original function
    untouched. The original function is put back in a ``finally`` clause,
    so the swap is undone even when the ``with`` body raises.
    """
    repo_patterns = ign.effective_patterns(repo_root)
    verdicts: dict[str, bool] = {}
    unpatched = ign.is_ignored

    def memoised(path: str, patterns_arg: object = repo_patterns) -> bool:
        # A foreign pattern set would make the memo key ambiguous, so those
        # calls bypass the memo entirely.
        if patterns_arg is not repo_patterns:
            return unpatched(path, patterns_arg)  # type: ignore[arg-type]
        verdict = verdicts.get(path)
        if verdict is None:
            verdict = unpatched(path, repo_patterns)
            verdicts[path] = verdict
        return verdict

    ign.is_ignored = memoised  # type: ignore[assignment]
    try:
        yield
    finally:
        ign.is_ignored = unpatched
164
+
165
+
129
166
  _F = TypeVar("_F", bound=Callable[..., Any])
130
167
 
131
168
 
@@ -240,6 +277,15 @@ def why(
240
277
  "--no-cache",
241
278
  help="Bypass the local SQLite cache at .whycode/cache.db.",
242
279
  ),
280
+ explain: bool = typer.Option(
281
+ False,
282
+ "--explain",
283
+ help=(
284
+ "Below each signal, print the precise rule that fired: the literal "
285
+ "matched tokens, threshold values, and the source location of the "
286
+ "ladder branch. L1+L2 only — L3 (--llm) decisions are not annotated."
287
+ ),
288
+ ),
243
289
  ) -> None:
244
290
  """Print the Risk Card for ``path``."""
245
291
  repo_root, rel = _require_tracked(path)
@@ -328,12 +374,12 @@ def why(
328
374
  card = card.with_decisions(tuple(decisions))
329
375
 
330
376
  if json_out:
331
- console.print_json(json.dumps(card.to_dict()))
377
+ console.print_json(json.dumps(card.to_dict(explain=explain)))
332
378
  return
333
379
  if brief:
334
380
  _print_brief(card)
335
381
  return
336
- console.print(rc.render_text(card))
382
+ console.print(rc.render_text(card, explain=explain))
337
383
  finally:
338
384
  if cache is not None:
339
385
  cache.close()
@@ -428,16 +474,57 @@ def diff(
428
474
 
429
475
  cache = _open_cache(repo_root, no_cache)
430
476
  try:
431
- cards: list[rc.RiskCard] = []
432
- for f in files:
433
- try:
434
- cards.append(rc.build(repo_root, f, cache=cache))
435
- except gf.GitError:
436
- continue
437
- # Stable tie-break: lex smallest path on identical scores so cache
438
- # and --no-cache truncate the same files at --top N.
439
- cards.sort(key=lambda c: (-c.score.value, c.path))
440
- cards = cards[:top]
477
+ # One git log walk feeds every changed file's scoring. Without this
478
+ # batched load, diff against an old base on a large repo runs N
479
+ # `git log --follow` calls (one per changed file): on django at 1,927
480
+ # changed files the legacy path measured 6+ minutes, with the
481
+ # 12+ minute variant timing out outright. ``load_diff_facts`` parses
482
+ # one un-pathed walk into a path -> [Commit] map; per-file scoring
483
+ # then does dict lookups instead of re-shelling-out.
484
+ try:
485
+ diff_facts = gf.load_diff_facts(repo_root, cache=cache)
486
+ except gf.GitError as exc:
487
+ err.print(f"[red]error:[/red] {exc}")
488
+ raise typer.Exit(2) from exc
489
+ # Pre-compute the ignore-pattern set ONCE and a verdict-per-path
490
+ # memo. ``signals.detect_coupling`` (re-introduced in 0.4.1 as F10)
491
+ # filters every coupling candidate through ``ign.is_ignored`` —
492
+ # without memoisation that's 83 patterns x 700 candidates x 1,927
493
+ # files = ~100 CPU-seconds across the diff. The memo cache turns
494
+ # each path's verdict into a dict lookup after the first hit.
495
+ with _memoised_is_ignored(repo_root):
496
+ # First pass: every changed file is scored without the
497
+ # ghost-keeper detector, which would otherwise fire ``git
498
+ # blame`` per file. With 1,927 changed files on django this
499
+ # single deferral saves ~5 minutes. We then sort and
500
+ # re-evaluate only the top-N with full signals — at most
501
+ # ``top`` blame calls instead of ``len(files)``.
502
+ prelim: list[rc.RiskCard] = []
503
+ for f in files:
504
+ try:
505
+ prelim.append(
506
+ rc.build_from_diff_facts(diff_facts, f, skip_ghost_keeper=True)
507
+ )
508
+ except gf.GitError:
509
+ continue
510
+ # Stable tie-break (from 0.4.2): lex smallest path on identical
511
+ # scores so cache and --no-cache truncate the same files at --top N.
512
+ prelim.sort(key=lambda c: (-c.score.value, c.path))
513
+ # Second pass: re-score the top-N with the full detector ladder
514
+ # so the rendered table includes ghost-keeper findings where
515
+ # they apply. Files outside the top-N keep their first-pass
516
+ # score; they were not going to appear in the user's view
517
+ # anyway.
518
+ refined_top: list[rc.RiskCard] = []
519
+ for prelim_card in prelim[:top]:
520
+ try:
521
+ refined_top.append(
522
+ rc.build_from_diff_facts(diff_facts, prelim_card.path)
523
+ )
524
+ except gf.GitError:
525
+ refined_top.append(prelim_card)
526
+ cards = refined_top
527
+ cards.sort(key=lambda c: (-c.score.value, c.path))
441
528
  finally:
442
529
  if cache is not None:
443
530
  cache.close()
@@ -735,6 +735,242 @@ def _populate_diffstat_cache(
735
735
  cache.upsert_commit_files(rows)
736
736
 
737
737
 
738
+ # ---- batch loading for whycode diff ---------------------------------------
739
+
740
+
741
@dataclass(frozen=True)
class DiffFacts:
    """Whole-repo history snapshot shared by every file in one ``whycode diff`` run.

    Instead of issuing one path-restricted ``git log --follow`` (plus a
    co-change diffstat pass) per changed file, the diff command loads a
    single un-pathed ``git log --no-merges --numstat`` walk and parses it
    into the two maps below. Per-file scoring then reads from these maps
    rather than shelling out again.

    Renames are deliberately not followed: history recorded under a file's
    earlier name stays filed under that earlier name. That is the accepted
    price for scoring a very large diff with a single log walk.
    """

    # Root of the repository the snapshot was taken from.
    repo_root: Path

    # ``path -> [Commit]``, newest first, optionally capped per path by the
    # loader. A missing key means the walk never saw the path;
    # ``gather_for_diff`` reads it with a ``[]`` default, i.e. "no history".
    commits_by_path: dict[str, list[Commit]]

    # ``commit_sha -> tuple of paths touched by that commit``, taken from the
    # same numstat parse as ``commits_by_path``. Used for in-memory coupling
    # counts so no per-file git call is needed.
    co_change_index: dict[str, tuple[str, ...]]

    # Optional cache handle, carried along so it can be handed to the
    # per-file facts built from this snapshot.
    cache: CacheStore | None = None
783
+
784
+
785
+ _NUMSTAT_LINE_RE = re.compile(r"^(\d+|-)\t(\d+|-)\t(.+)$")
786
+
787
+
788
def load_diff_facts(
    repo_root: Path,
    *,
    max_commits: int | None = None,
    cache: CacheStore | None = None,
) -> DiffFacts:
    """Build a :class:`DiffFacts` snapshot from one ``git log`` invocation.

    Strategy:
      1. Walk HEAD once with
         ``git log --no-merges --no-renames --numstat --pretty=...``.
      2. Parse each commit and its full file-set into in-memory maps.
      3. Return the snapshot for the diff command's per-file scorer.

    The walk is intentionally un-pathed: one walk that captures every
    commit's file-set replaces N path-restricted walks.

    ``--no-renames`` is passed explicitly. Git's rename detection is on by
    default, and with it a renamed file is printed in numstat as
    ``old => new`` (or ``dir/{old => new}/file``). That text would become a
    map key that matches no real path. With detection off, a rename shows
    up as the old path plus the new path, so history stays filed under the
    name each commit actually used.

    Args:
        repo_root: Repository to walk.
        max_commits: Optional per-path cap, applied *after* the walk, so it
            limits per-file depth without changing the cost of the walk.
        cache: Optional cache. When given and at least one commit was
            parsed, the commits and one presence row per (commit, path) are
            persisted, and the cached HEAD sha is set if not already set.

    Returns:
        The populated :class:`DiffFacts` (carrying ``cache`` along).

    Raises:
        GitError: presumably propagated from the ``git log`` call — the
            diff command wraps this function in ``except gf.GitError``.
            A failing ``rev-parse`` is swallowed here.
    """
    # Pretty format: RECORD_SEP starts each commit; metadata fields are
    # UNIT_SEP-delimited; the body is the last metadata field. The numstat
    # rows git appends after the body need no extra separator — the next
    # commit's leading RECORD_SEP marks the boundary.
    pretty_format = (
        f"{RECORD_SEP}%H{UNIT_SEP}%an{UNIT_SEP}%ae{UNIT_SEP}"
        f"%aI{UNIT_SEP}%s{UNIT_SEP}%b"
    )
    raw = _run_git(
        repo_root,
        "log",
        "--no-merges",
        # Keep numstat paths literal; see the docstring.
        "--no-renames",
        "--numstat",
        f"--pretty=format:{pretty_format}",
    )
    all_commits, commits_by_path, co_change_index = _parse_log_with_files(raw)
    if max_commits is not None:
        commits_by_path = {p: cs[:max_commits] for p, cs in commits_by_path.items()}
    if cache is not None and all_commits:
        _store_commits(cache, all_commits)
        # Persist presence rows so a later run on the same HEAD can reuse
        # this walk. Only the path set of each commit is kept by the parse,
        # so the insertion/deletion columns are written as zero.
        # NOTE(review): if ``upsert_commit_files`` overwrites existing rows,
        # these zeros could replace real widths cached elsewhere — confirm.
        files_rows: list[tuple[str, str, int, int]] = [
            (sha, p, 0, 0)
            for sha, paths in co_change_index.items()
            for p in paths
        ]
        if files_rows:
            cache.upsert_commit_files(files_rows)
        try:
            head_sha = _run_git(repo_root, "rev-parse", "HEAD").strip()
        except GitError:
            # Best effort: without a HEAD sha the cache simply stays unstamped.
            head_sha = ""
        if head_sha and not cache.head_sha:
            cache.set_head_sha(head_sha)
    return DiffFacts(
        repo_root=repo_root,
        commits_by_path=commits_by_path,
        co_change_index=co_change_index,
        cache=cache,
    )
857
+
858
+
859
def _parse_log_with_files(
    raw: str,
) -> tuple[list[Commit], dict[str, list[Commit]], dict[str, tuple[str, ...]]]:
    """Split raw ``git log --numstat`` output into commits and path indexes.

    ``raw`` is expected to be a sequence of records, each introduced by
    ``RECORD_SEP``. The first line of a record holds the ``UNIT_SEP``-joined
    fields ``sha, author name, author email, author date, subject`` followed
    by the first line of the body. Later lines are either more body text or
    numstat rows (``ins TAB del TAB path``, see :data:`_NUMSTAT_LINE_RE`).

    Returns ``(all_commits, commits_by_path, co_change_index)``:
      - ``all_commits``: every commit that parsed, in input order.
      - ``commits_by_path[path]``: the commits whose numstat rows named
        ``path``, in the same order.
      - ``co_change_index[sha]``: the tuple of paths from that commit's
        numstat rows.

    Records with fewer than six header fields, an empty sha, or a date that
    ``_parse_iso`` rejects with ``ValueError`` are skipped silently.
    """
    parsed: list[Commit] = []
    by_path: dict[str, list[Commit]] = {}
    paths_by_sha: dict[str, tuple[str, ...]] = {}

    for chunk in raw.split(RECORD_SEP):
        chunk = chunk.strip("\n")
        if not chunk:
            continue

        header, *tail = chunk.split("\n")
        fields = header.split(UNIT_SEP)
        if len(fields) < 6:
            continue
        raw_sha, author_name, author_email, authored_at, subject, *body_head = fields
        sha = raw_sha.strip()
        if not sha:
            continue

        # A body that itself contains UNIT_SEP was split above; glue it back.
        opening = UNIT_SEP.join(body_head)
        body_lines: list[str] = [opening] if opening else []

        touched: list[str] = []
        for line in tail:
            hit = _NUMSTAT_LINE_RE.match(line)
            if hit is not None:
                touched.append(hit.group(3))
            elif not touched:
                # Still ahead of the first numstat row: this is body prose.
                body_lines.append(line)
            # Otherwise: a non-numstat line after numstat began; dropped.

        try:
            authored = _parse_iso(authored_at)
        except ValueError:
            # One unparseable timestamp should not abort the whole walk;
            # drop just this commit.
            continue

        commit = Commit(
            sha=sha,
            author_name=author_name,
            author_email=author_email,
            authored_at=authored,
            subject=subject,
            body="\n".join(body_lines).strip("\n"),
            files=tuple(touched),
        )
        parsed.append(commit)
        paths_by_sha[sha] = commit.files
        for touched_path in touched:
            by_path.setdefault(touched_path, []).append(commit)

    return parsed, by_path, paths_by_sha
937
+
938
+
939
def gather_for_diff(
    diff_facts: DiffFacts,
    path: str,
    *,
    max_commits: int | None = None,
) -> RepoFacts:
    """Assemble the :class:`RepoFacts` for ``path`` from an in-memory snapshot.

    No git process is started here. The commit list comes from
    ``diff_facts.commits_by_path`` (a path the snapshot never saw yields an
    empty history). Co-change counts are tallied by looking up each of those
    commits in ``diff_facts.co_change_index``.

    Args:
        diff_facts: Snapshot to read from.
        path: Path to build facts for.
        max_commits: Optional cap on how many of the path's commits (taken
            from the front of its list) are considered.

    Returns:
        A ``RepoFacts`` whose revert pairs, incident commits and invariant
        quotes are derived from the selected commits, and which carries the
        snapshot's cache handle.
    """
    history = diff_facts.commits_by_path.get(path, [])
    if max_commits is not None:
        history = history[:max_commits]

    # Every *other* path sharing a commit with ``path`` gets one tick per
    # shared commit.
    neighbours: Counter[str] = Counter(
        other
        for commit in history
        for other in diff_facts.co_change_index.get(commit.sha, ())
        if other != path
    )

    return RepoFacts(
        repo_root=diff_facts.repo_root,
        path=path,
        commits=history,
        co_changed_files=neighbours,
        revert_pairs=find_revert_pairs(history),
        incident_commits=find_incidents(history),
        invariant_quotes=extract_invariant_quotes(history),
        cache=diff_facts.cache,
    )
972
+
973
+
738
974
  _REVERT_PREFIX = 'this reverts commit '
739
975
 
740
976