whycode-cli 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
whycode/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """WhyCode — tells you what to be afraid of before touching a file."""
2
2
 
3
- __version__ = "0.4.1"
3
+ __version__ = "0.5.0"
whycode/cache.py CHANGED
@@ -112,10 +112,21 @@ class CacheStore:
112
112
  cache misses; this class never invokes ``git`` itself.
113
113
  """
114
114
 
115
- def __init__(self, db_path: Path) -> None:
115
+ def __init__(self, db_path: Path, *, in_memory: bool = False) -> None:
116
+ """Open (creating if needed) the SQLite cache at ``db_path``.
117
+
118
+ ``in_memory=True`` opens a transient ``:memory:`` connection
119
+ instead — the disk file is never created and is never read.
120
+ Used by ``--no-cache`` to retain in-session amortisation
121
+ (matches the cold-fill code path) without persisting anything.
122
+ """
116
123
  self.db_path = db_path
117
- self.db_path.parent.mkdir(parents=True, exist_ok=True)
118
- self._conn = sqlite3.connect(self.db_path)
124
+ self._in_memory = in_memory
125
+ if in_memory:
126
+ self._conn = sqlite3.connect(":memory:")
127
+ else:
128
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
129
+ self._conn = sqlite3.connect(self.db_path)
119
130
  # row_factory makes column access readable in tests / debug.
120
131
  self._conn.row_factory = sqlite3.Row
121
132
  self._conn.execute("PRAGMA foreign_keys = ON")
@@ -402,13 +413,18 @@ class CacheStore:
402
413
  file_row_count = int(
403
414
  self._conn.execute("SELECT COUNT(*) FROM commit_files").fetchone()[0]
404
415
  )
405
- try:
406
- size_bytes = self.db_path.stat().st_size
407
- except OSError:
416
+ if self._in_memory:
408
417
  size_bytes = 0
418
+ exists = False
419
+ else:
420
+ try:
421
+ size_bytes = self.db_path.stat().st_size
422
+ except OSError:
423
+ size_bytes = 0
424
+ exists = self.db_path.exists()
409
425
  return CacheStats(
410
426
  path=self.db_path,
411
- exists=self.db_path.exists(),
427
+ exists=exists,
412
428
  schema_version=self.schema_version,
413
429
  head_sha=self.head_sha,
414
430
  commit_count=commit_count,
@@ -430,6 +446,16 @@ def open_for(repo_root: Path) -> CacheStore:
430
446
  return CacheStore(cache_path_for(repo_root))
431
447
 
432
448
 
449
+ def open_in_memory(repo_root: Path) -> CacheStore:
450
+ """Open a transient in-memory cache for ``repo_root``.
451
+
452
+ Used by ``--no-cache`` to keep within-session amortisation (the same
453
+ cold-fill code path everything else uses) while never touching disk.
454
+ The store is destroyed on ``close()`` and has no after-effects.
455
+ """
456
+ return CacheStore(cache_path_for(repo_root), in_memory=True)
457
+
458
+
433
459
  def parse_authored_at(value: str) -> datetime:
434
460
  """Parse the ``authored_at`` string we stored from git.
435
461
 
whycode/cli.py CHANGED
@@ -20,10 +20,11 @@ Commands
20
20
 
21
21
  from __future__ import annotations
22
22
 
23
+ import contextlib
23
24
  import functools
24
25
  import json
25
26
  import sys
26
- from collections.abc import Callable
27
+ from collections.abc import Callable, Iterator
27
28
  from pathlib import Path
28
29
  from typing import Any, TypeVar
29
30
 
@@ -50,18 +51,27 @@ err = Console(stderr=True)
50
51
 
51
52
 
52
53
  def _open_cache(repo_root: Path, no_cache: bool) -> ch.CacheStore | None:
53
- """Open the on-disk cache for ``repo_root`` unless suppressed.
54
-
55
- A None return means "do not pass a cache through git_facts" — every
56
- git-side helper falls back to its original network-free, cache-free
57
- implementation. This is the escape hatch behind ``--no-cache`` and
58
- is also the default when the cache cannot be initialised at all
59
- (read-only filesystem, etc.); we never want a cache failure to
60
- block the main read path.
54
+ """Open the cache for ``repo_root`` according to the no-cache flag.
55
+
56
+ Modes:
57
+ * ``no_cache=False`` (the default): persistent on-disk SQLite at
58
+ ``.whycode/cache.db``.
59
+ * ``no_cache=True``: a transient ``:memory:`` SQLite store. The
60
+ same git-walk code path runs as for the cold-fill, but the
61
+ database is destroyed on ``close()`` — nothing lands on disk
62
+ and the next run starts cold. Keeping per-run amortisation
63
+ (one ``git log`` walk shared across files) is what makes
64
+ ``--no-cache`` at most as slow as a cold persistent fill;
65
+ the previous ``cache=None`` short-circuit lost that and so
66
+ ``--no-cache`` re-issued per-file walks every iteration.
67
+
68
+ A ``None`` return means "do not pass a cache through git_facts".
69
+ Happens only when even an in-memory open fails — very rare and
70
+ we never want a cache problem to block the main read path.
61
71
  """
62
- if no_cache:
63
- return None
64
72
  try:
73
+ if no_cache:
74
+ return ch.open_in_memory(repo_root)
65
75
  return ch.open_for(repo_root)
66
76
  except OSError:
67
77
  return None
@@ -117,6 +127,42 @@ def _require_tracked(path_arg: str) -> tuple[Path, str]:
117
127
  return repo_root, rel
118
128
 
119
129
 
130
+ @contextlib.contextmanager
131
+ def _memoised_is_ignored(repo_root: Path) -> Iterator[None]:
132
+ """Memoise ``ign.is_ignored`` for the duration of the ``with`` block.
133
+
134
+ The diff command's evaluation re-applies the same ``is_ignored`` test
135
+ against thousands of co-change candidates per file. Each call resolves
136
+ fnmatch over ~83 patterns; uncached, that is ~100 CPU-seconds across
137
+ a 1,927-file diff on django.
138
+
139
+ A path's verdict is fully determined by the path string and the
140
+ repo's effective ignore-pattern tuple, so we cache by ``(path,
141
+ patterns)`` for the duration of the diff and restore the original
142
+ function on exit. The cache is process-local; the rest of the CLI
143
+ (``why``, ``scan``, …) sees the un-memoised function. ``ign`` itself
144
+ is unchanged.
145
+ """
146
+ patterns = ign.effective_patterns(repo_root)
147
+ cache: dict[str, bool] = {}
148
+ original = ign.is_ignored
149
+
150
+ def memoised(path: str, patterns_arg: object = patterns) -> bool:
151
+ if patterns_arg is patterns:
152
+ cached = cache.get(path)
153
+ if cached is None:
154
+ cached = original(path, patterns)
155
+ cache[path] = cached
156
+ return cached
157
+ return original(path, patterns_arg) # type: ignore[arg-type]
158
+
159
+ ign.is_ignored = memoised # type: ignore[assignment]
160
+ try:
161
+ yield
162
+ finally:
163
+ ign.is_ignored = original
164
+
165
+
120
166
  _F = TypeVar("_F", bound=Callable[..., Any])
121
167
 
122
168
 
@@ -419,14 +465,57 @@ def diff(
419
465
 
420
466
  cache = _open_cache(repo_root, no_cache)
421
467
  try:
422
- cards: list[rc.RiskCard] = []
423
- for f in files:
424
- try:
425
- cards.append(rc.build(repo_root, f, cache=cache))
426
- except gf.GitError:
427
- continue
428
- cards.sort(key=lambda c: -c.score.value)
429
- cards = cards[:top]
468
+ # One git log walk feeds every changed file's scoring. Without this
469
+ # batched load, diff against an old base on a large repo runs N
470
+ # `git log --follow` calls (one per changed file): on django at 1,927
471
+ # changed files the legacy path measured 6+ minutes, with the
472
+ # 12+ minute variant timing out outright. ``load_diff_facts`` parses
473
+ # one un-pathed walk into a path -> [Commit] map; per-file scoring
474
+ # then does dict lookups instead of re-shelling-out.
475
+ try:
476
+ diff_facts = gf.load_diff_facts(repo_root, cache=cache)
477
+ except gf.GitError as exc:
478
+ err.print(f"[red]error:[/red] {exc}")
479
+ raise typer.Exit(2) from exc
480
+ # Pre-compute the ignore-pattern set ONCE and a verdict-per-path
481
+ # memo. ``signals.detect_coupling`` (re-introduced in 0.4.1 as F10)
482
+ # filters every coupling candidate through ``ign.is_ignored`` —
483
+ # without memoisation that's 83 patterns x 700 candidates x 1,927
484
+ # files = ~100 CPU-seconds across the diff. The memo cache turns
485
+ # each path's verdict into a dict lookup after the first hit.
486
+ with _memoised_is_ignored(repo_root):
487
+ # First pass: every changed file is scored without the
488
+ # ghost-keeper detector, which would otherwise fire ``git
489
+ # blame`` per file. With 1,927 changed files on django this
490
+ # single deferral saves ~5 minutes. We then sort and
491
+ # re-evaluate only the top-N with full signals — at most
492
+ # ``top`` blame calls instead of ``len(files)``.
493
+ prelim: list[rc.RiskCard] = []
494
+ for f in files:
495
+ try:
496
+ prelim.append(
497
+ rc.build_from_diff_facts(diff_facts, f, skip_ghost_keeper=True)
498
+ )
499
+ except gf.GitError:
500
+ continue
501
+ # Stable tie-break (from 0.4.2): lex smallest path on identical
502
+ # scores so cache and --no-cache truncate the same files at --top N.
503
+ prelim.sort(key=lambda c: (-c.score.value, c.path))
504
+ # Second pass: re-score the top-N with the full detector ladder
505
+ # so the rendered table includes ghost-keeper findings where
506
+ # they apply. Files outside the top-N keep their first-pass
507
+ # score; they were not going to appear in the user's view
508
+ # anyway.
509
+ refined_top: list[rc.RiskCard] = []
510
+ for prelim_card in prelim[:top]:
511
+ try:
512
+ refined_top.append(
513
+ rc.build_from_diff_facts(diff_facts, prelim_card.path)
514
+ )
515
+ except gf.GitError:
516
+ refined_top.append(prelim_card)
517
+ cards = refined_top
518
+ cards.sort(key=lambda c: (-c.score.value, c.path))
430
519
  finally:
431
520
  if cache is not None:
432
521
  cache.close()
@@ -565,16 +654,17 @@ def highlights(
565
654
 
566
655
  inv_pairs = gf.extract_invariant_quotes(commits)
567
656
  sha_to_commit = {c.sha: c for c in commits}
568
- seen_lines: dict[str, str] = {}
569
- for sha, line in inv_pairs:
570
- seen_lines.setdefault(line, sha)
657
+ deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
571
658
  inv_records: list[tuple[str, str, gf.Commit]] = []
572
- for line, sha in seen_lines.items():
659
+ for sha, line in deduped:
573
660
  commit = sha_to_commit.get(sha)
574
661
  if commit is None:
575
662
  continue
576
663
  inv_records.append((line, sha, commit))
577
- inv_records.sort(key=lambda t: t[2].authored_at, reverse=True)
664
+ # Sort newest first; on identical timestamps fall back to lexicographically
665
+ # smallest sha so cache and --no-cache emit byte-identical output.
666
+ inv_records.sort(key=lambda t: t[1]) # secondary: sha asc
667
+ inv_records.sort(key=lambda t: t[2].authored_at, reverse=True) # primary
578
668
  inv_records = inv_records[:invariants]
579
669
 
580
670
  incident_records = gf.find_incidents(commits)[:incidents]
@@ -827,7 +917,10 @@ def scan(
827
917
  if cache is not None:
828
918
  cache.close()
829
919
 
830
- cards.sort(key=lambda c: -c.score.value)
920
+ # Stable tie-break on identical scores: lexicographically smallest path
921
+ # so cache and --no-cache produce byte-identical text output for the
922
+ # same HEAD. Without this, the truncation at --top N is non-deterministic.
923
+ cards.sort(key=lambda c: (-c.score.value, c.path))
831
924
  top_cards = cards[:top]
832
925
  if not top_cards:
833
926
  # Be honest about what "no flagged files" actually means. A user who
@@ -949,7 +1042,8 @@ def show(
949
1042
  cards.append(rc.build(repo_root, change.path))
950
1043
  except gf.GitError:
951
1044
  continue
952
- cards.sort(key=lambda c: -c.score.value)
1045
+ # Stable tie-break on identical scores: lex smallest path.
1046
+ cards.sort(key=lambda c: (-c.score.value, c.path))
953
1047
 
954
1048
  if json_out:
955
1049
  console.print_json(
@@ -1065,13 +1159,18 @@ def tour(
1065
1159
 
1066
1160
  inv_pairs = gf.extract_invariant_quotes(commits)
1067
1161
  sha_to_commit = {c.sha: c for c in commits}
1068
- seen_lines: dict[str, str] = {}
1069
- for sha, line in inv_pairs:
1070
- seen_lines.setdefault(line, sha)
1162
+ deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
1163
+ # Sort newest first with sha-asc tie-break so cache and --no-cache
1164
+ # surface the same three lines in the same order.
1165
+ deduped_sorted = sorted(
1166
+ (p for p in deduped if p[0] in sha_to_commit),
1167
+ key=lambda p: p[0],
1168
+ )
1169
+ deduped_sorted.sort(
1170
+ key=lambda p: sha_to_commit[p[0]].authored_at, reverse=True
1171
+ )
1071
1172
  invariants_top = [
1072
- (line, sha_to_commit[sha])
1073
- for line, sha in seen_lines.items()
1074
- if sha in sha_to_commit
1173
+ (line, sha_to_commit[sha]) for sha, line in deduped_sorted
1075
1174
  ][:3]
1076
1175
  incidents_top = gf.find_incidents(commits)[:3]
1077
1176
 
@@ -1135,7 +1234,8 @@ def tour(
1135
1234
  ]
1136
1235
  if useful:
1137
1236
  cards.append(card)
1138
- cards.sort(key=lambda c: -c.score.value)
1237
+ # Stable tie-break: lex smallest path on identical scores.
1238
+ cards.sort(key=lambda c: (-c.score.value, c.path))
1139
1239
 
1140
1240
  if cards:
1141
1241
  console.print("[bold red]Top 3 risky files[/bold red]")
whycode/git_facts.py CHANGED
@@ -735,6 +735,242 @@ def _populate_diffstat_cache(
735
735
  cache.upsert_commit_files(rows)
736
736
 
737
737
 
738
+ # ---- batch loading for whycode diff ---------------------------------------
739
+
740
+
741
+ @dataclass(frozen=True)
742
+ class DiffFacts:
743
+ """A whole-repo snapshot built once for a single ``whycode diff`` evaluation.
744
+
745
+ The diff command scores N changed files; previously each file fired its
746
+ own ``git log --follow`` plus a co-change diffstat pass, so wall-clock
747
+ cost scaled with N. ``DiffFacts`` replaces N path-restricted log walks
748
+ with a single un-pathed walk: one ``git log --no-merges --numstat`` over
749
+ the repo, parsed once into ``commits_by_path`` (every commit that named
750
+ each path) and ``co_change_index`` (each commit's full file-set, used
751
+ for in-memory coupling counts). Per-file scoring then reads from this
752
+ map rather than re-shelling-out.
753
+
754
+ The map deliberately does NOT follow renames: the diff command only
755
+ scores files present in HEAD's working tree, so the tradeoff is "lose
756
+ rename-resolved history pre-rename" against "scoring 1,927 files in
757
+ seconds rather than minutes". Coupling against pre-rename names still
758
+ surfaces under those names in the map; the surface diff in practice is
759
+ a stable-tie-break difference, not a structural one.
760
+ """
761
+
762
+ repo_root: Path
763
+ commits_by_path: dict[str, list[Commit]]
764
+ """``path -> [Commit]``, newest-first, capped per path during load.
765
+
766
+ A missing key — i.e. ``commits_by_path.get(path)`` returns ``None`` —
767
+ means the loader walk did not see this path. ``gather_for_diff`` treats
768
+ that the same as an empty list: a path that the un-pathed walk did not
769
+ touch has no history to score from.
770
+ """
771
+
772
+ co_change_index: dict[str, tuple[str, ...]]
773
+ """``commit_sha -> tuple of paths touched by that commit``.
774
+
775
+ Snapshot of the same numstat parse used to build ``commits_by_path``.
776
+ Per-file ``co_changes`` reads this for in-memory coupling counts so
777
+ the diff pipeline never re-issues ``git log --no-walk`` per file.
778
+ """
779
+
780
+ cache: CacheStore | None = None
781
+ """Optional cache, threaded through so signal detectors (specifically
782
+ ``detect_ghost_keeper``) reuse it for ``git blame`` line ownership."""
783
+
784
+
785
+ _NUMSTAT_LINE_RE = re.compile(r"^(\d+|-)\t(\d+|-)\t(.+)$")
786
+
787
+
788
+ def load_diff_facts(
789
+ repo_root: Path,
790
+ *,
791
+ max_commits: int | None = None,
792
+ cache: CacheStore | None = None,
793
+ ) -> DiffFacts:
794
+ """Build a :class:`DiffFacts` snapshot from one ``git log`` invocation.
795
+
796
+ Strategy:
797
+ 1. Walk HEAD with ``git log --no-merges --numstat --pretty=...`` once.
798
+ 2. Parse each commit + its full file-set into a single in-memory map.
799
+ 3. Return the snapshot for the diff command's per-file scorer to drive.
800
+
801
+ With a ``cache`` supplied, the walked commits are persisted to
802
+ ``commits``; per-file diffstat presence rows are persisted to
803
+ ``commit_files`` so a subsequent ``why`` / ``scan`` / ``diff`` invocation
804
+ on the same HEAD reuses what we just paid for.
805
+
806
+ The walk is intentionally un-pathed: the diff command scores files
807
+ that appear in ``git diff --name-only base...HEAD``, all of which exist
808
+ at HEAD by definition. A single un-pathed walk that captures every
809
+ commit's diffstat is strictly cheaper than N path-restricted walks
810
+ that each re-walk the full graph. ``max_commits`` is applied per-path
811
+ *after* the walk so callers can cap per-file depth without changing
812
+ the cost of the walk itself.
813
+ """
814
+ # Pretty format: RECORD_SEP starts each commit; metadata fields are
815
+ # UNIT_SEP-delimited; the body is the last metadata field. Numstat
816
+ # output git appends after the body needs no further separator —
817
+ # the next commit's leading RECORD_SEP marks the boundary.
818
+ pretty_format = (
819
+ f"{RECORD_SEP}%H{UNIT_SEP}%an{UNIT_SEP}%ae{UNIT_SEP}"
820
+ f"%aI{UNIT_SEP}%s{UNIT_SEP}%b"
821
+ )
822
+ raw = _run_git(
823
+ repo_root,
824
+ "log",
825
+ "--no-merges",
826
+ "--numstat",
827
+ f"--pretty=format:{pretty_format}",
828
+ )
829
+ all_commits, commits_by_path, co_change_index = _parse_log_with_files(raw)
830
+ if max_commits is not None:
831
+ commits_by_path = {p: cs[:max_commits] for p, cs in commits_by_path.items()}
832
+ if cache is not None and all_commits:
833
+ _store_commits(cache, all_commits)
834
+ # Persist diffstat presence rows so a subsequent `why` against the
835
+ # same HEAD does not re-shell-out per file. Insertion/deletion
836
+ # widths are not captured by this walk (the diff command's
837
+ # detectors only depend on the *path set* of each commit), so they
838
+ # are stored as zero — see the paragraph in ``DiffFacts``.
839
+ files_rows: list[tuple[str, str, int, int]] = []
840
+ for sha, paths in co_change_index.items():
841
+ for p in paths:
842
+ files_rows.append((sha, p, 0, 0))
843
+ if files_rows:
844
+ cache.upsert_commit_files(files_rows)
845
+ try:
846
+ head_sha = _run_git(repo_root, "rev-parse", "HEAD").strip()
847
+ except GitError:
848
+ head_sha = ""
849
+ if head_sha and not cache.head_sha:
850
+ cache.set_head_sha(head_sha)
851
+ return DiffFacts(
852
+ repo_root=repo_root,
853
+ commits_by_path=commits_by_path,
854
+ co_change_index=co_change_index,
855
+ cache=cache,
856
+ )
857
+
858
+
859
+ def _parse_log_with_files(
860
+ raw: str,
861
+ ) -> tuple[list[Commit], dict[str, list[Commit]], dict[str, tuple[str, ...]]]:
862
+ """Parse ``git log --no-merges --numstat --pretty=<sep><commit>`` output.
863
+
864
+ Returns ``(all_commits, commits_by_path, co_change_index)``:
865
+ - ``all_commits`` is every parsed commit, newest first.
866
+ - ``commits_by_path[path]`` is the subset whose numstat block named
867
+ ``path``, preserving the newest-first order of the walk.
868
+ - ``co_change_index[sha]`` is the full path tuple from the same
869
+ numstat block, used by the diff command's in-memory coupling.
870
+
871
+ Within one record the format is
872
+ ``<sha>\\x1f<an>\\x1f<ae>\\x1f<aI>\\x1f<subject>\\x1f<body...>``
873
+ followed by zero or more numstat lines (``ins\\tdel\\tpath``). The body
874
+ is free-form prose; numstat is tab-delimited 3-column. We walk lines
875
+ forward, holding the first line as the metadata + start-of-body, and
876
+ accumulate further lines as either body (free-form) or numstat
877
+ (matches :data:`_NUMSTAT_LINE_RE`). Once a numstat line fires, the
878
+ remaining lines for that record are taken to be more numstat lines.
879
+ """
880
+ all_commits: list[Commit] = []
881
+ commits_by_path: dict[str, list[Commit]] = {}
882
+ co_change_index: dict[str, tuple[str, ...]] = {}
883
+ for record in raw.split(RECORD_SEP):
884
+ record = record.strip("\n")
885
+ if not record:
886
+ continue
887
+ lines = record.split("\n")
888
+ # The first line carries every metadata field plus the first body
889
+ # line (the body itself was emitted verbatim by ``%b``).
890
+ head_parts = lines[0].split(UNIT_SEP)
891
+ if len(head_parts) < 6:
892
+ continue
893
+ sha = head_parts[0].strip()
894
+ if not sha:
895
+ continue
896
+ author_name = head_parts[1]
897
+ author_email = head_parts[2]
898
+ authored_at = head_parts[3]
899
+ subject = head_parts[4]
900
+ first_body = UNIT_SEP.join(head_parts[5:])
901
+ body_lines: list[str] = [first_body] if first_body else []
902
+ files: list[str] = []
903
+ in_numstat = False
904
+ for line in lines[1:]:
905
+ m = _NUMSTAT_LINE_RE.match(line)
906
+ if in_numstat:
907
+ if m is not None:
908
+ files.append(m.group(3))
909
+ continue
910
+ if m is not None:
911
+ in_numstat = True
912
+ files.append(m.group(3))
913
+ continue
914
+ body_lines.append(line)
915
+ try:
916
+ authored = _parse_iso(authored_at)
917
+ except ValueError:
918
+ # Bad timestamps from a single 15-year-old commit shouldn't kill
919
+ # the diff command. F1 (full timezone-tolerant parser) is owned
920
+ # by another branch; we degrade locally rather than crash.
921
+ continue
922
+ body = "\n".join(body_lines).strip("\n")
923
+ commit = Commit(
924
+ sha=sha,
925
+ author_name=author_name,
926
+ author_email=author_email,
927
+ authored_at=authored,
928
+ subject=subject,
929
+ body=body,
930
+ files=tuple(files),
931
+ )
932
+ all_commits.append(commit)
933
+ co_change_index[sha] = commit.files
934
+ for path in files:
935
+ commits_by_path.setdefault(path, []).append(commit)
936
+ return all_commits, commits_by_path, co_change_index
937
+
938
+
939
+ def gather_for_diff(
940
+ diff_facts: DiffFacts,
941
+ path: str,
942
+ *,
943
+ max_commits: int | None = None,
944
+ ) -> RepoFacts:
945
+ """Build a :class:`RepoFacts` for ``path`` using only the in-memory map.
946
+
947
+ The diff command calls this once per changed file, replacing the per-file
948
+ ``gather()`` (and its embedded ``git log --follow`` + co-change shell-out)
949
+ with O(1) dict lookups. All higher-layer detectors run unchanged on the
950
+ returned ``RepoFacts``.
951
+ """
952
+ commits = diff_facts.commits_by_path.get(path, [])
953
+ if max_commits is not None:
954
+ commits = commits[:max_commits]
955
+ co_changed: Counter[str] = Counter()
956
+ for commit in commits:
957
+ touched = diff_facts.co_change_index.get(commit.sha, ())
958
+ for other in touched:
959
+ if other == path:
960
+ continue
961
+ co_changed[other] += 1
962
+ return RepoFacts(
963
+ repo_root=diff_facts.repo_root,
964
+ path=path,
965
+ commits=commits,
966
+ co_changed_files=co_changed,
967
+ revert_pairs=find_revert_pairs(commits),
968
+ incident_commits=find_incidents(commits),
969
+ invariant_quotes=extract_invariant_quotes(commits),
970
+ cache=diff_facts.cache,
971
+ )
972
+
973
+
738
974
  _REVERT_PREFIX = 'this reverts commit '
739
975
 
740
976
 
@@ -967,6 +1203,47 @@ def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]
967
1203
  return out
968
1204
 
969
1205
 
1206
+ def dedupe_invariant_lines(
1207
+ pairs: Sequence[tuple[str, str]],
1208
+ sha_to_commit: dict[str, Commit],
1209
+ ) -> list[tuple[str, str]]:
1210
+ """Collapse identical invariant lines to one canonical (sha, line) pair.
1211
+
1212
+ When two commits state the same invariant line — typically a cherry-pick
1213
+ onto a maintenance branch, or a rebase that duplicated the message — we
1214
+ must pick exactly one to surface. Without a deterministic rule the cache
1215
+ and ``--no-cache`` paths can disagree (their walk orders differ when
1216
+ timestamps tie), and downstream JSON consumers see flaky output across
1217
+ runs.
1218
+
1219
+ The rule:
1220
+
1221
+ 1. Earliest ``authored_at`` wins. The original statement is canonical;
1222
+ cherry-picks and rebases are derivatives.
1223
+ 2. Lexicographically smallest ``sha`` breaks ties on identical timestamps.
1224
+
1225
+ The returned list preserves first-encounter order of the (now-unique)
1226
+ lines so downstream code that sorts by date sees a stable input.
1227
+ Pairs whose ``sha`` is not in ``sha_to_commit`` keep their first-seen
1228
+ record (no metadata to compare on).
1229
+ """
1230
+ canonical: dict[str, str] = {}
1231
+ for sha, line in pairs:
1232
+ existing = canonical.get(line)
1233
+ if existing is None:
1234
+ canonical[line] = sha
1235
+ continue
1236
+ old_commit = sha_to_commit.get(existing)
1237
+ new_commit = sha_to_commit.get(sha)
1238
+ if old_commit is None or new_commit is None:
1239
+ continue
1240
+ old_key = (old_commit.authored_at, existing)
1241
+ new_key = (new_commit.authored_at, sha)
1242
+ if new_key < old_key:
1243
+ canonical[line] = sha
1244
+ return [(sha, line) for line, sha in canonical.items()]
1245
+
1246
+
970
1247
  def author_last_activity(repo_root: Path, email: str) -> datetime | None:
971
1248
  """Most recent commit timestamp by ``email`` anywhere in the repo, or None."""
972
1249
  raw = _run_git(
whycode/risk_card.py CHANGED
@@ -101,7 +101,63 @@ def build(
101
101
  the same repo (e.g. inside ``scan`` or ``diff``) share a warm cache.
102
102
  """
103
103
  facts = gf.gather(repo_root, path, max_commits=max_commits, ref=ref, cache=cache)
104
- signals = sig.all_signals(facts)
104
+ return _from_facts(
105
+ path=path,
106
+ facts=facts,
107
+ repo_root=repo_root,
108
+ ref=ref,
109
+ apply_suppressions=apply_suppressions,
110
+ )
111
+
112
+
113
+ def build_from_diff_facts(
114
+ diff_facts: gf.DiffFacts,
115
+ path: str,
116
+ *,
117
+ max_commits: int | None = None,
118
+ apply_suppressions: bool = True,
119
+ skip_ghost_keeper: bool = False,
120
+ ) -> RiskCard:
121
+ """Build a Risk Card from an in-memory :class:`DiffFacts` map.
122
+
123
+ The diff command pre-loads one ``DiffFacts`` for the whole evaluation
124
+ via :func:`whycode.git_facts.load_diff_facts`, then calls this helper
125
+ once per changed file. The card's signals, score, and ``most_recent_*``
126
+ fields all derive from the same in-memory map, so per-file cost is
127
+ O(1) rather than the per-file ``git log --follow`` it replaces.
128
+
129
+ With ``skip_ghost_keeper=True`` the per-file ``git blame`` call is
130
+ deferred — the diff command uses this for its first pass over every
131
+ changed file, then re-evaluates only the top-N with full signals.
132
+ Without this skip, scoring 1,927 files spends ~4-5 minutes inside
133
+ ``git blame`` even though > 95% of those files never reach the table
134
+ the user sees.
135
+ """
136
+ facts = gf.gather_for_diff(diff_facts, path, max_commits=max_commits)
137
+ return _from_facts(
138
+ path=path,
139
+ facts=facts,
140
+ repo_root=diff_facts.repo_root,
141
+ ref=None,
142
+ apply_suppressions=apply_suppressions,
143
+ skip_ghost_keeper=skip_ghost_keeper,
144
+ )
145
+
146
+
147
+ def _from_facts(
148
+ *,
149
+ path: str,
150
+ facts: gf.RepoFacts,
151
+ repo_root: Path,
152
+ ref: str | None,
153
+ apply_suppressions: bool,
154
+ skip_ghost_keeper: bool = False,
155
+ ) -> RiskCard:
156
+ """Common tail of :func:`build` and :func:`build_from_diff_facts`."""
157
+ if skip_ghost_keeper:
158
+ signals = _all_signals_without_ghost_keeper(facts)
159
+ else:
160
+ signals = sig.all_signals(facts)
105
161
  if apply_suppressions:
106
162
  suppressions = supp.load(repo_root)
107
163
  signals = supp.filter_signals(signals, suppressions, path)
@@ -120,6 +176,40 @@ def build(
120
176
  )
121
177
 
122
178
 
179
+ # Detectors whose evidence is already in :class:`RepoFacts` (no git blame, no
180
+ # follow-up shell-out). The ghost-keeper detector is the only one missing
181
+ # here — it calls ``git blame`` per-file, which is the diff command's
182
+ # remaining bottleneck after the log walk is shared.
183
+ _FAST_DETECTORS = (
184
+ sig.detect_revert_chain,
185
+ sig.detect_incident_history,
186
+ sig.detect_invariant_quotes,
187
+ sig.detect_coupling,
188
+ sig.detect_high_churn,
189
+ sig.detect_silence,
190
+ sig.detect_newborn,
191
+ )
192
+
193
+
194
+ def _all_signals_without_ghost_keeper(facts: gf.RepoFacts) -> list[sig.Signal]:
195
+ """Re-implement the public ``all_signals`` ladder, minus ghost-keeper.
196
+
197
+ Mirrors the NEWBORN-suppression rule that ``signals.all_signals`` uses
198
+ so that an empty signal list collapses cleanly to NEWBORN-only when the
199
+ other detectors are all silent. If any other detector fires, NEWBORN is
200
+ dropped, exactly as the canonical helper does.
201
+ """
202
+ out: list[sig.Signal] = []
203
+ for detector in _FAST_DETECTORS:
204
+ signal = detector(facts)
205
+ if signal is not None:
206
+ out.append(signal)
207
+ if any(s.kind is not sig.SignalKind.NEWBORN for s in out):
208
+ out = [s for s in out if s.kind is not sig.SignalKind.NEWBORN]
209
+ out.sort(key=lambda s: (-s.severity, s.kind.value))
210
+ return out
211
+
212
+
123
213
  # ----- rendering ------------------------------------------------------------
124
214
 
125
215
  _BAND_STYLE: dict[Band, str] = {
@@ -247,4 +337,4 @@ def render_text(card: RiskCard) -> Group:
247
337
  return Group(*pieces)
248
338
 
249
339
 
250
- __all__ = ["RiskCard", "build", "render_text"]
340
+ __all__ = ["RiskCard", "build", "build_from_diff_facts", "render_text"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: whycode-cli
3
- Version: 0.4.1
3
+ Version: 0.5.0
4
4
  Summary: Tells you what to be afraid of before you touch a file.
5
5
  Author: Kevin
6
6
  License-Expression: MIT
@@ -1,22 +1,22 @@
1
- whycode/__init__.py,sha256=dPQOppaGvPoPBoACrHwxqGykCdDMNZRROtDjOmyRuf8,96
1
+ whycode/__init__.py,sha256=8tSXddh1D6FGyHRpuHrUFKhnhzLKOoeY0RS3DbJlOjw,96
2
2
  whycode/__main__.py,sha256=dqAk6746YpuM-FTIH4TBOULegGc5WweojiZjce0VYgQ,105
3
- whycode/cache.py,sha256=v55KbSlTqmP_ot1FEFqxCNpAApj6vthpHl2l0lGLX3A,17477
4
- whycode/cli.py,sha256=OTYPhp8ItBXPRrQ1y6zGt0BwKyAYEuHAo3T0hMHqINk,47836
3
+ whycode/cache.py,sha256=0cEPZHdolQbSiBLAOnMu20tobIrc7G0MNycpldHRpkk,18536
4
+ whycode/cli.py,sha256=SNllAm2fVU5O3Ke-Ktkqag_p3f-9Dn0mXtkMJM_nqg0,53209
5
5
  whycode/decisions.py,sha256=oCVhEF7QfHeci0LAWNtEjV2mUAEBJloL1rT3I4XXbkw,7570
6
- whycode/git_facts.py,sha256=vAeyhxZTrqa_6zmVuBV-06JhZ-TFBiRmcaISK1oOQjM,40162
6
+ whycode/git_facts.py,sha256=zevyDVZTIvWJrtaiufhOdFboltH5__pooWFBdO8AhBY,51567
7
7
  whycode/ignore.py,sha256=O_8bHIt0d1U-sYrBajBa7oEqpnHWU3f6Zf-8PU8CpO0,4748
8
8
  whycode/llm.py,sha256=leB94pBg8kUCq_BujZq5ixny0urGtKskjdaKoum_eCA,4092
9
9
  whycode/mcp_server.py,sha256=ht1tStAkOwmQzNIRkm1eA8Tnc59fzDRSGkgyIprft-0,18503
10
- whycode/risk_card.py,sha256=xOJkHwIkS_6yw_dSowsQ6LHfeD9Mwr2tymL7_wqxs0U,8855
10
+ whycode/risk_card.py,sha256=i2QsQS8y5U0ODf7GCt-6L3bpiroPIC-fq2Kz8nmzN9U,12031
11
11
  whycode/scorer.py,sha256=4pBejunfxzYhGUzMeL8uGEMQzC6DWiqwcTeMdo3eras,1444
12
12
  whycode/signals.py,sha256=z0kZfXR60nS-j56nchHd1V3aK8A5CGR1BAyHZZAff3s,13899
13
13
  whycode/suppressions.py,sha256=1lKSs-kCgpnJbcxozcgiSP8ZAfjEDMHXuM3sw4FaY78,3836
14
14
  whycode/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  whycode/templates/github-workflow.yml,sha256=LAfHMDG2TkAwi4vCNinHk-4zOt-mCWErBpmpaqlW5oA,2251
16
16
  whycode/templates/pre-commit,sha256=IhU11CvoDwqRAAsvHwUo-BwaNbdgy1cpXc54Z_phrmQ,316
17
- whycode_cli-0.4.1.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
18
- whycode_cli-0.4.1.dist-info/METADATA,sha256=M2XBAL02LMRZtW4Pj4L3Gcuifqh2lIAQa_1Hpt3xfPI,10218
19
- whycode_cli-0.4.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
20
- whycode_cli-0.4.1.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
21
- whycode_cli-0.4.1.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
22
- whycode_cli-0.4.1.dist-info/RECORD,,
17
+ whycode_cli-0.5.0.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
18
+ whycode_cli-0.5.0.dist-info/METADATA,sha256=RwcX-W8MQJWbYfwxW_Tdv8CtA8SKWS8HH-39NLFDxnA,10218
19
+ whycode_cli-0.5.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
20
+ whycode_cli-0.5.0.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
21
+ whycode_cli-0.5.0.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
22
+ whycode_cli-0.5.0.dist-info/RECORD,,