whycode-cli 0.4.2__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- whycode/__init__.py +1 -1
- whycode/cli.py +100 -13
- whycode/git_facts.py +236 -0
- whycode/risk_card.py +139 -16
- whycode/signals.py +215 -1
- {whycode_cli-0.4.2.dist-info → whycode_cli-0.5.2.dist-info}/METADATA +27 -1
- {whycode_cli-0.4.2.dist-info → whycode_cli-0.5.2.dist-info}/RECORD +11 -11
- {whycode_cli-0.4.2.dist-info → whycode_cli-0.5.2.dist-info}/WHEEL +0 -0
- {whycode_cli-0.4.2.dist-info → whycode_cli-0.5.2.dist-info}/entry_points.txt +0 -0
- {whycode_cli-0.4.2.dist-info → whycode_cli-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {whycode_cli-0.4.2.dist-info → whycode_cli-0.5.2.dist-info}/top_level.txt +0 -0
whycode/__init__.py
CHANGED
whycode/cli.py
CHANGED
|
@@ -20,10 +20,11 @@ Commands
|
|
|
20
20
|
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
|
+
import contextlib
|
|
23
24
|
import functools
|
|
24
25
|
import json
|
|
25
26
|
import sys
|
|
26
|
-
from collections.abc import Callable
|
|
27
|
+
from collections.abc import Callable, Iterator
|
|
27
28
|
from pathlib import Path
|
|
28
29
|
from typing import Any, TypeVar
|
|
29
30
|
|
|
@@ -126,6 +127,42 @@ def _require_tracked(path_arg: str) -> tuple[Path, str]:
|
|
|
126
127
|
return repo_root, rel
|
|
127
128
|
|
|
128
129
|
|
|
130
|
+
@contextlib.contextmanager
def _memoised_is_ignored(repo_root: Path) -> Iterator[None]:
    """Temporarily swap ``ign.is_ignored`` for a per-path caching wrapper.

    On entry the repo's effective ignore patterns are resolved once via
    ``ign.effective_patterns(repo_root)`` and ``ign.is_ignored`` is replaced
    by a wrapper. While the ``with`` block is active, calls that pass that
    very patterns object (or omit the argument) are answered from a dict
    keyed by path after the first lookup. Calls carrying any other patterns
    object are forwarded to the original function untouched.

    The original function is put back in a ``finally`` clause, so it is
    restored even if the body raises. The cache lives only as long as the
    context manager does.
    """
    patterns = ign.effective_patterns(repo_root)
    original = ign.is_ignored
    verdicts: dict[str, bool] = {}

    def memoised(path: str, patterns_arg: object = patterns) -> bool:
        # Identity (not equality) decides whether the cache applies: a
        # different pattern set could yield a different verdict for the
        # same path, so those calls bypass the memo entirely.
        if patterns_arg is not patterns:
            return original(path, patterns_arg)  # type: ignore[arg-type]
        verdict = verdicts.get(path)
        if verdict is None:
            verdict = original(path, patterns)
            verdicts[path] = verdict
        return verdict

    ign.is_ignored = memoised  # type: ignore[assignment]
    try:
        yield
    finally:
        ign.is_ignored = original
|
|
164
|
+
|
|
165
|
+
|
|
129
166
|
_F = TypeVar("_F", bound=Callable[..., Any])
|
|
130
167
|
|
|
131
168
|
|
|
@@ -240,6 +277,15 @@ def why(
|
|
|
240
277
|
"--no-cache",
|
|
241
278
|
help="Bypass the local SQLite cache at .whycode/cache.db.",
|
|
242
279
|
),
|
|
280
|
+
explain: bool = typer.Option(
|
|
281
|
+
False,
|
|
282
|
+
"--explain",
|
|
283
|
+
help=(
|
|
284
|
+
"Below each signal, print the precise rule that fired: the literal "
|
|
285
|
+
"matched tokens, threshold values, and the source location of the "
|
|
286
|
+
"ladder branch. L1+L2 only — L3 (--llm) decisions are not annotated."
|
|
287
|
+
),
|
|
288
|
+
),
|
|
243
289
|
) -> None:
|
|
244
290
|
"""Print the Risk Card for ``path``."""
|
|
245
291
|
repo_root, rel = _require_tracked(path)
|
|
@@ -328,12 +374,12 @@ def why(
|
|
|
328
374
|
card = card.with_decisions(tuple(decisions))
|
|
329
375
|
|
|
330
376
|
if json_out:
|
|
331
|
-
console.print_json(json.dumps(card.to_dict()))
|
|
377
|
+
console.print_json(json.dumps(card.to_dict(explain=explain)))
|
|
332
378
|
return
|
|
333
379
|
if brief:
|
|
334
380
|
_print_brief(card)
|
|
335
381
|
return
|
|
336
|
-
console.print(rc.render_text(card))
|
|
382
|
+
console.print(rc.render_text(card, explain=explain))
|
|
337
383
|
finally:
|
|
338
384
|
if cache is not None:
|
|
339
385
|
cache.close()
|
|
@@ -428,16 +474,57 @@ def diff(
|
|
|
428
474
|
|
|
429
475
|
cache = _open_cache(repo_root, no_cache)
|
|
430
476
|
try:
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
#
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
477
|
+
# One git log walk feeds every changed file's scoring. Without this
|
|
478
|
+
# batched load, diff against an old base on a large repo runs N
|
|
479
|
+
# `git log --follow` calls (one per changed file): on django at 1,927
|
|
480
|
+
# changed files the legacy path measured 6+ minutes, with the
|
|
481
|
+
# 12+ minute variant timing out outright. ``load_diff_facts`` parses
|
|
482
|
+
# one un-pathed walk into a path -> [Commit] map; per-file scoring
|
|
483
|
+
# then does dict lookups instead of re-shelling-out.
|
|
484
|
+
try:
|
|
485
|
+
diff_facts = gf.load_diff_facts(repo_root, cache=cache)
|
|
486
|
+
except gf.GitError as exc:
|
|
487
|
+
err.print(f"[red]error:[/red] {exc}")
|
|
488
|
+
raise typer.Exit(2) from exc
|
|
489
|
+
# Pre-compute the ignore-pattern set ONCE and a verdict-per-path
|
|
490
|
+
# memo. ``signals.detect_coupling`` (re-introduced in 0.4.1 as F10)
|
|
491
|
+
# filters every coupling candidate through ``ign.is_ignored`` —
|
|
492
|
+
# without memoisation that's 83 patterns x 700 candidates x 1,927
|
|
493
|
+
# files = ~100 CPU-seconds across the diff. The memo cache turns
|
|
494
|
+
# each path's verdict into a dict lookup after the first hit.
|
|
495
|
+
with _memoised_is_ignored(repo_root):
|
|
496
|
+
# First pass: every changed file is scored without the
|
|
497
|
+
# ghost-keeper detector, which would otherwise fire ``git
|
|
498
|
+
# blame`` per file. With 1,927 changed files on django this
|
|
499
|
+
# single deferral saves ~5 minutes. We then sort and
|
|
500
|
+
# re-evaluate only the top-N with full signals — at most
|
|
501
|
+
# ``top`` blame calls instead of ``len(files)``.
|
|
502
|
+
prelim: list[rc.RiskCard] = []
|
|
503
|
+
for f in files:
|
|
504
|
+
try:
|
|
505
|
+
prelim.append(
|
|
506
|
+
rc.build_from_diff_facts(diff_facts, f, skip_ghost_keeper=True)
|
|
507
|
+
)
|
|
508
|
+
except gf.GitError:
|
|
509
|
+
continue
|
|
510
|
+
# Stable tie-break (from 0.4.2): lex smallest path on identical
|
|
511
|
+
# scores so cache and --no-cache truncate the same files at --top N.
|
|
512
|
+
prelim.sort(key=lambda c: (-c.score.value, c.path))
|
|
513
|
+
# Second pass: re-score the top-N with the full detector ladder
|
|
514
|
+
# so the rendered table includes ghost-keeper findings where
|
|
515
|
+
# they apply. Files outside the top-N keep their first-pass
|
|
516
|
+
# score; they were not going to appear in the user's view
|
|
517
|
+
# anyway.
|
|
518
|
+
refined_top: list[rc.RiskCard] = []
|
|
519
|
+
for prelim_card in prelim[:top]:
|
|
520
|
+
try:
|
|
521
|
+
refined_top.append(
|
|
522
|
+
rc.build_from_diff_facts(diff_facts, prelim_card.path)
|
|
523
|
+
)
|
|
524
|
+
except gf.GitError:
|
|
525
|
+
refined_top.append(prelim_card)
|
|
526
|
+
cards = refined_top
|
|
527
|
+
cards.sort(key=lambda c: (-c.score.value, c.path))
|
|
441
528
|
finally:
|
|
442
529
|
if cache is not None:
|
|
443
530
|
cache.close()
|
whycode/git_facts.py
CHANGED
|
@@ -735,6 +735,242 @@ def _populate_diffstat_cache(
|
|
|
735
735
|
cache.upsert_commit_files(rows)
|
|
736
736
|
|
|
737
737
|
|
|
738
|
+
# ---- batch loading for whycode diff ---------------------------------------
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
@dataclass(frozen=True)
class DiffFacts:
    """Repo-wide history snapshot shared by every file in one ``whycode diff`` run.

    Instead of issuing one path-restricted ``git log --follow`` (plus a
    co-change diffstat pass) per changed file, the diff command walks the
    log a single time and stores the result here. Per-file scoring then
    becomes a matter of dictionary lookups.

    Renames are deliberately not followed: history recorded under a file's
    previous name stays filed under that previous name.
    """

    # Root directory of the repository the snapshot was taken from.
    repo_root: Path

    # ``path -> [Commit]``, newest first, optionally capped per path by the
    # loader. A missing key means the walk never saw that path;
    # ``gather_for_diff`` treats that exactly like an empty list.
    commits_by_path: dict[str, list[Commit]]

    # ``commit_sha -> tuple of paths touched by that commit``, produced by
    # the same numstat parse as ``commits_by_path``. Used for in-memory
    # coupling counts so no per-file git call is needed.
    co_change_index: dict[str, tuple[str, ...]]

    # Optional cache handle, passed along to the per-file ``RepoFacts`` so
    # downstream detectors can reuse it.
    cache: CacheStore | None = None
|
|
783
|
+
|
|
784
|
+
|
|
785
|
+
_NUMSTAT_LINE_RE = re.compile(r"^(\d+|-)\t(\d+|-)\t(.+)$")
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
def load_diff_facts(
    repo_root: Path,
    *,
    max_commits: int | None = None,
    cache: CacheStore | None = None,
) -> DiffFacts:
    """Build a :class:`DiffFacts` snapshot from one ``git log`` invocation.

    Strategy:
    1. Walk HEAD with ``git log --no-merges --no-renames --numstat`` once.
    2. Parse each commit and its full file-set into in-memory maps.
    3. Return the snapshot for the diff command's per-file scorer.

    ``--no-renames`` is passed explicitly. With git's default rename
    detection, numstat reports a rename as a single ``dir/{old => new}``
    pseudo-path, which would become a map key (and a cached row) that
    matches no real file. Disabling detection makes a rename appear as a
    plain delete of the old path plus an add of the new one, so both names
    are indexed literally.

    Args:
        repo_root: Repository to walk.
        max_commits: Optional per-path cap, applied *after* the walk so it
            limits per-file depth without changing the cost of the walk.
        cache: Optional store. When given (and at least one commit was
            parsed) the commits and per-commit path rows are persisted,
            and the HEAD sha is recorded if the cache has none yet.

    Returns:
        The populated :class:`DiffFacts`.

    Raises:
        GitError: Propagated from the ``git log`` call. A failing
            ``git rev-parse HEAD`` is tolerated and only skips recording
            the HEAD sha.
    """
    # Pretty format: RECORD_SEP starts each commit; metadata fields are
    # UNIT_SEP-delimited; the body is the last metadata field. The numstat
    # lines git appends after the body need no extra separator because the
    # next commit's leading RECORD_SEP marks the boundary.
    pretty_format = (
        f"{RECORD_SEP}%H{UNIT_SEP}%an{UNIT_SEP}%ae{UNIT_SEP}"
        f"%aI{UNIT_SEP}%s{UNIT_SEP}%b"
    )
    raw = _run_git(
        repo_root,
        "log",
        "--no-merges",
        "--no-renames",
        "--numstat",
        f"--pretty=format:{pretty_format}",
    )
    all_commits, commits_by_path, co_change_index = _parse_log_with_files(raw)
    if max_commits is not None:
        commits_by_path = {p: cs[:max_commits] for p, cs in commits_by_path.items()}
    if cache is not None and all_commits:
        _store_commits(cache, all_commits)
        # Persist one presence row per (commit, path). This walk keeps only
        # the *path set* of each commit, so the insertion/deletion columns
        # are written as zero.
        # NOTE(review): if ``upsert_commit_files`` overwrites existing rows,
        # this may replace real counts cached by another code path — confirm.
        files_rows: list[tuple[str, str, int, int]] = [
            (sha, p, 0, 0)
            for sha, paths in co_change_index.items()
            for p in paths
        ]
        if files_rows:
            cache.upsert_commit_files(files_rows)
        try:
            head_sha = _run_git(repo_root, "rev-parse", "HEAD").strip()
        except GitError:
            # Best effort: the snapshot is still usable without a HEAD stamp.
            head_sha = ""
        if head_sha and not cache.head_sha:
            cache.set_head_sha(head_sha)
    return DiffFacts(
        repo_root=repo_root,
        commits_by_path=commits_by_path,
        co_change_index=co_change_index,
        cache=cache,
    )
|
|
857
|
+
|
|
858
|
+
|
|
859
|
+
def _parse_log_with_files(
    raw: str,
) -> tuple[list[Commit], dict[str, list[Commit]], dict[str, tuple[str, ...]]]:
    """Turn separator-delimited ``git log --numstat`` output into lookup maps.

    ``raw`` is split on ``RECORD_SEP``. Each non-empty record starts with a
    header line holding ``sha``, author name, author email, ISO author date,
    subject and the first body line, joined by ``UNIT_SEP``. The remaining
    lines are body text until the first line matching
    :data:`_NUMSTAT_LINE_RE`; from that point on only numstat lines are
    kept and anything else is dropped.

    Records are skipped when the header has fewer than six fields, when the
    sha is blank, or when the author date cannot be parsed.

    Returns:
        ``(all_commits, commits_by_path, co_change_index)`` where
        ``all_commits`` keeps the input order, ``commits_by_path[path]``
        lists the commits whose numstat block named ``path`` in that same
        order, and ``co_change_index[sha]`` is that commit's ``files``.
    """
    parsed: list[Commit] = []
    by_path: dict[str, list[Commit]] = {}
    files_by_sha: dict[str, tuple[str, ...]] = {}

    for chunk in raw.split(RECORD_SEP):
        chunk = chunk.strip("\n")
        if not chunk:
            continue

        header, *rest = chunk.split("\n")
        fields = header.split(UNIT_SEP)
        if len(fields) < 6:
            continue
        sha = fields[0].strip()
        if not sha:
            continue
        author_name, author_email, authored_at, subject = fields[1:5]

        # ``%b`` is emitted verbatim, so a stray UNIT_SEP inside the first
        # body line is glued back together rather than lost.
        opening = UNIT_SEP.join(fields[5:])
        body_lines: list[str] = [opening] if opening else []

        touched: list[str] = []
        seen_numstat = False
        for line in rest:
            hit = _NUMSTAT_LINE_RE.match(line)
            if hit is not None:
                seen_numstat = True
                touched.append(hit.group(3))
            elif not seen_numstat:
                body_lines.append(line)
            # A non-matching line after numstat has begun is discarded.

        try:
            authored = _parse_iso(authored_at)
        except ValueError:
            # One unparseable timestamp must not abort the whole walk;
            # drop just this commit.
            continue

        commit = Commit(
            sha=sha,
            author_name=author_name,
            author_email=author_email,
            authored_at=authored,
            subject=subject,
            body="\n".join(body_lines).strip("\n"),
            files=tuple(touched),
        )
        parsed.append(commit)
        files_by_sha[sha] = commit.files
        for name in touched:
            by_path.setdefault(name, []).append(commit)

    return parsed, by_path, files_by_sha
|
|
937
|
+
|
|
938
|
+
|
|
939
|
+
def gather_for_diff(
    diff_facts: DiffFacts,
    path: str,
    *,
    max_commits: int | None = None,
) -> RepoFacts:
    """Assemble the :class:`RepoFacts` for ``path`` from a preloaded snapshot.

    No git command is run here: the commit list comes from
    ``diff_facts.commits_by_path`` (an unknown path yields an empty list)
    and co-change counts are tallied from ``diff_facts.co_change_index``.

    Args:
        diff_facts: Snapshot produced by :func:`load_diff_facts`.
        path: Repo-relative path to build facts for.
        max_commits: Optional cap on how many of the path's commits
            (taken from the front of the list) are considered.

    Returns:
        A ``RepoFacts`` whose revert pairs, incidents and invariant quotes
        are derived from the selected commits, carrying the snapshot's
        cache handle.
    """
    history = diff_facts.commits_by_path.get(path, [])
    if max_commits is not None:
        history = history[:max_commits]

    # Count every *other* path that shares a commit with ``path``.
    neighbours: Counter[str] = Counter(
        other
        for commit in history
        for other in diff_facts.co_change_index.get(commit.sha, ())
        if other != path
    )

    return RepoFacts(
        repo_root=diff_facts.repo_root,
        path=path,
        commits=history,
        co_changed_files=neighbours,
        revert_pairs=find_revert_pairs(history),
        incident_commits=find_incidents(history),
        invariant_quotes=extract_invariant_quotes(history),
        cache=diff_facts.cache,
    )
|
|
972
|
+
|
|
973
|
+
|
|
738
974
|
_REVERT_PREFIX = 'this reverts commit '
|
|
739
975
|
|
|
740
976
|
|
whycode/risk_card.py
CHANGED
|
@@ -50,7 +50,37 @@ class RiskCard:
|
|
|
50
50
|
|
|
51
51
|
return replace(self, decisions=decisions)
|
|
52
52
|
|
|
53
|
-
def to_dict(self) -> dict[str, Any]:
|
|
53
|
+
def to_dict(self, *, explain: bool = False) -> dict[str, Any]:
|
|
54
|
+
"""Render the card as a JSON-friendly dict.
|
|
55
|
+
|
|
56
|
+
With ``explain=True``, each signal entry grows an ``explanation``
|
|
57
|
+
key carrying the rule identifier, prose, evidence, and source
|
|
58
|
+
location populated by the detector. ``None`` is emitted when a
|
|
59
|
+
signal has no explanation attached (e.g. data ingested from an
|
|
60
|
+
older cache); the key is omitted entirely when ``explain`` is
|
|
61
|
+
off, so default consumers see no shape change.
|
|
62
|
+
"""
|
|
63
|
+
signals_out: list[dict[str, Any]] = []
|
|
64
|
+
for s in self.signals:
|
|
65
|
+
entry: dict[str, Any] = {
|
|
66
|
+
"kind": s.kind.value,
|
|
67
|
+
"severity": s.severity,
|
|
68
|
+
"headline": s.headline,
|
|
69
|
+
"detail": s.detail,
|
|
70
|
+
"evidence": list(s.evidence),
|
|
71
|
+
}
|
|
72
|
+
if explain:
|
|
73
|
+
entry["explanation"] = (
|
|
74
|
+
{
|
|
75
|
+
"rule": s.explanation.rule,
|
|
76
|
+
"why_it_fired": s.explanation.why_it_fired,
|
|
77
|
+
"evidence": list(s.explanation.evidence),
|
|
78
|
+
"source_ref": s.explanation.source_ref,
|
|
79
|
+
}
|
|
80
|
+
if s.explanation is not None
|
|
81
|
+
else None
|
|
82
|
+
)
|
|
83
|
+
signals_out.append(entry)
|
|
54
84
|
return {
|
|
55
85
|
"path": self.path,
|
|
56
86
|
"score": self.score.value,
|
|
@@ -67,16 +97,7 @@ class RiskCard:
|
|
|
67
97
|
if self.most_recent_sha
|
|
68
98
|
else None
|
|
69
99
|
),
|
|
70
|
-
"signals":
|
|
71
|
-
{
|
|
72
|
-
"kind": s.kind.value,
|
|
73
|
-
"severity": s.severity,
|
|
74
|
-
"headline": s.headline,
|
|
75
|
-
"detail": s.detail,
|
|
76
|
-
"evidence": list(s.evidence),
|
|
77
|
-
}
|
|
78
|
-
for s in self.signals
|
|
79
|
-
],
|
|
100
|
+
"signals": signals_out,
|
|
80
101
|
"decisions": [d.to_dict() for d in self.decisions],
|
|
81
102
|
}
|
|
82
103
|
|
|
@@ -101,7 +122,63 @@ def build(
|
|
|
101
122
|
the same repo (e.g. inside ``scan`` or ``diff``) share a warm cache.
|
|
102
123
|
"""
|
|
103
124
|
facts = gf.gather(repo_root, path, max_commits=max_commits, ref=ref, cache=cache)
|
|
104
|
-
|
|
125
|
+
return _from_facts(
|
|
126
|
+
path=path,
|
|
127
|
+
facts=facts,
|
|
128
|
+
repo_root=repo_root,
|
|
129
|
+
ref=ref,
|
|
130
|
+
apply_suppressions=apply_suppressions,
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def build_from_diff_facts(
    diff_facts: gf.DiffFacts,
    path: str,
    *,
    max_commits: int | None = None,
    apply_suppressions: bool = True,
    skip_ghost_keeper: bool = False,
) -> RiskCard:
    """Produce the Risk Card for ``path`` from a preloaded :class:`DiffFacts`.

    The per-file facts are gathered with
    :func:`whycode.git_facts.gather_for_diff` and then handed to the same
    ``_from_facts`` tail that :func:`build` uses, with ``ref`` fixed to
    ``None`` and ``repo_root`` taken from the snapshot.

    Args:
        diff_facts: Snapshot shared by the whole diff evaluation.
        path: Repo-relative path to score.
        max_commits: Optional cap on the commits considered for ``path``.
        apply_suppressions: Forwarded unchanged to ``_from_facts``.
        skip_ghost_keeper: When true, the ghost-keeper detector is left
            out of the signal ladder; callers use this for a cheap first
            pass and re-run without it for the files that matter.

    Returns:
        The assembled :class:`RiskCard`.
    """
    return _from_facts(
        path=path,
        facts=gf.gather_for_diff(diff_facts, path, max_commits=max_commits),
        repo_root=diff_facts.repo_root,
        ref=None,
        apply_suppressions=apply_suppressions,
        skip_ghost_keeper=skip_ghost_keeper,
    )
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _from_facts(
|
|
169
|
+
*,
|
|
170
|
+
path: str,
|
|
171
|
+
facts: gf.RepoFacts,
|
|
172
|
+
repo_root: Path,
|
|
173
|
+
ref: str | None,
|
|
174
|
+
apply_suppressions: bool,
|
|
175
|
+
skip_ghost_keeper: bool = False,
|
|
176
|
+
) -> RiskCard:
|
|
177
|
+
"""Common tail of :func:`build` and :func:`build_from_diff_facts`."""
|
|
178
|
+
if skip_ghost_keeper:
|
|
179
|
+
signals = _all_signals_without_ghost_keeper(facts)
|
|
180
|
+
else:
|
|
181
|
+
signals = sig.all_signals(facts)
|
|
105
182
|
if apply_suppressions:
|
|
106
183
|
suppressions = supp.load(repo_root)
|
|
107
184
|
signals = supp.filter_signals(signals, suppressions, path)
|
|
@@ -120,6 +197,40 @@ def build(
|
|
|
120
197
|
)
|
|
121
198
|
|
|
122
199
|
|
|
200
|
+
# Detectors whose evidence is already in :class:`RepoFacts` (no git blame, no
|
|
201
|
+
# follow-up shell-out). The ghost-keeper detector is the only one missing
|
|
202
|
+
# here — it calls ``git blame`` per-file, which is the diff command's
|
|
203
|
+
# remaining bottleneck after the log walk is shared.
|
|
204
|
+
_FAST_DETECTORS = (
|
|
205
|
+
sig.detect_revert_chain,
|
|
206
|
+
sig.detect_incident_history,
|
|
207
|
+
sig.detect_invariant_quotes,
|
|
208
|
+
sig.detect_coupling,
|
|
209
|
+
sig.detect_high_churn,
|
|
210
|
+
sig.detect_silence,
|
|
211
|
+
sig.detect_newborn,
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _all_signals_without_ghost_keeper(facts: gf.RepoFacts) -> list[sig.Signal]:
    """Run every detector in ``_FAST_DETECTORS`` and return the ranked signals.

    Each detector is called once with ``facts``; ``None`` results are
    discarded. NEWBORN is only reported when it is the sole kind that
    fired: as soon as any other signal is present, NEWBORN entries are
    removed. The result is ordered by descending severity, ties broken by
    the signal kind's string value.
    """
    fired = [
        signal
        for signal in (detector(facts) for detector in _FAST_DETECTORS)
        if signal is not None
    ]
    substantive = [s for s in fired if s.kind is not sig.SignalKind.NEWBORN]
    # An empty ``substantive`` means nothing but NEWBORN (or nothing at
    # all) fired, in which case the unfiltered list is kept as-is.
    chosen = substantive if substantive else fired
    return sorted(chosen, key=lambda s: (-s.severity, s.kind.value))
|
|
232
|
+
|
|
233
|
+
|
|
123
234
|
# ----- rendering ------------------------------------------------------------
|
|
124
235
|
|
|
125
236
|
_BAND_STYLE: dict[Band, str] = {
|
|
@@ -174,7 +285,7 @@ def _evidence_redundant(evidence: tuple[str, ...], detail: str) -> bool:
|
|
|
174
285
|
return all(token in detail for token in evidence)
|
|
175
286
|
|
|
176
287
|
|
|
177
|
-
def _signals_table(signals: tuple[sig.Signal, ...]) -> Table | Text:
|
|
288
|
+
def _signals_table(signals: tuple[sig.Signal, ...], *, explain: bool = False) -> Table | Text:
|
|
178
289
|
if not signals:
|
|
179
290
|
return Text(
|
|
180
291
|
"No flags fired. The history is quiet — this is information, "
|
|
@@ -190,6 +301,18 @@ def _signals_table(signals: tuple[sig.Signal, ...]) -> Table | Text:
|
|
|
190
301
|
block.append(s.detail, style="")
|
|
191
302
|
if s.evidence and not _evidence_redundant(s.evidence, s.detail):
|
|
192
303
|
block.append("\nevidence: " + ", ".join(s.evidence), style="dim")
|
|
304
|
+
if explain and s.explanation is not None:
|
|
305
|
+
ex = s.explanation
|
|
306
|
+
block.append("\n", style="")
|
|
307
|
+
block.append("─ rule: ", style="dim")
|
|
308
|
+
block.append(ex.rule, style="dim bold")
|
|
309
|
+
if ex.source_ref:
|
|
310
|
+
block.append(" ", style="dim")
|
|
311
|
+
block.append(ex.source_ref, style="dim")
|
|
312
|
+
block.append("\n fired because: ", style="dim")
|
|
313
|
+
block.append(ex.why_it_fired, style="dim")
|
|
314
|
+
if ex.evidence:
|
|
315
|
+
block.append("\n evidence: " + ", ".join(ex.evidence), style="dim")
|
|
193
316
|
table.add_row(_severity_badge(s.severity), block)
|
|
194
317
|
return table
|
|
195
318
|
|
|
@@ -234,10 +357,10 @@ def _decisions_block(decisions: tuple[Decision, ...]) -> Padding:
|
|
|
234
357
|
return Padding(panel, (1, 1, 0, 1))
|
|
235
358
|
|
|
236
359
|
|
|
237
|
-
def render_text(card: RiskCard) -> Group:
|
|
360
|
+
def render_text(card: RiskCard, *, explain: bool = False) -> Group:
|
|
238
361
|
pieces: list[Any] = [
|
|
239
362
|
_header(card),
|
|
240
|
-
Padding(_signals_table(card.signals), (0, 1, 0, 1)),
|
|
363
|
+
Padding(_signals_table(card.signals, explain=explain), (0, 1, 0, 1)),
|
|
241
364
|
]
|
|
242
365
|
if card.decisions:
|
|
243
366
|
pieces.append(_decisions_block(card.decisions))
|
|
@@ -247,4 +370,4 @@ def render_text(card: RiskCard) -> Group:
|
|
|
247
370
|
return Group(*pieces)
|
|
248
371
|
|
|
249
372
|
|
|
250
|
-
__all__ = ["RiskCard", "build", "render_text"]
|
|
373
|
+
__all__ = ["RiskCard", "build", "build_from_diff_facts", "render_text"]
|
whycode/signals.py
CHANGED
|
@@ -16,7 +16,7 @@ from whycode import git_facts as gf
|
|
|
16
16
|
from whycode import ignore as ign
|
|
17
17
|
|
|
18
18
|
if TYPE_CHECKING:
|
|
19
|
-
from whycode.git_facts import RepoFacts
|
|
19
|
+
from whycode.git_facts import Commit, RepoFacts
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
class SignalKind(StrEnum):
|
|
@@ -30,6 +30,29 @@ class SignalKind(StrEnum):
|
|
|
30
30
|
NEWBORN = "newborn"
|
|
31
31
|
|
|
32
32
|
|
|
33
|
+
@dataclass(frozen=True)
class Explanation:
    """Audit trail describing why one particular signal fired.

    A detector attaches this to the :class:`Signal` it emits so a reader
    can see which rule branch matched, what literal evidence it matched
    on, and where in the source that rule lives.
    """

    # Short, stable rule identifier, e.g. ``"incident_subject_keyword"``.
    rule: str

    # One-sentence prose statement of the condition that matched.
    why_it_fired: str

    # Literal matched substrings, threshold values, or counts.
    evidence: tuple[str, ...] = field(default_factory=tuple)

    # ``path:function`` pointer to the rule's implementation; empty when
    # no location was recorded.
    source_ref: str = ""
|
|
54
|
+
|
|
55
|
+
|
|
33
56
|
@dataclass(frozen=True)
|
|
34
57
|
class Signal:
|
|
35
58
|
kind: SignalKind
|
|
@@ -39,6 +62,9 @@ class Signal:
|
|
|
39
62
|
evidence: tuple[str, ...] = field(default_factory=tuple)
|
|
40
63
|
"""Commit SHAs (or other identifiers) backing this signal."""
|
|
41
64
|
|
|
65
|
+
explanation: Explanation | None = None
|
|
66
|
+
"""Per-rule trace populated when ``whycode why --explain`` is on."""
|
|
67
|
+
|
|
42
68
|
|
|
43
69
|
# ----- thresholds -----------------------------------------------------------
|
|
44
70
|
COUPLING_MIN_COCHANGES = 3
|
|
@@ -90,6 +116,84 @@ def _decay_severity(severity: int, days_since_most_recent: int) -> int:
|
|
|
90
116
|
return severity
|
|
91
117
|
|
|
92
118
|
|
|
119
|
+
def _classify_incident_commit(commit: Commit) -> tuple[str, str, tuple[str, ...]]:
    """Identify which incident rule branch a commit satisfies.

    The result is a ``(rule, why, evidence_tokens)`` triple used to build an
    :class:`Explanation`. The checks are meant to mirror the acceptance
    ladder of :func:`whycode.git_facts.find_incidents`; they are tried in
    the order written below and the first one that matches is returned.
    """
    subject, body = commit.subject, commit.body

    # Security advisories in the subject: CVE ids are tried before GHSA ids.
    for advisory_re in (gf._CVE_RE, gf._GHSA_RE):
        advisory = advisory_re.search(subject)
        if advisory:
            ident = advisory.group(0)
            return (
                "incident_subject_security_advisory",
                f"subject cites the security advisory {ident!r}",
                (ident,),
            )

    # Revert pointers: keep the "Reverted" match so it also picks the token.
    reverted = gf._REVERTED_SUBJECT_RE.search(subject)
    if reverted or gf._REVERTS_SUBJECT_RE.search(subject):
        revert_token = ("Reverted",) if reverted else ("Reverts",)
        return (
            "incident_subject_revert_marker",
            "subject is a default git revert pointer (Reverted/Reverts)",
            revert_token,
        )

    breaking = gf._BREAKING_CC_RE.search(subject)
    if breaking:
        marker = breaking.group(0).strip()
        return (
            "incident_subject_conventional_commits_breaking",
            f"subject carries the Conventional Commits breaking marker {marker!r}",
            (marker,),
        )

    # Subject keyword path (regression handled with corroboration inside the helper).
    if gf._is_subject_incident(subject, body):
        keyword = gf._INCIDENT_RE.search(subject)
        if not keyword:
            # The helper also accepts the regression+id corroboration path
            # even when no other keyword is present in the subject.
            return (
                "incident_subject_regression_corroborated",
                "subject contains 'regression' and a corroborating issue id",
                ("regression",),
            )
        tok = keyword.group(0)
        return (
            "incident_subject_keyword",
            f"subject {subject[:60]!r} matched the literal token {tok!r}",
            (tok,),
        )

    if gf._BREAKING_FOOTER_RE.search(body):
        return (
            "incident_body_breaking_change_footer",
            "body carries the structured 'BREAKING CHANGE:' footer",
            ("BREAKING CHANGE:",),
        )

    # A body keyword only counts when an issue id in the body backs it up.
    body_keyword = gf._INCIDENT_RE.search(body)
    issue_id = gf._ISSUE_ID_RE.search(body)
    if body_keyword and issue_id:
        kw_text, issue_text = body_keyword.group(0), issue_id.group(0)
        return (
            "incident_body_keyword_with_issue_id",
            f"body keyword {kw_text!r} corroborated by issue id {issue_text!r}",
            (kw_text, issue_text),
        )

    # Not expected to be reached: find_incidents should only hand over commits
    # that satisfy one of the branches above. Getting here suggests the ladder
    # gained a clause this helper does not know about, so answer with a
    # neutral classification rather than failing the explanation output.
    return ("incident_unclassified", "matched the incident ladder", ())
|
|
195
|
+
|
|
196
|
+
|
|
93
197
|
def detect_revert_chain(facts: RepoFacts) -> Signal | None:
|
|
94
198
|
if not facts.revert_pairs:
|
|
95
199
|
return None
|
|
@@ -104,12 +208,22 @@ def detect_revert_chain(facts: RepoFacts) -> Signal | None:
|
|
|
104
208
|
age_phrase = f" (most recent: {_age_phrase(days)})"
|
|
105
209
|
evidence = tuple(_short(rev) for rev, _ in facts.revert_pairs)
|
|
106
210
|
pairs_text = ", ".join(f"{_short(rev)} reverts {_short(orig)}" for rev, orig in facts.revert_pairs)
|
|
211
|
+
explanation = Explanation(
|
|
212
|
+
rule="revert_pair_default_message",
|
|
213
|
+
why_it_fired=(
|
|
214
|
+
f"{n} commit{' body matches' if n == 1 else ' bodies match'} the default git "
|
|
215
|
+
"revert footer 'This reverts commit <sha>'"
|
|
216
|
+
),
|
|
217
|
+
evidence=evidence,
|
|
218
|
+
source_ref="src/whycode/git_facts.py:find_revert_pairs",
|
|
219
|
+
)
|
|
107
220
|
return Signal(
|
|
108
221
|
kind=SignalKind.REVERT_CHAIN,
|
|
109
222
|
severity=severity,
|
|
110
223
|
headline=f"{n} revert{'s' if n != 1 else ''} touched this file{age_phrase}",
|
|
111
224
|
detail=f"Reverts in this file's history: {pairs_text}.",
|
|
112
225
|
evidence=evidence,
|
|
226
|
+
explanation=explanation,
|
|
113
227
|
)
|
|
114
228
|
|
|
115
229
|
|
|
@@ -124,12 +238,20 @@ def detect_incident_history(facts: RepoFacts) -> Signal | None:
|
|
|
124
238
|
f"{n} commit{'s' if n != 1 else ''} matched incident keywords "
|
|
125
239
|
f"(latest {days} day{'s' if days != 1 else ''} ago: '{most_recent.subject[:80]}')."
|
|
126
240
|
)
|
|
241
|
+
rule, why, tokens = _classify_incident_commit(most_recent)
|
|
242
|
+
explanation = Explanation(
|
|
243
|
+
rule=rule,
|
|
244
|
+
why_it_fired=why,
|
|
245
|
+
evidence=tokens,
|
|
246
|
+
source_ref="src/whycode/git_facts.py:find_incidents",
|
|
247
|
+
)
|
|
127
248
|
return Signal(
|
|
128
249
|
kind=SignalKind.INCIDENT_HISTORY,
|
|
129
250
|
severity=severity,
|
|
130
251
|
headline=f"{n} incident-flagged change{'s' if n != 1 else ''} in history",
|
|
131
252
|
detail=detail,
|
|
132
253
|
evidence=tuple(_short(c.sha) for c in facts.incident_commits[:5]),
|
|
254
|
+
explanation=explanation,
|
|
133
255
|
)
|
|
134
256
|
|
|
135
257
|
|
|
@@ -139,12 +261,26 @@ def detect_high_churn(facts: RepoFacts) -> Signal | None:
|
|
|
139
261
|
if len(recent) < HIGH_CHURN_MIN_COMMITS:
|
|
140
262
|
return None
|
|
141
263
|
severity = 3 if len(recent) < 12 else 4
|
|
264
|
+
explanation = Explanation(
|
|
265
|
+
rule="high_churn_recent_window",
|
|
266
|
+
why_it_fired=(
|
|
267
|
+
f"{len(recent)} commits within the last {cutoff_days} days crossed the "
|
|
268
|
+
f"threshold of {HIGH_CHURN_MIN_COMMITS}"
|
|
269
|
+
),
|
|
270
|
+
evidence=(
|
|
271
|
+
f"recent_commits={len(recent)}",
|
|
272
|
+
f"window_days={cutoff_days}",
|
|
273
|
+
f"threshold={HIGH_CHURN_MIN_COMMITS}",
|
|
274
|
+
),
|
|
275
|
+
source_ref="src/whycode/signals.py:detect_high_churn",
|
|
276
|
+
)
|
|
142
277
|
return Signal(
|
|
143
278
|
kind=SignalKind.HIGH_CHURN,
|
|
144
279
|
severity=severity,
|
|
145
280
|
headline=f"High churn: {len(recent)} commits in last {cutoff_days} days",
|
|
146
281
|
detail="Code that changes this often is rarely settled — read recent diffs first.",
|
|
147
282
|
evidence=tuple(_short(c.sha) for c in recent[:5]),
|
|
283
|
+
explanation=explanation,
|
|
148
284
|
)
|
|
149
285
|
|
|
150
286
|
|
|
@@ -172,12 +308,27 @@ def detect_coupling(facts: RepoFacts) -> Signal | None:
|
|
|
172
308
|
top = paired[:5]
|
|
173
309
|
severity = 3 if top[0][1] < 6 else 4
|
|
174
310
|
listed = "; ".join(f"{p} (x{n})" for p, n in top)
|
|
311
|
+
top_path, top_count = top[0]
|
|
312
|
+
explanation = Explanation(
|
|
313
|
+
rule="coupling_co_change_threshold",
|
|
314
|
+
why_it_fired=(
|
|
315
|
+
f"{top_path!r} changed together with this file {top_count} times "
|
|
316
|
+
f"(threshold {COUPLING_MIN_COCHANGES})"
|
|
317
|
+
),
|
|
318
|
+
evidence=(
|
|
319
|
+
f"top_co_changer={top_path}",
|
|
320
|
+
f"co_changes={top_count}",
|
|
321
|
+
f"threshold={COUPLING_MIN_COCHANGES}",
|
|
322
|
+
),
|
|
323
|
+
source_ref="src/whycode/signals.py:detect_coupling",
|
|
324
|
+
)
|
|
175
325
|
return Signal(
|
|
176
326
|
kind=SignalKind.COUPLING,
|
|
177
327
|
severity=severity,
|
|
178
328
|
headline=f"Tightly coupled to {len(top)} other file{'s' if len(top) != 1 else ''}",
|
|
179
329
|
detail=f"Tends to change together with: {listed}.",
|
|
180
330
|
evidence=tuple(p for p, _ in top),
|
|
331
|
+
explanation=explanation,
|
|
181
332
|
)
|
|
182
333
|
|
|
183
334
|
|
|
@@ -189,6 +340,18 @@ def detect_silence(facts: RepoFacts) -> Signal | None:
|
|
|
189
340
|
if days < SILENCE_DAYS:
|
|
190
341
|
return None
|
|
191
342
|
severity = 2 if days < 365 else 3
|
|
343
|
+
explanation = Explanation(
|
|
344
|
+
rule="silence_untouched_threshold",
|
|
345
|
+
why_it_fired=(
|
|
346
|
+
f"most recent commit on this file is {days} days old, exceeding the "
|
|
347
|
+
f"{SILENCE_DAYS}-day silence threshold"
|
|
348
|
+
),
|
|
349
|
+
evidence=(
|
|
350
|
+
f"days_since_last_commit={days}",
|
|
351
|
+
f"threshold={SILENCE_DAYS}",
|
|
352
|
+
),
|
|
353
|
+
source_ref="src/whycode/signals.py:detect_silence",
|
|
354
|
+
)
|
|
192
355
|
return Signal(
|
|
193
356
|
kind=SignalKind.SILENCE,
|
|
194
357
|
severity=severity,
|
|
@@ -198,6 +361,7 @@ def detect_silence(facts: RepoFacts) -> Signal | None:
|
|
|
198
361
|
"before assuming the silence means stability."
|
|
199
362
|
),
|
|
200
363
|
evidence=(_short(most_recent.sha),),
|
|
364
|
+
explanation=explanation,
|
|
201
365
|
)
|
|
202
366
|
|
|
203
367
|
|
|
@@ -208,12 +372,25 @@ def detect_newborn(facts: RepoFacts) -> Signal | None:
|
|
|
208
372
|
days = _days_since(oldest.authored_at)
|
|
209
373
|
if days > NEWBORN_DAYS:
|
|
210
374
|
return None
|
|
375
|
+
explanation = Explanation(
|
|
376
|
+
rule="newborn_first_commit_window",
|
|
377
|
+
why_it_fired=(
|
|
378
|
+
f"oldest commit on this file is {days} day(s) old, within the "
|
|
379
|
+
f"{NEWBORN_DAYS}-day newborn window"
|
|
380
|
+
),
|
|
381
|
+
evidence=(
|
|
382
|
+
f"days_since_first_commit={days}",
|
|
383
|
+
f"threshold={NEWBORN_DAYS}",
|
|
384
|
+
),
|
|
385
|
+
source_ref="src/whycode/signals.py:detect_newborn",
|
|
386
|
+
)
|
|
211
387
|
return Signal(
|
|
212
388
|
kind=SignalKind.NEWBORN,
|
|
213
389
|
severity=1,
|
|
214
390
|
headline=f"New file (first commit {days} day{'s' if days != 1 else ''} ago)",
|
|
215
391
|
detail="Limited history — the usual signals are not yet trustworthy.",
|
|
216
392
|
evidence=(_short(oldest.sha),),
|
|
393
|
+
explanation=explanation,
|
|
217
394
|
)
|
|
218
395
|
|
|
219
396
|
|
|
@@ -266,9 +443,27 @@ def detect_ghost_keeper(facts: RepoFacts) -> Signal | None:
|
|
|
266
443
|
|
|
267
444
|
if ownership_share is not None:
|
|
268
445
|
ownership_phrase = f"wrote {ownership_share:.0%} of current lines"
|
|
446
|
+
ownership_evidence = f"line_ownership_share={ownership_share:.2f}"
|
|
447
|
+
rule = "ghost_keeper_inactive_line_owner"
|
|
269
448
|
else:
|
|
270
449
|
share = sum(1 for c in facts.commits if c.author_email == primary_email)
|
|
271
450
|
ownership_phrase = f"wrote {share} of {len(facts.commits)} commits here"
|
|
451
|
+
ownership_evidence = f"commits_by_primary={share}"
|
|
452
|
+
rule = "ghost_keeper_inactive_commit_owner"
|
|
453
|
+
|
|
454
|
+
explanation = Explanation(
|
|
455
|
+
rule=rule,
|
|
456
|
+
why_it_fired=(
|
|
457
|
+
f"primary author of this file has not committed anywhere for "
|
|
458
|
+
f"{days_since_seen} days (threshold {GHOST_KEEPER_DAYS})"
|
|
459
|
+
),
|
|
460
|
+
evidence=(
|
|
461
|
+
f"days_since_active={days_since_seen}",
|
|
462
|
+
f"threshold={GHOST_KEEPER_DAYS}",
|
|
463
|
+
ownership_evidence,
|
|
464
|
+
),
|
|
465
|
+
source_ref="src/whycode/signals.py:detect_ghost_keeper",
|
|
466
|
+
)
|
|
272
467
|
|
|
273
468
|
return Signal(
|
|
274
469
|
kind=SignalKind.GHOST_KEEPER,
|
|
@@ -280,6 +475,7 @@ def detect_ghost_keeper(facts: RepoFacts) -> Signal | None:
|
|
|
280
475
|
f"Knowledge may have left the team."
|
|
281
476
|
),
|
|
282
477
|
evidence=(_short(primary_commit.sha),),
|
|
478
|
+
explanation=explanation,
|
|
283
479
|
)
|
|
284
480
|
|
|
285
481
|
|
|
@@ -345,6 +541,22 @@ def detect_invariant_quotes(facts: RepoFacts) -> Signal | None:
|
|
|
345
541
|
if short not in seen_shas:
|
|
346
542
|
seen_shas.add(short)
|
|
347
543
|
evidence_shas.append(short)
|
|
544
|
+
matched_tokens: list[str] = []
|
|
545
|
+
seen_tokens: set[str] = set()
|
|
546
|
+
for line in seen:
|
|
547
|
+
m = gf._INVARIANT_RE.search(line)
|
|
548
|
+
if m and m.group(0).lower() not in seen_tokens:
|
|
549
|
+
seen_tokens.add(m.group(0).lower())
|
|
550
|
+
matched_tokens.append(m.group(0))
|
|
551
|
+
explanation = Explanation(
|
|
552
|
+
rule="invariant_token_match_in_body",
|
|
553
|
+
why_it_fired=(
|
|
554
|
+
f"{total} commit body line{'s' if total != 1 else ''} contained a known "
|
|
555
|
+
"invariant token (e.g. 'do not', 'must not', 'workaround', …)"
|
|
556
|
+
),
|
|
557
|
+
evidence=tuple(matched_tokens) if matched_tokens else (f"matches={total}",),
|
|
558
|
+
source_ref="src/whycode/git_facts.py:extract_invariant_quotes",
|
|
559
|
+
)
|
|
348
560
|
return Signal(
|
|
349
561
|
kind=SignalKind.INVARIANT_QUOTE,
|
|
350
562
|
severity=severity,
|
|
@@ -354,6 +566,7 @@ def detect_invariant_quotes(facts: RepoFacts) -> Signal | None:
|
|
|
354
566
|
),
|
|
355
567
|
detail="Past authors used cautionary language in commit messages:\n" + rendered,
|
|
356
568
|
evidence=tuple(evidence_shas),
|
|
569
|
+
explanation=explanation,
|
|
357
570
|
)
|
|
358
571
|
|
|
359
572
|
|
|
@@ -388,6 +601,7 @@ def all_signals(facts: RepoFacts) -> list[Signal]:
|
|
|
388
601
|
|
|
389
602
|
|
|
390
603
|
__all__ = [
|
|
604
|
+
"Explanation",
|
|
391
605
|
"Signal",
|
|
392
606
|
"SignalKind",
|
|
393
607
|
"all_signals",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: whycode-cli
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.5.2
|
|
4
4
|
Summary: Tells you what to be afraid of before you touch a file.
|
|
5
5
|
Author: Kevin
|
|
6
6
|
License-Expression: MIT
|
|
@@ -140,6 +140,32 @@ Score interpretation:
|
|
|
140
140
|
| 25–49 | WORTH A LOOK | One thing might bite you. Glance. |
|
|
141
141
|
| 0–24 | NO FLAGS | Quiet history — but read the diff anyway. |
|
|
142
142
|
|
|
143
|
+
### "But why exactly did this fire?" — `--explain`
|
|
144
|
+
|
|
145
|
+
When a signal looks wrong (or you just want to understand the reasoning
|
|
146
|
+
before trusting the tool), pass `--explain`. Each fired signal grows a
|
|
147
|
+
small block naming the precise rule that produced it, the literal evidence
|
|
148
|
+
the rule looked at, and the source location of the ladder branch:
|
|
149
|
+
|
|
150
|
+
```
|
|
151
|
+
$ whycode why src/payment/refund.py --explain
|
|
152
|
+
|
|
153
|
+
MED 1 incident-flagged change in history
|
|
154
|
+
1 commit matched incident keywords (latest 12 days ago:
|
|
155
|
+
'hotfix: idempotency token regression').
|
|
156
|
+
evidence: a3f4b2c
|
|
157
|
+
─ rule: incident_subject_keyword src/whycode/git_facts.py:find_incidents
|
|
158
|
+
fired because: subject 'hotfix: idempotency token regression'
|
|
159
|
+
matched the literal token 'hotfix'
|
|
160
|
+
evidence: hotfix
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Without `--explain`, output is exactly as before — this is purely an
|
|
164
|
+
opt-in transparency surface. `--explain --json` adds an `explanation`
|
|
165
|
+
key per signal in the JSON output, with the same fields. The flag covers
|
|
166
|
+
L1+L2 detectors only; if you also pass `--llm`, the L3 decision block is
|
|
167
|
+
unaffected.
|
|
168
|
+
|
|
143
169
|
## The killer use case: hand it to your AI editor
|
|
144
170
|
|
|
145
171
|
WhyCode is also an MCP server. Configure it in any MCP-aware editor or
|
|
@@ -1,22 +1,22 @@
|
|
|
1
|
-
whycode/__init__.py,sha256=
|
|
1
|
+
whycode/__init__.py,sha256=HkmUVIKP--eR8gyVbGEEEzJ1fogqXGGp-PTF9UIPdYw,96
|
|
2
2
|
whycode/__main__.py,sha256=dqAk6746YpuM-FTIH4TBOULegGc5WweojiZjce0VYgQ,105
|
|
3
3
|
whycode/cache.py,sha256=0cEPZHdolQbSiBLAOnMu20tobIrc7G0MNycpldHRpkk,18536
|
|
4
|
-
whycode/cli.py,sha256=
|
|
4
|
+
whycode/cli.py,sha256=CRDzVcZJur5DcYg1eoLfwZMSAqDJ8kwkSWg0U4ktmI4,53589
|
|
5
5
|
whycode/decisions.py,sha256=oCVhEF7QfHeci0LAWNtEjV2mUAEBJloL1rT3I4XXbkw,7570
|
|
6
|
-
whycode/git_facts.py,sha256=
|
|
6
|
+
whycode/git_facts.py,sha256=zevyDVZTIvWJrtaiufhOdFboltH5__pooWFBdO8AhBY,51567
|
|
7
7
|
whycode/ignore.py,sha256=O_8bHIt0d1U-sYrBajBa7oEqpnHWU3f6Zf-8PU8CpO0,4748
|
|
8
8
|
whycode/llm.py,sha256=leB94pBg8kUCq_BujZq5ixny0urGtKskjdaKoum_eCA,4092
|
|
9
9
|
whycode/mcp_server.py,sha256=ht1tStAkOwmQzNIRkm1eA8Tnc59fzDRSGkgyIprft-0,18503
|
|
10
|
-
whycode/risk_card.py,sha256=
|
|
10
|
+
whycode/risk_card.py,sha256=gur_01ESKVhVQJrEQeVcOvJq5KDSIu8cZbaFZtVHBMQ,13718
|
|
11
11
|
whycode/scorer.py,sha256=4pBejunfxzYhGUzMeL8uGEMQzC6DWiqwcTeMdo3eras,1444
|
|
12
|
-
whycode/signals.py,sha256=
|
|
12
|
+
whycode/signals.py,sha256=yHsNmo6fbUQS8vQwuhAPSgP7Tn3fD-Etio62upacIsQ,22176
|
|
13
13
|
whycode/suppressions.py,sha256=1lKSs-kCgpnJbcxozcgiSP8ZAfjEDMHXuM3sw4FaY78,3836
|
|
14
14
|
whycode/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
15
|
whycode/templates/github-workflow.yml,sha256=LAfHMDG2TkAwi4vCNinHk-4zOt-mCWErBpmpaqlW5oA,2251
|
|
16
16
|
whycode/templates/pre-commit,sha256=IhU11CvoDwqRAAsvHwUo-BwaNbdgy1cpXc54Z_phrmQ,316
|
|
17
|
-
whycode_cli-0.
|
|
18
|
-
whycode_cli-0.
|
|
19
|
-
whycode_cli-0.
|
|
20
|
-
whycode_cli-0.
|
|
21
|
-
whycode_cli-0.
|
|
22
|
-
whycode_cli-0.
|
|
17
|
+
whycode_cli-0.5.2.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
|
|
18
|
+
whycode_cli-0.5.2.dist-info/METADATA,sha256=dvNlR2ck9ZATEtphb69C98V3XDqjX1CLzgb1_iSFtKQ,11364
|
|
19
|
+
whycode_cli-0.5.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
20
|
+
whycode_cli-0.5.2.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
|
|
21
|
+
whycode_cli-0.5.2.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
|
|
22
|
+
whycode_cli-0.5.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|