PyPI - whycode-cli - Versions diffs - 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl - Mend

whycode-cli 0.4.0 (py3-none-any.whl) → 0.4.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

whycode/__init__.py +1 -1
whycode/cache.py +33 -7
whycode/cli.py +120 -34
whycode/git_facts.py +298 -13
whycode/ignore.py +53 -1
whycode/signals.py +18 -1
{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/METADATA +1 -1
{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/RECORD +12 -12
{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/WHEEL +0 -0
{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/entry_points.txt +0 -0
{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/licenses/LICENSE +0 -0
{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/top_level.txt +0 -0

whycode/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """WhyCode — tells you what to be afraid of before touching a file."""
-__version__ = "0.4.0"
+__version__ = "0.4.2"

whycode/cache.py CHANGED Viewed

@@ -112,10 +112,21 @@ class CacheStore:
     cache misses; this class never invokes ``git`` itself.
     """
-    def __init__(self, db_path: Path) -> None:
+    def __init__(self, db_path: Path, *, in_memory: bool = False) -> None:
+        """Open (creating if needed) the SQLite cache at ``db_path``.
+        ``in_memory=True`` opens a transient ``:memory:`` connection
+        instead — the disk file is never created and is never read.
+        Used by ``--no-cache`` to retain in-session amortisation
+        (matches the cold-fill code path) without persisting anything.
+        """
         self.db_path = db_path
-        self.db_path.parent.mkdir(parents=True, exist_ok=True)
-        self._conn = sqlite3.connect(self.db_path)
+        self._in_memory = in_memory
+        if in_memory:
+            self._conn = sqlite3.connect(":memory:")
+        else:
+            self.db_path.parent.mkdir(parents=True, exist_ok=True)
+            self._conn = sqlite3.connect(self.db_path)
         # row_factory makes column access readable in tests / debug.
         self._conn.row_factory = sqlite3.Row
         self._conn.execute("PRAGMA foreign_keys = ON")
@@ -402,13 +413,18 @@ class CacheStore:
         file_row_count = int(
             self._conn.execute("SELECT COUNT(*) FROM commit_files").fetchone()[0]
         )
-        try:
-            size_bytes = self.db_path.stat().st_size
-        except OSError:
+        if self._in_memory:
             size_bytes = 0
+            exists = False
+        else:
+            try:
+                size_bytes = self.db_path.stat().st_size
+            except OSError:
+                size_bytes = 0
+            exists = self.db_path.exists()
         return CacheStats(
             path=self.db_path,
-            exists=self.db_path.exists(),
+            exists=exists,
             schema_version=self.schema_version,
             head_sha=self.head_sha,
             commit_count=commit_count,
@@ -430,6 +446,16 @@ def open_for(repo_root: Path) -> CacheStore:
     return CacheStore(cache_path_for(repo_root))
+def open_in_memory(repo_root: Path) -> CacheStore:
+    """Open a transient in-memory cache for ``repo_root``.
+    Used by ``--no-cache`` to keep within-session amortisation (the same
+    cold-fill code path everything else uses) while never touching disk.
+    The store is destroyed on ``close()`` and has no after-effects.
+    """
+    return CacheStore(cache_path_for(repo_root), in_memory=True)
 def parse_authored_at(value: str) -> datetime:
     """Parse the ``authored_at`` string we stored from git.

whycode/cli.py CHANGED Viewed

@@ -20,10 +20,12 @@ Commands
 from __future__ import annotations
+import functools
 import json
 import sys
+from collections.abc import Callable
 from pathlib import Path
-from typing import Any
+from typing import Any, TypeVar
 import typer
 from rich.console import Console
@@ -48,18 +50,27 @@ err = Console(stderr=True)
 def _open_cache(repo_root: Path, no_cache: bool) -> ch.CacheStore | None:
-    """Open the on-disk cache for ``repo_root`` unless suppressed.
-    A None return means "do not pass a cache through git_facts" — every
-    git-side helper falls back to its original network-free, cache-free
-    implementation. This is the escape hatch behind ``--no-cache`` and
-    is also the default when the cache cannot be initialised at all
-    (read-only filesystem, etc.); we never want a cache failure to
-    block the main read path.
+    """Open the cache for ``repo_root`` according to the no-cache flag.
+    Modes:
+      * ``no_cache=False`` (the default): persistent on-disk SQLite at
+        ``.whycode/cache.db``.
+      * ``no_cache=True``: a transient ``:memory:`` SQLite store. The
+        same git-walk code path runs as for the cold-fill, but the
+        database is destroyed on ``close()`` — nothing lands on disk
+        and the next run starts cold. Keeping per-run amortisation
+        (one ``git log`` walk shared across files) is what makes
+        ``--no-cache`` at most as slow as a cold persistent fill;
+        the previous ``cache=None`` short-circuit lost that and so
+        ``--no-cache`` re-issued per-file walks every iteration.
+    A ``None`` return means "do not pass a cache through git_facts".
+    Happens only when even an in-memory open fails — very rare and
+    we never want a cache problem to block the main read path.
     """
-    if no_cache:
-        return None
     try:
+        if no_cache:
+            return ch.open_in_memory(repo_root)
         return ch.open_for(repo_root)
     except OSError:
         return None
@@ -115,6 +126,37 @@ def _require_tracked(path_arg: str) -> tuple[Path, str]:
     return repo_root, rel
+_F = TypeVar("_F", bound=Callable[..., Any])
+def _propagate_failures(func: _F) -> _F:
+    """Convert any uncaught exception into ``typer.Exit(2)``.
+    A read-only field test against psf/requests caught a bug where a single
+    bad-timezone commit raised ``ValueError`` deep inside ``_parse_log_records``;
+    Rich rendered the traceback to stderr, but the process exited with status
+    0. CI integrations could not tell that the run had silently failed
+    (a ``whycode diff --fail-on history`` step was reported as green even
+    though it had crashed). We wrap each command body so any unhandled
+    exception leaves the existing rich traceback rendering in place but
+    forces a non-zero exit code (``2`` for general failure). ``typer.Exit``
+    and ``KeyboardInterrupt`` propagate untouched so explicit exit-code
+    paths and Ctrl-C still behave normally.
+    """
+    @functools.wraps(func)
+    def wrapper(*args: Any, **kwargs: Any) -> Any:
+        try:
+            return func(*args, **kwargs)
+        except (typer.Exit, typer.Abort, KeyboardInterrupt):
+            raise
+        except Exception as exc:
+            err.print_exception(show_locals=False)
+            raise typer.Exit(2) from exc
+    return wrapper  # type: ignore[return-value]
 # --- shared: band threshold parsing ----------------------------------------
 _BAND_THRESHOLDS_BY_KEY: dict[str, int] = {
@@ -148,6 +190,7 @@ def _print_brief(card: rc.RiskCard) -> None:
 @app.command()
+@_propagate_failures
 def why(
     path: str = typer.Argument(..., help="File path to inspect."),
     json_out: bool = typer.Option(
@@ -317,6 +360,7 @@ def _resolve_base_ref(repo_root: Path, requested: str | None) -> str:
 @app.command()
+@_propagate_failures
 def diff(
     base: str | None = typer.Option(
         None, "--base", help="Base ref (default: origin/main → main → HEAD~1)."
@@ -390,7 +434,9 @@ def diff(
                 cards.append(rc.build(repo_root, f, cache=cache))
             except gf.GitError:
                 continue
-        cards.sort(key=lambda c: -c.score.value)
+        # Stable tie-break: lex smallest path on identical scores so cache
+        # and --no-cache truncate the same files at --top N.
+        cards.sort(key=lambda c: (-c.score.value, c.path))
         cards = cards[:top]
     finally:
         if cache is not None:
@@ -482,6 +528,7 @@ def diff(
 @app.command()
+@_propagate_failures
 def highlights(
     invariants: int = typer.Option(
         5, "--invariants", help="How many invariant lines to surface."
@@ -529,16 +576,17 @@ def highlights(
     inv_pairs = gf.extract_invariant_quotes(commits)
     sha_to_commit = {c.sha: c for c in commits}
-    seen_lines: dict[str, str] = {}
-    for sha, line in inv_pairs:
-        seen_lines.setdefault(line, sha)
+    deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
     inv_records: list[tuple[str, str, gf.Commit]] = []
-    for line, sha in seen_lines.items():
+    for sha, line in deduped:
         commit = sha_to_commit.get(sha)
         if commit is None:
             continue
         inv_records.append((line, sha, commit))
-    inv_records.sort(key=lambda t: t[2].authored_at, reverse=True)
+    # Sort newest first; on identical timestamps fall back to lexicographically
+    # smallest sha so cache and --no-cache emit byte-identical output.
+    inv_records.sort(key=lambda t: t[1])  # secondary: sha asc
+    inv_records.sort(key=lambda t: t[2].authored_at, reverse=True)  # primary
     inv_records = inv_records[:invariants]
     incident_records = gf.find_incidents(commits)[:incidents]
@@ -636,6 +684,7 @@ def _sample_indices(total: int, max_samples: int) -> list[int]:
 @app.command()
+@_propagate_failures
 def timeline(
     path: str = typer.Argument(..., help="File path to inspect."),
     samples: int = typer.Option(
@@ -677,6 +726,12 @@ def timeline(
                     top,
                 )
             )
+    # Field-test report F14: ``timeline`` used to render rows in whatever
+    # non-monotonic order ``_sample_indices`` produced (uniform-across-index
+    # selection on a list whose ordering is git's parent traversal).  Sort
+    # by date ascending before rendering so a reader can scan left-to-right
+    # without misreading the trajectory.
+    rows.sort(key=lambda r: r[0])
     if json_out:
         console.print_json(
@@ -714,6 +769,7 @@ def timeline(
 @app.command()
+@_propagate_failures
 def scan(
     top: int = typer.Option(10, "--top", help="How many files to list."),
     sample: int = typer.Option(
@@ -783,7 +839,10 @@ def scan(
         if cache is not None:
             cache.close()
-    cards.sort(key=lambda c: -c.score.value)
+    # Stable tie-break on identical scores: lexicographically smallest path
+    # so cache and --no-cache produce byte-identical text output for the
+    # same HEAD. Without this, the truncation at --top N is non-deterministic.
+    cards.sort(key=lambda c: (-c.score.value, c.path))
     top_cards = cards[:top]
     if not top_cards:
         # Be honest about what "no flagged files" actually means. A user who
@@ -811,6 +870,7 @@ def scan(
 @app.command()
+@_propagate_failures
 def honest(
     path: str = typer.Argument(..., help="File path to inspect."),
     json_out: bool = typer.Option(False, "--json", help="Emit JSON instead of prose."),
@@ -874,6 +934,7 @@ def honest(
 @app.command()
+@_propagate_failures
 def show(
     sha: str = typer.Argument(..., help="Commit SHA (full or short) to inspect."),
     repo: Path = typer.Option(Path("."), "--repo", help="Path inside the repo."),
@@ -903,7 +964,8 @@ def show(
             cards.append(rc.build(repo_root, change.path))
         except gf.GitError:
             continue
-    cards.sort(key=lambda c: -c.score.value)
+    # Stable tie-break on identical scores: lex smallest path.
+    cards.sort(key=lambda c: (-c.score.value, c.path))
     if json_out:
         console.print_json(
@@ -981,6 +1043,7 @@ _MCP_SNIPPET = '''    {
 @app.command()
+@_propagate_failures
 def tour(
     repo: Path = typer.Option(Path("."), "--repo", help="Path inside the repo."),
     no_cache: bool = typer.Option(
@@ -1018,29 +1081,50 @@ def tour(
         inv_pairs = gf.extract_invariant_quotes(commits)
         sha_to_commit = {c.sha: c for c in commits}
-        seen_lines: dict[str, str] = {}
-        for sha, line in inv_pairs:
-            seen_lines.setdefault(line, sha)
+        deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
+        # Sort newest first with sha-asc tie-break so cache and --no-cache
+        # surface the same three lines in the same order.
+        deduped_sorted = sorted(
+            (p for p in deduped if p[0] in sha_to_commit),
+            key=lambda p: p[0],
+        )
+        deduped_sorted.sort(
+            key=lambda p: sha_to_commit[p[0]].authored_at, reverse=True
+        )
         invariants_top = [
-            (line, sha_to_commit[sha])
-            for line, sha in seen_lines.items()
-            if sha in sha_to_commit
+            (line, sha_to_commit[sha]) for sha, line in deduped_sorted
         ][:3]
         incidents_top = gf.find_incidents(commits)[:3]
         if invariants_top or incidents_top:
-            console.print("[bold yellow]Decisions and incidents[/bold yellow]")
-            for line, c in invariants_top:
-                console.print(f"  [italic]{line}[/italic]")
+            # Field-test report F16: the original tour rendered both classes
+            # under one ``Decisions and incidents`` header, so a parenthetical
+            # invariant prose line was visually indistinguishable from a real
+            # incident commit. Render two subheads matching the layout
+            # ``highlights`` already uses.
+            if invariants_top:
                 console.print(
-                    f"  [dim]{c.sha[:7]}  {c.authored_at.date()}  {c.author_name}[/dim]\n"
+                    f"[bold yellow]Stated invariants[/bold yellow] "
+                    f"[dim]({len(invariants_top)} most recent)[/dim]"
                 )
-            for c in incidents_top:
-                subj = c.subject if len(c.subject) <= 70 else c.subject[:69] + "…"
-                console.print(f"  [red]{subj}[/red]")
+                for line, c in invariants_top:
+                    console.print(f"  [italic]{line}[/italic]")
+                    console.print(
+                        f"  [dim]{c.sha[:7]}  {c.authored_at.date()}  "
+                        f"{c.author_name}[/dim]\n"
+                    )
+            if incidents_top:
                 console.print(
-                    f"  [dim]{c.sha[:7]}  {c.authored_at.date()}  {c.author_name}[/dim]\n"
+                    f"[bold red]Recent incidents[/bold red] "
+                    f"[dim]({len(incidents_top)} most recent)[/dim]"
                 )
+                for c in incidents_top:
+                    subj = c.subject if len(c.subject) <= 70 else c.subject[:69] + "…"
+                    console.print(f"  [red]{subj}[/red]")
+                    console.print(
+                        f"  [dim]{c.sha[:7]}  {c.authored_at.date()}  "
+                        f"{c.author_name}[/dim]\n"
+                    )
         else:
             console.print(
                 "[dim]No headline decisions or incidents in recent history.[/dim]"
@@ -1072,7 +1156,8 @@ def tour(
                     ]
                     if useful:
                         cards.append(card)
-            cards.sort(key=lambda c: -c.score.value)
+            # Stable tie-break: lex smallest path on identical scores.
+            cards.sort(key=lambda c: (-c.score.value, c.path))
         if cards:
             console.print("[bold red]Top 3 risky files[/bold red]")
@@ -1113,6 +1198,7 @@ def tour(
 @app.command()
+@_propagate_failures
 def init(
     force: bool = typer.Option(
         False, "--force", "-f", help="Overwrite existing files instead of skipping."

whycode/git_facts.py CHANGED Viewed

@@ -18,10 +18,11 @@ from __future__ import annotations
 import re
 import subprocess
+import sys
 from collections import Counter
 from collections.abc import Sequence
 from dataclasses import dataclass, field
-from datetime import datetime
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import TYPE_CHECKING
@@ -31,6 +32,17 @@ if TYPE_CHECKING:
 UNIT_SEP = "\x1f"
 RECORD_SEP = "\x1e"
+# Per-process record of commits whose authored timestamp could not be parsed
+# even after defensive normalisation. We surface these once per session via
+# a single stderr line so a single bad record does not spam a per-line warning
+# — never on every read, never to a network.
+_UNPARSEABLE_TIMESTAMPS: set[str] = set()
+_BAD_TZ_WARNING_EMITTED = False
+# The Unix epoch as a tz-aware UTC datetime; used as a safe fallback when a
+# commit's authored_at is irrecoverably malformed. Picked over datetime.min
+# because callers expect a tz-aware value (signal age math compares to UTC).
+_EPOCH_FALLBACK = datetime.fromtimestamp(0, UTC)
 # A commit subject/body containing one of these markers is treated as evidence
 # that the original author flagged something worth carrying forward.
 INCIDENT_TOKENS: tuple[str, ...] = (
@@ -69,6 +81,39 @@ _BREAKING_CC_RE = re.compile(
 _ISSUE_ID_RE = re.compile(
     r"(?:#\d+|\b[A-Z][A-Z0-9_]+-\d+|\bSEV[- ]?\d\b|\bP[01]\b)",
 )
+# Security-advisory tokens fire as incidents on subject alone — the deliberate
+# act of citing one is unambiguous high-confidence evidence.
+_CVE_RE = re.compile(r"\bCVE-\d{4}-\d+\b")
+_GHSA_RE = re.compile(r"\bGHSA-[a-z0-9-]+\b", re.IGNORECASE)
+# Default ``git revert`` body subject ("Reverted ...") and the human variant
+# ("Reverts <sha>") are both unambiguous incident-class evidence on subject.
+_REVERTED_SUBJECT_RE = re.compile(r'^Reverted\s+"', re.IGNORECASE)
+_REVERTS_SUBJECT_RE = re.compile(r"\bReverts\s+[0-9a-f]{7,}\b", re.IGNORECASE)
+# Subject-level "regression" usage that is descriptive rather than incident:
+# "regression test(s)", "regression suite", "no regression", "regression nature".
+# These prevention/test-housekeeping phrases must NOT fire as incidents.
+_BENIGN_REGRESSION_RE = re.compile(
+    r"\b(?:regression\s+(?:tests?|suite|nature)|no\s+regression)\b",
+    re.IGNORECASE,
+)
+# Conversely, a subject using "regression" with a corroborating incident id
+# (``#1234``, ``INC-447``, …), or as part of an unambiguous incident phrase
+# like ``regression in <something>`` / ``Fixed: regression`` / ``fix the
+# regression``, IS an incident.  These are the patterns that distinguish
+# "split the regression-test files" (housekeeping) from "fix the refund
+# regression" (a real outage marker).
+_INCIDENT_REGRESSION_RE = re.compile(
+    # ``regression in <something>`` — "fix the refund regression in admin".
+    r"\bregression\s+in\b"
+    # ``fix the regression`` / ``fix a regression`` — explicit incident verb.
+    r"|\bfix(?:ed|es)?\s+(?:the\s+|a\s+)?regression\b"
+    # ``regression — …`` / ``regression: …`` — stated subject category.
+    r"|\bregression\s*[:—-]"
+    # ``Fixed: regression`` / ``Hotfix: regression`` — pre-colon incident verb
+    # explicitly framing the rest as the incident category itself.
+    r"|\b(?:fix(?:ed|es)?|hotfix|revert(?:ed)?)\s*:\s*regression\b",
+    re.IGNORECASE,
+)
 INVARIANT_TOKENS: tuple[str, ...] = (
     "do not",
     "don't",
@@ -187,8 +232,94 @@ def is_tracked(repo_root: Path, path: str) -> bool:
     return bool(out.strip())
+def _normalise_tz_offset(timestamp: str) -> str:
+    """Repair pathological tz offsets that ``datetime.fromisoformat`` rejects.
+    Real-world git history contains commits authored on systems with broken
+    timezone configuration (e.g. an offset of ``+518:00`` or ``+51800`` —
+    encountered on a 2011 commit in psf/requests, where the underlying object
+    really stores ``+51800``). ``fromisoformat`` raises ``ValueError`` on
+    those, which would otherwise poison every command that walks history.
+    We coerce the suffix into the canonical ``[+-]HH:MM`` form when we can
+    recognise it. Anything else is left untouched and the caller falls back
+    to a safe default.
+    """
+    stripped = timestamp.strip()
+    if "T" not in stripped:
+        return stripped
+    body, _, after_t = stripped.partition("T")
+    sign_idx = -1
+    for i, ch in enumerate(after_t):
+        if ch in "+-":
+            sign_idx = i
+            break
+    if sign_idx < 0:
+        return stripped
+    prefix = body + "T" + after_t[:sign_idx]
+    sign = after_t[sign_idx]
+    rest = after_t[sign_idx + 1 :]
+    digits = rest.replace(":", "")
+    if not digits.isdigit():
+        return stripped
+    # Acceptable shapes: 4 digits → HHMM; 5 digits → HHHMM (broken, e.g.
+    # ``+51800`` → hours ``5``, minutes ``18``); 6 digits → HHMMSS (rare).
+    if len(digits) == 4:
+        hh, mm = digits[:2], digits[2:]
+    elif len(digits) == 5:
+        hh, mm = "0" + digits[0], digits[1:3]
+    elif len(digits) == 6:
+        hh, mm = digits[:2], digits[2:4]
+    elif len(digits) == 2:
+        hh, mm = digits, "00"
+    else:
+        return stripped
+    try:
+        if int(hh) > 23 or int(mm) > 59:
+            return stripped
+    except ValueError:
+        return stripped
+    return f"{prefix}{sign}{hh}:{mm}"
 def _parse_iso(timestamp: str) -> datetime:
-    return datetime.fromisoformat(timestamp.strip())
+    """Parse an ISO 8601 timestamp; tolerate malformed tz offsets.
+    Returns a Unix-epoch sentinel if the offset cannot be repaired so a
+    single bad record never crashes a whole-repo analysis. The bad raw
+    string is tracked in a per-session set so verbose callers can mention
+    which commits were affected.
+    """
+    raw = timestamp.strip()
+    try:
+        return datetime.fromisoformat(raw)
+    except ValueError:
+        repaired = _normalise_tz_offset(raw)
+        if repaired != raw:
+            try:
+                return datetime.fromisoformat(repaired)
+            except ValueError:
+                pass
+        _UNPARSEABLE_TIMESTAMPS.add(raw)
+        return _EPOCH_FALLBACK
+def _maybe_warn_bad_timestamps() -> None:
+    """Emit a single stderr line per session if any record fell back to epoch.
+    Called once at the end of a top-level read so a single bad commit never
+    spams a per-line warning. Stays purely local — no network, no telemetry.
+    """
+    global _BAD_TZ_WARNING_EMITTED
+    if _BAD_TZ_WARNING_EMITTED or not _UNPARSEABLE_TIMESTAMPS:
+        return
+    _BAD_TZ_WARNING_EMITTED = True
+    n = len(_UNPARSEABLE_TIMESTAMPS)
+    print(
+        f"warning: {n} commit{'s' if n != 1 else ''} had an unparseable "
+        f"authored timestamp; treating those as epoch for date math.",
+        file=sys.stderr,
+    )
 def _log_format() -> str:
@@ -285,6 +416,7 @@ def commits_for_path(
             cache.store_path_log(path, head_sha, [c.sha for c in commits])
     if max_count is not None and len(commits) > max_count:
         commits = commits[:max_count]
+    _maybe_warn_bad_timestamps()
     return commits
@@ -342,12 +474,15 @@ def all_commits(
     """
     if cache is not None:
         full = _all_commits_via_cache(repo_root, cache)
+        _maybe_warn_bad_timestamps()
         return full if max_count is None else full[:max_count]
     args = ["log", "--no-merges", f"--pretty=format:{_log_format()}"]
     if max_count is not None:
         args.append(f"--max-count={max_count}")
     raw = _run_git(repo_root, *args)
-    return _parse_log_records(raw)
+    out = _parse_log_records(raw)
+    _maybe_warn_bad_timestamps()
+    return out
 def _store_commits(cache: CacheStore, commits: Sequence[Commit]) -> None:
@@ -624,26 +759,77 @@ def find_revert_pairs(commits: Sequence[Commit]) -> list[tuple[str, str]]:
     return pairs
+def _is_subject_incident(subject: str, body: str) -> bool:
+    """Determine whether a single commit's subject signals incident intent.
+    The trickiest case is ``regression``. The 0.4.0 classifier accepted any
+    subject that contained the word and so flagged routine bug fixes that
+    happened to mention "regression tests" or "regression nature" as
+    incidents. The new rule:
+    - ``regression`` in a subject fires only when corroborated:
+        * an issue / incident id on the same subject or anywhere in the body
+          (``#1234``, ``INC-447``, ``SEV-1``, …); OR
+        * a pre-marker that anchors the word as an incident reference,
+          such as ``regression in <something>`` / ``Fixed: regression`` /
+          ``fix the regression`` / ``regression — …``.
+    - The phrases ``regression test(s)``, ``regression suite``,
+      ``no regression``, ``regression nature`` never fire on their own.
+    - Subjects citing a security advisory (``CVE-…`` / ``GHSA-…``) always
+      fire — the act of naming an advisory is unambiguous high-confidence.
+    - The default ``git revert`` body subject (``Reverted "…"``) and the
+      human variant (``Reverts <sha>``) always fire — both are explicit
+      rollback markers.
+    - Other incident keywords (``hotfix``, ``outage``, ``rollback``, …)
+      keep their existing subject-level acceptance.
+    """
+    if _CVE_RE.search(subject) or _GHSA_RE.search(subject):
+        return True
+    if _REVERTED_SUBJECT_RE.search(subject) or _REVERTS_SUBJECT_RE.search(subject):
+        return True
+    if _BREAKING_CC_RE.search(subject):
+        return True
+    # Regression demands corroboration: either it appears as part of a
+    # high-confidence phrase, or an issue/incident id is present.
+    has_regression = bool(re.search(r"\bregression\b", subject, re.IGNORECASE))
+    if has_regression:
+        if _BENIGN_REGRESSION_RE.search(subject):
+            return False
+        if _INCIDENT_REGRESSION_RE.search(subject):
+            return True
+        if _ISSUE_ID_RE.search(subject) or _ISSUE_ID_RE.search(body):
+            return True
+        # Strip the word and check whether any other incident keyword carries
+        # the subject — a "rollback regression" should still fire on
+        # "rollback" alone.
+        without_regression = re.sub(r"\bregression\b", "", subject, flags=re.IGNORECASE)
+        return bool(_INCIDENT_RE.search(without_regression))
+    return bool(_INCIDENT_RE.search(subject))
 def find_incidents(commits: Sequence[Commit]) -> list[Commit]:
     """Return commits whose evidence-level signals incident-flavored intent.
     Acceptance ladder (highest to lowest confidence):
-      1. Subject contains an incident keyword.  A commit's subject is its
-         declared purpose, so a subject hit is treated as ground truth.
-      2. Subject carries the Conventional Commits breaking marker
+      1. Subject cites a security advisory (``CVE-…`` / ``GHSA-…``) — fires
+         on subject alone.
+      2. Subject is a default ``git revert`` body (``Reverted "…"``) or a
+         human revert pointer (``Reverts <sha>``) — fires on subject alone.
+      3. Subject carries the Conventional Commits breaking marker
          (``feat!:`` / ``fix!:`` / …).
-      3. Body carries the structured ``BREAKING CHANGE:`` footer.  This is a
-         deliberate, anchored marker, not free-form prose.
-      4. Body contains an incident keyword AND an issue / incident
-         identifier nearby (``#1234``, ``INC-447``, ``SEV-1``, ``P0``).
-         This filters out passing mentions in prose like "feat: add
-         incident-aware logging" where the keyword describes a *feature*.
+      4. Subject contains an incident keyword that is NOT a benign
+         "regression test/suite/nature" phrase. ``regression`` requires
+         either an issue id (on subject or body) or a pre-marker that
+         anchors it as an incident reference.
+      5. Body carries the structured ``BREAKING CHANGE:`` footer.
+      6. Body contains an incident keyword AND an issue / incident
+         identifier nearby. Filters out passing mentions in prose.
     A bare body keyword with no corroborating ID does NOT fire.
     """
     out: list[Commit] = []
     for c in commits:
-        if _INCIDENT_RE.search(c.subject) or _BREAKING_CC_RE.search(c.subject):
+        if _is_subject_incident(c.subject, c.body):
             out.append(c)
             continue
         if _BREAKING_FOOTER_RE.search(c.body):
@@ -696,6 +882,41 @@ def _all_matches_are_quoted(line: str, regex: re.Pattern[str]) -> bool:
     return True
+# An ALLCAPS line prefix (e.g. ``WARNING:``, ``ERROR:``, ``DEBUG:``) is the
+# canonical signature of pasted compiler / linter / spell-checker output.
+# A genuine human invariant statement opens with a normal sentence ("Do
+# not...", "Important: ...") and never with two or more uppercase letters
+# followed by an immediate colon.
+_TOOL_OUTPUT_ALLCAPS_RE = re.compile(r"^[A-Z]{2,}:\s")
+# A ``path:line:`` or ``path:line:col:`` prefix near the start of a line is
+# the unmistakable shape of compiler / aspell output. We accept any path-
+# shaped token (slashes, dots, hyphens, underscores, alnum) followed by
+# ``:<digits>:`` — anchored so it also catches ``./foo/bar.py:50:``.
+_TOOL_OUTPUT_PATH_RE = re.compile(r"^[\w./-]+:\d+:")
+# Per-(commit, file) cap on invariant lines pulled from one body. A real
+# author rarely states more than two crisp invariants in a single message;
+# anything beyond is almost certainly a paste. Set deliberately low so a
+# single noisy commit can no longer dominate the "highlights" view.
+_PER_COMMIT_INVARIANT_CAP = 2
+def _is_tool_output_line(line: str, prev_line: str) -> bool:
+    """True if ``line`` looks like quoted compiler / linter / aspell output.
+    Heuristics:
+      - ALLCAPS followed immediately by a colon (``WARNING:``, ``ERROR:``,
+        ``DEBUG:``…) — pasted tool output.
+      - ``path/to/file:line:`` prefix near the start — clang / mypy / aspell.
+      - Preceded by a ``> `` block-quote line — markdown-style "this is
+        what the tool said" framing.
+    """
+    if _TOOL_OUTPUT_ALLCAPS_RE.match(line):
+        return True
+    if _TOOL_OUTPUT_PATH_RE.match(line):
+        return True
+    return prev_line.startswith("> ")
 def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]:
     """Pull lines from commit *bodies* that match invariant tokens.
@@ -706,23 +927,87 @@ def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]
     eliminates the meta-mention failure mode where a commit *about* an
     invariant token (e.g. "fix invariant matcher") would self-flag.
+    Two filters keep pasted tool output out of the "stated invariants"
+    surface:
+    1. Lines that look like quoted compiler / linter / aspell output are
+       dropped (``WARNING: …``, ``foo/bar.py:50: …``, lines preceded by a
+       ``> `` block-quote). One noisy spell-check commit on django used to
+       supply 15 of the top-20 highlights; this rule kills it at the
+       source.
+    2. A per-commit cap of two invariants. Real authors rarely state more
+       than two crisp constraints in one message; anything beyond is
+       almost certainly a paste. The first two matches are preserved
+       (most informative-looking entries rank).
     Lines where every matching token is wrapped in quotes (``"do not"``) are
     treated as references rather than statements and are skipped.
     """
     out: list[tuple[str, str]] = []
     for commit in commits:
+        per_commit = 0
+        prev_line = ""
         for raw_line in commit.body.splitlines():
             line = raw_line.strip()
             if not line:
+                prev_line = raw_line
                 continue
+            if _is_tool_output_line(line, prev_line):
+                prev_line = raw_line
+                continue
+            prev_line = raw_line
             if not _INVARIANT_RE.search(line):
                 continue
             if _all_matches_are_quoted(line, _INVARIANT_RE):
                 continue
+            if per_commit >= _PER_COMMIT_INVARIANT_CAP:
+                continue
             out.append((commit.sha, line[:200]))
+            per_commit += 1
     return out
+def dedupe_invariant_lines(
+    pairs: Sequence[tuple[str, str]],
+    sha_to_commit: dict[str, Commit],
+) -> list[tuple[str, str]]:
+    """Collapse identical invariant lines to one canonical (sha, line) pair.
+    When two commits state the same invariant line — typically a cherry-pick
+    onto a maintenance branch, or a rebase that duplicated the message — we
+    must pick exactly one to surface. Without a deterministic rule the cache
+    and ``--no-cache`` paths can disagree (their walk orders differ when
+    timestamps tie), and downstream JSON consumers see flaky output across
+    runs.
+    The rule:
+    1. Earliest ``authored_at`` wins. The original statement is canonical;
+       cherry-picks and rebases are derivatives.
+    2. Lexicographically smallest ``sha`` breaks ties on identical timestamps.
+    The returned list preserves first-encounter order of the (now-unique)
+    lines so downstream code that sorts by date sees a stable input.
+    Pairs whose ``sha`` is not in ``sha_to_commit`` keep their first-seen
+    record (no metadata to compare on).
+    """
+    canonical: dict[str, str] = {}
+    for sha, line in pairs:
+        existing = canonical.get(line)
+        if existing is None:
+            canonical[line] = sha
+            continue
+        old_commit = sha_to_commit.get(existing)
+        new_commit = sha_to_commit.get(sha)
+        if old_commit is None or new_commit is None:
+            continue
+        old_key = (old_commit.authored_at, existing)
+        new_key = (new_commit.authored_at, sha)
+        if new_key < old_key:
+            canonical[line] = sha
+    return [(sha, line) for line, sha in canonical.items()]
 def author_last_activity(repo_root: Path, email: str) -> datetime | None:
     """Most recent commit timestamp by ``email`` anywhere in the repo, or None."""
     raw = _run_git(

whycode/ignore.py CHANGED Viewed

@@ -3,7 +3,16 @@
 These are paths/files that almost always pollute risk analysis without
 adding signal: changelogs (touched on every release, so they look "tightly
 coupled to everything"), lockfiles (regenerated on every dependency bump),
-vendored third-party code, and machine-generated stubs.
+vendored third-party code, machine-generated stubs, CI / packaging
+metadata, project-membership files (``AUTHORS``, ``LICENSE``), and
+translation catalogues (``*.po`` / ``*.mo``).
+A field test against django (10,000 commits, 7,043 files) showed the
+top-10 risk list was dominated by these high-touch metadata files —
+``AUTHORS``, ``.github/workflows/*.yml``, locale ``.po``, ``.gitignore``
+— and no application code at all reached the top 10. A scan-top list
+that surfaces zero source files is unactionable; demoting these
+metadata files lets real source code rank.
 Users can extend this list with a ``.whycodeignore`` file at repo root,
 one ``fnmatch``-style pattern per line. Comments start with ``#``.
@@ -71,6 +80,49 @@ DEFAULT_IGNORE_PATTERNS: tuple[str, ...] = (
     "*.ttf",
     "*.otf",
     "*.eot",
+    # CI / repo metadata — high-touch but never the source of risk in code.
+    ".github/**",
+    ".gitlab/**",
+    ".circleci/**",
+    ".gitignore",
+    ".gitattributes",
+    ".editorconfig",
+    ".pre-commit-config.yaml",
+    ".readthedocs.yaml",
+    ".readthedocs.yml",
+    ".flake8",
+    ".coveragerc",
+    "tox.ini",
+    "pytest.ini",
+    "Makefile",
+    # Project-membership / licensing files — touched on every contributor add.
+    "AUTHORS",
+    "AUTHORS.*",
+    "CONTRIBUTORS",
+    "CONTRIBUTORS.*",
+    "LICENSE",
+    "LICENSE.*",
+    "LICENSES/**",
+    "COPYING",
+    "COPYING.*",
+    "NOTICE",
+    "NOTICE.*",
+    # Python packaging metadata — low-signal-per-touch.
+    "setup.py",
+    "setup.cfg",
+    "MANIFEST.in",
+    # Translation catalogues — bulk-edited every release, never an indicator
+    # of code risk.
+    "*.po",
+    "*.mo",
+    "*.pot",
+    # Release-notes-style ``*.txt`` files only — narrow patterns; we are
+    # deliberately conservative here so a random ``requirements.txt`` is not
+    # ignored. The shapes below match common repo layouts (django, flask).
+    "release_notes/*.txt",
+    "docs/releases/*.txt",
+    "docs/release-notes/*.txt",
+    "release-notes/*.txt",
 )
 _USER_IGNORE_FILE = ".whycodeignore"

whycode/signals.py CHANGED Viewed

@@ -13,6 +13,7 @@ from enum import StrEnum
 from typing import TYPE_CHECKING
 from whycode import git_facts as gf
+from whycode import ignore as ign
 if TYPE_CHECKING:
     from whycode.git_facts import RepoFacts
@@ -148,7 +149,23 @@ def detect_high_churn(facts: RepoFacts) -> Signal | None:
 def detect_coupling(facts: RepoFacts) -> Signal | None:
-    paired = [(p, n) for p, n in facts.co_changed_files.items() if n >= COUPLING_MIN_COCHANGES]
+    """Files that change together with the target file, ranked by frequency.
+    Co-change candidates are filtered through the same ignore list that
+    powers ``whycode scan`` (built-in defaults plus an optional repo-local
+    ``.whycodeignore``). Without this filter, a per-file coupling signal
+    would surface ``CHANGELOG``, ``.github/workflows/*.yml``, ``AUTHORS``
+    and similar high-touch metadata as the file's "tight coupling" — the
+    field-test report flagged ``flask/app.py``'s top co-changers as 60%
+    metadata, leaving only two genuinely informative entries. Applying
+    the same filter here keeps the most-shown signal honest.
+    """
+    patterns = ign.effective_patterns(facts.repo_root)
+    paired = [
+        (p, n)
+        for p, n in facts.co_changed_files.items()
+        if n >= COUPLING_MIN_COCHANGES and not ign.is_ignored(p, patterns)
+    ]
     if not paired:
         return None
     paired.sort(key=lambda x: (-x[1], x[0]))

{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: whycode-cli
-Version: 0.4.0
+Version: 0.4.2
 Summary: Tells you what to be afraid of before you touch a file.
 Author: Kevin
 License-Expression: MIT

{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/RECORD RENAMED Viewed

@@ -1,22 +1,22 @@
-whycode/__init__.py,sha256=DT8PsrrvRkCF3U7cRgChch8aCNUobwQ9iFpPowLmLWY,96
+whycode/__init__.py,sha256=YXMeIO9f86OJ3_EonP3wlcLW6Qv9sIHQQZqr-Ja4HV8,96
 whycode/__main__.py,sha256=dqAk6746YpuM-FTIH4TBOULegGc5WweojiZjce0VYgQ,105
-whycode/cache.py,sha256=v55KbSlTqmP_ot1FEFqxCNpAApj6vthpHl2l0lGLX3A,17477
-whycode/cli.py,sha256=97LJmxOYBTtHkEtMlCabIogCxJNswOGusAdpvT3-mf8,45146
+whycode/cache.py,sha256=0cEPZHdolQbSiBLAOnMu20tobIrc7G0MNycpldHRpkk,18536
+whycode/cli.py,sha256=uRW5aysC2ufYvs_qPC1gzZcjQTFUZHdXxAmF25d4oY8,49328
 whycode/decisions.py,sha256=oCVhEF7QfHeci0LAWNtEjV2mUAEBJloL1rT3I4XXbkw,7570
-whycode/git_facts.py,sha256=cKPywdrAsQBsPl7R4kLO5zBAELmXlhoy23g29XjuK18,29044
-whycode/ignore.py,sha256=sdRO_0HSedm8aO69CSGl-zQrUVX5MEg9QGcAJWwAvP4,3021
+whycode/git_facts.py,sha256=MLp8e4nGaam6lBGCHY5-sftHj71lyg_HmmBOBx3g-kg,41829
+whycode/ignore.py,sha256=O_8bHIt0d1U-sYrBajBa7oEqpnHWU3f6Zf-8PU8CpO0,4748
 whycode/llm.py,sha256=leB94pBg8kUCq_BujZq5ixny0urGtKskjdaKoum_eCA,4092
 whycode/mcp_server.py,sha256=ht1tStAkOwmQzNIRkm1eA8Tnc59fzDRSGkgyIprft-0,18503
 whycode/risk_card.py,sha256=xOJkHwIkS_6yw_dSowsQ6LHfeD9Mwr2tymL7_wqxs0U,8855
 whycode/scorer.py,sha256=4pBejunfxzYhGUzMeL8uGEMQzC6DWiqwcTeMdo3eras,1444
-whycode/signals.py,sha256=M2x6868G1YQ4eWoIuwE0PMjurCoZn1jyJWySLF7FlW0,13085
+whycode/signals.py,sha256=z0kZfXR60nS-j56nchHd1V3aK8A5CGR1BAyHZZAff3s,13899
 whycode/suppressions.py,sha256=1lKSs-kCgpnJbcxozcgiSP8ZAfjEDMHXuM3sw4FaY78,3836
 whycode/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 whycode/templates/github-workflow.yml,sha256=LAfHMDG2TkAwi4vCNinHk-4zOt-mCWErBpmpaqlW5oA,2251
 whycode/templates/pre-commit,sha256=IhU11CvoDwqRAAsvHwUo-BwaNbdgy1cpXc54Z_phrmQ,316
-whycode_cli-0.4.0.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
-whycode_cli-0.4.0.dist-info/METADATA,sha256=3VurI0V9_AtQdTTC8Fyis3C3pulEIdEe_bMC4_iH7xs,10218
-whycode_cli-0.4.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
-whycode_cli-0.4.0.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
-whycode_cli-0.4.0.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
-whycode_cli-0.4.0.dist-info/RECORD,,
+whycode_cli-0.4.2.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
+whycode_cli-0.4.2.dist-info/METADATA,sha256=GD3cP18eEcHePHEXxroFuuZ-2pysLn51biNROQKDBXw,10218
+whycode_cli-0.4.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+whycode_cli-0.4.2.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
+whycode_cli-0.4.2.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
+whycode_cli-0.4.2.dist-info/RECORD,,

{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

whycode-cli 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

whycode-cli 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl