PyPI - whycode-cli - Versions diffs - 0.4.2__tar.gz → 0.5.2__tar.gz - Mend

whycode-cli 0.4.2__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{whycode_cli-0.4.2/src/whycode_cli.egg-info → whycode_cli-0.5.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: whycode-cli
-Version: 0.4.2
+Version: 0.5.2
 Summary: Tells you what to be afraid of before you touch a file.
 Author: Kevin
 License-Expression: MIT
@@ -140,6 +140,32 @@ Score interpretation:
 | 25–49   | WORTH A LOOK        | One thing might bite you. Glance.       |
 | 0–24    | NO FLAGS            | Quiet history — but read the diff anyway. |
+### "But why exactly did this fire?" — `--explain`
+When a signal looks wrong (or you just want to understand the reasoning
+before trusting the tool), pass `--explain`. Each fired signal grows a
+small block naming the precise rule that produced it, the literal evidence
+the rule looked at, and the source location of the ladder branch:
+```
+$ whycode why src/payment/refund.py --explain
+   MED     1 incident-flagged change in history
+           1 commit matched incident keywords (latest 12 days ago:
+           'hotfix: idempotency token regression').
+           evidence: a3f4b2c
+           ─ rule: incident_subject_keyword  src/whycode/git_facts.py:find_incidents
+             fired because: subject 'hotfix: idempotency token regression'
+                            matched the literal token 'hotfix'
+             evidence: hotfix
+```
+Without `--explain`, output is exactly as before — this is purely an
+opt-in transparency surface. `--explain --json` adds an `explanation`
+key per signal in the JSON output, with the same fields. The flag covers
+L1+L2 detectors only; if you also pass `--llm`, the L3 decision block is
+unaffected.
 ## The killer use case: hand it to your AI editor
 WhyCode is also an MCP server. Configure it in any MCP-aware editor or

{whycode_cli-0.4.2 → whycode_cli-0.5.2}/README.md RENAMED Viewed

@@ -110,6 +110,32 @@ Score interpretation:
 | 25–49   | WORTH A LOOK        | One thing might bite you. Glance.       |
 | 0–24    | NO FLAGS            | Quiet history — but read the diff anyway. |
+### "But why exactly did this fire?" — `--explain`
+When a signal looks wrong (or you just want to understand the reasoning
+before trusting the tool), pass `--explain`. Each fired signal grows a
+small block naming the precise rule that produced it, the literal evidence
+the rule looked at, and the source location of the ladder branch:
+```
+$ whycode why src/payment/refund.py --explain
+   MED     1 incident-flagged change in history
+           1 commit matched incident keywords (latest 12 days ago:
+           'hotfix: idempotency token regression').
+           evidence: a3f4b2c
+           ─ rule: incident_subject_keyword  src/whycode/git_facts.py:find_incidents
+             fired because: subject 'hotfix: idempotency token regression'
+                            matched the literal token 'hotfix'
+             evidence: hotfix
+```
+Without `--explain`, output is exactly as before — this is purely an
+opt-in transparency surface. `--explain --json` adds an `explanation`
+key per signal in the JSON output, with the same fields. The flag covers
+L1+L2 detectors only; if you also pass `--llm`, the L3 decision block is
+unaffected.
 ## The killer use case: hand it to your AI editor
 WhyCode is also an MCP server. Configure it in any MCP-aware editor or

{whycode_cli-0.4.2 → whycode_cli-0.5.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "whycode-cli"
-version = "0.4.2"
+version = "0.5.2"
 description = "Tells you what to be afraid of before you touch a file."
 readme = "README.md"
 license = "MIT"

{whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """WhyCode — tells you what to be afraid of before touching a file."""
-__version__ = "0.4.2"
+__version__ = "0.5.2"

{whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/cli.py RENAMED Viewed

@@ -20,10 +20,11 @@ Commands
 from __future__ import annotations
+import contextlib
 import functools
 import json
 import sys
-from collections.abc import Callable
+from collections.abc import Callable, Iterator
 from pathlib import Path
 from typing import Any, TypeVar
@@ -126,6 +127,42 @@ def _require_tracked(path_arg: str) -> tuple[Path, str]:
     return repo_root, rel
+@contextlib.contextmanager
+def _memoised_is_ignored(repo_root: Path) -> Iterator[None]:
+    """Temporarily replace ``ign.is_ignored`` with a per-path memoising wrapper.
+    The diff command asks the same ``is_ignored`` question for thousands of
+    co-change candidates per file, and each uncached answer runs fnmatch over
+    the whole pattern list (~83 patterns; measured at ~100 CPU-seconds across
+    a 1,927-file diff on django).
+    The effective patterns for ``repo_root`` are resolved once on entry.
+    Verdicts are remembered per path string, and only for calls that use that
+    exact pattern object (the wrapper's default); a call that passes any
+    other pattern object is forwarded to the real function uncached.
+    The original function is put back when the ``with`` block exits, also
+    when the block raises, so code outside the block never sees the wrapper.
+    """
+    patterns = ign.effective_patterns(repo_root)
+    unpatched = ign.is_ignored
+    verdicts: dict[str, bool] = {}
+    def memoised(path: str, patterns_arg: object = patterns) -> bool:
+        # Identity, not equality: the memo is only valid for the pattern
+        # object captured above.
+        if patterns_arg is not patterns:
+            return unpatched(path, patterns_arg)  # type: ignore[arg-type]
+        verdict = verdicts.get(path)
+        if verdict is None:
+            verdict = verdicts[path] = unpatched(path, patterns)
+        return verdict
+    ign.is_ignored = memoised  # type: ignore[assignment]
+    try:
+        yield
+    finally:
+        ign.is_ignored = unpatched
 _F = TypeVar("_F", bound=Callable[..., Any])
@@ -240,6 +277,15 @@ def why(
         "--no-cache",
         help="Bypass the local SQLite cache at .whycode/cache.db.",
     ),
+    explain: bool = typer.Option(
+        False,
+        "--explain",
+        help=(
+            "Below each signal, print the precise rule that fired: the literal "
+            "matched tokens, threshold values, and the source location of the "
+            "ladder branch. L1+L2 only — L3 (--llm) decisions are not annotated."
+        ),
+    ),
 ) -> None:
     """Print the Risk Card for ``path``."""
     repo_root, rel = _require_tracked(path)
@@ -328,12 +374,12 @@ def why(
                 card = card.with_decisions(tuple(decisions))
         if json_out:
-            console.print_json(json.dumps(card.to_dict()))
+            console.print_json(json.dumps(card.to_dict(explain=explain)))
             return
         if brief:
             _print_brief(card)
             return
-        console.print(rc.render_text(card))
+        console.print(rc.render_text(card, explain=explain))
     finally:
         if cache is not None:
             cache.close()
@@ -428,16 +474,57 @@ def diff(
     cache = _open_cache(repo_root, no_cache)
     try:
-        cards: list[rc.RiskCard] = []
-        for f in files:
-            try:
-                cards.append(rc.build(repo_root, f, cache=cache))
-            except gf.GitError:
-                continue
-        # Stable tie-break: lex smallest path on identical scores so cache
-        # and --no-cache truncate the same files at --top N.
-        cards.sort(key=lambda c: (-c.score.value, c.path))
-        cards = cards[:top]
+        # One git log walk feeds every changed file's scoring. Without this
+        # batched load, diff against an old base on a large repo runs N
+        # `git log --follow` calls (one per changed file): on django at 1,927
+        # changed files the legacy path measured 6+ minutes, with the
+        # 12+ minute variant timing out outright. ``load_diff_facts`` parses
+        # one un-pathed walk into a path -> [Commit] map; per-file scoring
+        # then does dict lookups instead of re-shelling-out.
+        try:
+            diff_facts = gf.load_diff_facts(repo_root, cache=cache)
+        except gf.GitError as exc:
+            err.print(f"[red]error:[/red] {exc}")
+            raise typer.Exit(2) from exc
+        # Pre-compute the ignore-pattern set ONCE and a verdict-per-path
+        # memo. ``signals.detect_coupling`` (re-introduced in 0.4.1 as F10)
+        # filters every coupling candidate through ``ign.is_ignored`` —
+        # without memoisation that's 83 patterns x 700 candidates x 1,927
+        # files = ~100 CPU-seconds across the diff. The memo cache turns
+        # each path's verdict into a dict lookup after the first hit.
+        with _memoised_is_ignored(repo_root):
+            # First pass: every changed file is scored without the
+            # ghost-keeper detector, which would otherwise fire ``git
+            # blame`` per file. With 1,927 changed files on django this
+            # single deferral saves ~5 minutes. We then sort and
+            # re-evaluate only the top-N with full signals — at most
+            # ``top`` blame calls instead of ``len(files)``.
+            prelim: list[rc.RiskCard] = []
+            for f in files:
+                try:
+                    prelim.append(
+                        rc.build_from_diff_facts(diff_facts, f, skip_ghost_keeper=True)
+                    )
+                except gf.GitError:
+                    continue
+            # Stable tie-break (from 0.4.2): lex smallest path on identical
+            # scores so cache and --no-cache truncate the same files at --top N.
+            prelim.sort(key=lambda c: (-c.score.value, c.path))
+            # Second pass: re-score the top-N with the full detector ladder
+            # so the rendered table includes ghost-keeper findings where
+            # they apply. Files outside the top-N keep their first-pass
+            # score; they were not going to appear in the user's view
+            # anyway.
+            refined_top: list[rc.RiskCard] = []
+            for prelim_card in prelim[:top]:
+                try:
+                    refined_top.append(
+                        rc.build_from_diff_facts(diff_facts, prelim_card.path)
+                    )
+                except gf.GitError:
+                    refined_top.append(prelim_card)
+            cards = refined_top
+            cards.sort(key=lambda c: (-c.score.value, c.path))
     finally:
         if cache is not None:
             cache.close()

{whycode_cli-0.4.2 → whycode_cli-0.5.2}/src/whycode/git_facts.py RENAMED Viewed

@@ -735,6 +735,242 @@ def _populate_diffstat_cache(
         cache.upsert_commit_files(rows)
+# ---- batch loading for whycode diff ---------------------------------------
+@dataclass(frozen=True)
+class DiffFacts:
+    """Whole-repo history snapshot built once per ``whycode diff`` evaluation.
+    The diff command scores N changed files; previously each file fired its
+    own ``git log --follow`` plus a co-change diffstat pass, so wall-clock
+    cost scaled with N. ``DiffFacts`` replaces N path-restricted log walks
+    with a single un-pathed walk: one ``git log --no-merges --numstat`` over
+    the repo, parsed once into ``commits_by_path`` (every commit that named
+    each path) and ``co_change_index`` (each commit's full file-set, used
+    for in-memory coupling counts). Per-file scoring then reads from this
+    map rather than re-shelling-out.
+    The map deliberately does NOT follow renames: the diff command only
+    scores files present in HEAD's working tree, so the tradeoff is "lose
+    rename-resolved history pre-rename" against "scoring 1,927 files in
+    seconds rather than minutes". Coupling against pre-rename names still
+    surfaces under those names in the map; the surface diff in practice is
+    a stable-tie-break difference, not a structural one.
+    Only the path column of each numstat row is kept. When ``load_diff_facts``
+    is given a cache it therefore persists each commit's path set with
+    insertion/deletion counts of zero.
+    ``frozen=True`` only prevents rebinding the fields; the two dicts are
+    ordinary mutable dicts and are shared, not copied, by readers.
+    """
+    # Root of the repository the snapshot was loaded from.
+    repo_root: Path
+    # The per-path cap mentioned below is applied by ``load_diff_facts`` only
+    # when it is called with ``max_commits``; otherwise lists are uncapped.
+    commits_by_path: dict[str, list[Commit]]
+    """``path -> [Commit]``, newest-first, capped per path during load.
+    A missing key — i.e. ``commits_by_path.get(path)`` returns ``None`` —
+    means the loader walk did not see this path. ``gather_for_diff`` treats
+    that the same as an empty list: a path that the un-pathed walk did not
+    touch has no history to score from.
+    """
+    # Keyed by full commit sha; values include the path being scored itself,
+    # which ``gather_for_diff`` filters out when counting co-changes.
+    co_change_index: dict[str, tuple[str, ...]]
+    """``commit_sha -> tuple of paths touched by that commit``.
+    Snapshot of the same numstat parse used to build ``commits_by_path``.
+    Per-file ``co_changes`` reads this for in-memory coupling counts so
+    the diff pipeline never re-issues ``git log --no-walk`` per file.
+    """
+    # ``None`` when the snapshot was loaded without a cache.
+    cache: CacheStore | None = None
+    """Optional cache, threaded through so signal detectors (specifically
+    ``detect_ghost_keeper``) reuse it for ``git blame`` line ownership."""
+# One ``git log --numstat`` row: "<insertions>\t<deletions>\t<path>".
+# Either count column may be a literal "-" instead of digits (git prints
+# "-" for binary files). Group 3 is the path; it is the only group the
+# log parser reads.
+_NUMSTAT_LINE_RE = re.compile(r"^(\d+|-)\t(\d+|-)\t(.+)$")
+def load_diff_facts(
+    repo_root: Path,
+    *,
+    max_commits: int | None = None,
+    cache: CacheStore | None = None,
+) -> DiffFacts:
+    """Build a :class:`DiffFacts` snapshot from one ``git log`` invocation.
+    Strategy:
+      1. Walk HEAD once with ``git log --no-merges --no-renames --numstat``.
+      2. Parse each commit + its full file-set into in-memory maps.
+      3. Return the snapshot for the diff command's per-file scorer to drive.
+    ``--no-renames`` keeps every numstat path literal. With rename detection
+    on (git's default for ``log``), a renamed file is printed as
+    ``old => new`` or ``dir/{old => new}/file``; that text is not a real
+    path, so the rename commit would be filed under a key no lookup can
+    match. With detection off the same commit lists the old and the new
+    path as two ordinary rows.
+    The walk is intentionally un-pathed: the diff command scores files
+    that appear in ``git diff --name-only base...HEAD``, and one walk that
+    captures every commit's file-set is cheaper than N path-restricted
+    walks that each re-walk the full graph.
+    Args:
+        repo_root: Repository to walk.
+        max_commits: Optional per-path cap, applied *after* the walk so it
+            limits per-file depth without changing the cost of the walk.
+            ``None`` keeps every commit.
+        cache: Optional store. When given (and at least one commit parsed),
+            the walked commits and each commit's path set are persisted so
+            a later invocation on the same HEAD can reuse them, and the
+            cache's HEAD sha is set if it was not set already.
+    Returns:
+        The populated :class:`DiffFacts`, carrying ``cache`` through.
+    Raises:
+        Whatever ``_run_git`` raises for the log walk (the CLI caller expects
+        ``GitError``). A failing ``git rev-parse HEAD`` is tolerated: the
+        cache's HEAD sha is simply left untouched.
+    """
+    # Pretty format: RECORD_SEP starts each commit; metadata fields are
+    # UNIT_SEP-delimited; the body is the last metadata field. Numstat
+    # output git appends after the body needs no further separator —
+    # the next commit's leading RECORD_SEP marks the boundary.
+    pretty_format = (
+        f"{RECORD_SEP}%H{UNIT_SEP}%an{UNIT_SEP}%ae{UNIT_SEP}"
+        f"%aI{UNIT_SEP}%s{UNIT_SEP}%b"
+    )
+    raw = _run_git(
+        repo_root,
+        "log",
+        "--no-merges",
+        # Literal paths only; see the docstring for why rename detection
+        # must be off for this walk.
+        "--no-renames",
+        "--numstat",
+        f"--pretty=format:{pretty_format}",
+    )
+    all_commits, commits_by_path, co_change_index = _parse_log_with_files(raw)
+    if max_commits is not None:
+        commits_by_path = {p: cs[:max_commits] for p, cs in commits_by_path.items()}
+    if cache is not None and all_commits:
+        _store_commits(cache, all_commits)
+        # Persist diffstat presence rows so a subsequent `why` against the
+        # same HEAD does not re-shell-out per file. ``_parse_log_with_files``
+        # keeps only the path column of each numstat row, so the
+        # insertion/deletion counts are stored as zero.
+        # NOTE(review): if ``upsert_commit_files`` overwrites existing rows,
+        # these zeros could replace real counts written earlier by
+        # ``_populate_diffstat_cache`` — confirm against the cache schema.
+        files_rows: list[tuple[str, str, int, int]] = [
+            (sha, p, 0, 0)
+            for sha, paths in co_change_index.items()
+            for p in paths
+        ]
+        if files_rows:
+            cache.upsert_commit_files(files_rows)
+        try:
+            head_sha = _run_git(repo_root, "rev-parse", "HEAD").strip()
+        except GitError:
+            head_sha = ""
+        # Never overwrite a HEAD sha the cache already recorded.
+        if head_sha and not cache.head_sha:
+            cache.set_head_sha(head_sha)
+    return DiffFacts(
+        repo_root=repo_root,
+        commits_by_path=commits_by_path,
+        co_change_index=co_change_index,
+        cache=cache,
+    )
+def _parse_log_with_files(
+    raw: str,
+) -> tuple[list[Commit], dict[str, list[Commit]], dict[str, tuple[str, ...]]]:
+    """Split raw ``git log --numstat`` output into commits and path indexes.
+    ``raw`` is expected in the layout ``load_diff_facts`` requests: each
+    commit starts with ``RECORD_SEP``, then six ``UNIT_SEP``-delimited
+    fields (sha, author name, author email, author date, subject, body),
+    then zero or more numstat rows (``ins\\tdel\\tpath``).
+    Returns ``(all_commits, commits_by_path, co_change_index)``:
+      - ``all_commits``: every commit that parsed, in the order of ``raw``
+        (newest first for a plain ``git log`` walk).
+      - ``commits_by_path[path]``: the commits whose numstat rows named
+        ``path``, in that same order.
+      - ``co_change_index[sha]``: the full path tuple of that commit.
+    The first line of a record holds the metadata plus the start of the
+    body. Every later line is body text until the first line that matches
+    :data:`_NUMSTAT_LINE_RE`; from then on only matching lines are kept (as
+    paths) and anything else is dropped.
+    A record is skipped when it is empty, has fewer than six fields, has a
+    blank sha, or carries a timestamp ``_parse_iso`` rejects.
+    """
+    parsed: list[Commit] = []
+    by_path: dict[str, list[Commit]] = {}
+    paths_by_sha: dict[str, tuple[str, ...]] = {}
+    for chunk in raw.split(RECORD_SEP):
+        chunk = chunk.strip("\n")
+        if not chunk:
+            continue
+        header, *rest = chunk.split("\n")
+        # Bounded split: a UNIT_SEP inside the body stays in the sixth field.
+        fields = header.split(UNIT_SEP, 5)
+        if len(fields) < 6:
+            continue
+        sha_field, author_name, author_email, stamp, subject, body_start = fields
+        sha = sha_field.strip()
+        if not sha:
+            continue
+        message = [body_start] if body_start else []
+        touched: list[str] = []
+        numstat_started = False
+        for line in rest:
+            row = _NUMSTAT_LINE_RE.match(line)
+            if row is not None:
+                numstat_started = True
+                touched.append(row.group(3))
+            elif not numstat_started:
+                message.append(line)
+        try:
+            authored = _parse_iso(stamp)
+        except ValueError:
+            # One commit with an unparseable timestamp should not abort the
+            # whole diff; drop that commit and carry on.
+            continue
+        commit = Commit(
+            sha=sha,
+            author_name=author_name,
+            author_email=author_email,
+            authored_at=authored,
+            subject=subject,
+            body="\n".join(message).strip("\n"),
+            files=tuple(touched),
+        )
+        parsed.append(commit)
+        paths_by_sha[sha] = commit.files
+        for touched_path in touched:
+            by_path.setdefault(touched_path, []).append(commit)
+    return parsed, by_path, paths_by_sha
+def gather_for_diff(
+    diff_facts: DiffFacts,
+    path: str,
+    *,
+    max_commits: int | None = None,
+) -> RepoFacts:
+    """Assemble the :class:`RepoFacts` for ``path`` from a loaded snapshot.
+    Called once per changed file by the diff command in place of the
+    per-file ``gather()``: the commit history and the co-change counts both
+    come from ``diff_facts``' in-memory maps, so no git process is started
+    here. A path the snapshot never saw yields an empty history.
+    ``max_commits``, when given, keeps only the first that many commits of
+    the path's history before anything is derived from it. The co-change
+    counter tallies, over the kept commits, every other path in each
+    commit's file-set; ``path`` itself is excluded.
+    """
+    history = diff_facts.commits_by_path.get(path, [])
+    if max_commits is not None:
+        history = history[:max_commits]
+    file_sets = diff_facts.co_change_index
+    neighbours: Counter[str] = Counter(
+        other
+        for commit in history
+        for other in file_sets.get(commit.sha, ())
+        if other != path
+    )
+    return RepoFacts(
+        repo_root=diff_facts.repo_root,
+        path=path,
+        commits=history,
+        co_changed_files=neighbours,
+        revert_pairs=find_revert_pairs(history),
+        incident_commits=find_incidents(history),
+        invariant_quotes=extract_invariant_quotes(history),
+        cache=diff_facts.cache,
+    )
 _REVERT_PREFIX = 'this reverts commit '

whycode-cli 0.4.2__tar.gz → 0.5.2__tar.gz

whycode-cli 0.4.2__tar.gz → 0.5.2__tar.gz