PyPI - whycode-cli - Versions diffs - 0.2.0__py3-none-any.whl - Mend

whycode-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

whycode/__init__.py +3 -0
whycode/__main__.py +6 -0
whycode/cli.py +709 -0
whycode/git_facts.py +450 -0
whycode/mcp_server.py +204 -0
whycode/risk_card.py +192 -0
whycode/scorer.py +55 -0
whycode/signals.py +389 -0
whycode/templates/__init__.py +0 -0
whycode/templates/github-workflow.yml +42 -0
whycode/templates/pre-commit +7 -0
whycode_cli-0.2.0.dist-info/METADATA +223 -0
whycode_cli-0.2.0.dist-info/RECORD +17 -0
whycode_cli-0.2.0.dist-info/WHEEL +5 -0
whycode_cli-0.2.0.dist-info/entry_points.txt +2 -0
whycode_cli-0.2.0.dist-info/licenses/LICENSE +21 -0
whycode_cli-0.2.0.dist-info/top_level.txt +1 -0

whycode/git_facts.py ADDED Viewed

@@ -0,0 +1,450 @@
+"""Layer 1: deterministic git facts.
+Pure git plumbing wrapped in safe Python. Never interprets, never guesses.
+The output is the bedrock that Layer 2 builds on.
+Design notes
+------------
+- We delimit log output with ASCII unit (0x1f) and record (0x1e) separators
+  because they essentially never appear in commit messages or paths.
+- We use ``--follow`` so file rename history is traced through.
+- We never invoke a subcommand that mutates the repo.
+"""
+from __future__ import annotations
+import re
+import subprocess
+from collections import Counter
+from collections.abc import Sequence
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+UNIT_SEP = "\x1f"
+RECORD_SEP = "\x1e"
+# A commit subject/body containing one of these markers is treated as evidence
+# that the original author flagged something worth carrying forward.
+INCIDENT_TOKENS: tuple[str, ...] = (
+    "hotfix",
+    "incident",
+    "outage",
+    "p0",
+    "p1",
+    "sev1",
+    "sev2",
+    "production down",
+    "rollback",
+    "regression",
+)
+_INCIDENT_RE = re.compile(
+    r"|".join(rf"\b{re.escape(tok)}\b" for tok in INCIDENT_TOKENS),
+    re.IGNORECASE,
+)
+# A Conventional Commits structured marker. Unlike free-form keywords above,
+# this is a deliberate, anchored footer — high enough confidence to fire on
+# body alone with no need for a corroborating issue ID.
+_BREAKING_FOOTER_RE = re.compile(r"\bBREAKING[- ]CHANGE:", re.IGNORECASE)
+# Conventional Commits "breaking" indicator: ``feat!:``, ``fix!:``, ``refactor!:``…
+# Anchored to the start of the subject line (or after whitespace) and limited
+# to known type tokens so we don't match URL fragments like ``foo!:bar``.
+_BREAKING_CC_RE = re.compile(
+    r"(?:^|\s)(?:feat|fix|chore|refactor|perf|build|ci|docs|test|style|revert)!:",
+    re.IGNORECASE,
+)
+# Issue / incident identifiers that corroborate a body-only incident keyword:
+# - GitHub-style: #1234
+# - Jira-style:   ABC-123
+# - Severity:     SEV-1, sev1, P0, P1
+# Used to raise body matches above the "passing mention in prose" floor.
+_ISSUE_ID_RE = re.compile(
+    r"(?:#\d+|\b[A-Z][A-Z0-9_]+-\d+|\bSEV[- ]?\d\b|\bP[01]\b)",
+)
+INVARIANT_TOKENS: tuple[str, ...] = (
+    "do not",
+    "don't",
+    "must not",
+    "warning:",
+    "important:",
+    "danger:",
+    "note:",
+    "invariant",
+    "workaround",
+    "tradeoff",
+)
+# Compiled once: each token must appear as a whole phrase. Tokens made up of
+# letters and spaces only are wrapped in word boundaries, so e.g. "invariant"
+# does not match inside "invariants". Tokens containing punctuation (a colon
+# or an apostrophe, e.g. "warning:" or "don't") are matched literally.
+_INVARIANT_RE = re.compile(
+    r"|".join(
+        rf"\b{re.escape(tok)}\b" if re.match(r"^[a-z][a-z ]*$", tok) else re.escape(tok)
+        for tok in INVARIANT_TOKENS
+    ),
+    re.IGNORECASE,
+)
+@dataclass(frozen=True)
+class Commit:
+    sha: str
+    author_name: str
+    author_email: str
+    authored_at: datetime
+    subject: str
+    body: str
+    files: tuple[str, ...] = ()
+    @property
+    def message(self) -> str:
+        return f"{self.subject}\n\n{self.body}".strip()
+@dataclass(frozen=True)
+class FileChange:
+    """One file's diffstat within one commit, as produced by ``files_changed_in``."""
+    # Commit the change belongs to.
+    sha: str
+    # Path of the changed file as printed by ``git show --numstat``.
+    path: str
+    # Added / removed line counts; ``files_changed_in`` records 0 for both
+    # when git prints "-" (binary files).
+    insertions: int
+    deletions: int
+@dataclass
+class RepoFacts:
+    """Snapshot of facts relevant to a single file."""
+    # Repository the facts were gathered from, and the file they describe.
+    repo_root: Path
+    path: str
+    # Commits that touched ``path``, in the order ``commits_for_path`` returns
+    # them (newest first).
+    commits: list[Commit] = field(default_factory=list)
+    # How many of those commits also changed each other file.
+    co_changed_files: Counter[str] = field(default_factory=Counter)
+    revert_pairs: list[tuple[str, str]] = field(default_factory=list)
+    """Pairs of (revert_commit_sha, reverted_commit_sha)."""
+    # Subset of ``commits`` selected by ``find_incidents``.
+    incident_commits: list[Commit] = field(default_factory=list)
+    invariant_quotes: list[tuple[str, str]] = field(default_factory=list)
+    """Pairs of (commit_sha, line containing an invariant token)."""
+class GitError(RuntimeError):
+    """Raised when a git invocation fails or produces unexpected output.
+    ``_run_git`` raises it when the git executable is missing or exits with a
+    non-zero status.
+    """
+def _run_git(repo_root: Path, *args: str) -> str:
+    """Invoke git, return stdout. Raises GitError on non-zero exit."""
+    cmd = ["git", "-C", str(repo_root), *args]
+    try:
+        proc = subprocess.run(
+            cmd,
+            check=False,
+            capture_output=True,
+            text=True,
+            encoding="utf-8",
+            errors="replace",
+        )
+    except FileNotFoundError as exc:
+        raise GitError("git executable not found on PATH") from exc
+    if proc.returncode != 0:
+        raise GitError(
+            f"git {' '.join(args)} failed (exit {proc.returncode}): {proc.stderr.strip()}"
+        )
+    return proc.stdout
+def discover_repo_root(start: Path) -> Path:
+    """Find the enclosing git repo root for ``start``."""
+    out = _run_git(start, "rev-parse", "--show-toplevel").strip()
+    if not out:
+        raise GitError(f"{start} is not inside a git repository")
+    return Path(out)
+def is_tracked(repo_root: Path, path: str) -> bool:
+    """Return True if ``path`` is tracked by git in ``repo_root``."""
+    try:
+        out = _run_git(repo_root, "ls-files", "--error-unmatch", "--", path)
+    except GitError:
+        return False
+    return bool(out.strip())
+def _parse_iso(timestamp: str) -> datetime:
+    return datetime.fromisoformat(timestamp.strip())
+def _log_format() -> str:
+    """The format used to serialise a commit on a single record."""
+    fields = ["%H", "%an", "%ae", "%aI", "%s", "%b"]
+    return UNIT_SEP.join(fields) + RECORD_SEP
+def _parse_log_records(raw: str) -> list[Commit]:
+    commits: list[Commit] = []
+    for record in raw.split(RECORD_SEP):
+        record = record.strip("\n")
+        if not record:
+            continue
+        parts = record.split(UNIT_SEP)
+        if len(parts) < 6:
+            # Body may contain a UNIT_SEP only if a contributor pasted one in
+            # — vanishingly rare, but be defensive: re-stitch the trailing fields.
+            head = parts[:5]
+            body = UNIT_SEP.join(parts[5:])
+            parts = [*head, body]
+        sha, author_name, author_email, authored_at, subject, body = parts
+        commits.append(
+            Commit(
+                sha=sha.strip(),
+                author_name=author_name,
+                author_email=author_email,
+                authored_at=_parse_iso(authored_at),
+                subject=subject,
+                body=body.strip("\n"),
+            )
+        )
+    return commits
+def commits_for_path(
+    repo_root: Path,
+    path: str,
+    *,
+    max_count: int | None = None,
+    ref: str | None = None,
+) -> list[Commit]:
+    """Return commits that touched ``path`` (rename-aware), newest first.
+    When ``ref`` is given, only commits reachable from that revision are
+    returned — i.e., the file's history *as of* that point in time.
+    """
+    args = [
+        "log",
+        "--follow",
+        "--no-merges",
+        f"--pretty=format:{_log_format()}",
+    ]
+    if max_count is not None:
+        args.append(f"--max-count={max_count}")
+    if ref is not None:
+        args.append(ref)
+    args.extend(["--", path])
+    raw = _run_git(repo_root, *args)
+    return _parse_log_records(raw)
+def all_commits(repo_root: Path, *, max_count: int | None = None) -> list[Commit]:
+    """Return all commits in repo, newest first. Used for revert / ghost-author scans."""
+    args = ["log", "--no-merges", f"--pretty=format:{_log_format()}"]
+    if max_count is not None:
+        args.append(f"--max-count={max_count}")
+    raw = _run_git(repo_root, *args)
+    return _parse_log_records(raw)
+def files_changed_in(repo_root: Path, sha: str) -> list[FileChange]:
+    """Return the list of files (with diffstat) changed in ``sha``."""
+    raw = _run_git(
+        repo_root, "show", "--no-renames", "--numstat", "--format=", sha
+    )
+    out: list[FileChange] = []
+    for line in raw.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        parts = line.split("\t")
+        if len(parts) != 3:
+            continue
+        ins_s, del_s, path = parts
+        # Binary files appear as "-" "-".
+        try:
+            insertions = int(ins_s) if ins_s != "-" else 0
+            deletions = int(del_s) if del_s != "-" else 0
+        except ValueError:
+            continue
+        out.append(FileChange(sha=sha, path=path, insertions=insertions, deletions=deletions))
+    return out
+def co_changes(
+    repo_root: Path,
+    commits: Sequence[Commit],
+    target_path: str,
+) -> Counter[str]:
+    """Count, across the given commits, how often other files changed alongside ``target_path``.
+    The target file is excluded from the result.
+    """
+    counter: Counter[str] = Counter()
+    for commit in commits:
+        for change in files_changed_in(repo_root, commit.sha):
+            if change.path == target_path:
+                continue
+            counter[change.path] += 1
+    return counter
+_REVERT_PREFIX = 'this reverts commit '
+def find_revert_pairs(commits: Sequence[Commit]) -> list[tuple[str, str]]:
+    """Detect (revert_sha, reverted_sha) pairs from commit messages.
+    Git's default revert message body contains ``This reverts commit <sha>.``.
+    We are tolerant of leading whitespace and trailing punctuation.
+    """
+    pairs: list[tuple[str, str]] = []
+    for commit in commits:
+        for line in commit.message.splitlines():
+            stripped = line.strip().lower()
+            if not stripped.startswith(_REVERT_PREFIX):
+                continue
+            after = stripped[len(_REVERT_PREFIX) :].strip().rstrip(".")
+            # The first whitespace-separated token is the SHA.
+            token = after.split()[0] if after else ""
+            if len(token) >= 7 and all(c in "0123456789abcdef" for c in token):
+                pairs.append((commit.sha, token))
+                break
+    return pairs
+def find_incidents(commits: Sequence[Commit]) -> list[Commit]:
+    """Return commits whose evidence-level signals incident-flavored intent.
+    Acceptance ladder (highest to lowest confidence):
+      1. Subject contains an incident keyword.  A commit's subject is its
+         declared purpose, so a subject hit is treated as ground truth.
+      2. Subject carries the Conventional Commits breaking marker
+         (``feat!:`` / ``fix!:`` / …).
+      3. Body carries the structured ``BREAKING CHANGE:`` footer.  This is a
+         deliberate, anchored marker, not free-form prose.
+      4. Body contains an incident keyword AND an issue / incident
+         identifier nearby (``#1234``, ``INC-447``, ``SEV-1``, ``P0``).
+         This filters out passing mentions in prose like "feat: add
+         incident-aware logging" where the keyword describes a *feature*.
+    A bare body keyword with no corroborating ID does NOT fire.
+    """
+    out: list[Commit] = []
+    for c in commits:
+        if _INCIDENT_RE.search(c.subject) or _BREAKING_CC_RE.search(c.subject):
+            out.append(c)
+            continue
+        if _BREAKING_FOOTER_RE.search(c.body):
+            out.append(c)
+            continue
+        if _INCIDENT_RE.search(c.body) and _ISSUE_ID_RE.search(c.body):
+            out.append(c)
+    return out
+# Straight, backtick, and the four common Unicode "smart" quote code points.
+# We build the string from chr() calls because ruff's RUF001 ambiguous-char
+# check rejects the literal Unicode quotes inline.
+_QUOTE_CHARS = "\"'`" + "".join(chr(c) for c in (0x2018, 0x2019, 0x201C, 0x201D))
+def _all_matches_are_quoted(line: str, regex: re.Pattern[str]) -> bool:
+    """True iff every match of ``regex`` in ``line`` is immediately bracketed
+    by quote characters — i.e. the tokens are being *named* rather than used.
+    """
+    matches = list(regex.finditer(line))
+    if not matches:
+        return False
+    for m in matches:
+        before = line[m.start() - 1] if m.start() > 0 else ""
+        after = line[m.end()] if m.end() < len(line) else ""
+        if before in _QUOTE_CHARS and after in _QUOTE_CHARS:
+            continue
+        return False
+    return True
+def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]:
+    """Pull lines from commit *bodies* that match invariant tokens.
+    Returns pairs of (sha, the matching line) — verbatim, capped at 200 chars.
+    Body-only because the subject describes what the commit did; an actual
+    constraint is almost always stated in the body. Skipping the subject also
+    eliminates the meta-mention failure mode where a commit *about* an
+    invariant token (e.g. "fix invariant matcher") would self-flag.
+    Lines where every matching token is wrapped in quotes (``"do not"``) are
+    treated as references rather than statements and are skipped.
+    """
+    out: list[tuple[str, str]] = []
+    for commit in commits:
+        for raw_line in commit.body.splitlines():
+            line = raw_line.strip()
+            if not line:
+                continue
+            if not _INVARIANT_RE.search(line):
+                continue
+            if _all_matches_are_quoted(line, _INVARIANT_RE):
+                continue
+            out.append((commit.sha, line[:200]))
+    return out
+def author_last_activity(repo_root: Path, email: str) -> datetime | None:
+    """Most recent commit timestamp by ``email`` anywhere in the repo, or None."""
+    raw = _run_git(
+        repo_root,
+        "log",
+        "-1",
+        "--all",
+        f"--author={email}",
+        "--pretty=format:%aI",
+    )
+    raw = raw.strip()
+    if not raw:
+        return None
+    try:
+        return _parse_iso(raw)
+    except ValueError:
+        return None
+def line_ownership(repo_root: Path, path: str) -> dict[str, int]:
+    """Return ``{author_email: line_count}`` from ``git blame`` of HEAD's ``path``.
+    Empty dict if blame is unavailable (file deleted, binary, etc.). Used by
+    Layer 2 to refine ghost-keeper detection: line ownership is a stronger
+    signal than commit count, which can be skewed by a single big initial
+    commit followed by many tiny fixes.
+    """
+    try:
+        raw = _run_git(repo_root, "blame", "--line-porcelain", "HEAD", "--", path)
+    except GitError:
+        return {}
+    counts: dict[str, int] = {}
+    current_email: str | None = None
+    for line in raw.splitlines():
+        if line.startswith("author-mail "):
+            current_email = line[len("author-mail "):].strip().strip("<>")
+        elif line.startswith("\t") and current_email:
+            counts[current_email] = counts.get(current_email, 0) + 1
+    return counts
+def gather(
+    repo_root: Path,
+    path: str,
+    *,
+    max_commits: int | None = None,
+    ref: str | None = None,
+) -> RepoFacts:
+    """Top-level convenience: build a RepoFacts snapshot for ``path``.
+    Pass ``ref`` to compute facts as of a past commit (e.g., for postmortem
+    "what did this file's risk look like at the time of the outage" queries).
+    """
+    commits = commits_for_path(repo_root, path, max_count=max_commits, ref=ref)
+    return RepoFacts(
+        repo_root=repo_root,
+        path=path,
+        commits=commits,
+        co_changed_files=co_changes(repo_root, commits, path),
+        revert_pairs=find_revert_pairs(commits),
+        incident_commits=find_incidents(commits),
+        invariant_quotes=extract_invariant_quotes(commits),
+    )

whycode/mcp_server.py ADDED Viewed

@@ -0,0 +1,204 @@
+"""MCP server for WhyCode.
+Exposes WhyCode's Risk Card to MCP-aware editors and assistants so the host
+LLM can pull a file's risk profile *before* it edits the code.
+Tools
+-----
+- ``get_risk_profile(path)`` — full Risk Card.
+- ``get_file_decisions(path, limit=5)`` — decision-flavoured signals only
+  (incidents, reverts, invariants, ghost keepers), highest severity first.
+The server speaks stdio. Configure your client with:
+    {
+      "mcpServers": {
+        "whycode": {"command": "whycode", "args": ["mcp"]}
+      }
+    }
+"""
+from __future__ import annotations
+import asyncio
+import json
+import sys
+import time
+from pathlib import Path
+from typing import Any
+from mcp.server import Server
+from mcp.server.stdio import stdio_server
+from mcp.types import TextContent, Tool
+from whycode import git_facts as gf
+from whycode import risk_card as rc
+from whycode.signals import SignalKind
+# Signal kinds that ``get_file_decisions`` keeps; every other kind on the card
+# is filtered out of that tool's response.
+DECISION_KINDS = {
+    SignalKind.REVERT_CHAIN,
+    SignalKind.INCIDENT_HISTORY,
+    SignalKind.INVARIANT_QUOTE,
+    SignalKind.GHOST_KEEPER,
+}
+def _resolve(path: str) -> tuple[Path, str]:
+    p = Path(path).resolve()
+    start = p if p.is_dir() else p.parent if p.exists() else Path.cwd()
+    repo_root = gf.discover_repo_root(start)
+    if p.exists():
+        try:
+            return repo_root, str(p.relative_to(repo_root))
+        except ValueError as exc:
+            raise gf.GitError(f"{p} is not inside {repo_root}") from exc
+    return repo_root, path
+def _log_call(name: str, arguments: dict[str, Any]) -> None:
+    """Print a one-line audit record to stderr (for `whycode mcp --verbose`)."""
+    stamp = time.strftime("%H:%M:%S")
+    path = arguments.get("path", "?")
+    print(f"[whycode {stamp}] {name}(path={path!r})", file=sys.stderr, flush=True)
+def _build_server(verbose: bool = False) -> Server:
+    """Create the "whycode" MCP server and register its two tools.
+    When ``verbose`` is true, every tool call is logged via ``_log_call``
+    before it is dispatched.
+    """
+    server: Server = Server("whycode")
+    @server.list_tools()  # type: ignore[no-untyped-call,untyped-decorator]
+    async def _list_tools() -> list[Tool]:
+        """Advertise the tools, with the JSON schema of their arguments."""
+        return [
+            Tool(
+                name="get_risk_profile",
+                description=(
+                    "Return the WhyCode Risk Card for the given file path: a 0..100 "
+                    "score, a band label, and the list of fired signals (revert "
+                    "chains, incidents, coupling, silence, ghost keeper, invariant "
+                    "quotes). Call this BEFORE editing any file you are unfamiliar "
+                    "with — the response includes the SHAs that justify each flag."
+                ),
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "path": {
+                            "type": "string",
+                            "description": "Path to the file (absolute or repo-relative).",
+                        },
+                        "max_commits": {
+                            "type": "integer",
+                            "description": "Optional cap on commits scanned.",
+                        },
+                    },
+                    "required": ["path"],
+                },
+            ),
+            Tool(
+                name="get_file_decisions",
+                description=(
+                    "Return decision-flavoured signals only — past reverts, "
+                    "incident-tagged changes, ghost keepers, and invariants stated "
+                    "verbatim by past authors. Use when you specifically want the "
+                    "'why' of past changes, not the broader risk picture."
+                ),
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "path": {"type": "string"},
+                        "limit": {"type": "integer", "default": 5},
+                    },
+                    "required": ["path"],
+                },
+            ),
+        ]
+    @server.call_tool()  # type: ignore[untyped-decorator]
+    async def _call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]:
+        """Dispatch a tool call by name; unknown names raise ValueError."""
+        if verbose:
+            _log_call(name, arguments)
+        if name == "get_risk_profile":
+            return _handle_risk_profile(arguments)
+        if name == "get_file_decisions":
+            return _handle_file_decisions(arguments)
+        raise ValueError(f"Unknown tool: {name}")
+    return server
+def _summary_text(card: rc.RiskCard) -> str:
+    """One-paragraph prose summary of the card. Designed to be quotable verbatim
+    by an LLM consumer without further processing."""
+    if not card.signals:
+        return (
+            f"{card.path}: {card.score.band.value} ({card.score.value}/100). "
+            f"No flagged signals across {card.commit_count} commits — but read "
+            f"the diff anyway."
+        )
+    top = card.signals[0]
+    extras = ""
+    if len(card.signals) > 1:
+        extras = f" Plus {len(card.signals) - 1} more signal(s) in the full card."
+    return (
+        f"{card.path}: {card.score.band.value} ({card.score.value}/100). "
+        f"Top concern: {top.headline}.{extras}"
+    )
+def _handle_risk_profile(arguments: dict[str, Any]) -> list[TextContent]:
+    path = str(arguments["path"])
+    max_commits = arguments.get("max_commits")
+    try:
+        repo_root, rel = _resolve(path)
+        card = rc.build(repo_root, rel, max_commits=max_commits)
+    except gf.GitError as exc:
+        return [TextContent(type="text", text=json.dumps({"error": str(exc)}))]
+    payload = card.to_dict()
+    payload["summary"] = _summary_text(card)
+    return [TextContent(type="text", text=json.dumps(payload, indent=2))]
+def _handle_file_decisions(arguments: dict[str, Any]) -> list[TextContent]:
+    path = str(arguments["path"])
+    limit = int(arguments.get("limit", 5))
+    try:
+        repo_root, rel = _resolve(path)
+        card = rc.build(repo_root, rel)
+    except gf.GitError as exc:
+        return [TextContent(type="text", text=json.dumps({"error": str(exc)}))]
+    decisions = [s for s in card.signals if s.kind in DECISION_KINDS][:limit]
+    payload = {
+        "path": card.path,
+        "score": card.score.value,
+        "band": card.score.band.value,
+        "summary": _summary_text(card),
+        "decisions": [
+            {
+                "kind": s.kind.value,
+                "severity": s.severity,
+                "headline": s.headline,
+                "detail": s.detail,
+                "evidence": list(s.evidence),
+            }
+            for s in decisions
+        ],
+    }
+    return [TextContent(type="text", text=json.dumps(payload, indent=2))]
+async def _run(verbose: bool) -> None:
+    server = _build_server(verbose=verbose)
+    if verbose:
+        print(
+            "[whycode] MCP server up. Tool calls from the AI will be logged below.",
+            file=sys.stderr,
+            flush=True,
+        )
+    async with stdio_server() as (reader, writer):
+        await server.run(reader, writer, server.create_initialization_options())
+def serve(verbose: bool = False) -> None:
+    """Block on the MCP server. Used by ``whycode mcp``.
+    Runs the async ``_run`` coroutine to completion with ``asyncio.run``;
+    ``verbose`` turns on the per-call audit log on stderr.
+    """
+    asyncio.run(_run(verbose))
+__all__ = ["serve"]