culprit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
culprit/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ """culprit — root-cause analysis for a PR or branch.
2
+
3
+ Repo-agnostic engine: deterministic git/PR analysis that emits structured JSON.
4
+ The only LLM step (the "why it broke" narrative) is isolated behind
5
+ ``culprit.reasoning`` so the same engine drives both the Claude Code skill
6
+ (harness reasons) and the standalone CLI (Claude API reasons).
7
+ """
8
+
9
+ __version__ = "0.1.0"
culprit/_proc.py ADDED
@@ -0,0 +1,46 @@
1
+ """Thin, read-only subprocess helpers for git and gh.
2
+
3
+ Every command here is read-only by construction. Nothing in culprit ever
4
+ mutates the target repository or the PR.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import shutil
9
+ import subprocess
10
+ from typing import List, Optional
11
+
12
+
13
class ProcError(RuntimeError):
    """Raised when a child process finishes with a non-zero exit status.

    Attributes:
        cmd: The argv list that was executed.
        returncode: The exit status reported for the process.
        stderr: The captured standard-error text, unmodified.
    """

    def __init__(self, cmd: List[str], returncode: int, stderr: str):
        self.cmd, self.returncode, self.stderr = cmd, returncode, stderr
        command_line = " ".join(cmd)
        detail = stderr.strip()  # the message carries a trimmed copy only
        super().__init__(f"`{command_line}` exited {returncode}: {detail}")
21
+
22
+
23
def run(cmd: List[str], cwd: Optional[str] = None, check: bool = True) -> str:
    """Run ``cmd`` and return its captured standard output as text.

    Args:
        cmd: The argv list to execute (no shell is involved).
        cwd: Working directory for the child, or ``None`` to inherit.
        check: When true, a non-zero exit status raises ``ProcError``.

    Returns:
        The decoded standard output of the process.

    Raises:
        ProcError: The process exited non-zero and ``check`` is true.
    """
    proc = subprocess.run(
        cmd,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        # Command output may contain bytes that are invalid in the locale
        # encoding (binary-ish diff content, legacy-encoded names). Strict
        # decoding would abort the whole run with UnicodeDecodeError, so
        # undecodable bytes are substituted instead.
        errors="replace",
    )
    if check and proc.returncode != 0:
        raise ProcError(cmd, proc.returncode, proc.stderr)
    return proc.stdout
35
+
36
+
37
def git(args: List[str], repo: str, check: bool = True) -> str:
    """Invoke ``git -C <repo> <args...>`` through ``run`` and return its stdout."""
    argv = ["git", "-C", repo]
    argv = argv + args
    return run(argv, check=check)
39
+
40
+
41
def have_gh() -> bool:
    """Report whether a ``gh`` executable can be located on ``PATH``."""
    located = shutil.which("gh")
    return located is not None
43
+
44
+
45
def gh(args: List[str], repo: str, check: bool = True) -> str:
    """Invoke ``gh <args...>`` through ``run`` with ``repo`` as the working directory."""
    argv = ["gh"] + args
    return run(argv, cwd=repo, check=check)
culprit/blast_radius.py ADDED
@@ -0,0 +1,124 @@
1
+ """Feature path: what can this change break?
2
+
3
+ For each changed source file, find who imports it (reverse-import map), which
4
+ tests cover those modules, and which touched files live in shared/core areas
5
+ (high blast radius). Heuristic but grounded — the reasoning layer ranks risk
6
+ and recommends the test surface from this structured map.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import os
11
+ import re
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from . import _proc
15
+
16
+ DEFAULT_SOURCE_GLOBS = [
17
+ "*.js", "*.jsx", "*.ts", "*.tsx", "*.mjs", "*.cjs", "*.vue", "*.svelte",
18
+ "*.py", "*.go", "*.rb", "*.java", "*.kt", "*.scala", "*.cs", "*.php",
19
+ "*.rs", "*.c", "*.h", "*.cc", "*.cpp", "*.hpp", "*.m", "*.swift",
20
+ ]
21
+ # Test-file conventions across ecosystems: JS spec/test, Python test_*/*_test,
22
+ # Go *_test.go, Java/Kotlin/C# *Test/*Tests, Ruby *_spec, plus test dirs.
23
+ DEFAULT_TEST_RE = re.compile(
24
+ r"(\.spec\.|\.test\.|_test\.|_spec\.|/__tests__/|(^|/)cypress/|(^|/)tests?/"
25
+ r"|(^|/)test_[^/]*\.(py|rb)$|Tests?\.(java|kt|cs|scala|swift)$|_test\.go$)", re.I)
26
+ HIGH_RISK_RE = re.compile(r"(^|/)(shared|common|core|lib|utils?|helpers?|base|hooks|store)(/|$)", re.I)
27
+
28
+ _INDEX_RE = re.compile(r"(^|/)(index|__init__|mod)\.[^/]+$")
29
+
30
+
31
def _module_token(path: str) -> str:
    """Guess the identifier other files most likely use to import ``path``."""
    filename = os.path.basename(path)
    if not _INDEX_RE.search(path):
        # Ordinary module: the file name without its extension.
        stem, _ext = os.path.splitext(filename)
        return stem
    # Package entry files (index.* / __init__.* / mod.*) are imported by the
    # name of their directory; with no directory component, fall back to the
    # file name itself.
    parent = os.path.basename(os.path.dirname(path))
    return parent if parent else filename
37
+
38
+
39
def _importers(repo: str, token: str, exclude: str, source_globs: List[str]) -> List[str]:
    """List tracked files with an import-like line that mentions ``token``.

    Runs ``git grep -l -I -E`` restricted to ``source_globs`` and drops
    ``exclude`` (the changed file itself) from the matches. An empty token
    yields an empty list without running git.
    """
    if not token:
        return []
    # The pattern looks for an import-ish keyword followed, on the same line,
    # by the token as a delimited segment. Boundaries are spelled out with
    # POSIX-style classes because git grep -E has no \w / \b.
    pattern = r"(import|require|include|from|use).*[^A-Za-z0-9_]{}([^A-Za-z0-9_]|$)".format(
        re.escape(token))
    grep_cmd = ["grep", "-l", "-I", "-E", "-e", pattern, "--"]
    grep_cmd = grep_cmd + source_globs
    # check=False: a non-zero exit from git grep does not raise; whatever
    # stdout was produced is still parsed.
    listing = _proc.git(grep_cmd, repo, check=False)
    hits: List[str] = []
    for name in listing.splitlines():
        if name.strip() and name != exclude:
            hits.append(name)
    return hits
52
+
53
+
54
def test_gap(changed_files: List[str], repo: str,
             source_globs: Optional[List[str]] = None, max_files: int = 60) -> Dict[str, Any]:
    """Find changed non-test files that no test file appears to import.

    Each changed file that is not itself a test is looked up in the
    reverse-import map; importers matching the test-file pattern count as
    covering tests. Only the first ``max_files`` entries are examined.

    Returns:
        A dict with ``untested`` (paths with no covering test),
        ``covering_tests`` (sorted test paths) and ``notes``.
    """
    globs = source_globs or DEFAULT_SOURCE_GLOBS
    candidates = [f for f in changed_files if f]
    notes: List[str] = []
    total = len(candidates)
    if total > max_files:
        notes.append("{} files; checked the first {}".format(total, max_files))
        candidates = candidates[:max_files]

    covering = set()
    untested: List[str] = []
    for changed in candidates:
        if DEFAULT_TEST_RE.search(changed):
            # The changed file is a test itself; nothing to report for it.
            continue
        token = _module_token(changed)
        test_importers = [
            imp for imp in _importers(repo, token, changed, globs)
            if DEFAULT_TEST_RE.search(imp)
        ]
        if not test_importers:
            untested.append(changed)
            continue
        covering.update(test_importers)

    return {"untested": untested, "covering_tests": sorted(covering), "notes": notes}
79
+
80
+
81
def analyze(ctx: Dict[str, Any], repo: str,
            source_globs: Optional[List[str]] = None,
            max_dependents: int = 50, max_files: int = 200) -> Dict[str, Any]:
    """Map the blast radius of the changed files listed in ``ctx``.

    For each changed file (at most ``max_files``) this records which files
    import it (at most ``max_dependents`` each), which tests are touched or
    import it, and which paths are high-risk: either they sit in a
    shared/core-looking directory or they have at least 10 dependents.

    Returns:
        A dict with ``changed_files``, ``dependents`` and
        ``dependent_counts`` (ordered by descending dependent count),
        ``covering_tests``, ``high_risk``, ``total_dependents`` and ``notes``.
    """
    globs = source_globs or DEFAULT_SOURCE_GLOBS
    changed = [f for f in ctx.get("changed_files", []) if f]
    notes: List[str] = []
    if len(changed) > max_files:
        notes.append("changeset has {} files; mapping dependents for the first {} "
                     "(narrow the base or analyze one commit)".format(len(changed), max_files))
        changed = changed[:max_files]

    dependents: Dict[str, List[str]] = {}
    covering_tests = set()
    high_risk: List[str] = []

    for path in changed:
        if DEFAULT_TEST_RE.search(path):
            # The change itself touches a test file.
            covering_tests.add(path)
        if HIGH_RISK_RE.search(path):
            high_risk.append(path)

        importers = _importers(repo, _module_token(path), path, globs)[:max_dependents]
        if not importers:
            continue
        dependents[path] = importers
        for importer in importers:
            if DEFAULT_TEST_RE.search(importer):
                covering_tests.add(importer)

    # A file with many dependents is high-risk even outside shared/ areas.
    for path in dependents:
        if len(dependents[path]) >= 10 and path not in high_risk:
            high_risk.append(path)

    # Stable sort: ties keep the order in which files were encountered.
    order = sorted(dependents, key=lambda p: len(dependents[p]), reverse=True)
    ranked_dependents = {p: dependents[p] for p in order}
    counts = {p: len(dependents[p]) for p in order}
    return {
        "changed_files": changed,
        "dependents": ranked_dependents,
        "dependent_counts": counts,
        "covering_tests": sorted(covering_tests),
        "high_risk": high_risk,
        "total_dependents": sum(counts.values()),
        "notes": notes,
    }
culprit/classify.py ADDED
@@ -0,0 +1,87 @@
1
+ """Classify a change as a bugfix or a feature, with evidence.
2
+
3
+ Deterministic scoring over branch name, PR labels, and commit/title prefixes.
4
+ The verdict is advisory: the Claude Code harness (or the API reasoning layer)
5
+ makes the final call, but the score + evidence give it grounded signal instead
6
+ of guessing.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from typing import Any, Dict, List, Tuple
12
+
13
+ _BUG_BRANCH = re.compile(r"^(bug|bugfix|fix|hotfix|patch)[/\-_]", re.I)
14
+ _FEAT_BRANCH = re.compile(r"^(feat|feature|enhancement|chore|refactor)[/\-_]", re.I)
15
+
16
+ # Leading [\W_]* tolerates real-world prefixes like "- fix:", "🚀 feat:", ": fixes".
17
+ _BUG_PREFIX = re.compile(r"^[\W_]*(bug\s*)?fix(es|ed)?\b|^[\W_]*hotfix\b|^[\W_]*patch\b", re.I)
18
+ _FEAT_PREFIX = re.compile(r"^[\W_]*(feat|feature|add|implement|introduce|chore|refactor)\b", re.I)
19
+
20
+ _BUG_LABELS = {"bug", "bugfix", "regression", "defect", "hotfix"}
21
+ _FEAT_LABELS = {"feature", "enhancement", "feat", "improvement"}
22
+
23
+
24
+ def _add(evidence: List[str], score: int, delta: int, msg: str) -> int:
25
+ evidence.append(msg)
26
+ return score + delta
27
+
28
+
29
+ def classify(ctx: Dict[str, Any]) -> Dict[str, Any]:
30
+ """Return {verdict, confidence, evidence, score} from a pr_context dict."""
31
+ score = 0 # positive → bugfix, negative → feature
32
+ evidence: List[str] = []
33
+
34
+ branch = ctx.get("head_ref") or ""
35
+ if _BUG_BRANCH.match(branch):
36
+ score = _add(evidence, score, 2, "branch '{}' uses a fix/bug prefix".format(branch))
37
+ elif _FEAT_BRANCH.match(branch):
38
+ score = _add(evidence, score, -2, "branch '{}' uses a feat/feature prefix".format(branch))
39
+
40
+ labels = [str(l).lower() for l in (ctx.get("labels") or [])]
41
+ for lab in labels:
42
+ if lab in _BUG_LABELS:
43
+ score = _add(evidence, score, 3, "PR label '{}' indicates a bug".format(lab))
44
+ elif lab in _FEAT_LABELS:
45
+ score = _add(evidence, score, -3, "PR label '{}' indicates a feature".format(lab))
46
+
47
+ title = ctx.get("title") or ""
48
+ if title:
49
+ if _BUG_PREFIX.search(title):
50
+ score = _add(evidence, score, 2, "PR title '{}' reads like a fix".format(title))
51
+ elif _FEAT_PREFIX.search(title):
52
+ score = _add(evidence, score, -2, "PR title '{}' reads like a feature".format(title))
53
+
54
+ bug_commits = 0
55
+ feat_commits = 0
56
+ for c in ctx.get("commits", []):
57
+ subj = c.get("subject") or ""
58
+ if _BUG_PREFIX.search(subj):
59
+ bug_commits += 1
60
+ elif _FEAT_PREFIX.search(subj):
61
+ feat_commits += 1
62
+ if bug_commits or feat_commits:
63
+ if bug_commits > feat_commits:
64
+ score = _add(evidence, score, 1,
65
+ "{} of {} commit subjects look like fixes".format(
66
+ bug_commits, len(ctx.get("commits", []))))
67
+ elif feat_commits > bug_commits:
68
+ score = _add(evidence, score, -1,
69
+ "{} of {} commit subjects look like features".format(
70
+ feat_commits, len(ctx.get("commits", []))))
71
+
72
+ if score > 0:
73
+ verdict = "bugfix"
74
+ elif score < 0:
75
+ verdict = "feature"
76
+ else:
77
+ verdict = "unknown"
78
+
79
+ # Confidence scales with the margin; capped at a readable 0.95.
80
+ confidence = min(0.95, 0.5 + 0.1 * abs(score)) if verdict != "unknown" else 0.0
81
+
82
+ return {
83
+ "verdict": verdict,
84
+ "confidence": round(confidence, 2),
85
+ "score": score,
86
+ "evidence": evidence,
87
+ }
culprit/cli.py ADDED
@@ -0,0 +1,152 @@
1
+ """culprit CLI — orchestrate the engine and emit a report.
2
+
3
+ rca # analyze the current branch (local git or its PR)
4
+ rca --pr 16786 # analyze a specific GitHub PR
5
+ rca --repo /path --base main
6
+ rca --mode api --fast # use the Claude API reasoning layer (standalone)
7
+ rca --json # print the structured result only
8
+
9
+ In Claude Code the default --mode harness emits the skeleton and the harness
10
+ writes the narrative. --mode api calls Claude directly for terminal/CI use.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import datetime
16
+ import json
17
+ import os
18
+ import sys
19
+ from typing import Any, Dict, Optional
20
+
21
+ from . import blast_radius, classify, config, evolution, pr_context, reasoning, report, suspect
22
+
23
+
24
def analyze(repo: str, pr: Optional[int], base: str, head: Optional[str],
            force: Optional[str] = None) -> Dict[str, Any]:
    """Run the deterministic pipeline and return the structured result.

    Resolves the PR/branch context, classifies it (``force`` overrides the
    verdict), then runs either the feature path (blast radius) or the
    bugfix path (suspects, line-evolution timeline, test gap) and hands
    everything to ``report.build``.
    """
    ctx = pr_context.resolve(repo, pr=pr, base=base, head=head)
    cls = classify.classify(ctx)
    if force:
        # Show the override in the reported classification, not only in the
        # choice of analysis path.
        forced_note = "forced to '{}' via --force".format(force)
        cls = dict(
            cls,
            verdict=force,
            evidence=[forced_note] + list(cls.get("evidence", [])),
        )
    verdict = cls["verdict"]

    if verdict == "feature":
        return report.build(ctx, cls, None, blast_radius.analyze(ctx, repo))

    # "bugfix" or "unknown": root-cause analysis is the more actionable default.
    bugfix = suspect.find_suspects(ctx, repo)
    # Line-evolution timeline (origin → … → suspect → fix).
    bugfix["timeline"] = evolution.build_timeline(ctx, repo, bugfix.get("suspects", []))
    # Whether the touched files had any tests (why the bug slipped through).
    bugfix["test_gap"] = blast_radius.test_gap(ctx.get("changed_files", []), repo)
    return report.build(ctx, cls, bugfix, None)
48
+
49
+
50
+ def _save(result: Dict[str, Any], narrative: str) -> str:
51
+ run = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
52
+ out_dir = os.path.join(os.path.expanduser("~/culprit/output"), run)
53
+ os.makedirs(out_dir, exist_ok=True)
54
+ with open(os.path.join(out_dir, "result.json"), "w") as fh:
55
+ json.dump(result, fh, indent=2, default=str)
56
+ with open(os.path.join(out_dir, "report.md"), "w") as fh:
57
+ fh.write(narrative)
58
+ return out_dir
59
+
60
+
61
def _serve_cmd(argv: list) -> int:
    """Parse the ``rca serve`` options and start the local web UI.

    Returns whatever ``serve.run`` returns.
    """
    from . import serve

    parser = argparse.ArgumentParser(
        prog="rca serve",
        description="Interactive local web UI with a base-branch picker.",
    )
    option_table = (
        ("--repo", {"default": ".", "help": "repo path (default: cwd)"}),
        ("--host", {"default": "127.0.0.1", "help": "bind host (default: 127.0.0.1)"}),
        ("--port", {"type": int, "default": 8722, "help": "port (default: 8722)"}),
        ("--no-open", {"action": "store_true", "help": "don't open a browser"}),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    opts = parser.parse_args(argv)
    return serve.run(
        repo=opts.repo,
        host=opts.host,
        port=opts.port,
        open_browser=not opts.no_open,
    )
71
+
72
+
73
def main(argv: Optional[list] = None) -> int:
    """Entry point for the ``rca`` command line.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. A leading ``serve`` dispatches to the
            ``rca serve`` sub-command.

    Returns:
        The process exit status (0 on the paths visible here, or whatever
        the ``serve`` sub-command returns).
    """
    argv = list(sys.argv[1:] if argv is None else argv)
    if argv and argv[0] == "serve":
        return _serve_cmd(argv[1:])

    p = argparse.ArgumentParser(prog="rca", description="Root-cause analysis for a PR or branch.")
    p.add_argument("pr", nargs="?", type=int, help="PR number (optional)")
    p.add_argument("--pr", dest="pr_flag", type=int, help="PR number")
    p.add_argument("--repo", default=".", help="repo path (default: cwd)")
    p.add_argument("--base", default=None,
                   help="base ref for the diff. Default (local, no PR): the latest commit "
                        "(HEAD~1) — 'the change I just made'. Pass a branch (e.g. develop) "
                        "to analyze a whole branch.")
    p.add_argument("--head", default=None, help="head ref (default: current branch)")
    p.add_argument("--last", action="store_true",
                   help="analyze only the latest commit (HEAD~1), ignoring the configured base")
    p.add_argument("--force", choices=["bugfix", "feature"], help="override classification")
    p.add_argument("--mode", choices=["harness", "api"], default="harness",
                   help="reasoning layer (default: harness)")
    p.add_argument("--fast", action="store_true", help="api mode: use the faster/cheaper model")
    p.add_argument("--json", action="store_true", help="print structured result only")
    p.add_argument("--html", metavar="PATH", help="write a self-contained HTML report to PATH")
    p.add_argument("--open", dest="open_", action="store_true", help="open the HTML report in a browser")
    p.add_argument("--narrative-file", metavar="PATH",
                   help="embed a pre-written markdown narrative in the HTML report")
    p.add_argument("--no-save", action="store_true", help="don't write to ~/culprit/output")
    args = p.parse_args(argv)

    repo = os.path.abspath(os.path.expanduser(args.repo))
    # The --pr flag wins over the positional PR number when both are given.
    pr = args.pr_flag if args.pr_flag is not None else args.pr

    # Base resolution (local mode): --last forces latest commit; else explicit
    # --base; else the repo's configured base (CULPRIT_BASE / .culprit.toml);
    # else None → latest commit.
    if args.last:
        base = None
    elif args.base is not None:
        base = args.base
    else:
        base = config.repo_base(repo)

    result = analyze(repo, pr=pr, base=base, head=args.head, force=args.force)

    if args.json:
        print(json.dumps(result, indent=2, default=str))
        return 0

    # Resolve the "why" narrative for the report: an explicit file wins, else
    # the API adapter generates one; harness mode leaves it empty (the visual
    # timeline stands on its own with no API key).
    narrative_md = ""
    if args.narrative_file:
        with open(os.path.expanduser(args.narrative_file), encoding="utf-8") as fh:
            narrative_md = fh.read()
    elif args.mode == "api":
        narrative_md = reasoning.get_adapter(mode="api", fast=args.fast).explain(result)

    if args.html:
        from . import htmlreport
        out_path = os.path.abspath(os.path.expanduser(args.html))
        with open(out_path, "w", encoding="utf-8") as fh:
            fh.write(htmlreport.render(result, narrative_md))
        sys.stderr.write("Wrote HTML report to {}\n".format(out_path))
        if args.open_:
            import pathlib
            import webbrowser
            # as_uri() builds a well-formed file: URL (percent-encoding,
            # Windows drive letters); plain "file://" + path does not.
            # out_path is absolute, which as_uri() requires.
            webbrowser.open(pathlib.Path(out_path).as_uri())
        return 0

    # Default: markdown to stdout.
    narrative = narrative_md or reasoning.get_adapter(mode=args.mode, fast=args.fast).explain(result)
    print(narrative)

    if not args.no_save:
        out_dir = _save(result, narrative)
        sys.stderr.write("\nSaved to {}\n".format(out_dir))
    return 0
149
+
150
+
151
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
culprit/config.py ADDED
@@ -0,0 +1,43 @@
1
+ """Per-repo configuration so culprit can track a repo's real base branch (and host).
2
+
3
+ Resolution order for the default base (local mode only — a PR always carries
4
+ its own base):
5
+ 1. ``--base`` on the CLI (handled by the caller)
6
+ 2. ``CULPRIT_BASE`` environment variable
7
+ 3. ``base = "..."`` in a ``.culprit.toml`` at the repo root
8
+ 4. None → fall back to the latest commit (HEAD~1)
9
+
10
+ ``host`` (``CULPRIT_HOST`` / ``host = "gitlab"``) overrides host auto-detection
11
+ for self-hosted forges where the URL alone can't tell GitHub from GitLab/Gitea.
12
+
13
+ The ``.culprit.toml`` parse is intentionally tiny (one regex, no TOML dep) so
14
+ the package stays dependency-free on Python 3.9.
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import re
20
+ from typing import Optional
21
+
22
+
23
+ def _get(repo: str, key: str, env: str) -> Optional[str]:
24
+ val = os.environ.get(env)
25
+ if val:
26
+ return val.strip()
27
+ path = os.path.join(repo, ".culprit.toml")
28
+ try:
29
+ with open(path) as fh:
30
+ text = fh.read()
31
+ except (IOError, OSError):
32
+ return None
33
+ m = re.search(r"""^\s*{}\s*=\s*['"]?([^'"\n#]+?)['"]?\s*(?:#.*)?$""".format(re.escape(key)),
34
+ text, re.M)
35
+ return m.group(1).strip() if m else None
36
+
37
+
38
def repo_base(repo: str) -> Optional[str]:
    """Configured default base ref for ``repo`` (``CULPRIT_BASE`` or the ``base`` key), else ``None``."""
    configured = _get(repo, "base", "CULPRIT_BASE")
    return configured
40
+
41
+
42
def repo_host(repo: str) -> Optional[str]:
    """Configured host override for ``repo`` (``CULPRIT_HOST`` or the ``host`` key), else ``None``."""
    configured = _get(repo, "host", "CULPRIT_HOST")
    return configured
culprit/evolution.py ADDED
@@ -0,0 +1,171 @@
1
+ """Line-evolution timeline: how the buggy lines became a bug, commit by commit.
2
+
3
+ For each line range the fix touched, ``git log -L<start>,<end>:<file>`` over the
4
+ base history gives every commit that ever modified those exact lines, oldest
5
+ first. We tag the earliest as ``origin``, the prime-suspect commit as
6
+ ``suspect``, the rest as ``modified``, and append a synthetic ``fix`` step from
7
+ the fix diff. That ordered list is what the HTML report visualizes as a vertical
8
+ timeline (origin → … → the commit that broke it → the fix).
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from typing import Any, Dict, List, Optional, Tuple
13
+
14
+ from . import _proc
15
+ from .suspect import _parse_hunks, _iso, _pr_for_commit
16
+
17
+ # Field/record delimiters that won't appear in commit metadata.
18
+ _SOH, _US, _STX = "\x01", "\x1f", "\x02"
19
+ _FMT = _SOH + "%H" + _US + "%an" + _US + "%aI" + _US + "%s" + _STX
20
+
21
+ _MAX_PR_LOOKUPS = 12 # _pr_for_commit is costly; bound total lookups per run
22
+
23
+
24
+ def _hunk_text(body: str) -> str:
25
+ """Keep the unified-diff from the first @@ hunk header onward (drop the
26
+ `diff --git` / `index` / `---` / `+++` preamble)."""
27
+ lines = body.splitlines()
28
+ out: List[str] = []
29
+ started = False
30
+ for ln in lines:
31
+ if not started and ln.startswith("@@"):
32
+ started = True
33
+ if started:
34
+ out.append(ln)
35
+ return "\n".join(out).strip("\n")
36
+
37
+
38
+ def _file_block(diff: str, path: str) -> str:
39
+ """Extract the fix's diff block for `path` from the full unified diff."""
40
+ lines = (diff or "").splitlines()
41
+ block: List[str] = []
42
+ capturing = False
43
+ for ln in lines:
44
+ if ln.startswith("diff --git "):
45
+ if capturing:
46
+ break
47
+ capturing = ("a/" + path in ln) or ("b/" + path in ln) or (path in ln)
48
+ if capturing:
49
+ block = []
50
+ continue
51
+ if capturing:
52
+ block.append(ln)
53
+ return _hunk_text("\n".join(block))
54
+
55
+
56
def _log_L(repo: str, start: int, end: int, path: str, base: str) -> List[Dict[str, Any]]:
    """Run ``git log --reverse -L<start>,<end>:<path> <base>`` and parse it.

    Returns one dict per commit in the order git printed them, each with
    ``hash``, ``short`` (first 10 characters), ``author``, ``date``,
    ``subject`` and ``diff`` (that commit's hunks). Records that do not
    split into the expected fields are skipped.
    """
    line_spec = "-L{},{}:{}".format(start, end, path)
    git_args = ["log", "--reverse", line_spec, "--format=" + _FMT, str(base)]
    try:
        raw = _proc.git(git_args, repo, check=False)
    except _proc.ProcError:
        return []

    steps: List[Dict[str, Any]] = []
    # Every commit record begins with _SOH; the chunk before the first one
    # is discarded.
    records = raw.split(_SOH)
    for record in records[1:]:
        header, marker, body = record.partition(_STX)
        if not marker:
            continue
        fields = header.split(_US)
        if len(fields) < 4:
            continue
        sha = fields[0].strip()
        steps.append({
            "hash": sha,
            "short": sha[:10],
            "author": fields[1],
            "date": fields[2],
            "subject": fields[3],
            "diff": _hunk_text(body),
        })
    return steps
85
+
86
+
87
def build_timeline(ctx: Dict[str, Any], repo: str, suspects: List[Dict[str, Any]],
                   max_ranges: int = 10, max_steps: int = 25) -> Dict[str, Any]:
    """Build per-range line-evolution timelines.

    Args:
        ctx: PR/branch context; reads ``base_sha``/``base_ref``, ``diff``,
            ``head_sha``/``head_ref``, ``title``, ``head_date`` and
            ``pr_number``.
        repo: Path of the repository to query.
        suspects: Ranked suspect commits; the first one is the prime suspect.
        max_ranges: Maximum number of (file, range) pairs to trace.
        max_steps: Maximum number of history steps kept per range (the
            earliest step is always kept, the rest are the most recent).

    Returns:
        ``{"ranges": [...], "notes": [...]}`` where each range entry has
        ``file``, ``range``, ``truncated`` and ``steps``; the last step of
        every range is a synthetic ``fix`` step built from the head diff.
    """
    base = ctx.get("base_sha") or ctx.get("base_ref")
    head_diff = ctx.get("diff") or ""
    notes: List[str] = []
    if not base:
        return {"ranges": [], "notes": ["no base revision; timeline unavailable"]}

    prime = suspects[0]["hash"] if suspects else None

    parsed = _parse_hunks(head_diff)
    pairs: List[Tuple[str, int, int]] = []
    for f in parsed:
        path = f["old_path"]
        # Prefer the removed line ranges; fall back to context ranges when
        # the file has none.
        for (start, end) in (f["removed_ranges"] or f["context_ranges"]):
            pairs.append((path, start, end))
    if len(pairs) > max_ranges:
        notes.append("{} buggy ranges; showing the first {}".format(len(pairs), max_ranges))
        pairs = pairs[:max_ranges]

    pr_cache: Dict[str, Optional[int]] = {}
    pr_budget = _MAX_PR_LOOKUPS
    head = ctx.get("head_sha") or ctx.get("head_ref") or "HEAD"

    def pr_for(sha: str) -> Optional[int]:
        """PR number for ``sha``: cached, and bounded by the lookup budget."""
        nonlocal pr_budget
        if sha in pr_cache:
            return pr_cache[sha]
        if pr_budget <= 0:
            return None  # budget exhausted; deliberately not cached
        pr_budget -= 1
        pr_cache[sha] = _pr_for_commit(repo, sha, str(head))
        return pr_cache[sha]

    ranges_out: List[Dict[str, Any]] = []
    for (path, start, end) in pairs:
        steps = _log_L(repo, start, end, path, str(base))
        if not steps:
            continue
        truncated = len(steps) > max_steps
        if truncated:
            # Keep the earliest step plus the most recent (keep - 1) steps.
            # The tail is sliced from an explicit index: a negative-zero
            # slice (steps[-0:]) would return the whole list when
            # max_steps <= 1 and nothing would be truncated.
            keep = max(1, max_steps)
            steps = steps[:1] + steps[len(steps) - (keep - 1):]

        # Only the PRIME suspect is the red "broke" node — one clear culprit.
        # The earliest commit is the origin; everything else is a modification.
        # (Other ranked suspects still appear in the suspect-set section.)
        has_prime = any(st["hash"] == prime for st in steps)
        for i, st in enumerate(steps):
            if st["hash"] == prime:
                st["role"] = "suspect"
            elif i == 0:
                st["role"] = "origin"
            else:
                st["role"] = "modified"
            # PR attribution: only spend the budget on the interesting steps.
            st["pr_number"] = pr_for(st["hash"]) if st["role"] in ("origin", "suspect") else None
        # If the prime suspect didn't touch this range, mark the latest pre-fix
        # commit as the suspect so every range still shows where it last changed.
        if not has_prime and len(steps) > 1:
            steps[-1]["role"] = "suspect"
            steps[-1]["pr_number"] = pr_for(steps[-1]["hash"])

        # Synthetic fix step from the head diff for this file.
        fix_diff = _file_block(head_diff, path)
        steps.append({
            "hash": ctx.get("head_sha") or "",
            "short": (ctx.get("head_sha") or "")[:10] or (ctx.get("head_ref") or "HEAD"),
            "author": None,
            "subject": ctx.get("title") or "THE FIX",
            "date": ctx.get("head_date"),
            "pr_number": ctx.get("pr_number"),
            "role": "fix",
            "diff": fix_diff,
        })

        ranges_out.append({
            "file": path,
            "range": [start, end],
            "truncated": truncated,
            "steps": steps,
        })

    return {"ranges": ranges_out, "notes": notes}
culprit/htmlreport.py ADDED
@@ -0,0 +1,37 @@
1
+ """Render a self-contained HTML RCA report from a structured result.
2
+
3
+ One file, no external CDN, no build step: the template ships as package data,
4
+ the result JSON and an optional narrative are injected as text nodes. Open the
5
+ output in any browser — works offline.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from typing import Any, Dict
11
+
12
+ try: # Python 3.9+: importlib.resources.files
13
+ from importlib.resources import files as _res_files
14
+
15
+ def _template() -> str:
16
+ return _res_files("culprit").joinpath("templates/report.html").read_text(encoding="utf-8")
17
+ except Exception: # pragma: no cover - very old runtimes
18
+ import os
19
+
20
+ def _template() -> str:
21
+ here = os.path.dirname(__file__)
22
+ with open(os.path.join(here, "templates", "report.html"), encoding="utf-8") as fh:
23
+ return fh.read()
24
+
25
+
26
+ def _safe_json(obj: Any) -> str:
27
+ # Embedded in a <script type="application/json"> node read via JSON.parse;
28
+ # the only sequence that can break out of the node is "</".
29
+ return json.dumps(obj, default=str).replace("</", "<\\/")
30
+
31
+
32
def render(result: Dict[str, Any], narrative_md: str = "") -> str:
    """Fill the HTML template with the result JSON and an optional narrative.

    Args:
        result: Structured analysis result; embedded as JSON.
        narrative_md: Markdown narrative; ``""`` or ``None`` embeds nothing.

    Returns:
        The complete HTML document as a string.
    """
    import re  # local import: this module's top-level imports do not include re

    tpl = _template()
    data = _safe_json(result)
    # HTML end tags are case-insensitive, so neutralise "</script" in any
    # letter case; the tag name itself keeps its original case.
    narrative = re.sub(r"</(?=script)", lambda _m: "<\\/", narrative_md or "", flags=re.I)
    # Substitute both placeholders in a single pass over the template.
    # Chained str.replace would re-scan the already-injected JSON, so data
    # that happens to contain the literal narrative placeholder would get
    # the narrative spliced into it.
    fills = {"__CULPRIT_DATA__": data, "__NARRATIVE__": narrative}
    return re.sub("__CULPRIT_DATA__|__NARRATIVE__", lambda m: fills[m.group(0)], tpl)