@codexstar/bug-hunter 3.0.0 → 3.0.5

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/CHANGELOG.md +149 -83
  2. package/README.md +150 -15
  3. package/SKILL.md +94 -27
  4. package/agents/openai.yaml +4 -0
  5. package/bin/bug-hunter +9 -3
  6. package/docs/images/2026-03-12-fix-plan-rollout.png +0 -0
  7. package/docs/images/2026-03-12-hero-bug-hunter-overview.png +0 -0
  8. package/docs/images/2026-03-12-machine-readable-artifacts.png +0 -0
  9. package/docs/images/2026-03-12-pr-review-flow.png +0 -0
  10. package/docs/images/2026-03-12-security-pack.png +0 -0
  11. package/docs/images/adversarial-debate.png +0 -0
  12. package/docs/images/doc-verify-fix-plan.png +0 -0
  13. package/docs/images/hero.png +0 -0
  14. package/docs/images/pipeline-overview.png +0 -0
  15. package/docs/images/security-finding-card.png +0 -0
  16. package/docs/plans/2026-03-11-structured-output-migration-plan.md +288 -0
  17. package/docs/plans/2026-03-12-audit-bug-fixes-surgical-plan.md +193 -0
  18. package/docs/plans/2026-03-12-enterprise-security-pack-e2e-plan.md +59 -0
  19. package/docs/plans/2026-03-12-local-security-skills-integration-plan.md +39 -0
  20. package/docs/plans/2026-03-12-pr-review-strategic-fix-flow.md +78 -0
  21. package/evals/evals.json +366 -102
  22. package/modes/extended.md +2 -2
  23. package/modes/fix-loop.md +30 -30
  24. package/modes/fix-pipeline.md +32 -6
  25. package/modes/large-codebase.md +14 -15
  26. package/modes/local-sequential.md +44 -20
  27. package/modes/loop.md +56 -56
  28. package/modes/parallel.md +3 -3
  29. package/modes/scaled.md +2 -2
  30. package/modes/single-file.md +3 -3
  31. package/modes/small.md +11 -11
  32. package/package.json +10 -1
  33. package/prompts/fixer.md +37 -23
  34. package/prompts/hunter.md +39 -20
  35. package/prompts/referee.md +34 -20
  36. package/prompts/skeptic.md +25 -22
  37. package/schemas/coverage.schema.json +67 -0
  38. package/schemas/examples/findings.invalid.json +13 -0
  39. package/schemas/examples/findings.valid.json +17 -0
  40. package/schemas/findings.schema.json +76 -0
  41. package/schemas/fix-plan.schema.json +94 -0
  42. package/schemas/fix-report.schema.json +105 -0
  43. package/schemas/fix-strategy.schema.json +99 -0
  44. package/schemas/recon.schema.json +31 -0
  45. package/schemas/referee.schema.json +46 -0
  46. package/schemas/shared.schema.json +51 -0
  47. package/schemas/skeptic.schema.json +21 -0
  48. package/scripts/bug-hunter-state.cjs +35 -12
  49. package/scripts/code-index.cjs +11 -4
  50. package/scripts/fix-lock.cjs +95 -25
  51. package/scripts/payload-guard.cjs +24 -10
  52. package/scripts/pr-scope.cjs +181 -0
  53. package/scripts/render-report.cjs +346 -0
  54. package/scripts/run-bug-hunter.cjs +667 -32
  55. package/scripts/schema-runtime.cjs +273 -0
  56. package/scripts/schema-validate.cjs +40 -0
  57. package/scripts/tests/bug-hunter-state.test.cjs +68 -3
  58. package/scripts/tests/code-index.test.cjs +15 -0
  59. package/scripts/tests/fix-lock.test.cjs +60 -2
  60. package/scripts/tests/fixtures/flaky-worker.cjs +6 -1
  61. package/scripts/tests/fixtures/low-confidence-worker.cjs +8 -2
  62. package/scripts/tests/fixtures/success-worker.cjs +6 -1
  63. package/scripts/tests/payload-guard.test.cjs +154 -2
  64. package/scripts/tests/pr-scope.test.cjs +212 -0
  65. package/scripts/tests/render-report.test.cjs +180 -0
  66. package/scripts/tests/run-bug-hunter.test.cjs +686 -2
  67. package/scripts/tests/security-skills-integration.test.cjs +29 -0
  68. package/scripts/tests/skills-packaging.test.cjs +30 -0
  69. package/scripts/tests/worktree-harvest.test.cjs +66 -0
  70. package/scripts/worktree-harvest.cjs +62 -9
  71. package/skills/README.md +19 -0
  72. package/skills/commit-security-scan/SKILL.md +63 -0
  73. package/skills/security-review/SKILL.md +57 -0
  74. package/skills/threat-model-generation/SKILL.md +47 -0
  75. package/skills/vulnerability-validation/SKILL.md +59 -0
  76. package/templates/subagent-wrapper.md +12 -3
  77. package/modes/_dispatch.md +0 -121
package/evals/evals.json CHANGED
@@ -3,8 +3,8 @@
3
3
  "evals": [
4
4
  {
5
5
  "id": 1,
6
- "prompt": "/bug-hunter test-fixture/",
7
- "expected_output": "Full pipeline execution on the included test fixture (Express app with 6 planted bugs). Should run Recon -> Hunter -> Skeptic -> Referee and produce a final report confirming at least 5 of 6 planted bugs with severity ratings, file paths, and suggested fixes.",
6
+ "prompt": "/bug-hunter --scan-only test-fixture/",
7
+ "expected_output": "Scan-only self-test on the included Express fixture. Should run Recon -> Hunter -> Skeptic -> Referee, confirm most planted bugs, and write canonical JSON artifacts plus a rendered report.",
8
8
  "files": [
9
9
  "test-fixture/server.js",
10
10
  "test-fixture/auth.js",
@@ -13,347 +13,611 @@
13
13
  ],
14
14
  "assertions": [
15
15
  {
16
- "text": "Pipeline runs all phases: Recon, Hunter, Skeptic, Referee",
16
+ "text": "Pipeline runs Recon, Hunter, Skeptic, and Referee",
17
17
  "type": "content_check"
18
18
  },
19
19
  {
20
- "text": "At least 5 of 6 planted bugs are confirmed in the final report",
20
+ "text": "Writes .bug-hunter/findings.json, .bug-hunter/referee.json, and .bug-hunter/report.md",
21
21
  "type": "content_check"
22
22
  },
23
23
  {
24
- "text": "Each confirmed bug includes file path, line numbers, severity, and suggested fix",
24
+ "text": "Confirms at least 5 of the 6 planted bugs in the fixture",
25
25
  "type": "content_check"
26
26
  },
27
27
  {
28
- "text": "False positives are challenged and filtered by the Skeptic/Referee pipeline",
28
+ "text": "Rendered report includes mode, files scanned, and coverage metadata",
29
+ "type": "content_check"
30
+ }
31
+ ]
32
+ },
33
+ {
34
+ "id": 2,
35
+ "prompt": "/bug-hunter src/api/auth.ts",
36
+ "expected_output": "Single-file scan should skip Recon, run Hunter -> Skeptic -> Referee, and keep the output scoped to the target file while still writing canonical JSON artifacts.",
37
+ "files": [],
38
+ "assertions": [
39
+ {
40
+ "text": "Selects single-file mode when one source file is targeted",
29
41
  "type": "content_check"
30
42
  },
31
43
  {
32
- "text": "Final report includes scan metadata (mode, files scanned, coverage)",
44
+ "text": "Skips Recon for single-file mode",
33
45
  "type": "content_check"
34
46
  },
35
47
  {
36
- "text": "Fix pipeline is triggered by default when confirmed bugs exist; only --scan-only disables fixes",
48
+ "text": "Writes .bug-hunter/findings.json and .bug-hunter/referee.json for the single-file run",
49
+ "type": "content_check"
50
+ },
51
+ {
52
+ "text": "Referee returns REAL_BUG, NOT_A_BUG, or MANUAL_REVIEW verdicts for the findings",
37
53
  "type": "content_check"
38
54
  }
39
55
  ]
40
56
  },
41
57
  {
42
- "id": 2,
43
- "prompt": "/bug-hunter src/api/auth.ts",
44
- "expected_output": "Single-file mode scan of an auth file. Should skip Recon (not needed for single file), run one Hunter, one Skeptic, and one Referee. Output should focus on security and logic bugs in the auth file specifically.",
58
+ "id": 3,
59
+ "prompt": "/bug-hunter -b feature-auth --base develop",
60
+ "expected_output": "Branch diff mode should diff the branches, filter non-source files, report the resulting scan set, and choose the execution mode from the surviving source files.",
45
61
  "files": [],
46
62
  "assertions": [
47
63
  {
48
- "text": "Selects single-file mode (1 file detected)",
64
+ "text": "Runs git diff --name-only develop...feature-auth to resolve changed files",
65
+ "type": "content_check"
66
+ },
67
+ {
68
+ "text": "Filters docs, configs, assets, lockfiles, and other non-source files before scanning",
49
69
  "type": "content_check"
50
70
  },
51
71
  {
52
- "text": "Skips Recon agent (not needed for single-file mode)",
72
+ "text": "Reports the number of scannable source files after filtering",
53
73
  "type": "content_check"
54
74
  },
55
75
  {
56
- "text": "Hunter scans the target file and reports findings with BUG-ID format",
76
+ "text": "Chooses small, parallel, extended, scaled, or large-codebase mode from the filtered file count",
77
+ "type": "content_check"
78
+ }
79
+ ]
80
+ },
81
+ {
82
+ "id": 4,
83
+ "prompt": "/bug-hunter --staged",
84
+ "expected_output": "Staged mode should scan full contents of staged source files after resolving them through git diff --cached --name-only and filtering non-source files.",
85
+ "files": [],
86
+ "assertions": [
87
+ {
88
+ "text": "Runs git diff --cached --name-only to collect staged files",
57
89
  "type": "content_check"
58
90
  },
59
91
  {
60
- "text": "Skeptic challenges the findings with code-based counter-arguments",
92
+ "text": "Filters non-source files from the staged list before scanning",
61
93
  "type": "content_check"
62
94
  },
63
95
  {
64
- "text": "Referee produces a final verdict (REAL BUG or NOT A BUG) for each finding",
96
+ "text": "Scans full file contents of staged source files rather than scanning only the patch",
65
97
  "type": "content_check"
66
98
  }
67
99
  ]
68
100
  },
69
101
  {
70
- "id": 3,
71
- "prompt": "/bug-hunter -b feature-auth --base develop",
72
- "expected_output": "Branch diff mode. Should run git diff to find changed files between feature-auth and develop branches, filter out non-source files, then run the full pipeline on the changed source files.",
102
+ "id": 5,
103
+ "prompt": "/bug-hunter --fix src/",
104
+ "expected_output": "Default fix mode should run Phase 1, then acquire the fix lock, capture verification baselines, apply eligible fixes, write a machine-readable fix report, and release the lock.",
73
105
  "files": [],
74
106
  "assertions": [
75
107
  {
76
- "text": "Runs git diff --name-only to extract changed files between branches",
108
+ "text": "Creates a git safety branch before applying fixes when git safety is available",
77
109
  "type": "content_check"
78
110
  },
79
111
  {
80
- "text": "Filters out non-source files (configs, docs, assets, lockfiles)",
112
+ "text": "Acquires and releases .bug-hunter/fix.lock around the fix phase",
81
113
  "type": "content_check"
82
114
  },
83
115
  {
84
- "text": "Reports the number of source files to scan after filtering",
116
+ "text": "Captures verification baseline before applying fixes",
85
117
  "type": "content_check"
86
118
  },
87
119
  {
88
- "text": "Selects appropriate mode based on file count (small, parallel, extended, etc.)",
120
+ "text": "Writes .bug-hunter/fix-report.json as the canonical fix artifact",
121
+ "type": "content_check"
122
+ },
123
+ {
124
+ "text": "Auto-fixes only bugs that pass the confidence eligibility threshold",
89
125
  "type": "content_check"
90
126
  }
91
127
  ]
92
128
  },
93
129
  {
94
- "id": 4,
95
- "prompt": "/bug-hunter --staged",
96
- "expected_output": "Staged file mode for pre-commit checking. Should run git diff --cached --name-only to get staged files, filter non-source files, then scan the staged source files.",
130
+ "id": 6,
131
+ "prompt": "/bug-hunter src/",
132
+ "expected_output": "Loop mode is the default. A normal directory scan should create loop state, iterate until queued files are covered, and track canonical coverage in JSON with a rendered Markdown companion.",
97
133
  "files": [],
98
134
  "assertions": [
99
135
  {
100
- "text": "Runs git diff --cached --name-only to get staged files",
136
+ "text": "Treats loop mode as the default without requiring an explicit --loop flag",
101
137
  "type": "content_check"
102
138
  },
103
139
  {
104
- "text": "Filters out non-source files from the staged list",
140
+ "text": "Creates or updates .bug-hunter/coverage.json as canonical loop state and renders .bug-hunter/coverage.md from it",
105
141
  "type": "content_check"
106
142
  },
107
143
  {
108
- "text": "Scans full file contents of staged files (not just diffs)",
144
+ "text": "Tracks per-file coverage state in coverage.json across iterations",
145
+ "type": "content_check"
146
+ },
147
+ {
148
+ "text": "Marks completion only when all queued scannable files are done",
109
149
  "type": "content_check"
110
150
  }
111
151
  ]
112
152
  },
113
153
  {
114
- "id": 5,
115
- "prompt": "/bug-hunter --fix src/",
116
- "expected_output": "Full pipeline with auto-fix. After Phase 1 (find & verify), should proceed to Phase 2: create a git branch, acquire single-writer lock, detect test infrastructure, capture test baseline, run Fixer clusters sequentially with checkpoint commits, run post-fix tests, auto-revert regressions, and release lock.",
154
+ "id": 7,
155
+ "prompt": "Can you check my Express API for security vulnerabilities? The code is in src/",
156
+ "expected_output": "Natural-language trigger should invoke the bug-hunter skill and run a security-focused audit with trust-boundary mapping and security-oriented Hunter analysis.",
117
157
  "files": [],
118
158
  "assertions": [
119
159
  {
120
- "text": "Creates a git safety branch (bug-hunter-fix-*) before applying fixes",
160
+ "text": "Triggers bug-hunter from natural language security-audit intent without requiring /bug-hunter",
161
+ "type": "content_check"
162
+ },
163
+ {
164
+ "text": "Runs Recon to identify architecture, trust boundaries, and high-risk areas",
121
165
  "type": "content_check"
122
166
  },
123
167
  {
124
- "text": "Detects test command from package.json or project config",
168
+ "text": "Hunter prioritizes injection, auth bypass, input validation, and secrets exposure checks",
125
169
  "type": "content_check"
126
170
  },
127
171
  {
128
- "text": "Captures test baseline before applying fixes",
172
+ "text": "Findings use severity labels and canonical JSON fields rather than free-form Markdown only",
173
+ "type": "content_check"
174
+ }
175
+ ]
176
+ },
177
+ {
178
+ "id": 8,
179
+ "prompt": "/bug-hunter --fix --approve src/auth/",
180
+ "expected_output": "Approval mode should still run the fix pipeline, but Fixer agents should operate in reviewed mode and report that approval is required for edits.",
181
+ "files": [],
182
+ "assertions": [
183
+ {
184
+ "text": "Sets APPROVE_MODE=true from the --approve flag",
129
185
  "type": "content_check"
130
186
  },
131
187
  {
132
- "text": "Fixer agents implement minimal, surgical code changes",
188
+ "text": "Runs Fixers in reviewed/default mode instead of unattended auto-edit mode",
133
189
  "type": "content_check"
134
190
  },
135
191
  {
136
- "text": "Each fix is a separate checkpoint commit with descriptive message",
192
+ "text": "Tells the user it is running in approval mode",
193
+ "type": "content_check"
194
+ }
195
+ ]
196
+ },
197
+ {
198
+ "id": 9,
199
+ "prompt": "/bug-hunter huge-repo/",
200
+ "expected_output": "Large-repo mode should initialize persistent chunk state, process chunks sequentially, and resume from .bug-hunter/state.json when interrupted.",
201
+ "files": [],
202
+ "assertions": [
203
+ {
204
+ "text": "Initializes .bug-hunter/state.json with chunk metadata",
137
205
  "type": "content_check"
138
206
  },
139
207
  {
140
- "text": "Post-fix test run compares against baseline (new failures vs pre-existing)",
208
+ "text": "Processes large scans in sequential chunks and records chunk status",
141
209
  "type": "content_check"
142
210
  },
143
211
  {
144
- "text": "Fixes that cause new test failures are auto-reverted",
212
+ "text": "Resumes from existing .bug-hunter/state.json without rescanning completed chunks",
213
+ "type": "content_check"
214
+ }
215
+ ]
216
+ },
217
+ {
218
+ "id": 10,
219
+ "prompt": "/bug-hunter src/ (second run with unchanged files)",
220
+ "expected_output": "A repeat run should apply the hash cache through bug-hunter-state and skip unchanged files before deep scan work starts.",
221
+ "files": [],
222
+ "assertions": [
223
+ {
224
+ "text": "Runs hash-filter against .bug-hunter/state.json before deep scan work",
145
225
  "type": "content_check"
146
226
  },
147
227
  {
148
- "text": "Acquires and releases .claude/bug-hunter-fix.lock around fix phase",
228
+ "text": "Reports skipped unchanged files from the hash cache",
229
+ "type": "content_check"
230
+ }
231
+ ]
232
+ },
233
+ {
234
+ "id": 11,
235
+ "prompt": "/bug-hunter src/ with malformed subagent payload",
236
+ "expected_output": "Payload validation should fail before any subagent launch when the generated payload does not match the required contract.",
237
+ "files": [],
238
+ "assertions": [
239
+ {
240
+ "text": "Validates subagent payloads with payload-guard.cjs before launch",
149
241
  "type": "content_check"
150
242
  },
151
243
  {
152
- "text": "Auto-fixes only bugs that pass confidence eligibility threshold",
244
+ "text": "Does not launch a subagent when payload validation fails",
153
245
  "type": "content_check"
154
246
  }
155
247
  ]
156
248
  },
157
249
  {
158
- "id": 6,
159
- "prompt": "/bug-hunter --loop src/",
160
- "expected_output": "Loop mode for thorough coverage. Should create ralph-loop state files, iterate the pipeline until all CRITICAL and HIGH files are scanned, track coverage in .claude/bug-hunter-coverage.md, and mark ALL_TASKS_COMPLETE when done.",
250
+ "id": 12,
251
+ "prompt": "/bug-hunter --fix src/ while another fix run is active",
252
+ "expected_output": "The fix phase should stop cleanly when the single-writer lock cannot be acquired.",
161
253
  "files": [],
162
254
  "assertions": [
163
255
  {
164
- "text": "Creates .claude/ralph-loop.local.md state file for loop mode",
256
+ "text": "Attempts to acquire .bug-hunter/fix.lock before any edits",
165
257
  "type": "content_check"
166
258
  },
167
259
  {
168
- "text": "Creates or updates .claude/bug-hunter-coverage.md with machine-parseable format",
260
+ "text": "Stops Phase 2 with a clear lock-held message when the fix lock is already held",
261
+ "type": "content_check"
262
+ }
263
+ ]
264
+ },
265
+ {
266
+ "id": 13,
267
+ "prompt": "/bug-hunter --fix src/ with mixed-confidence bugs",
268
+ "expected_output": "Auto-fix should edit only eligible high-confidence bugs and leave the rest in manual review.",
269
+ "files": [],
270
+ "assertions": [
271
+ {
272
+ "text": "Applies the >=75 confidence threshold for auto-fix eligibility",
169
273
  "type": "content_check"
170
274
  },
171
275
  {
172
- "text": "Tracks file coverage status (DONE, PARTIAL, SKIPPED) per iteration",
276
+ "text": "Keeps low-confidence bugs in manual review instead of auto-editing them",
277
+ "type": "content_check"
278
+ }
279
+ ]
280
+ },
281
+ {
282
+ "id": 14,
283
+ "prompt": "/bug-hunter src/ on a CLI without spawn_agent",
284
+ "expected_output": "The skill should select the best available orchestration backend at runtime and fall back to local-sequential execution when delegation backends are unavailable.",
285
+ "files": [],
286
+ "assertions": [
287
+ {
288
+ "text": "Chooses AGENT_BACKEND during preflight based on available runtime tools",
173
289
  "type": "content_check"
174
290
  },
175
291
  {
176
- "text": "Subsequent iterations only scan uncovered files (no re-scanning DONE files)",
292
+ "text": "Falls back to the next backend when a preferred launch path fails",
177
293
  "type": "content_check"
178
294
  },
179
295
  {
180
- "text": "Marks ALL_TASKS_COMPLETE when all CRITICAL and HIGH files show DONE",
296
+ "text": "Completes the run with local-sequential fallback when no delegation backend is available",
181
297
  "type": "content_check"
182
298
  }
183
299
  ]
184
300
  },
185
301
  {
186
- "id": 7,
187
- "prompt": "Can you check my Express API for security vulnerabilities? The code is in src/",
188
- "expected_output": "Should trigger the bug-hunter skill (even though the user didn't say /bug-hunter) and run a security-focused scan on the src/ directory. The deep Hunter should prioritize security findings, with optional triage hints when enabled.",
302
+ "id": 15,
303
+ "prompt": "/bug-hunter huge-repo/ with flaky chunk worker",
304
+ "expected_output": "The chunk orchestrator should enforce retries with backoff and write attempt details to the canonical run journal.",
189
305
  "files": [],
190
306
  "assertions": [
191
307
  {
192
- "text": "Triggers bug-hunter skill from natural language (security audit request)",
308
+ "text": "Uses run-bug-hunter.cjs for autonomous chunk orchestration",
193
309
  "type": "content_check"
194
310
  },
195
311
  {
196
- "text": "Runs Recon to map architecture and identify trust boundaries",
312
+ "text": "Retries timed out or failed chunks according to max-retries and backoff policy",
197
313
  "type": "content_check"
198
314
  },
199
315
  {
200
- "text": "Deep Hunter focuses on injection, auth bypass, input validation, and secrets exposure in security audit requests",
316
+ "text": "Writes attempt events to .bug-hunter/run.log",
317
+ "type": "content_check"
318
+ }
319
+ ]
320
+ },
321
+ {
322
+ "id": 16,
323
+ "prompt": "/bug-hunter --deps src/",
324
+ "expected_output": "Dependency scan mode should run the dependency audit helper, write dep-findings output, and feed reachable dependency issues into Hunter context.",
325
+ "files": [],
326
+ "assertions": [
327
+ {
328
+ "text": "Runs scripts/dep-scan.cjs when --deps is supplied",
201
329
  "type": "content_check"
202
330
  },
203
331
  {
204
- "text": "Output includes severity ratings (Critical, Medium, Low) for each finding",
332
+ "text": "Writes .bug-hunter/dep-findings.json for dependency scan output",
205
333
  "type": "content_check"
206
334
  },
207
335
  {
208
- "text": "Framework-specific protections are checked (Express middleware, helmet, etc.)",
336
+ "text": "Includes reachable dependency findings in Hunter analysis context",
209
337
  "type": "content_check"
210
338
  }
211
339
  ]
212
340
  },
213
341
  {
214
- "id": 8,
215
- "prompt": "/bug-hunter --fix --approve src/auth/",
216
- "expected_output": "Fix mode with approval. Should find bugs in auth directory, then fix them but prompt the user before each edit (approval mode). Fixer agents run in default mode rather than auto mode.",
342
+ "id": 17,
343
+ "prompt": "/bug-hunter --threat-model src/",
344
+ "expected_output": "Threat-model mode should load or generate a STRIDE threat model and feed it into Recon and Hunter.",
217
345
  "files": [],
218
346
  "assertions": [
219
347
  {
220
- "text": "APPROVE_MODE is set to true from --approve flag",
348
+ "text": "Loads an existing .bug-hunter/threat-model.md or generates one when missing",
221
349
  "type": "content_check"
222
350
  },
223
351
  {
224
- "text": "Fixer agents run in mode: default (user reviews each edit)",
352
+ "text": "Marks THREAT_MODEL_AVAILABLE and uses the threat model in Recon and Hunter context",
353
+ "type": "content_check"
354
+ },
355
+ {
356
+ "text": "Keeps threat-model generation non-blocking relative to the rest of the bug-hunt flow",
357
+ "type": "content_check"
358
+ }
359
+ ]
360
+ },
361
+ {
362
+ "id": 18,
363
+ "prompt": "/bug-hunter --fix --dry-run src/",
364
+ "expected_output": "Dry-run fix mode should build the fix plan and produce machine-readable fix output without editing files, committing, or taking the lock.",
365
+ "files": [],
366
+ "assertions": [
367
+ {
368
+ "text": "Sets DRY_RUN_MODE=true and forces FIX_MODE=true when --dry-run is provided",
225
369
  "type": "content_check"
226
370
  },
227
371
  {
228
- "text": "Reports 'Running in approval mode' to the user",
372
+ "text": "Produces .bug-hunter/fix-report.json with dry_run set to true",
229
373
  "type": "content_check"
230
374
  },
231
375
  {
232
- "text": "Fixes are still committed as individual checkpoint commits",
376
+ "text": "Skips file edits, git commits, and fix lock acquisition in dry-run mode",
233
377
  "type": "content_check"
234
378
  }
235
379
  ]
236
380
  },
237
381
  {
238
- "id": 9,
239
- "prompt": "/bug-hunter huge-repo/",
240
- "expected_output": "Large-repo run should initialize .claude/bug-hunter-state.json, split files into sequential chunks, and resume from state if interrupted.",
382
+ "id": 19,
383
+ "prompt": "/bug-hunter --autonomous src/",
384
+ "expected_output": "Autonomous mode should force fix mode and run canary-first, confidence-gated fixes without waiting for per-edit approval.",
241
385
  "files": [],
242
386
  "assertions": [
243
387
  {
244
- "text": "Initializes bug-hunter-state.json with chunk metadata",
388
+ "text": "Sets AUTONOMOUS_MODE=true and forces FIX_MODE=true when --autonomous is supplied",
245
389
  "type": "content_check"
246
390
  },
247
391
  {
248
- "text": "Processes chunks sequentially and marks each chunk state",
392
+ "text": "Runs canary-first, confidence-gated fix rollout in autonomous mode",
249
393
  "type": "content_check"
250
394
  },
251
395
  {
252
- "text": "Can resume from existing state file without rescanning completed chunks",
396
+ "text": "Does not require approval-mode prompts for unattended autonomous fixes",
253
397
  "type": "content_check"
254
398
  }
255
399
  ]
256
400
  },
257
401
  {
258
- "id": 10,
259
- "prompt": "/bug-hunter src/ (second run with unchanged files)",
260
- "expected_output": "Hash cache should skip unchanged files and focus scan effort on changed files only.",
402
+ "id": 20,
403
+ "prompt": "/bug-hunter --pr current",
404
+ "expected_output": "PR review mode should resolve the current PR scope, save PR metadata, and scan the resolved changed files rather than the whole repository.",
261
405
  "files": [],
262
406
  "assertions": [
263
407
  {
264
- "text": "Runs hash-filter against bug-hunter-state.json before deep scan",
408
+ "text": "Uses scripts/pr-scope.cjs to resolve current PR metadata and changed files",
265
409
  "type": "content_check"
266
410
  },
267
411
  {
268
- "text": "Reports skipped unchanged files from cache",
412
+ "text": "Writes .bug-hunter/pr-scope.json for later reporting",
413
+ "type": "content_check"
414
+ },
415
+ {
416
+ "text": "Scans the resolved changed files as the PR review scope",
269
417
  "type": "content_check"
270
418
  }
271
419
  ]
272
420
  },
273
421
  {
274
- "id": 11,
275
- "prompt": "/bug-hunter src/ with malformed subagent payload",
276
- "expected_output": "Pipeline should fail fast before spawning subagents when payload validation fails.",
422
+ "id": 21,
423
+ "prompt": "/bug-hunter --pr recent --scan-only",
424
+ "expected_output": "Recent-PR review mode should resolve the most recent PR through GitHub metadata, limit analysis to its changed files, and stop after reporting.",
277
425
  "files": [],
278
426
  "assertions": [
279
427
  {
280
- "text": "Validates payload via payload-guard.cjs before each subagent launch",
428
+ "text": "Resolves the most recent PR through pr-scope using GitHub metadata",
429
+ "type": "content_check"
430
+ },
431
+ {
432
+ "text": "Keeps FIX_MODE disabled because scan-only was requested",
281
433
  "type": "content_check"
282
434
  },
283
435
  {
284
- "text": "Does not launch subagent when payload validation fails",
436
+ "text": "Produces the normal findings/referee/report artifacts for the PR-scoped review",
285
437
  "type": "content_check"
286
438
  }
287
439
  ]
288
440
  },
289
441
  {
290
- "id": 12,
291
- "prompt": "/bug-hunter --fix src/ while another fix run is active",
292
- "expected_output": "Fix phase should stop when single-writer lock cannot be acquired.",
442
+ "id": 22,
443
+ "prompt": "/bug-hunter --plan-only src/",
444
+ "expected_output": "Plan-only mode should build a remediation strategy and fix plan but stop before the Fixer edits code.",
293
445
  "files": [],
294
446
  "assertions": [
295
447
  {
296
- "text": "Attempts to acquire .claude/bug-hunter-fix.lock before any edits",
448
+ "text": "Builds .bug-hunter/fix-strategy.json and .bug-hunter/fix-strategy.md before fix execution",
449
+ "type": "content_check"
450
+ },
451
+ {
452
+ "text": "Builds .bug-hunter/fix-plan.json while PLAN_ONLY_MODE is active",
297
453
  "type": "content_check"
298
454
  },
299
455
  {
300
- "text": "Stops Phase 2 with clear lock-held message when lock is already held",
456
+ "text": "Stops before the Fixer edits files when --plan-only is supplied",
301
457
  "type": "content_check"
302
458
  }
303
459
  ]
304
460
  },
305
461
  {
306
- "id": 13,
307
- "prompt": "/bug-hunter --fix src/ with mixed-confidence bugs",
308
- "expected_output": "Auto-fix should run only on high-confidence bugs and leave low-confidence bugs as manual review.",
462
+ "id": 23,
463
+ "prompt": "/bug-hunter --plan src/ then /bug-hunter --preview src/ then /bug-hunter --safe src/ then /bug-hunter --last-pr --review",
464
+ "expected_output": "Shortcut aliases should map cleanly onto their canonical behaviors without changing the underlying execution semantics.",
309
465
  "files": [],
310
466
  "assertions": [
311
467
  {
312
- "text": "Applies confidence threshold gating (>=75%) for auto-fix eligibility",
468
+ "text": "Treats --plan as an alias for --plan-only",
313
469
  "type": "content_check"
314
470
  },
315
471
  {
316
- "text": "Reports low-confidence bugs as manual-review and does not auto-edit them",
472
+ "text": "Treats --preview as an alias for --fix --dry-run",
473
+ "type": "content_check"
474
+ },
475
+ {
476
+ "text": "Treats --safe as an alias for --fix --approve",
477
+ "type": "content_check"
478
+ },
479
+ {
480
+ "text": "Treats --last-pr and --review as aliases for --pr recent and --scan-only",
317
481
  "type": "content_check"
318
482
  }
319
483
  ]
320
484
  },
321
485
  {
322
- "id": 14,
323
- "prompt": "/bug-hunter src/ on a CLI without spawn_agent",
324
- "expected_output": "Pipeline should auto-select the available orchestration backend and continue. If remote orchestration is unavailable, it should fall back to local sequential execution.",
486
+ "id": 24,
487
+ "prompt": "/bug-hunter --fix src/ with a high-confidence architectural-remediation finding",
488
+ "expected_output": "Execution gating should honor fix-strategy classifications so non-autofix findings never enter the executable canary or rollout queue.",
325
489
  "files": [],
326
490
  "assertions": [
327
491
  {
328
- "text": "Selects AGENT_BACKEND in preflight based on available runtime tools",
492
+ "text": "Builds fix-strategy classifications before building the executable fix plan",
329
493
  "type": "content_check"
330
494
  },
331
495
  {
332
- "text": "Falls back to next backend when launch fails",
496
+ "text": "Excludes manual-review, larger-refactor, and architectural-remediation findings from fixPlan canary/rollout",
333
497
  "type": "content_check"
334
498
  },
335
499
  {
336
- "text": "Completes run with local-sequential fallback when no delegation backend is available",
500
+ "text": "Allows only autofixEligible safe-autofix findings into the executable fix queue",
337
501
  "type": "content_check"
338
502
  }
339
503
  ]
340
504
  },
341
505
  {
342
- "id": 15,
343
- "prompt": "/bug-hunter huge-repo/ with flaky chunk worker",
344
- "expected_output": "Orchestrator should enforce per-chunk timeout, retry failed chunk once with backoff, and persist attempt details in run journal.",
506
+ "id": 25,
507
+ "prompt": "/bug-hunter --pr current with gh unavailable and no trustworthy default base branch",
508
+ "expected_output": "Current-PR fallback should fail explicitly when it cannot determine a trustworthy base branch instead of silently assuming main.",
345
509
  "files": [],
346
510
  "assertions": [
347
511
  {
348
- "text": "Uses run-bug-hunter.cjs for autonomous chunk orchestration",
512
+ "text": "Uses the discovered default branch or explicit --base for current-branch git fallback",
513
+ "type": "content_check"
514
+ },
515
+ {
516
+ "text": "Fails explicitly when no trustworthy base branch can be determined for current PR fallback",
517
+ "type": "content_check"
518
+ },
519
+ {
520
+ "text": "Does not silently assume main for current-PR fallback scope resolution",
521
+ "type": "content_check"
522
+ }
523
+ ]
524
+ },
525
+ {
526
+ "id": 26,
527
+ "prompt": "/bug-hunter concurrent query-bugs and expired live fix-lock scenarios",
528
+ "expected_output": "Utility helpers should preserve correctness under failure and concurrency pressure.",
529
+ "files": [],
530
+ "assertions": [
531
+ {
532
+ "text": "query-bugs uses invocation-scoped temp seed files and cleans them up even on failure",
533
+ "type": "content_check"
534
+ },
535
+ {
536
+ "text": "fix-lock does not recover an expired lock when the recorded owner PID is still alive",
537
+ "type": "content_check"
538
+ },
539
+ {
540
+ "text": "Reports a live-owner lock conflict instead of allowing overlapping fixers",
541
+ "type": "content_check"
542
+ }
543
+ ]
544
+ },
545
+ {
546
+ "id": 27,
547
+ "prompt": "/bug-hunter --pr-security",
548
+ "expected_output": "Enterprise PR security review should route through the bundled local commit-security-scan workflow, using PR scope, threat-model context, and dependency-awareness without editing code.",
549
+ "files": [],
550
+ "assertions": [
551
+ {
552
+ "text": "Treats --pr-security as PR-scoped security review with FIX_MODE disabled",
553
+ "type": "content_check"
554
+ },
555
+ {
556
+ "text": "Loads the bundled local skills/commit-security-scan/SKILL.md guidance for PR-focused security review",
557
+ "type": "content_check"
558
+ },
559
+ {
560
+ "text": "Combines PR scope resolution with threat-model and dependency-scan context",
561
+ "type": "content_check"
562
+ }
563
+ ]
564
+ },
565
+ {
566
+ "id": 28,
567
+ "prompt": "/bug-hunter --security-review src/",
568
+ "expected_output": "Enterprise security-review mode should route through the bundled local security-review workflow and combine threat model, code review, dependency findings, and security validation semantics.",
569
+ "files": [],
570
+ "assertions": [
571
+ {
572
+ "text": "Treats --security-review as a bundled enterprise security workflow with FIX_MODE disabled",
573
+ "type": "content_check"
574
+ },
575
+ {
576
+ "text": "Loads the bundled local skills/security-review/SKILL.md guidance during execution",
577
+ "type": "content_check"
578
+ },
579
+ {
580
+ "text": "Runs with threat-model and dependency-scan context enabled",
581
+ "type": "content_check"
582
+ }
583
+ ]
584
+ },
585
+ {
586
+ "id": 29,
587
+ "prompt": "/bug-hunter --threat-model src/ when no threat model exists yet",
588
+ "expected_output": "Threat-model mode should route through the bundled local threat-model-generation skill and produce Bug Hunter-native threat-model artifacts.",
589
+ "files": [],
590
+ "assertions": [
591
+ {
592
+ "text": "Loads the bundled local skills/threat-model-generation/SKILL.md before generating the threat model",
593
+ "type": "content_check"
594
+ },
595
+ {
596
+ "text": "Writes .bug-hunter/threat-model.md and .bug-hunter/security-config.json",
597
+ "type": "content_check"
598
+ },
599
+ {
600
+ "text": "Keeps all threat-model artifacts under .bug-hunter instead of external .factory paths",
601
+ "type": "content_check"
602
+ }
603
+ ]
604
+ },
605
+ {
606
+ "id": 30,
607
+ "prompt": "/bug-hunter --validate-security src/ with confirmed security findings",
608
+ "expected_output": "Security-validation mode should route through the bundled local vulnerability-validation skill and enrich confirmed security findings with exploitability-oriented reasoning.",
609
+ "files": [],
610
+ "assertions": [
611
+ {
612
+ "text": "Loads the bundled local skills/vulnerability-validation/SKILL.md when security validation is requested",
349
613
  "type": "content_check"
350
614
  },
351
615
  {
352
- "text": "Retries timed out/failed chunk according to max-retries and backoff policy",
616
+ "text": "Re-checks reachability, exploitability, PoC quality, and CVSS details for confirmed security findings",
353
617
  "type": "content_check"
354
618
  },
355
619
  {
356
- "text": "Writes attempt events to .claude/bug-hunter-run.log",
620
+ "text": "Uses Bug Hunter-native artifacts rather than a separate external validation pipeline",
357
621
  "type": "content_check"
358
622
  }
359
623
  ]