@codexstar/bug-hunter 3.0.0 → 3.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +149 -83
- package/README.md +150 -15
- package/SKILL.md +94 -27
- package/agents/openai.yaml +4 -0
- package/bin/bug-hunter +9 -3
- package/docs/images/2026-03-12-fix-plan-rollout.png +0 -0
- package/docs/images/2026-03-12-hero-bug-hunter-overview.png +0 -0
- package/docs/images/2026-03-12-machine-readable-artifacts.png +0 -0
- package/docs/images/2026-03-12-pr-review-flow.png +0 -0
- package/docs/images/2026-03-12-security-pack.png +0 -0
- package/docs/images/adversarial-debate.png +0 -0
- package/docs/images/doc-verify-fix-plan.png +0 -0
- package/docs/images/hero.png +0 -0
- package/docs/images/pipeline-overview.png +0 -0
- package/docs/images/security-finding-card.png +0 -0
- package/docs/plans/2026-03-11-structured-output-migration-plan.md +288 -0
- package/docs/plans/2026-03-12-audit-bug-fixes-surgical-plan.md +193 -0
- package/docs/plans/2026-03-12-enterprise-security-pack-e2e-plan.md +59 -0
- package/docs/plans/2026-03-12-local-security-skills-integration-plan.md +39 -0
- package/docs/plans/2026-03-12-pr-review-strategic-fix-flow.md +78 -0
- package/evals/evals.json +366 -102
- package/modes/extended.md +2 -2
- package/modes/fix-loop.md +30 -30
- package/modes/fix-pipeline.md +32 -6
- package/modes/large-codebase.md +14 -15
- package/modes/local-sequential.md +44 -20
- package/modes/loop.md +56 -56
- package/modes/parallel.md +3 -3
- package/modes/scaled.md +2 -2
- package/modes/single-file.md +3 -3
- package/modes/small.md +11 -11
- package/package.json +11 -1
- package/prompts/fixer.md +37 -23
- package/prompts/hunter.md +39 -20
- package/prompts/referee.md +34 -20
- package/prompts/skeptic.md +25 -22
- package/schemas/coverage.schema.json +67 -0
- package/schemas/examples/findings.invalid.json +13 -0
- package/schemas/examples/findings.valid.json +17 -0
- package/schemas/findings.schema.json +76 -0
- package/schemas/fix-plan.schema.json +94 -0
- package/schemas/fix-report.schema.json +105 -0
- package/schemas/fix-strategy.schema.json +99 -0
- package/schemas/recon.schema.json +31 -0
- package/schemas/referee.schema.json +46 -0
- package/schemas/shared.schema.json +51 -0
- package/schemas/skeptic.schema.json +21 -0
- package/scripts/bug-hunter-state.cjs +35 -12
- package/scripts/code-index.cjs +11 -4
- package/scripts/fix-lock.cjs +95 -25
- package/scripts/payload-guard.cjs +24 -10
- package/scripts/pr-scope.cjs +181 -0
- package/scripts/prepublish-guard.cjs +82 -0
- package/scripts/render-report.cjs +346 -0
- package/scripts/run-bug-hunter.cjs +669 -33
- package/scripts/schema-runtime.cjs +273 -0
- package/scripts/schema-validate.cjs +40 -0
- package/scripts/tests/bug-hunter-state.test.cjs +68 -3
- package/scripts/tests/code-index.test.cjs +15 -0
- package/scripts/tests/fix-lock.test.cjs +60 -2
- package/scripts/tests/fixtures/flaky-worker.cjs +6 -1
- package/scripts/tests/fixtures/low-confidence-worker.cjs +8 -2
- package/scripts/tests/fixtures/success-worker.cjs +6 -1
- package/scripts/tests/payload-guard.test.cjs +154 -2
- package/scripts/tests/pr-scope.test.cjs +212 -0
- package/scripts/tests/render-report.test.cjs +180 -0
- package/scripts/tests/run-bug-hunter.test.cjs +686 -2
- package/scripts/tests/security-skills-integration.test.cjs +29 -0
- package/scripts/tests/skills-packaging.test.cjs +30 -0
- package/scripts/tests/worktree-harvest.test.cjs +67 -1
- package/scripts/worktree-harvest.cjs +62 -9
- package/skills/README.md +19 -0
- package/skills/commit-security-scan/SKILL.md +63 -0
- package/skills/security-review/SKILL.md +57 -0
- package/skills/threat-model-generation/SKILL.md +47 -0
- package/skills/vulnerability-validation/SKILL.md +59 -0
- package/templates/subagent-wrapper.md +12 -3
- package/modes/_dispatch.md +0 -121
package/evals/evals.json
CHANGED
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
"evals": [
|
|
4
4
|
{
|
|
5
5
|
"id": 1,
|
|
6
|
-
"prompt": "/bug-hunter test-fixture/",
|
|
7
|
-
"expected_output": "
|
|
6
|
+
"prompt": "/bug-hunter --scan-only test-fixture/",
|
|
7
|
+
"expected_output": "Scan-only self-test on the included Express fixture. Should run Recon -> Hunter -> Skeptic -> Referee, confirm most planted bugs, and write canonical JSON artifacts plus a rendered report.",
|
|
8
8
|
"files": [
|
|
9
9
|
"test-fixture/server.js",
|
|
10
10
|
"test-fixture/auth.js",
|
|
@@ -13,347 +13,611 @@
|
|
|
13
13
|
],
|
|
14
14
|
"assertions": [
|
|
15
15
|
{
|
|
16
|
-
"text": "Pipeline runs
|
|
16
|
+
"text": "Pipeline runs Recon, Hunter, Skeptic, and Referee",
|
|
17
17
|
"type": "content_check"
|
|
18
18
|
},
|
|
19
19
|
{
|
|
20
|
-
"text": "
|
|
20
|
+
"text": "Writes .bug-hunter/findings.json, .bug-hunter/referee.json, and .bug-hunter/report.md",
|
|
21
21
|
"type": "content_check"
|
|
22
22
|
},
|
|
23
23
|
{
|
|
24
|
-
"text": "
|
|
24
|
+
"text": "Confirms at least 5 of the 6 planted bugs in the fixture",
|
|
25
25
|
"type": "content_check"
|
|
26
26
|
},
|
|
27
27
|
{
|
|
28
|
-
"text": "
|
|
28
|
+
"text": "Rendered report includes mode, files scanned, and coverage metadata",
|
|
29
|
+
"type": "content_check"
|
|
30
|
+
}
|
|
31
|
+
]
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"id": 2,
|
|
35
|
+
"prompt": "/bug-hunter src/api/auth.ts",
|
|
36
|
+
"expected_output": "Single-file scan should skip Recon, run Hunter -> Skeptic -> Referee, and keep the output scoped to the target file while still writing canonical JSON artifacts.",
|
|
37
|
+
"files": [],
|
|
38
|
+
"assertions": [
|
|
39
|
+
{
|
|
40
|
+
"text": "Selects single-file mode when one source file is targeted",
|
|
29
41
|
"type": "content_check"
|
|
30
42
|
},
|
|
31
43
|
{
|
|
32
|
-
"text": "
|
|
44
|
+
"text": "Skips Recon for single-file mode",
|
|
33
45
|
"type": "content_check"
|
|
34
46
|
},
|
|
35
47
|
{
|
|
36
|
-
"text": "
|
|
48
|
+
"text": "Writes .bug-hunter/findings.json and .bug-hunter/referee.json for the single-file run",
|
|
49
|
+
"type": "content_check"
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"text": "Referee returns REAL_BUG, NOT_A_BUG, or MANUAL_REVIEW verdicts for the findings",
|
|
37
53
|
"type": "content_check"
|
|
38
54
|
}
|
|
39
55
|
]
|
|
40
56
|
},
|
|
41
57
|
{
|
|
42
|
-
"id":
|
|
43
|
-
"prompt": "/bug-hunter
|
|
44
|
-
"expected_output": "
|
|
58
|
+
"id": 3,
|
|
59
|
+
"prompt": "/bug-hunter -b feature-auth --base develop",
|
|
60
|
+
"expected_output": "Branch diff mode should diff the branches, filter non-source files, report the resulting scan set, and choose the execution mode from the surviving source files.",
|
|
45
61
|
"files": [],
|
|
46
62
|
"assertions": [
|
|
47
63
|
{
|
|
48
|
-
"text": "
|
|
64
|
+
"text": "Runs git diff --name-only develop...feature-auth to resolve changed files",
|
|
65
|
+
"type": "content_check"
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
"text": "Filters docs, configs, assets, lockfiles, and other non-source files before scanning",
|
|
49
69
|
"type": "content_check"
|
|
50
70
|
},
|
|
51
71
|
{
|
|
52
|
-
"text": "
|
|
72
|
+
"text": "Reports the number of scannable source files after filtering",
|
|
53
73
|
"type": "content_check"
|
|
54
74
|
},
|
|
55
75
|
{
|
|
56
|
-
"text": "
|
|
76
|
+
"text": "Chooses small, parallel, extended, scaled, or large-codebase mode from the filtered file count",
|
|
77
|
+
"type": "content_check"
|
|
78
|
+
}
|
|
79
|
+
]
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"id": 4,
|
|
83
|
+
"prompt": "/bug-hunter --staged",
|
|
84
|
+
"expected_output": "Staged mode should scan full contents of staged source files after resolving them through git diff --cached --name-only and filtering non-source files.",
|
|
85
|
+
"files": [],
|
|
86
|
+
"assertions": [
|
|
87
|
+
{
|
|
88
|
+
"text": "Runs git diff --cached --name-only to collect staged files",
|
|
57
89
|
"type": "content_check"
|
|
58
90
|
},
|
|
59
91
|
{
|
|
60
|
-
"text": "
|
|
92
|
+
"text": "Filters non-source files from the staged list before scanning",
|
|
61
93
|
"type": "content_check"
|
|
62
94
|
},
|
|
63
95
|
{
|
|
64
|
-
"text": "
|
|
96
|
+
"text": "Scans full file contents of staged source files rather than scanning only the patch",
|
|
65
97
|
"type": "content_check"
|
|
66
98
|
}
|
|
67
99
|
]
|
|
68
100
|
},
|
|
69
101
|
{
|
|
70
|
-
"id":
|
|
71
|
-
"prompt": "/bug-hunter
|
|
72
|
-
"expected_output": "
|
|
102
|
+
"id": 5,
|
|
103
|
+
"prompt": "/bug-hunter --fix src/",
|
|
104
|
+
"expected_output": "Default fix mode should run Phase 1, then acquire the fix lock, capture verification baselines, apply eligible fixes, write a machine-readable fix report, and release the lock.",
|
|
73
105
|
"files": [],
|
|
74
106
|
"assertions": [
|
|
75
107
|
{
|
|
76
|
-
"text": "
|
|
108
|
+
"text": "Creates a git safety branch before applying fixes when git safety is available",
|
|
77
109
|
"type": "content_check"
|
|
78
110
|
},
|
|
79
111
|
{
|
|
80
|
-
"text": "
|
|
112
|
+
"text": "Acquires and releases .bug-hunter/fix.lock around the fix phase",
|
|
81
113
|
"type": "content_check"
|
|
82
114
|
},
|
|
83
115
|
{
|
|
84
|
-
"text": "
|
|
116
|
+
"text": "Captures verification baseline before applying fixes",
|
|
85
117
|
"type": "content_check"
|
|
86
118
|
},
|
|
87
119
|
{
|
|
88
|
-
"text": "
|
|
120
|
+
"text": "Writes .bug-hunter/fix-report.json as the canonical fix artifact",
|
|
121
|
+
"type": "content_check"
|
|
122
|
+
},
|
|
123
|
+
{
|
|
124
|
+
"text": "Auto-fixes only bugs that pass the confidence eligibility threshold",
|
|
89
125
|
"type": "content_check"
|
|
90
126
|
}
|
|
91
127
|
]
|
|
92
128
|
},
|
|
93
129
|
{
|
|
94
|
-
"id":
|
|
95
|
-
"prompt": "/bug-hunter
|
|
96
|
-
"expected_output": "
|
|
130
|
+
"id": 6,
|
|
131
|
+
"prompt": "/bug-hunter src/",
|
|
132
|
+
"expected_output": "Loop mode is the default. A normal directory scan should create loop state, iterate until queued files are covered, and track canonical coverage in JSON with a rendered Markdown companion.",
|
|
97
133
|
"files": [],
|
|
98
134
|
"assertions": [
|
|
99
135
|
{
|
|
100
|
-
"text": "
|
|
136
|
+
"text": "Treats loop mode as the default without requiring an explicit --loop flag",
|
|
101
137
|
"type": "content_check"
|
|
102
138
|
},
|
|
103
139
|
{
|
|
104
|
-
"text": "
|
|
140
|
+
"text": "Creates or updates .bug-hunter/coverage.json as canonical loop state and renders .bug-hunter/coverage.md from it",
|
|
105
141
|
"type": "content_check"
|
|
106
142
|
},
|
|
107
143
|
{
|
|
108
|
-
"text": "
|
|
144
|
+
"text": "Tracks per-file coverage state in coverage.json across iterations",
|
|
145
|
+
"type": "content_check"
|
|
146
|
+
},
|
|
147
|
+
{
|
|
148
|
+
"text": "Marks completion only when all queued scannable files are done",
|
|
109
149
|
"type": "content_check"
|
|
110
150
|
}
|
|
111
151
|
]
|
|
112
152
|
},
|
|
113
153
|
{
|
|
114
|
-
"id":
|
|
115
|
-
"prompt": "
|
|
116
|
-
"expected_output": "
|
|
154
|
+
"id": 7,
|
|
155
|
+
"prompt": "Can you check my Express API for security vulnerabilities? The code is in src/",
|
|
156
|
+
"expected_output": "Natural-language trigger should invoke the bug-hunter skill and run a security-focused audit with trust-boundary mapping and security-oriented Hunter analysis.",
|
|
117
157
|
"files": [],
|
|
118
158
|
"assertions": [
|
|
119
159
|
{
|
|
120
|
-
"text": "
|
|
160
|
+
"text": "Triggers bug-hunter from natural language security-audit intent without requiring /bug-hunter",
|
|
161
|
+
"type": "content_check"
|
|
162
|
+
},
|
|
163
|
+
{
|
|
164
|
+
"text": "Runs Recon to identify architecture, trust boundaries, and high-risk areas",
|
|
121
165
|
"type": "content_check"
|
|
122
166
|
},
|
|
123
167
|
{
|
|
124
|
-
"text": "
|
|
168
|
+
"text": "Hunter prioritizes injection, auth bypass, input validation, and secrets exposure checks",
|
|
125
169
|
"type": "content_check"
|
|
126
170
|
},
|
|
127
171
|
{
|
|
128
|
-
"text": "
|
|
172
|
+
"text": "Findings use severity labels and canonical JSON fields rather than free-form Markdown only",
|
|
173
|
+
"type": "content_check"
|
|
174
|
+
}
|
|
175
|
+
]
|
|
176
|
+
},
|
|
177
|
+
{
|
|
178
|
+
"id": 8,
|
|
179
|
+
"prompt": "/bug-hunter --fix --approve src/auth/",
|
|
180
|
+
"expected_output": "Approval mode should still run the fix pipeline, but Fixer agents should operate in reviewed mode and report that approval is required for edits.",
|
|
181
|
+
"files": [],
|
|
182
|
+
"assertions": [
|
|
183
|
+
{
|
|
184
|
+
"text": "Sets APPROVE_MODE=true from the --approve flag",
|
|
129
185
|
"type": "content_check"
|
|
130
186
|
},
|
|
131
187
|
{
|
|
132
|
-
"text": "
|
|
188
|
+
"text": "Runs Fixers in reviewed/default mode instead of unattended auto-edit mode",
|
|
133
189
|
"type": "content_check"
|
|
134
190
|
},
|
|
135
191
|
{
|
|
136
|
-
"text": "
|
|
192
|
+
"text": "Tells the user it is running in approval mode",
|
|
193
|
+
"type": "content_check"
|
|
194
|
+
}
|
|
195
|
+
]
|
|
196
|
+
},
|
|
197
|
+
{
|
|
198
|
+
"id": 9,
|
|
199
|
+
"prompt": "/bug-hunter huge-repo/",
|
|
200
|
+
"expected_output": "Large-repo mode should initialize persistent chunk state, process chunks sequentially, and resume from .bug-hunter/state.json when interrupted.",
|
|
201
|
+
"files": [],
|
|
202
|
+
"assertions": [
|
|
203
|
+
{
|
|
204
|
+
"text": "Initializes .bug-hunter/state.json with chunk metadata",
|
|
137
205
|
"type": "content_check"
|
|
138
206
|
},
|
|
139
207
|
{
|
|
140
|
-
"text": "
|
|
208
|
+
"text": "Processes large scans in sequential chunks and records chunk status",
|
|
141
209
|
"type": "content_check"
|
|
142
210
|
},
|
|
143
211
|
{
|
|
144
|
-
"text": "
|
|
212
|
+
"text": "Resumes from existing .bug-hunter/state.json without rescanning completed chunks",
|
|
213
|
+
"type": "content_check"
|
|
214
|
+
}
|
|
215
|
+
]
|
|
216
|
+
},
|
|
217
|
+
{
|
|
218
|
+
"id": 10,
|
|
219
|
+
"prompt": "/bug-hunter src/ (second run with unchanged files)",
|
|
220
|
+
"expected_output": "A repeat run should apply the hash cache through bug-hunter-state and skip unchanged files before deep scan work starts.",
|
|
221
|
+
"files": [],
|
|
222
|
+
"assertions": [
|
|
223
|
+
{
|
|
224
|
+
"text": "Runs hash-filter against .bug-hunter/state.json before deep scan work",
|
|
145
225
|
"type": "content_check"
|
|
146
226
|
},
|
|
147
227
|
{
|
|
148
|
-
"text": "
|
|
228
|
+
"text": "Reports skipped unchanged files from the hash cache",
|
|
229
|
+
"type": "content_check"
|
|
230
|
+
}
|
|
231
|
+
]
|
|
232
|
+
},
|
|
233
|
+
{
|
|
234
|
+
"id": 11,
|
|
235
|
+
"prompt": "/bug-hunter src/ with malformed subagent payload",
|
|
236
|
+
"expected_output": "Payload validation should fail before any subagent launch when the generated payload does not match the required contract.",
|
|
237
|
+
"files": [],
|
|
238
|
+
"assertions": [
|
|
239
|
+
{
|
|
240
|
+
"text": "Validates subagent payloads with payload-guard.cjs before launch",
|
|
149
241
|
"type": "content_check"
|
|
150
242
|
},
|
|
151
243
|
{
|
|
152
|
-
"text": "
|
|
244
|
+
"text": "Does not launch a subagent when payload validation fails",
|
|
153
245
|
"type": "content_check"
|
|
154
246
|
}
|
|
155
247
|
]
|
|
156
248
|
},
|
|
157
249
|
{
|
|
158
|
-
"id":
|
|
159
|
-
"prompt": "/bug-hunter --
|
|
160
|
-
"expected_output": "
|
|
250
|
+
"id": 12,
|
|
251
|
+
"prompt": "/bug-hunter --fix src/ while another fix run is active",
|
|
252
|
+
"expected_output": "The fix phase should stop cleanly when the single-writer lock cannot be acquired.",
|
|
161
253
|
"files": [],
|
|
162
254
|
"assertions": [
|
|
163
255
|
{
|
|
164
|
-
"text": "
|
|
256
|
+
"text": "Attempts to acquire .bug-hunter/fix.lock before any edits",
|
|
165
257
|
"type": "content_check"
|
|
166
258
|
},
|
|
167
259
|
{
|
|
168
|
-
"text": "
|
|
260
|
+
"text": "Stops Phase 2 with a clear lock-held message when the fix lock is already held",
|
|
261
|
+
"type": "content_check"
|
|
262
|
+
}
|
|
263
|
+
]
|
|
264
|
+
},
|
|
265
|
+
{
|
|
266
|
+
"id": 13,
|
|
267
|
+
"prompt": "/bug-hunter --fix src/ with mixed-confidence bugs",
|
|
268
|
+
"expected_output": "Auto-fix should edit only eligible high-confidence bugs and leave the rest in manual review.",
|
|
269
|
+
"files": [],
|
|
270
|
+
"assertions": [
|
|
271
|
+
{
|
|
272
|
+
"text": "Applies the >=75 confidence threshold for auto-fix eligibility",
|
|
169
273
|
"type": "content_check"
|
|
170
274
|
},
|
|
171
275
|
{
|
|
172
|
-
"text": "
|
|
276
|
+
"text": "Keeps low-confidence bugs in manual review instead of auto-editing them",
|
|
277
|
+
"type": "content_check"
|
|
278
|
+
}
|
|
279
|
+
]
|
|
280
|
+
},
|
|
281
|
+
{
|
|
282
|
+
"id": 14,
|
|
283
|
+
"prompt": "/bug-hunter src/ on a CLI without spawn_agent",
|
|
284
|
+
"expected_output": "The skill should select the best available orchestration backend at runtime and fall back to local-sequential execution when delegation backends are unavailable.",
|
|
285
|
+
"files": [],
|
|
286
|
+
"assertions": [
|
|
287
|
+
{
|
|
288
|
+
"text": "Chooses AGENT_BACKEND during preflight based on available runtime tools",
|
|
173
289
|
"type": "content_check"
|
|
174
290
|
},
|
|
175
291
|
{
|
|
176
|
-
"text": "
|
|
292
|
+
"text": "Falls back to the next backend when a preferred launch path fails",
|
|
177
293
|
"type": "content_check"
|
|
178
294
|
},
|
|
179
295
|
{
|
|
180
|
-
"text": "
|
|
296
|
+
"text": "Completes the run with local-sequential fallback when no delegation backend is available",
|
|
181
297
|
"type": "content_check"
|
|
182
298
|
}
|
|
183
299
|
]
|
|
184
300
|
},
|
|
185
301
|
{
|
|
186
|
-
"id":
|
|
187
|
-
"prompt": "
|
|
188
|
-
"expected_output": "
|
|
302
|
+
"id": 15,
|
|
303
|
+
"prompt": "/bug-hunter huge-repo/ with flaky chunk worker",
|
|
304
|
+
"expected_output": "The chunk orchestrator should enforce retries with backoff and write attempt details to the canonical run journal.",
|
|
189
305
|
"files": [],
|
|
190
306
|
"assertions": [
|
|
191
307
|
{
|
|
192
|
-
"text": "
|
|
308
|
+
"text": "Uses run-bug-hunter.cjs for autonomous chunk orchestration",
|
|
193
309
|
"type": "content_check"
|
|
194
310
|
},
|
|
195
311
|
{
|
|
196
|
-
"text": "
|
|
312
|
+
"text": "Retries timed out or failed chunks according to max-retries and backoff policy",
|
|
197
313
|
"type": "content_check"
|
|
198
314
|
},
|
|
199
315
|
{
|
|
200
|
-
"text": "
|
|
316
|
+
"text": "Writes attempt events to .bug-hunter/run.log",
|
|
317
|
+
"type": "content_check"
|
|
318
|
+
}
|
|
319
|
+
]
|
|
320
|
+
},
|
|
321
|
+
{
|
|
322
|
+
"id": 16,
|
|
323
|
+
"prompt": "/bug-hunter --deps src/",
|
|
324
|
+
"expected_output": "Dependency scan mode should run the dependency audit helper, write dep-findings output, and feed reachable dependency issues into Hunter context.",
|
|
325
|
+
"files": [],
|
|
326
|
+
"assertions": [
|
|
327
|
+
{
|
|
328
|
+
"text": "Runs scripts/dep-scan.cjs when --deps is supplied",
|
|
201
329
|
"type": "content_check"
|
|
202
330
|
},
|
|
203
331
|
{
|
|
204
|
-
"text": "
|
|
332
|
+
"text": "Writes .bug-hunter/dep-findings.json for dependency scan output",
|
|
205
333
|
"type": "content_check"
|
|
206
334
|
},
|
|
207
335
|
{
|
|
208
|
-
"text": "
|
|
336
|
+
"text": "Includes reachable dependency findings in Hunter analysis context",
|
|
209
337
|
"type": "content_check"
|
|
210
338
|
}
|
|
211
339
|
]
|
|
212
340
|
},
|
|
213
341
|
{
|
|
214
|
-
"id":
|
|
215
|
-
"prompt": "/bug-hunter --
|
|
216
|
-
"expected_output": "
|
|
342
|
+
"id": 17,
|
|
343
|
+
"prompt": "/bug-hunter --threat-model src/",
|
|
344
|
+
"expected_output": "Threat-model mode should load or generate a STRIDE threat model and feed it into Recon and Hunter.",
|
|
217
345
|
"files": [],
|
|
218
346
|
"assertions": [
|
|
219
347
|
{
|
|
220
|
-
"text": "
|
|
348
|
+
"text": "Loads an existing .bug-hunter/threat-model.md or generates one when missing",
|
|
221
349
|
"type": "content_check"
|
|
222
350
|
},
|
|
223
351
|
{
|
|
224
|
-
"text": "
|
|
352
|
+
"text": "Marks THREAT_MODEL_AVAILABLE and uses the threat model in Recon and Hunter context",
|
|
353
|
+
"type": "content_check"
|
|
354
|
+
},
|
|
355
|
+
{
|
|
356
|
+
"text": "Keeps threat-model generation non-blocking relative to the rest of the bug-hunt flow",
|
|
357
|
+
"type": "content_check"
|
|
358
|
+
}
|
|
359
|
+
]
|
|
360
|
+
},
|
|
361
|
+
{
|
|
362
|
+
"id": 18,
|
|
363
|
+
"prompt": "/bug-hunter --fix --dry-run src/",
|
|
364
|
+
"expected_output": "Dry-run fix mode should build the fix plan and produce machine-readable fix output without editing files, committing, or taking the lock.",
|
|
365
|
+
"files": [],
|
|
366
|
+
"assertions": [
|
|
367
|
+
{
|
|
368
|
+
"text": "Sets DRY_RUN_MODE=true and forces FIX_MODE=true when --dry-run is provided",
|
|
225
369
|
"type": "content_check"
|
|
226
370
|
},
|
|
227
371
|
{
|
|
228
|
-
"text": "
|
|
372
|
+
"text": "Produces .bug-hunter/fix-report.json with dry_run set to true",
|
|
229
373
|
"type": "content_check"
|
|
230
374
|
},
|
|
231
375
|
{
|
|
232
|
-
"text": "
|
|
376
|
+
"text": "Skips file edits, git commits, and fix lock acquisition in dry-run mode",
|
|
233
377
|
"type": "content_check"
|
|
234
378
|
}
|
|
235
379
|
]
|
|
236
380
|
},
|
|
237
381
|
{
|
|
238
|
-
"id":
|
|
239
|
-
"prompt": "/bug-hunter
|
|
240
|
-
"expected_output": "
|
|
382
|
+
"id": 19,
|
|
383
|
+
"prompt": "/bug-hunter --autonomous src/",
|
|
384
|
+
"expected_output": "Autonomous mode should force fix mode and run canary-first, confidence-gated fixes without waiting for per-edit approval.",
|
|
241
385
|
"files": [],
|
|
242
386
|
"assertions": [
|
|
243
387
|
{
|
|
244
|
-
"text": "
|
|
388
|
+
"text": "Sets AUTONOMOUS_MODE=true and forces FIX_MODE=true when --autonomous is supplied",
|
|
245
389
|
"type": "content_check"
|
|
246
390
|
},
|
|
247
391
|
{
|
|
248
|
-
"text": "
|
|
392
|
+
"text": "Runs canary-first, confidence-gated fix rollout in autonomous mode",
|
|
249
393
|
"type": "content_check"
|
|
250
394
|
},
|
|
251
395
|
{
|
|
252
|
-
"text": "
|
|
396
|
+
"text": "Does not require approval-mode prompts for unattended autonomous fixes",
|
|
253
397
|
"type": "content_check"
|
|
254
398
|
}
|
|
255
399
|
]
|
|
256
400
|
},
|
|
257
401
|
{
|
|
258
|
-
"id":
|
|
259
|
-
"prompt": "/bug-hunter
|
|
260
|
-
"expected_output": "
|
|
402
|
+
"id": 20,
|
|
403
|
+
"prompt": "/bug-hunter --pr current",
|
|
404
|
+
"expected_output": "PR review mode should resolve the current PR scope, save PR metadata, and scan the resolved changed files rather than the whole repository.",
|
|
261
405
|
"files": [],
|
|
262
406
|
"assertions": [
|
|
263
407
|
{
|
|
264
|
-
"text": "
|
|
408
|
+
"text": "Uses scripts/pr-scope.cjs to resolve current PR metadata and changed files",
|
|
265
409
|
"type": "content_check"
|
|
266
410
|
},
|
|
267
411
|
{
|
|
268
|
-
"text": "
|
|
412
|
+
"text": "Writes .bug-hunter/pr-scope.json for later reporting",
|
|
413
|
+
"type": "content_check"
|
|
414
|
+
},
|
|
415
|
+
{
|
|
416
|
+
"text": "Scans the resolved changed files as the PR review scope",
|
|
269
417
|
"type": "content_check"
|
|
270
418
|
}
|
|
271
419
|
]
|
|
272
420
|
},
|
|
273
421
|
{
|
|
274
|
-
"id":
|
|
275
|
-
"prompt": "/bug-hunter
|
|
276
|
-
"expected_output": "
|
|
422
|
+
"id": 21,
|
|
423
|
+
"prompt": "/bug-hunter --pr recent --scan-only",
|
|
424
|
+
"expected_output": "Recent-PR review mode should resolve the most recent PR through GitHub metadata, limit analysis to its changed files, and stop after reporting.",
|
|
277
425
|
"files": [],
|
|
278
426
|
"assertions": [
|
|
279
427
|
{
|
|
280
|
-
"text": "
|
|
428
|
+
"text": "Resolves the most recent PR through pr-scope using GitHub metadata",
|
|
429
|
+
"type": "content_check"
|
|
430
|
+
},
|
|
431
|
+
{
|
|
432
|
+
"text": "Keeps FIX_MODE disabled because scan-only was requested",
|
|
281
433
|
"type": "content_check"
|
|
282
434
|
},
|
|
283
435
|
{
|
|
284
|
-
"text": "
|
|
436
|
+
"text": "Produces the normal findings/referee/report artifacts for the PR-scoped review",
|
|
285
437
|
"type": "content_check"
|
|
286
438
|
}
|
|
287
439
|
]
|
|
288
440
|
},
|
|
289
441
|
{
|
|
290
|
-
"id":
|
|
291
|
-
"prompt": "/bug-hunter --
|
|
292
|
-
"expected_output": "
|
|
442
|
+
"id": 22,
|
|
443
|
+
"prompt": "/bug-hunter --plan-only src/",
|
|
444
|
+
"expected_output": "Plan-only mode should build a remediation strategy and fix plan but stop before the Fixer edits code.",
|
|
293
445
|
"files": [],
|
|
294
446
|
"assertions": [
|
|
295
447
|
{
|
|
296
|
-
"text": "
|
|
448
|
+
"text": "Builds .bug-hunter/fix-strategy.json and .bug-hunter/fix-strategy.md before fix execution",
|
|
449
|
+
"type": "content_check"
|
|
450
|
+
},
|
|
451
|
+
{
|
|
452
|
+
"text": "Builds .bug-hunter/fix-plan.json while PLAN_ONLY_MODE is active",
|
|
297
453
|
"type": "content_check"
|
|
298
454
|
},
|
|
299
455
|
{
|
|
300
|
-
"text": "Stops
|
|
456
|
+
"text": "Stops before the Fixer edits files when --plan-only is supplied",
|
|
301
457
|
"type": "content_check"
|
|
302
458
|
}
|
|
303
459
|
]
|
|
304
460
|
},
|
|
305
461
|
{
|
|
306
|
-
"id":
|
|
307
|
-
"prompt": "/bug-hunter --
|
|
308
|
-
"expected_output": "
|
|
462
|
+
"id": 23,
|
|
463
|
+
"prompt": "/bug-hunter --plan src/ then /bug-hunter --preview src/ then /bug-hunter --safe src/ then /bug-hunter --last-pr --review",
|
|
464
|
+
"expected_output": "Shortcut aliases should map cleanly onto their canonical behaviors without changing the underlying execution semantics.",
|
|
309
465
|
"files": [],
|
|
310
466
|
"assertions": [
|
|
311
467
|
{
|
|
312
|
-
"text": "
|
|
468
|
+
"text": "Treats --plan as an alias for --plan-only",
|
|
313
469
|
"type": "content_check"
|
|
314
470
|
},
|
|
315
471
|
{
|
|
316
|
-
"text": "
|
|
472
|
+
"text": "Treats --preview as an alias for --fix --dry-run",
|
|
473
|
+
"type": "content_check"
|
|
474
|
+
},
|
|
475
|
+
{
|
|
476
|
+
"text": "Treats --safe as an alias for --fix --approve",
|
|
477
|
+
"type": "content_check"
|
|
478
|
+
},
|
|
479
|
+
{
|
|
480
|
+
"text": "Treats --last-pr and --review as aliases for --pr recent and --scan-only",
|
|
317
481
|
"type": "content_check"
|
|
318
482
|
}
|
|
319
483
|
]
|
|
320
484
|
},
|
|
321
485
|
{
|
|
322
|
-
"id":
|
|
323
|
-
"prompt": "/bug-hunter src/
|
|
324
|
-
"expected_output": "
|
|
486
|
+
"id": 24,
|
|
487
|
+
"prompt": "/bug-hunter --fix src/ with a high-confidence architectural-remediation finding",
|
|
488
|
+
"expected_output": "Execution gating should honor fix-strategy classifications so non-autofix findings never enter the executable canary or rollout queue.",
|
|
325
489
|
"files": [],
|
|
326
490
|
"assertions": [
|
|
327
491
|
{
|
|
328
|
-
"text": "
|
|
492
|
+
"text": "Builds fix-strategy classifications before building the executable fix plan",
|
|
329
493
|
"type": "content_check"
|
|
330
494
|
},
|
|
331
495
|
{
|
|
332
|
-
"text": "
|
|
496
|
+
"text": "Excludes manual-review, larger-refactor, and architectural-remediation findings from fixPlan canary/rollout",
|
|
333
497
|
"type": "content_check"
|
|
334
498
|
},
|
|
335
499
|
{
|
|
336
|
-
"text": "
|
|
500
|
+
"text": "Allows only autofixEligible safe-autofix findings into the executable fix queue",
|
|
337
501
|
"type": "content_check"
|
|
338
502
|
}
|
|
339
503
|
]
|
|
340
504
|
},
|
|
341
505
|
{
|
|
342
|
-
"id":
|
|
343
|
-
"prompt": "/bug-hunter
|
|
344
|
-
"expected_output": "
|
|
506
|
+
"id": 25,
|
|
507
|
+
"prompt": "/bug-hunter --pr current with gh unavailable and no trustworthy default base branch",
|
|
508
|
+
"expected_output": "Current-PR fallback should fail explicitly when it cannot determine a trustworthy base branch instead of silently assuming main.",
|
|
345
509
|
"files": [],
|
|
346
510
|
"assertions": [
|
|
347
511
|
{
|
|
348
|
-
"text": "Uses
|
|
512
|
+
"text": "Uses the discovered default branch or explicit --base for current-branch git fallback",
|
|
513
|
+
"type": "content_check"
|
|
514
|
+
},
|
|
515
|
+
{
|
|
516
|
+
"text": "Fails explicitly when no trustworthy base branch can be determined for current PR fallback",
|
|
517
|
+
"type": "content_check"
|
|
518
|
+
},
|
|
519
|
+
{
|
|
520
|
+
"text": "Does not silently assume main for current-PR fallback scope resolution",
|
|
521
|
+
"type": "content_check"
|
|
522
|
+
}
|
|
523
|
+
]
|
|
524
|
+
},
|
|
525
|
+
{
|
|
526
|
+
"id": 26,
|
|
527
|
+
"prompt": "/bug-hunter concurrent query-bugs and expired live fix-lock scenarios",
|
|
528
|
+
"expected_output": "Utility helpers should preserve correctness under failure and concurrency pressure.",
|
|
529
|
+
"files": [],
|
|
530
|
+
"assertions": [
|
|
531
|
+
{
|
|
532
|
+
"text": "query-bugs uses invocation-scoped temp seed files and cleans them up even on failure",
|
|
533
|
+
"type": "content_check"
|
|
534
|
+
},
|
|
535
|
+
{
|
|
536
|
+
"text": "fix-lock does not recover an expired lock when the recorded owner PID is still alive",
|
|
537
|
+
"type": "content_check"
|
|
538
|
+
},
|
|
539
|
+
{
|
|
540
|
+
"text": "Reports a live-owner lock conflict instead of allowing overlapping fixers",
|
|
541
|
+
"type": "content_check"
|
|
542
|
+
}
|
|
543
|
+
]
|
|
544
|
+
},
|
|
545
|
+
{
|
|
546
|
+
"id": 27,
|
|
547
|
+
"prompt": "/bug-hunter --pr-security",
|
|
548
|
+
"expected_output": "Enterprise PR security review should route through the bundled local commit-security-scan workflow, using PR scope, threat-model context, and dependency-awareness without editing code.",
|
|
549
|
+
"files": [],
|
|
550
|
+
"assertions": [
|
|
551
|
+
{
|
|
552
|
+
"text": "Treats --pr-security as PR-scoped security review with FIX_MODE disabled",
|
|
553
|
+
"type": "content_check"
|
|
554
|
+
},
|
|
555
|
+
{
|
|
556
|
+
"text": "Loads the bundled local skills/commit-security-scan/SKILL.md guidance for PR-focused security review",
|
|
557
|
+
"type": "content_check"
|
|
558
|
+
},
|
|
559
|
+
{
|
|
560
|
+
"text": "Combines PR scope resolution with threat-model and dependency-scan context",
|
|
561
|
+
"type": "content_check"
|
|
562
|
+
}
|
|
563
|
+
]
|
|
564
|
+
},
|
|
565
|
+
{
|
|
566
|
+
"id": 28,
|
|
567
|
+
"prompt": "/bug-hunter --security-review src/",
|
|
568
|
+
"expected_output": "Enterprise security-review mode should route through the bundled local security-review workflow and combine threat model, code review, dependency findings, and security validation semantics.",
|
|
569
|
+
"files": [],
|
|
570
|
+
"assertions": [
|
|
571
|
+
{
|
|
572
|
+
"text": "Treats --security-review as a bundled enterprise security workflow with FIX_MODE disabled",
|
|
573
|
+
"type": "content_check"
|
|
574
|
+
},
|
|
575
|
+
{
|
|
576
|
+
"text": "Loads the bundled local skills/security-review/SKILL.md guidance during execution",
|
|
577
|
+
"type": "content_check"
|
|
578
|
+
},
|
|
579
|
+
{
|
|
580
|
+
"text": "Runs with threat-model and dependency-scan context enabled",
|
|
581
|
+
"type": "content_check"
|
|
582
|
+
}
|
|
583
|
+
]
|
|
584
|
+
},
|
|
585
|
+
{
|
|
586
|
+
"id": 29,
|
|
587
|
+
"prompt": "/bug-hunter --threat-model src/ when no threat model exists yet",
|
|
588
|
+
"expected_output": "Threat-model mode should route through the bundled local threat-model-generation skill and produce Bug Hunter-native threat-model artifacts.",
|
|
589
|
+
"files": [],
|
|
590
|
+
"assertions": [
|
|
591
|
+
{
|
|
592
|
+
"text": "Loads the bundled local skills/threat-model-generation/SKILL.md before generating the threat model",
|
|
593
|
+
"type": "content_check"
|
|
594
|
+
},
|
|
595
|
+
{
|
|
596
|
+
"text": "Writes .bug-hunter/threat-model.md and .bug-hunter/security-config.json",
|
|
597
|
+
"type": "content_check"
|
|
598
|
+
},
|
|
599
|
+
{
|
|
600
|
+
"text": "Keeps all threat-model artifacts under .bug-hunter instead of external .factory paths",
|
|
601
|
+
"type": "content_check"
|
|
602
|
+
}
|
|
603
|
+
]
|
|
604
|
+
},
|
|
605
|
+
{
|
|
606
|
+
"id": 30,
|
|
607
|
+
"prompt": "/bug-hunter --validate-security src/ with confirmed security findings",
|
|
608
|
+
"expected_output": "Security-validation mode should route through the bundled local vulnerability-validation skill and enrich confirmed security findings with exploitability-oriented reasoning.",
|
|
609
|
+
"files": [],
|
|
610
|
+
"assertions": [
|
|
611
|
+
{
|
|
612
|
+
"text": "Loads the bundled local skills/vulnerability-validation/SKILL.md when security validation is requested",
|
|
349
613
|
"type": "content_check"
|
|
350
614
|
},
|
|
351
615
|
{
|
|
352
|
-
"text": "
|
|
616
|
+
"text": "Re-checks reachability, exploitability, PoC quality, and CVSS details for confirmed security findings",
|
|
353
617
|
"type": "content_check"
|
|
354
618
|
},
|
|
355
619
|
{
|
|
356
|
-
"text": "
|
|
620
|
+
"text": "Uses Bug Hunter-native artifacts rather than a separate external validation pipeline",
|
|
357
621
|
"type": "content_check"
|
|
358
622
|
}
|
|
359
623
|
]
|