gauntlet-spec 0.1.1__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (223)
  1. gauntlet_spec-0.1.1/.claude/settings.json +17 -0
  2. gauntlet_spec-0.1.1/.codex/hooks.json +17 -0
  3. gauntlet_spec-0.1.1/.gauntlet/config.yaml +65 -0
  4. gauntlet_spec-0.1.1/.gauntlet/pins.yaml +185 -0
  5. gauntlet_spec-0.1.1/.github/workflows/release.yml +52 -0
  6. gauntlet_spec-0.1.1/.gitignore +18 -0
  7. gauntlet_spec-0.1.1/AGENTS.md +266 -0
  8. gauntlet_spec-0.1.1/BOOTSTRAP-NOTES.md +675 -0
  9. gauntlet_spec-0.1.1/CLAUDE.md +266 -0
  10. gauntlet_spec-0.1.1/FUTURE.md +25 -0
  11. gauntlet_spec-0.1.1/LICENSE +21 -0
  12. gauntlet_spec-0.1.1/PKG-INFO +364 -0
  13. gauntlet_spec-0.1.1/PRD-gauntlet.md +458 -0
  14. gauntlet_spec-0.1.1/README.md +337 -0
  15. gauntlet_spec-0.1.1/pipelines/bootstrap.yaml +59 -0
  16. gauntlet_spec-0.1.1/pipelines/standard.yaml +79 -0
  17. gauntlet_spec-0.1.1/policy.yaml +179 -0
  18. gauntlet_spec-0.1.1/prompts/CHANGELOG.md +11 -0
  19. gauntlet_spec-0.1.1/prompts/bootstrap-implement-p5.md +26 -0
  20. gauntlet_spec-0.1.1/prompts/bootstrap-implement-p6.md +20 -0
  21. gauntlet_spec-0.1.1/prompts/bootstrap-implement-p7.md +19 -0
  22. gauntlet_spec-0.1.1/prompts/commit-message.md +14 -0
  23. gauntlet_spec-0.1.1/prompts/cycle-confirm.md +19 -0
  24. gauntlet_spec-0.1.1/prompts/cycle-fix.md +16 -0
  25. gauntlet_spec-0.1.1/prompts/cycle-rereview.md +25 -0
  26. gauntlet_spec-0.1.1/prompts/cycle-review.md +23 -0
  27. gauntlet_spec-0.1.1/prompts/implement-phase.md +31 -0
  28. gauntlet_spec-0.1.1/prompts/plan-author.md +42 -0
  29. gauntlet_spec-0.1.1/prompts/proposal-synthesis.md +36 -0
  30. gauntlet_spec-0.1.1/prompts/retro.md +26 -0
  31. gauntlet_spec-0.1.1/prompts/review-code.md +26 -0
  32. gauntlet_spec-0.1.1/prompts/review-document.md +26 -0
  33. gauntlet_spec-0.1.1/prompts/triage-corpus.jsonl +36 -0
  34. gauntlet_spec-0.1.1/prompts/triage.md +107 -0
  35. gauntlet_spec-0.1.1/pyproject.toml +66 -0
  36. gauntlet_spec-0.1.1/runs/gauntlet/CONTINUE-P3-PROMPT.md +105 -0
  37. gauntlet_spec-0.1.1/runs/gauntlet/CONTINUE-P4-PROMPT.md +181 -0
  38. gauntlet_spec-0.1.1/runs/gauntlet/IMPLEMENT-PROMPT.md +95 -0
  39. gauntlet_spec-0.1.1/runs/gauntlet/plan.md +500 -0
  40. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/PR.md +38 -0
  41. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/confirm-events.jsonl +30 -0
  42. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/confirm-prompt.md +702 -0
  43. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/confirm-r2-events.jsonl +4 -0
  44. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/confirm-r2-prompt.md +60 -0
  45. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/confirm-r2-verdict.json +1 -0
  46. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/confirm-schema.json +35 -0
  47. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/confirm-verdict.json +1 -0
  48. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/findings.json +1 -0
  49. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/review-events.jsonl +88 -0
  50. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/review-prompt.md +80 -0
  51. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/review-schema.json +41 -0
  52. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p1-cycle-r1/triage.json +49 -0
  53. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/confirm-events.jsonl +27 -0
  54. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/confirm-prompt.md +688 -0
  55. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/confirm-r2-events.jsonl +24 -0
  56. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/confirm-r2-prompt.md +157 -0
  57. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/confirm-r2-verdict.json +1 -0
  58. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/confirm-schema.json +35 -0
  59. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/confirm-verdict.json +1 -0
  60. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/findings.json +1 -0
  61. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/review-events.jsonl +84 -0
  62. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/review-prompt.md +98 -0
  63. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/review-schema.json +41 -0
  64. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p2-cycle-r1/triage.json +73 -0
  65. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/confirm-events.jsonl +40 -0
  66. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/confirm-prompt.md +1078 -0
  67. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/confirm-r2-events.jsonl +4 -0
  68. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/confirm-r2-prompt.md +1307 -0
  69. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/confirm-r2-verdict.json +1 -0
  70. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/confirm-schema.json +35 -0
  71. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/confirm-verdict.json +1 -0
  72. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/findings.json +1 -0
  73. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/review-events.jsonl +112 -0
  74. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/review-prompt.md +123 -0
  75. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/review-schema.json +41 -0
  76. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p3-cycle-r1/triage.json +72 -0
  77. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p4-cycle-r1/confirm-events.jsonl +28 -0
  78. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p4-cycle-r1/confirm-prompt.md +1109 -0
  79. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p4-cycle-r1/confirm-schema.json +47 -0
  80. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p4-cycle-r1/confirm-verdict.json +46 -0
  81. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p4-cycle-r1/findings.json +78 -0
  82. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p4-cycle-r1/review-events.jsonl +108 -0
  83. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p4-cycle-r1/review-prompt.md +151 -0
  84. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p4-cycle-r1/review-schema.json +70 -0
  85. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p4-cycle-r1/triage.json +84 -0
  86. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/p4-triage-accuracy.md +40 -0
  87. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/plan-cycle-r1/confirm-events.jsonl +4 -0
  88. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/plan-cycle-r1/confirm-prompt.md +283 -0
  89. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/plan-cycle-r1/confirm-schema.json +35 -0
  90. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/plan-cycle-r1/confirm-verdict.json +1 -0
  91. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/plan-cycle-r1/findings.md +100 -0
  92. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/manual/plan-cycle-r1/triage.json +98 -0
  93. gauntlet_spec-0.1.1/runs/gauntlet-bootstrap/prd.md +12 -0
  94. gauntlet_spec-0.1.1/schemas/confirm.json +47 -0
  95. gauntlet_spec-0.1.1/schemas/findings.json +70 -0
  96. gauntlet_spec-0.1.1/schemas/proposals.json +40 -0
  97. gauntlet_spec-0.1.1/schemas/triage.json +55 -0
  98. gauntlet_spec-0.1.1/src/gauntlet/__init__.py +3 -0
  99. gauntlet_spec-0.1.1/src/gauntlet/__main__.py +6 -0
  100. gauntlet_spec-0.1.1/src/gauntlet/adapters/__init__.py +59 -0
  101. gauntlet_spec-0.1.1/src/gauntlet/adapters/_structured.py +51 -0
  102. gauntlet_spec-0.1.1/src/gauntlet/adapters/api.py +180 -0
  103. gauntlet_spec-0.1.1/src/gauntlet/adapters/base.py +93 -0
  104. gauntlet_spec-0.1.1/src/gauntlet/adapters/claude_code.py +267 -0
  105. gauntlet_spec-0.1.1/src/gauntlet/adapters/codex.py +244 -0
  106. gauntlet_spec-0.1.1/src/gauntlet/adapters/process.py +80 -0
  107. gauntlet_spec-0.1.1/src/gauntlet/cli.py +298 -0
  108. gauntlet_spec-0.1.1/src/gauntlet/config.py +88 -0
  109. gauntlet_spec-0.1.1/src/gauntlet/engine/__init__.py +26 -0
  110. gauntlet_spec-0.1.1/src/gauntlet/engine/commit_format.py +74 -0
  111. gauntlet_spec-0.1.1/src/gauntlet/engine/config.py +149 -0
  112. gauntlet_spec-0.1.1/src/gauntlet/engine/cycle.py +1076 -0
  113. gauntlet_spec-0.1.1/src/gauntlet/engine/doctor.py +524 -0
  114. gauntlet_spec-0.1.1/src/gauntlet/engine/execution.py +188 -0
  115. gauntlet_spec-0.1.1/src/gauntlet/engine/expr.py +74 -0
  116. gauntlet_spec-0.1.1/src/gauntlet/engine/feedback.py +115 -0
  117. gauntlet_spec-0.1.1/src/gauntlet/engine/gitops.py +277 -0
  118. gauntlet_spec-0.1.1/src/gauntlet/engine/init.py +217 -0
  119. gauntlet_spec-0.1.1/src/gauntlet/engine/judgeproc.py +168 -0
  120. gauntlet_spec-0.1.1/src/gauntlet/engine/manifest.py +158 -0
  121. gauntlet_spec-0.1.1/src/gauntlet/engine/orchestrator.py +474 -0
  122. gauntlet_spec-0.1.1/src/gauntlet/engine/pipeline.py +111 -0
  123. gauntlet_spec-0.1.1/src/gauntlet/engine/planphases.py +99 -0
  124. gauntlet_spec-0.1.1/src/gauntlet/engine/pr.py +129 -0
  125. gauntlet_spec-0.1.1/src/gauntlet/engine/proposals.py +386 -0
  126. gauntlet_spec-0.1.1/src/gauntlet/engine/report.py +121 -0
  127. gauntlet_spec-0.1.1/src/gauntlet/engine/retro.py +542 -0
  128. gauntlet_spec-0.1.1/src/gauntlet/engine/run.py +723 -0
  129. gauntlet_spec-0.1.1/src/gauntlet/engine/steptypes.py +483 -0
  130. gauntlet_spec-0.1.1/src/gauntlet/engine/trend.py +166 -0
  131. gauntlet_spec-0.1.1/src/gauntlet/engine/triage_eval.py +172 -0
  132. gauntlet_spec-0.1.1/src/gauntlet/engine/validate.py +200 -0
  133. gauntlet_spec-0.1.1/src/gauntlet/judge/__init__.py +34 -0
  134. gauntlet_spec-0.1.1/src/gauntlet/judge/classifier.py +88 -0
  135. gauntlet_spec-0.1.1/src/gauntlet/judge/core.py +118 -0
  136. gauntlet_spec-0.1.1/src/gauntlet/judge/decision.py +25 -0
  137. gauntlet_spec-0.1.1/src/gauntlet/judge/hook_client.py +165 -0
  138. gauntlet_spec-0.1.1/src/gauntlet/judge/policy.py +232 -0
  139. gauntlet_spec-0.1.1/src/gauntlet/judge/runner.py +103 -0
  140. gauntlet_spec-0.1.1/src/gauntlet/judge/service.py +85 -0
  141. gauntlet_spec-0.1.1/src/gauntlet/logging/__init__.py +27 -0
  142. gauntlet_spec-0.1.1/src/gauntlet/logging/redact.py +199 -0
  143. gauntlet_spec-0.1.1/src/gauntlet/logging/transcript.py +291 -0
  144. gauntlet_spec-0.1.1/src/gauntlet/pins.py +44 -0
  145. gauntlet_spec-0.1.1/src/gauntlet/scaffold/claude-settings.json +17 -0
  146. gauntlet_spec-0.1.1/src/gauntlet/scaffold/codex-hooks.json +17 -0
  147. gauntlet_spec-0.1.1/src/gauntlet/scaffold/config.yaml +65 -0
  148. gauntlet_spec-0.1.1/src/gauntlet/scaffold/gitignore-guidance.txt +14 -0
  149. gauntlet_spec-0.1.1/src/gauntlet/scaffold/pins.yaml +185 -0
  150. gauntlet_spec-0.1.1/src/gauntlet/scaffold/pipelines/standard.yaml +79 -0
  151. gauntlet_spec-0.1.1/src/gauntlet/scaffold/policy.yaml +179 -0
  152. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/commit-message.md +14 -0
  153. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/cycle-confirm.md +19 -0
  154. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/cycle-fix.md +16 -0
  155. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/cycle-rereview.md +25 -0
  156. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/cycle-review.md +23 -0
  157. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/implement-phase.md +31 -0
  158. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/plan-author.md +42 -0
  159. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/proposal-synthesis.md +36 -0
  160. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/retro.md +26 -0
  161. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/review-code.md +26 -0
  162. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/review-document.md +26 -0
  163. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/triage-corpus.jsonl +36 -0
  164. gauntlet_spec-0.1.1/src/gauntlet/scaffold/prompts/triage.md +107 -0
  165. gauntlet_spec-0.1.1/src/gauntlet/scaffold/schemas/confirm.json +47 -0
  166. gauntlet_spec-0.1.1/src/gauntlet/scaffold/schemas/findings.json +70 -0
  167. gauntlet_spec-0.1.1/src/gauntlet/scaffold/schemas/proposals.json +40 -0
  168. gauntlet_spec-0.1.1/src/gauntlet/scaffold/schemas/triage.json +55 -0
  169. gauntlet_spec-0.1.1/tests/fixtures/toy/prd.md +35 -0
  170. gauntlet_spec-0.1.1/tests/integration/conftest.py +39 -0
  171. gauntlet_spec-0.1.1/tests/integration/test_api_contract.py +36 -0
  172. gauntlet_spec-0.1.1/tests/integration/test_claude_contract.py +98 -0
  173. gauntlet_spec-0.1.1/tests/integration/test_codex_contract.py +119 -0
  174. gauntlet_spec-0.1.1/tests/integration/test_codex_sandbox.py +120 -0
  175. gauntlet_spec-0.1.1/tests/integration/test_cycle_contract.py +159 -0
  176. gauntlet_spec-0.1.1/tests/integration/test_install_second_env.py +177 -0
  177. gauntlet_spec-0.1.1/tests/integration/test_judge_live.py +306 -0
  178. gauntlet_spec-0.1.1/tests/integration/test_pipeline_contract.py +165 -0
  179. gauntlet_spec-0.1.1/tests/integration/test_standard_pipeline_e2e.py +212 -0
  180. gauntlet_spec-0.1.1/tests/integration/test_triage_accuracy.py +74 -0
  181. gauntlet_spec-0.1.1/tests/unit/_crash_child.py +43 -0
  182. gauntlet_spec-0.1.1/tests/unit/conftest.py +98 -0
  183. gauntlet_spec-0.1.1/tests/unit/test_api_adapter.py +124 -0
  184. gauntlet_spec-0.1.1/tests/unit/test_bootstrap_pipeline.py +64 -0
  185. gauntlet_spec-0.1.1/tests/unit/test_claude_adapter.py +225 -0
  186. gauntlet_spec-0.1.1/tests/unit/test_codex_adapter.py +214 -0
  187. gauntlet_spec-0.1.1/tests/unit/test_commit_format.py +68 -0
  188. gauntlet_spec-0.1.1/tests/unit/test_cycle.py +908 -0
  189. gauntlet_spec-0.1.1/tests/unit/test_doctor.py +296 -0
  190. gauntlet_spec-0.1.1/tests/unit/test_expr.py +45 -0
  191. gauntlet_spec-0.1.1/tests/unit/test_feedback.py +67 -0
  192. gauntlet_spec-0.1.1/tests/unit/test_flag_lint.py +84 -0
  193. gauntlet_spec-0.1.1/tests/unit/test_gitops.py +54 -0
  194. gauntlet_spec-0.1.1/tests/unit/test_hook_client.py +205 -0
  195. gauntlet_spec-0.1.1/tests/unit/test_init.py +217 -0
  196. gauntlet_spec-0.1.1/tests/unit/test_judge_core.py +221 -0
  197. gauntlet_spec-0.1.1/tests/unit/test_judge_service.py +75 -0
  198. gauntlet_spec-0.1.1/tests/unit/test_judgeproc.py +63 -0
  199. gauntlet_spec-0.1.1/tests/unit/test_manifest.py +66 -0
  200. gauntlet_spec-0.1.1/tests/unit/test_orchestrator.py +405 -0
  201. gauntlet_spec-0.1.1/tests/unit/test_pins.py +51 -0
  202. gauntlet_spec-0.1.1/tests/unit/test_pipeline_loader.py +219 -0
  203. gauntlet_spec-0.1.1/tests/unit/test_plan_phases.py +117 -0
  204. gauntlet_spec-0.1.1/tests/unit/test_policy.py +358 -0
  205. gauntlet_spec-0.1.1/tests/unit/test_pr_draft.py +80 -0
  206. gauntlet_spec-0.1.1/tests/unit/test_process_timeout.py +73 -0
  207. gauntlet_spec-0.1.1/tests/unit/test_proposals.py +279 -0
  208. gauntlet_spec-0.1.1/tests/unit/test_proposals_review.py +224 -0
  209. gauntlet_spec-0.1.1/tests/unit/test_redact.py +139 -0
  210. gauntlet_spec-0.1.1/tests/unit/test_registry.py +42 -0
  211. gauntlet_spec-0.1.1/tests/unit/test_report.py +78 -0
  212. gauntlet_spec-0.1.1/tests/unit/test_resume_crash.py +136 -0
  213. gauntlet_spec-0.1.1/tests/unit/test_retro.py +293 -0
  214. gauntlet_spec-0.1.1/tests/unit/test_run_lifecycle.py +340 -0
  215. gauntlet_spec-0.1.1/tests/unit/test_schemas.py +71 -0
  216. gauntlet_spec-0.1.1/tests/unit/test_standard_e2e.py +258 -0
  217. gauntlet_spec-0.1.1/tests/unit/test_standard_pipeline.py +91 -0
  218. gauntlet_spec-0.1.1/tests/unit/test_steptypes.py +253 -0
  219. gauntlet_spec-0.1.1/tests/unit/test_transcript.py +182 -0
  220. gauntlet_spec-0.1.1/tests/unit/test_trend.py +117 -0
  221. gauntlet_spec-0.1.1/tests/unit/test_triage_eval.py +103 -0
  222. gauntlet_spec-0.1.1/tests/unit/test_wiring.py +25 -0
  223. gauntlet_spec-0.1.1/uv.lock +2196 -0
@@ -0,0 +1,17 @@
1
+ {
2
+ "_comment": "Gauntlet safety wiring (FR-7.3, plan P2). The PreToolUse hook routes every tool call through the localhost judge via the gauntlet-judge-hook console script (installed on PATH by `uv tool install` / `pipx install`, FR-1.1; `gauntlet init` writes this file, FR-1.2). The hook is safe-by-default: with no GAUNTLET_JUDGE_TOKEN configured it returns `ask` and defers to normal permission handling, so a session without a running judge is never bricked. Under an active gauntlet run the engine sets GAUNTLET_JUDGE_TOKEN/URL/MODE; unattended runs fail closed on a dead judge, interactive sessions fall back to a prompt (review F-004).",
3
+ "hooks": {
4
+ "PreToolUse": [
5
+ {
6
+ "matcher": "*",
7
+ "hooks": [
8
+ {
9
+ "type": "command",
10
+ "command": "gauntlet-judge-hook",
11
+ "timeout": 15
12
+ }
13
+ ]
14
+ }
15
+ ]
16
+ }
17
+ }
@@ -0,0 +1,17 @@
1
+ {
2
+ "_comment": "Gauntlet safety wiring for Codex (FR-1.2, FR-7.3). The PreToolUse hook routes Codex Bash tool calls through the same localhost judge as Claude Code, via the gauntlet-judge-hook console script (installed on PATH by `uv tool install` / `pipx install`, FR-1.1). NOTE (BOOTSTRAP-NOTES #10, ratified 2026-06-11): `codex exec` does NOT fire PreToolUse hooks on the pinned build (codex-cli 0.139.0) — the hook runtime is app-server/TUI-only. Codex's headless pre-execution control is therefore its SANDBOX (read-only for review steps, workspace-write for build steps), which `.gauntlet/config.yaml` sets per profile. This file is committable and forward-looking: it activates automatically if a future codex build fires exec hooks, at which point `gauntlet doctor` reports the firing change. The exact codex hooks schema is unverified on the pinned build (the hook never fires to exercise it); it mirrors the shared stdin-JSON / exit-code-2 / permissionDecision contract the hook client speaks.",
3
+ "hooks": {
4
+ "PreToolUse": [
5
+ {
6
+ "matcher": "Bash",
7
+ "hooks": [
8
+ {
9
+ "type": "command",
10
+ "command": "gauntlet-judge-hook",
11
+ "timeout": 15
12
+ }
13
+ ]
14
+ }
15
+ ]
16
+ }
17
+ }
@@ -0,0 +1,65 @@
1
+ # Gauntlet run configuration (FR-2.1, plan P3).
2
+ #
3
+ # Agent profiles bind adapter + model + flags; pipeline steps reference them by
4
+ # name (`agent: builder`). Swapping builder/reviewer roles is a YAML edit here or
5
+ # a per-step override — no code change (FR-2.2). The engine builds the actual
6
+ # adapter via the entry-point registry (FR-2.4) and lints flags at load (§8).
7
+ #
8
+ # `gauntlet init` will scaffold this file in P6; it is hand-authored for the
9
+ # bootstrap so the P3 engine has profiles to resolve.
10
+
11
+ base_branch: main
12
+ branch_prefix: "gauntlet/"
13
+ # Single run-root for plan/transcripts/manifests (BOOTSTRAP-NOTES #2 / #13).
14
+ run_root: runs
15
+ test_command: "uv run pytest"
16
+
17
+ # On resume of a step killed mid-edit with a dirty worktree (review F-003):
18
+ # park -> stop for a human (default; nothing re-run over partial work)
19
+ # reset_to_base -> back up the partial work to a ref, reset to base, re-run
20
+ interrupted_step: park
21
+
22
+ agents:
23
+ builder:
24
+ adapter: claude-code
25
+ # Short alias form — the only spelling claude 2.1.172 resolves (pinned;
26
+ # `claude-opus-latest` 404s, caught live at P5 start, notes #24).
27
+ model: opus
28
+ permission_mode: acceptEdits
29
+ allowed_tools: [Bash, Read, Write, Edit, Grep, Glob]
30
+ # claude loads the repo's PreToolUse hook only under project setting sources
31
+ # (pins.yaml); the engine-managed judge cannot gate the builder without it.
32
+ base_flags: ["--setting-sources", "project"]
33
+ # A bootstrap implement step is a long-running build; the adapter's 600s
34
+ # default would halt it mid-phase (FR-3.3 applies, just with a realistic
35
+ # ceiling — notes #25).
36
+ step_timeout_s: 5400
37
+ reviewer:
38
+ adapter: codex
39
+ model: gpt-5.5
40
+ sandbox: read-only # pure review steps are read-only (FR-9.6)
41
+ # Cheap-tier models verified live on this machine (P1/P4 contract runs; only
42
+ # an OpenAI key is present and `claude-haiku-latest` does not resolve via
43
+ # LiteLLM — see pins.yaml). FR-3.1 tiering: mini for classification, full
44
+ # model for severity-aware triage escalation (P4, review F-009).
45
+ triage:
46
+ adapter: api
47
+ model: gpt-5-mini
48
+ escalation:
49
+ adapter: api
50
+ model: gpt-5
51
+ judge_llm:
52
+ adapter: api
53
+ model: gpt-5-mini
54
+
55
+ # Per-agent commit identities (FR-9.7) so git log distinguishes who authored what.
56
+ identities:
57
+ builder:
58
+ name: "Gauntlet Builder (claude)"
59
+ email: "builder@gauntlet.local"
60
+ reviewer:
61
+ name: "Gauntlet Reviewer (codex)"
62
+ email: "reviewer@gauntlet.local"
63
+ triage:
64
+ name: "Gauntlet Triage"
65
+ email: "triage@gauntlet.local"
@@ -0,0 +1,185 @@
1
+ # Doctor pin file (FR-1.5 groundwork, plan P1).
2
+ # Records the installed CLI versions and the exact flags the P1 contract
3
+ # suite verified against them (uv run pytest -m integration, 2026-06-10,
4
+ # 9/9 passed). Written from what the installed CLIs actually do — where
5
+ # observed behavior differs from the PRD/prompt, this file and a
6
+ # BOOTSTRAP-NOTES.md entry win.
7
+ verified_date: "2026-06-10"
8
+ clis:
9
+ claude:
10
+ version: "2.1.172"
11
+ verified_flags:
12
+ - flag: "-p --output-format json (prompt via stdin)"
13
+ verified: >-
14
+ Single result object on stdout with result, session_id,
15
+ total_cost_usd, structured_output, and usage
16
+ {input_tokens, output_tokens, cache_read_input_tokens, ...}.
17
+ - flag: "--output-format stream-json"
18
+ verified: >-
19
+ JSONL events (system/init with session_id, assistant turns, final
20
+ result). Requires --verbose in -p mode.
21
+ - flag: "--resume <session-id>"
22
+ verified: >-
23
+ Continues the recorded session in -p mode; prior-turn content
24
+ (codeword) recalled.
25
+ - flag: "--model haiku"
26
+ verified: "Model alias resolves; used for all smoke traffic."
27
+ - flag: "--model opus"
28
+ verified: >-
29
+ P5 start: short alias resolves (modelUsage reports claude-opus-4-8).
30
+ The long form `claude-opus-latest` is REJECTED with an in-band 404
31
+ result (is_error=true, exit 1) — the builder profile must use the
32
+ short alias. Caught live by the first engine-driven builder step;
33
+ evidence preserved by the P4 failure-path transcripts (F-007).
34
+ - flag: '--tools ""'
35
+ verified: "Empty string disables all built-in tools (tool-less smoke)."
36
+ - flag: "--json-schema <inline-json>"
37
+ verified: >-
38
+ Native structured output: conforming object surfaced in the result
39
+ event's structured_output field (PRD assumed best-effort JSON for
40
+ claude — positive divergence, see BOOTSTRAP-NOTES #9).
41
+ - flag: "--permission-mode acceptEdits + --allowedTools/--tools Write"
42
+ verified: >-
43
+ Write tool ran unprompted in a disposable fixture repo; file
44
+ created with expected content.
45
+ - flag: "PreToolUse hook via .claude/settings.json (--setting-sources project)"
46
+ verified: >-
47
+ P2: fires in `claude -p` for Bash tool calls. Payload carries
48
+ {session_id, cwd, hook_event_name, tool_name, tool_input{command},
49
+ tool_use_id, permission_mode}. Emitting hookSpecificOutput
50
+ permissionDecision=deny blocks the tool PRE-execution and surfaces
51
+ the reason to the agent (verified end-to-end: denied echo never
52
+ produced its output). permissionDecision=allow proceeds;
53
+ unreachable+interactive falls back to a normal prompt (no deadlock).
54
+ notes:
55
+ - >-
56
+ Cost IS reported (total_cost_usd) on this install, so the tokens-only
57
+ degraded path (PRD §12 Q3) is claude's fallback, not its default.
58
+ - >-
59
+ Permission-bypass flags observed in --help (--dangerously-skip-permissions,
60
+ --allow-dangerously-skip-permissions, --bare which skips hooks,
61
+ --permission-mode bypassPermissions) are rejected by the config lint.
62
+ - >-
63
+ P3: no --max-turns flag on claude 2.1.172 (`claude --help` has no
64
+ turn/limit option), and `codex exec` has no turn cap either. The engine
65
+ therefore rejects `max_turns` at pipeline load (review F-006); the
66
+ working per-step halts are timeout_s and budget_usd (FR-3.3).
67
+ codex:
68
+ version: "codex-cli 0.139.0"
69
+ verified_flags:
70
+ - flag: "exec --json (prompt via stdin with '-')"
71
+ verified: >-
72
+ JSONL events on stdout: thread.started{thread_id}, turn.started,
73
+ item.completed{item.type=agent_message, text}, turn.completed
74
+ {usage: input_tokens, cached_input_tokens, output_tokens,
75
+ reasoning_output_tokens}.
76
+ - flag: "exec --output-schema <file>"
77
+ verified: >-
78
+ Final agent message conformed to the schema; parsed and validated
79
+ first try (consistent with the plan-cycle-r1 capture).
80
+ - flag: "exec --output-schema (P4 live review path, strict-mode rules)"
81
+ verified: >-
82
+ P4: the schema is enforced server-side under OpenAI strict mode —
83
+ a schema whose `required` omits any declared property is rejected
84
+ with HTTP 400 before the model runs (exit 1). "Optional" fields
85
+ must be spelled required-but-nullable (type ["string","null"]);
86
+ `pattern` constraints were removed from the normative schemas to
87
+ stay inside the strict subset. With schemas/findings.json in that
88
+ form, the live adversarial_cycle contract test round-tripped:
89
+ review findings validated, fix-round commit, diff-scoped confirm
90
+ verdicts validated (tests/integration/test_cycle_contract.py).
91
+ - flag: "--output-last-message <file>"
92
+ verified: >-
93
+ Long form live-verified: file written with the final agent message
94
+ text (fresh exec and resume). The -o short alias is observed in
95
+ --help only, not exercised by the contract suite.
96
+ - flag: "-s read-only / -s workspace-write"
97
+ verified: >-
98
+ read-only used for all review/smoke runs; workspace-write created a
99
+ file in a disposable fixture repo.
100
+ - flag: "exec resume <session-id>"
101
+ verified: >-
102
+ Live contract tests: plain resume recalled prior-turn content
103
+ (codeword), and resume + --output-schema + --output-last-message
104
+ returned conforming JSON (the combination P4's confirm pass relies
105
+ on). resume accepts --json/--output-schema/--output-last-message
106
+ but NOT --sandbox/-s (help-surface assertion test) — the sandbox
107
+ must be pinned via -c sandbox_mode="..." on resume.
108
+ notes:
109
+ - >-
110
+ --full-auto does NOT exist on `codex exec` 0.139.0 (PRD §4.1/plan
111
+ mention it): exec is already non-interactive; the sandbox flag alone
112
+ governs write access. Verified by a help-surface assertion test, not
113
+ a live run. See BOOTSTRAP-NOTES #9.
114
+ - >-
115
+ Cost is never reported in the event stream — tokens-only degraded
116
+ path (PRD §12 Q3) is codex's normal mode.
117
+ - >-
118
+ Bypass flags observed in --help (--dangerously-bypass-approvals-and-sandbox,
119
+ --dangerously-bypass-hook-trust, -s danger-full-access) are rejected
120
+ by the config lint.
121
+ - >-
122
+ P2 HOOK FINDING (deviation, ratified by John 2026-06-11): `codex exec`
123
+ does NOT fire PreToolUse hooks on 0.139.0 — verified against project
124
+ `.codex/hooks.json`, user `~/.codex/hooks.json`, `[features].hooks=true`
125
+ (globally stable+true), and even `--dangerously-bypass-hook-trust`. The
126
+ hook runtime (hooks/list, hook trust review) is app-server/TUI-only.
127
+ Codex's pre-execution control is therefore its SANDBOX, verified live:
128
+ `-s read-only` blocks all writes; `-s workspace-write` confines writes
129
+ to the workspace + system temp (TMPDIR/`/tmp` are writable by design),
130
+ and refuses writes to `$HOME`. The codex hook client is wired but inert
131
+ on this build; it activates automatically if a future codex fires exec
132
+ hooks. FR-7's "100% blocked pre-execution + audit" is met via claude;
133
+ codex is sandbox-sourced (coarser, no per-command judge audit). See
134
+ BOOTSTRAP-NOTES #10.
135
+ api_models:
136
+ version: "litellm (P4 verification, 2026-06-11)"
137
+ verified_flags:
138
+ - flag: "gpt-5-mini via LiteLLM (triage/judge_llm cheap tier)"
139
+ verified: >-
140
+ P4: live completion + cost reporting work with the present OpenAI
141
+ key (~$0.00015 per short call). Ran the full 36-finding triage
142
+ accuracy harness: 94.4% verdict agreement, blocking 9/9, zero
143
+ unescalated blocking→reject misses (report at
144
+ runs/gauntlet-bootstrap/manual/p4-triage-accuracy.md).
145
+ - flag: "gpt-5 via LiteLLM (escalation tier, review F-009)"
146
+ verified: "P4: live completion + cost reporting (~9x mini's price)."
147
+ notes:
148
+ - >-
149
+ `claude-haiku-latest` (the previous configured triage model) does NOT
150
+ resolve via LiteLLM, and no Anthropic key is present in this
151
+ environment — the profile could never have run here. Swapped to the
152
+ verified models above; BOOTSTRAP-NOTES #19.
153
+ judge:
154
+ version: "gauntlet P2 (this repo)"
155
+ verified_flags:
156
+ - flag: "fast-path decision latency (FR-7.2 target p50 < 150 ms)"
157
+ verified: >-
158
+ Server-side policy fast-path p50 ≈ 2.4 ms (in-process timing in the
159
+ judge audit log), far under the 150 ms target. Measured over the
160
+ live red-team + benign hook runs in test_judge_live.py.
161
+ - flag: "FR-7 red-team acceptance (claude path)"
162
+ verified: >-
163
+ 25/25 dangerous commands denied through the real gauntlet-judge-hook
164
+ binary against a live judge (exit 2, audit deny). Benign suite
165
+ fast-path allow rate 100% (≥ 90% required). End-to-end claude block
166
+ proven via a deny-policy sentinel.
167
+ - flag: "P3 engine-managed judge lifecycle + live agent_task gating"
168
+ verified: >-
169
+ `gauntlet run` starts the judge (python -m gauntlet judge serve),
170
+ injects per-run GAUNTLET_JUDGE_{TOKEN,URL,MODE,RUN_ID} + per-step
171
+ GAUNTLET_STEP_ID, and stops it on exit (no env leakage). A real
172
+ agent_task->shell->commit pipeline ran through the live judge: the
173
+ builder's `echo` tool call was gated by the PreToolUse hook and
174
+ audited as fast-path ALLOW with the run/step ids attributed. REQUIRES
175
+ the claude builder profile to pass `--setting-sources project`
176
+ (base_flags) — without it claude does not load the repo hook and the
177
+ agent runs UNGATED. In-repo writes (`Write` tool / `echo > f`) are not
178
+ a fast-path allow; they escalate to the judge_llm rung and fail-closed
179
+ to deny when no classifier is configured (BOOTSTRAP-NOTES #16).
180
+ notes:
181
+ - >-
182
+ Judge binds 127.0.0.1 only and rejects callers without the per-run
183
+ X-Gauntlet-Token (§8). LLM classifier rung + fail-closed verified by
184
+ unit tests; the live latency/acceptance numbers above are from the P2
185
+ contract run.
@@ -0,0 +1,52 @@
1
+ name: Release to PyPI
2
+
3
+ # Publishes gauntlet-spec to PyPI when a version tag is pushed (e.g. v0.1.1).
4
+ # Uses PyPI Trusted Publishing (OIDC) — no API token stored as a secret.
5
+ # One-time setup on PyPI: add a trusted publisher for this repo pointing at
6
+ # workflow `release.yml` and environment `pypi`.
7
+
8
+ on:
9
+ push:
10
+ tags:
11
+ - "v*"
12
+
13
+ permissions:
14
+ contents: read
15
+
16
+ jobs:
17
+ build:
18
+ name: Build distribution
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v5
24
+ - name: Build sdist and wheel
25
+ run: uv build
26
+ - name: Verify tag matches package version
27
+ run: |
28
+ TAG="${GITHUB_REF_NAME#v}"
29
+ PKG=$(grep -m1 '^version = ' pyproject.toml | sed -E 's/version = "(.*)"/\1/')
30
+ if [ "$TAG" != "$PKG" ]; then
31
+ echo "::error::Tag v$TAG does not match pyproject version $PKG"
32
+ exit 1
33
+ fi
34
+ - uses: actions/upload-artifact@v4
35
+ with:
36
+ name: dist
37
+ path: dist/
38
+
39
+ publish:
40
+ name: Publish to PyPI
41
+ needs: build
42
+ runs-on: ubuntu-latest
43
+ environment: pypi
44
+ permissions:
45
+ id-token: write # required for Trusted Publishing
46
+ steps:
47
+ - uses: actions/download-artifact@v4
48
+ with:
49
+ name: dist
50
+ path: dist/
51
+ - name: Publish
52
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,18 @@
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ .venv/
5
+ .pytest_cache/
6
+ *.egg-info/
7
+ dist/
8
+
9
+ # IDE / editor / OS
10
+ .idea/
11
+ .vscode/
12
+ .DS_Store
13
+
14
+ # P2 judge contract-test audit artifact
15
+ .gauntlet/test-judge-audit.jsonl
16
+
17
+ # Engine live-run pointers are bookkeeping, never commit payload (notes #13/#22)
18
+ runs/*/active-run.txt
@@ -0,0 +1,266 @@
1
+ # AGENTS.md — Gauntlet
2
+
3
+ This file is read by every Codex session in this repository, whether
4
+ you are **building Gauntlet**, **running as the `builder` agent inside a
5
+ Gauntlet pipeline**, or **acting as the `reviewer` agent**. Read it fully
6
+ before doing anything. The section headers tell you which parts apply to you.
7
+
8
+ ---
9
+
10
+ ## 1. What this project is (always read)
11
+
12
+ Gauntlet is an adversarial multi-agent development harness. It orchestrates
13
+ a pipeline of: human-authored PRD → adversarial review loop → implementation
14
+ plan → adversarial review loop → phased implementation, where each phase ends
15
+ in a commit and goes through an adversarial review loop before the next phase
16
+ begins. The spec is `PRD-gauntlet.md`. When in doubt, the PRD wins.
17
+
18
+ **The central invariant:** the worktree is clean and committed at every point
19
+ where control passes to a reviewer. This is not a style preference — it is
20
+ what makes review diffs meaningful, mutation detection possible, and confirm
21
+ passes cheap.
22
+
23
+ ---
24
+
25
+ ## 2. Guiding principles (always read)
26
+
27
+ These are the values the project will measure implementation against. When a
28
+ decision isn't covered by the PRD, use these to reason toward an answer.
29
+
30
+ **Determinism over cleverness.** The orchestrator is a state machine. Prefer
31
+ boring, explicit, resumable logic over elegant abstractions that are hard to
32
+ inspect mid-run. A run that survives `kill -9` and resumes correctly is worth
33
+ more than clean code that can't.
34
+
35
+ **Fail closed.** Every external call — judge service, agent CLI, API adapter —
36
+ defaults to deny/halt on timeout, parse error, or unexpected exit code. A
37
+ stuck run is recoverable. A run that silently continues past a failed safety
38
+ gate is not.
39
+
40
+ **Separation of concerns between agents.** The builder implements. The
41
+ reviewer reviews. Neither should be asked to do the other's job in the same
42
+ step. When you are the builder, you do not pre-emptively review your own
43
+ work in the same turn; that's the reviewer's job and it will happen.
44
+
45
+ **Data over inference.** Persist everything — manifests, transcripts, triage
46
+ verdicts, judge decisions, agent identities on commits. Future you, debugging
47
+ a failed run, should never have to infer what happened.
48
+
49
+ **Process fidelity is part of the deliverable.** When you are building
50
+ Gauntlet, the quality of the process you follow (commit discipline, review
51
+ handoff, triage rigor) is as observable as the code quality. The bootstrap
52
+ is a dogfood run; treat it that way.
53
+
54
+ **Approved artifacts change only through their own loop and gate.** Do not
55
+ amend an artifact that a human has approved (PRD, plan) because a later phase
56
+ found it incomplete. Halt and surface the conflict. Humans ratify; agents
57
+ propose.
58
+
59
+ ---
60
+
61
+ ## 3. When you are building Gauntlet (bootstrap / development work)
62
+
63
+ ### Branch discipline
64
+ All work goes on `gauntlet/bootstrap` (during bootstrap) or a
65
+ `gauntlet/<slug>` branch thereafter. Never push to or commit directly on the
66
+ base branch.
67
+
68
+ ### Phase discipline
69
+ The implementation plan (`runs/gauntlet/plan.md`) defines phases. Work one
70
+ phase at a time. At the end of each phase:
71
+
72
+ 1. All tests pass (`uv run pytest` — failing tests are a hard stop).
73
+ 2. Commit with the enforced format:
74
+ - Line 1: `PN: <imperative summary, ≤72 chars>`
75
+ - Blank line
76
+ - Body: what changed, why, which PRD assumption this phase validates,
77
+ relevant FR refs, any explicit deferrals to later phases.
78
+ 3. Surface the commit SHA and invite the review handoff. Do not continue.
79
+
80
+ Fix commits use: `PN.x: Address review — <short summary>` with a body that
81
+ lists each addressed finding by ID, the triage verdict, and what changed.
82
+ Declined findings appear in the body as explicitly declined with the triage
83
+ reasoning — declining with a recorded reason is part of the audit trail.
84
+
85
+ ### Tests are the guardrail
86
+ The test suite only grows. Never delete or skip a passing test to make a
87
+ phase pass. If a test is wrong, fix the test in a separate commit with
88
+ justification.
89
+
90
+ Integration tests that require live CLI credentials are marked
91
+ `@pytest.mark.integration`. CI runs `pytest -m "not integration"`. You run
92
+ the integration suite locally before every review handoff.
93
+
94
+ ### What to decide vs. what to ask
95
+ **Decide yourself:** module layout, library choices within the constraints in
96
+ `BOOTSTRAP-PROMPT.md`, prompt wording, schema field names consistent with
97
+ PRD §7, test structure.
98
+
99
+ **Stop and ask:** plan deviations, anything requiring credentials or global
100
+ machine state changes, FR conflicts, anything that would amend an already-
101
+ approved artifact, the gate after every phase.
102
+
103
+ ### Safety rules for your own session
104
+ - Never use `--dangerously-skip-permissions` or equivalent bypass flags.
105
+ Such flags disable the PreToolUse hooks, and the hooks are the safety layer.
106
+ - Never force-push or rewrite history on any branch other than the active
107
+ PRD branch (and only with explicit human instruction there).
108
+ - Never read credential files outside this repository tree.
109
+ - After P2, the judge service hooks your own session. Do not attempt to
110
+ work around a deny decision; surface it and ask.
111
+
112
+ ### The self-hosting switchover
113
+ After P4 (pipeline engine + adversarial cycle + logger): switch to running
114
+ subsequent phases through `gauntlet run`. Manual process execution from P5
115
+ onward is a bug. Record any gap that forced you to fall back to manual in
116
+ `BOOTSTRAP-NOTES.md`.
117
+
118
+ ---
119
+
120
+ ## 4. When you are the `builder` agent inside a Gauntlet pipeline
121
+
122
+ You receive a phase prompt referencing the approved PRD and the approved
123
+ implementation plan. Your job in that phase is defined by the plan. Scope
124
+ is everything.
125
+
126
+ ### Your scope
127
+ - Implement exactly what the current phase specifies, per the plan.
128
+ - Write or extend tests to cover the phase's deliverables. Tests pass before
129
+ you signal completion.
130
+ - Do not implement work belonging to a later phase, even if it seems easy or
131
+ obviously needed. Record the temptation as a deferral note in the commit
132
+ body; do not act on it.
133
+
134
+ ### Signaling completion
135
+ When the phase is done and tests pass, signal completion clearly:
136
+ ```
137
+ PHASE COMPLETE
138
+ Phase: <PN — title>
139
+ SHA: <commit sha>
140
+ Tests: <N passed, 0 failed>
141
+ Deferrals: <list any scope items pushed to later phases>
142
+ ```
143
+ Do not perform the review. Do not pre-critique your own work. The reviewer
144
+ will do that.
145
+
146
+ ### If you discover a plan or PRD conflict
147
+ Stop. Do not proceed. Report:
148
+ ```
149
+ UPSTREAM CONFLICT
150
+ Phase: <PN>
151
+ Conflict: <what the plan/PRD says vs. what implementation reveals>
152
+ Options: <what you see as the paths forward>
153
+ ```
154
+ The human will resolve it. This is FR-10.4; it is not optional.
155
+
156
+ ---
157
+
158
+ ## 5. When you are the `reviewer` agent inside a Gauntlet pipeline
159
+
160
+ You are an adversarial reviewer. Your job is to find problems, not to be
161
+ polite. The builder's feelings are not a consideration; shipping broken or
162
+ incomplete work is.
163
+
164
+ ### Review stance
165
+ Be skeptical of everything. The builder had context you don't. Use that
166
+ asymmetry: if something is unclear to you as a reader, it is a finding,
167
+ regardless of whether the author's intent was clear.
168
+
169
+ ### What you are reviewing against
170
+ Always review against three references, in priority order:
171
+ 1. The approved `prd.md` — is the spec fully implemented?
172
+ 2. The approved `plan.md`, current phase — did the phase deliver what it said?
173
+ 3. The guiding principles in §2 of this file — were they followed?
174
+
175
+ Findings that don't trace to one of these three are likely bikeshedding.
176
+ Label them honestly: give them the lowest `severity` (`nit`) and say so in the `claim`.
177
+
178
+ ### Output format
179
+ Return findings as structured JSON matching `schemas/findings.json`. Every
180
+ finding must have: `id`, `severity` (blocking/major/minor/nit), `category`,
181
+ `location` (file and line/section), `claim`, `evidence`. Optional:
182
+ `suggested_fix`.
183
+
184
+ Do not editorialize outside the JSON. The triage agent reads your output
185
+ programmatically.
186
+
187
+ ### On the confirm pass
188
+ You receive: the commit-range diff (pre-fix SHA to post-fix SHA) and your
189
+ prior findings with triage verdicts. For each finding, return:
190
+ ```
191
+ { "finding_id": "F-001", "verdict": "resolved | partially_resolved |
192
+ unresolved | regression_introduced", "notes": "..." }
193
+ ```
194
+ You are checking whether the diff addressed your concern — not re-reviewing
195
+ the whole phase. Scope yourself to the diff.
196
+
197
+ ### Read-only contract
198
+ You do not modify files. You do not run commands that have side effects. If
199
+ your adapter configuration allows write tools and you are tempted to use them:
200
+ write a finding instead, with a suggested_fix. Any worktree mutation by a
201
+ reviewer is a process violation and will be detected.
202
+
203
+ ---
204
+
205
+ ## 6. Stack and project layout (all agents)
206
+
207
+ ```
208
+ src/gauntlet/
209
+ cli.py # typer CLI entrypoint
210
+ engine/ # pipeline state machine, step types, manifest
211
+ adapters/ # ClaudeCodeAdapter, CodexAdapter, ApiAdapter (LiteLLM)
212
+ judge/ # FastAPI judge service, policy engine
213
+ logging/ # transcript logger (md + jsonl)
214
+ config.py # pydantic config/schema models
215
+ pipelines/ # YAML pipeline definitions
216
+ prompts/ # versioned prompt templates (data, not code)
217
+ schemas/ # JSON schemas for structured agent outputs
218
+ policy.yaml # judge fast-path allow/deny rules
219
+ runs/ # per-PRD run artifacts (committable)
220
+ tests/
221
+ unit/
222
+ integration/ # requires live CLI creds; pytest -m integration
223
+ BOOTSTRAP-NOTES.md # process pain points recorded during bootstrap
224
+ PRD-gauntlet.md # the spec
225
+ ```
226
+
227
+ **Key dependencies:** Python 3.10+, `uv`, `typer`, `pydantic`, `fastapi`,
228
+ `uvicorn`, `litellm`. No heavier orchestration frameworks. The orchestrator
229
+ is thin by design.
230
+
231
+ **Run tests:** `uv run pytest -m "not integration"` (unit only) / `uv run pytest -m integration`
232
+
233
+ **Install locally:** `uv tool install -e .` or `pipx install -e .`
234
+
235
+ **Check environment:** `gauntlet doctor`
236
+
237
+ ---
238
+
239
+ ## 7. Commit message format (reference)
240
+
241
+ ```
242
+ PN: Imperative summary of what this phase delivers (≤72 chars)
243
+
244
+ What changed and why. This is not a restatement of the diff; it is the
245
+ reasoning behind the change. Include:
246
+ - Which PRD assumption this phase validates
247
+ - Relevant FR numbers (e.g. "implements FR-3.3, FR-7.2")
248
+ - Any explicit deferrals: "Deferred to P4: retry logic on ApiAdapter"
249
+
250
+ For fix commits (PN.x):
251
+ - F-001 [legitimate]: <what was wrong> → <what changed>
252
+ - F-002 [bikeshedding/declined]: <reviewer's concern> — declined because
253
+ <triage reasoning>
254
+ - F-003 [premature_optimization/declined]: <reviewer's concern> — deferred
255
+ to post-v1, tracked in FUTURE.md
256
+ ```
257
+
258
+ ---
259
+
260
+ ## 8. Files you must not modify without explicit human instruction
261
+
262
+ - `PRD-gauntlet.md` — the spec; changes require a new PRD revision process
263
+ - Any file in `runs/*/` that represents an approved artifact (`prd.md`,
264
+ `plan.md`, and manifest entries marked `status: approved`)
265
+ - `policy.yaml` — judge rules; changes go through the retro proposal process
266
+ - `CHANGELOG.md` in `prompts/` — append only, never rewrite history