autonomous-coding-toolkit 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. package/.claude-plugin/marketplace.json +22 -0
  2. package/.claude-plugin/plugin.json +13 -0
  3. package/LICENSE +21 -0
  4. package/Makefile +21 -0
  5. package/README.md +140 -0
  6. package/SECURITY.md +28 -0
  7. package/agents/bash-expert.md +113 -0
  8. package/agents/dependency-auditor.md +138 -0
  9. package/agents/integration-tester.md +120 -0
  10. package/agents/lesson-scanner.md +149 -0
  11. package/agents/python-expert.md +179 -0
  12. package/agents/service-monitor.md +141 -0
  13. package/agents/shell-expert.md +147 -0
  14. package/benchmarks/runner.sh +147 -0
  15. package/benchmarks/tasks/01-rest-endpoint/rubric.sh +29 -0
  16. package/benchmarks/tasks/01-rest-endpoint/task.md +17 -0
  17. package/benchmarks/tasks/02-refactor-module/task.md +8 -0
  18. package/benchmarks/tasks/03-fix-integration-bug/task.md +8 -0
  19. package/benchmarks/tasks/04-add-test-coverage/task.md +8 -0
  20. package/benchmarks/tasks/05-multi-file-feature/task.md +8 -0
  21. package/bin/act.js +238 -0
  22. package/commands/autocode.md +6 -0
  23. package/commands/cancel-ralph.md +18 -0
  24. package/commands/code-factory.md +53 -0
  25. package/commands/create-prd.md +55 -0
  26. package/commands/ralph-loop.md +18 -0
  27. package/commands/run-plan.md +117 -0
  28. package/commands/submit-lesson.md +122 -0
  29. package/docs/ARCHITECTURE.md +630 -0
  30. package/docs/CONTRIBUTING.md +125 -0
  31. package/docs/lessons/0001-bare-exception-swallowing.md +34 -0
  32. package/docs/lessons/0002-async-def-without-await.md +28 -0
  33. package/docs/lessons/0003-create-task-without-callback.md +28 -0
  34. package/docs/lessons/0004-hardcoded-test-counts.md +28 -0
  35. package/docs/lessons/0005-sqlite-without-closing.md +33 -0
  36. package/docs/lessons/0006-venv-pip-path.md +27 -0
  37. package/docs/lessons/0007-runner-state-self-rejection.md +35 -0
  38. package/docs/lessons/0008-quality-gate-blind-spot.md +33 -0
  39. package/docs/lessons/0009-parser-overcount-empty-batches.md +36 -0
  40. package/docs/lessons/0010-local-outside-function-bash.md +33 -0
  41. package/docs/lessons/0011-batch-tests-for-unimplemented-code.md +36 -0
  42. package/docs/lessons/0012-api-markdown-unescaped-chars.md +33 -0
  43. package/docs/lessons/0013-export-prefix-env-parsing.md +33 -0
  44. package/docs/lessons/0014-decorator-registry-import-side-effect.md +43 -0
  45. package/docs/lessons/0015-frontend-backend-schema-drift.md +43 -0
  46. package/docs/lessons/0016-event-driven-cold-start-seeding.md +44 -0
  47. package/docs/lessons/0017-copy-paste-logic-diverges.md +43 -0
  48. package/docs/lessons/0018-layer-passes-pipeline-broken.md +45 -0
  49. package/docs/lessons/0019-systemd-envfile-ignores-export.md +41 -0
  50. package/docs/lessons/0020-persist-state-incrementally.md +44 -0
  51. package/docs/lessons/0021-dual-axis-testing.md +48 -0
  52. package/docs/lessons/0022-jsx-factory-shadowing.md +43 -0
  53. package/docs/lessons/0023-static-analysis-spiral.md +51 -0
  54. package/docs/lessons/0024-shared-pipeline-implementation.md +55 -0
  55. package/docs/lessons/0025-defense-in-depth-all-entry-points.md +65 -0
  56. package/docs/lessons/0026-linter-no-rules-false-enforcement.md +54 -0
  57. package/docs/lessons/0027-jsx-silent-prop-drop.md +64 -0
  58. package/docs/lessons/0028-no-infrastructure-in-client-code.md +49 -0
  59. package/docs/lessons/0029-never-write-secrets-to-files.md +61 -0
  60. package/docs/lessons/0030-cache-merge-not-replace.md +62 -0
  61. package/docs/lessons/0031-verify-units-at-boundaries.md +66 -0
  62. package/docs/lessons/0032-module-lifecycle-subscribe-unsubscribe.md +89 -0
  63. package/docs/lessons/0033-async-iteration-mutable-snapshot.md +72 -0
  64. package/docs/lessons/0034-caller-missing-await-silent-discard.md +65 -0
  65. package/docs/lessons/0035-duplicate-registration-silent-overwrite.md +85 -0
  66. package/docs/lessons/0036-websocket-dirty-disconnect.md +33 -0
  67. package/docs/lessons/0037-parallel-agents-worktree-corruption.md +31 -0
  68. package/docs/lessons/0038-subscribe-no-stored-ref.md +36 -0
  69. package/docs/lessons/0039-fallback-or-default-hides-bugs.md +34 -0
  70. package/docs/lessons/0040-event-firehose-filter-first.md +36 -0
  71. package/docs/lessons/0041-ambiguous-base-dir-path-nesting.md +32 -0
  72. package/docs/lessons/0042-spec-compliance-insufficient.md +36 -0
  73. package/docs/lessons/0043-exact-count-extensible-collections.md +32 -0
  74. package/docs/lessons/0044-relative-file-deps-worktree.md +39 -0
  75. package/docs/lessons/0045-iterative-design-improvement.md +33 -0
  76. package/docs/lessons/0046-plan-assertion-math-bugs.md +38 -0
  77. package/docs/lessons/0047-pytest-single-threaded-default.md +37 -0
  78. package/docs/lessons/0048-integration-wiring-batch.md +40 -0
  79. package/docs/lessons/0049-ab-verification.md +41 -0
  80. package/docs/lessons/0050-editing-sourced-files-during-execution.md +33 -0
  81. package/docs/lessons/0051-infrastructure-fixes-cant-self-heal.md +30 -0
  82. package/docs/lessons/0052-uncommitted-changes-poison-quality-gates.md +31 -0
  83. package/docs/lessons/0053-jq-compact-flag-inconsistency.md +31 -0
  84. package/docs/lessons/0054-parser-matches-inside-code-blocks.md +30 -0
  85. package/docs/lessons/0055-agents-compensate-for-garbled-prompts.md +31 -0
  86. package/docs/lessons/0056-grep-count-exit-code-on-zero.md +42 -0
  87. package/docs/lessons/0057-new-artifacts-break-git-clean-gates.md +42 -0
  88. package/docs/lessons/0058-dead-config-keys-never-consumed.md +49 -0
  89. package/docs/lessons/0059-contract-test-shared-structures.md +53 -0
  90. package/docs/lessons/0060-set-e-silent-death-in-runners.md +53 -0
  91. package/docs/lessons/0061-context-injection-dirty-state.md +50 -0
  92. package/docs/lessons/0062-sibling-bug-neighborhood-scan.md +29 -0
  93. package/docs/lessons/0063-one-flag-two-lifetimes.md +31 -0
  94. package/docs/lessons/0064-test-passes-wrong-reason.md +31 -0
  95. package/docs/lessons/0065-pipefail-grep-count-double-output.md +39 -0
  96. package/docs/lessons/0066-local-keyword-outside-function.md +37 -0
  97. package/docs/lessons/0067-stdin-hang-non-interactive-shell.md +36 -0
  98. package/docs/lessons/0068-agent-builds-wrong-thing-correctly.md +31 -0
  99. package/docs/lessons/0069-plan-quality-dominates-execution.md +30 -0
  100. package/docs/lessons/0070-spec-echo-back-prevents-drift.md +31 -0
  101. package/docs/lessons/0071-positive-instructions-outperform-negative.md +30 -0
  102. package/docs/lessons/0072-lost-in-the-middle-context-placement.md +30 -0
  103. package/docs/lessons/0073-unscoped-lessons-cause-false-positives.md +30 -0
  104. package/docs/lessons/0074-stale-context-injection-wrong-batch.md +32 -0
  105. package/docs/lessons/0075-research-artifacts-must-persist.md +32 -0
  106. package/docs/lessons/0076-wrong-decomposition-contaminates-downstream.md +30 -0
  107. package/docs/lessons/0077-cherry-pick-merges-need-manual-resolution.md +30 -0
  108. package/docs/lessons/0078-static-review-without-live-test.md +30 -0
  109. package/docs/lessons/0079-integration-wiring-batch-required.md +32 -0
  110. package/docs/lessons/FRAMEWORK.md +161 -0
  111. package/docs/lessons/SUMMARY.md +201 -0
  112. package/docs/lessons/TEMPLATE.md +85 -0
  113. package/docs/plans/2026-02-21-code-factory-v2-design.md +204 -0
  114. package/docs/plans/2026-02-21-code-factory-v2-implementation-plan.md +2189 -0
  115. package/docs/plans/2026-02-21-code-factory-v2-phase4-design.md +537 -0
  116. package/docs/plans/2026-02-21-code-factory-v2-phase4-implementation-plan.md +2012 -0
  117. package/docs/plans/2026-02-21-hardening-pass-design.md +108 -0
  118. package/docs/plans/2026-02-21-hardening-pass-plan.md +1378 -0
  119. package/docs/plans/2026-02-21-mab-research-report.md +406 -0
  120. package/docs/plans/2026-02-21-marketplace-restructure-design.md +240 -0
  121. package/docs/plans/2026-02-21-marketplace-restructure-plan.md +832 -0
  122. package/docs/plans/2026-02-21-phase4-completion-plan.md +697 -0
  123. package/docs/plans/2026-02-21-validator-suite-design.md +148 -0
  124. package/docs/plans/2026-02-21-validator-suite-plan.md +540 -0
  125. package/docs/plans/2026-02-22-mab-research-round2.md +556 -0
  126. package/docs/plans/2026-02-22-mab-run-design.md +462 -0
  127. package/docs/plans/2026-02-22-mab-run-plan.md +2046 -0
  128. package/docs/plans/2026-02-22-operations-design-methodology-research.md +681 -0
  129. package/docs/plans/2026-02-22-research-agent-failure-taxonomy.md +532 -0
  130. package/docs/plans/2026-02-22-research-code-guideline-policies.md +886 -0
  131. package/docs/plans/2026-02-22-research-codebase-audit-refactoring.md +908 -0
  132. package/docs/plans/2026-02-22-research-coding-standards-documentation.md +541 -0
  133. package/docs/plans/2026-02-22-research-competitive-landscape.md +687 -0
  134. package/docs/plans/2026-02-22-research-comprehensive-testing.md +1076 -0
  135. package/docs/plans/2026-02-22-research-context-utilization.md +459 -0
  136. package/docs/plans/2026-02-22-research-cost-quality-tradeoff.md +548 -0
  137. package/docs/plans/2026-02-22-research-lesson-transferability.md +508 -0
  138. package/docs/plans/2026-02-22-research-multi-agent-coordination.md +312 -0
  139. package/docs/plans/2026-02-22-research-phase-integration.md +602 -0
  140. package/docs/plans/2026-02-22-research-plan-quality.md +428 -0
  141. package/docs/plans/2026-02-22-research-prompt-engineering.md +558 -0
  142. package/docs/plans/2026-02-22-research-unconventional-perspectives.md +528 -0
  143. package/docs/plans/2026-02-22-research-user-adoption.md +638 -0
  144. package/docs/plans/2026-02-22-research-verification-effectiveness.md +433 -0
  145. package/docs/plans/2026-02-23-agent-suite-design.md +299 -0
  146. package/docs/plans/2026-02-23-agent-suite-plan.md +578 -0
  147. package/docs/plans/2026-02-23-phase3-cost-infrastructure-design.md +148 -0
  148. package/docs/plans/2026-02-23-phase3-cost-infrastructure-plan.md +1062 -0
  149. package/docs/plans/2026-02-23-research-bash-expert-agent.md +543 -0
  150. package/docs/plans/2026-02-23-research-dependency-auditor-agent.md +564 -0
  151. package/docs/plans/2026-02-23-research-improving-existing-agents.md +503 -0
  152. package/docs/plans/2026-02-23-research-integration-tester-agent.md +454 -0
  153. package/docs/plans/2026-02-23-research-python-expert-agent.md +429 -0
  154. package/docs/plans/2026-02-23-research-service-monitor-agent.md +425 -0
  155. package/docs/plans/2026-02-23-research-shell-expert-agent.md +533 -0
  156. package/docs/plans/2026-02-23-roadmap-to-completion.md +530 -0
  157. package/docs/plans/2026-02-24-headless-module-split-design.md +98 -0
  158. package/docs/plans/2026-02-24-headless-module-split.md +443 -0
  159. package/docs/plans/2026-02-24-lesson-scope-metadata-design.md +228 -0
  160. package/docs/plans/2026-02-24-lesson-scope-metadata-plan.md +968 -0
  161. package/docs/plans/2026-02-24-npm-packaging-design.md +841 -0
  162. package/docs/plans/2026-02-24-npm-packaging-plan.md +1965 -0
  163. package/docs/plans/audit-findings.md +186 -0
  164. package/docs/telegram-notification-format.md +98 -0
  165. package/examples/example-plan.md +51 -0
  166. package/examples/example-prd.json +72 -0
  167. package/examples/example-roadmap.md +33 -0
  168. package/examples/quickstart-plan.md +63 -0
  169. package/hooks/hooks.json +26 -0
  170. package/hooks/setup-symlinks.sh +48 -0
  171. package/hooks/stop-hook.sh +135 -0
  172. package/package.json +47 -0
  173. package/policies/bash.md +71 -0
  174. package/policies/python.md +71 -0
  175. package/policies/testing.md +61 -0
  176. package/policies/universal.md +60 -0
  177. package/scripts/analyze-report.sh +97 -0
  178. package/scripts/architecture-map.sh +145 -0
  179. package/scripts/auto-compound.sh +273 -0
  180. package/scripts/batch-audit.sh +42 -0
  181. package/scripts/batch-test.sh +101 -0
  182. package/scripts/entropy-audit.sh +221 -0
  183. package/scripts/failure-digest.sh +51 -0
  184. package/scripts/generate-ast-rules.sh +96 -0
  185. package/scripts/init.sh +112 -0
  186. package/scripts/lesson-check.sh +428 -0
  187. package/scripts/lib/common.sh +61 -0
  188. package/scripts/lib/cost-tracking.sh +153 -0
  189. package/scripts/lib/ollama.sh +60 -0
  190. package/scripts/lib/progress-writer.sh +128 -0
  191. package/scripts/lib/run-plan-context.sh +215 -0
  192. package/scripts/lib/run-plan-echo-back.sh +231 -0
  193. package/scripts/lib/run-plan-headless.sh +396 -0
  194. package/scripts/lib/run-plan-notify.sh +57 -0
  195. package/scripts/lib/run-plan-parser.sh +81 -0
  196. package/scripts/lib/run-plan-prompt.sh +215 -0
  197. package/scripts/lib/run-plan-quality-gate.sh +132 -0
  198. package/scripts/lib/run-plan-routing.sh +315 -0
  199. package/scripts/lib/run-plan-sampling.sh +170 -0
  200. package/scripts/lib/run-plan-scoring.sh +146 -0
  201. package/scripts/lib/run-plan-state.sh +142 -0
  202. package/scripts/lib/run-plan-team.sh +199 -0
  203. package/scripts/lib/telegram.sh +54 -0
  204. package/scripts/lib/thompson-sampling.sh +176 -0
  205. package/scripts/license-check.sh +74 -0
  206. package/scripts/mab-run.sh +575 -0
  207. package/scripts/module-size-check.sh +146 -0
  208. package/scripts/patterns/async-no-await.yml +5 -0
  209. package/scripts/patterns/bare-except.yml +6 -0
  210. package/scripts/patterns/empty-catch.yml +6 -0
  211. package/scripts/patterns/hardcoded-localhost.yml +9 -0
  212. package/scripts/patterns/retry-loop-no-backoff.yml +12 -0
  213. package/scripts/pipeline-status.sh +197 -0
  214. package/scripts/policy-check.sh +226 -0
  215. package/scripts/prior-art-search.sh +133 -0
  216. package/scripts/promote-mab-lessons.sh +126 -0
  217. package/scripts/prompts/agent-a-superpowers.md +29 -0
  218. package/scripts/prompts/agent-b-ralph.md +29 -0
  219. package/scripts/prompts/judge-agent.md +61 -0
  220. package/scripts/prompts/planner-agent.md +44 -0
  221. package/scripts/pull-community-lessons.sh +90 -0
  222. package/scripts/quality-gate.sh +266 -0
  223. package/scripts/research-gate.sh +90 -0
  224. package/scripts/run-plan.sh +329 -0
  225. package/scripts/scope-infer.sh +159 -0
  226. package/scripts/setup-ralph-loop.sh +155 -0
  227. package/scripts/telemetry.sh +230 -0
  228. package/scripts/tests/run-all-tests.sh +52 -0
  229. package/scripts/tests/test-act-cli.sh +46 -0
  230. package/scripts/tests/test-agents-md.sh +87 -0
  231. package/scripts/tests/test-analyze-report.sh +114 -0
  232. package/scripts/tests/test-architecture-map.sh +89 -0
  233. package/scripts/tests/test-auto-compound.sh +169 -0
  234. package/scripts/tests/test-batch-test.sh +65 -0
  235. package/scripts/tests/test-benchmark-runner.sh +25 -0
  236. package/scripts/tests/test-common.sh +168 -0
  237. package/scripts/tests/test-cost-tracking.sh +158 -0
  238. package/scripts/tests/test-echo-back.sh +180 -0
  239. package/scripts/tests/test-entropy-audit.sh +146 -0
  240. package/scripts/tests/test-failure-digest.sh +66 -0
  241. package/scripts/tests/test-generate-ast-rules.sh +145 -0
  242. package/scripts/tests/test-helpers.sh +82 -0
  243. package/scripts/tests/test-init.sh +47 -0
  244. package/scripts/tests/test-lesson-check.sh +278 -0
  245. package/scripts/tests/test-lesson-local.sh +55 -0
  246. package/scripts/tests/test-license-check.sh +109 -0
  247. package/scripts/tests/test-mab-run.sh +182 -0
  248. package/scripts/tests/test-ollama-lib.sh +49 -0
  249. package/scripts/tests/test-ollama.sh +60 -0
  250. package/scripts/tests/test-pipeline-status.sh +198 -0
  251. package/scripts/tests/test-policy-check.sh +124 -0
  252. package/scripts/tests/test-prior-art-search.sh +96 -0
  253. package/scripts/tests/test-progress-writer.sh +140 -0
  254. package/scripts/tests/test-promote-mab-lessons.sh +110 -0
  255. package/scripts/tests/test-pull-community-lessons.sh +149 -0
  256. package/scripts/tests/test-quality-gate.sh +241 -0
  257. package/scripts/tests/test-research-gate.sh +132 -0
  258. package/scripts/tests/test-run-plan-cli.sh +86 -0
  259. package/scripts/tests/test-run-plan-context.sh +305 -0
  260. package/scripts/tests/test-run-plan-e2e.sh +153 -0
  261. package/scripts/tests/test-run-plan-headless.sh +424 -0
  262. package/scripts/tests/test-run-plan-notify.sh +124 -0
  263. package/scripts/tests/test-run-plan-parser.sh +217 -0
  264. package/scripts/tests/test-run-plan-prompt.sh +254 -0
  265. package/scripts/tests/test-run-plan-quality-gate.sh +222 -0
  266. package/scripts/tests/test-run-plan-routing.sh +178 -0
  267. package/scripts/tests/test-run-plan-scoring.sh +148 -0
  268. package/scripts/tests/test-run-plan-state.sh +261 -0
  269. package/scripts/tests/test-run-plan-team.sh +157 -0
  270. package/scripts/tests/test-scope-infer.sh +150 -0
  271. package/scripts/tests/test-setup-ralph-loop.sh +63 -0
  272. package/scripts/tests/test-telegram-env.sh +38 -0
  273. package/scripts/tests/test-telegram.sh +121 -0
  274. package/scripts/tests/test-telemetry.sh +46 -0
  275. package/scripts/tests/test-thompson-sampling.sh +139 -0
  276. package/scripts/tests/test-validate-all.sh +60 -0
  277. package/scripts/tests/test-validate-commands.sh +89 -0
  278. package/scripts/tests/test-validate-hooks.sh +98 -0
  279. package/scripts/tests/test-validate-lessons.sh +150 -0
  280. package/scripts/tests/test-validate-plan-quality.sh +235 -0
  281. package/scripts/tests/test-validate-plans.sh +187 -0
  282. package/scripts/tests/test-validate-plugin.sh +106 -0
  283. package/scripts/tests/test-validate-prd.sh +184 -0
  284. package/scripts/tests/test-validate-skills.sh +134 -0
  285. package/scripts/validate-all.sh +57 -0
  286. package/scripts/validate-commands.sh +67 -0
  287. package/scripts/validate-hooks.sh +89 -0
  288. package/scripts/validate-lessons.sh +98 -0
  289. package/scripts/validate-plan-quality.sh +369 -0
  290. package/scripts/validate-plans.sh +120 -0
  291. package/scripts/validate-plugin.sh +86 -0
  292. package/scripts/validate-policies.sh +42 -0
  293. package/scripts/validate-prd.sh +118 -0
  294. package/scripts/validate-skills.sh +96 -0
  295. package/skills/autocode/SKILL.md +285 -0
  296. package/skills/autocode/ab-verification.md +51 -0
  297. package/skills/autocode/code-quality-standards.md +37 -0
  298. package/skills/autocode/competitive-mode.md +364 -0
  299. package/skills/brainstorming/SKILL.md +97 -0
  300. package/skills/capture-lesson/SKILL.md +187 -0
  301. package/skills/check-lessons/SKILL.md +116 -0
  302. package/skills/dispatching-parallel-agents/SKILL.md +110 -0
  303. package/skills/executing-plans/SKILL.md +85 -0
  304. package/skills/finishing-a-development-branch/SKILL.md +201 -0
  305. package/skills/receiving-code-review/SKILL.md +72 -0
  306. package/skills/requesting-code-review/SKILL.md +59 -0
  307. package/skills/requesting-code-review/code-reviewer.md +82 -0
  308. package/skills/research/SKILL.md +145 -0
  309. package/skills/roadmap/SKILL.md +115 -0
  310. package/skills/subagent-driven-development/SKILL.md +98 -0
  311. package/skills/subagent-driven-development/code-quality-reviewer-prompt.md +18 -0
  312. package/skills/subagent-driven-development/implementer-prompt.md +73 -0
  313. package/skills/subagent-driven-development/spec-reviewer-prompt.md +57 -0
  314. package/skills/systematic-debugging/SKILL.md +134 -0
  315. package/skills/systematic-debugging/condition-based-waiting.md +64 -0
  316. package/skills/systematic-debugging/defense-in-depth.md +32 -0
  317. package/skills/systematic-debugging/root-cause-tracing.md +55 -0
  318. package/skills/test-driven-development/SKILL.md +167 -0
  319. package/skills/using-git-worktrees/SKILL.md +219 -0
  320. package/skills/using-superpowers/SKILL.md +54 -0
  321. package/skills/verification-before-completion/SKILL.md +140 -0
  322. package/skills/verify/SKILL.md +82 -0
  323. package/skills/writing-plans/SKILL.md +128 -0
  324. package/skills/writing-skills/SKILL.md +93 -0
@@ -0,0 +1,528 @@
1
+ # Research: Unconventional Perspectives on Autonomous Coding
2
+
3
+ > **Date:** 2026-02-22
4
+ > **Status:** Research complete
5
+ > **Method:** Cross-domain analogy mining + synthesis of 10 parallel research papers
6
+
7
+ ## Executive Summary
8
+
9
+ Three insights that none of the other 10 research papers would have found:
10
+
11
+ 1. **The toolkit is an immune system, not an assembly line.** Every other paper treats the pipeline as a manufacturing process (input -> stages -> output). But the lesson system, quality gates, and failure-pattern learning more closely resemble an adaptive immune system -- where each encountered bug produces "antibodies" (lessons) that provide lasting protection. This reframe changes the design priorities: invest in immune memory diversity and speed-of-response, not in pipeline throughput optimization.
12
+
13
+ 2. **Competitive mode is a jazz ensemble, not a tournament.** The competitive-landscape and multi-agent papers frame dual-track execution as two agents competing, with a judge picking a winner. But the mandatory best-of-both synthesis means the real pattern is _improvisation within structure_ -- like a jazz combo where two soloists play over the same changes, and the bandleader (judge) weaves together the best phrases from each. This reframe suggests the judge should be a _synthesizer_, not a _scorer_.
14
+
15
+ 3. **The single biggest blind spot across all 10 papers is _succession_.** Ecological succession describes how pioneer species colonize barren ground and create conditions for more complex species. The toolkit treats every batch as equal. But batch 1 of any plan is a pioneer -- it creates the files, tests, and structures that all subsequent batches depend on. A failure in batch 1 cascades differently than a failure in batch 6. No paper addresses batch-position-dependent failure dynamics or pioneer-batch hardening.
16
+
17
+ ---
18
+
19
+ ## Per-Topic Unconventional Additions
20
+
21
+ ### 1. Plan Quality -- Military Mission Command (Auftragstaktik)
22
+
23
+ #### The Analogy
24
+
25
+ In German military doctrine, _Auftragstaktik_ (mission command) distinguishes between the _Auftrag_ (the objective and intent) and the _Befehl_ (specific orders). Subordinate commanders receive the intent ("take that hill by dawn") and choose their own methods. This doctrine emerged because fog of war makes detailed plans obsolete within minutes of contact with the enemy.
26
+
27
+ Helmuth von Moltke the Elder's famous observation, as usually paraphrased: "No plan survives contact with the enemy." The autonomous coding equivalent: no plan survives contact with the actual codebase. The plan-quality paper documents this -- stale plans, no-op tasks, batches that encounter code already changed by earlier batches. These are textbook friction (Clausewitz's _Friktion_) -- the gap between theory and execution caused by real-world complexity.
28
+
29
+ #### What It Adds to the Other Paper
30
+
31
+ The plan-quality paper recommends "structured intent" over "complete code in plan" -- provide the contract, not the implementation. This is exactly Auftragstaktik. But the paper stops at the plan level. Mission command goes further: it requires understanding intent _two levels up_. Each subordinate knows not just their own objective but their commander's objective and their commander's commander's objective. This way, when the plan breaks, the subordinate can make intelligent decisions about what to do next.
32
+
33
+ The plan-quality paper's proposed task template has `Contract`, `Test`, `Verify`, and `Constraints`. What's missing is `Intent` -- why this task exists in the context of the whole feature. When a batch fails and the agent retries, it needs to know not just "what to do" but "why we're doing this" to make intelligent adaptations.
34
+
35
+ #### Concrete Design Implication
36
+
37
+ Add an `Intent` field to the plan task template:
38
+
39
+ ```markdown
40
+ ### Task N: [Name]
41
+ **Intent:** This task exists because [feature goal] requires [capability]. In the larger plan, this task enables Batch N+1 to [downstream dependency].
42
+ ```
43
+
44
+ This gives the retry agent the same decision-making power that Auftragstaktik gives to a field commander: freedom to deviate from the specific method while maintaining alignment with the larger objective.
45
+
46
+ ---
47
+
48
+ ### 2. Prompt Engineering -- Musical Rehearsal Marks and Call-and-Response
49
+
50
+ #### The Analogy
51
+
52
+ In orchestral music, a conductor doesn't tell violinists which fingers to use. The score provides _rehearsal marks_ -- structural signposts (letters A, B, C or measure numbers) that let performers navigate the piece. When something goes wrong in performance, the conductor calls out "from letter C!" and everyone resynchronizes.
53
+
54
+ In jazz, _call-and-response_ is a conversational structure where one musician plays a phrase (the "call") and another answers it (the "response"). This creates structure without rigidity -- the response must be _related_ to the call but is not predetermined.
55
+
56
+ #### What It Adds to the Other Paper
57
+
58
+ The prompt-engineering paper recommends structured planning instructions (+4% on SWE-bench). But it treats the prompt as a monologue -- a set of instructions the agent receives and follows. Musicians know that performance quality depends on _navigation structure_, not instruction density.
59
+
60
+ The top SWE-bench agents (SWE-agent, OpenHands) use 5-phase workflows that function as rehearsal marks: Explore, Analyze, Test, Implement, Verify. These aren't detailed instructions -- they're structural landmarks that keep the agent oriented. The prompt paper identifies this but doesn't name the principle.
61
+
62
+ The call-and-response pattern maps to retry prompts. Currently, the retry escalation _tells_ the agent what went wrong. A call-and-response structure would _ask_ the agent to articulate what went wrong first, then provide the failure digest as confirmation or correction. Research on self-correction confirms this: "ask yourself what went wrong" prompts outperform "be aware that you failed" prompts.
63
+
64
+ #### Concrete Design Implication
65
+
66
+ Structure the batch prompt as rehearsal marks, not instructions:
67
+
68
+ ```
69
+ [A] INVESTIGATE — Read the files you'll modify. Note discrepancies with the plan.
70
+ [B] TEST-FIRST — Write failing tests for each task. Confirm they fail.
71
+ [C] IMPLEMENT — Make each test pass. One task at a time.
72
+ [D] VERIFY — Run the full quality gate. All tests must pass.
73
+ [E] COMMIT — Commit completed work. Update progress.txt.
74
+ ```
75
+
76
+ On retry, use call-and-response: "Before reading the failure digest, describe in one sentence what you think went wrong. Then read the digest below and compare."
77
+
78
+ ---
79
+
80
+ ### 3. Context Utilization -- Ecological Carrying Capacity
81
+
82
+ #### The Analogy
83
+
84
+ In ecology, _carrying capacity_ (K) is the maximum population an environment can sustain. Below K, population grows. At K, resources are fully utilized. Above K, the population crashes as resources are exhausted.
85
+
86
+ The context window is a habitat with carrying capacity. The "population" is tokens of information. Below carrying capacity, adding more tokens improves the agent's performance (more relevant context = better decisions). At carrying capacity, the agent is using context optimally. Above carrying capacity, performance crashes -- the "context rot" documented in the Chroma study.
87
+
88
+ #### What It Adds to the Other Paper
89
+
90
+ The context-utilization paper documents the degradation curve and recommends a 6000-10000 char injection budget. But it treats context as a single quantity ("more is better up to a point, then worse"). Ecological carrying capacity has a more nuanced dynamic: _what_ occupies the capacity matters as much as _how much_.
91
+
92
+ In ecology, invasive species crowd out native species, reducing ecosystem productivity even below carrying capacity. In context windows, irrelevant injected context is an invasive species -- it consumes attention budget without contributing to task performance. The Factory.ai finding that "indiscriminate context stuffing" is counterproductive is the token-ecology equivalent of an invasive species outbreak.
93
+
94
+ The ecological insight is that healthy ecosystems have _niche partitioning_ -- different species occupy different ecological niches without competing. Context sections should occupy different informational niches: progress.txt occupies the "what happened before" niche, failure patterns occupy the "what to avoid" niche, referenced files occupy the "what exists now" niche. When two sections compete for the same niche (e.g., progress notes and git log both conveying "recent history"), one should be pruned.
95
+
96
+ #### Concrete Design Implication
97
+
98
+ Audit the context assembler for _niche overlap_. Currently, `run-plan-context.sh` injects recent commits AND progress notes -- both serve the "recent history" niche. Either merge them (a structured summary combining commit messages with progress notes) or prune one. The goal is not to fill the carrying capacity -- it's to maximize _niche diversity_ per token.
99
+
100
+ ---
101
+
102
+ ### 4. Competitive Landscape -- Niche Partitioning in Ecology
103
+
104
+ #### The Analogy
105
+
106
+ In ecology, Gause's competitive exclusion principle states that two species competing for the identical niche cannot coexist indefinitely -- one will outcompete the other. But species _can_ coexist if they partition the niche -- occupying slightly different roles in the same ecosystem. Darwin's finches survived by partitioning the seed-eating niche: different beak shapes for different seed sizes.
107
+
108
+ #### What It Adds to the Other Paper
109
+
110
+ The competitive-landscape paper positions the toolkit against 10 competitors but frames the competition spatially ("unique features vs. gaps"). Ecology would frame it temporally and dynamically. The market is not a static feature matrix -- it's an evolving ecosystem where competitive exclusion will eliminate tools that occupy the same niche as stronger competitors.
111
+
112
+ The toolkit's niche is _pipeline orchestration for Claude Code power users_. No competitor occupies this exact niche. But the niche is adjacent to Claude Code itself (which could absorb these features) and to Devin (which provides autonomous execution for a broader audience). The ecological question is: will the toolkit's niche remain viable, or will adjacent species (Claude Code with built-in pipeline features, Devin with better quality gates) consume it?
113
+
114
+ The defensive strategy from ecology is _niche hardening_ -- making the toolkit's niche deeper and more specialized so that generalist competitors can't easily subsume it. The lesson system, competitive dual-track, and batch-type-aware prompt selection are all niche-hardening features.
115
+
116
+ #### Concrete Design Implication
117
+
118
+ Stop chasing feature parity with competitors (IDE support, cloud hosting, multi-model). Instead, deepen the niche: more lesson types, richer failure pattern learning, cross-project lesson sharing, plan quality scoring. These are features that generalist tools won't build because they serve too narrow an audience -- which is exactly the point.
119
+
120
+ ---
121
+
122
+ ### 5. Agent Failure Taxonomy -- Swiss Cheese Model (Aviation Safety)
123
+
124
+ #### The Analogy
125
+
126
+ James Reason's Swiss cheese model of accident causation describes how safety barriers are like slices of Swiss cheese: each has holes (weaknesses), but the holes are in different places. An accident occurs only when the holes in multiple slices align, allowing a hazard to pass through all barriers.
127
+
128
+ In aviation, barriers include: pilot training, checklists, crew resource management, air traffic control, maintenance inspections, and aircraft design redundancy. No single barrier is sufficient. Safety comes from the _orthogonality_ of barriers -- different barriers catch different failure types.
129
+
130
+ #### What It Adds to the Other Paper
131
+
132
+ The failure-taxonomy paper identifies six failure clusters and maps them to the toolkit's coverage, finding 40-55% of failures uncovered. But it treats each failure type independently. The Swiss cheese model adds the insight that failure _combinations_ matter more than individual failures.
133
+
134
+ The toolkit's barriers are: lesson-check (syntactic anti-patterns), test suite (behavioral correctness), ast-grep (structural patterns), test-count monotonicity (test integrity), git-clean (completeness), and the verification stage (spec compliance). These are well-differentiated slices. But the paper identifies three _uncovered failure types_ (specification misunderstanding, planning errors, context degradation) -- these are holes that exist in _every_ slice simultaneously. No current barrier catches "agent solving the wrong problem."
135
+
136
+ The Swiss cheese insight is that adding more barriers of the same type (more linting rules, more grep patterns) doesn't close these aligned holes. You need a fundamentally different type of barrier -- one that operates at the specification level, not the implementation level.
137
+
138
+ #### Concrete Design Implication
139
+
140
+ Add a "specification echo-back" barrier -- a new slice of Swiss cheese that catches specification misunderstanding. Before executing code, the agent restates the task intent in its own words. Comparing the agent's restatement against the plan's intent section -- semantically, since a literal text diff of a paraphrase will always differ -- catches misalignment before implementation begins. This is a different _type_ of barrier, not more of the same type.
141
+
142
+ ---
143
+
144
+ ### 6. Verification Effectiveness -- Epidemiological Contact Tracing
145
+
146
+ #### The Analogy
147
+
148
+ When a disease outbreak occurs, epidemiologists perform _contact tracing_ -- working backward from confirmed cases to identify everyone the infected person contacted. The goal is to find and isolate potential carriers before they spread the disease further.
149
+
150
+ R0 (basic reproduction number) measures how many new infections each case generates. If R0 > 1, the outbreak grows exponentially. If R0 < 1, it dies out.
151
+
152
+ #### What It Adds to the Other Paper
153
+
154
+ The verification paper quantifies bug detection rates for each pipeline stage. But it treats each bug as independent. Bugs in code, like diseases in populations, have an R0. A bug in a shared utility function has high R0 -- it infects every module that imports it. A bug in a leaf function has R0 near zero -- it stays local.
155
+
156
+ The toolkit's quality gates check for bugs but don't trace their _spread_. When a bug is found in batch 5, the gates don't check whether batches 1-4 introduced shared code that propagates the bug to other modules. This is the equivalent of treating a COVID case without contact tracing.
157
+
158
+ The epidemiological insight is that _prevention_ (vaccination) is cheaper than _treatment_ (hospitalization). The toolkit invests heavily in treatment (finding bugs after they're written) but could invest more in prevention (making bugs impossible to write). Property-based testing, which the verification paper recommends, is vaccination -- it tests invariants that prevent entire classes of bugs, not just specific instances.
159
+
160
+ The herd immunity concept also applies: if enough of the codebase is covered by property tests (the "vaccinated population"), then even untested code benefits from the surrounding protection (because integration tests exercise it through tested interfaces).
161
+
162
+ #### Concrete Design Implication
163
+
164
+ Add "bug R0 estimation" to the failure digest. When a quality gate catches a bug, classify it as high-R0 (shared utility, imported by many) or low-R0 (leaf function, local scope). High-R0 bugs get immediate attention and trigger a scan of downstream consumers. Low-R0 bugs are fixed in place. This is triage by spread potential, not just severity.
165
+
166
+ ---
167
+
168
+ ### 7. Cost/Quality Tradeoff -- Theory of Constraints (Goldratt)
169
+
170
+ #### The Analogy
171
+
172
+ Eli Goldratt's Theory of Constraints (TOC) states that every system has at least one constraint (bottleneck), and that at any given time a single constraint limits throughput. Optimizing anything other than that constraint produces zero improvement in system throughput. The Five Focusing Steps: Identify the constraint -> Exploit it -> Subordinate everything else to it -> Elevate the constraint -> Repeat.
173
+
174
+ #### What It Adds to the Other Paper
175
+
176
+ The cost-quality paper models costs per batch, per mode, per model. It identifies prompt caching as the biggest lever (83% reduction). But it implicitly assumes the constraint is _cost_. What if the constraint is something else?
177
+
178
+ For the toolkit, the actual throughput constraint is likely _plan quality_, not execution cost. The plan-quality paper found that plan quality is "worth roughly 3x the execution capability of the model itself." If plans are the bottleneck, then optimizing execution costs (model routing, caching, batch API) is optimizing a non-constraint -- it makes the execution stage faster and cheaper without increasing total throughput of _successful features_.
179
+
180
+ TOC says: don't balance capacity, balance _flow_. If plan creation takes 60 minutes and execution takes 20 minutes, making execution 50% cheaper doesn't matter. The bottleneck is plan creation.
181
+
182
+ Goldratt's "drum-buffer-rope" model suggests the pipeline should be paced by the bottleneck (plan quality), with buffers before it (design exploration time) and ropes pulling work through it (validation that plans meet quality thresholds before execution begins).
183
+
184
+ #### Concrete Design Implication
185
+
186
+ Implement the plan-quality scorecard (recommended in the plan-quality paper) as the _drum_ of the system. Don't start execution until plan quality exceeds 0.8. This subordinates execution to the constraint (plan quality), which TOC predicts will increase overall feature completion rate more than any execution-level optimization.
187
+
188
+ ---
189
+
190
+ ### 8. Multi-Agent Coordination -- Crew Resource Management (Aviation)
191
+
192
+ #### The Analogy
193
+
194
+ After a series of crashes caused not by mechanical failure but by crew miscommunication, the aviation industry developed Crew Resource Management (CRM). CRM training teaches pilots and crew to: use standardized communication protocols, challenge authority when safety is at risk, maintain shared situational awareness, and conduct structured briefings and debriefings.
195
+
196
+ The key CRM insight: most multi-crew accidents are not caused by one person's error -- they're caused by the crew failing to catch or communicate about that error. The error is the first event; the failure to catch it is the accident.
197
+
198
+ #### What It Adds to the Other Paper
199
+
200
+ The multi-agent paper catalogs coordination patterns (pipeline, debate, ensemble, etc.) and maps them to the toolkit. But it focuses on _architecture_ -- how agents are wired together. CRM focuses on _communication quality_ within any architecture.
201
+
202
+ The MAST study found that inter-agent misalignment causes 37% of multi-agent failures. CRM addresses exactly this: the 1977 Tenerife disaster (583 deaths) was caused not by pilot incompetence but by _communication failure_ between the flight crew and tower. The captain's authority gradient prevented the first officer from asserting a safety concern.
203
+
204
+ The toolkit's team mode has an authority gradient problem: the implementer generates code and the reviewer evaluates it, but the reviewer is the same model, so it may defer to the implementer's apparent confidence -- the dynamic CRM calls an "authority gradient." The Berkeley MAST study confirms this, describing "conformity bias / groupthink" in which agents reinforce each other's errors.
205
+
206
+ Aviation pairs CRM with the _sterile cockpit rule_ -- below 10,000 feet, no non-essential conversation -- which protects critical flight phases from distraction. Applied to the toolkit: during critical execution phases (integration batches, retry attempts), restrict the agent's context to only task-essential information. No progress notes from previous batches, no recent commit logs -- just the task, the relevant files, and the quality gate requirements.
207
+
208
+ #### Concrete Design Implication
209
+
210
+ 1. Implement a "sterile cockpit" mode for critical and retry batches: strip non-essential context, provide only the task and directly relevant files.
211
+ 2. Use different models for implementer and reviewer to break the authority gradient (the multi-agent paper already recommends this, but CRM provides the _why_: homogeneous crews amplify errors).
212
+ 3. Add a structured "briefing" at the start of each batch (the investigation-first instruction from the prompt paper) and a structured "debriefing" at the end (append to progress.txt in a structured format).
213
+
214
+ ---
215
+
216
+ ### 9. User Adoption -- Sports Periodization
217
+
218
+ #### The Analogy
219
+
220
+ In sports science, _periodization_ structures training into cycles: macrocycles (season-long), mesocycles (weeks), and microcycles (days). Athletes don't train at maximum intensity continuously -- they alternate between high-load phases (building capacity) and low-load phases (recovery and skill development). Attempting to train at maximum intensity every day leads to overtraining syndrome: decreased performance, injury, and burnout.
221
+
222
+ Anders Ericsson's research on _deliberate practice_ shows that expertise develops through focused practice on specific weaknesses, not through general repetition. A pianist doesn't play the whole concerto repeatedly -- they isolate the difficult passages and drill them.
223
+
224
+ #### What It Adds to the Other Paper
225
+
226
+ The user-adoption paper identifies that the toolkit's rigid six-stage pipeline causes adoption friction. Its solution is progressive disclosure: start simple, unlock complexity. But progressive disclosure is spatial (what features to show) -- periodization adds a temporal dimension (when to introduce complexity).
227
+
228
+ A new user doesn't need to learn the full toolkit on day one. But they also shouldn't learn it in a single progressive climb. The periodization model suggests _cycles_:
229
+
230
+ - **Week 1 (Foundation):** Run pre-made plans. Experience quality gates. Build trust.
231
+ - **Week 2 (Skill):** Write your own plans. Learn plan quality patterns. Practice TDD.
232
+ - **Week 3 (Integration):** Use /autocode. Full pipeline with brainstorming. Higher autonomy.
233
+ - **Week 4 (Recovery):** Review what worked. Submit lessons from your experience. Customize.
234
+
235
+ This is not just "start simple" -- it's a structured learning progression with deliberate practice on specific skills and recovery periods for reflection.
236
+
237
+ The "coaching vs. playing" distinction is also relevant. The toolkit currently acts as a player (executing code). For adoption, it should also act as a coach -- explaining why quality gates matter after the user sees them work, not before.
238
+
239
+ #### Concrete Design Implication
240
+
241
+ Add a "training mode" that structures the first-month experience:
242
+ 1. First run: execute an example plan with verbose quality gate output explaining what each check does and why
243
+ 2. First plan: the toolkit validates the user's plan and explains how to improve it before execution
244
+ 3. First failure: the toolkit walks through the retry mechanism, showing what context it injects and why
245
+ 4. First lesson: the toolkit guides the user through `/submit-lesson`, demonstrating the feedback loop
246
+
247
+ ---
248
+
249
+ ### 10. Lesson Transferability -- Biological Immune System (Adaptive vs. Innate)
250
+
251
+ #### The Analogy
252
+
253
+ The vertebrate immune system has two layers:
254
+
255
+ 1. **Innate immunity:** Fast, non-specific. Physical barriers (skin), inflammation, phagocytes. Responds identically to any pathogen. No memory.
256
+ 2. **Adaptive immunity:** Slow first response, highly specific. B-cells produce antibodies targeting exact pathogens. T-cells kill infected cells. Creates _immunological memory_ -- second exposure triggers faster, stronger response.
257
+
258
+ Vaccination works by exposing the adaptive immune system to weakened pathogens, building memory without suffering the disease.
259
+
260
+ #### What It Adds to the Other Paper
261
+
262
+ The lesson-transferability paper maps beautifully onto immune system biology:
263
+
264
+ - **Innate immunity = universal lessons.** "Log before fallback" is like skin -- it protects against everything, requires no learning, and is always active. These are the ~25 universal-scope lessons.
265
+ - **Adaptive immunity = project-specific lessons.** "Hub.cache access patterns" is a specific antibody -- it only fights one pathogen (one codebase's anti-pattern) but fights it with precision. These are the ~2 project-specific lessons.
266
+ - **Vaccination = community lessons.** When another user submits a lesson from their production failure, they're providing a weakened pathogen that builds your project's immunity without suffering the bug.
267
+
268
+ But the immune analogy reveals something the paper misses: **autoimmune disorders**. When the immune system attacks the body's own healthy cells, you get autoimmune disease. When a lesson system produces false positives -- flagging correct code as buggy -- it's an autoimmune response. The paper identifies false positive risk but doesn't name the mechanism.
269
+
270
+ In immunology, autoimmune disorders are prevented by the negative-selection step of _clonal selection_ (clonal deletion) -- immune cells that react to self-antigens are eliminated during development. The lesson system equivalent: during lesson creation (the "development" phase), test the lesson against known-good code. If it triggers on good code, it's an autoimmune antibody and must be refined or eliminated.
271
+
272
+ The immune system also has _tolerance_ mechanisms -- it learns to stop reacting to benign substances (like food proteins). The lesson system needs tolerance: when a finding is repeatedly dismissed as a false positive, the system should learn to suppress it for that context. This is what DeepSource's relevance engine and Semgrep's AI triage provide -- immune tolerance for code analysis.
273
+
274
+ #### Concrete Design Implication
275
+
276
+ 1. Add "clonal selection" to the lesson creation process: before merging a new lesson, run it against a corpus of known-good code (the toolkit's own codebase, for example). If it triggers, refine the pattern.
277
+ 2. Add "immune tolerance": track dismissals per lesson per project. After N dismissals for the same project, automatically suppress that lesson for that project with a note in the scan output ("suppressed: 5 dismissals").
278
+ 3. Classify lessons as innate (universal, always active) or adaptive (scope-filtered, learned from specific exposure).
279
+
280
+ ---
281
+
282
+ ## Cross-Cutting Synthesis
283
+
284
+ ### Meta-Patterns
285
+
286
+ **1. Orthogonality Beats Redundancy**
287
+
288
+ Every domain says the same thing differently:
289
+ - Aviation (Swiss cheese): barriers must cover different failure modes
290
+ - Ecology (niche partitioning): organisms coexist by filling different niches
291
+ - Immunology (innate + adaptive): two systems cover non-overlapping threat spaces
292
+ - Military (combined arms): infantry, armor, and air power together defeat enemies that no single arm can
293
+
294
+ The toolkit already practices this (bottom-up anti-patterns + top-down integration tests = orthogonal verification). But the principle should be explicit in design decisions: when adding a new check, ask "does this cover a failure mode no existing check covers?" not "does this make existing coverage stronger?"
295
+
296
+ **2. Memory Transforms Reactive Systems into Adaptive Systems**
297
+
298
+ - Immune system: memory B-cells enable faster response to known pathogens
299
+ - Aviation ASRS: incident reports enable industry-wide learning from near-misses
300
+ - Toyota A3: searchable problem database means "you never solve the same problem twice"
301
+ - Sports: film study enables adaptation to opponents' patterns
302
+
303
+ The toolkit's lesson system is its memory. The papers undervalue this: the lesson system is not just "automated anti-pattern checking" -- it's the mechanism by which the toolkit _evolves_. Every production failure makes the system permanently harder to break. No competitor has this. This is the toolkit's immune memory, and it's the single most defensible competitive advantage.
304
+
305
+ **3. Structure Enables Improvisation**
306
+
307
+ - Jazz: chord changes provide structure; solos are improvisation within that structure
308
+ - Military (Auftragstaktik): mission intent provides structure; tactical decisions are improvised
309
+ - Sports (plays vs. execution): the play is the plan; reading the defense is improvisation
310
+ - Ecology (succession): early species create structure; later species improvise within it
311
+
312
+ The toolkit's skill chain is the chord changes. The agent's execution is the solo. The quality gates are the barlines that keep everyone synchronized. The right design doesn't _constrain_ the agent -- it _enables_ the agent to make intelligent local decisions within a globally coherent framework.
313
+
314
+ **4. Pioneers Bear Disproportionate Risk**
315
+
316
+ - Ecology (succession): pioneer species face harsh conditions that climax species never encounter
317
+ - Military: the first wave takes the heaviest casualties
318
+ - Manufacturing (first article inspection): the first unit off the line gets the most thorough inspection
319
+ - Sports: the opening drive sets the tone for the game
320
+
321
+ Batch 1 of any plan is a pioneer. It creates the file structure, test infrastructure, and patterns that all subsequent batches inherit. A bug in batch 1 has the highest R0 of any bug. Yet the toolkit treats batch 1 identically to batch 6. The design implication: batch 1 should get hardened execution -- higher-tier model, competitive mode, extra verification.
322
+
323
+ **5. Degradation is Non-Linear and Has Phase Transitions**
324
+
325
+ - Ecology (carrying capacity): populations don't decline gradually; they crash
326
+ - Epidemiology: disease spread is exponential until herd immunity threshold, then crashes
327
+ - Materials science: metals bend, then suddenly fracture
328
+ - Aviation: workload is manageable until it isn't -- then all errors happen at once
329
+
330
+ Context degradation follows this pattern (the Chroma study: "performance drops are often sudden rather than progressive"). The toolkit's fresh-context-per-batch architecture avoids the phase transition entirely by never accumulating enough context to reach the tipping point. This is prevention, not treatment -- the epidemiological equivalent of keeping R0 below 1.
331
+
332
+ ### Contradictions
333
+
334
+ **1. Biology says evolve; Manufacturing says standardize.**
335
+
336
+ The immune system succeeds through diversity and mutation. Toyota succeeds through standardization and waste elimination. These are opposite strategies. The resolution: the _process_ should be standardized (Toyota -- rigid skill chain, consistent quality gates), but the _responses_ should evolve (immune system -- lessons learned, adaptive prompt selection, failure pattern learning). The toolkit already does this correctly: rigid pipeline structure with evolving content.
337
+
338
+ **2. Military says decentralize; Aviation says standardize communication.**
339
+
340
+ Auftragstaktik delegates decision-making downward. CRM standardizes communication upward. The resolution: delegate _execution_ decisions (the agent chooses how to implement) but standardize _status communication_ (structured progress.txt, standardized quality gate output). The toolkit should not constrain how the agent writes code but should constrain how it reports what it did.
341
+
342
+ **3. Ecology says diversity; Manufacturing says reduce variation.**
343
+
344
+ Ecological resilience comes from species diversity. Manufacturing quality comes from reducing variation. The resolution depends on which part of the system: _input_ diversity (multiple prompt strategies, competitive execution) increases resilience; _output_ consistency (quality gates, test assertions) ensures quality. The toolkit's MAB system gets this right: diverse approaches in, consistent quality bar out.
345
+
346
+ ### The Single Most Powerful Insight
347
+
348
+ **The toolkit is building an artificial immune system for codebases, and it doesn't know it.**
349
+
350
+ The lesson system is adaptive immunity. The quality gates are innate immunity. Community lesson submission is vaccination. False positives are autoimmune disorders. Scope filtering is clonal selection. The retry mechanism is the inflammatory response.
351
+
352
+ This is not a metaphor -- it's a structural isomorphism. The immune system is the most successful quality-assurance system in biology: it protects against billions of potential pathogens, learns from every encounter, shares knowledge across organisms (breast milk, vaccination), and operates with zero downtime.
353
+
354
+ Reframing the toolkit as an immune system changes the roadmap priorities:
355
+
356
+ 1. **Maximize memory diversity** (more lesson types, richer failure patterns) -- not just more checks, but checks that cover orthogonal failure modes
357
+ 2. **Speed up the immune response** (faster lesson-to-check pipeline, automated lesson extraction from failures) -- when a bug gets through, how fast does the system learn?
358
+ 3. **Prevent autoimmune disorders** (scope filtering, false positive tracking, tolerance mechanisms) -- the system should never attack healthy code
359
+ 4. **Build herd immunity** (community lessons, shared quality profiles) -- every user's failures protect every other user
360
+ 5. **Invest in vaccination** (property-based testing, pre-execution specification checks) -- prevent entire classes of bugs, don't just detect individual instances
361
+
362
+ No competitor is building an immune system. They're building assembly lines. Assembly lines break when they encounter novel inputs. Immune systems get stronger.
363
+
364
+ ---
365
+
366
+ ## Appendix: Addenda for Each Research Paper
367
+
368
+ ### Addendum for Plan Quality Paper
369
+
370
+ **Cross-Domain Perspective: Military Mission Command (Auftragstaktik)**
371
+
372
+ The plan-quality paper's recommendation to shift from "complete code in plan" to "contracts + one example" is structurally identical to the military doctrine of Auftragstaktik (mission command), where subordinate commanders receive the objective and intent rather than detailed orders. Helmuth von Moltke the Elder's observation, popularly paraphrased as "no plan survives contact with the enemy," maps directly to the finding that stale plans and no-op tasks degrade execution.
373
+
374
+ The mission command framework adds one element the paper misses: _commander's intent at two levels_. Each task should include not just its own contract but its role in the larger feature. When a batch fails and the agent retries, knowing "this task exists to enable Batch N+1 to wire the modules together" gives the agent the context to make intelligent adaptation decisions -- the same way a field commander adapts tactics when the original plan is disrupted by terrain or enemy action.
375
+
376
+ Additionally, the military concept of _friction_ (the accumulation of small difficulties that make simple things difficult in war) provides a useful lens for batch boundary design. Integration batches experience the most friction because they cross module boundaries. The paper's recommendation to "never mix file-creation and integration tasks" maps to the military principle of maintaining clear phase lines between offensive operations. Crossing a phase line (moving from creation to integration) should be a deliberate, verified transition -- not an accident of batch grouping.
377
+
378
+ ### Addendum for Prompt Engineering Paper
379
+
380
+ **Cross-Domain Perspective: Musical Rehearsal Structure and Call-and-Response**
381
+
382
+ The prompt-engineering paper's finding that structured planning (+4% SWE-bench) outperforms raw chain-of-thought parallels a well-known principle in musical performance: rehearsal marks (structural landmarks in a score) enable performers to navigate complex pieces without getting lost, while detailed phrase-by-phrase instructions from a conductor actually degrade performance by removing the musician's interpretive agency.
383
+
384
+ The batch prompt should function like rehearsal marks -- five structural landmarks (Investigate, Test-First, Implement, Verify, Commit) that the agent navigates through. This is lighter than detailed step-by-step instructions and heavier than "just code it." The top SWE-bench agents already use this pattern (SWE-agent's 5-phase workflow, OpenHands' 5-phase workflow) but the paper doesn't name the underlying principle.
385
+
386
+ For retry prompts, the jazz concept of call-and-response suggests a structural improvement: instead of the current pattern (system tells agent what failed), use an interactive pattern (system asks agent to diagnose, then provides the actual failure data for comparison). The self-correction research cited in the paper supports this -- "ask yourself what went wrong" prompts outperform "here's what went wrong" prompts. In jazz terms: the rhythm section states the question, and the soloist must formulate their own answer before hearing what the rest of the band plays.
387
+
388
+ ### Addendum for Context Utilization Paper
389
+
390
+ **Cross-Domain Perspective: Ecological Carrying Capacity and Niche Partitioning**
391
+
392
+ The context-utilization paper models the context window as a linear resource with a degradation curve. Ecology offers a richer model: the context window is a habitat with carrying capacity (K). Below K, adding context tokens improves performance. At K, the agent is maximally effective. Above K, performance crashes -- matching the "sudden rather than progressive" degradation the paper documents from the Chroma study.
393
+
394
+ The ecological insight is that _what_ occupies the carrying capacity matters more than _how much_. In a healthy ecosystem, species partition niches -- different organisms fill different ecological roles without competing for the same resources. The context assembler's injected sections should similarly occupy distinct informational niches. Currently, recent commits and progress notes both serve the "recent history" niche, competing for the agent's attention. Merging these into a single structured "recent context" section would improve niche diversity per token.
395
+
396
+ The concept of _invasive species_ also applies: irrelevant context that consumes attention budget without contributing to task performance is an ecological invader. The paper's recommendation for XML-tagged sections helps the agent distinguish between context types, which is the token equivalent of species identification -- you can't manage a habitat if you can't tell the species apart.
397
+
398
+ ### Addendum for Competitive Landscape Paper
399
+
400
+ **Cross-Domain Perspective: Ecological Niche Partitioning and Competitive Exclusion**
401
+
402
+ Gause's competitive exclusion principle states that two species cannot indefinitely occupy the same ecological niche. Applied to the autonomous coding tool market: tools that compete for the exact same user need (e.g., general-purpose IDE coding assistants) will compete until only the strongest survives. Cursor and Windsurf are in a competitive exclusion race. So are Claude Code and Codex CLI.
403
+
404
+ The toolkit survives competitive exclusion by occupying a distinct niche: pipeline orchestration for Claude Code power users. This niche is too specialized for generalist tools to subsume (they won't build test-count monotonicity or batch-type-aware prompt selection for their mass market), yet valuable enough to its target audience to sustain it.
405
+
406
+ The defensive strategy from ecology is _niche hardening_: making the toolkit indispensable in its niche rather than expanding into adjacent niches where it would face direct competition. Every feature that deepens the pipeline (richer lessons, better failure learning, plan quality scoring) hardens the niche. Every feature that broadens the toolkit (IDE support, cloud hosting, multi-model routing) enters contested territory where larger competitors have structural advantages.
407
+
408
+ ### Addendum for Agent Failure Taxonomy Paper
409
+
410
+ **Cross-Domain Perspective: James Reason's Swiss Cheese Model (Aviation Safety)**
411
+
412
+ The failure-taxonomy paper identifies six failure clusters and three major gaps in the toolkit's coverage. The Swiss cheese model from aviation safety adds a crucial structural insight: failures occur when holes in multiple safety barriers _align_. The toolkit's barriers (lesson-check, test suite, ast-grep, test-count, git-clean, verification) are well-differentiated slices of Swiss cheese. But the three uncovered failure classes (specification misunderstanding, planning errors, context degradation) represent holes that exist in _every_ slice simultaneously -- no current barrier operates at the specification level.
413
+
414
+ Adding more barriers of the same type (more regex patterns, more linting rules) moves the holes within existing slices but doesn't add a new slice. What's needed is a fundamentally different barrier type: a specification-level check that catches "right code, wrong task" before implementation begins. The paper's recommendation for a "specification echo-back gate" is exactly this -- it's a new slice of Swiss cheese with its holes in a different place than all existing slices.
415
+
416
+ The Swiss cheese model also provides a framework for the paper's concern about "force multiplier" failures: context degradation doesn't create bugs directly, but it _enlarges the holes_ in every barrier. When the agent's attention is degraded, it's more likely to miss each individual check. The toolkit's fresh-context architecture prevents this enlargement by resetting the barrier quality at every batch.
417
+
418
+ ### Addendum for Verification Effectiveness Paper
419
+
420
+ **Cross-Domain Perspective: Epidemiological Contact Tracing and Herd Immunity**
421
+
422
+ The verification paper quantifies detection rates per pipeline stage but treats each bug as an independent event. Epidemiology provides a richer model: bugs have a _reproduction number_ (R0) -- how many downstream bugs each bug creates. A bug in a shared utility has high R0 (every consumer is "infected"). A bug in a leaf function has R0 near zero.
423
+
424
+ Contact tracing (working backward from a bug to identify all potentially affected code) is missing from the pipeline. When a quality gate catches a bug, the current response is "fix the bug." An epidemiological response would be "fix the bug AND trace its contacts" -- identify all modules that import or depend on the buggy code, and verify they aren't already exhibiting symptoms.
425
+
426
+ The paper's recommendation for property-based testing maps to the epidemiological concept of vaccination: property tests establish invariants that prevent entire classes of bugs, not just specific instances. If enough of the codebase is "vaccinated" with property tests, the remaining untested code benefits from herd immunity -- integration tests exercise it through tested interfaces, and invariant violations are caught at the boundary.
427
+
428
+ ### Addendum for Cost/Quality Tradeoff Paper
429
+
430
+ **Cross-Domain Perspective: Theory of Constraints (Goldratt)**
431
+
432
+ The cost-quality paper optimizes execution costs (caching, model routing, batch API). The Theory of Constraints says this optimization may be irrelevant: if the system's bottleneck is plan quality (which the plan-quality paper argues it is), then making execution faster and cheaper produces zero improvement in total throughput.
433
+
434
+ Goldratt's Five Focusing Steps applied to the toolkit: (1) Identify the constraint -- plan creation takes 60 minutes while execution takes 20 minutes per feature. (2) Exploit the constraint -- invest in plan-quality tooling, not execution-cost optimization. (3) Subordinate everything else -- don't start execution until plan quality exceeds a defined threshold. (4) Elevate the constraint -- build a plan-quality validator that catches gaps before the human reviews. (5) Repeat -- after plan quality improves, re-identify the new constraint.
435
+
436
+ The paper's recommendation to implement cost tracking is correct regardless of the constraint location -- you need data to identify the bottleneck. But the Theory of Constraints predicts that caching optimization ($0.73 savings per MAB plan) will matter far less than plan-quality investment (preventing entire feature rework cycles worth $5-10).
437
+
438
+ ### Addendum for Multi-Agent Coordination Paper
439
+
440
+ **Cross-Domain Perspective: Crew Resource Management and the Sterile Cockpit Rule**
441
+
442
+ The multi-agent paper catalogs coordination patterns and identifies conformity bias as a key risk. Aviation's Crew Resource Management (CRM) has more than four decades of evidence on exactly this problem. The Tenerife disaster (1977, 583 deaths) was caused not by pilot error per se, but by the failure of crew members to challenge the captain's incorrect assumption. CRM training reduced the aviation accident rate by teaching standardized communication and empowering junior crew members to challenge authority.
443
+
444
+ The toolkit's team mode has an analogous authority gradient: the implementer produces code, the reviewer evaluates it. When both use the same model, the reviewer tends to defer to the implementer's apparent reasoning -- the LLM equivalent of a first officer deferring to a captain. Using different models (as the paper recommends) is the CRM solution: different training backgrounds produce different assumptions, enabling genuine challenge.
445
+
446
+ Aviation's sterile cockpit rule (below 10,000 feet, no non-essential communication), which CRM training reinforces, suggests a design pattern for critical batches: strip non-essential context and restrict the agent to only task-relevant information. This reduces cognitive load at exactly the moment when errors are most dangerous -- during integration, retry, and production-critical batches.
447
+
448
+ ### Addendum for User Adoption Paper
449
+
450
+ **Cross-Domain Perspective: Sports Periodization and Deliberate Practice**
451
+
452
+ The user-adoption paper recommends progressive disclosure to reduce friction. Sports science adds a temporal dimension: periodization. Athletes don't train at maximum intensity every day -- they alternate between loading phases (building capacity) and recovery phases (consolidating gains). Attempting to learn the full toolkit pipeline in one session is overtraining.
453
+
454
+ A periodized onboarding schedule would look like: Week 1 (foundation) -- run existing plans, experience quality gates; Week 2 (skill building) -- write plans, practice the format; Week 3 (integration) -- full /autocode pipeline; Week 4 (recovery/reflection) -- review outcomes, submit lessons, customize.
455
+
456
+ Anders Ericsson's deliberate practice research adds another dimension: expertise develops through focused practice on specific weaknesses, not general repetition. The toolkit's onboarding should identify which stage is causing the most friction for each user and provide targeted practice. If plan writing is the bottleneck, offer plan-writing exercises with the quality scorecard. If TDD is unfamiliar, offer a TDD-focused tutorial that's independent of the toolkit's pipeline.
457
+
458
+ The "coaching vs. playing" distinction is critical: the toolkit currently acts as a player (executing code). For adoption, it should also act as a coach -- explaining quality gate results after the user sees them work, demonstrating the value of TDD by showing before/after failure rates, and celebrating (in command output) when lessons prevent real bugs.
459
+
460
+ ### Addendum for Lesson Transferability Paper
461
+
462
+ **Cross-Domain Perspective: The Adaptive Immune System**
463
+
464
+ The lesson-transferability paper proposes scope metadata (universal, language, framework, domain, project-specific) for filtering lessons. This taxonomy is structurally isomorphic to the vertebrate immune system: universal lessons are innate immunity (always active, non-specific); language and framework lessons are adaptive immunity (activated by specific antigens -- file extensions, dependency manifests); project-specific lessons are tissue-specific immune responses (only active in the originating organ).
465
+
466
+ The immune analogy reveals three mechanisms the paper doesn't discuss:
467
+
468
+ First, _negative selection_ (the self-tolerance step of clonal selection: testing new immune cells against self before deployment): before merging a new lesson, it should be tested against known-good code. A lesson that triggers on correct code is an autoimmune antibody -- it attacks healthy tissue. Adding a "run against known-good corpus" step to the lesson PR review process prevents this.
469
+
470
+ Second, _immune tolerance_ (learning to stop reacting to benign substances): when a lesson finding is repeatedly dismissed as a false positive, the system should learn to suppress it for that context. Tracking dismissals per lesson per project and auto-suppressing after N dismissals implements tolerance without removing the lesson entirely.
471
+
472
+ Third, _mucosal immunity_ (specialized immune responses at high-exposure surfaces): the toolkit's most critical boundary is the quality gate between batches. Lessons that fire at this boundary (lesson-check.sh) should be the highest-confidence, lowest-false-positive checks -- the equivalent of the immune system's strongest defenses at the body's most exposed surfaces (gut, lungs, skin). Lower-confidence checks belong at less critical boundaries (verification stage, semantic scanner).
473
+
474
+ ---
475
+
476
+ ## Sources
477
+
478
+ ### Biology and Immunology
479
+ - Janeway, C.A. et al. _Immunobiology: The Immune System in Health and Disease._ 5th edition. Garland Science, 2001.
480
+ - Murphy, K. & Weaver, C. _Janeway's Immunobiology._ 9th edition. Garland Science, 2016.
481
+ - Medzhitov, R. & Janeway, C.A. "Innate immunity." _New England Journal of Medicine_ 343.5 (2000): 338-344.
482
+
483
+ ### Ecology
484
+ - Gause, G.F. _The Struggle for Existence._ Williams & Wilkins, 1934.
485
+ - Hardin, G. "The Competitive Exclusion Principle." _Science_ 131.3409 (1960): 1292-1297.
486
+ - Connell, J.H. & Slatyer, R.O. "Mechanisms of succession in natural communities." _American Naturalist_ 111.982 (1977): 1119-1144.
487
+ - Holling, C.S. "Resilience and stability of ecological systems." _Annual Review of Ecology and Systematics_ 4 (1973): 1-23.
488
+
489
+ ### Military Doctrine
490
+ - von Clausewitz, C. _On War._ Trans. Howard & Paret. Princeton University Press, 1976.
491
+ - Vandergriff, D.E. _Mission Command: The Who, What, Where, When and Why._ CreateSpace, 2019.
492
+ - US Army. _ADP 6-0: Mission Command._ 2019.
493
+
494
+ ### Aviation Safety
495
+ - Reason, J. "Human error: models and management." _BMJ_ 320.7237 (2000): 768-770.
496
+ - Reason, J. _Managing the Risks of Organizational Accidents._ Ashgate, 1997.
497
+ - Helmreich, R.L. et al. "The evolution of crew resource management training in commercial aviation." _International Journal of Aviation Psychology_ 9.1 (1999): 19-32.
498
+ - Federal Aviation Administration. _Advisory Circular 120-51E: Crew Resource Management Training._ 2004.
499
+
500
+ ### Manufacturing and Theory of Constraints
501
+ - Goldratt, E.M. _The Goal: A Process of Ongoing Improvement._ North River Press, 1984.
502
+ - Ohno, T. _Toyota Production System: Beyond Large-Scale Production._ Productivity Press, 1988.
503
+ - Shook, J. "Toyota's Secret: The A3 Report." _MIT Sloan Management Review_ 50.4 (2009): 30-33.
504
+
505
+ ### Game Theory
506
+ - Nash, J. "Equilibrium points in n-person games." _Proceedings of the National Academy of Sciences_ 36.1 (1950): 48-49.
507
+ - Milgrom, P. & Roberts, J. "Complementarities and fit: Strategy, structure, and organizational change in manufacturing." _Journal of Accounting and Economics_ 19 (1995): 179-208.
508
+
509
+ ### Epidemiology
510
+ - Anderson, R.M. & May, R.M. _Infectious Diseases of Humans: Dynamics and Control._ Oxford University Press, 1991.
511
+ - Fine, P. et al. "Herd immunity: a rough guide." _Clinical Infectious Diseases_ 52.7 (2011): 911-916.
512
+
513
+ ### Sports Science
514
+ - Bompa, T.O. & Haff, G.G. _Periodization: Theory and Methodology of Training._ 5th edition. Human Kinetics, 2009.
515
+ - Ericsson, K.A. et al. "The role of deliberate practice in the acquisition of expert performance." _Psychological Review_ 100.3 (1993): 363-406.
516
+
517
+ ### Music and Performance
518
+ - Berliner, P.F. _Thinking in Jazz: The Infinite Art of Improvisation._ University of Chicago Press, 1994.
519
+ - Sawyer, R.K. _Group Creativity: Music, Theater, Collaboration._ Lawrence Erlbaum, 2003.
520
+
521
+ ### Organizational Psychology
522
+ - Edmondson, A. "Psychological safety and learning behavior in work teams." _Administrative Science Quarterly_ 44.2 (1999): 350-383.
523
+ - Sweller, J. "Cognitive load theory, learning difficulty, and instructional design." _Learning and Instruction_ 4.4 (1994): 295-312.
524
+ - Weick, K.E. & Sutcliffe, K.M. _Managing the Unexpected: Resilient Performance in an Age of Uncertainty._ 2nd edition. Jossey-Bass, 2007.
525
+
526
+ ### Checklists and Verification
527
+ - Gawande, A. _The Checklist Manifesto: How to Get Things Right._ Metropolitan Books, 2009.
528
+ - Nielsen, J. "Progressive Disclosure." Nielsen Norman Group, 2006.