autonomous-coding-toolkit 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. package/.claude-plugin/marketplace.json +22 -0
  2. package/.claude-plugin/plugin.json +13 -0
  3. package/LICENSE +21 -0
  4. package/Makefile +21 -0
  5. package/README.md +140 -0
  6. package/SECURITY.md +28 -0
  7. package/agents/bash-expert.md +113 -0
  8. package/agents/dependency-auditor.md +138 -0
  9. package/agents/integration-tester.md +120 -0
  10. package/agents/lesson-scanner.md +149 -0
  11. package/agents/python-expert.md +179 -0
  12. package/agents/service-monitor.md +141 -0
  13. package/agents/shell-expert.md +147 -0
  14. package/benchmarks/runner.sh +147 -0
  15. package/benchmarks/tasks/01-rest-endpoint/rubric.sh +29 -0
  16. package/benchmarks/tasks/01-rest-endpoint/task.md +17 -0
  17. package/benchmarks/tasks/02-refactor-module/task.md +8 -0
  18. package/benchmarks/tasks/03-fix-integration-bug/task.md +8 -0
  19. package/benchmarks/tasks/04-add-test-coverage/task.md +8 -0
  20. package/benchmarks/tasks/05-multi-file-feature/task.md +8 -0
  21. package/bin/act.js +238 -0
  22. package/commands/autocode.md +6 -0
  23. package/commands/cancel-ralph.md +18 -0
  24. package/commands/code-factory.md +53 -0
  25. package/commands/create-prd.md +55 -0
  26. package/commands/ralph-loop.md +18 -0
  27. package/commands/run-plan.md +117 -0
  28. package/commands/submit-lesson.md +122 -0
  29. package/docs/ARCHITECTURE.md +630 -0
  30. package/docs/CONTRIBUTING.md +125 -0
  31. package/docs/lessons/0001-bare-exception-swallowing.md +34 -0
  32. package/docs/lessons/0002-async-def-without-await.md +28 -0
  33. package/docs/lessons/0003-create-task-without-callback.md +28 -0
  34. package/docs/lessons/0004-hardcoded-test-counts.md +28 -0
  35. package/docs/lessons/0005-sqlite-without-closing.md +33 -0
  36. package/docs/lessons/0006-venv-pip-path.md +27 -0
  37. package/docs/lessons/0007-runner-state-self-rejection.md +35 -0
  38. package/docs/lessons/0008-quality-gate-blind-spot.md +33 -0
  39. package/docs/lessons/0009-parser-overcount-empty-batches.md +36 -0
  40. package/docs/lessons/0010-local-outside-function-bash.md +33 -0
  41. package/docs/lessons/0011-batch-tests-for-unimplemented-code.md +36 -0
  42. package/docs/lessons/0012-api-markdown-unescaped-chars.md +33 -0
  43. package/docs/lessons/0013-export-prefix-env-parsing.md +33 -0
  44. package/docs/lessons/0014-decorator-registry-import-side-effect.md +43 -0
  45. package/docs/lessons/0015-frontend-backend-schema-drift.md +43 -0
  46. package/docs/lessons/0016-event-driven-cold-start-seeding.md +44 -0
  47. package/docs/lessons/0017-copy-paste-logic-diverges.md +43 -0
  48. package/docs/lessons/0018-layer-passes-pipeline-broken.md +45 -0
  49. package/docs/lessons/0019-systemd-envfile-ignores-export.md +41 -0
  50. package/docs/lessons/0020-persist-state-incrementally.md +44 -0
  51. package/docs/lessons/0021-dual-axis-testing.md +48 -0
  52. package/docs/lessons/0022-jsx-factory-shadowing.md +43 -0
  53. package/docs/lessons/0023-static-analysis-spiral.md +51 -0
  54. package/docs/lessons/0024-shared-pipeline-implementation.md +55 -0
  55. package/docs/lessons/0025-defense-in-depth-all-entry-points.md +65 -0
  56. package/docs/lessons/0026-linter-no-rules-false-enforcement.md +54 -0
  57. package/docs/lessons/0027-jsx-silent-prop-drop.md +64 -0
  58. package/docs/lessons/0028-no-infrastructure-in-client-code.md +49 -0
  59. package/docs/lessons/0029-never-write-secrets-to-files.md +61 -0
  60. package/docs/lessons/0030-cache-merge-not-replace.md +62 -0
  61. package/docs/lessons/0031-verify-units-at-boundaries.md +66 -0
  62. package/docs/lessons/0032-module-lifecycle-subscribe-unsubscribe.md +89 -0
  63. package/docs/lessons/0033-async-iteration-mutable-snapshot.md +72 -0
  64. package/docs/lessons/0034-caller-missing-await-silent-discard.md +65 -0
  65. package/docs/lessons/0035-duplicate-registration-silent-overwrite.md +85 -0
  66. package/docs/lessons/0036-websocket-dirty-disconnect.md +33 -0
  67. package/docs/lessons/0037-parallel-agents-worktree-corruption.md +31 -0
  68. package/docs/lessons/0038-subscribe-no-stored-ref.md +36 -0
  69. package/docs/lessons/0039-fallback-or-default-hides-bugs.md +34 -0
  70. package/docs/lessons/0040-event-firehose-filter-first.md +36 -0
  71. package/docs/lessons/0041-ambiguous-base-dir-path-nesting.md +32 -0
  72. package/docs/lessons/0042-spec-compliance-insufficient.md +36 -0
  73. package/docs/lessons/0043-exact-count-extensible-collections.md +32 -0
  74. package/docs/lessons/0044-relative-file-deps-worktree.md +39 -0
  75. package/docs/lessons/0045-iterative-design-improvement.md +33 -0
  76. package/docs/lessons/0046-plan-assertion-math-bugs.md +38 -0
  77. package/docs/lessons/0047-pytest-single-threaded-default.md +37 -0
  78. package/docs/lessons/0048-integration-wiring-batch.md +40 -0
  79. package/docs/lessons/0049-ab-verification.md +41 -0
  80. package/docs/lessons/0050-editing-sourced-files-during-execution.md +33 -0
  81. package/docs/lessons/0051-infrastructure-fixes-cant-self-heal.md +30 -0
  82. package/docs/lessons/0052-uncommitted-changes-poison-quality-gates.md +31 -0
  83. package/docs/lessons/0053-jq-compact-flag-inconsistency.md +31 -0
  84. package/docs/lessons/0054-parser-matches-inside-code-blocks.md +30 -0
  85. package/docs/lessons/0055-agents-compensate-for-garbled-prompts.md +31 -0
  86. package/docs/lessons/0056-grep-count-exit-code-on-zero.md +42 -0
  87. package/docs/lessons/0057-new-artifacts-break-git-clean-gates.md +42 -0
  88. package/docs/lessons/0058-dead-config-keys-never-consumed.md +49 -0
  89. package/docs/lessons/0059-contract-test-shared-structures.md +53 -0
  90. package/docs/lessons/0060-set-e-silent-death-in-runners.md +53 -0
  91. package/docs/lessons/0061-context-injection-dirty-state.md +50 -0
  92. package/docs/lessons/0062-sibling-bug-neighborhood-scan.md +29 -0
  93. package/docs/lessons/0063-one-flag-two-lifetimes.md +31 -0
  94. package/docs/lessons/0064-test-passes-wrong-reason.md +31 -0
  95. package/docs/lessons/0065-pipefail-grep-count-double-output.md +39 -0
  96. package/docs/lessons/0066-local-keyword-outside-function.md +37 -0
  97. package/docs/lessons/0067-stdin-hang-non-interactive-shell.md +36 -0
  98. package/docs/lessons/0068-agent-builds-wrong-thing-correctly.md +31 -0
  99. package/docs/lessons/0069-plan-quality-dominates-execution.md +30 -0
  100. package/docs/lessons/0070-spec-echo-back-prevents-drift.md +31 -0
  101. package/docs/lessons/0071-positive-instructions-outperform-negative.md +30 -0
  102. package/docs/lessons/0072-lost-in-the-middle-context-placement.md +30 -0
  103. package/docs/lessons/0073-unscoped-lessons-cause-false-positives.md +30 -0
  104. package/docs/lessons/0074-stale-context-injection-wrong-batch.md +32 -0
  105. package/docs/lessons/0075-research-artifacts-must-persist.md +32 -0
  106. package/docs/lessons/0076-wrong-decomposition-contaminates-downstream.md +30 -0
  107. package/docs/lessons/0077-cherry-pick-merges-need-manual-resolution.md +30 -0
  108. package/docs/lessons/0078-static-review-without-live-test.md +30 -0
  109. package/docs/lessons/0079-integration-wiring-batch-required.md +32 -0
  110. package/docs/lessons/FRAMEWORK.md +161 -0
  111. package/docs/lessons/SUMMARY.md +201 -0
  112. package/docs/lessons/TEMPLATE.md +85 -0
  113. package/docs/plans/2026-02-21-code-factory-v2-design.md +204 -0
  114. package/docs/plans/2026-02-21-code-factory-v2-implementation-plan.md +2189 -0
  115. package/docs/plans/2026-02-21-code-factory-v2-phase4-design.md +537 -0
  116. package/docs/plans/2026-02-21-code-factory-v2-phase4-implementation-plan.md +2012 -0
  117. package/docs/plans/2026-02-21-hardening-pass-design.md +108 -0
  118. package/docs/plans/2026-02-21-hardening-pass-plan.md +1378 -0
  119. package/docs/plans/2026-02-21-mab-research-report.md +406 -0
  120. package/docs/plans/2026-02-21-marketplace-restructure-design.md +240 -0
  121. package/docs/plans/2026-02-21-marketplace-restructure-plan.md +832 -0
  122. package/docs/plans/2026-02-21-phase4-completion-plan.md +697 -0
  123. package/docs/plans/2026-02-21-validator-suite-design.md +148 -0
  124. package/docs/plans/2026-02-21-validator-suite-plan.md +540 -0
  125. package/docs/plans/2026-02-22-mab-research-round2.md +556 -0
  126. package/docs/plans/2026-02-22-mab-run-design.md +462 -0
  127. package/docs/plans/2026-02-22-mab-run-plan.md +2046 -0
  128. package/docs/plans/2026-02-22-operations-design-methodology-research.md +681 -0
  129. package/docs/plans/2026-02-22-research-agent-failure-taxonomy.md +532 -0
  130. package/docs/plans/2026-02-22-research-code-guideline-policies.md +886 -0
  131. package/docs/plans/2026-02-22-research-codebase-audit-refactoring.md +908 -0
  132. package/docs/plans/2026-02-22-research-coding-standards-documentation.md +541 -0
  133. package/docs/plans/2026-02-22-research-competitive-landscape.md +687 -0
  134. package/docs/plans/2026-02-22-research-comprehensive-testing.md +1076 -0
  135. package/docs/plans/2026-02-22-research-context-utilization.md +459 -0
  136. package/docs/plans/2026-02-22-research-cost-quality-tradeoff.md +548 -0
  137. package/docs/plans/2026-02-22-research-lesson-transferability.md +508 -0
  138. package/docs/plans/2026-02-22-research-multi-agent-coordination.md +312 -0
  139. package/docs/plans/2026-02-22-research-phase-integration.md +602 -0
  140. package/docs/plans/2026-02-22-research-plan-quality.md +428 -0
  141. package/docs/plans/2026-02-22-research-prompt-engineering.md +558 -0
  142. package/docs/plans/2026-02-22-research-unconventional-perspectives.md +528 -0
  143. package/docs/plans/2026-02-22-research-user-adoption.md +638 -0
  144. package/docs/plans/2026-02-22-research-verification-effectiveness.md +433 -0
  145. package/docs/plans/2026-02-23-agent-suite-design.md +299 -0
  146. package/docs/plans/2026-02-23-agent-suite-plan.md +578 -0
  147. package/docs/plans/2026-02-23-phase3-cost-infrastructure-design.md +148 -0
  148. package/docs/plans/2026-02-23-phase3-cost-infrastructure-plan.md +1062 -0
  149. package/docs/plans/2026-02-23-research-bash-expert-agent.md +543 -0
  150. package/docs/plans/2026-02-23-research-dependency-auditor-agent.md +564 -0
  151. package/docs/plans/2026-02-23-research-improving-existing-agents.md +503 -0
  152. package/docs/plans/2026-02-23-research-integration-tester-agent.md +454 -0
  153. package/docs/plans/2026-02-23-research-python-expert-agent.md +429 -0
  154. package/docs/plans/2026-02-23-research-service-monitor-agent.md +425 -0
  155. package/docs/plans/2026-02-23-research-shell-expert-agent.md +533 -0
  156. package/docs/plans/2026-02-23-roadmap-to-completion.md +530 -0
  157. package/docs/plans/2026-02-24-headless-module-split-design.md +98 -0
  158. package/docs/plans/2026-02-24-headless-module-split.md +443 -0
  159. package/docs/plans/2026-02-24-lesson-scope-metadata-design.md +228 -0
  160. package/docs/plans/2026-02-24-lesson-scope-metadata-plan.md +968 -0
  161. package/docs/plans/2026-02-24-npm-packaging-design.md +841 -0
  162. package/docs/plans/2026-02-24-npm-packaging-plan.md +1965 -0
  163. package/docs/plans/audit-findings.md +186 -0
  164. package/docs/telegram-notification-format.md +98 -0
  165. package/examples/example-plan.md +51 -0
  166. package/examples/example-prd.json +72 -0
  167. package/examples/example-roadmap.md +33 -0
  168. package/examples/quickstart-plan.md +63 -0
  169. package/hooks/hooks.json +26 -0
  170. package/hooks/setup-symlinks.sh +48 -0
  171. package/hooks/stop-hook.sh +135 -0
  172. package/package.json +47 -0
  173. package/policies/bash.md +71 -0
  174. package/policies/python.md +71 -0
  175. package/policies/testing.md +61 -0
  176. package/policies/universal.md +60 -0
  177. package/scripts/analyze-report.sh +97 -0
  178. package/scripts/architecture-map.sh +145 -0
  179. package/scripts/auto-compound.sh +273 -0
  180. package/scripts/batch-audit.sh +42 -0
  181. package/scripts/batch-test.sh +101 -0
  182. package/scripts/entropy-audit.sh +221 -0
  183. package/scripts/failure-digest.sh +51 -0
  184. package/scripts/generate-ast-rules.sh +96 -0
  185. package/scripts/init.sh +112 -0
  186. package/scripts/lesson-check.sh +428 -0
  187. package/scripts/lib/common.sh +61 -0
  188. package/scripts/lib/cost-tracking.sh +153 -0
  189. package/scripts/lib/ollama.sh +60 -0
  190. package/scripts/lib/progress-writer.sh +128 -0
  191. package/scripts/lib/run-plan-context.sh +215 -0
  192. package/scripts/lib/run-plan-echo-back.sh +231 -0
  193. package/scripts/lib/run-plan-headless.sh +396 -0
  194. package/scripts/lib/run-plan-notify.sh +57 -0
  195. package/scripts/lib/run-plan-parser.sh +81 -0
  196. package/scripts/lib/run-plan-prompt.sh +215 -0
  197. package/scripts/lib/run-plan-quality-gate.sh +132 -0
  198. package/scripts/lib/run-plan-routing.sh +315 -0
  199. package/scripts/lib/run-plan-sampling.sh +170 -0
  200. package/scripts/lib/run-plan-scoring.sh +146 -0
  201. package/scripts/lib/run-plan-state.sh +142 -0
  202. package/scripts/lib/run-plan-team.sh +199 -0
  203. package/scripts/lib/telegram.sh +54 -0
  204. package/scripts/lib/thompson-sampling.sh +176 -0
  205. package/scripts/license-check.sh +74 -0
  206. package/scripts/mab-run.sh +575 -0
  207. package/scripts/module-size-check.sh +146 -0
  208. package/scripts/patterns/async-no-await.yml +5 -0
  209. package/scripts/patterns/bare-except.yml +6 -0
  210. package/scripts/patterns/empty-catch.yml +6 -0
  211. package/scripts/patterns/hardcoded-localhost.yml +9 -0
  212. package/scripts/patterns/retry-loop-no-backoff.yml +12 -0
  213. package/scripts/pipeline-status.sh +197 -0
  214. package/scripts/policy-check.sh +226 -0
  215. package/scripts/prior-art-search.sh +133 -0
  216. package/scripts/promote-mab-lessons.sh +126 -0
  217. package/scripts/prompts/agent-a-superpowers.md +29 -0
  218. package/scripts/prompts/agent-b-ralph.md +29 -0
  219. package/scripts/prompts/judge-agent.md +61 -0
  220. package/scripts/prompts/planner-agent.md +44 -0
  221. package/scripts/pull-community-lessons.sh +90 -0
  222. package/scripts/quality-gate.sh +266 -0
  223. package/scripts/research-gate.sh +90 -0
  224. package/scripts/run-plan.sh +329 -0
  225. package/scripts/scope-infer.sh +159 -0
  226. package/scripts/setup-ralph-loop.sh +155 -0
  227. package/scripts/telemetry.sh +230 -0
  228. package/scripts/tests/run-all-tests.sh +52 -0
  229. package/scripts/tests/test-act-cli.sh +46 -0
  230. package/scripts/tests/test-agents-md.sh +87 -0
  231. package/scripts/tests/test-analyze-report.sh +114 -0
  232. package/scripts/tests/test-architecture-map.sh +89 -0
  233. package/scripts/tests/test-auto-compound.sh +169 -0
  234. package/scripts/tests/test-batch-test.sh +65 -0
  235. package/scripts/tests/test-benchmark-runner.sh +25 -0
  236. package/scripts/tests/test-common.sh +168 -0
  237. package/scripts/tests/test-cost-tracking.sh +158 -0
  238. package/scripts/tests/test-echo-back.sh +180 -0
  239. package/scripts/tests/test-entropy-audit.sh +146 -0
  240. package/scripts/tests/test-failure-digest.sh +66 -0
  241. package/scripts/tests/test-generate-ast-rules.sh +145 -0
  242. package/scripts/tests/test-helpers.sh +82 -0
  243. package/scripts/tests/test-init.sh +47 -0
  244. package/scripts/tests/test-lesson-check.sh +278 -0
  245. package/scripts/tests/test-lesson-local.sh +55 -0
  246. package/scripts/tests/test-license-check.sh +109 -0
  247. package/scripts/tests/test-mab-run.sh +182 -0
  248. package/scripts/tests/test-ollama-lib.sh +49 -0
  249. package/scripts/tests/test-ollama.sh +60 -0
  250. package/scripts/tests/test-pipeline-status.sh +198 -0
  251. package/scripts/tests/test-policy-check.sh +124 -0
  252. package/scripts/tests/test-prior-art-search.sh +96 -0
  253. package/scripts/tests/test-progress-writer.sh +140 -0
  254. package/scripts/tests/test-promote-mab-lessons.sh +110 -0
  255. package/scripts/tests/test-pull-community-lessons.sh +149 -0
  256. package/scripts/tests/test-quality-gate.sh +241 -0
  257. package/scripts/tests/test-research-gate.sh +132 -0
  258. package/scripts/tests/test-run-plan-cli.sh +86 -0
  259. package/scripts/tests/test-run-plan-context.sh +305 -0
  260. package/scripts/tests/test-run-plan-e2e.sh +153 -0
  261. package/scripts/tests/test-run-plan-headless.sh +424 -0
  262. package/scripts/tests/test-run-plan-notify.sh +124 -0
  263. package/scripts/tests/test-run-plan-parser.sh +217 -0
  264. package/scripts/tests/test-run-plan-prompt.sh +254 -0
  265. package/scripts/tests/test-run-plan-quality-gate.sh +222 -0
  266. package/scripts/tests/test-run-plan-routing.sh +178 -0
  267. package/scripts/tests/test-run-plan-scoring.sh +148 -0
  268. package/scripts/tests/test-run-plan-state.sh +261 -0
  269. package/scripts/tests/test-run-plan-team.sh +157 -0
  270. package/scripts/tests/test-scope-infer.sh +150 -0
  271. package/scripts/tests/test-setup-ralph-loop.sh +63 -0
  272. package/scripts/tests/test-telegram-env.sh +38 -0
  273. package/scripts/tests/test-telegram.sh +121 -0
  274. package/scripts/tests/test-telemetry.sh +46 -0
  275. package/scripts/tests/test-thompson-sampling.sh +139 -0
  276. package/scripts/tests/test-validate-all.sh +60 -0
  277. package/scripts/tests/test-validate-commands.sh +89 -0
  278. package/scripts/tests/test-validate-hooks.sh +98 -0
  279. package/scripts/tests/test-validate-lessons.sh +150 -0
  280. package/scripts/tests/test-validate-plan-quality.sh +235 -0
  281. package/scripts/tests/test-validate-plans.sh +187 -0
  282. package/scripts/tests/test-validate-plugin.sh +106 -0
  283. package/scripts/tests/test-validate-prd.sh +184 -0
  284. package/scripts/tests/test-validate-skills.sh +134 -0
  285. package/scripts/validate-all.sh +57 -0
  286. package/scripts/validate-commands.sh +67 -0
  287. package/scripts/validate-hooks.sh +89 -0
  288. package/scripts/validate-lessons.sh +98 -0
  289. package/scripts/validate-plan-quality.sh +369 -0
  290. package/scripts/validate-plans.sh +120 -0
  291. package/scripts/validate-plugin.sh +86 -0
  292. package/scripts/validate-policies.sh +42 -0
  293. package/scripts/validate-prd.sh +118 -0
  294. package/scripts/validate-skills.sh +96 -0
  295. package/skills/autocode/SKILL.md +285 -0
  296. package/skills/autocode/ab-verification.md +51 -0
  297. package/skills/autocode/code-quality-standards.md +37 -0
  298. package/skills/autocode/competitive-mode.md +364 -0
  299. package/skills/brainstorming/SKILL.md +97 -0
  300. package/skills/capture-lesson/SKILL.md +187 -0
  301. package/skills/check-lessons/SKILL.md +116 -0
  302. package/skills/dispatching-parallel-agents/SKILL.md +110 -0
  303. package/skills/executing-plans/SKILL.md +85 -0
  304. package/skills/finishing-a-development-branch/SKILL.md +201 -0
  305. package/skills/receiving-code-review/SKILL.md +72 -0
  306. package/skills/requesting-code-review/SKILL.md +59 -0
  307. package/skills/requesting-code-review/code-reviewer.md +82 -0
  308. package/skills/research/SKILL.md +145 -0
  309. package/skills/roadmap/SKILL.md +115 -0
  310. package/skills/subagent-driven-development/SKILL.md +98 -0
  311. package/skills/subagent-driven-development/code-quality-reviewer-prompt.md +18 -0
  312. package/skills/subagent-driven-development/implementer-prompt.md +73 -0
  313. package/skills/subagent-driven-development/spec-reviewer-prompt.md +57 -0
  314. package/skills/systematic-debugging/SKILL.md +134 -0
  315. package/skills/systematic-debugging/condition-based-waiting.md +64 -0
  316. package/skills/systematic-debugging/defense-in-depth.md +32 -0
  317. package/skills/systematic-debugging/root-cause-tracing.md +55 -0
  318. package/skills/test-driven-development/SKILL.md +167 -0
  319. package/skills/using-git-worktrees/SKILL.md +219 -0
  320. package/skills/using-superpowers/SKILL.md +54 -0
  321. package/skills/verification-before-completion/SKILL.md +140 -0
  322. package/skills/verify/SKILL.md +82 -0
  323. package/skills/writing-plans/SKILL.md +128 -0
  324. package/skills/writing-skills/SKILL.md +93 -0
@@ -0,0 +1,406 @@
1
+ # Multi-Armed Bandit System: Research Report
2
+
3
+ **Date:** 2026-02-21
4
+ **Status:** Research complete
5
+ **Scope:** Codebase gap analysis, academic literature review, Notion workspace cross-reference, internet survey of competing approaches
6
+ **Builds on:** `docs/plans/2026-02-22-mab-run-design.md` (approved design), `docs/plans/2026-02-22-mab-run-plan.md` (implementation plan)
7
+
8
+ ---
9
+
10
+ ## Executive Summary
11
+
12
+ The approved MAB design is sound in goal but overbuilt for first deployment. Research across six sources — the existing codebase, academic literature on MAB+LLM systems, LLM-as-Judge best practices, self-evolving workflow research, SWE-bench competitive approaches, and the Notion knowledge base — reveals that:
13
+
14
+ 1. **80% of the orchestration infrastructure already exists** in `run-plan.sh` and its 8 lib files
15
+ 2. **The judge is the highest-value component** — academic literature provides concrete design improvements
16
+ 3. **The planner agent, architecture map, and community sync are premature** — they need data that doesn't exist yet
17
+ 4. **Academic research suggests three techniques the current design misses:** Thompson Sampling, prompt evolution from judge reasoning, and position-bias mitigation
18
+
19
+ **Recommendation:** Rewrite the implementation plan as a 2-batch Phase 1 that builds on existing infrastructure, and defer 4 of the 6 original batches to Phases 2-3 after accumulating run data.
20
+
21
+ ---
22
+
23
+ ## 1. Codebase Gap Analysis
24
+
25
+ ### What Run-Plan Already Provides
26
+
27
+ The current `run-plan.sh` system (main script + 8 lib modules + 35 test files) implements most of the orchestration the MAB design requires:
28
+
29
+ | MAB Requirement | Existing Capability | File | Gap |
30
+ |----------------|---------------------|------|-----|
31
+ | Parallel agent execution | Team mode runs parallel `claude -p` processes | `lib/run-plan-team.sh` | Uses same worktree; MAB needs separate worktrees |
32
+ | Automated scoring | `score_candidate()` scores gate pass, test count, diff size, lint, lessons, ast violations | `lib/run-plan-scoring.sh:8-29` | Already sufficient for MAB |
33
+ | Batch type classification | `classify_batch_type()` returns new-file/refactoring/integration/test-only | `lib/run-plan-scoring.sh:50-93` | Identical to MAB planner's classification |
34
+ | Prompt variant selection | `get_prompt_variants()` with explore/exploit from learned outcomes | `lib/run-plan-scoring.sh:99-146` | Needs extension for strategy-level variants |
35
+ | Quality gates between batches | Full pipeline: lesson-check → lint → ast-grep → tests → memory → regression → git clean | `lib/run-plan-quality-gate.sh`, `quality-gate.sh` | Already sufficient |
36
+ | Per-batch context injection | `generate_batch_context()` with 6000-char budget, failure patterns, state, git log | `lib/run-plan-context.sh` | Needs MAB lessons section added |
37
+ | State persistence + resume | `.run-plan-state.json` tracks batches, test counts, durations, quality gates | `lib/run-plan-state.sh` | Needs MAB-specific fields |
38
+ | Failure pattern learning | `record_failure_pattern()` tracks failure types and winning fixes per batch title | `lib/run-plan-context.sh:118-151` | Feed to judge as context |
39
+ | Retry with escalation | Attempt 1 → plain, Attempt 2 → "previous failed", Attempt 3 → failure digest | `lib/run-plan-headless.sh:214-234` | Already sufficient |
40
+ | Sampling with parallel candidates | `--sample N` spawns N candidates, scores, picks winner, logs outcome | `lib/run-plan-headless.sh:119-210` | Extend for strategy variants |
41
+ | Telegram notifications | Success/failure notifications with test counts and batch summaries | `lib/run-plan-notify.sh` | Already sufficient |
42
+ | Competitive mode | Stub that prints launch command | `run-plan.sh:267-272` | Replace with real `run_mode_mab()` |
43
+
44
+ ### What the MAB Plan Duplicates
45
+
46
+ The `mab-run.sh` script in the implementation plan (1,134 lines) reimplements:
47
+ - Argument parsing and validation (already in `run-plan.sh:111-247`)
48
+ - Worktree creation and cleanup (partially in `run-plan-team.sh`)
49
+ - Agent launching via `claude -p` (already in `run-plan-headless.sh` and `run-plan-team.sh`)
50
+ - Quality gate execution (already in `run-plan-quality-gate.sh`)
51
+ - State tracking (already in `run-plan-state.sh`)
52
+ - Prompt assembly with placeholder substitution (already in `run-plan-prompt.sh`)
53
+
54
+ **Conclusion:** Build `lib/run-plan-mab.sh` (~200-300 lines) as a peer to `run-plan-headless.sh` and `run-plan-team.sh`, not a standalone script.
55
+
56
+ ### What's Genuinely New
57
+
58
+ Only these components have no existing equivalent:
59
+ 1. **Judge agent prompt** — evaluates two diffs and picks a winner
60
+ 2. **Strategy prompt templates** — Agent A (superpowers) and Agent B (ralph) lead instructions
61
+ 3. **Worktree-per-agent isolation** — team mode uses same worktree; MAB needs two
62
+ 4. **Strategy performance data** — `logs/strategy-perf.json` (new data file)
63
+ 5. **MAB lesson accumulation** — `logs/mab-lessons.json` (new data file)
64
+ 6. **Winner merge logic** — `git merge <winner-branch>` after judge decision
65
+
66
+ ---
67
+
68
+ ## 2. Academic Literature Review
69
+
70
+ ### 2.1 Multi-Armed Bandits Meet LLMs
71
+
72
+ **Source:** [Multi-Armed Bandits Meet Large Language Models — IBM Research, AAAI 2026](https://research.ibm.com/publications/multi-armed-bandits-meet-large-language-models)
73
+
74
+ Key findings directly applicable to the MAB system:
75
+
76
+ - **Prompt/strategy selection as MAB:** "Different prompt variants represent different arms, and the LLM's response quality serves as the reward signal." Bandit algorithms "continuously explore new formulations while exploiting the most successful ones."
77
+ - **Thompson Sampling recommended:** For strategy selection with binary outcomes (win/loss), Thompson Sampling naturally balances explore/exploit by sampling from the posterior distribution Beta(wins+1, losses+1) of each arm's win rate.
78
+ - **LLMs enhance bandits:** LLMs can "analyze historical data to dynamically suggest exploration rates" and convert "qualitative feedback into structured rewards." This means the judge's reasoning (qualitative) can be converted into structured strategy-perf updates (quantitative).
79
+ - **Contextual bandits:** For richer routing, use batch type + project characteristics as context features. The existing `classify_batch_type()` already provides the primary context dimension.
80
+
81
+ **Implication for design:** Replace the LLM planner agent with Thompson Sampling in bash (~15 lines). The planner becomes valuable only at 50+ data points when contextual features matter.
82
+
83
+ ### 2.2 LLM-as-Judge Best Practices
84
+
85
+ **Sources:**
86
+ - [Using LLM-as-a-Judge (Hamel Husain)](https://hamel.dev/blog/posts/llm-judge/)
87
+ - [LLM-as-a-Judge Complete Guide (Langfuse)](https://langfuse.com/docs/evaluation/evaluation-methods/llm-as-a-judge)
88
+ - [LLM-As-Judge: 7 Best Practices (Monte Carlo Data)](https://www.montecarlodata.com/blog-llm-as-judge/)
89
+ - [Multi-Agent Debate for LLM Judges](https://arxiv.org/html/2510.12697v1)
90
+
91
+ Critical design guidance:
92
+
93
+ 1. **Start binary, not multi-dimensional.** "A binary decision forces everyone to consider what truly matters." The current design asks the judge for 6 dimensions simultaneously — winner, bidirectional lessons, failure mode classification, toolkit compliance, strategy recommendation, lesson extraction. Research says this produces worse results than binary + reasoning.
94
+
95
+ 2. **Position bias is real and measurable.** In pairwise LLM comparisons, the item presented first has a measurable advantage. The current judge prompt always shows Agent A (superpowers) first. Fix: randomize presentation order and include the order in the output for analysis.
96
+
97
+ 3. **Pairwise comparison > direct scoring.** "Pairwise comparisons lead to more stable results and smaller differences between LLM judgments and human annotations relative to direct scoring." The MAB design already uses pairwise comparison — this validates the approach over scoring each agent independently.
98
+
99
+ 4. **Validate against expert judgment.** "Start with ~30 examples covering different scenarios. Calculate precision and recall separately. Iterate until >90% agreement." For the MAB system: manually review the first 10 judge decisions before trusting automated routing.
100
+
101
+ 5. **Detailed critiques prevent shallow evaluation.** "Provide detailed reasoning in training examples so the judge learns to explain its logic, not just score." The judge prompt should include worked examples of good evaluations.
102
+
103
+ **Implication for design:** Restructure the judge in phases:
104
+ - Phase 1: Binary winner + 2-3 sentence reasoning (one JSON field each)
105
+ - Phase 2: Add failure_mode and strategy_update
106
+ - Phase 3: Add bidirectional lessons and lesson extraction
107
+
108
+ ### 2.3 Self-Evolving Workflows (SEW)
109
+
110
+ **Source:** [SEW: Self-Evolving Agentic Workflows (2025)](https://arxiv.org/abs/2505.18646)
111
+
112
+ SEW demonstrates that **evolving both workflow topology and agent prompts** via mutation and heuristic-driven operators yields up to 33% improvement on LiveCodeBench compared to static hand-crafted baselines.
113
+
114
+ Key insight for the MAB system: The current design treats strategies as static ("superpowers-v1" and "ralph-v1" forever). SEW shows that the *strategies themselves should evolve* based on outcomes.
115
+
116
+ **Concrete mechanism:** After the judge picks a winner and explains why, extract the winning behavior as a new prompt variant. The variant pool grows organically:
117
+
118
+ ```json
119
+ {
120
+ "variant": "Extract shared validation patterns before writing per-type validators",
121
+ "source_run": "mab-run-1708607400",
122
+ "batch_type": "new-file",
123
+ "win_rate": 0.75,
124
+ "uses": 4
125
+ }
126
+ ```
127
+
128
+ This is a minimal version of SEW's mutation operators — but instead of random mutation, the judge's reasoning is the mutation source. The existing `get_prompt_variants()` and `logs/sampling-outcomes.json` infrastructure already supports this pattern.
129
+
130
+ **Implication for design:** Phase 2 feature. After 10+ runs, evolve prompt variants from judge reasoning rather than using hardcoded variant strings.
131
+
132
+ ### 2.4 Automated Design of Agentic Systems (ADAS)
133
+
134
+ **Source:** [ADAS — ICLR 2025](https://github.com/ShengranHu/ADAS)
135
+
136
+ ADAS uses a "Meta Agent Search" algorithm where a meta agent iteratively programs new agents from an ever-growing archive of previous discoveries. Agents discovered by Meta Agent Search outperform state-of-the-art hand-designed agents and transfer across domains and models.
137
+
138
+ Key insight: The MAB system's two strategies are both hand-designed. ADAS shows that **the archive of discovered strategies is more valuable than any individual strategy.** Over time, the system should discover strategies that neither superpowers nor ralph represent.
139
+
140
+ **Concrete mechanism for Phase 3:**
141
+
142
+ ```json
143
+ // logs/strategy-archive.json
144
+ [
145
+ {"name": "superpowers-v1", "prompt_hash": "abc123", "win_rate": 0.6, "runs": 20},
146
+ {"name": "ralph-v1", "prompt_hash": "def456", "win_rate": 0.55, "runs": 20},
147
+ {"name": "hybrid-v1", "prompt_hash": "ghi789", "win_rate": 0.7, "runs": 8,
148
+ "discovered_from": "mab-run-12: judge noted Agent B skipped tests on new files",
149
+ "description": "Ralph iteration loop with mandatory test-first on new files"}
150
+ ]
151
+ ```
152
+
153
+ **Implication for design:** Phase 3 feature. Requires 50+ runs and a mechanism for the judge to propose new strategy descriptions, not just pick between existing ones.
154
+
155
+ ### 2.5 SWE-bench Tournament Patterns
156
+
157
+ **Sources:**
158
+ - [SWE-bench Leaderboard Analysis](https://arxiv.org/html/2506.17208v2)
159
+ - [SWE-bench Verified Leaderboard](https://llm-stats.com/benchmarks/swe-bench-verified-(agentic-coding))
160
+
161
+ TRAE achieved 70.4% on SWE-bench Verified (May 2025) by using **o1 to select among patches generated by three different models** (Claude 3.7 Sonnet, Gemini 2.5 Pro, o4-mini). IBM's approach used inference scaling — running the same model multiple times on the same issue.
162
+
163
+ Both patterns validate the MAB approach. But they also reveal an underexplored dimension: **model variation matters as much as strategy variation.**
164
+
165
+ The current MAB design holds the model constant (both agents use the same model) and varies only the lead instruction. SWE-bench results suggest that varying the model may produce more diverse candidates:
166
+
167
+ | Variation dimension | Current MAB | SWE-bench winners | Expected diversity |
168
+ |--------------------|-------------|-------------------|-------------------|
169
+ | Strategy (prompt) only | ✅ superpowers vs ralph | — | Low (same model interprets both similarly) |
170
+ | Model only | — | ✅ TRAE: 3 models + selector | High (different training → different patterns) |
171
+ | Strategy + model | — | — | Highest |
172
+
173
+ **Implication for design:** Phase 2 feature. Extend the existing `--sample` flag to support heterogeneous model candidates:
174
+ ```bash
175
+ --sample-models "sonnet,opus,haiku" # one candidate per model
176
+ ```
177
+ The scoring infrastructure (`score_candidate()`) is model-agnostic — it scores outputs, not inputs.
178
+
179
+ ---
180
+
181
+ ## 3. Notion Workspace Cross-Reference
182
+
183
+ ### 3.1 Algorithms to Live By — Explore/Exploit Framework
184
+
185
+ **Source:** Notion Knowledge Hub page (693de656)
186
+
187
+ Justin's notes on "Algorithms to Live By" (Brian Christian, Tom Griffiths) contain directly relevant decision frameworks:
188
+
189
+ - **Gittins Index:** The mathematically optimal MAB solution that assigns an "exploration bonus" to unknowns. More principled than the current plan's 70% threshold but computationally expensive. Thompson Sampling is the practical approximation.
190
+ - **Time horizon matters:** "Young people should explore more; older people should exploit favorites (declining time horizon)." Applied to the toolkit: early runs should heavily explore (MAB everything), later runs should exploit (route to known winners). The current plan doesn't model this — it uses a fixed 70% threshold regardless of how many runs remain in a project.
191
+ - **37% Rule for optimal stopping:** For one-shot decisions (like "which strategy to use on a critical batch with no historical data"), spend the first 37% of the budget purely exploring to calibrate a baseline, then commit to the first option that beats the best seen so far. This suggests: for a 6-batch plan with no data, MAB the first 2 batches (~33%), then route the rest.
192
+ - **Satisficing vs optimizing:** "The 37% rule still fails 63% of the time. Satisficing ('good enough') may produce better life outcomes." Applied: don't over-optimize strategy selection. A 60% win-rate strategy that ships reliably beats an 80% strategy that takes 3x longer to select.
193
+
194
+ ### 3.2 Ryan Carson's Code Factory Pattern
195
+
196
+ **Source:** Notion page (73a97e21)
197
+
198
+ Carson's production code-review pipeline provides a complementary pattern: **SHA-pinned review state.** His key lesson: "If you skip current-head SHA matching, you can merge a PR using stale 'clean' evidence."
199
+
200
+ Applied to MAB: When the judge evaluates two diffs, those diffs must be pinned to specific commit SHAs. If either agent pushes additional commits after the judge starts evaluating, the judgment is stale.
201
+
202
+ **Concrete fields to add to judge output:**
203
+ ```json
204
+ {
205
+ "sha_a": "abc1234",
206
+ "sha_b": "def5678",
207
+ "evaluated_at": "2026-02-22T15:30:00Z"
208
+ }
209
+ ```
210
+
211
+ This enables:
212
+ 1. Re-running the judge on historical runs (reproducibility)
213
+ 2. Detecting if an agent continued working after evaluation (staleness)
214
+ 3. Cherry-picking the exact winning state via `git checkout <sha>`
215
+
216
+ ### 3.3 Code Factory V2 Design
217
+
218
+ **Source:** Local plan at `docs/plans/2026-02-21-code-factory-v2-design.md` (referenced in Notion researcher results)
219
+
220
+ The Code Factory V2 design already contemplates three execution modes (headless, team, competitive) and establishes `sampling-outcomes.json` for learning which prompt variants win per batch type. The MAB system is the "competitive" mode made real.
221
+
222
+ V2's key data insight: **batch-type-aware routing** already tracks outcomes per (batch_type × prompt_variant). The MAB system adds a new axis: (batch_type × strategy × prompt_variant). The existing `sampling-outcomes.json` schema needs one additional field (`strategy`) to accommodate this.
223
+
224
+ ---
225
+
226
+ ## 4. Component-Level Recommendations
227
+
228
+ ### 4.1 Components to Keep (validated by research)
229
+
230
+ | Component | Research validation | Priority |
231
+ |-----------|-------------------|----------|
232
+ | **Judge agent** | LLM-as-Judge literature confirms pairwise comparison is best approach | P0 — highest value |
233
+ | **Two strategy prompts** | SWE-bench shows diverse candidates improve outcomes | P0 — required for judge |
234
+ | **Worktree isolation** | Standard practice; Carson's SHA-pinning adds rigor | P0 — required for parallel execution |
235
+ | **strategy-perf.json** | Thompson Sampling needs win/loss counts per arm | P0 — required for learning |
236
+ | **mab-lessons.json** | Captures judge reasoning for prompt evolution (SEW pattern) | P1 — enables Phase 2 |
237
+
238
+ ### 4.2 Components to Cut (invalidated or premature)
239
+
240
+ | Component | Original plan | Research finding | Recommendation |
241
+ |-----------|--------------|-------------------|----------------|
242
+ | **Standalone `mab-run.sh`** | 1,134 lines, separate script | 80% duplicates existing infrastructure | **Cut.** Build `lib/run-plan-mab.sh` (~200-300 lines) |
243
+ | **LLM planner agent** | Full LLM call to decide routing | Thompson Sampling does this in 15 lines of bash; LLM planner needs data that doesn't exist | **Defer to Phase 3** (50+ runs) |
244
+ | **`architecture-map.sh`** | Scans imports to produce module graph | Claude reads files natively; static analysis misses dynamic imports; maintenance burden exceeds value | **Cut entirely** |
245
+ | **`pull-community-lessons.sh`** | Fetches lessons from upstream remote | `git pull` already propagates lesson files; only `strategy-perf.json` merge needs custom handling | **Cut.** Document git-based workflow instead |
246
+ | **`promote-mab-lessons.sh`** | Auto-promotes patterns with 3+ occurrences | String-matching dedup fails (same lesson, different phrasing); Pinecone semantic dedup is better but premature | **Defer to Phase 3** |
247
+ | **Planner agent prompt** | `scripts/prompts/planner-agent.md` | No data to route on; Thompson Sampling replaces for Phase 1 | **Defer to Phase 3** |
248
+
249
+ ### 4.3 Components to Improve (research-informed changes)
250
+
251
+ | Component | Original design | Research improvement |
252
+ |-----------|----------------|---------------------|
253
+ | **Judge prompt** | 6 simultaneous evaluation dimensions | Start binary (winner + reasoning), add dimensions in phases |
254
+ | **Judge presentation** | Agent A always shown first | Randomize order; record which was shown first for bias analysis |
255
+ | **Strategy selection** | Fixed threshold (>70%, 10+ data points) | Thompson Sampling from Beta distribution; natural explore/exploit balance |
256
+ | **Strategy evolution** | Static strategies forever | Phase 2: extract winning behaviors from judge reasoning as new prompt variants (SEW pattern) |
257
+ | **Model variation** | Same model for both agents | Phase 2: extend `--sample` for heterogeneous models (SWE-bench TRAE pattern) |
258
+ | **Judge context** | No failure history | Inject `failure-patterns.json` data for this batch type into judge prompt |
259
+ | **Evaluation reproducibility** | No SHA tracking | Pin judge evaluation to commit SHAs (Carson's SHA discipline) |
260
+ | **Validation** | Trust judge from run 1 | Manually review first 10 decisions; compute agreement rate before automated routing |
261
+
262
+ ---
263
+
264
+ ## 5. Revised Implementation Phases
265
+
266
+ ### Phase 1: Judge + Orchestration (build now)
267
+
268
+ **Goal:** Produce the first MAB run data. Everything else depends on having real data.
269
+
270
+ **Files:**
271
+ - Create: `scripts/lib/run-plan-mab.sh` (~250 lines)
272
+ - Create: `scripts/prompts/judge-agent.md`
273
+ - Create: `scripts/prompts/agent-a-superpowers.md`
274
+ - Create: `scripts/prompts/agent-b-ralph.md`
275
+ - Modify: `scripts/run-plan.sh` (replace competitive stub with `--mode mab`)
276
+ - Modify: `scripts/lib/run-plan-context.sh` (inject MAB lessons)
277
+ - Runtime data: `logs/strategy-perf.json`, `logs/mab-lessons.json`, `logs/mab-run-<ts>.json`
278
+
279
+ **Judge design (Phase 1 — binary):**
280
+ ```
281
+ Input: Two diffs (randomized order), design doc, PRD, automated scores, failure history
282
+ Output: {"winner": "agent_a|agent_b", "confidence": "low|medium|high",
283
+ "reasoning": "2-3 sentences", "sha_a": "...", "sha_b": "...",
284
+ "presentation_order": "a_first|b_first"}
285
+ ```
286
+
287
+ **Routing design (Phase 1 — Thompson Sampling):**
288
+ ```bash
289
+ # 15 lines of bash, not an LLM call
290
+ wins_a=$(jq ".[\"$batch_type\"].superpowers.wins // 0" strategy-perf.json)
291
+ losses_a=$(jq ".[\"$batch_type\"].superpowers.losses // 0" strategy-perf.json)
292
+ wins_b=$(jq ".[\"$batch_type\"].ralph.wins // 0" strategy-perf.json)
293
+ losses_b=$(jq ".[\"$batch_type\"].ralph.losses // 0" strategy-perf.json)
294
+ sample_a=$(python3 -c "import random; print(random.betavariate($wins_a+1,$losses_a+1))")
295
+ sample_b=$(python3 -c "import random; print(random.betavariate($wins_b+1,$losses_b+1))")
296
+ # If samples within 0.1 of each other, MAB run (explore)
297
+ # Otherwise, route to higher sample (exploit)
298
+ ```
299
+
300
+ **Estimated effort:** 2 batches (down from 6)
301
+
302
+ ### Phase 2: Learning Loop (after 10+ runs)
303
+
304
+ **Goal:** The system gets smarter from its own data.
305
+
306
+ **New capabilities:**
307
+ - Thompson Sampling routing replaces MAB-everything default
308
+ - Prompt evolution: extract winning behaviors from judge reasoning → `logs/evolved-prompts.json`
309
+ - Heterogeneous model sampling: `--sample-models "sonnet,opus,haiku"`
310
+ - MAB lesson injection into batch context (enriched `generate_batch_context()`)
311
+ - Judge enrichment: add failure_mode and strategy_update fields
312
+
313
+ **Prerequisite:** 10+ completed MAB runs with manually validated judge decisions.
314
+
315
+ **Estimated effort:** 1 batch
316
+
317
+ ### Phase 3: Strategy Discovery (after 50+ runs, maybe never)
318
+
319
+ **Goal:** The system discovers strategies humans didn't design.
320
+
321
+ **New capabilities:**
322
+ - Strategy archive (ADAS pattern): judge proposes new strategy descriptions
323
+ - LLM planner agent for complex multi-factor routing
324
+ - Pinecone semantic dedup for lesson accumulation
325
+ - Community strategy data aggregation (merge `strategy-perf.json` across users)
326
+ - Auto-promotion of recurring lessons with semantic matching
327
+
328
+ **Prerequisite:** 50+ runs, validated learning loop, clear signal that current strategies plateau.
329
+
330
+ **Estimated effort:** 2 batches
331
+
332
+ ---
333
+
334
+ ## 6. Risk Analysis
335
+
336
+ | Risk | Likelihood | Impact | Mitigation |
337
+ |------|-----------|--------|------------|
338
+ | Judge produces inconsistent evaluations | High (first 10 runs) | Medium — bad data poisons routing | Manually validate first 10 decisions; don't enable automated routing until >80% agreement |
339
+ | Both agents produce similar output (low diversity) | Medium | High — MAB provides no value if candidates are identical | Phase 2: add model variation; monitor diff similarity between agents |
340
+ | Cost: 2x compute per MAB batch | Certain | Medium — doubles API spend | Thompson Sampling quickly converges, reducing MAB frequency; budget-aware routing |
341
+ | Position bias in judge | High (measured in literature) | Medium — systematically favors first-presented agent | Randomize order; log order in output; monitor win rates by presentation position |
342
+ | Strategy-perf.json data is sparse per batch type | High (early runs) | Low — Thompson Sampling handles sparse data gracefully via prior | Start with uniform prior Beta(1,1); don't route based on < 5 data points |
343
+ | Worktree merge conflicts | Low | Medium — winner branch may conflict with main | Judge should flag "both agents modified same files" as a risk signal |
344
+
345
+ ---
346
+
347
+ ## 7. Success Metrics
348
+
349
+ | Metric | Phase 1 target | Phase 2 target | Measurement |
350
+ |--------|---------------|---------------|-------------|
351
+ | MAB runs completed | 10 | 50 | Count of `logs/mab-run-*.json` files |
352
+ | Judge agreement with human review | >80% | >90% | Manual validation of first 10, spot-check after |
353
+ | Strategy differentiation | Agents produce measurably different diffs | Win rates diverge by batch type | Compare diff overlap between agents |
354
+ | Quality gate pass rate (winner) | >80% | >90% | `strategy-perf.json` aggregate |
355
+ | Routing accuracy (Phase 2+) | — | Thompson Sampling converges within 15 runs | Track cumulative regret vs oracle |
356
+ | Prompt evolution yield (Phase 2+) | — | 1 evolved variant per 5 runs | Count of `logs/evolved-prompts.json` entries |
357
+
358
+ ---
359
+
360
+ ## 8. Sources
361
+
362
+ ### Academic Literature
363
+ - [Multi-Armed Bandits Meet Large Language Models — IBM Research, AAAI 2026](https://research.ibm.com/publications/multi-armed-bandits-meet-large-language-models)
364
+ - [When AIs Judge AIs: Agent-as-a-Judge Evaluation for LLMs](https://arxiv.org/html/2508.02994v1)
365
+ - [Multi-Agent Debate for LLM Judges with Adaptive Stability Detection](https://arxiv.org/html/2510.12697v1)
366
+ - [In-Context Dueling Bandits with LLM Agents](https://aclanthology.org/2025.findings-acl.519.pdf)
367
+ - [Evaluation and Benchmarking of LLM Agents: A Survey](https://arxiv.org/html/2507.21504v1)
368
+ - [SEW: Self-Evolving Agentic Workflows for Automated Code Generation](https://arxiv.org/abs/2505.18646)
369
+ - [EvoAgentX: An Automated Framework for Evolving Agentic Workflows](https://github.com/EvoAgentX/EvoAgentX)
370
+ - [ADAS: Automated Design of Agentic Systems — ICLR 2025](https://github.com/ShengranHu/ADAS)
371
+ - [SWE-bench Leaderboard: Profiling Architectures of Agent-Based Repair Systems](https://arxiv.org/html/2506.17208v2)
372
+ - [SWE-EVO: Benchmarking Coding Agents in Long-Horizon Software Evolution](https://arxiv.org/html/2512.18470v1)
373
+ - [Multi-Agent Evolution Framework for Code Generation](https://medium.com/@tkadeethum/multi-agent-evolution-framework-a-self-improving-system-for-code-generation-02f8ddbf2ec9)
374
+
375
+ ### Practitioner Guides
376
+ - [Using LLM-as-a-Judge for Evaluation (Hamel Husain)](https://hamel.dev/blog/posts/llm-judge/)
377
+ - [LLM-as-a-Judge Evaluation: Complete Guide (Langfuse)](https://langfuse.com/docs/evaluation/evaluation-methods/llm-as-a-judge)
378
+ - [LLM-As-Judge: 7 Best Practices (Monte Carlo Data)](https://www.montecarlodata.com/blog-llm-as-judge/)
379
+ - [LLM-as-a-Judge Simply Explained (Confident AI)](https://www.confident-ai.com/blog/why-llm-as-a-judge-is-the-best-llm-evaluation-method)
380
+ - [Using LLM-as-a-Judge for Agent Outputs (Patronus AI)](https://www.patronus.ai/llm-testing/llm-as-a-judge)
381
+ - [Evaluating the Effectiveness of LLM-Evaluators (Eugene Yan)](https://eugeneyan.com/writing/llm-evaluators/)
382
+
383
+ ### Industry Reports
384
+ - [2026 Agentic Coding Trends Report (Anthropic)](https://resources.anthropic.com/hubfs/2026%20Agentic%20Coding%20Trends%20Report.pdf)
385
+ - [The Rise of AI Teammates in Software Engineering 3.0](https://arxiv.org/html/2507.15003v1)
386
+ - [Coding for the Agentic World (O'Reilly)](https://www.oreilly.com/AgenticWorld/)
387
+ - [Top Open-Source Autonomous Agents & Frameworks](https://cline.bot/blog/top-11-open-source-autonomous-agents-frameworks-in-2025)
388
+ - [Best AI Coding Agents for 2026 (Faros AI)](https://www.faros.ai/blog/best-ai-coding-agents-2026)
389
+
390
+ ### Notion Workspace
391
+ - Algorithms to Live By — Knowledge Hub (Gittins Index, explore/exploit, optimal stopping)
392
+ - Code Factory — Repo Setup for Agent-Driven Code Review (Ryan Carson's SHA-pinning pattern)
393
+ - Code Factory V2 Design (batch-type-aware routing, sampling-outcomes.json)
394
+
395
+ ### Codebase (autonomous-coding-toolkit)
396
+ - `scripts/run-plan.sh` — main runner (293 lines, 3 modes)
397
+ - `scripts/lib/run-plan-headless.sh` — serial batch execution with retry/sampling (344 lines)
398
+ - `scripts/lib/run-plan-team.sh` — parallel batch groups (191 lines)
399
+ - `scripts/lib/run-plan-scoring.sh` — candidate scoring, batch classification, prompt variants (147 lines)
400
+ - `scripts/lib/run-plan-context.sh` — per-batch context assembly within token budget (151 lines)
401
+ - `scripts/lib/run-plan-quality-gate.sh` — quality gate with test regression detection (129 lines)
402
+ - `scripts/lib/run-plan-state.sh` — JSON state persistence (99 lines)
403
+ - `scripts/lib/run-plan-prompt.sh` — batch prompt builder with cross-context (139 lines)
404
+ - `scripts/quality-gate.sh` — composite gate: validation + lessons + lint + ast-grep + tests + memory (231 lines)
405
+ - `docs/plans/2026-02-22-mab-run-design.md` — approved MAB design (445 lines)
406
+ - `docs/plans/2026-02-22-mab-run-plan.md` — original implementation plan (2042 lines, 6 batches, 26 tasks)
@@ -0,0 +1,240 @@
1
+ # Design: Marketplace-Ready Toolkit with Community Lesson Loop
2
+
3
+ **Date:** 2026-02-21
4
+ **Status:** Approved
5
+
6
+ ## Problem
7
+
8
+ The autonomous-coding-toolkit repo works as a standalone clone but isn't discoverable as a Claude Code plugin. It's missing marketplace manifests, commands are in the wrong directory, skills contain personal project references, and the lesson system is static — only the maintainer adds new checks.
9
+
10
+ ## Goals
11
+
12
+ 1. Make the repo installable via `/plugin install` from a marketplace
13
+ 2. Strip personal references so skills work for any project
14
+ 3. Add a community lesson contribution pipeline where every user's production failures improve every other user's agent
15
+ 4. Make the lesson-scanner and lesson-check.sh dynamic — new lessons are new checks, no code changes needed
16
+
17
+ ## Decisions
18
+
19
+ | Decision | Choice | Rationale |
20
+ |----------|--------|-----------|
21
+ | Ralph-loop handling | Merge into top level | Marketplace expects one plugin per repo |
22
+ | Personal references | Strip all | Makes toolkit truly generic and shareable |
23
+ | Distribution | Both self-hosted + official marketplace | Maximum reach |
24
+ | Lesson flow | GitHub PRs via `/submit-lesson` command | Low-tech, high-trust, maintainer curates |
25
+ | Lesson schema | Structured YAML frontmatter | Machine-parseable for automatic check generation |
26
+ | Automation | Semi-auto: command generates PR | User runs command, maintainer reviews and merges |
27
+ | Scanner design | Dynamic — reads lessons/ at scan time | Adding a lesson file = adding a check, no code changes |
28
+ | Attribution | Fork with attribution to superpowers | Clear credit in README and plugin.json |
29
+
30
+ ## Architecture
31
+
32
+ ### Community Lesson Loop
33
+
34
+ ```
35
+ User hits bug → captures lesson → /submit-lesson → PR → maintainer merges →
36
+ → lesson file added to docs/lessons/
37
+ → lesson-check.sh gains new grep pattern (if syntactic)
38
+ → lesson-scanner reads it dynamically at scan time (if semantic)
39
+ → every user's next scan catches that pattern
40
+ ```
41
+
42
+ ### Structured Lesson Schema
43
+
44
+ Each lesson is a markdown file in `docs/lessons/` with machine-parseable YAML frontmatter:
45
+
46
+ ```yaml
47
+ ---
48
+ id: 1
49
+ title: "Bare exception swallowing hides failures"
50
+ severity: blocker # blocker | should-fix | nice-to-have
51
+ languages: [python] # python | javascript | typescript | shell | all
52
+ category: silent-failures # async-traps | resource-lifecycle | silent-failures |
53
+ # integration-boundaries | test-anti-patterns | performance
54
+ pattern:
55
+ type: syntactic # syntactic (grep-detectable) | semantic (needs context)
56
+ regex: "except:\\s*$" # grep -P pattern (only for syntactic)
57
+ description: "bare except without logging"
58
+ fix: "Always log the exception before returning a fallback"
59
+ example:
60
+ bad: |
61
+ try:
62
+ result = api_call()
63
+ except:
64
+ return default_value
65
+ good: |
66
+ try:
67
+ result = api_call()
68
+ except Exception as e:
69
+ logger.error("API call failed", exc_info=True)
70
+ return default_value
71
+ ---
72
+
73
+ ## Observation
74
+ [What happened]
75
+
76
+ ## Insight
77
+ [Why it happened]
78
+
79
+ ## Lesson
80
+ [The rule to follow]
81
+ ```
82
+
83
+ Key properties:
84
+ - `pattern.type: syntactic` → auto-wired into `lesson-check.sh` (grep-based, <2s)
85
+ - `pattern.type: semantic` → picked up by lesson-scanner agent dynamically
86
+ - `severity` maps to BLOCKER/SHOULD-FIX/NICE-TO-HAVE report tiers
87
+ - `regex` is the machine-readable contract for enforcement
88
+
89
+ ### Dynamic Lesson Scanner
90
+
91
+ Rewrite `agents/lesson-scanner.md` to:
92
+
93
+ 1. Glob `docs/lessons/*.md`
94
+ 2. Parse YAML frontmatter from each
95
+ 3. Filter by language (match target project)
96
+ 4. For syntactic patterns: run grep with the `regex` field
97
+ 5. For semantic patterns: use `description` + `example` for contextual analysis
98
+ 6. Report using BLOCKER/SHOULD-FIX/NICE-TO-HAVE format
99
+
100
+ Current 6 hardcoded scan groups become starter lesson files.
101
+
102
+ ### Dynamic lesson-check.sh
103
+
104
+ Update to read syntactic patterns from lesson files:
105
+
106
+ ```bash
107
+ for lesson in docs/lessons/*.md; do
108
+ regex=$(parse_frontmatter_regex "$lesson")
109
+ if [ -n "$regex" ]; then
110
+ grep -Pn "$regex" "$target_files" && report_violation "$lesson"
111
+ fi
112
+ done
113
+ ```
114
+
115
+ Still <2s — grep overhead is negligible per pattern.
116
+
117
+ ### `/submit-lesson` Command
118
+
119
+ New command at `commands/submit-lesson.md`:
120
+
121
+ 1. Ask user to describe the bug (what happened, what it should have done)
122
+ 2. Identify anti-pattern and category
123
+ 3. Determine syntactic vs semantic
124
+ 4. If syntactic, generate grep regex and test against user's code
125
+ 5. Fill structured YAML frontmatter
126
+ 6. Write lesson file to `docs/lessons/NNNN-<slug>.md`
127
+ 7. Generate PR against toolkit repo via `gh`
128
+
129
+ ## Directory Structure (Target)
130
+
131
+ ```
132
+ autonomous-coding-toolkit/
133
+ ├── .claude-plugin/
134
+ │ ├── plugin.json # Plugin metadata
135
+ │ └── marketplace.json # Self-hosted marketplace config
136
+ ├── skills/ # 15 skills (generic, no personal refs)
137
+ │ ├── brainstorming/SKILL.md
138
+ │ ├── writing-plans/SKILL.md
139
+ │ ├── executing-plans/SKILL.md
140
+ │ ├── using-git-worktrees/SKILL.md
141
+ │ ├── subagent-driven-development/
142
+ │ │ ├── SKILL.md
143
+ │ │ ├── implementer-prompt.md
144
+ │ │ ├── spec-reviewer-prompt.md
145
+ │ │ └── code-quality-reviewer-prompt.md
146
+ │ ├── verification-before-completion/SKILL.md
147
+ │ ├── finishing-a-development-branch/SKILL.md
148
+ │ ├── test-driven-development/SKILL.md
149
+ │ ├── systematic-debugging/
150
+ │ │ ├── SKILL.md
151
+ │ │ ├── root-cause-tracing.md
152
+ │ │ ├── defense-in-depth.md
153
+ │ │ └── condition-based-waiting.md
154
+ │ ├── dispatching-parallel-agents/SKILL.md
155
+ │ ├── requesting-code-review/
156
+ │ │ ├── SKILL.md
157
+ │ │ └── code-reviewer.md
158
+ │ ├── receiving-code-review/SKILL.md
159
+ │ ├── writing-skills/SKILL.md
160
+ │ ├── using-superpowers/SKILL.md
161
+ │ └── verify/SKILL.md
162
+ ├── commands/ # Merged: .claude/commands/ + ralph-loop
163
+ │ ├── code-factory.md
164
+ │ ├── create-prd.md
165
+ │ ├── run-plan.md
166
+ │ ├── ralph-loop.md
167
+ │ ├── cancel-ralph.md
168
+ │ └── submit-lesson.md # NEW
169
+ ├── agents/
170
+ │ └── lesson-scanner.md # REWRITTEN: dynamic
171
+ ├── hooks/
172
+ │ ├── hooks.json
173
+ │ └── stop-hook.sh
174
+ ├── scripts/
175
+ │ ├── run-plan.sh
176
+ │ ├── lib/
177
+ │ ├── setup-ralph-loop.sh # Moved from plugins/
178
+ │ ├── quality-gate.sh
179
+ │ ├── lesson-check.sh # REWRITTEN: dynamic
180
+ │ ├── auto-compound.sh
181
+ │ ├── entropy-audit.sh
182
+ │ ├── analyze-report.sh
183
+ │ ├── batch-audit.sh
184
+ │ ├── batch-test.sh
185
+ │ └── tests/
186
+ ├── docs/
187
+ │ ├── ARCHITECTURE.md # Updated
188
+ │ ├── CONTRIBUTING.md # NEW: how to submit lessons
189
+ │ └── lessons/
190
+ │ ├── FRAMEWORK.md
191
+ │ ├── TEMPLATE.md # Updated to match schema
192
+ │ ├── 0001-bare-exception-swallowing.md
193
+ │ ├── 0002-async-def-without-await.md
194
+ │ ├── 0003-create-task-without-callback.md
195
+ │ ├── 0004-hardcoded-test-counts.md
196
+ │ ├── 0005-sqlite-without-closing.md
197
+ │ └── ... # Community-contributed
198
+ ├── examples/
199
+ │ ├── example-plan.md
200
+ │ └── example-prd.json
201
+ ├── CLAUDE.md # Updated for new structure
202
+ ├── README.md # Updated with community section + attribution
203
+ └── LICENSE
204
+ ```
205
+
206
+ ## Changes Summary
207
+
208
+ ### Create
209
+ - `.claude-plugin/plugin.json` — plugin manifest
210
+ - `.claude-plugin/marketplace.json` — self-hosted marketplace config
211
+ - `commands/submit-lesson.md` — community lesson submission command
212
+ - `docs/CONTRIBUTING.md` — contribution guide
213
+ - `docs/lessons/0001-*.md` through `0005-*.md` — starter lessons (from current hardcoded checks)
214
+
215
+ ### Move
216
+ - `.claude/commands/*.md` → `commands/`
217
+ - `plugins/ralph-loop/commands/*.md` → `commands/`
218
+ - `plugins/ralph-loop/hooks/` → `hooks/`
219
+ - `plugins/ralph-loop/scripts/setup-ralph-loop.sh` → `scripts/`
220
+
221
+ ### Delete
222
+ - `plugins/` directory (fully merged)
223
+ - `.claude/commands/` directory (moved)
224
+
225
+ ### Rewrite
226
+ - `agents/lesson-scanner.md` — dynamic, reads lessons/ at scan time
227
+ - `scripts/lesson-check.sh` — dynamic, reads syntactic patterns from lesson files
228
+ - `docs/lessons/TEMPLATE.md` — updated to structured YAML schema
229
+
230
+ ### Update
231
+ - All 15 skills — strip personal references, add version to frontmatter
232
+ - `CLAUDE.md` — updated paths for new structure
233
+ - `README.md` — attribution, marketplace install, community section
234
+ - `docs/ARCHITECTURE.md` — updated for community lesson loop
235
+
236
+ ## Attribution
237
+
238
+ README acknowledgment:
239
+ - Core skill chain forked from [superpowers](https://github.com/obra/superpowers) by Jesse Vincent / Anthropic
240
+ - Custom additions: quality gate pipeline, headless execution, ralph-loop, lesson framework, dynamic lesson scanner, community contribution pipeline