@event4u/agent-config 6.0.0 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (378) hide show
  1. package/.claude-plugin/marketplace.json +5 -5
  2. package/CHANGELOG.md +167 -440
  3. package/README.md +3 -3
  4. package/dist/agent-src/commands/agent-handoff.md +5 -4
  5. package/dist/agent-src/commands/agent-status.md +1 -0
  6. package/dist/agent-src/commands/agents/audit.md +1 -0
  7. package/dist/agent-src/commands/agents/init.md +3 -0
  8. package/dist/agent-src/commands/agents/optimize.md +1 -0
  9. package/dist/agent-src/commands/agents/user/accept.md +1 -0
  10. package/dist/agent-src/commands/agents/user/init.md +1 -0
  11. package/dist/agent-src/commands/agents/user/review.md +1 -0
  12. package/dist/agent-src/commands/agents/user/show.md +1 -0
  13. package/dist/agent-src/commands/agents/user/update.md +1 -0
  14. package/dist/agent-src/commands/agents/user.md +1 -0
  15. package/dist/agent-src/commands/agents.md +1 -0
  16. package/dist/agent-src/commands/analytics/prune.md +3 -2
  17. package/dist/agent-src/commands/analytics/show.md +3 -2
  18. package/dist/agent-src/commands/analytics.md +3 -2
  19. package/dist/agent-src/commands/analyze-reference-repo.md +1 -0
  20. package/dist/agent-src/commands/bug-fix.md +1 -0
  21. package/dist/agent-src/commands/bug-investigate.md +1 -0
  22. package/dist/agent-src/commands/challenge-me/vision.md +3 -2
  23. package/dist/agent-src/commands/challenge-me/with-docs.md +3 -2
  24. package/dist/agent-src/commands/challenge-me.md +3 -2
  25. package/dist/agent-src/commands/chat-history/import.md +9 -9
  26. package/dist/agent-src/commands/chat-history.md +32 -30
  27. package/dist/agent-src/commands/check-current-md.md +1 -0
  28. package/dist/agent-src/commands/commit/in-chunks.md +1 -0
  29. package/dist/agent-src/commands/commit.md +1 -0
  30. package/dist/agent-src/commands/condense.md +1 -0
  31. package/dist/agent-src/commands/context/create.md +1 -0
  32. package/dist/agent-src/commands/context/refactor.md +1 -0
  33. package/dist/agent-src/commands/context.md +1 -0
  34. package/dist/agent-src/commands/cost-report.md +5 -4
  35. package/dist/agent-src/commands/council/analysis.md +3 -2
  36. package/dist/agent-src/commands/council/debate.md +5 -4
  37. package/dist/agent-src/commands/council/default.md +3 -2
  38. package/dist/agent-src/commands/council/design.md +3 -2
  39. package/dist/agent-src/commands/council/optimize.md +3 -2
  40. package/dist/agent-src/commands/council/pr.md +3 -2
  41. package/dist/agent-src/commands/council.md +4 -3
  42. package/dist/agent-src/commands/e2e-heal.md +1 -0
  43. package/dist/agent-src/commands/e2e-plan.md +1 -0
  44. package/dist/agent-src/commands/estimate-ticket.md +1 -0
  45. package/dist/agent-src/commands/feature/dev.md +1 -0
  46. package/dist/agent-src/commands/feature/explore.md +1 -0
  47. package/dist/agent-src/commands/feature/plan.md +6 -6
  48. package/dist/agent-src/commands/feature/refactor.md +1 -0
  49. package/dist/agent-src/commands/feature/roadmap.md +1 -0
  50. package/dist/agent-src/commands/feature.md +1 -0
  51. package/dist/agent-src/commands/fix/ci.md +1 -0
  52. package/dist/agent-src/commands/fix/portability.md +1 -0
  53. package/dist/agent-src/commands/fix/pr-comments.md +147 -15
  54. package/dist/agent-src/commands/fix/refs.md +1 -0
  55. package/dist/agent-src/commands/fix/seeder.md +1 -0
  56. package/dist/agent-src/commands/fix.md +8 -8
  57. package/dist/agent-src/commands/ghostwriter/delete.md +1 -0
  58. package/dist/agent-src/commands/ghostwriter/fetch.md +1 -0
  59. package/dist/agent-src/commands/ghostwriter/list.md +1 -0
  60. package/dist/agent-src/commands/ghostwriter/show.md +1 -0
  61. package/dist/agent-src/commands/ghostwriter/write.md +1 -0
  62. package/dist/agent-src/commands/ghostwriter.md +1 -0
  63. package/dist/agent-src/commands/grill-me.md +3 -2
  64. package/dist/agent-src/commands/image/analyse.md +1 -0
  65. package/dist/agent-src/commands/image/create.md +1 -0
  66. package/dist/agent-src/commands/image/verify.md +1 -0
  67. package/dist/agent-src/commands/image.md +1 -0
  68. package/dist/agent-src/commands/implement-ticket.md +1 -0
  69. package/dist/agent-src/commands/jira-ticket.md +1 -0
  70. package/dist/agent-src/commands/judge/on-diff.md +1 -0
  71. package/dist/agent-src/commands/judge/solo.md +1 -0
  72. package/dist/agent-src/commands/judge/steps.md +1 -0
  73. package/dist/agent-src/commands/judge.md +1 -0
  74. package/dist/agent-src/commands/knowledge/cross-repo.md +1 -0
  75. package/dist/agent-src/commands/knowledge/forget.md +1 -0
  76. package/dist/agent-src/commands/knowledge/ingest.md +1 -0
  77. package/dist/agent-src/commands/knowledge/list.md +1 -0
  78. package/dist/agent-src/commands/knowledge.md +1 -0
  79. package/dist/agent-src/commands/memory/add.md +8 -6
  80. package/dist/agent-src/commands/memory/learn-low-impact.md +3 -2
  81. package/dist/agent-src/commands/memory/load.md +7 -7
  82. package/dist/agent-src/commands/memory/mine-session.md +39 -12
  83. package/dist/agent-src/commands/memory/promote.md +3 -2
  84. package/dist/agent-src/commands/memory/propose.md +7 -6
  85. package/dist/agent-src/commands/memory.md +3 -2
  86. package/dist/agent-src/commands/mode.md +1 -0
  87. package/dist/agent-src/commands/module/create.md +1 -0
  88. package/dist/agent-src/commands/module/explore.md +1 -0
  89. package/dist/agent-src/commands/module.md +1 -0
  90. package/dist/agent-src/commands/optimize/agents-dir.md +1 -0
  91. package/dist/agent-src/commands/optimize/augmentignore.md +1 -0
  92. package/dist/agent-src/commands/optimize/rtk.md +1 -0
  93. package/dist/agent-src/commands/optimize/skills.md +1 -0
  94. package/dist/agent-src/commands/optimize-prompt.md +1 -0
  95. package/dist/agent-src/commands/optimize.md +1 -0
  96. package/dist/agent-src/commands/orchestrate.md +1 -0
  97. package/dist/agent-src/commands/override/create.md +1 -0
  98. package/dist/agent-src/commands/override/manage.md +1 -0
  99. package/dist/agent-src/commands/override.md +1 -0
  100. package/dist/agent-src/commands/package-reset.md +1 -0
  101. package/dist/agent-src/commands/package-test.md +1 -0
  102. package/dist/agent-src/commands/post-as/ghostwriter.md +1 -0
  103. package/dist/agent-src/commands/post-as/me.md +1 -0
  104. package/dist/agent-src/commands/post-as.md +1 -0
  105. package/dist/agent-src/commands/pr/create/description-only.md +1 -0
  106. package/dist/agent-src/commands/pr/create.md +25 -0
  107. package/dist/agent-src/commands/prediction-pool.md +1 -0
  108. package/dist/agent-src/commands/prepare-for-review.md +1 -0
  109. package/dist/agent-src/commands/profile/activate.md +1 -0
  110. package/dist/agent-src/commands/profile/deactivate.md +1 -0
  111. package/dist/agent-src/commands/profile/show.md +1 -0
  112. package/dist/agent-src/commands/profile.md +1 -0
  113. package/dist/agent-src/commands/project-analyze.md +1 -0
  114. package/dist/agent-src/commands/project-health.md +1 -0
  115. package/dist/agent-src/commands/quality-fix.md +1 -0
  116. package/dist/agent-src/commands/refine-ticket.md +1 -0
  117. package/dist/agent-src/commands/research/deep.md +1 -0
  118. package/dist/agent-src/commands/research/report.md +1 -0
  119. package/dist/agent-src/commands/research.md +1 -0
  120. package/dist/agent-src/commands/review-changes.md +1 -0
  121. package/dist/agent-src/commands/review-routing.md +1 -0
  122. package/dist/agent-src/commands/roadmap/ai-council.md +1 -0
  123. package/dist/agent-src/commands/roadmap/create.md +1 -0
  124. package/dist/agent-src/commands/roadmap/process-full.md +1 -0
  125. package/dist/agent-src/commands/roadmap/process-phase.md +1 -0
  126. package/dist/agent-src/commands/roadmap/process-step.md +1 -0
  127. package/dist/agent-src/commands/roadmap.md +1 -0
  128. package/dist/agent-src/commands/rule-compliance-audit.md +1 -0
  129. package/dist/agent-src/commands/security-audit-config.md +84 -0
  130. package/dist/agent-src/commands/set-cost-profile.md +1 -0
  131. package/dist/agent-src/commands/skill/preview.md +1 -0
  132. package/dist/agent-src/commands/skill.md +1 -0
  133. package/dist/agent-src/commands/skills/discover.md +1 -0
  134. package/dist/agent-src/commands/skills.md +1 -0
  135. package/dist/agent-src/commands/sync-agent-settings.md +1 -0
  136. package/dist/agent-src/commands/sync-gitignore/fix.md +1 -0
  137. package/dist/agent-src/commands/sync-gitignore.md +1 -0
  138. package/dist/agent-src/commands/tests/create.md +1 -0
  139. package/dist/agent-src/commands/tests/execute.md +1 -0
  140. package/dist/agent-src/commands/tests.md +1 -0
  141. package/dist/agent-src/commands/threat-model.md +1 -0
  142. package/dist/agent-src/commands/update-form-request-messages.md +1 -0
  143. package/dist/agent-src/commands/upstream-contribute.md +1 -0
  144. package/dist/agent-src/commands/video/from-script.md +1 -0
  145. package/dist/agent-src/commands/video/from-song.md +1 -0
  146. package/dist/agent-src/commands/video/scene.md +1 -0
  147. package/dist/agent-src/commands/video/stitch.md +1 -0
  148. package/dist/agent-src/commands/video/storyboard.md +1 -0
  149. package/dist/agent-src/commands/video.md +1 -0
  150. package/dist/agent-src/commands/work.md +1 -0
  151. package/dist/agent-src/contexts/augment-infrastructure.md +1 -1
  152. package/dist/agent-src/contexts/communication/rules-auto/skill-quality-mechanics.md +1 -1
  153. package/dist/agent-src/contexts/communication/rules-auto/slash-command-routing-policy-mechanics.md +2 -2
  154. package/dist/agent-src/contexts/communication/rules-auto/think-before-action-mechanics.md +6 -6
  155. package/dist/agent-src/contexts/contracts/consumer-agents-md-guide.md +2 -2
  156. package/dist/agent-src/contexts/execution/rdp-gate.md +75 -0
  157. package/dist/agent-src/contexts/subagent-configuration.md +1 -0
  158. package/dist/agent-src/personas/advisors/contrarian.md +1 -1
  159. package/dist/agent-src/personas/advisors/executor.md +1 -1
  160. package/dist/agent-src/personas/advisors/expansionist.md +1 -1
  161. package/dist/agent-src/personas/advisors/first-principles.md +1 -1
  162. package/dist/agent-src/personas/advisors/outsider.md +1 -1
  163. package/dist/agent-src/rules/autonomous-execution.md +12 -0
  164. package/dist/agent-src/rules/external-reference-deep-dive.md +1 -1
  165. package/dist/agent-src/rules/git-history-discipline.md +47 -1
  166. package/dist/agent-src/rules/improve-before-implement.md +12 -0
  167. package/dist/agent-src/rules/lethal-trifecta-guard.md +80 -0
  168. package/dist/agent-src/rules/no-pr-progress-comments.md +3 -4
  169. package/dist/agent-src/rules/notes-first-reasoning.md +71 -0
  170. package/dist/agent-src/rules/roadmap-progress-sync.md +48 -31
  171. package/dist/agent-src/rules/security-sensitive-stop.md +14 -1
  172. package/dist/agent-src/rules/source-confidentiality.md +97 -0
  173. package/dist/agent-src/rules/think-before-action.md +9 -1
  174. package/dist/agent-src/rules/untrusted-input-defense.md +76 -0
  175. package/dist/agent-src/scripts/archive_completed_roadmaps.py +171 -0
  176. package/dist/agent-src/skills/adversarial-review/SKILL.md +14 -0
  177. package/dist/agent-src/skills/agent-security-review/SKILL.md +113 -0
  178. package/dist/agent-src/skills/agent-security-review/evals/triggers.json +51 -0
  179. package/dist/agent-src/skills/ai-council/SKILL.md +3 -3
  180. package/dist/agent-src/skills/async-python-patterns/SKILL.md +1 -1
  181. package/dist/agent-src/skills/blast-radius-analyzer/SKILL.md +12 -11
  182. package/dist/agent-src/skills/command-routing/SKILL.md +1 -1
  183. package/dist/agent-src/skills/complexity-first-planning/SKILL.md +96 -0
  184. package/dist/agent-src/skills/complexity-first-planning/evals/triggers.json +16 -0
  185. package/dist/agent-src/skills/copilot-config/SKILL.md +3 -4
  186. package/dist/agent-src/skills/defense-in-depth/SKILL.md +1 -1
  187. package/dist/agent-src/skills/developer-like-execution/SKILL.md +5 -4
  188. package/dist/agent-src/skills/error-handling-patterns/SKILL.md +1 -1
  189. package/dist/agent-src/skills/feature-planning/SKILL.md +2 -2
  190. package/dist/agent-src/skills/mcp-builder/SKILL.md +1 -1
  191. package/dist/agent-src/skills/memory-consolidation/SKILL.md +63 -17
  192. package/dist/agent-src/skills/prompt-engineering-patterns/SKILL.md +1 -1
  193. package/dist/agent-src/skills/readme-writing-package/SKILL.md +1 -1
  194. package/dist/agent-src/skills/reasoning-orchestrator/SKILL.md +119 -0
  195. package/dist/agent-src/skills/reasoning-orchestrator/evals/triggers.json +16 -0
  196. package/dist/agent-src/skills/receiving-code-review/SKILL.md +6 -6
  197. package/dist/agent-src/skills/refine-prompt/SKILL.md +1 -1
  198. package/dist/agent-src/skills/refine-ticket/SKILL.md +1 -1
  199. package/dist/agent-src/skills/repomix-packer/SKILL.md +1 -1
  200. package/dist/agent-src/skills/secrets-management/SKILL.md +1 -1
  201. package/dist/agent-src/skills/subagent-orchestration/SKILL.md +10 -3
  202. package/dist/agent-src/skills/testing-anti-patterns/SKILL.md +1 -1
  203. package/dist/agent-src/skills/testing-anti-patterns/process-anti-patterns.md +1 -1
  204. package/dist/agent-src/skills/token-optimizer/SKILL.md +1 -1
  205. package/dist/agent-src/templates/agents/.gitattributes.fragment +0 -1
  206. package/dist/agent-src/templates/agents/agent-project-settings.example.yml +4 -4
  207. package/dist/agent-src/templates/scripts/check_memory.py +1 -2
  208. package/dist/agent-src/templates/scripts/check_memory_proposal.py +1 -1
  209. package/dist/agent-src/templates/scripts/memory_lookup.py +148 -289
  210. package/dist/agent-src/templates/scripts/memory_report.py +132 -2
  211. package/dist/agent-src/templates/scripts/memory_signal.py +7 -9
  212. package/dist/agent-src/templates/scripts/memory_status.py +25 -206
  213. package/dist/agent-src/templates/scripts/work_engine/directives/backend/memory.py +6 -6
  214. package/dist/agent-src/templates/scripts/work_engine/directives/ui/_passthrough.py +3 -3
  215. package/dist/agent-src/templates/scripts/work_engine/scoring/memory_visibility.py +0 -1
  216. package/dist/cli/agent-config.js +31 -300
  217. package/dist/cli/agent-config.js.map +1 -1
  218. package/dist/cli/commands/commands.js +10 -5
  219. package/dist/cli/commands/commands.js.map +1 -1
  220. package/dist/cli/discovery/loadManifest.js.map +1 -1
  221. package/dist/cli/main.js +309 -0
  222. package/dist/cli/main.js.map +1 -0
  223. package/dist/discovery/deprecation-report.md +1 -1
  224. package/dist/discovery/discovery-manifest.json +645 -342
  225. package/dist/discovery/discovery-manifest.json.sha256 +1 -1
  226. package/dist/discovery/discovery-manifest.summary.md +8 -5
  227. package/dist/discovery/orphan-report.md +1 -1
  228. package/dist/discovery/packs.json +149 -37
  229. package/dist/discovery/trust-report.md +3 -3
  230. package/dist/discovery/workspaces.json +61 -36
  231. package/dist/mcp/registry-manifest.json +4 -4
  232. package/dist/router.json +1 -1
  233. package/dist/server/routes/wizard.js +4 -3
  234. package/dist/server/routes/wizard.js.map +1 -1
  235. package/dist/server/schemas/settings.js +18 -0
  236. package/dist/server/schemas/settings.js.map +1 -1
  237. package/docs/MIGRATION.md +1 -1
  238. package/docs/adrs/cost/0001-hard-stop-hook.md +5 -5
  239. package/docs/adrs/memory/0001-consumer-side-snapshot.md +15 -7
  240. package/docs/adrs/memory/README.md +6 -5
  241. package/docs/adrs/router/0001-three-tier-routing.md +2 -2
  242. package/docs/adrs/schema/0001-json-schema-frontmatter.md +2 -2
  243. package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +5 -5
  244. package/docs/adrs/telegraph/0001-default-off-until-bench.md +3 -3
  245. package/docs/architecture.md +9 -9
  246. package/docs/archive/CHANGELOG-pre-2.2.0.md +30 -30
  247. package/docs/archive/CHANGELOG-pre-2.25.0.md +1 -1
  248. package/docs/archive/CHANGELOG-pre-4.5.0.md +1 -1
  249. package/docs/archive/CHANGELOG-pre-6.0.0.md +473 -0
  250. package/docs/benchmark.md +54 -53
  251. package/docs/benchmarks.md +2 -2
  252. package/docs/case-studies/{frontend-design-vs-ui-ux-pro-max.md → frontend-design-positioning.md} +4 -4
  253. package/docs/catalog.md +20 -13
  254. package/docs/command-flows.md +90 -92
  255. package/docs/contracts/adr-layout.md +2 -3
  256. package/docs/contracts/adr-level-6-productization.md +1 -1
  257. package/docs/contracts/ai-council-config.md +42 -7
  258. package/docs/contracts/command-clusters.md +1 -1
  259. package/docs/contracts/cost-enforcement.md +1 -1
  260. package/docs/contracts/cost-summary-schema.md +1 -1
  261. package/docs/contracts/daily-workspace.md +1 -0
  262. package/docs/contracts/discovery-manifest.schema.json +4 -2
  263. package/docs/contracts/explain-modes.md +1 -1
  264. package/docs/contracts/implement-ticket-flow.md +6 -7
  265. package/docs/contracts/mcp-tool-inventory.md +10 -10
  266. package/docs/contracts/measurement-baseline.md +1 -1
  267. package/docs/contracts/memory-visibility-v1.md +1 -5
  268. package/docs/contracts/namespace.md +1 -1
  269. package/docs/contracts/persona-schema.md +1 -1
  270. package/docs/contracts/rule-interactions.md +1 -1
  271. package/docs/contracts/smoke-contracts.md +1 -1
  272. package/docs/contracts/universal-skills.md +0 -1
  273. package/docs/contracts/workspace-boundary.md +84 -0
  274. package/docs/customization.md +3 -3
  275. package/docs/decisions/ADR-009-event4u-namespace.md +1 -1
  276. package/docs/decisions/ADR-013-discovery-frontmatter-contract.md +1 -1
  277. package/docs/decisions/ADR-026-explain-mode-translation.md +1 -1
  278. package/docs/decisions/ADR-088-no-external-runtime-federation.md +26 -27
  279. package/docs/decisions/ADR-090-visibility-command-frontmatter-field.md +95 -0
  280. package/docs/decisions/ADR-091-split-meta-capability-packs.md +113 -0
  281. package/docs/decisions/ADR-092-defer-command-tier-alias-removal.md +93 -0
  282. package/docs/decisions/ADR-093-ai-council-config-user-global.md +111 -0
  283. package/docs/decisions/ADR-094-agent-memory-layer-removal.md +94 -0
  284. package/docs/decisions/ADR-095-workspace-boundary-contract.md +108 -0
  285. package/docs/decisions/INDEX.md +6 -0
  286. package/docs/development.md +5 -7
  287. package/docs/getting-started.md +4 -4
  288. package/docs/guidelines/agent-infra/5w2h-analysis.md +1 -1
  289. package/docs/guidelines/agent-infra/comparison-matrix.md +1 -1
  290. package/docs/guidelines/agent-infra/corpus-grounding-authoring.md +1 -1
  291. package/docs/guidelines/agent-infra/critical-thinking.md +1 -1
  292. package/docs/guidelines/agent-infra/engineering-memory-data-format.md +1 -5
  293. package/docs/guidelines/agent-infra/first-principles.md +1 -1
  294. package/docs/guidelines/agent-infra/frontier-reasoning-operating-profile.md +164 -0
  295. package/docs/guidelines/agent-infra/inversion-thinking.md +1 -1
  296. package/docs/guidelines/agent-infra/ios-simulator-guide.md +9 -14
  297. package/docs/guidelines/agent-infra/mcp-request-signing.md +19 -22
  298. package/docs/guidelines/agent-infra/memory-access.md +25 -31
  299. package/docs/guidelines/agent-infra/mental-models.md +1 -1
  300. package/docs/guidelines/agent-infra/model-recommendation.md +29 -0
  301. package/docs/guidelines/agent-infra/scqa-framework.md +3 -3
  302. package/docs/guidelines/agent-infra/security-lint-containment.md +81 -0
  303. package/docs/guidelines/agent-infra/six-hats.md +1 -1
  304. package/docs/guidelines/agent-infra/systems-thinking.md +1 -1
  305. package/docs/guidelines/agent-infra/untrusted-input-spotlighting.md +72 -0
  306. package/docs/installation.md +1 -1
  307. package/docs/mcp.md +2 -2
  308. package/docs/parity/{bench-ruflo.json → bench-external.json} +10 -10
  309. package/docs/parity/{ruflo.md → external-runtime.md} +9 -9
  310. package/docs/quality.md +3 -3
  311. package/docs/safety.md +3 -3
  312. package/docs/skills-catalog.md +4 -1
  313. package/llms.txt +3 -0
  314. package/package.json +1 -1
  315. package/src/config/agent-settings.template.yml +65 -3
  316. package/src/config/discovery/packs.yml +29 -0
  317. package/src/config/discovery/workspaces.yml +3 -1
  318. package/src/config/gitignore-block.txt +6 -0
  319. package/src/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
  320. package/src/scripts/_cli/cmd_doctor.py +99 -13
  321. package/src/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
  322. package/src/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
  323. package/src/scripts/_lib/bench_ab_scoring_v2.py +227 -0
  324. package/src/scripts/_lib/global_deploy_inventory.py +39 -9
  325. package/src/scripts/_lib/link_crypto.py +206 -0
  326. package/src/scripts/_lib/security_lint.py +228 -0
  327. package/src/scripts/ai_council/clients.py +2 -2
  328. package/src/scripts/ai_council/config.py +55 -0
  329. package/src/scripts/audit_adr_coverage.py +0 -2
  330. package/src/scripts/audit_command_surface.py +18 -5
  331. package/src/scripts/audit_mcp_tools.py +2 -2
  332. package/src/scripts/audit_skill_descriptions.py +2 -2
  333. package/src/scripts/bench_ab_clone.py +62 -12
  334. package/src/scripts/bench_ab_task_runner.py +475 -30
  335. package/src/scripts/bench_ab_v2_run.py +247 -0
  336. package/src/scripts/bench_ab_v2_stats.py +347 -0
  337. package/src/scripts/bench_run.py +1 -1
  338. package/src/scripts/build_discovery_manifest.py +10 -0
  339. package/src/scripts/check_bite_sized_granularity.py +1 -2
  340. package/src/scripts/check_memory.py +49 -63
  341. package/src/scripts/check_memory_proposal.py +1 -1
  342. package/src/scripts/check_no_external_sources.py +101 -0
  343. package/src/scripts/check_references.py +2 -0
  344. package/src/scripts/cost_by_conversation.py +1 -1
  345. package/src/scripts/council_cli.py +28 -14
  346. package/src/scripts/external_sources_denylist.json +91 -0
  347. package/src/scripts/hook_manifest.yaml +14 -6
  348. package/src/scripts/injection_scan_hook.py +145 -0
  349. package/src/scripts/install-hooks.sh +11 -0
  350. package/src/scripts/install.py +88 -13
  351. package/src/scripts/lint_agent_security.py +112 -0
  352. package/src/scripts/lint_bench_ab.py +5 -4
  353. package/src/scripts/lint_command_tiers.py +63 -22
  354. package/src/scripts/lint_discovery_vocabulary.py +2 -0
  355. package/src/scripts/lint_empty_roadmaps.py +80 -0
  356. package/src/scripts/lint_hidden_unicode.py +132 -0
  357. package/src/scripts/lint_instruction_smuggling.py +107 -0
  358. package/src/scripts/lint_marketplace.py +1 -1
  359. package/src/scripts/lint_mcp_config_security.py +124 -0
  360. package/src/scripts/lint_skill_frontmatter_safety.py +144 -0
  361. package/src/scripts/lint_workspace_boundary.py +122 -0
  362. package/src/scripts/mcp_server/consumer_tool_catalog.json +2 -3
  363. package/src/scripts/mcp_server/tools.py +8 -32
  364. package/src/scripts/memory_lookup.py +27 -296
  365. package/src/scripts/memory_report.py +1 -23
  366. package/src/scripts/memory_signal.py +6 -53
  367. package/src/scripts/memory_status.py +25 -206
  368. package/src/scripts/mine_session.py +118 -41
  369. package/src/scripts/pack_dependency_allowlist.json +2 -2
  370. package/src/scripts/render_benchmark_md.py +141 -52
  371. package/src/scripts/schemas/command.schema.json +6 -1
  372. package/src/scripts/security_audit_config.py +153 -0
  373. package/dist/agent-src/commands/chat-history/learn.md +0 -184
  374. package/dist/agent-src/commands/chat-history/show.md +0 -113
  375. package/dist/agent-src/commands/fix/pr-bot-comments.md +0 -157
  376. package/dist/agent-src/commands/fix/pr-developer-comments.md +0 -163
  377. package/dist/agent-src/templates/agents/memory/architecture-decisions.example.yml +0 -95
  378. package/docs/contracts/agent-memory-contract.md +0 -159
@@ -38,6 +38,7 @@ import os
38
38
  import shutil
39
39
  import subprocess
40
40
  import sys
41
+ import threading
41
42
  import time
42
43
  from datetime import datetime, timezone
43
44
  from pathlib import Path
@@ -61,6 +62,46 @@ REPORTS_DIR = REPO_ROOT / "internal" / "bench" / "reports" / "ab"
61
62
  # How far we descend into a clone when snapshotting. The fixture is shallow.
62
63
  SNAPSHOT_MAX_DEPTH = 6
63
64
 
65
+ # --- Activation (proven mechanism) ---
66
+ # agent-config is a GLOBAL Claude Code plugin (enabledPlugins in ~/.claude
67
+ # settings), so plain `claude --print` already runs WITH the package. The clean
68
+ # control is `--setting-sources project,local`, which excludes the user settings
69
+ # where `enabledPlugins` lives → plugin OFF, but auth survives. Measured proof:
70
+ # plain --print = ~35.5k input tokens; --setting-sources project,local = ~11.9k
71
+ # → the ~24k delta IS the package's always-on footprint. So:
72
+ # without = `--setting-sources project,local` (plugin OFF, base model)
73
+ # with = plain `--print` (the real installed plugin = package)
74
+ # with-rdp = plain `--print` + RDP rules injected (RDP not yet in the release plugin)
75
+ # (`--bare` is NOT used — it disables auth too.)
76
+ RDP_EXTRA_FILES = (
77
+ REPO_ROOT / "src" / "rules" / "notes-first-reasoning.md",
78
+ REPO_ROOT / "src" / "agent-src" / "contexts" / "execution" / "rdp-gate.md",
79
+ )
80
+
81
+
82
+ def _concat_rules(paths) -> str:
83
+ parts: list[str] = []
84
+ for p in paths:
85
+ try:
86
+ parts.append(p.read_text(encoding="utf-8"))
87
+ except OSError:
88
+ continue
89
+ return "\n\n---\n\n".join(parts)
90
+
91
+
92
def system_prompt_for(variant: str) -> str | None:
    """Return the extra rule text to inject for ``variant``, if any.

    Only the ``with-rdp`` variant injects anything: it gets the concatenated
    contents of the ``RDP_EXTRA_FILES`` entries that exist on disk. Every
    other variant (``with`` uses the real plugin, ``without`` runs
    plugin-off) yields ``None``.
    """
    if variant != "with-rdp":
        return None
    present = [rule_file for rule_file in RDP_EXTRA_FILES if rule_file.exists()]
    return _concat_rules(present)
99
+
100
+
101
def setting_sources_for(variant: str) -> str | None:
    """Return the setting-sources value for ``variant``.

    The ``without`` variant gets ``"project,local"``, which leaves out the
    user settings so the global plugin is dropped while auth survives. Any
    other variant gets ``None``.
    """
    if variant == "without":
        return "project,local"
    return None
104
+
64
105
 
65
106
  def utc_stamp() -> str:
66
107
  return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")
@@ -106,7 +147,7 @@ def reset_clone(variant: str) -> Path:
106
147
  raise RuntimeError("cannot load bench_ab_clone helper")
107
148
  module = importlib.util.module_from_spec(spec)
108
149
  spec.loader.exec_module(module)
109
- return module.clone(variant, refresh=True) # type: ignore[attr-defined]
150
+ return module.clone(variant, refresh=True, quiet=True) # type: ignore[attr-defined]
110
151
 
111
152
 
112
153
  def claude_executable() -> str | None:
@@ -114,13 +155,28 @@ def claude_executable() -> str | None:
114
155
  override = os.environ.get("CLAUDE_CLI")
115
156
  if override:
116
157
  return override
117
- if shutil.which("claude") is not None:
118
- return "claude"
119
- return None
158
+ # Resolve to an absolute path so the subprocess (run with cwd=clone_root)
159
+ # cannot miss it on a PATH/cwd quirk — the failure that showed up as a
160
+ # spurious "claude CLI not found" on a later arm of the first full run.
161
+ return shutil.which("claude")
120
162
 
121
163
 
122
- def run_live(task: dict, clone_root: Path, *, timeout_s: int) -> dict:
123
- """Invoke claude in print/one-shot mode against the task prompt."""
164
+ def run_live(
165
+ task: dict,
166
+ clone_root: Path,
167
+ *,
168
+ timeout_s: int,
169
+ sysprompt_file: "Path | None" = None,
170
+ setting_sources: "str | None" = None,
171
+ max_budget: "float | None" = None,
172
+ model: "str | None" = None,
173
+ ) -> dict:
174
+ """Invoke claude in print/one-shot mode against the task prompt.
175
+
176
+ `setting_sources` (e.g. "project,local") drops the global plugin for the
177
+ `without` arm while keeping auth. `sysprompt_file` injects extra rules
178
+ (the `with-rdp` arm). `with` passes neither → the real installed plugin.
179
+ """
124
180
  binary = claude_executable()
125
181
  if binary is None:
126
182
  return {
@@ -129,9 +185,33 @@ def run_live(task: dict, clone_root: Path, *, timeout_s: int) -> dict:
129
185
  "transcript": "",
130
186
  "exit_code": None,
131
187
  "wall_time_seconds": 0.0,
188
+ "tokens": 0,
189
+ "tokens_breakdown": {},
190
+ "errored": True,
132
191
  }
133
192
  prompt = task.get("prompt", "")
134
- cmd = [binary, "--print", "--", prompt]
193
+ # --output-format json yields a `usage` block for token counts. The global
194
+ # plugin is dropped per-arm via --setting-sources (NOT --bare, which kills auth).
195
+ # bypassPermissions on EVERY arm: the clone is a throwaway fixture, and this
196
+ # equalizes file-edit capability across arms (else `without`, which excludes
197
+ # user settings, would lack edit perms and fail tasks for the wrong reason).
198
+ cmd = [binary, "--print", "--output-format", "json", "--permission-mode", "bypassPermissions"]
199
+ if model:
200
+ # Pin ONE model across every arm. The session default here is Opus-4.8-1M,
201
+ # whose ~$1.78 first-turn cache-creation trips any sane budget cap instantly
202
+ # and makes a full corpus run blow the account quota. Holding the model
203
+ # constant is also a validity requirement: the bench measures the package
204
+ # LIFT on a fixed host, not model-vs-model.
205
+ cmd += ["--model", model]
206
+ if max_budget:
207
+ # Caps per-task API spend so one runaway agentic loop can't exhaust the
208
+ # account quota (the failure mode that starved later arms on the first run).
209
+ cmd += ["--max-budget-usd", str(max_budget)]
210
+ if setting_sources:
211
+ cmd += ["--setting-sources", setting_sources]
212
+ if sysprompt_file is not None:
213
+ cmd += ["--append-system-prompt-file", str(sysprompt_file)]
214
+ cmd += ["--", prompt]
135
215
  started = time.monotonic()
136
216
  try:
137
217
  proc = subprocess.run(
@@ -149,14 +229,82 @@ def run_live(task: dict, clone_root: Path, *, timeout_s: int) -> dict:
149
229
  "transcript": (exc.stdout or "") + "\n[TIMEOUT]",
150
230
  "exit_code": -1,
151
231
  "wall_time_seconds": round(time.monotonic() - started, 3),
232
+ "tokens": 0,
233
+ "tokens_breakdown": {},
234
+ "errored": True,
152
235
  }
153
236
  duration = time.monotonic() - started
237
+ # Parse the JSON envelope: `result` is the model text; `usage` holds tokens.
238
+ transcript = proc.stdout
239
+ tokens = 0
240
+ is_error = False
241
+ err_reason = "ok"
242
+ num_turns = 0
243
+ subtype = ""
244
+ breakdown = {
245
+ "input_tokens": 0,
246
+ "output_tokens": 0,
247
+ "cache_read_input_tokens": 0,
248
+ "cache_creation_input_tokens": 0,
249
+ }
250
+ try:
251
+ obj = json.loads(proc.stdout)
252
+ is_error = bool(obj.get("is_error"))
253
+ transcript = obj.get("result") or obj.get("text") or proc.stdout
254
+ usage = obj.get("usage") or {}
255
+ breakdown = {
256
+ k: int(usage.get(k, 0) or 0)
257
+ for k in (
258
+ "input_tokens",
259
+ "output_tokens",
260
+ "cache_read_input_tokens",
261
+ "cache_creation_input_tokens",
262
+ )
263
+ }
264
+ tokens = sum(breakdown.values())
265
+ # The top-level `usage` block is zeroed on a budget-capped / errored run
266
+ # (and unreliable even on some completions). `modelUsage` carries the
267
+ # authoritative per-model counts — sum it as the fallback so token deltas
268
+ # survive even when a task hits its cap mid-flight.
269
+ if tokens == 0:
270
+ mu = obj.get("modelUsage") or {}
271
+ agg = {
272
+ "input_tokens": 0,
273
+ "output_tokens": 0,
274
+ "cache_read_input_tokens": 0,
275
+ "cache_creation_input_tokens": 0,
276
+ }
277
+ for stats in mu.values():
278
+ agg["input_tokens"] += int(stats.get("inputTokens", 0) or 0)
279
+ agg["output_tokens"] += int(stats.get("outputTokens", 0) or 0)
280
+ agg["cache_read_input_tokens"] += int(
281
+ stats.get("cacheReadInputTokens", 0) or 0
282
+ )
283
+ agg["cache_creation_input_tokens"] += int(
284
+ stats.get("cacheCreationInputTokens", 0) or 0
285
+ )
286
+ mu_total = sum(agg.values())
287
+ if mu_total > 0:
288
+ breakdown = agg
289
+ tokens = mu_total
290
+ num_turns = int(obj.get("num_turns", 0) or 0)
291
+ subtype = str(obj.get("subtype") or "")
292
+ # Surface WHY a task errored (budget cap vs. other) without leaking $.
293
+ if is_error:
294
+ err_reason = obj.get("subtype") or "error"
295
+ except (json.JSONDecodeError, AttributeError, ValueError):
296
+ transcript = proc.stdout
154
297
  return {
155
298
  "mode": "live",
156
- "reason": "ok",
157
- "transcript": proc.stdout + "\n" + proc.stderr,
299
+ "reason": err_reason if is_error else ("ok" if proc.returncode == 0 else f"exit {proc.returncode}"),
300
+ "transcript": str(transcript) + "\n" + proc.stderr,
158
301
  "exit_code": proc.returncode,
159
302
  "wall_time_seconds": round(duration, 3),
303
+ "tokens": tokens,
304
+ "tokens_breakdown": breakdown,
305
+ "errored": is_error or proc.returncode != 0,
306
+ "num_turns": num_turns,
307
+ "subtype": subtype,
160
308
  }
161
309
 
162
310
 
@@ -198,22 +346,184 @@ def count_ask_events(transcript: str) -> dict[str, int]:
198
346
  return {"asked": asked, "acted_with_commit": acted, "ratio": ratio}
199
347
 
200
348
 
349
PROGRESS_PATH = REPORTS_DIR / ".progress.json"


def _write_progress(state: dict) -> None:
    """Mirror live state to .progress.json for `task bench:ab:watch` (best-effort).

    The payload is written to a sibling temp file and then renamed over
    ``PROGRESS_PATH``, so a watcher polling the file never observes a
    half-written JSON document. Any ``OSError`` is swallowed on purpose:
    progress mirroring must never abort a benchmark run.

    Args:
        state: JSON-serialisable snapshot of the current progress.
    """
    payload = json.dumps(state, indent=2) + "\n"
    tmp_path = PROGRESS_PATH.with_name(PROGRESS_PATH.name + ".tmp")
    try:
        REPORTS_DIR.mkdir(parents=True, exist_ok=True)
        tmp_path.write_text(payload, encoding="utf-8")
        # Path.replace is an atomic rename on POSIX; readers see old or new, never a mix.
        tmp_path.replace(PROGRESS_PATH)
    except OSError:
        # Best-effort: drop the temp file if the write/rename failed, then move on.
        try:
            tmp_path.unlink()
        except OSError:
            pass
359
+
360
+
361
class Progress:
    """Live per-task progress. stdlib-only, TTY-aware, log-safe.

    ``style`` selects the display: ``auto`` (bar if ``stream`` is a TTY, else
    one plain line per task) | ``bar`` | ``plain`` | ``none``. State is
    mirrored to .progress.json through ``_write_progress`` regardless of style.
    """

    BAR_WIDTH = 24
    # A rendered bar line (leading "\r" included) is padded to at least
    # _MIN_LINE characters and never exceeds _MAX_LINE characters.
    _MIN_LINE = 90
    _MAX_LINE = 160

    def __init__(self, total: int, *, mode: str, style: str = "auto", stream=sys.stderr) -> None:
        """Set up counters and resolve the display kind.

        Args:
            total: Number of tasks across all variants; clamped to >= 1 so the
                bar maths never divides by zero.
            mode: Run mode; the heartbeat and start markers are only used
                when it equals ``"live"``.
            style: ``bar`` / ``plain`` / ``none``; any other value means auto.
            stream: Writable text stream the display goes to.
        """
        self.total = max(total, 1)
        self.mode = mode
        self.stream = stream
        self.done = 0
        self.started = time.monotonic()
        if style in ("bar", "plain", "none"):
            self.kind = style
        else:  # auto
            self.kind = "bar" if getattr(stream, "isatty", lambda: False)() else "plain"
        self._cur = ""
        self._task_started = 0.0
        self._last_width = 0  # length of the last bar line, used to erase leftovers
        self._hb_stop: "threading.Event | None" = None
        self._hb_thread: "threading.Thread | None" = None

    def _elapsed(self, since: float) -> str:
        """Format the time elapsed since ``since`` (a monotonic stamp) as ``42s`` / ``3m07s``."""
        s = int(time.monotonic() - since)
        return f"{s // 60}m{s % 60:02d}s" if s >= 60 else f"{s}s"

    def _bar(self) -> str:
        """Return the fixed-width bar; the fill is clamped so it can never overflow."""
        filled = int(self.BAR_WIDTH * self.done / self.total)
        filled = min(max(filled, 0), self.BAR_WIDTH)
        return "█" * filled + "░" * (self.BAR_WIDTH - filled)

    def _render_bar(self, suffix: str = "") -> None:
        """Redraw the bar in place on the current terminal line."""
        line = (
            f"\r[{self._bar()}] {self.done}/{self.total} · {self._cur}"
            f" · {self._elapsed(self.started)}{suffix}"
        )[: self._MAX_LINE]
        # Pad to the previous render's width so a shorter line fully
        # overwrites a longer one instead of leaving stale characters behind.
        pad_to = min(self._MAX_LINE, max(self._MIN_LINE, self._last_width))
        self._last_width = len(line)
        self.stream.write(line.ljust(pad_to))
        self.stream.flush()

    def _start_heartbeat(self) -> None:
        """Start a daemon thread that refreshes the bar once per second (bar + live only)."""
        if self.kind != "bar" or self.mode != "live":
            return
        # Never orphan a still-running heartbeat from a previous task.
        self._stop_heartbeat()
        stop = threading.Event()

        def _tick() -> None:
            # `stop` is captured locally: the thread keeps a valid event even
            # after _stop_heartbeat() has reset self._hb_stop to None.
            while not stop.wait(1.0):
                self._render_bar(suffix=f" · {self._elapsed(self._task_started)}…")

        self._hb_stop = stop
        self._hb_thread = threading.Thread(target=_tick, daemon=True)
        self._hb_thread.start()

    def _stop_heartbeat(self) -> None:
        """Signal the heartbeat thread to exit and wait briefly for it."""
        if self._hb_stop is not None:
            self._hb_stop.set()
        if self._hb_thread is not None:
            self._hb_thread.join(timeout=2.0)
        self._hb_stop = self._hb_thread = None

    def start_task(self, variant: str, idx: int, count: int, task_id: str) -> None:
        """Record the start of a task, mirror it to disk, and update the display."""
        # Make sure no stale heartbeat writes to the stream while we render.
        self._stop_heartbeat()
        self._cur = f"{variant} {idx}/{count} · {task_id}"
        self._task_started = time.monotonic()
        _write_progress({
            "mode": self.mode, "variant": variant, "task_idx": idx, "task_count": count,
            "total_done": self.done, "total": self.total, "current_id": task_id,
            "started_at": utc_stamp(), "last_result": None,
        })
        if self.kind == "none":
            return
        if self.kind == "bar":
            self._render_bar(suffix=" · running…" if self.mode == "live" else "")
            self._start_heartbeat()
        elif self.mode == "live":  # plain: a start marker so a long task isn't mistaken for a hang
            self.stream.write(f"[{self.done + 1}/{self.total}] ▶ {self._cur}\n")
            self.stream.flush()

    def end_task(self, *, passed: bool, wall: float, variant: str, task_id: str) -> None:
        """Record the end of a task, mirror it to disk, and update the display."""
        self._stop_heartbeat()
        self.done += 1
        mark = "✓" if passed else "✗"
        _write_progress({
            "mode": self.mode, "variant": variant, "total_done": self.done,
            "total": self.total, "current_id": task_id, "updated_at": utc_stamp(),
            "last_result": "pass" if passed else "fail",
        })
        if self.kind == "none":
            return
        if self.kind == "bar":
            self._render_bar(suffix=f" · {mark}")
        else:
            self.stream.write(f"[{self.done}/{self.total}] {mark} {variant} · {task_id} · {wall:.1f}s\n")
            self.stream.flush()

    def variant_done(self, line: str) -> None:
        """Print a per-variant summary line without corrupting an active bar."""
        if self.kind == "bar":
            self.stream.write("\n")
            self._last_width = 0  # cursor is on a fresh line; nothing to erase
        self.stream.write(line if line.endswith("\n") else line + "\n")
        self.stream.flush()

    def finish(self) -> None:
        """Stop any heartbeat and print the closing totals line (unless style is none)."""
        self._stop_heartbeat()
        if self.kind == "bar":
            self.stream.write("\n")
            self._last_width = 0
        if self.kind != "none":
            self.stream.write(
                f"bench progress: {self.done}/{self.total} tasks · total {self._elapsed(self.started)}\n"
            )
            self.stream.flush()
467
+
468
+
201
469
def per_category_aggregate(per_task: list[dict]) -> dict[str, dict]:
    """Aggregate per-task results by their ``category``.

    Entries whose ``category`` is missing *or* ``None`` are grouped under
    ``"unknown"``. Entries flagged ``errored`` count towards ``total`` and
    ``errored`` but are excluded from the pass rate and from the means.

    Args:
        per_task: Per-task result dicts (keys read: ``category``, ``errored``,
            ``score.passed``, ``wall_time_seconds``, ``tokens``).

    Returns:
        Mapping of category -> ``passed`` / ``total`` / ``completed`` /
        ``errored`` / ``completion_rate`` / ``mean_wall_time`` / ``mean_tokens``.
    """
    by_cat: dict[str, list[dict]] = {}
    for entry in per_task:
        # `or` (not a .get default): the key may be present with a None value.
        by_cat.setdefault(entry.get("category") or "unknown", []).append(entry)
    out: dict[str, dict] = {}
    for cat, entries in by_cat.items():
        done = [e for e in entries if not e.get("errored")]
        passed = sum(1 for e in done if (e.get("score") or {}).get("passed"))
        total = len(entries)
        completed = len(done)
        wall_sum = sum((e.get("wall_time_seconds") or 0) for e in done)
        token_sum = sum((e.get("tokens") or 0) for e in done)
        out[cat] = {
            "passed": passed,
            "total": total,
            "completed": completed,
            "errored": total - completed,
            "completion_rate": round(passed / completed, 4) if completed else 0,
            "mean_wall_time": round(wall_sum / completed, 3) if completed else 0,
            "mean_tokens": round(token_sum / completed) if completed else 0,
        }
    return out
495
+
496
+
497
def per_cell_aggregate(per_task: list[dict]) -> dict[str, dict]:
    """Aggregate by the 2×2 (duration × cognitive) cell — the value-benchmark axis.

    Compared across conditions this answers "are short tasks more expensive?"
    (cell `short/mechanical`) and "do long tasks get cheaper / better?"
    (cell `long/reasoning-heavy`). Cell key is `"<duration>/<cognitive>"`.

    A tag that is missing *or* ``None`` is reported as ``untagged``, so tasks
    without tags land in ``untagged/untagged`` rather than ``None/None``.
    Entries flagged ``errored`` count towards ``total`` and ``errored`` but are
    excluded from the pass rate and from the means.

    Args:
        per_task: Per-task result dicts (keys read: ``duration``, ``cognitive``,
            ``errored``, ``score.passed``, ``wall_time_seconds``, ``tokens``).

    Returns:
        Mapping of cell key -> ``passed`` / ``total`` / ``completed`` /
        ``errored`` / ``completion_rate`` / ``mean_wall_time`` / ``mean_tokens``.
    """
    by_cell: dict[str, list[dict]] = {}
    for entry in per_task:
        # `or` (not a .get default): the keys may be present with a None value.
        duration = entry.get("duration") or "untagged"
        cognitive = entry.get("cognitive") or "untagged"
        by_cell.setdefault(f"{duration}/{cognitive}", []).append(entry)
    out: dict[str, dict] = {}
    for cell, entries in by_cell.items():
        done = [e for e in entries if not e.get("errored")]
        passed = sum(1 for e in done if (e.get("score") or {}).get("passed"))
        total = len(entries)
        completed = len(done)
        wall_sum = sum((e.get("wall_time_seconds") or 0) for e in done)
        token_sum = sum((e.get("tokens") or 0) for e in done)
        out[cell] = {
            "passed": passed,
            "total": total,
            "completed": completed,
            "errored": total - completed,
            "completion_rate": round(passed / completed, 4) if completed else 0,
            "mean_wall_time": round(wall_sum / completed, 3) if completed else 0,
            "mean_tokens": round(token_sum / completed) if completed else 0,
        }
    return out
@@ -233,22 +543,35 @@ def write_report(
233
543
  target_shape_hash=bench_ab_cache.target_shape_hash(),
234
544
  )
235
545
  total = len(per_task)
236
- passed = sum(1 for e in per_task if e.get("score", {}).get("passed"))
546
+ done = [e for e in per_task if not e.get("errored")]
547
+ completed = len(done)
548
+ errored = total - completed
549
+ passed = sum(1 for e in done if e.get("score", {}).get("passed"))
237
550
  results = {
238
551
  "mode": mode,
239
- "completion_rate": round(passed / total, 4) if total else 0,
552
+ # Hit-rate is over COMPLETED tasks only — errored (rate-limit / budget /
553
+ # timeout / CLI-fail) tasks are excluded so a transient quota trip does
554
+ # not read as a content failure of the package.
555
+ "completion_rate": round(passed / completed, 4) if completed else 0,
240
556
  "passed": passed,
557
+ "completed": completed,
558
+ "errored": errored,
241
559
  "total": total,
242
560
  "per_category": per_category_aggregate(per_task),
561
+ "per_cell": per_cell_aggregate(per_task),
243
562
  "mean_wall_time": round(
244
- sum(e.get("wall_time_seconds", 0) for e in per_task) / total, 3
563
+ sum(e.get("wall_time_seconds", 0) for e in done) / completed, 3
245
564
  )
246
- if total
565
+ if completed
566
+ else 0,
567
+ "total_tokens": sum(e.get("tokens", 0) for e in done),
568
+ "mean_tokens": round(sum(e.get("tokens", 0) for e in done) / completed)
569
+ if completed
247
570
  else 0,
248
571
  "ask_vs_act_ratio": round(
249
- sum(e.get("ask_events", {}).get("ratio", 0) for e in per_task) / total, 3
572
+ sum(e.get("ask_events", {}).get("ratio", 0) for e in done) / completed, 3
250
573
  )
251
- if total
574
+ if completed
252
575
  else 0,
253
576
  "per_task": per_task,
254
577
  }
@@ -269,7 +592,7 @@ def write_report(
269
592
  f"# Track B · {variant} · {mode}\n\n"
270
593
  f"- Stamp: `{stamp}`\n"
271
594
  f"- Completion rate: **{results['completion_rate'] * 100:.1f}%**"
272
- f" ({passed}/{total})\n"
595
+ f" ({passed}/{completed} completed; {errored} errored of {total})\n"
273
596
  f"- Mean wall-time: {results['mean_wall_time']}s\n"
274
597
  f"- Ask vs. act ratio: {results['ask_vs_act_ratio']}\n"
275
598
  f"\n## Per-category\n\n"
@@ -283,14 +606,43 @@ def write_report(
283
606
  return path
284
607
 
285
608
 
286
- def run_variant(variant: str, tasks: list[dict], *, mode: str, timeout_s: int) -> dict:
609
+ def run_variant(
610
+ variant: str,
611
+ tasks: list[dict],
612
+ *,
613
+ mode: str,
614
+ timeout_s: int,
615
+ max_budget: "float | None" = None,
616
+ model: "str | None" = None,
617
+ progress: "Progress | None" = None,
618
+ ) -> dict:
287
619
  started = time.monotonic()
620
+ # Build the injected rule corpus once per variant (live only).
621
+ sp_file: "Path | None" = None
622
+ if mode == "live":
623
+ sp_text = system_prompt_for(variant)
624
+ if sp_text:
625
+ REPORTS_DIR.mkdir(parents=True, exist_ok=True)
626
+ sp_file = REPORTS_DIR / f".sysprompt-{variant}.txt"
627
+ sp_file.write_text(sp_text, encoding="utf-8")
288
628
  per_task: list[dict] = []
289
- for task in tasks:
290
- clone_root = reset_clone(variant)
629
+ for i, task in enumerate(tasks):
630
+ if progress is not None:
631
+ progress.start_task(variant, i + 1, len(tasks), str(task.get("id")))
632
+ # Fixture-only working dir, identical for every arm — the package is NOT
633
+ # in the clone files; activation is the injected system prompt (sp_file).
634
+ clone_root = reset_clone("without")
291
635
  pre = snapshot_clone(clone_root)
292
636
  if mode == "live":
293
- run_result = run_live(task, clone_root, timeout_s=timeout_s)
637
+ run_result = run_live(
638
+ task,
639
+ clone_root,
640
+ timeout_s=timeout_s,
641
+ sysprompt_file=sp_file,
642
+ setting_sources=setting_sources_for(variant),
643
+ max_budget=max_budget,
644
+ model=model,
645
+ )
294
646
  else:
295
647
  run_result = run_dry(task, clone_root, variant)
296
648
  post = snapshot_clone(clone_root)
@@ -305,21 +657,42 @@ def run_variant(variant: str, tasks: list[dict], *, mode: str, timeout_s: int) -
305
657
  {
306
658
  "id": task.get("id"),
307
659
  "category": task.get("category"),
660
+ "duration": task.get("duration"),
661
+ "cognitive": task.get("cognitive"),
308
662
  "score": score,
663
+ # `errored` = the run did not complete on merit (rate-limit,
664
+ # budget-cap, timeout, CLI failure). Distinct from a content
665
+ # fail (`score.passed == False`). Errored tasks are excluded
666
+ # from the hit-rate so a transient quota trip can't masquerade
667
+ # as the package "not working".
668
+ "errored": bool(run_result.get("errored", False)),
309
669
  "wall_time_seconds": run_result.get("wall_time_seconds", 0.0),
670
+ "tokens": run_result.get("tokens", 0),
671
+ "tokens_breakdown": run_result.get("tokens_breakdown", {}),
310
672
  "exit_code": run_result.get("exit_code"),
311
673
  "mode": run_result.get("mode", mode),
312
674
  "reason": run_result.get("reason", ""),
313
675
  "ask_events": count_ask_events(run_result.get("transcript", "")),
314
676
  }
315
677
  )
678
+ if progress is not None:
679
+ progress.end_task(
680
+ passed=bool(score.get("passed")),
681
+ wall=float(run_result.get("wall_time_seconds", 0.0) or 0.0),
682
+ variant=variant,
683
+ task_id=str(task.get("id")),
684
+ )
316
685
  duration = time.monotonic() - started
317
686
  path = write_report(variant, mode=mode, per_task=per_task, duration=duration)
318
- sys.stdout.write(
687
+ summary = (
319
688
  f"bench_ab_task_runner: {variant} ({mode}) → "
320
689
  f"{sum(1 for e in per_task if e['score']['passed'])}/{len(per_task)} "
321
- f"passed — {path.relative_to(REPO_ROOT)}\n"
690
+ f"passed — {path.relative_to(REPO_ROOT)}"
322
691
  )
692
+ if progress is not None:
693
+ progress.variant_done(summary)
694
+ else:
695
+ sys.stdout.write(summary + "\n")
323
696
  return {"path": path, "per_task": per_task, "duration": duration}
324
697
 
325
698
 
@@ -327,9 +700,10 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
327
700
  parser = argparse.ArgumentParser(description="Run Track B tasks per variant.")
328
701
  parser.add_argument(
329
702
  "--variant",
330
- choices=("with", "without", "both"),
703
+ choices=("with", "without", "with-rdp", "both", "all"),
331
704
  default="both",
332
- help="Which variant to run (default: both).",
705
+ help="with | without | with-rdp | both (=with+without, back-compat "
706
+ "default) | all (=the 3-condition value-benchmark set).",
333
707
  )
334
708
  parser.add_argument(
335
709
  "--mode",
@@ -346,6 +720,48 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
346
720
  default=120,
347
721
  help="Live mode: per-task timeout in seconds (default 120).",
348
722
  )
723
+ parser.add_argument(
724
+ "--progress",
725
+ choices=("auto", "bar", "plain", "none"),
726
+ default="auto",
727
+ help="Live display: auto (TTY→bar, else plain line-per-task) | bar | plain | none.",
728
+ )
729
+ parser.add_argument(
730
+ "--limit",
731
+ type=int,
732
+ default=0,
733
+ help="Run only the first N tasks per variant (0 = all). For cheap smoke tests.",
734
+ )
735
+ parser.add_argument(
736
+ "--tasks",
737
+ default="",
738
+ help=(
739
+ "Comma-separated task IDs to run (e.g. trackb-bugfix-01,trackb-refactor-01). "
740
+ "Overrides --limit. Use to span the 2×2 cells in a bounded run instead of "
741
+ "taking the first-N in file order."
742
+ ),
743
+ )
744
+ parser.add_argument(
745
+ "--model",
746
+ default="claude-sonnet-4-6",
747
+ help=(
748
+ "Pin ONE model across all arms (live mode). Default claude-sonnet-4-6 — "
749
+ "capable enough to complete the coding tasks, ~2.3x cheaper per turn than "
750
+ "the Opus-4.8-1M session default whose cache-creation blows the quota. "
751
+ "Empty string = inherit the session default (expensive)."
752
+ ),
753
+ )
754
+ parser.add_argument(
755
+ "--budget",
756
+ type=float,
757
+ default=2.0,
758
+ help=(
759
+ "Live mode: per-task API spend cap in USD (passed to "
760
+ "`claude --max-budget-usd`). Stops a runaway agentic loop from "
761
+ "exhausting the account quota and starving later arms. 0 = uncapped. "
762
+ "Default 2.0."
763
+ ),
764
+ )
349
765
  return parser.parse_args(argv)
350
766
 
351
767
 
@@ -359,9 +775,38 @@ def main(argv: list[str] | None = None) -> int:
359
775
  if not tasks:
360
776
  sys.stderr.write("bench_ab_task_runner: corpus has no tasks\n")
361
777
  return 1
362
- variants = ("with", "without") if args.variant == "both" else (args.variant,)
778
+ if args.tasks.strip():
779
+ wanted = [s.strip() for s in args.tasks.split(",") if s.strip()]
780
+ by_id = {t.get("id"): t for t in tasks}
781
+ missing = [w for w in wanted if w not in by_id]
782
+ if missing:
783
+ sys.stderr.write(
784
+ f"bench_ab_task_runner: unknown task id(s): {', '.join(missing)}\n"
785
+ )
786
+ return 1
787
+ tasks = [by_id[w] for w in wanted]
788
+ elif args.limit and args.limit > 0:
789
+ tasks = tasks[: args.limit]
790
+ if args.variant == "both":
791
+ variants = ("with", "without")
792
+ elif args.variant == "all":
793
+ variants = ("with", "without", "with-rdp")
794
+ else:
795
+ variants = (args.variant,)
796
+ max_budget = args.budget if args.budget and args.budget > 0 else None
797
+ model = args.model or None
798
+ progress = Progress(len(variants) * len(tasks), mode=args.mode, style=args.progress)
363
799
  for variant in variants:
364
- run_variant(variant, tasks, mode=args.mode, timeout_s=args.timeout)
800
+ run_variant(
801
+ variant,
802
+ tasks,
803
+ mode=args.mode,
804
+ timeout_s=args.timeout,
805
+ max_budget=max_budget,
806
+ model=model,
807
+ progress=progress,
808
+ )
809
+ progress.finish()
365
810
  return 0
366
811
 
367
812