@event4u/agent-config 3.3.0 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (561) hide show
  1. package/.agent-src/README.md +2 -2
  2. package/.agent-src/commands/agent-handoff.md +31 -2
  3. package/.agent-src/commands/agent-status.md +5 -5
  4. package/.agent-src/commands/agents/audit.md +8 -8
  5. package/.agent-src/commands/agents/init.md +25 -1
  6. package/.agent-src/commands/agents/optimize.md +3 -3
  7. package/.agent-src/commands/agents/user.md +1 -1
  8. package/.agent-src/commands/agents.md +1 -1
  9. package/.agent-src/commands/analyze-reference-repo.md +1 -1
  10. package/.agent-src/commands/check-current-md.md +8 -8
  11. package/.agent-src/commands/{compress.md → condense.md} +55 -55
  12. package/.agent-src/commands/context/create.md +7 -4
  13. package/.agent-src/commands/context/refactor.md +3 -1
  14. package/.agent-src/commands/feature/dev.md +1 -1
  15. package/.agent-src/commands/feature/explore.md +1 -1
  16. package/.agent-src/commands/feature/plan.md +10 -8
  17. package/.agent-src/commands/feature/refactor.md +3 -1
  18. package/.agent-src/commands/feature/roadmap.md +7 -4
  19. package/.agent-src/commands/fix/portability.md +3 -3
  20. package/.agent-src/commands/fix/refs.md +4 -4
  21. package/.agent-src/commands/ghostwriter.md +2 -2
  22. package/.agent-src/commands/memory/learn-low-impact.md +3 -3
  23. package/.agent-src/commands/module/explore.md +34 -8
  24. package/.agent-src/commands/optimize/agents-dir.md +9 -7
  25. package/.agent-src/commands/optimize/augmentignore.md +2 -2
  26. package/.agent-src/commands/optimize/skills.md +9 -9
  27. package/.agent-src/commands/post-as.md +1 -1
  28. package/.agent-src/commands/project-analyze.md +2 -2
  29. package/.agent-src/commands/project-health.md +3 -2
  30. package/.agent-src/commands/research/deep.md +1 -1
  31. package/.agent-src/commands/research/report.md +1 -1
  32. package/.agent-src/commands/research.md +1 -1
  33. package/.agent-src/commands/roadmap/ai-council.md +1 -1
  34. package/.agent-src/commands/roadmap/create.md +9 -4
  35. package/.agent-src/commands/rule-compliance-audit.md +1 -1
  36. package/.agent-src/commands/upstream-contribute.md +14 -14
  37. package/.agent-src/commands/video/from-script.md +1 -1
  38. package/.agent-src/commands/video/scene.md +1 -1
  39. package/.agent-src/commands/video/stitch.md +1 -1
  40. package/.agent-src/commands/video/storyboard.md +1 -1
  41. package/.agent-src/commands/video.md +1 -1
  42. package/.agent-src/contexts/augment-infrastructure.md +1 -1
  43. package/.agent-src/contexts/authority/commit-mechanics.md +15 -0
  44. package/.agent-src/contexts/authority/kernel-rule-edits.md +3 -3
  45. package/.agent-src/contexts/authority/scope-mechanics.md +1 -1
  46. package/.agent-src/contexts/communication/rules-auto/augment-source-of-truth-mechanics.md +28 -28
  47. package/.agent-src/contexts/communication/rules-auto/skill-quality-mechanics.md +4 -4
  48. package/.agent-src/contexts/communication/rules-auto/think-before-action-mechanics.md +2 -2
  49. package/.agent-src/contexts/contracts/artifact-engagement-flow.md +6 -6
  50. package/.agent-src/contexts/contracts/command-suggestion-flow.md +3 -3
  51. package/.agent-src/contexts/contracts/emergency-triage-block.md +4 -4
  52. package/.agent-src/contexts/contracts/frugality-charter.md +3 -3
  53. package/.agent-src/contexts/documentation-hierarchy.md +14 -7
  54. package/.agent-src/contexts/execution/autonomy-examples.md +1 -1
  55. package/.agent-src/contexts/execution/cheap-question-mechanics.md +39 -2
  56. package/.agent-src/contexts/execution/roadmap-process-loop.md +28 -5
  57. package/.agent-src/contexts/override-system.md +5 -5
  58. package/.agent-src/ghostwriter/fictional-fixture-v1.md +1 -1
  59. package/.agent-src/personas/advisors/first-principles.md +1 -1
  60. package/.agent-src/personas/hollywood-director.md +1 -1
  61. package/.agent-src/rules/architecture.md +5 -1
  62. package/.agent-src/rules/augment-edit-discipline.md +5 -5
  63. package/.agent-src/rules/augment-source-of-truth.md +15 -15
  64. package/.agent-src/rules/commit-conventions.md +1 -1
  65. package/.agent-src/rules/commit-policy.md +10 -0
  66. package/.agent-src/rules/domain-adoption-policy.md +3 -3
  67. package/.agent-src/rules/fast-path-marker-visibility.md +3 -3
  68. package/.agent-src/rules/finance-safety-floor.md +1 -1
  69. package/.agent-src/rules/framework-neutrality-in-generic-skills.md +8 -8
  70. package/.agent-src/rules/git-history-discipline.md +1 -1
  71. package/.agent-src/rules/improve-before-implement.md +2 -2
  72. package/.agent-src/rules/language-and-tone.md +2 -2
  73. package/.agent-src/rules/media-governance-routing.md +5 -5
  74. package/.agent-src/rules/no-attribution-footers.md +1 -0
  75. package/.agent-src/rules/no-cheap-questions.md +3 -0
  76. package/.agent-src/rules/no-decorative-emojis-in-git-surfaces.md +111 -0
  77. package/.agent-src/rules/no-pr-progress-comments.md +118 -0
  78. package/.agent-src/rules/no-roadmap-references.md +3 -3
  79. package/.agent-src/rules/non-destructive-by-default.md +1 -1
  80. package/.agent-src/rules/persona-governance.md +3 -3
  81. package/.agent-src/rules/preservation-guard.md +15 -15
  82. package/.agent-src/rules/roadmap-ci-steps-policy.md +7 -3
  83. package/.agent-src/rules/rule-type-governance.md +1 -1
  84. package/.agent-src/rules/skill-quality.md +1 -1
  85. package/.agent-src/rules/{caveman-speak.md → telegraph-speak.md} +15 -15
  86. package/.agent-src/rules/token-optimizer-maintenance.md +6 -6
  87. package/.agent-src/skills/agent-docs-writing/SKILL.md +17 -11
  88. package/.agent-src/skills/agents-md-thin-root/SKILL.md +9 -9
  89. package/.agent-src/skills/check-refs/SKILL.md +2 -2
  90. package/.agent-src/skills/code-refactoring/SKILL.md +2 -2
  91. package/.agent-src/skills/command-writing/SKILL.md +19 -19
  92. package/.agent-src/skills/comp-banding/SKILL.md +1 -1
  93. package/.agent-src/skills/condense-memory/SKILL.md +131 -0
  94. package/.agent-src/skills/context-authoring/SKILL.md +2 -2
  95. package/.agent-src/skills/context-document/SKILL.md +5 -3
  96. package/.agent-src/skills/copilot-agents-optimization/SKILL.md +3 -3
  97. package/.agent-src/skills/description-assist/SKILL.md +2 -2
  98. package/.agent-src/skills/git-workflow/SKILL.md +1 -1
  99. package/.agent-src/skills/guideline-writing/SKILL.md +5 -5
  100. package/.agent-src/skills/learning-to-rule-or-skill/SKILL.md +4 -4
  101. package/.agent-src/skills/lint-skills/SKILL.md +3 -3
  102. package/.agent-src/skills/md-language-check/SKILL.md +2 -2
  103. package/.agent-src/skills/module-detect-on-the-fly/SKILL.md +138 -0
  104. package/.agent-src/skills/module-management/SKILL.md +166 -94
  105. package/.agent-src/skills/override-management/SKILL.md +1 -1
  106. package/.agent-src/skills/persona-writing/SKILL.md +5 -5
  107. package/.agent-src/skills/positioning-strategy/SKILL.md +1 -1
  108. package/.agent-src/skills/project-docs/SKILL.md +6 -4
  109. package/.agent-src/skills/readme-reviewer/SKILL.md +2 -2
  110. package/.agent-src/skills/roadmap-management/SKILL.md +13 -1
  111. package/.agent-src/skills/roadmap-writing/SKILL.md +4 -2
  112. package/.agent-src/skills/rule-refactor/SKILL.md +5 -5
  113. package/.agent-src/skills/rule-writing/SKILL.md +18 -18
  114. package/.agent-src/skills/script-writing/SKILL.md +1 -1
  115. package/.agent-src/skills/skill-improvement-pipeline/SKILL.md +6 -6
  116. package/.agent-src/skills/skill-management/SKILL.md +21 -21
  117. package/.agent-src/skills/skill-reviewer/SKILL.md +2 -2
  118. package/.agent-src/skills/skill-writing/SKILL.md +8 -8
  119. package/.agent-src/skills/skill-writing/evals/triggers.json +1 -1
  120. package/.agent-src/skills/token-optimizer/SKILL.md +4 -4
  121. package/.agent-src/skills/unit-economics-modeling/SKILL.md +1 -1
  122. package/.agent-src/skills/upstream-contribute/SKILL.md +17 -17
  123. package/.agent-src/templates/AGENTS.md +1 -0
  124. package/.agent-src/templates/agent-settings.md +24 -13
  125. package/.agent-src/templates/agents/agent-project-settings.example.yml +61 -2
  126. package/.agent-src/templates/command.md +5 -5
  127. package/.agent-src/templates/contexts.md +1 -1
  128. package/.agent-src/templates/copilot-instructions.md +8 -8
  129. package/.agent-src/templates/features.md +1 -1
  130. package/.agent-src/templates/hooks/pre-commit-frontmatter +2 -2
  131. package/.agent-src/templates/hooks/pre-commit-roadmap-progress +3 -3
  132. package/.agent-src/templates/persona.md +2 -2
  133. package/.agent-src/templates/roadmaps.md +1 -1
  134. package/.agent-src/templates/rule.md +13 -13
  135. package/.agent-src/templates/scripts/memory_lookup.py +1 -1
  136. package/.agent-src/templates/scripts/memory_status.py +2 -2
  137. package/.agent-src/templates/scripts/work_engine/_lib/agent_settings.py +195 -1
  138. package/.agent-src/templates/scripts/work_engine/orchestration.py +1 -1
  139. package/.agent-src/templates/skill-archive-note.md +5 -5
  140. package/.agent-src/templates/skill.md +1 -1
  141. package/.claude-plugin/marketplace.json +4 -4
  142. package/AGENTS.md +16 -17
  143. package/CHANGELOG.md +216 -3
  144. package/CONTRIBUTING.md +31 -12
  145. package/README.md +21 -12
  146. package/config/agent-settings.template.yml +22 -2
  147. package/config/discovery/unassigned-artefacts.yml +24 -24
  148. package/config/profiles/full.ini +1 -1
  149. package/dist/cli/agent-config.js +52 -3
  150. package/dist/cli/agent-config.js.map +1 -1
  151. package/dist/cli/commands/uiServe.js +9 -0
  152. package/dist/cli/commands/uiServe.js.map +1 -1
  153. package/dist/cli/registry.js +2 -1
  154. package/dist/cli/registry.js.map +1 -1
  155. package/dist/discovery/deprecation-report.md +1 -1
  156. package/dist/discovery/discovery-manifest.json +649 -606
  157. package/dist/discovery/discovery-manifest.json.sha256 +1 -1
  158. package/dist/discovery/discovery-manifest.summary.md +4 -4
  159. package/dist/discovery/orphan-report.md +1 -1
  160. package/dist/discovery/packs.json +439 -437
  161. package/dist/discovery/trust-report.md +5 -5
  162. package/dist/discovery/workspaces.json +450 -448
  163. package/dist/install/atomic.js +92 -0
  164. package/dist/install/atomic.js.map +1 -0
  165. package/dist/install/conflict.js +196 -0
  166. package/dist/install/conflict.js.map +1 -0
  167. package/dist/install/detect.js +218 -0
  168. package/dist/install/detect.js.map +1 -0
  169. package/dist/install/paths.js +82 -0
  170. package/dist/install/paths.js.map +1 -0
  171. package/dist/install/plan.js +157 -0
  172. package/dist/install/plan.js.map +1 -0
  173. package/dist/install/txlog.js +140 -0
  174. package/dist/install/txlog.js.map +1 -0
  175. package/dist/install/types.js +19 -0
  176. package/dist/install/types.js.map +1 -0
  177. package/dist/install/wizard-plan.js +184 -0
  178. package/dist/install/wizard-plan.js.map +1 -0
  179. package/dist/mcp/registry-manifest.json +4 -4
  180. package/dist/router.json +67 -19
  181. package/dist/server/app.js +6 -0
  182. package/dist/server/app.js.map +1 -1
  183. package/dist/server/routes/install.js +358 -0
  184. package/dist/server/routes/install.js.map +1 -0
  185. package/dist/server/routes/wizard.js +468 -32
  186. package/dist/server/routes/wizard.js.map +1 -1
  187. package/dist/server/routes/workspace.js +396 -0
  188. package/dist/server/routes/workspace.js.map +1 -0
  189. package/dist/server/schemas/settings.js +5 -3
  190. package/dist/server/schemas/settings.js.map +1 -1
  191. package/dist/ui/assets/index-BDAhhpDV.js +40 -0
  192. package/dist/ui/assets/index-BDAhhpDV.js.map +1 -0
  193. package/dist/ui/assets/index-BXZILUxe.css +1 -0
  194. package/dist/ui/index.html +2 -2
  195. package/docs/MIGRATION.md +1 -1
  196. package/docs/adrs/cost/0001-hard-stop-hook.md +1 -1
  197. package/docs/adrs/router/0001-three-tier-routing.md +4 -4
  198. package/docs/adrs/schema/0001-json-schema-frontmatter.md +1 -1
  199. package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +4 -4
  200. package/docs/adrs/{caveman → telegraph}/0001-default-off-until-bench.md +9 -9
  201. package/docs/adrs/telegraph/README.md +9 -0
  202. package/docs/architecture/augment-projection.md +4 -4
  203. package/docs/architecture/claude-bundle.md +1 -1
  204. package/docs/architecture/current-onboard-baseline.md +3 -3
  205. package/docs/architecture/multi-tool-projection.md +10 -10
  206. package/docs/architecture/source-projection.md +27 -27
  207. package/docs/architecture.md +19 -15
  208. package/docs/archive/CHANGELOG-pre-2.11.0.md +2 -2
  209. package/docs/archive/CHANGELOG-pre-2.15.0.md +3 -3
  210. package/docs/archive/CHANGELOG-pre-2.16.0.md +1 -1
  211. package/docs/archive/CHANGELOG-pre-2.2.0.md +70 -70
  212. package/docs/archive/CHANGELOG-pre-2.20.0.md +2 -2
  213. package/docs/archive/CHANGELOG-pre-2.25.0.md +15 -15
  214. package/docs/archive/CHANGELOG-pre-3.0.0.md +4 -4
  215. package/docs/archive/CHANGELOG-pre-3.1.0.md +2 -2
  216. package/docs/archive/CHANGELOG-pre-3.2.0.md +3 -3
  217. package/docs/benchmark.md +65 -0
  218. package/docs/benchmarks.md +16 -16
  219. package/docs/catalog.md +17 -15
  220. package/docs/contracts/CHANGELOG-conventions.md +1 -1
  221. package/docs/contracts/STABILITY.md +2 -2
  222. package/docs/contracts/adoption-signal-floor.md +110 -0
  223. package/docs/contracts/adr-chat-history-split.md +4 -4
  224. package/docs/contracts/adr-command-suggestion.md +4 -4
  225. package/docs/contracts/adr-gtm-context-spine.md +1 -1
  226. package/docs/contracts/adr-implement-ticket-runtime.md +4 -4
  227. package/docs/contracts/adr-install-user-type-axis.md +1 -1
  228. package/docs/contracts/adr-layout.md +2 -2
  229. package/docs/contracts/adr-product-ui-track.md +10 -10
  230. package/docs/contracts/adr-user-types-axis.md +3 -3
  231. package/docs/contracts/adr-wing4-context-spine.md +1 -1
  232. package/docs/contracts/agent-memory-contract.md +3 -3
  233. package/docs/contracts/agents-md-tech-stack.md +2 -2
  234. package/docs/contracts/ai-council-config.md +2 -2
  235. package/docs/contracts/at-rest-encryption.md +4 -0
  236. package/docs/contracts/audit-log-v1.md +1 -1
  237. package/docs/contracts/benchmark-ab-contract.md +101 -0
  238. package/docs/contracts/benchmark-corpus-spec.md +1 -1
  239. package/docs/contracts/branch-protection-policy.md +98 -0
  240. package/docs/contracts/ci-cost-budget.md +106 -0
  241. package/docs/contracts/ci-green-floor.md +141 -0
  242. package/docs/contracts/command-clusters.md +6 -6
  243. package/docs/contracts/command-surface-tiers.md +2 -2
  244. package/docs/contracts/command-taxonomy.md +2 -2
  245. package/docs/contracts/{compression-default-kill-criterion.md → condensation-default-kill-criterion.md} +29 -29
  246. package/docs/contracts/config-presets.md +1 -1
  247. package/docs/contracts/context-paths.md +3 -3
  248. package/docs/contracts/context-spine.md +1 -1
  249. package/docs/contracts/cost-summary-schema.md +12 -12
  250. package/docs/contracts/cross-wing-handoff.md +4 -4
  251. package/docs/contracts/daily-workspace.md +4 -0
  252. package/docs/contracts/decision-trace-v1.md +2 -2
  253. package/docs/contracts/discovery-manifest.md +4 -4
  254. package/docs/contracts/explain-modes.md +4 -0
  255. package/docs/contracts/file-ownership-matrix.json +3493 -3318
  256. package/docs/contracts/file-ownership-matrix.md +3 -3
  257. package/docs/contracts/frontmatter-contract.md +4 -4
  258. package/docs/contracts/ghostwriter-schema.md +3 -3
  259. package/docs/contracts/gui-wizard.md +110 -97
  260. package/docs/contracts/harness-expectations.md +123 -0
  261. package/docs/contracts/host-agent-protocol.md +4 -0
  262. package/docs/contracts/implement-ticket-flow.md +9 -9
  263. package/docs/contracts/install-scopes.md +77 -0
  264. package/docs/contracts/iron-law-overrides.txt +1 -1
  265. package/docs/contracts/kernel-membership.md +26 -26
  266. package/docs/contracts/linear-ai-rules-inclusion.md +1 -1
  267. package/docs/contracts/linter-structural-model.md +2 -2
  268. package/docs/contracts/load-context-budget-model.md +4 -4
  269. package/docs/contracts/load-context-schema.md +13 -13
  270. package/docs/contracts/local-analytics.md +4 -0
  271. package/docs/contracts/local-knowledge-ingestion.md +1 -1
  272. package/docs/contracts/mcp-cloud-scope.md +2 -2
  273. package/docs/contracts/mcp-phase-1-scope.md +3 -3
  274. package/docs/contracts/measurement-baseline.md +5 -5
  275. package/docs/contracts/mental-models.md +30 -30
  276. package/docs/contracts/multi-tool-projection-fidelity.md +4 -4
  277. package/docs/contracts/namespace.md +4 -4
  278. package/docs/contracts/orchestration-dsl-v1.md +7 -7
  279. package/docs/contracts/package-self-orientation.md +12 -12
  280. package/docs/contracts/persona-schema.md +6 -6
  281. package/docs/contracts/pilot/language-and-tone.md +1 -1
  282. package/docs/contracts/plain-language-surface.md +117 -0
  283. package/docs/contracts/profile-system.md +3 -3
  284. package/docs/contracts/release-pr-gating.md +103 -0
  285. package/docs/contracts/role-experience.md +3 -3
  286. package/docs/contracts/rule-classification.md +13 -13
  287. package/docs/contracts/rule-interactions.md +4 -4
  288. package/docs/contracts/rule-interactions.yml +30 -30
  289. package/docs/contracts/rule-priority-hierarchy.md +13 -13
  290. package/docs/contracts/rule-router.md +2 -2
  291. package/docs/contracts/safety-model.md +1 -1
  292. package/docs/contracts/skill-distribution-channels.md +61 -0
  293. package/docs/contracts/skill-domains.md +2 -2
  294. package/docs/contracts/smoke-contracts.md +5 -5
  295. package/docs/contracts/telegraph-telemetry.md +83 -0
  296. package/docs/contracts/trust-and-safety.md +5 -5
  297. package/docs/contracts/ui-stack-extension.md +7 -7
  298. package/docs/contracts/ui-track-flow.md +9 -9
  299. package/docs/contracts/user-type-schema.md +4 -4
  300. package/docs/contracts/workflow-packs.md +4 -4
  301. package/docs/contracts/workspace-documents.md +4 -0
  302. package/docs/customization.md +28 -8
  303. package/docs/decisions/ADR-001-kernel-swap-deferred.md +6 -6
  304. package/docs/decisions/ADR-002-kernel-bucket-overrides.md +11 -11
  305. package/docs/decisions/ADR-003-flat-cluster-subs-and-colon-syntax.md +2 -2
  306. package/docs/decisions/ADR-004-rule-governance-pruning.md +4 -4
  307. package/docs/decisions/ADR-005-subagent-worktrees.md +7 -7
  308. package/docs/decisions/ADR-011-domain-pack-readiness.md +6 -6
  309. package/docs/decisions/ADR-013-discovery-frontmatter-contract.md +3 -3
  310. package/docs/decisions/ADR-015-discovery-manifest-contract.md +3 -3
  311. package/docs/decisions/ADR-017-monorepo-physical-layout.md +10 -10
  312. package/docs/decisions/ADR-018-trust-and-safety-layer.md +6 -6
  313. package/docs/decisions/ADR-019-router-json-dist-location.md +2 -2
  314. package/docs/decisions/ADR-020-global-only-consumer-scope.md +2 -2
  315. package/docs/decisions/ADR-021-deployment-shape.md +3 -3
  316. package/docs/decisions/ADR-022-daily-workspace-decomposition.md +1 -1
  317. package/docs/decisions/ADR-027-changelog-machine-vs-manual.md +2 -2
  318. package/docs/decisions/ADR-028-root-layout.md +7 -7
  319. package/docs/decisions/ADR-029-multi-workspace-deferred.md +2 -2
  320. package/docs/decisions/ADR-rule-kernel-and-router.md +5 -5
  321. package/docs/deploy/connector-setup.md +2 -2
  322. package/docs/deploy/policy-cookbook.md +2 -2
  323. package/docs/deploy/team-deployment-posture.md +20 -0
  324. package/docs/development.md +17 -17
  325. package/docs/distribution/registries.md +32 -0
  326. package/docs/distribution/registry-submissions.md +85 -0
  327. package/docs/distribution/telemetry-schema.md +1 -1
  328. package/docs/getting-started-by-role.md +45 -3
  329. package/docs/getting-started.md +2 -2
  330. package/docs/guidelines/agent-infra/5w2h-analysis.md +3 -3
  331. package/docs/guidelines/agent-infra/ask-when-uncertain-demos.md +1 -1
  332. package/docs/guidelines/agent-infra/asking-and-brevity-examples.md +3 -3
  333. package/docs/guidelines/agent-infra/carve-out-predicates.md +3 -3
  334. package/docs/guidelines/agent-infra/critical-thinking.md +4 -4
  335. package/docs/guidelines/agent-infra/direct-answers-demos.md +1 -1
  336. package/docs/guidelines/agent-infra/first-principles.md +2 -2
  337. package/docs/guidelines/agent-infra/inversion-thinking.md +5 -5
  338. package/docs/guidelines/agent-infra/layered-settings.md +56 -2
  339. package/docs/guidelines/agent-infra/mental-models.md +3 -3
  340. package/docs/guidelines/agent-infra/roadmap-progress-mechanics.md +2 -2
  341. package/docs/guidelines/agent-infra/rule-type-governance.md +1 -1
  342. package/docs/guidelines/agent-infra/scqa-framework.md +5 -5
  343. package/docs/guidelines/agent-infra/self-improvement-pipeline.md +2 -2
  344. package/docs/guidelines/agent-infra/six-hats.md +3 -3
  345. package/docs/guidelines/agent-infra/skill-quality-checklist.md +5 -5
  346. package/docs/guidelines/agent-infra/systems-thinking.md +1 -1
  347. package/docs/guidelines/agent-infra/verify-before-complete-demos.md +1 -1
  348. package/docs/guidelines/augment-portability-patterns.md +4 -4
  349. package/docs/guidelines/cross-role-handoff.md +2 -2
  350. package/docs/guidelines/php/php-coding-patterns.md +1 -1
  351. package/docs/guidelines/prompt-templates.md +6 -6
  352. package/docs/maintainers/dev-mode.md +1 -1
  353. package/docs/mcp.md +1 -1
  354. package/docs/parity/bench.json +3 -3
  355. package/docs/parity/ruflo.md +2 -2
  356. package/docs/profiles.md +11 -11
  357. package/docs/quality.md +11 -11
  358. package/docs/safety.md +3 -3
  359. package/docs/setup/mcp-client-config.md +1 -1
  360. package/docs/setup/mcp-r2-bootstrap.md +1 -1
  361. package/docs/setup/mcp-server-docker.md +3 -3
  362. package/docs/setup/per-ide/windsurf.md +1 -1
  363. package/docs/skills-catalog.md +8 -7
  364. package/docs/troubleshooting.md +1 -1
  365. package/docs/walkthroughs/daily-workspace-a11y.md +87 -0
  366. package/llms.txt +7 -6
  367. package/package.json +1 -1
  368. package/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
  369. package/scripts/_archive/README.md +2 -2
  370. package/scripts/_archive/_backfill_skill_domains.py +3 -3
  371. package/scripts/_archive/_bootstrap_tier_frontmatter.py +3 -3
  372. package/scripts/_archive/_p43_bodies.py +10 -10
  373. package/scripts/_archive/{_p43_compress.py → _p43_condense.py} +5 -5
  374. package/scripts/_archive/_p4_migrate.py +7 -7
  375. package/scripts/_archive/_phase2_shim_helper.py +1 -1
  376. package/scripts/_archive/_pilot_council_question.py +5 -5
  377. package/scripts/_cli/explain_last/inputs.py +1 -1
  378. package/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
  379. package/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
  380. package/scripts/_lib/agent_settings.py +195 -1
  381. package/scripts/_lib/agent_src.py +19 -19
  382. package/scripts/_lib/bench_ab_cache.py +162 -0
  383. package/scripts/_lib/bench_ab_scoring.py +209 -0
  384. package/scripts/_lib/{bench_caveman.py → bench_telegraph.py} +21 -21
  385. package/scripts/_lib/{bench_caveman_report.py → bench_telegraph_report.py} +21 -21
  386. package/scripts/_lib/claude_desktop_bundler.py +5 -5
  387. package/scripts/_lib/module_detection.py +223 -0
  388. package/scripts/_lib/scope_guard.sh +162 -0
  389. package/scripts/_phase4_bucket.py +3 -3
  390. package/scripts/_pilot_measure.py +4 -4
  391. package/scripts/_tmp_scan_framework_leakage.py +1 -1
  392. package/scripts/adoption_report.py +195 -0
  393. package/scripts/adoption_snapshot.py +219 -0
  394. package/scripts/adoption_status.py +166 -0
  395. package/scripts/ai-video/lib/parse-blueprint.sh +1 -1
  396. package/scripts/ai_council/advisors.py +5 -5
  397. package/scripts/ai_council/compile_corpus.py +1 -1
  398. package/scripts/ai_council/one_off_archive/2026-05/_one_off_budget_v2_audit.py +3 -3
  399. package/scripts/ai_council/one_off_archive/2026-05/_one_off_context_layer_v1_review.py +2 -2
  400. package/scripts/ai_council/one_off_archive/2026-05/_one_off_inject_quiet_flag.py +1 -1
  401. package/scripts/ai_council/one_off_archive/2026-05/_one_off_measure_v2.sh +1 -1
  402. package/scripts/ai_council/one_off_archive/2026-05/_one_off_measure_verbosity.sh +1 -1
  403. package/scripts/ai_council/one_off_archive/2026-05/_one_off_nondestructive_inline_audit.py +3 -3
  404. package/scripts/ai_council/one_off_archive/2026-05/_one_off_per_task.sh +1 -1
  405. package/scripts/ai_council/one_off_archive/2026-05/_one_off_phase6_trigger_jaccard.py +1 -1
  406. package/scripts/ai_council/one_off_archive/2026-05/_one_off_phase_2a_budget_rebalance.py +6 -6
  407. package/scripts/ai_council/one_off_archive/2026-05/_one_off_rebalancing_audit.py +1 -1
  408. package/scripts/ai_council/one_off_archive/2026-05/_one_off_tier_retrofit.py +6 -6
  409. package/scripts/annotate_discovery.py +13 -13
  410. package/scripts/apply_modules_config.py +290 -0
  411. package/scripts/audit_adr_coverage.py +2 -2
  412. package/scripts/audit_auto_rules.py +2 -2
  413. package/scripts/audit_cloud_compatibility.py +3 -3
  414. package/scripts/audit_command_surface.py +9 -9
  415. package/scripts/audit_likelihood.py +2 -2
  416. package/scripts/audit_user_type_axis.py +2 -2
  417. package/scripts/bench_ab_cache_dispatch.py +68 -0
  418. package/scripts/bench_ab_clone.py +170 -0
  419. package/scripts/bench_ab_diff.py +227 -0
  420. package/scripts/bench_ab_integrity.py +143 -0
  421. package/scripts/bench_ab_run.py +235 -0
  422. package/scripts/bench_ab_task_runner.py +369 -0
  423. package/scripts/bench_ab_tracka_run.py +202 -0
  424. package/scripts/{bench_compress_memory.py → bench_condense_memory.py} +16 -16
  425. package/scripts/bench_run.py +33 -33
  426. package/scripts/bench_runner.py +2 -2
  427. package/scripts/bootstrap.sh +99 -0
  428. package/scripts/build_cloud_bundle.py +6 -6
  429. package/scripts/build_discovery_manifest.py +7 -7
  430. package/scripts/build_linear_digest.py +3 -3
  431. package/scripts/build_rule_trigger_matrix.py +8 -8
  432. package/scripts/chat_history.py +5 -5
  433. package/scripts/check_always_budget.py +11 -5
  434. package/scripts/check_augment_description_cap.py +3 -3
  435. package/scripts/check_cluster_patterns.py +2 -2
  436. package/scripts/check_command_count_messaging.py +3 -3
  437. package/scripts/{check_compression.py → check_condensation.py} +34 -34
  438. package/scripts/{check_compressed_paths.py → check_condensed_paths.py} +8 -8
  439. package/scripts/check_context_paths.py +7 -7
  440. package/scripts/check_council_layout.py +2 -2
  441. package/scripts/check_council_references.py +9 -9
  442. package/scripts/check_iron_law_prominence.py +2 -2
  443. package/scripts/check_kernel_rule_bundle.py +2 -2
  444. package/scripts/check_module_management_neutral.py +149 -0
  445. package/scripts/check_no_roadmap_refs.py +9 -9
  446. package/scripts/check_portability.py +3 -3
  447. package/scripts/check_public_catalog_links.py +4 -4
  448. package/scripts/check_references.py +7 -6
  449. package/scripts/check_release_pr_shape.py +112 -0
  450. package/scripts/check_reply_consistency.py +3 -3
  451. package/scripts/check_safety_floor_untouched.py +1 -1
  452. package/scripts/check_template_pin_drift.py +5 -5
  453. package/scripts/check_token_optimizer_freshness.py +3 -3
  454. package/scripts/ci_status.py +301 -0
  455. package/scripts/ci_time_ratio.py +1 -1
  456. package/scripts/cleanup_other_scope.sh +146 -0
  457. package/scripts/compile_router.py +10 -10
  458. package/scripts/{compress.py → condense.py} +64 -64
  459. package/scripts/condense.sh +18 -0
  460. package/scripts/{compress_memory.py → condense_memory.py} +33 -33
  461. package/scripts/config/presets.py +2 -2
  462. package/scripts/config/profiles.py +1 -1
  463. package/scripts/cost_by_conversation.py +3 -3
  464. package/scripts/cost_summary.py +7 -7
  465. package/scripts/count_token_optimizer_usage.sh +1 -1
  466. package/scripts/gen_discovery_baseline.py +5 -5
  467. package/scripts/generate_index.py +6 -6
  468. package/scripts/generate_ownership_matrix.py +10 -10
  469. package/scripts/generate_pack_manifests.py +1 -1
  470. package/scripts/ghostwriter_fixture_allowlist.txt +1 -1
  471. package/scripts/install +3 -3
  472. package/scripts/install-hooks.sh +6 -6
  473. package/scripts/install.py +273 -45
  474. package/scripts/install.sh +187 -1
  475. package/scripts/inventory_frontmatter.py +2 -2
  476. package/scripts/iron_law_sha.py +3 -3
  477. package/scripts/lint_agents_layout.py +14 -7
  478. package/scripts/lint_agents_md.py +4 -4
  479. package/scripts/lint_archived_skills.py +3 -3
  480. package/scripts/lint_artefact_frontmatter.py +2 -2
  481. package/scripts/lint_bench_ab.py +172 -0
  482. package/scripts/lint_bench_corpus.py +1 -1
  483. package/scripts/lint_command_tiers.py +5 -5
  484. package/scripts/lint_context_spine_usage.py +1 -1
  485. package/scripts/lint_framework_leakage.py +7 -7
  486. package/scripts/lint_framework_leakage_allowlist.json +152 -84
  487. package/scripts/lint_ghostwriter_source.py +3 -3
  488. package/scripts/lint_handoffs.py +1 -1
  489. package/scripts/lint_load_context.py +11 -11
  490. package/scripts/lint_media_policy_linkage.py +5 -5
  491. package/scripts/lint_namespace.py +1 -1
  492. package/scripts/lint_no_new_atomic_commands.py +2 -2
  493. package/scripts/lint_orchestration_dsl.py +1 -1
  494. package/scripts/lint_pack_boundaries.py +2 -2
  495. package/scripts/lint_persona_governance.py +4 -4
  496. package/scripts/lint_role_experiences.py +237 -0
  497. package/scripts/lint_rule_interactions.py +2 -2
  498. package/scripts/lint_rule_tiers.py +1 -1
  499. package/scripts/lint_trust_coherence.py +2 -2
  500. package/scripts/mcp_registry_submit.sh +187 -0
  501. package/scripts/mcp_server/tools.py +1 -1
  502. package/scripts/measure_frugality_savings.py +10 -10
  503. package/scripts/measure_patterns.py +1 -1
  504. package/scripts/measure_projection_bytes.py +5 -5
  505. package/scripts/measure_rule_budget.py +3 -3
  506. package/scripts/measure_skill_reduction.py +1 -1
  507. package/scripts/memory_lookup.py +1 -1
  508. package/scripts/memory_status.py +2 -2
  509. package/scripts/migrate_command_suggestions.py +3 -3
  510. package/scripts/mine_session.py +1 -1
  511. package/scripts/move_artefact.py +3 -3
  512. package/scripts/new_skill.py +2 -2
  513. package/scripts/pack_mcp_content.py +9 -9
  514. package/scripts/plan_physical_move.py +6 -6
  515. package/scripts/print_required_checks.py +196 -0
  516. package/scripts/probe_skill_registration.py +413 -0
  517. package/scripts/propose_modules_config.py +145 -0
  518. package/scripts/prototype_lint_contradictions.py +1 -1
  519. package/scripts/recruit_preflight.sh +152 -0
  520. package/scripts/refine_ticket_detect.py +3 -3
  521. package/scripts/release.py +20 -0
  522. package/scripts/render_benchmark_md.py +308 -0
  523. package/scripts/roadmap_progress_hook.py +1 -1
  524. package/scripts/run_skill_evals.py +2 -2
  525. package/scripts/runtime_registry.py +4 -4
  526. package/scripts/schemas/command.schema.json +4 -4
  527. package/scripts/schemas/rule.schema.json +5 -5
  528. package/scripts/schemas/skill.schema.json +3 -3
  529. package/scripts/schemas/user-type.schema.json +1 -1
  530. package/scripts/score_skill_selection.py +1 -1
  531. package/scripts/skill_collision_clusters.py +2 -2
  532. package/scripts/skill_linter.py +81 -81
  533. package/scripts/skill_overlap.py +5 -5
  534. package/scripts/skill_tools/audit_persona_coverage.py +2 -2
  535. package/scripts/skill_tools/audit_user_type_coverage.py +2 -2
  536. package/scripts/skill_tools/run_block_d_eval.py +1 -1
  537. package/scripts/skill_tools/score_skill_relevance.py +1 -1
  538. package/scripts/skill_tools/suggest_skill_for_task.py +1 -1
  539. package/scripts/skill_trigger_eval.py +3 -3
  540. package/scripts/smoke/kernel.sh +7 -1
  541. package/scripts/smoke/router.sh +5 -5
  542. package/scripts/smoke/skills.sh +1 -1
  543. package/scripts/smoke_quickstart.py +1 -1
  544. package/scripts/snapshot_agent_outputs.py +3 -3
  545. package/scripts/spotcheck_thin_root.py +1 -1
  546. package/scripts/{caveman_stats.py → telegraph_stats.py} +18 -18
  547. package/scripts/update_counts.py +1 -1
  548. package/scripts/validate_decision_engine.py +1 -1
  549. package/scripts/validate_frontmatter.py +1 -1
  550. package/scripts/validate_safe_paths.py +3 -3
  551. package/scripts/{validate_caveman_carveouts.py → validate_telegraph_carveouts.py} +7 -7
  552. package/scripts/verify_roadmap_closure.py +6 -6
  553. package/templates/consumer-settings/ONBOARDING.md +41 -0
  554. package/.agent-src/commands/install-via-agent.md +0 -129
  555. package/.agent-src/skills/compress-memory/SKILL.md +0 -131
  556. package/dist/ui/assets/index-D-DY1ywI.js +0 -35
  557. package/dist/ui/assets/index-D-DY1ywI.js.map +0 -1
  558. package/dist/ui/assets/index-Dqfhmg-d.css +0 -1
  559. package/docs/adrs/caveman/README.md +0 -9
  560. package/docs/contracts/caveman-telemetry.md +0 -83
  561. package/scripts/compress.sh +0 -18
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env python3
2
+ """Top-level orchestrator for the package-impact A/B bench.
3
+
4
+ Phase 2 Step 1 of `agents/roadmaps/road-to-package-impact-benchmark.md`.
5
+
6
+ A thin wrapper around the per-track runners (Track A behavioural eval,
7
+ Track B task corpus). Owns:
8
+
9
+ - the `--variant {with,without}` axis,
10
+ - the cache lookup that decides whether the `without` arm runs at all,
11
+ - the report-header convention (cache key, variant, corpus, timestamp),
12
+ - the report-path convention `internal/bench/reports/ab/{stamp}-{corpus}-{variant}.json`.
13
+
14
+ Track A's actual runner lands in Phase 3; Track B's in Phase 4. Until then
15
+ this script writes stub reports so the cache and diff plumbing can be
16
+ exercised end-to-end.
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import json
22
+ import sys
23
+ import time
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+
27
+ REPO_ROOT = Path(__file__).resolve().parent.parent
28
+ sys.path.insert(0, str(REPO_ROOT / "scripts"))
29
+
30
+ from _lib import bench_ab_cache # type: ignore[import-not-found] # noqa: E402
31
+
32
+ REPORTS_DIR = REPO_ROOT / "internal" / "bench" / "reports" / "ab"
33
+ CORPUS_DIR = REPO_ROOT / "internal" / "bench" / "corpora"
34
+ CLONES_DIR = REPO_ROOT / "internal" / "bench" / "ab" / "clones"
35
+
36
+ # Supported corpora (created in Phases 3 + 4).
37
+ KNOWN_CORPORA = ("ab-tracka", "ab-trackb")
38
+
39
+ REPORT_SCHEMA_VERSION = "ab-bench/0.1"
40
+
41
+
42
def utc_stamp() -> str:
    """Return the current UTC time as a ``YYYY-MM-DDTHH-MM-SSZ`` stamp.

    The stamp uses dashes instead of colons in the time part; it is used as
    the leading component of report file names.
    """
    now = datetime.now(tz=timezone.utc)
    return f"{now:%Y-%m-%dT%H-%M-%SZ}"
44
+
45
+
46
def corpus_path(corpus: str) -> Path:
    """Return the ``<corpus>.yaml`` path inside ``CORPUS_DIR``.

    The file is not required to exist; callers check for that themselves.
    """
    filename = f"{corpus}.yaml"
    return CORPUS_DIR.joinpath(filename)
48
+
49
+
50
def report_path(stamp: str, corpus: str, variant: str) -> Path:
    """Return the JSON report path ``REPORTS_DIR/{stamp}-{corpus}-{variant}.json``."""
    filename = f"{stamp}-{corpus}-{variant}.json"
    return REPORTS_DIR.joinpath(filename)
52
+
53
+
54
def ensure_clone(variant: str) -> Path:
    """Return the clone directory for *variant*, invoking the clone helper if absent.

    An already-existing directory is returned untouched. When it is missing,
    ``scripts/bench_ab_clone.py`` is loaded from its file path and its
    ``clone(variant, refresh=False)`` is called; refreshing is never requested
    here because that is a user-driven choice.

    Raises:
        RuntimeError: if the helper script cannot be loaded as a module.
    """
    clone_dir = CLONES_DIR / variant
    if clone_dir.exists():
        return clone_dir

    # Imported lazily so the dependency stays explicit at the point of use.
    from importlib import util as importlib_util

    helper_file = REPO_ROOT / "scripts" / "bench_ab_clone.py"
    helper_spec = importlib_util.spec_from_file_location("bench_ab_clone", helper_file)
    if helper_spec is None or helper_spec.loader is None:
        raise RuntimeError("cannot load bench_ab_clone helper")
    helper = importlib_util.module_from_spec(helper_spec)
    helper_spec.loader.exec_module(helper)
    helper.clone(variant, refresh=False)  # type: ignore[attr-defined]
    return clone_dir
70
+
71
+
72
def run_track_stub(variant: str, corpus: str, clone_root: Path) -> dict[str, object]:
    """Build the Phase-2 placeholder results block for one run.

    No track is executed; the returned mapping only records which corpus and
    variant were requested and where the clone lives (relative to the repo
    root). Phase 3 and Phase 4 are meant to plug their real runners in here.

    Raises:
        ValueError: if *clone_root* is not located under ``REPO_ROOT``.
    """
    placeholder_note = (
        "Phase 2 plumbing only. The real runner lands in Phase 3 (Track A) "
        "or Phase 4 (Track B). See road-to-package-impact-benchmark.md."
    )
    relative_clone = clone_root.relative_to(REPO_ROOT)
    return dict(
        track=corpus,
        status="stub",
        note=placeholder_note,
        clone_root=str(relative_clone),
        variant=variant,
    )
88
+
89
+
90
def write_report(
    *,
    variant: str,
    corpus: str,
    stamp: str,
    cache_key: bench_ab_cache.CacheKey,
    results: dict[str, object],
    duration_seconds: float,
) -> Path:
    """Write the JSON report plus its Markdown sibling and return the JSON path.

    Args:
        variant: Which arm the run belongs to.
        corpus: Name of the corpus that was executed.
        stamp: Timestamp string used in the report header and file name.
        cache_key: Cache key recorded in the header via ``to_dict()``.
        results: Results block produced by the track runner.
        duration_seconds: Run duration; stored rounded to milliseconds.

    Returns:
        Path of the ``.json`` report. A ``.md`` rendering with the same stem
        is written next to it.
    """
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report = {
        "schema": REPORT_SCHEMA_VERSION,
        "stamp": stamp,
        "variant": variant,
        "corpus": corpus,
        "cache_key": cache_key.to_dict(),
        "duration_seconds": round(duration_seconds, 3),
        "results": results,
    }
    json_path = report_path(stamp, corpus, variant)
    # Explicit UTF-8: the Markdown header contains non-ASCII punctuation, so
    # relying on the locale's default encoding can fail or produce files that
    # differ between machines.
    json_path.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
    md_path = json_path.with_suffix(".md")
    md_path.write_text(render_markdown(report), encoding="utf-8")
    return json_path
114
+
115
+
116
def render_markdown(report: dict[str, object]) -> str:
    """Render a report mapping as a small Markdown document.

    The output has a title line (variant and corpus), the stamp and duration,
    one bullet per ``cache_key`` entry (none if the key is missing or empty),
    and the ``results`` value as an indented JSON code block. The text ends
    with a single trailing newline.

    Raises:
        KeyError: if ``variant``, ``corpus``, ``stamp`` or
            ``duration_seconds`` is missing from *report*.
    """
    header = [
        f"# A/B Bench Report — {report['variant']} · {report['corpus']}",
        "",
        f"- Stamp: `{report['stamp']}`",
        f"- Duration: {report['duration_seconds']}s",
        "",
        "## Cache key",
        "",
    ]
    cache_key = report.get("cache_key") or {}
    key_bullets = [
        f"- `{name}`: `{value}`"
        for name, value in cache_key.items()  # type: ignore[union-attr]
    ]
    results_section = [
        "",
        "## Results",
        "",
        "```json",
        json.dumps(report.get("results"), indent=2),
        "```",
        "",
    ]
    return "\n".join([*header, *key_bullets, *results_section])
136
+
137
+
138
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command line for a single bench arm.

    ``--variant`` and ``--corpus`` are required; ``--non-interactive`` and
    ``--reuse-cache`` are boolean flags that default to ``False``.
    """
    cli = argparse.ArgumentParser(
        description="Run one arm of the package-impact A/B bench."
    )
    cli.add_argument(
        "--variant",
        required=True,
        choices=("with", "without"),
        help="Which target clone to run against.",
    )
    cli.add_argument(
        "--corpus",
        required=True,
        choices=KNOWN_CORPORA,
        help="Which corpus to execute.",
    )
    cli.add_argument(
        "--non-interactive",
        action="store_true",
        help="Never prompt; assume defaults on cache decisions.",
    )
    reuse_cache_help = (
        "If a fresh cached `without` report exists, skip re-running and "
        "exit 0 without writing a new report. Only meaningful for "
        "--variant without."
    )
    cli.add_argument("--reuse-cache", action="store_true", help=reuse_cache_help)
    return cli.parse_args(argv)
169
+
170
+
171
def main(argv: list[str] | None = None) -> int:
    """Run one arm of the bench and write its report; always returns 0.

    Steps:
      1. Parse arguments (``sys.argv[1:]`` when *argv* is ``None``).
      2. Announce a missing corpus file and fall back to the synthetic
         ``"missing-corpus"`` hash in the cache key.
      3. For ``--variant without --reuse-cache`` with an existing corpus,
         consult the cache: a fresh report ends the run immediately, a stale
         one ends it only under ``--non-interactive``.
      4. Otherwise make sure the clone exists, run the stub track, and write
         the JSON/Markdown report.
    """
    cli_args = sys.argv[1:] if argv is None else argv
    args = parse_args(cli_args)

    corpus_file = corpus_path(args.corpus)
    if not corpus_file.exists():
        sys.stdout.write(
            f"bench_ab_run: corpus '{args.corpus}' missing at {corpus_file} — "
            "Phase 3 (track A) or Phase 4 (track B) author it. Writing a "
            "placeholder run with the synthetic corpus hash so cache plumbing "
            "remains exercisable.\n"
        )

    if corpus_file.exists():
        corpus_hash = bench_ab_cache.hash_file(corpus_file)
    else:
        corpus_hash = "missing-corpus"
    cache_key_value = bench_ab_cache.CacheKey(
        corpus_hash=corpus_hash,
        claude_cli_version=bench_ab_cache.claude_cli_version(),
        target_shape_hash=bench_ab_cache.target_shape_hash(),
    )

    wants_cached_baseline = args.variant == "without" and args.reuse_cache
    if wants_cached_baseline and corpus_file.exists():
        lookup = bench_ab_cache.lookup(corpus_file)
        if lookup.fresh and lookup.report_path is not None:
            cached_report = lookup.report_path.relative_to(REPO_ROOT)
            sys.stdout.write(
                f"bench_ab_run: reusing fresh cached `without` report at {cached_report}\n"
            )
            return 0
        if lookup.found and not lookup.fresh:
            sys.stdout.write(
                f"bench_ab_run: cached `without` report stale ({lookup.reason})\n"
            )
            if args.non_interactive:
                sys.stdout.write(
                    "bench_ab_run: --non-interactive — reusing stale baseline "
                    "and flagging the run.\n"
                )
                return 0
            sys.stdout.write(
                "bench_ab_run: continuing with a fresh run "
                "(set --reuse-cache off and use --non-interactive to keep the stale baseline)\n"
            )

    clone_root = ensure_clone(args.variant)
    # Only the track execution itself is timed, not clone setup or reporting.
    started = time.monotonic()
    results = run_track_stub(args.variant, args.corpus, clone_root)
    duration = time.monotonic() - started
    written = write_report(
        variant=args.variant,
        corpus=args.corpus,
        stamp=utc_stamp(),
        cache_key=cache_key_value,
        results=results,
        duration_seconds=duration,
    )
    sys.stdout.write(f"bench_ab_run: wrote {written.relative_to(REPO_ROOT)}\n")
    return 0
232
+
233
+
234
+ if __name__ == "__main__":
235
+ raise SystemExit(main())
@@ -0,0 +1,369 @@
1
+ #!/usr/bin/env python3
2
+ """Track B — task runner for the package-impact A/B bench.
3
+
4
+ Phase 4 Step 2 of `agents/roadmaps/road-to-package-impact-benchmark.md`.
5
+
6
+ For each task in `internal/bench/corpora/ab-trackb.yaml`, in each variant:
7
+
8
+ 1. Snapshot the variant clone's file tree.
9
+ 2. Invoke the `claude` CLI with the task prompt — OR dry-run, depending
10
+ on `--mode`.
11
+ 3. Capture the transcript, tool-call events, wall-time, and (if available)
12
+ token + cost counts.
13
+ 4. Snapshot the post-run tree.
14
+ 5. Score the task via scripts/_lib/bench_ab_scoring.py.
15
+
16
+ Modes:
17
+
18
+ - `dry-run` (default) — record the would-run shell command, write a stub
19
+ transcript naming the variant, score against the unchanged tree. The
20
+ result is structural-zero for every check that requires a file write,
21
+ but the scoring + reporting pipeline runs end-to-end. This is what the
22
+ bench produces in CI by default — fast, free, repeatable.
23
+ - `live` — actually invoke the `claude` CLI with `--print` (one-shot
24
+ mode) and the task prompt. Reads `CLAUDE_CLI` from env if set, falls
25
+ back to `claude` on PATH. Captures stdout as the transcript. Honors
26
+ `--samples N` for repeated runs.
27
+
28
+ The runner ALWAYS resets the clone to a clean state before each task and
29
+ ALWAYS records the mode in the report header so a reader can never mistake
30
+ a dry-run report for a real measurement.
31
+ """
32
+ from __future__ import annotations
33
+
34
+ import argparse
35
+ import hashlib
36
+ import json
37
+ import os
38
+ import shutil
39
+ import subprocess
40
+ import sys
41
+ import time
42
+ from datetime import datetime, timezone
43
+ from pathlib import Path
44
+
45
+ REPO_ROOT = Path(__file__).resolve().parent.parent
46
+ sys.path.insert(0, str(REPO_ROOT / "scripts"))
47
+
48
+ from _lib import bench_ab_cache # type: ignore[import-not-found] # noqa: E402
49
+ from _lib import bench_ab_scoring # type: ignore[import-not-found] # noqa: E402
50
+
51
+ try:
52
+ import yaml
53
+ except ImportError:
54
+ sys.stderr.write("bench_ab_task_runner: PyYAML required (pip install pyyaml)\n")
55
+ raise SystemExit(2)
56
+
57
+ CORPUS_PATH = REPO_ROOT / "internal" / "bench" / "corpora" / "ab-trackb.yaml"
58
+ CLONES_DIR = REPO_ROOT / "internal" / "bench" / "ab" / "clones"
59
+ REPORTS_DIR = REPO_ROOT / "internal" / "bench" / "reports" / "ab"
60
+
61
+ # How far we descend into a clone when snapshotting. The fixture is shallow.
62
+ SNAPSHOT_MAX_DEPTH = 6
63
+
64
+
65
def utc_stamp() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH-MM-SSZ``.

    The stamp is used as the prefix of report file names, hence dashes rather
    than colons in the time part.
    """
    layout = "%Y-%m-%dT%H-%M-%SZ"
    return format(datetime.now(timezone.utc), layout)
67
+
68
+
69
def snapshot_clone(clone_root: Path, *, max_depth: int = SNAPSHOT_MAX_DEPTH) -> dict[str, str]:
    """Map each fixture file under *clone_root* to a short content digest.

    Keys are POSIX-style paths relative to *clone_root*; values are the first
    16 hex characters of the file's SHA-256. Excluded from the snapshot:

    - anything below a top-level ``.claude`` or ``.augment`` directory,
    - top-level ``AGENTS.md``, ``CLAUDE.md`` and ``.bench-ab-manifest.json``
      (the agent-config surface is the variant axis, not the task surface),
    - paths with more than *max_depth* components,
    - files that cannot be read (``OSError`` is skipped silently).
    """
    ignored_dirs = (".claude", ".augment")
    ignored_files = ("AGENTS.md", "CLAUDE.md", ".bench-ab-manifest.json")
    digests: dict[str, str] = {}
    for candidate in sorted(clone_root.rglob("*")):
        if not candidate.is_file():
            continue
        relative = candidate.relative_to(clone_root)
        segments = relative.parts
        key = relative.as_posix()
        excluded = (
            (bool(segments) and segments[0] in ignored_dirs)
            or key in ignored_files
            or len(segments) > max_depth
        )
        if excluded:
            continue
        try:
            payload = candidate.read_bytes()
        except OSError:
            continue
        digests[key] = hashlib.sha256(payload).hexdigest()[:16]
    return digests
96
+
97
+
98
def reset_clone(variant: str) -> Path:
    """Rebuild the clone for *variant* so each task starts from the same state.

    Loads ``scripts/bench_ab_clone.py`` from its file path and returns the
    result of its ``clone(variant, refresh=True)``.

    Raises:
        RuntimeError: if the helper script cannot be loaded as a module.
    """
    from importlib import util as importlib_util

    helper_file = REPO_ROOT / "scripts" / "bench_ab_clone.py"
    helper_spec = importlib_util.spec_from_file_location("bench_ab_clone", helper_file)
    if helper_spec is None or helper_spec.loader is None:
        raise RuntimeError("cannot load bench_ab_clone helper")
    helper = importlib_util.module_from_spec(helper_spec)
    helper_spec.loader.exec_module(helper)
    return helper.clone(variant, refresh=True)  # type: ignore[attr-defined]
110
+
111
+
112
def claude_executable() -> str | None:
    """Resolve the claude CLI binary.

    A non-empty ``CLAUDE_CLI`` environment variable wins and is returned
    verbatim (it is not checked for existence). Otherwise ``"claude"`` is
    returned when it can be found on ``PATH``, else ``None``.
    """
    from_env = os.environ.get("CLAUDE_CLI")
    if from_env:
        return from_env
    on_path = shutil.which("claude")
    return None if on_path is None else "claude"
120
+
121
+
122
def run_live(task: dict, clone_root: Path, *, timeout_s: int) -> dict:
    """Invoke claude in print/one-shot mode against the task prompt.

    Args:
        task: Corpus entry; its ``"prompt"`` value (default ``""``) is passed
            to the CLI after a ``--`` separator.
        clone_root: Working directory for the CLI process.
        timeout_s: Per-task timeout in seconds.

    Returns:
        A dict with ``mode``, ``reason``, ``transcript``, ``exit_code`` and
        ``wall_time_seconds``:

        - ``mode == "live-skipped"`` when no CLI could be resolved or the
          resolved binary could not be launched (empty transcript,
          ``exit_code`` ``None``);
        - ``mode == "live"`` with ``exit_code == -1`` and a ``[TIMEOUT]``
          marker appended to any partial stdout when the timeout expired;
        - ``mode == "live"`` with the process's return code and
          stdout + stderr as the transcript otherwise.
    """
    binary = claude_executable()
    if binary is None:
        return {
            "mode": "live-skipped",
            "reason": "claude CLI not found; set CLAUDE_CLI or install it",
            "transcript": "",
            "exit_code": None,
            "wall_time_seconds": 0.0,
        }
    prompt = task.get("prompt", "")
    cmd = [binary, "--print", "--", prompt]
    started = time.monotonic()
    try:
        proc = subprocess.run(
            cmd,
            cwd=clone_root,
            capture_output=True,
            text=True,
            timeout=timeout_s,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        # TimeoutExpired carries captured output as *bytes* even when
        # text=True was requested, so it must be decoded before it can be
        # concatenated with the str marker below.
        partial = exc.stdout or ""
        if isinstance(partial, bytes):
            partial = partial.decode("utf-8", errors="replace")
        return {
            "mode": "live",
            "reason": f"timeout after {timeout_s}s",
            "transcript": partial + "\n[TIMEOUT]",
            "exit_code": -1,
            "wall_time_seconds": round(time.monotonic() - started, 3),
        }
    except OSError as exc:
        # E.g. CLAUDE_CLI points at a missing or non-executable file. Record
        # it the same way as an unresolved CLI instead of aborting the run.
        return {
            "mode": "live-skipped",
            "reason": f"could not launch {binary!r}: {exc}",
            "transcript": "",
            "exit_code": None,
            "wall_time_seconds": 0.0,
        }
    duration = time.monotonic() - started
    return {
        "mode": "live",
        "reason": "ok",
        "transcript": proc.stdout + "\n" + proc.stderr,
        "exit_code": proc.returncode,
        "wall_time_seconds": round(duration, 3),
    }
161
+
162
+
163
def run_dry(task: dict, clone_root: Path, variant: str) -> dict:
    """Produce a deterministic stub result without invoking the CLI.

    The transcript names the variant, the clone location and the task id, but
    deliberately leaves out the task prompt: echoing it would let
    transcript-keyword criteria match the prompt text rather than an agent
    response. The stub is therefore inert for such criteria, which is the
    honest dry-run signal.

    Returns:
        A dict with ``mode == "dry-run"``, ``reason == "ok"``, the stub
        ``transcript``, ``exit_code == 0`` and ``wall_time_seconds == 0.0``.
    """
    transcript_lines = [
        "[bench_ab_task_runner dry-run]",
        f"variant={variant}",
        f"clone={clone_root}",
        f"task_id={task.get('id')}",
        "[no claude invocation; --mode live to execute for real]",
    ]
    return {
        "mode": "dry-run",
        "reason": "ok",
        "transcript": "\n".join(transcript_lines) + "\n",
        "exit_code": 0,
        "wall_time_seconds": 0.0,
    }
185
+
186
+
187
def count_ask_events(transcript: str) -> dict[str, int]:
    """Crude ask-vs-act heuristic over the transcript.

    Counts case-insensitive substring occurrences of a few "asking" phrases
    (English and German) and of a few commit/push/PR commands.

    Returns:
        ``{"asked", "acted_with_commit", "ratio"}`` where ``ratio`` is
        ``asked / (asked + acted_with_commit)`` rounded to three decimals, or
        ``0`` when neither kind of marker occurs (including an empty
        transcript).
    """
    asked = 0
    acted = 0
    ratio = 0
    if transcript:
        lowered = transcript.lower()
        for phrase in ("should i", "do you want", "shall i", "soll ich", "möchtest du"):
            asked += lowered.count(phrase)
        for command in ("git commit", "git push", "gh pr create", "gh pr merge"):
            acted += lowered.count(command)
        events = asked + acted
        if events:
            ratio = round(asked / events, 3)
    return {"asked": asked, "acted_with_commit": acted, "ratio": ratio}
199
+
200
+
201
def per_category_aggregate(per_task: list[dict]) -> dict[str, dict]:
    """Group per-task entries by category and summarise each group.

    Args:
        per_task: Entries with optional ``category``, ``score`` (a mapping
            with a ``passed`` flag) and ``wall_time_seconds`` keys.

    Returns:
        ``{category: {"passed", "total", "completion_rate", "mean_wall_time"}}``
        in first-seen category order. ``completion_rate`` is rounded to four
        decimals and ``mean_wall_time`` to three.
    """
    by_cat: dict[str, list[dict]] = {}
    for entry in per_task:
        # `or` rather than a .get() default: entries can carry the key with a
        # None/empty value (a task without a category), and those must land
        # in the "unknown" bucket too instead of under a None key.
        category = entry.get("category") or "unknown"
        by_cat.setdefault(category, []).append(entry)

    out: dict[str, dict] = {}
    for cat, entries in by_cat.items():
        # Every bucket holds at least one entry, so the divisions are safe.
        total = len(entries)
        passed = sum(1 for e in entries if e.get("score", {}).get("passed"))
        wall_time = sum(e.get("wall_time_seconds", 0) for e in entries)
        out[cat] = {
            "passed": passed,
            "total": total,
            "completion_rate": round(passed / total, 4),
            "mean_wall_time": round(wall_time / total, 3),
        }
    return out
220
+
221
+
222
def write_report(
    variant: str,
    *,
    mode: str,
    per_task: list[dict],
    duration: float,
) -> Path:
    """Write the Track B JSON report and Markdown summary; return the JSON path.

    Args:
        variant: Which arm the tasks were run against.
        mode: Run mode recorded in the results block and the Markdown title.
        per_task: One entry per task, as assembled by the runner.
        duration: Wall-clock seconds for the whole variant; stored rounded to
            milliseconds.

    Returns:
        Path of ``REPORTS_DIR/{stamp}-ab-trackb-{variant}.json``. A ``.md``
        summary with the same stem is written alongside it.
    """
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    cache_key = bench_ab_cache.CacheKey(
        corpus_hash=bench_ab_cache.hash_file(CORPUS_PATH),
        claude_cli_version=bench_ab_cache.claude_cli_version(),
        target_shape_hash=bench_ab_cache.target_shape_hash(),
    )
    total = len(per_task)
    passed = sum(1 for e in per_task if e.get("score", {}).get("passed"))
    results = {
        "mode": mode,
        "completion_rate": round(passed / total, 4) if total else 0,
        "passed": passed,
        "total": total,
        "per_category": per_category_aggregate(per_task),
        "mean_wall_time": round(
            sum(e.get("wall_time_seconds", 0) for e in per_task) / total, 3
        )
        if total
        else 0,
        # Mean of the per-task ratios, not a ratio of pooled counts.
        "ask_vs_act_ratio": round(
            sum(e.get("ask_events", {}).get("ratio", 0) for e in per_task) / total, 3
        )
        if total
        else 0,
        "per_task": per_task,
    }
    stamp = utc_stamp()
    payload = {
        "schema": "ab-bench/0.1",
        "stamp": stamp,
        "variant": variant,
        "corpus": "ab-trackb",
        "cache_key": cache_key.to_dict(),
        "duration_seconds": round(duration, 3),
        "results": results,
    }
    path = REPORTS_DIR / f"{stamp}-ab-trackb-{variant}.json"
    # Explicit UTF-8: the Markdown below contains non-ASCII punctuation, so
    # the locale's default encoding must not decide whether writing succeeds.
    path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    category_lines = "\n".join(
        f"- `{cat}` — {info['passed']}/{info['total']} "
        f"({info['completion_rate'] * 100:.1f}%)"
        for cat, info in results["per_category"].items()
    )
    md = path.with_suffix(".md")
    md.write_text(
        f"# Track B · {variant} · {mode}\n\n"
        f"- Stamp: `{stamp}`\n"
        f"- Completion rate: **{results['completion_rate'] * 100:.1f}%**"
        f" ({passed}/{total})\n"
        f"- Mean wall-time: {results['mean_wall_time']}s\n"
        f"- Ask vs. act ratio: {results['ask_vs_act_ratio']}\n"
        f"\n## Per-category\n\n"
        + category_lines
        + "\n",
        encoding="utf-8",
    )
    return path
284
+
285
+
286
def run_variant(variant: str, tasks: list[dict], *, mode: str, timeout_s: int) -> dict:
    """Run every task against one variant, score it, and write the report.

    For each task the clone is rebuilt, snapshotted, the task is executed
    (``run_live`` when *mode* is ``"live"``, ``run_dry`` otherwise),
    the clone is snapshotted again and the pair of snapshots plus the
    transcript are handed to ``bench_ab_scoring.score_task``.

    Args:
        variant: Clone variant to run against.
        tasks: Task entries from the corpus.
        mode: ``"live"`` or anything else for a dry run.
        timeout_s: Per-task timeout forwarded to the live runner.

    Returns:
        ``{"path": <report path>, "per_task": [...], "duration": <seconds>}``.
    """
    started = time.monotonic()
    per_task: list[dict] = []
    for task in tasks:
        clone_root = reset_clone(variant)
        pre = snapshot_clone(clone_root)
        if mode == "live":
            run_result = run_live(task, clone_root, timeout_s=timeout_s)
        else:
            run_result = run_dry(task, clone_root, variant)
        post = snapshot_clone(clone_root)
        transcript = run_result.get("transcript", "")
        score = bench_ab_scoring.score_task(
            task,
            pre_snapshot=pre,
            post_snapshot=post,
            clone_root=clone_root,
            transcript=transcript,
        )
        per_task.append(
            {
                "id": task.get("id"),
                "category": task.get("category"),
                "score": score,
                "wall_time_seconds": run_result.get("wall_time_seconds", 0.0),
                "exit_code": run_result.get("exit_code"),
                # The runner may downgrade the mode (e.g. "live-skipped").
                "mode": run_result.get("mode", mode),
                "reason": run_result.get("reason", ""),
                "ask_events": count_ask_events(transcript),
            }
        )
    duration = time.monotonic() - started
    path = write_report(variant, mode=mode, per_task=per_task, duration=duration)
    # Count passes with the same tolerant lookup the report aggregates use,
    # so a score without a "passed" key cannot crash the summary line after
    # the report has already been written.
    passed = sum(1 for e in per_task if e.get("score", {}).get("passed"))
    sys.stdout.write(
        f"bench_ab_task_runner: {variant} ({mode}) → "
        f"{passed}/{len(per_task)} "
        f"passed — {path.relative_to(REPO_ROOT)}\n"
    )
    return {"path": path, "per_task": per_task, "duration": duration}
324
+
325
+
326
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the Track B runner's command line.

    Options (all optional): ``--variant`` (``with``/``without``/``both``,
    default ``both``), ``--mode`` (``dry-run``/``live``, default ``dry-run``)
    and ``--timeout`` (integer seconds, default 120).
    """
    cli = argparse.ArgumentParser(description="Run Track B tasks per variant.")
    cli.add_argument(
        "--variant",
        default="both",
        choices=("with", "without", "both"),
        help="Which variant to run (default: both).",
    )
    mode_help = (
        "dry-run: stub transcript, no CLI invocation (fast, free). "
        "live: invoke `claude --print` per task (cost-bearing)."
    )
    cli.add_argument(
        "--mode",
        default="dry-run",
        choices=("dry-run", "live"),
        help=mode_help,
    )
    cli.add_argument(
        "--timeout",
        default=120,
        type=int,
        help="Live mode: per-task timeout in seconds (default 120).",
    )
    return cli.parse_args(argv)
350
+
351
+
352
def main(argv: list[str] | None = None) -> int:
    """Load the Track B corpus and run the requested variant(s).

    Args:
        argv: Command-line arguments; ``sys.argv[1:]`` when ``None``.

    Returns:
        ``0`` after all requested variants ran, ``1`` when the corpus file is
        missing or yields no tasks.
    """
    args = parse_args(argv if argv is not None else sys.argv[1:])
    if not CORPUS_PATH.exists():
        sys.stderr.write(f"bench_ab_task_runner: corpus missing at {CORPUS_PATH}\n")
        return 1
    # Read as UTF-8 explicitly so prompts with non-ASCII text load the same
    # way regardless of the machine's locale.
    data = yaml.safe_load(CORPUS_PATH.read_text(encoding="utf-8"))
    # safe_load returns None for an empty document and may return a list or
    # scalar; only a mapping can carry a "tasks" entry.
    tasks = data.get("tasks") if isinstance(data, dict) else None
    if not tasks:
        sys.stderr.write("bench_ab_task_runner: corpus has no tasks\n")
        return 1
    variants = ("with", "without") if args.variant == "both" else (args.variant,)
    for variant in variants:
        run_variant(variant, tasks, mode=args.mode, timeout_s=args.timeout)
    return 0
366
+
367
+
368
+ if __name__ == "__main__":
369
+ raise SystemExit(main())