@event4u/agent-config 6.0.0 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (378) hide show
  1. package/.claude-plugin/marketplace.json +5 -5
  2. package/CHANGELOG.md +167 -440
  3. package/README.md +3 -3
  4. package/dist/agent-src/commands/agent-handoff.md +5 -4
  5. package/dist/agent-src/commands/agent-status.md +1 -0
  6. package/dist/agent-src/commands/agents/audit.md +1 -0
  7. package/dist/agent-src/commands/agents/init.md +3 -0
  8. package/dist/agent-src/commands/agents/optimize.md +1 -0
  9. package/dist/agent-src/commands/agents/user/accept.md +1 -0
  10. package/dist/agent-src/commands/agents/user/init.md +1 -0
  11. package/dist/agent-src/commands/agents/user/review.md +1 -0
  12. package/dist/agent-src/commands/agents/user/show.md +1 -0
  13. package/dist/agent-src/commands/agents/user/update.md +1 -0
  14. package/dist/agent-src/commands/agents/user.md +1 -0
  15. package/dist/agent-src/commands/agents.md +1 -0
  16. package/dist/agent-src/commands/analytics/prune.md +3 -2
  17. package/dist/agent-src/commands/analytics/show.md +3 -2
  18. package/dist/agent-src/commands/analytics.md +3 -2
  19. package/dist/agent-src/commands/analyze-reference-repo.md +1 -0
  20. package/dist/agent-src/commands/bug-fix.md +1 -0
  21. package/dist/agent-src/commands/bug-investigate.md +1 -0
  22. package/dist/agent-src/commands/challenge-me/vision.md +3 -2
  23. package/dist/agent-src/commands/challenge-me/with-docs.md +3 -2
  24. package/dist/agent-src/commands/challenge-me.md +3 -2
  25. package/dist/agent-src/commands/chat-history/import.md +9 -9
  26. package/dist/agent-src/commands/chat-history.md +32 -30
  27. package/dist/agent-src/commands/check-current-md.md +1 -0
  28. package/dist/agent-src/commands/commit/in-chunks.md +1 -0
  29. package/dist/agent-src/commands/commit.md +1 -0
  30. package/dist/agent-src/commands/condense.md +1 -0
  31. package/dist/agent-src/commands/context/create.md +1 -0
  32. package/dist/agent-src/commands/context/refactor.md +1 -0
  33. package/dist/agent-src/commands/context.md +1 -0
  34. package/dist/agent-src/commands/cost-report.md +5 -4
  35. package/dist/agent-src/commands/council/analysis.md +3 -2
  36. package/dist/agent-src/commands/council/debate.md +5 -4
  37. package/dist/agent-src/commands/council/default.md +3 -2
  38. package/dist/agent-src/commands/council/design.md +3 -2
  39. package/dist/agent-src/commands/council/optimize.md +3 -2
  40. package/dist/agent-src/commands/council/pr.md +3 -2
  41. package/dist/agent-src/commands/council.md +4 -3
  42. package/dist/agent-src/commands/e2e-heal.md +1 -0
  43. package/dist/agent-src/commands/e2e-plan.md +1 -0
  44. package/dist/agent-src/commands/estimate-ticket.md +1 -0
  45. package/dist/agent-src/commands/feature/dev.md +1 -0
  46. package/dist/agent-src/commands/feature/explore.md +1 -0
  47. package/dist/agent-src/commands/feature/plan.md +6 -6
  48. package/dist/agent-src/commands/feature/refactor.md +1 -0
  49. package/dist/agent-src/commands/feature/roadmap.md +1 -0
  50. package/dist/agent-src/commands/feature.md +1 -0
  51. package/dist/agent-src/commands/fix/ci.md +1 -0
  52. package/dist/agent-src/commands/fix/portability.md +1 -0
  53. package/dist/agent-src/commands/fix/pr-comments.md +147 -15
  54. package/dist/agent-src/commands/fix/refs.md +1 -0
  55. package/dist/agent-src/commands/fix/seeder.md +1 -0
  56. package/dist/agent-src/commands/fix.md +8 -8
  57. package/dist/agent-src/commands/ghostwriter/delete.md +1 -0
  58. package/dist/agent-src/commands/ghostwriter/fetch.md +1 -0
  59. package/dist/agent-src/commands/ghostwriter/list.md +1 -0
  60. package/dist/agent-src/commands/ghostwriter/show.md +1 -0
  61. package/dist/agent-src/commands/ghostwriter/write.md +1 -0
  62. package/dist/agent-src/commands/ghostwriter.md +1 -0
  63. package/dist/agent-src/commands/grill-me.md +3 -2
  64. package/dist/agent-src/commands/image/analyse.md +1 -0
  65. package/dist/agent-src/commands/image/create.md +1 -0
  66. package/dist/agent-src/commands/image/verify.md +1 -0
  67. package/dist/agent-src/commands/image.md +1 -0
  68. package/dist/agent-src/commands/implement-ticket.md +1 -0
  69. package/dist/agent-src/commands/jira-ticket.md +1 -0
  70. package/dist/agent-src/commands/judge/on-diff.md +1 -0
  71. package/dist/agent-src/commands/judge/solo.md +1 -0
  72. package/dist/agent-src/commands/judge/steps.md +1 -0
  73. package/dist/agent-src/commands/judge.md +1 -0
  74. package/dist/agent-src/commands/knowledge/cross-repo.md +1 -0
  75. package/dist/agent-src/commands/knowledge/forget.md +1 -0
  76. package/dist/agent-src/commands/knowledge/ingest.md +1 -0
  77. package/dist/agent-src/commands/knowledge/list.md +1 -0
  78. package/dist/agent-src/commands/knowledge.md +1 -0
  79. package/dist/agent-src/commands/memory/add.md +8 -6
  80. package/dist/agent-src/commands/memory/learn-low-impact.md +3 -2
  81. package/dist/agent-src/commands/memory/load.md +7 -7
  82. package/dist/agent-src/commands/memory/mine-session.md +39 -12
  83. package/dist/agent-src/commands/memory/promote.md +3 -2
  84. package/dist/agent-src/commands/memory/propose.md +7 -6
  85. package/dist/agent-src/commands/memory.md +3 -2
  86. package/dist/agent-src/commands/mode.md +1 -0
  87. package/dist/agent-src/commands/module/create.md +1 -0
  88. package/dist/agent-src/commands/module/explore.md +1 -0
  89. package/dist/agent-src/commands/module.md +1 -0
  90. package/dist/agent-src/commands/optimize/agents-dir.md +1 -0
  91. package/dist/agent-src/commands/optimize/augmentignore.md +1 -0
  92. package/dist/agent-src/commands/optimize/rtk.md +1 -0
  93. package/dist/agent-src/commands/optimize/skills.md +1 -0
  94. package/dist/agent-src/commands/optimize-prompt.md +1 -0
  95. package/dist/agent-src/commands/optimize.md +1 -0
  96. package/dist/agent-src/commands/orchestrate.md +1 -0
  97. package/dist/agent-src/commands/override/create.md +1 -0
  98. package/dist/agent-src/commands/override/manage.md +1 -0
  99. package/dist/agent-src/commands/override.md +1 -0
  100. package/dist/agent-src/commands/package-reset.md +1 -0
  101. package/dist/agent-src/commands/package-test.md +1 -0
  102. package/dist/agent-src/commands/post-as/ghostwriter.md +1 -0
  103. package/dist/agent-src/commands/post-as/me.md +1 -0
  104. package/dist/agent-src/commands/post-as.md +1 -0
  105. package/dist/agent-src/commands/pr/create/description-only.md +1 -0
  106. package/dist/agent-src/commands/pr/create.md +25 -0
  107. package/dist/agent-src/commands/prediction-pool.md +1 -0
  108. package/dist/agent-src/commands/prepare-for-review.md +1 -0
  109. package/dist/agent-src/commands/profile/activate.md +1 -0
  110. package/dist/agent-src/commands/profile/deactivate.md +1 -0
  111. package/dist/agent-src/commands/profile/show.md +1 -0
  112. package/dist/agent-src/commands/profile.md +1 -0
  113. package/dist/agent-src/commands/project-analyze.md +1 -0
  114. package/dist/agent-src/commands/project-health.md +1 -0
  115. package/dist/agent-src/commands/quality-fix.md +1 -0
  116. package/dist/agent-src/commands/refine-ticket.md +1 -0
  117. package/dist/agent-src/commands/research/deep.md +1 -0
  118. package/dist/agent-src/commands/research/report.md +1 -0
  119. package/dist/agent-src/commands/research.md +1 -0
  120. package/dist/agent-src/commands/review-changes.md +1 -0
  121. package/dist/agent-src/commands/review-routing.md +1 -0
  122. package/dist/agent-src/commands/roadmap/ai-council.md +1 -0
  123. package/dist/agent-src/commands/roadmap/create.md +1 -0
  124. package/dist/agent-src/commands/roadmap/process-full.md +1 -0
  125. package/dist/agent-src/commands/roadmap/process-phase.md +1 -0
  126. package/dist/agent-src/commands/roadmap/process-step.md +1 -0
  127. package/dist/agent-src/commands/roadmap.md +1 -0
  128. package/dist/agent-src/commands/rule-compliance-audit.md +1 -0
  129. package/dist/agent-src/commands/security-audit-config.md +84 -0
  130. package/dist/agent-src/commands/set-cost-profile.md +1 -0
  131. package/dist/agent-src/commands/skill/preview.md +1 -0
  132. package/dist/agent-src/commands/skill.md +1 -0
  133. package/dist/agent-src/commands/skills/discover.md +1 -0
  134. package/dist/agent-src/commands/skills.md +1 -0
  135. package/dist/agent-src/commands/sync-agent-settings.md +1 -0
  136. package/dist/agent-src/commands/sync-gitignore/fix.md +1 -0
  137. package/dist/agent-src/commands/sync-gitignore.md +1 -0
  138. package/dist/agent-src/commands/tests/create.md +1 -0
  139. package/dist/agent-src/commands/tests/execute.md +1 -0
  140. package/dist/agent-src/commands/tests.md +1 -0
  141. package/dist/agent-src/commands/threat-model.md +1 -0
  142. package/dist/agent-src/commands/update-form-request-messages.md +1 -0
  143. package/dist/agent-src/commands/upstream-contribute.md +1 -0
  144. package/dist/agent-src/commands/video/from-script.md +1 -0
  145. package/dist/agent-src/commands/video/from-song.md +1 -0
  146. package/dist/agent-src/commands/video/scene.md +1 -0
  147. package/dist/agent-src/commands/video/stitch.md +1 -0
  148. package/dist/agent-src/commands/video/storyboard.md +1 -0
  149. package/dist/agent-src/commands/video.md +1 -0
  150. package/dist/agent-src/commands/work.md +1 -0
  151. package/dist/agent-src/contexts/augment-infrastructure.md +1 -1
  152. package/dist/agent-src/contexts/communication/rules-auto/skill-quality-mechanics.md +1 -1
  153. package/dist/agent-src/contexts/communication/rules-auto/slash-command-routing-policy-mechanics.md +2 -2
  154. package/dist/agent-src/contexts/communication/rules-auto/think-before-action-mechanics.md +6 -6
  155. package/dist/agent-src/contexts/contracts/consumer-agents-md-guide.md +2 -2
  156. package/dist/agent-src/contexts/execution/rdp-gate.md +75 -0
  157. package/dist/agent-src/contexts/subagent-configuration.md +1 -0
  158. package/dist/agent-src/personas/advisors/contrarian.md +1 -1
  159. package/dist/agent-src/personas/advisors/executor.md +1 -1
  160. package/dist/agent-src/personas/advisors/expansionist.md +1 -1
  161. package/dist/agent-src/personas/advisors/first-principles.md +1 -1
  162. package/dist/agent-src/personas/advisors/outsider.md +1 -1
  163. package/dist/agent-src/rules/autonomous-execution.md +12 -0
  164. package/dist/agent-src/rules/external-reference-deep-dive.md +1 -1
  165. package/dist/agent-src/rules/git-history-discipline.md +47 -1
  166. package/dist/agent-src/rules/improve-before-implement.md +12 -0
  167. package/dist/agent-src/rules/lethal-trifecta-guard.md +80 -0
  168. package/dist/agent-src/rules/no-pr-progress-comments.md +3 -4
  169. package/dist/agent-src/rules/notes-first-reasoning.md +71 -0
  170. package/dist/agent-src/rules/roadmap-progress-sync.md +48 -31
  171. package/dist/agent-src/rules/security-sensitive-stop.md +14 -1
  172. package/dist/agent-src/rules/source-confidentiality.md +97 -0
  173. package/dist/agent-src/rules/think-before-action.md +9 -1
  174. package/dist/agent-src/rules/untrusted-input-defense.md +76 -0
  175. package/dist/agent-src/scripts/archive_completed_roadmaps.py +171 -0
  176. package/dist/agent-src/skills/adversarial-review/SKILL.md +14 -0
  177. package/dist/agent-src/skills/agent-security-review/SKILL.md +113 -0
  178. package/dist/agent-src/skills/agent-security-review/evals/triggers.json +51 -0
  179. package/dist/agent-src/skills/ai-council/SKILL.md +3 -3
  180. package/dist/agent-src/skills/async-python-patterns/SKILL.md +1 -1
  181. package/dist/agent-src/skills/blast-radius-analyzer/SKILL.md +12 -11
  182. package/dist/agent-src/skills/command-routing/SKILL.md +1 -1
  183. package/dist/agent-src/skills/complexity-first-planning/SKILL.md +96 -0
  184. package/dist/agent-src/skills/complexity-first-planning/evals/triggers.json +16 -0
  185. package/dist/agent-src/skills/copilot-config/SKILL.md +3 -4
  186. package/dist/agent-src/skills/defense-in-depth/SKILL.md +1 -1
  187. package/dist/agent-src/skills/developer-like-execution/SKILL.md +5 -4
  188. package/dist/agent-src/skills/error-handling-patterns/SKILL.md +1 -1
  189. package/dist/agent-src/skills/feature-planning/SKILL.md +2 -2
  190. package/dist/agent-src/skills/mcp-builder/SKILL.md +1 -1
  191. package/dist/agent-src/skills/memory-consolidation/SKILL.md +63 -17
  192. package/dist/agent-src/skills/prompt-engineering-patterns/SKILL.md +1 -1
  193. package/dist/agent-src/skills/readme-writing-package/SKILL.md +1 -1
  194. package/dist/agent-src/skills/reasoning-orchestrator/SKILL.md +119 -0
  195. package/dist/agent-src/skills/reasoning-orchestrator/evals/triggers.json +16 -0
  196. package/dist/agent-src/skills/receiving-code-review/SKILL.md +6 -6
  197. package/dist/agent-src/skills/refine-prompt/SKILL.md +1 -1
  198. package/dist/agent-src/skills/refine-ticket/SKILL.md +1 -1
  199. package/dist/agent-src/skills/repomix-packer/SKILL.md +1 -1
  200. package/dist/agent-src/skills/secrets-management/SKILL.md +1 -1
  201. package/dist/agent-src/skills/subagent-orchestration/SKILL.md +10 -3
  202. package/dist/agent-src/skills/testing-anti-patterns/SKILL.md +1 -1
  203. package/dist/agent-src/skills/testing-anti-patterns/process-anti-patterns.md +1 -1
  204. package/dist/agent-src/skills/token-optimizer/SKILL.md +1 -1
  205. package/dist/agent-src/templates/agents/.gitattributes.fragment +0 -1
  206. package/dist/agent-src/templates/agents/agent-project-settings.example.yml +4 -4
  207. package/dist/agent-src/templates/scripts/check_memory.py +1 -2
  208. package/dist/agent-src/templates/scripts/check_memory_proposal.py +1 -1
  209. package/dist/agent-src/templates/scripts/memory_lookup.py +148 -289
  210. package/dist/agent-src/templates/scripts/memory_report.py +132 -2
  211. package/dist/agent-src/templates/scripts/memory_signal.py +7 -9
  212. package/dist/agent-src/templates/scripts/memory_status.py +25 -206
  213. package/dist/agent-src/templates/scripts/work_engine/directives/backend/memory.py +6 -6
  214. package/dist/agent-src/templates/scripts/work_engine/directives/ui/_passthrough.py +3 -3
  215. package/dist/agent-src/templates/scripts/work_engine/scoring/memory_visibility.py +0 -1
  216. package/dist/cli/agent-config.js +31 -300
  217. package/dist/cli/agent-config.js.map +1 -1
  218. package/dist/cli/commands/commands.js +10 -5
  219. package/dist/cli/commands/commands.js.map +1 -1
  220. package/dist/cli/discovery/loadManifest.js.map +1 -1
  221. package/dist/cli/main.js +309 -0
  222. package/dist/cli/main.js.map +1 -0
  223. package/dist/discovery/deprecation-report.md +1 -1
  224. package/dist/discovery/discovery-manifest.json +645 -342
  225. package/dist/discovery/discovery-manifest.json.sha256 +1 -1
  226. package/dist/discovery/discovery-manifest.summary.md +8 -5
  227. package/dist/discovery/orphan-report.md +1 -1
  228. package/dist/discovery/packs.json +149 -37
  229. package/dist/discovery/trust-report.md +3 -3
  230. package/dist/discovery/workspaces.json +61 -36
  231. package/dist/mcp/registry-manifest.json +4 -4
  232. package/dist/router.json +1 -1
  233. package/dist/server/routes/wizard.js +4 -3
  234. package/dist/server/routes/wizard.js.map +1 -1
  235. package/dist/server/schemas/settings.js +18 -0
  236. package/dist/server/schemas/settings.js.map +1 -1
  237. package/docs/MIGRATION.md +1 -1
  238. package/docs/adrs/cost/0001-hard-stop-hook.md +5 -5
  239. package/docs/adrs/memory/0001-consumer-side-snapshot.md +15 -7
  240. package/docs/adrs/memory/README.md +6 -5
  241. package/docs/adrs/router/0001-three-tier-routing.md +2 -2
  242. package/docs/adrs/schema/0001-json-schema-frontmatter.md +2 -2
  243. package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +5 -5
  244. package/docs/adrs/telegraph/0001-default-off-until-bench.md +3 -3
  245. package/docs/architecture.md +9 -9
  246. package/docs/archive/CHANGELOG-pre-2.2.0.md +30 -30
  247. package/docs/archive/CHANGELOG-pre-2.25.0.md +1 -1
  248. package/docs/archive/CHANGELOG-pre-4.5.0.md +1 -1
  249. package/docs/archive/CHANGELOG-pre-6.0.0.md +473 -0
  250. package/docs/benchmark.md +54 -53
  251. package/docs/benchmarks.md +2 -2
  252. package/docs/case-studies/{frontend-design-vs-ui-ux-pro-max.md → frontend-design-positioning.md} +4 -4
  253. package/docs/catalog.md +20 -13
  254. package/docs/command-flows.md +90 -92
  255. package/docs/contracts/adr-layout.md +2 -3
  256. package/docs/contracts/adr-level-6-productization.md +1 -1
  257. package/docs/contracts/ai-council-config.md +42 -7
  258. package/docs/contracts/command-clusters.md +1 -1
  259. package/docs/contracts/cost-enforcement.md +1 -1
  260. package/docs/contracts/cost-summary-schema.md +1 -1
  261. package/docs/contracts/daily-workspace.md +1 -0
  262. package/docs/contracts/discovery-manifest.schema.json +4 -2
  263. package/docs/contracts/explain-modes.md +1 -1
  264. package/docs/contracts/implement-ticket-flow.md +6 -7
  265. package/docs/contracts/mcp-tool-inventory.md +10 -10
  266. package/docs/contracts/measurement-baseline.md +1 -1
  267. package/docs/contracts/memory-visibility-v1.md +1 -5
  268. package/docs/contracts/namespace.md +1 -1
  269. package/docs/contracts/persona-schema.md +1 -1
  270. package/docs/contracts/rule-interactions.md +1 -1
  271. package/docs/contracts/smoke-contracts.md +1 -1
  272. package/docs/contracts/universal-skills.md +0 -1
  273. package/docs/contracts/workspace-boundary.md +84 -0
  274. package/docs/customization.md +3 -3
  275. package/docs/decisions/ADR-009-event4u-namespace.md +1 -1
  276. package/docs/decisions/ADR-013-discovery-frontmatter-contract.md +1 -1
  277. package/docs/decisions/ADR-026-explain-mode-translation.md +1 -1
  278. package/docs/decisions/ADR-088-no-external-runtime-federation.md +26 -27
  279. package/docs/decisions/ADR-090-visibility-command-frontmatter-field.md +95 -0
  280. package/docs/decisions/ADR-091-split-meta-capability-packs.md +113 -0
  281. package/docs/decisions/ADR-092-defer-command-tier-alias-removal.md +93 -0
  282. package/docs/decisions/ADR-093-ai-council-config-user-global.md +111 -0
  283. package/docs/decisions/ADR-094-agent-memory-layer-removal.md +94 -0
  284. package/docs/decisions/ADR-095-workspace-boundary-contract.md +108 -0
  285. package/docs/decisions/INDEX.md +6 -0
  286. package/docs/development.md +5 -7
  287. package/docs/getting-started.md +4 -4
  288. package/docs/guidelines/agent-infra/5w2h-analysis.md +1 -1
  289. package/docs/guidelines/agent-infra/comparison-matrix.md +1 -1
  290. package/docs/guidelines/agent-infra/corpus-grounding-authoring.md +1 -1
  291. package/docs/guidelines/agent-infra/critical-thinking.md +1 -1
  292. package/docs/guidelines/agent-infra/engineering-memory-data-format.md +1 -5
  293. package/docs/guidelines/agent-infra/first-principles.md +1 -1
  294. package/docs/guidelines/agent-infra/frontier-reasoning-operating-profile.md +164 -0
  295. package/docs/guidelines/agent-infra/inversion-thinking.md +1 -1
  296. package/docs/guidelines/agent-infra/ios-simulator-guide.md +9 -14
  297. package/docs/guidelines/agent-infra/mcp-request-signing.md +19 -22
  298. package/docs/guidelines/agent-infra/memory-access.md +25 -31
  299. package/docs/guidelines/agent-infra/mental-models.md +1 -1
  300. package/docs/guidelines/agent-infra/model-recommendation.md +29 -0
  301. package/docs/guidelines/agent-infra/scqa-framework.md +3 -3
  302. package/docs/guidelines/agent-infra/security-lint-containment.md +81 -0
  303. package/docs/guidelines/agent-infra/six-hats.md +1 -1
  304. package/docs/guidelines/agent-infra/systems-thinking.md +1 -1
  305. package/docs/guidelines/agent-infra/untrusted-input-spotlighting.md +72 -0
  306. package/docs/installation.md +1 -1
  307. package/docs/mcp.md +2 -2
  308. package/docs/parity/{bench-ruflo.json → bench-external.json} +10 -10
  309. package/docs/parity/{ruflo.md → external-runtime.md} +9 -9
  310. package/docs/quality.md +3 -3
  311. package/docs/safety.md +3 -3
  312. package/docs/skills-catalog.md +4 -1
  313. package/llms.txt +3 -0
  314. package/package.json +1 -1
  315. package/src/config/agent-settings.template.yml +65 -3
  316. package/src/config/discovery/packs.yml +29 -0
  317. package/src/config/discovery/workspaces.yml +3 -1
  318. package/src/config/gitignore-block.txt +6 -0
  319. package/src/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
  320. package/src/scripts/_cli/cmd_doctor.py +99 -13
  321. package/src/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
  322. package/src/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
  323. package/src/scripts/_lib/bench_ab_scoring_v2.py +227 -0
  324. package/src/scripts/_lib/global_deploy_inventory.py +39 -9
  325. package/src/scripts/_lib/link_crypto.py +206 -0
  326. package/src/scripts/_lib/security_lint.py +228 -0
  327. package/src/scripts/ai_council/clients.py +2 -2
  328. package/src/scripts/ai_council/config.py +55 -0
  329. package/src/scripts/audit_adr_coverage.py +0 -2
  330. package/src/scripts/audit_command_surface.py +18 -5
  331. package/src/scripts/audit_mcp_tools.py +2 -2
  332. package/src/scripts/audit_skill_descriptions.py +2 -2
  333. package/src/scripts/bench_ab_clone.py +62 -12
  334. package/src/scripts/bench_ab_task_runner.py +475 -30
  335. package/src/scripts/bench_ab_v2_run.py +247 -0
  336. package/src/scripts/bench_ab_v2_stats.py +347 -0
  337. package/src/scripts/bench_run.py +1 -1
  338. package/src/scripts/build_discovery_manifest.py +10 -0
  339. package/src/scripts/check_bite_sized_granularity.py +1 -2
  340. package/src/scripts/check_memory.py +49 -63
  341. package/src/scripts/check_memory_proposal.py +1 -1
  342. package/src/scripts/check_no_external_sources.py +101 -0
  343. package/src/scripts/check_references.py +2 -0
  344. package/src/scripts/cost_by_conversation.py +1 -1
  345. package/src/scripts/council_cli.py +28 -14
  346. package/src/scripts/external_sources_denylist.json +91 -0
  347. package/src/scripts/hook_manifest.yaml +14 -6
  348. package/src/scripts/injection_scan_hook.py +145 -0
  349. package/src/scripts/install-hooks.sh +11 -0
  350. package/src/scripts/install.py +88 -13
  351. package/src/scripts/lint_agent_security.py +112 -0
  352. package/src/scripts/lint_bench_ab.py +5 -4
  353. package/src/scripts/lint_command_tiers.py +63 -22
  354. package/src/scripts/lint_discovery_vocabulary.py +2 -0
  355. package/src/scripts/lint_empty_roadmaps.py +80 -0
  356. package/src/scripts/lint_hidden_unicode.py +132 -0
  357. package/src/scripts/lint_instruction_smuggling.py +107 -0
  358. package/src/scripts/lint_marketplace.py +1 -1
  359. package/src/scripts/lint_mcp_config_security.py +124 -0
  360. package/src/scripts/lint_skill_frontmatter_safety.py +144 -0
  361. package/src/scripts/lint_workspace_boundary.py +122 -0
  362. package/src/scripts/mcp_server/consumer_tool_catalog.json +2 -3
  363. package/src/scripts/mcp_server/tools.py +8 -32
  364. package/src/scripts/memory_lookup.py +27 -296
  365. package/src/scripts/memory_report.py +1 -23
  366. package/src/scripts/memory_signal.py +6 -53
  367. package/src/scripts/memory_status.py +25 -206
  368. package/src/scripts/mine_session.py +118 -41
  369. package/src/scripts/pack_dependency_allowlist.json +2 -2
  370. package/src/scripts/render_benchmark_md.py +141 -52
  371. package/src/scripts/schemas/command.schema.json +6 -1
  372. package/src/scripts/security_audit_config.py +153 -0
  373. package/dist/agent-src/commands/chat-history/learn.md +0 -184
  374. package/dist/agent-src/commands/chat-history/show.md +0 -113
  375. package/dist/agent-src/commands/fix/pr-bot-comments.md +0 -157
  376. package/dist/agent-src/commands/fix/pr-developer-comments.md +0 -163
  377. package/dist/agent-src/templates/agents/memory/architecture-decisions.example.yml +0 -95
  378. package/docs/contracts/agent-memory-contract.md +0 -159
@@ -0,0 +1,247 @@
1
+ #!/usr/bin/env python3
2
+ """bench:ab v2 — discipline-axis runner (Phases 2-4).
3
+
4
+ Runs the discipline-headroom corpus (ab-trackb-v2.yaml) across FOUR arms on a
5
+ fixed host model, scores each on the dual axis (capability + discipline) plus
6
+ trajectory metrics, and emits a PAIRED per-instance report (the same task × seed
7
+ seen under every arm) so the lift is computed paired, not as independent rates.
8
+
9
+ Arms (council L2/L5):
10
+ - vanilla : plugin OFF (--setting-sources project,local), no injection.
11
+ - package : the REAL installed plugin (plain --print).
12
+ - package-rdp : real plugin + RDP rules injected (--append-system-prompt-file).
13
+ - placebo : plugin OFF + an equal-length INERT prose block — controls for
14
+ "does any long prompt prime caution?" so a measured lift can't
15
+ be dismissed as prompt-length priming.
16
+
17
+ Reuses the v1 harness primitives (run_live, claude_executable, count_ask_events,
18
+ RDP sysprompt) — refactor-in-place per the v2 inventory; only corpus + scorer +
19
+ metrics + arms are new.
20
+
21
+ Cost controls inherited: --model pin (sonnet), --max-budget-usd cap. Cheap-by-
22
+ construction: the v2 fixtures are tiny, so per-run tokens are far below the v1
23
+ big-repo tasks.
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import json
29
+ import shutil
30
+ import sys
31
+ from pathlib import Path
32
+
33
+ import yaml
34
+
35
+ sys.path.insert(0, str(Path(__file__).resolve().parent / "_lib"))
36
+ import bench_ab_scoring_v2 as scoring # noqa: E402
37
+
38
+ # Import v1 primitives (skeleton reuse).
39
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
40
+ import bench_ab_task_runner as v1 # noqa: E402
41
+
42
+ REPO_ROOT = v1.REPO_ROOT
43
+ CORPUS_PATH = REPO_ROOT / "internal" / "bench" / "corpora" / "ab-trackb-v2.yaml"
44
+ FIXTURES_ROOT = REPO_ROOT / "internal" / "bench" / "ab"
45
+ REPORTS_DIR = REPO_ROOT / "internal" / "bench" / "reports" / "ab-v2"
46
+ WORK_ROOT = REPO_ROOT / "internal" / "bench" / "ab" / "v2-clones"
47
+
48
+ # Arm -> (setting_sources, inject) where inject ∈ {None, "rdp", "placebo"}.
49
+ ARMS = {
50
+ "vanilla": ("project,local", None),
51
+ "package": (None, None),
52
+ "package-rdp": (None, "rdp"),
53
+ "placebo": ("project,local", "placebo"),
54
+ }
55
+
56
+
57
def placebo_prose(target_chars: int) -> str:
    """Build deterministic, inert filler text that is ``target_chars`` long.

    The text is a run of numbered "Section N: ..." paragraphs that all share
    one neutral sentence, cut off at exactly ``target_chars`` characters. It
    serves as the placebo arm's length control, so the wording deliberately
    avoids discipline cues ('verify', 'minimal', 'careful', 'ask').

    A ``target_chars`` of zero or less yields an empty string.
    """
    filler = (
        "The following note is background context with no bearing on the task. "
        "It describes a fictional inventory of office supplies across several "
        "storage rooms, listing quantities of paper, folders, and assorted "
        "stationery without any instruction or guidance of any kind. "
    )
    sections = []
    produced = 0
    index = 0
    # Append whole sections until the target is reached or passed; the final
    # slice trims the overshoot.
    while produced < target_chars:
        section = f"Section {index}: {filler}"
        sections.append(section)
        produced += len(section)
        index += 1
    text = "".join(sections)
    return text[:target_chars]
78
+
79
+
80
def injected_text(inject: str | None, placebo_chars: int) -> str | None:
    """Return the system-prompt text to inject for an arm, or ``None``.

    ``inject`` is the second element of an ``ARMS`` entry:

    - ``"rdp"``: whatever ``v1.system_prompt_for("with-rdp")`` returns.
    - ``"placebo"``: inert prose of ``placebo_chars`` characters.
    - anything else (including ``None``): no injection.
    """
    if inject == "placebo":
        return placebo_prose(placebo_chars)
    return v1.system_prompt_for("with-rdp") if inject == "rdp" else None
86
+
87
+
88
def reset_fixture(task: dict) -> tuple[Path, Path]:
    """Copy the task's pristine fixture into a throwaway working clone.

    Any existing clone for the same task id is deleted first, so every run
    starts from an unmodified copy of the fixture.

    Args:
        task: Corpus entry; reads ``task["fixture"]`` (path relative to
            ``FIXTURES_ROOT``) and ``task["id"]`` (clone directory name under
            ``WORK_ROOT``).

    Returns:
        ``(clone_dir, fixture_dir)``: the fresh working copy and the pristine
        source it was copied from. Callers unpack both.
    """
    fixture = FIXTURES_ROOT / task["fixture"]
    dest = WORK_ROOT / task["id"]
    if dest.exists():
        # copytree refuses to write into an existing directory.
        shutil.rmtree(dest)
    dest.parent.mkdir(parents=True, exist_ok=True)
    shutil.copytree(fixture, dest)
    return dest, fixture
97
+
98
+
99
def status_bucket(run: dict) -> str:
    """Classify a run outcome into an AgentBench-style trajectory bucket.

    Returns one of:

    - ``"completed"``: the run is not flagged ``errored``.
    - ``"budget_limit"``: the error subtype mentions "budget".
    - ``"task_limit"``: the reason mentions "timeout", ``exit_code`` is -1,
      or the subtype mentions "turn" (which covers "max_turns").
    - ``"validation_failed"``: any other errored run.

    Matching on ``subtype`` and ``reason`` is case-insensitive; missing or
    ``None`` values are treated as empty strings.
    """
    if not run.get("errored"):
        return "completed"

    subtype = (run.get("subtype") or "").lower()
    if "budget" in subtype:
        return "budget_limit"

    reason = (run.get("reason") or "").lower()
    hit_limit = (
        "timeout" in reason
        or run.get("exit_code") == -1
        or "turn" in subtype
    )
    return "task_limit" if hit_limit else "validation_failed"
111
+
112
+
113
def trajectory_metrics(run: dict, score: dict) -> dict:
    """Collect per-run trajectory metrics for the paired report.

    Args:
        run: Run result; reads ``transcript``, ``num_turns``,
            ``wall_time_seconds``, ``tokens`` and the fields used by
            ``status_bucket``.
        score: Scorer output; only ``files_changed`` is read, and only its
            length is reported.

    Returns:
        A dict with the status bucket, turn count, number of changed files,
        ask-vs-act ratio, ask-event count, wall time and token count. Missing
        inputs default to zero.
    """
    asks = v1.count_ask_events(run.get("transcript", ""))
    # Presumably a dict with "ratio" and "asks" keys (verify against the v1
    # helper). Normalise once so both lookups share the same guard: the
    # previous code guarded only the "asks" lookup, so a non-dict result
    # raised AttributeError on the "ratio" lookup before the guard applied.
    if not isinstance(asks, dict):
        asks = {}
    return {
        "status_bucket": status_bucket(run),
        "num_turns": run.get("num_turns", 0),
        "files_changed": len(score.get("files_changed", [])),
        "ask_vs_act_ratio": asks.get("ratio", 0),
        "ask_events": asks.get("asks", 0),
        "wall_time_seconds": run.get("wall_time_seconds", 0.0),
        "tokens": run.get("tokens", 0),
    }
124
+
125
+
126
def run_one(task: dict, arm: str, *, model, max_budget, timeout, placebo_chars,
            sp_dir: Path) -> dict:
    """Execute one task under one arm and return its scored record.

    Steps: look up the arm's settings in ``ARMS``, reset the working clone,
    write the injected system prompt (if the arm has one) to
    ``sp_dir / ".sp-<arm>.txt"``, run the task through ``v1.run_live``, then
    score the resulting clone against the pristine fixture.

    Args:
        task: Corpus entry for the task.
        arm: Key into ``ARMS``.
        model: Forwarded to ``v1.run_live`` as ``model``.
        max_budget: Forwarded to ``v1.run_live`` as ``max_budget``.
        timeout: Forwarded to ``v1.run_live`` as ``timeout_s``.
        placebo_chars: Length of the placebo prose, used by placebo arms.
        sp_dir: Directory that receives the system-prompt file.

    Returns:
        A dict with the error flag and reason, the capability and discipline
        results, trajectory metrics, and the injected prompt's length in
        characters (0 when nothing was injected).
    """
    sources, injection = ARMS[arm]
    workdir, pristine = reset_fixture(task)

    prompt_text = injected_text(injection, placebo_chars)
    prompt_path = None
    if prompt_text:
        prompt_path = sp_dir / f".sp-{arm}.txt"
        prompt_path.write_text(prompt_text, encoding="utf-8")

    outcome = v1.run_live(
        task,
        workdir,
        timeout_s=timeout,
        sysprompt_file=prompt_path,
        setting_sources=sources,
        max_budget=max_budget,
        model=model,
    )
    graded = scoring.score_task_v2(
        task,
        fixture_root=pristine,
        clone_root=workdir,
        transcript=outcome.get("transcript", ""),
    )

    record = {
        "errored": bool(outcome.get("errored")),
        "reason": outcome.get("reason"),
        "capability_pass": graded["capability_pass"],
        "discipline_score": graded["discipline_score"],
        "discipline_pass": graded["discipline_pass"],
        "metrics": trajectory_metrics(outcome, graded),
        "injected_chars": len(prompt_text) if prompt_text else 0,
    }
    return record
156
+
157
+
158
def main(argv: "list[str] | None" = None) -> int:
    """Run the v2 discipline-axis benchmark and write the paired JSON report.

    Loads the corpus, selects tasks (``--tasks`` takes precedence; ``--limit``
    applies only when ``--tasks`` is empty), validates the requested arms, and
    then runs every task under every arm for ``--seeds`` repetitions. Results
    are grouped per task so each (task, seed) can be compared across arms.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv[1:]``.

    Returns:
        0 after a dry run or a completed benchmark; 1 if an unknown arm was
        requested or the ``claude`` CLI cannot be found.
    """
    ap = argparse.ArgumentParser(description="bench:ab v2 discipline-axis runner.")
    ap.add_argument("--arms", default="vanilla,package,package-rdp,placebo")
    ap.add_argument("--seeds", type=int, default=3, help="reps per arm (stochastic seeds).")
    ap.add_argument("--tasks", default="", help="comma-separated task ids (default: all).")
    ap.add_argument("--limit", type=int, default=0)
    ap.add_argument("--model", default="claude-sonnet-4-6")
    ap.add_argument("--budget", type=float, default=1.0, help="per-run --max-budget-usd (0=off).")
    ap.add_argument("--timeout", type=int, default=180)
    ap.add_argument("--mode", choices=("live", "dry-run"), default="live")
    args = ap.parse_args(argv if argv is not None else sys.argv[1:])

    # An empty YAML document parses to None; treat it as "no tasks" rather
    # than crashing on ``None.get``. Read as UTF-8 explicitly so the result
    # does not depend on the platform's default encoding.
    corpus = yaml.safe_load(CORPUS_PATH.read_text(encoding="utf-8")) or {}
    tasks = corpus.get("tasks") or []
    if args.tasks.strip():
        want = {s.strip() for s in args.tasks.split(",") if s.strip()}
        tasks = [t for t in tasks if t["id"] in want]
    elif args.limit:
        tasks = tasks[: args.limit]
    arms = [a.strip() for a in args.arms.split(",") if a.strip()]
    for a in arms:
        if a not in ARMS:
            sys.stderr.write(f"unknown arm: {a}\n")
            return 1

    if args.mode == "dry-run":
        # Report the planned run count without invoking the CLI.
        sys.stdout.write(
            f"bench_ab_v2: DRY — {len(tasks)} tasks × {len(arms)} arms × "
            f"{args.seeds} seeds = {len(tasks) * len(arms) * args.seeds} runs "
            f"(model={args.model}, budget={args.budget}). No spend.\n"
        )
        return 0

    if v1.claude_executable() is None:
        sys.stderr.write("claude CLI not found\n")
        return 1

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    # System-prompt files are written alongside the reports.
    sp_dir = REPORTS_DIR
    # A budget of 0 (or less) disables the per-run cap.
    max_budget = args.budget if args.budget and args.budget > 0 else None
    # Size the placebo to the RDP injection so package-rdp vs placebo is
    # length-matched. Note the 2000-character floor: if the RDP prompt is
    # shorter than that, the placebo is longer than the RDP injection.
    rdp_text = v1.system_prompt_for("with-rdp") or ""
    placebo_chars = max(len(rdp_text), 2000)

    total = len(tasks) * len(arms) * args.seeds
    done = 0
    records: list[dict] = []
    for task in tasks:
        per_arm: dict[str, list[dict]] = {}
        for arm in arms:
            seed_runs = []
            for seed in range(args.seeds):
                done += 1
                sys.stderr.write(
                    f"[{done}/{total}] {task['id']} · {arm} · seed {seed}\n")
                sys.stderr.flush()
                r = run_one(
                    task, arm, model=args.model, max_budget=max_budget,
                    timeout=args.timeout, placebo_chars=placebo_chars, sp_dir=sp_dir)
                r["seed"] = seed
                seed_runs.append(r)
            per_arm[arm] = seed_runs
        records.append({
            "id": task["id"],
            "archetype": task["archetype"],
            "rule": task["rule"],
            "arms": per_arm,
        })

    stamp = v1.utc_stamp()
    payload = {
        "schema": "ab-bench-v2/0.1",
        "stamp": stamp,
        "model": args.model,
        "seeds": args.seeds,
        "arms": arms,
        "budget_usd_per_run": args.budget,
        "placebo_chars": placebo_chars,
        "corpus": "ab-trackb-v2",
        "records": records,
    }
    out = REPORTS_DIR / f"{stamp}-ab-v2-paired.json"
    out.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    sys.stdout.write(f"bench_ab_v2: wrote {out.relative_to(REPO_ROOT)} "
                     f"({len(records)} tasks, {total} runs)\n")
    return 0
244
+
245
+
246
+ if __name__ == "__main__":
247
+ raise SystemExit(main())
@@ -0,0 +1,347 @@
1
+ #!/usr/bin/env python3
2
+ """bench:ab v2 — paired statistics (Phase 3).
3
+
4
+ Reads a v2 paired report (bench_ab_v2_run.py output) and computes, for each
5
+ arm comparison, paired significance + effect size on:
6
+
7
+ - capability axis (binary) -> McNemar exact test + Cohen's h
8
+ - discipline axis ([0,1]) -> Wilcoxon signed-rank + rank-biserial
9
+ - status buckets -> error/undisciplined-rate per arm
10
+
11
+ Pairing: each (task, seed) is one pair, seen under every arm. Pooled across all
12
+ task×seed pairs. Dependency-free (stdlib math only) so the benchmark stays
13
+ portable. Errored runs are EXCLUDED from a pair (per-axis) so a quota trip is
14
+ never read as a content/discipline fail.
15
+
16
+ Comparisons reported:
17
+ package vs vanilla -> the package lift (adoption question)
18
+ package-rdp vs package -> the RDP reasoning lift
19
+ package vs placebo -> attribution: content vs mere prompt-length priming
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import json
25
+ import math
26
+ import sys
27
+ from pathlib import Path
28
+
29
# Three directory levels above this file.
# NOTE(review): presumably the repository root — confirm against the script's
# install location.
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
# Directory searched for `*-ab-v2-paired.json` reports when no path is given.
REPORTS_DIR = REPO_ROOT / "internal" / "bench" / "reports" / "ab-v2"

# (treatment arm, baseline arm, human-readable label) for each reported
# comparison. A comparison is only computed when both arms appear in the
# report's "arms" list.
COMPARISONS = [
    ("package", "vanilla", "package lift"),
    ("package-rdp", "package", "RDP lift"),
    ("package", "placebo", "attribution (content vs length)"),
]
37
+
38
+
39
+ def _phi(z: float) -> float:
40
+ """Standard-normal CDF via erf."""
41
+ return 0.5 * (1.0 + math.erf(z / math.sqrt(2.0)))
42
+
43
+
44
def _comb(n: int, k: int) -> int:
    """Return the binomial coefficient C(n, k); thin wrapper over ``math.comb``."""
    return math.comb(n, k)
46
+
47
+
48
def mcnemar_exact(b: int, c: int) -> float:
    """Return the two-sided exact McNemar p-value for discordant counts.

    *b* and *c* are the two discordant-pair counts. Under the null each
    discordant pair falls on either side with probability 0.5, so the p-value
    is twice the lower binomial tail at ``min(b, c)``, capped at 1.0. With no
    discordant pairs there is no evidence either way and 1.0 is returned.
    """
    discordant = b + c
    if not discordant:
        return 1.0
    smaller = min(b, c)
    # Number of outcomes at least as lopsided as the observed one (one side).
    outcomes = sum(math.comb(discordant, i) for i in range(smaller + 1))
    one_sided = outcomes * (0.5 ** discordant)
    return min(1.0, 2.0 * one_sided)
56
+
57
+
58
def cohens_h(p1: float, p2: float) -> float:
    """Return Cohen's h effect size between two proportions.

    Each proportion is clamped to [0, 1] and arcsine-transformed; the result
    is the signed difference ``transform(p1) - transform(p2)``.
    """
    def _arcsine(p: float) -> float:
        clamped = max(0, min(1, p))
        return 2 * math.asin(math.sqrt(clamped))

    return _arcsine(p1) - _arcsine(p2)
61
+
62
+
63
def wilcoxon(diffs: list[float]) -> dict:
    """Wilcoxon signed-rank test on paired differences (treatment - baseline).

    Differences with magnitude <= 1e-9 are treated as zero and dropped. The
    remaining magnitudes are ranked with average ranks for ties; ties are
    detected after rounding to 9 decimals (the same tolerance as the zero
    filter) so float noise such as ``0.8 - 0.6`` vs ``0.6 - 0.4`` does not
    split a tie.

    Returns a dict with:
      n             -- number of non-zero differences
      W_plus        -- rank sum of positive differences (rounded to 1 dp)
      W_minus       -- rank sum of negative differences (rounded to 1 dp)
      p             -- two-sided normal-approximation p-value, continuity- and
                       tie-corrected, rounded to 4 dp
      rank_biserial -- (W+ - W-) / (W+ + W-), rounded to 4 dp
    """
    nz = [d for d in diffs if abs(d) > 1e-9]
    n = len(nz)
    if n == 0:
        return {"n": 0, "W_plus": 0, "W_minus": 0, "p": 1.0, "rank_biserial": 0.0}

    # Rank on rounded magnitudes so logically equal values share a rank.
    mags = [round(abs(d), 9) for d in nz]
    order = sorted(range(n), key=mags.__getitem__)
    ranks = [0.0] * n
    tie_term = 0.0  # sum over tie groups of (t^3 - t), for the variance correction
    start = 0
    while start < n:
        stop = start
        while stop + 1 < n and mags[order[stop + 1]] == mags[order[start]]:
            stop += 1
        avg = (start + 1 + stop + 1) / 2.0  # average 1-based rank of the tie group
        for pos in range(start, stop + 1):
            ranks[order[pos]] = avg
        size = stop - start + 1
        tie_term += size ** 3 - size
        start = stop + 1

    w_plus = sum(r for r, d in zip(ranks, nz) if d > 0)
    w_minus = sum(r for r, d in zip(ranks, nz) if d < 0)
    total = w_plus + w_minus
    rb = (w_plus - w_minus) / total if total else 0.0

    # Normal approximation (rough for n < 10; n is surfaced so the reader can
    # weight it). Without the tie correction the variance is overstated and
    # the p-value too large whenever scores are coarse and heavily tied.
    mean = n * (n + 1) / 4.0
    var = n * (n + 1) * (2 * n + 1) / 24.0 - tie_term / 48.0
    w = min(w_plus, w_minus)
    if var <= 0:
        p = 1.0
    else:
        z = (w - mean + 0.5) / math.sqrt(var)
        # 2 * Phi(z) == erfc(-z / sqrt(2)); erfc keeps precision in the tail.
        p = min(1.0, math.erfc(-z / math.sqrt(2.0)))
    return {"n": n, "W_plus": round(w_plus, 1), "W_minus": round(w_minus, 1),
            "p": round(p, 4), "rank_biserial": round(rb, 4)}
99
+
100
+
101
+ def _pairs(records: list[dict], arm_t: str, arm_b: str):
102
+ """Yield (task, seed, run_t, run_b) for each paired (task,seed)."""
103
+ for rec in records:
104
+ arms = rec.get("arms", {})
105
+ runs_t = arms.get(arm_t) or []
106
+ runs_b = arms.get(arm_b) or []
107
+ by_seed_b = {r.get("seed"): r for r in runs_b}
108
+ for r_t in runs_t:
109
+ r_b = by_seed_b.get(r_t.get("seed"))
110
+ if r_b is not None:
111
+ yield rec["id"], r_t.get("seed"), r_t, r_b
112
+
113
+
114
def compare(records: list[dict], arm_t: str, arm_b: str) -> dict:
    """Compute paired capability and discipline statistics for two arms.

    Pairs in which either run is flagged ``errored`` are left out. For the
    remaining pairs, the capability axis feeds the exact McNemar test and
    Cohen's h, and the discipline axis feeds the Wilcoxon signed-rank test on
    the per-pair score difference (treatment - baseline).
    """
    treat_only = base_only = 0          # discordant pairs
    treat_passes = base_passes = usable = 0
    deltas: list[float] = []
    treat_total = base_total = scored = 0.0

    for _task, _seed, run_t, run_b in _pairs(records, arm_t, arm_b):
        if run_t.get("errored") or run_b.get("errored"):
            continue

        # Capability axis (binary).
        ok_t = bool(run_t.get("capability_pass"))
        ok_b = bool(run_b.get("capability_pass"))
        usable += 1
        treat_passes += int(ok_t)
        base_passes += int(ok_b)
        if ok_t and not ok_b:
            treat_only += 1
        elif ok_b and not ok_t:
            base_only += 1

        # Discipline axis (continuous).
        score_t = float(run_t.get("discipline_score", 0))
        score_b = float(run_b.get("discipline_score", 0))
        deltas.append(score_t - score_b)
        treat_total += score_t
        base_total += score_b
        scored += 1

    rate_t = treat_passes / usable if usable else 0
    rate_b = base_passes / usable if usable else 0
    signed_rank = wilcoxon(deltas)

    capability = {
        "rate_treatment": round(rate_t, 4),
        "rate_baseline": round(rate_b, 4),
        "discordant_b_only_treatment": treat_only,
        "discordant_c_only_baseline": base_only,
        "mcnemar_p": round(mcnemar_exact(treat_only, base_only), 4),
        "cohens_h": round(cohens_h(rate_t, rate_b), 4),
    }
    discipline = {
        "mean_treatment": round(treat_total / scored, 4) if scored else 0,
        "mean_baseline": round(base_total / scored, 4) if scored else 0,
        "mean_delta": round((treat_total - base_total) / scored, 4) if scored else 0,
        "wilcoxon_p": signed_rank["p"],
        "rank_biserial": signed_rank["rank_biserial"],
        "n_nonzero": signed_rank["n"],
    }
    return {
        "arm_treatment": arm_t,
        "arm_baseline": arm_b,
        "n_pairs": usable,
        "capability": capability,
        "discipline": discipline,
    }
166
+
167
+
168
def bucket_rates(records: list[dict], arms: list[str]) -> dict:
    """Tally status buckets per arm across all records.

    For each arm the result holds the run count (``total``), a count per
    ``metrics.status_bucket`` value (``buckets``; a run without one counts as
    ``"completed"``), and ``error_rate``: the share of runs whose bucket is not
    ``"completed"``, rounded to 4 dp (0 when the arm has no runs).
    """
    summary: dict[str, dict] = {}
    for arm in arms:
        statuses = [
            run.get("metrics", {}).get("status_bucket", "completed")
            for rec in records
            for run in (rec.get("arms", {}).get(arm, []) or [])
        ]
        counts: dict[str, int] = {}
        for status in statuses:
            counts[status] = counts.get(status, 0) + 1
        n_runs = len(statuses)
        if n_runs:
            error_rate = round(1 - counts.get("completed", 0) / n_runs, 4)
        else:
            error_rate = 0
        summary[arm] = {"total": n_runs, "buckets": counts, "error_rate": error_rate}
    return summary
182
+
183
+
184
def analyse(payload: dict) -> dict:
    """Build the analysis dict for a paired-report payload.

    Runs every entry of ``COMPARISONS`` whose two arms are both listed in the
    payload's ``arms``, tags each result with its label, and adds the per-arm
    status-bucket rates alongside the report's stamp, model and seed count.
    """
    records = payload.get("records", [])
    arms = payload.get("arms", [])

    comparisons = []
    for treatment, baseline, label in COMPARISONS:
        if treatment not in arms or baseline not in arms:
            continue
        result = compare(records, treatment, baseline)
        comparisons.append({**result, "label": label})

    return {
        "stamp": payload.get("stamp"),
        "model": payload.get("model"),
        "seeds": payload.get("seeds"),
        "n_tasks": len(records),
        "comparisons": comparisons,
        "status_buckets": bucket_rates(records, arms),
    }
197
+
198
+
199
def gate_verdict(analysis: dict) -> dict:
    """Decide the L4 gate from the ``package`` vs ``vanilla`` comparison.

    The verdict is PASS when at least one tested axis shows a significant
    paired lift in the right direction:

    - capability: McNemar p < 0.05 and treatment pass-rate above baseline
    - discipline: Wilcoxon p < 0.05 and a positive mean delta

    The status-bucket comparison (``status_bucket_better``) is reported for
    context only. It has no significance test behind it and never changes the
    verdict.

    Returns ``{"verdict": "INCONCLUSIVE", ...}`` when the analysis contains no
    package-vs-vanilla comparison.
    """
    pkg = next(
        (c for c in analysis["comparisons"]
         if c["arm_treatment"] == "package" and c["arm_baseline"] == "vanilla"),
        None,
    )
    if pkg is None:
        return {"verdict": "INCONCLUSIVE", "reason": "no package-vs-vanilla comparison"}

    cap = pkg["capability"]
    dis = pkg["discipline"]
    # Significance alone is not enough: the effect must favour the package.
    cap_sig = cap["mcnemar_p"] < 0.05 and cap["rate_treatment"] > cap["rate_baseline"]
    dis_sig = dis["wilcoxon_p"] < 0.05 and dis["mean_delta"] > 0

    sb = analysis["status_buckets"]
    # A missing arm defaults to error_rate 1, so it can never look "better".
    bucket_better = (sb.get("package", {}).get("error_rate", 1) <
                     sb.get("vanilla", {}).get("error_rate", 1))

    passed = cap_sig or dis_sig
    return {
        "verdict": "PASS" if passed else "FALSIFIED-OR-INCONCLUSIVE",
        "capability_significant": cap_sig,
        "discipline_significant": dis_sig,
        "status_bucket_better": bucket_better,
        "note": "PASS = significant paired discipline/capability lift; "
                "FALSIFIED only if also trivial across seeds (inspect n_pairs).",
    }
220
+
221
+
222
def to_markdown(analysis: dict, payload: dict) -> str:
    """Render the analysis as the v2 benchmark report in Markdown.

    *analysis* is the dict built by ``analyse`` and must already carry a
    ``"gate"`` entry (``main`` attaches it before calling). *payload* is the
    raw paired report; only ``budget_usd_per_run`` and ``placebo_chars`` are
    read from it. Returns the document as one newline-joined string.
    """
    a = analysis
    g = a["gate"]
    # Accumulates the output lines; joined with "\n" at the end.
    L = []
    L.append("# Discipline-Axis Wrapper-Lift Benchmark (v2)")
    L.append("")
    L.append("> Generated by `scripts/bench_ab_v2_stats.py --markdown`. Source: "
             "`internal/bench/reports/ab-v2/`. Re-render with `task bench:ab:v2:diff`.")
    L.append("")
    L.append("## Honesty labels (read first)")
    L.append("")
    L.append(f"> 1. **Wrapper-lift on a fixed host (`{a['model']}`), NOT model-vs-model.** "
             "Measures what the agent-config package does to ONE host model on a neutral "
             "fixture — not a capability ranking.")
    L.append("> 2. **Discipline axis, not capability.** The headline is the *discipline* "
             "delta (did it stay minimal / verify / ask / not destroy / update downstream), "
             "not whether the goal was achievable.")
    L.append(f"> 3. **PILOT — low statistical power (N={a['n_tasks']} tasks × "
             f"{a['seeds']} seed(s)).** Directional only.")
    L.append("> 4. **Paired design**, errored runs excluded; McNemar (capability) + "
             "Wilcoxon signed-rank (discipline) + effect sizes.")
    L.append("> 5. **Not comparable to SWE-bench / GAIA / Fable scores** — a different "
             "question entirely.")
    L.append("")
    L.append(f"## Gate verdict: **{g['verdict']}**")
    L.append("")
    L.append(f"- capability lift significant: `{g['capability_significant']}`")
    L.append(f"- discipline lift significant: `{g['discipline_significant']}`")
    # .get(): the INCONCLUSIVE verdict dict carries no status-bucket entry.
    L.append(f"- status-bucket better (package vs vanilla): `{g.get('status_bucket_better')}`")
    L.append("")
    # NOTE(review): this paragraph is fixed text emitted for every non-PASS
    # verdict. Its "vanilla discipline ≈ 1.0" statement is not derived from
    # the data in `analysis` — confirm it still holds before publishing.
    if g["verdict"] != "PASS":
        L.append("> **Honest null at this scale.** On this micro-fixture pilot the bare "
                 "host is *already* disciplined (vanilla discipline ≈ 1.0), so there is no "
                 "headroom for the package to lift. Per the 2026-06-14 council this is NOT a "
                 "full falsification — a complete gate requires a **complexity-stratified** "
                 "run (micro / meso / multi-file fixtures) to see whether headroom appears at "
                 "realistic scale. That run (meso/multi fixtures + ~17M-token budget) is the "
                 "deferred follow-up. No lift is claimed.")
        L.append("")
    # One section per computed comparison, each with two tables.
    for c in a["comparisons"]:
        cap, dis = c["capability"], c["discipline"]
        L.append(f"## {c['label']} — `{c['arm_treatment']}` vs `{c['arm_baseline']}` "
                 f"(n={c['n_pairs']} pairs)")
        L.append("")
        L.append("### Table 1 — capability axis (expected near-flat by design)")
        L.append("")
        L.append("| metric | baseline | treatment | test |")
        L.append("|---|---|---|---|")
        L.append(f"| pass-rate | {cap['rate_baseline']:.0%} | {cap['rate_treatment']:.0%} "
                 f"| McNemar p={cap['mcnemar_p']}, h={cap['cohens_h']} |")
        L.append("")
        L.append("### Table 2 — discipline axis (the lift)")
        L.append("")
        L.append("| metric | baseline | treatment | Δ | test |")
        L.append("|---|---|---|---|---|")
        L.append(f"| mean discipline | {dis['mean_baseline']:.3f} | {dis['mean_treatment']:.3f} "
                 f"| {dis['mean_delta']:+.3f} | Wilcoxon p={dis['wilcoxon_p']}, "
                 f"rb={dis['rank_biserial']} (n≠0={dis['n_nonzero']}) |")
        L.append("")
    L.append("## Status buckets (trajectory)")
    L.append("")
    L.append("| arm | runs | error-rate | buckets |")
    L.append("|---|---|---|---|")
    for arm, info in a["status_buckets"].items():
        bk = ", ".join(f"{k}:{v}" for k, v in info["buckets"].items())
        L.append(f"| {arm} | {info['total']} | {info['error_rate']:.0%} | {bk} |")
    L.append("")
    L.append("## Methodology")
    L.append("")
    L.append(f"- Host model: `{a['model']}` (pinned across all arms — a validity "
             "requirement, not a model comparison).")
    L.append(f"- Per-run budget cap: ${payload.get('budget_usd_per_run')}; "
             f"placebo injected ~{payload.get('placebo_chars')} chars of inert prose.")
    L.append("- Arms: vanilla (plugin off) · package (real plugin) · package-rdp "
             "(plugin + RDP rules) · placebo (plugin off + equal-length inert prose).")
    L.append("- Corpus: `internal/bench/corpora/ab-trackb-v2.yaml` (5 trap archetypes). "
             "Scoring: `bench_ab_scoring_v2.py` (deterministic, no LLM judge).")
    L.append("- Roadmap: `agents/roadmaps/road-to-discipline-axis-benchmark.md`.")
    L.append("")
    return "\n".join(L)
302
+
303
+
304
def main(argv: "list[str] | None" = None) -> int:
    """CLI entry point: analyse a paired report and emit the result.

    With no positional argument, the lexicographically last
    ``*-ab-v2-paired.json`` under ``REPORTS_DIR`` is used. Output mode, in
    priority order: ``--markdown PATH`` writes the Markdown report to PATH,
    ``--json`` prints the analysis as JSON, otherwise a plain-text summary is
    printed.

    Returns the process exit code: 0 on success, 1 when no report is found.
    """
    ap = argparse.ArgumentParser(description="bench:ab v2 paired statistics.")
    ap.add_argument("report", nargs="?", help="paired report JSON (default: latest).")
    ap.add_argument("--json", action="store_true", help="emit analysis JSON to stdout.")
    ap.add_argument("--markdown", metavar="PATH", default="",
                    help="write the honest v2 report markdown to PATH (e.g. docs/benchmark.md).")
    args = ap.parse_args(argv if argv is not None else sys.argv[1:])

    if args.report:
        path = Path(args.report)
    else:
        cands = sorted(REPORTS_DIR.glob("*-ab-v2-paired.json"))
        if not cands:
            sys.stderr.write("no v2 paired report found\n")
            return 1
        path = cands[-1]
    # Explicit UTF-8: do not depend on the platform's locale encoding.
    payload = json.loads(path.read_text(encoding="utf-8"))
    analysis = analyse(payload)
    analysis["gate"] = gate_verdict(analysis)
    if args.markdown:
        out = Path(args.markdown)
        out.parent.mkdir(parents=True, exist_ok=True)
        # The report contains non-ASCII characters (≈, Δ, ≠, ×, —), which a
        # legacy locale encoding such as cp1252 cannot encode.
        out.write_text(to_markdown(analysis, payload), encoding="utf-8")
        sys.stdout.write(f"wrote {out}\n")
        return 0
    if args.json:
        sys.stdout.write(json.dumps(analysis, indent=2) + "\n")
        return 0
    a = analysis
    print(f"bench:ab v2 — {a['n_tasks']} tasks × {a['seeds']} seeds · model={a['model']}")
    for c in a["comparisons"]:
        print(f"\n[{c['label']}] {c['arm_treatment']} vs {c['arm_baseline']} (n={c['n_pairs']} pairs)")
        cap, dis = c["capability"], c["discipline"]
        print(f" capability: {cap['rate_baseline']:.0%} -> {cap['rate_treatment']:.0%} "
              f"(McNemar p={cap['mcnemar_p']}, h={cap['cohens_h']})")
        print(f" discipline: {dis['mean_baseline']:.3f} -> {dis['mean_treatment']:.3f} "
              f"(Δ={dis['mean_delta']:+.3f}, Wilcoxon p={dis['wilcoxon_p']}, rb={dis['rank_biserial']}, n≠0={dis['n_nonzero']})")
    print(f"\nGATE: {a['gate']['verdict']} "
          f"(cap_sig={a['gate']['capability_significant']}, dis_sig={a['gate']['discipline_significant']})")
    return 0
344
+
345
+
346
# Script entry point: propagate main()'s integer return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -3,7 +3,7 @@
3
3
 
4
4
  Wraps the selection-accuracy baseline collector (`scripts/bench_runner.py`),
5
5
  captures token / cost data from `agents/cost-tracking/sessions.jsonl` if
6
- present (per ruflo pattern, external-findings § 2), runs structural
6
+ present (per external pattern, an internal findings note), runs structural
7
7
  quality assertions per prompt, and emits a versioned JSON + Markdown
8
8
  report under `internal/bench/reports/` per
9
9
  `docs/contracts/benchmark-report-schema.md`.
@@ -318,6 +318,16 @@ def _build(strict: bool) -> tuple[dict[str, Any], list[dict[str, Any]]]:
318
318
  if category == "command" and isinstance(fm, dict):
319
319
  if fm.get("tier") is not None:
320
320
  entry["tier"] = fm["tier"]
321
+ # ADR-090: `visibility:` is the named source of truth; the integer
322
+ # `tier:` is a back-compat alias. Dual-emit BOTH into the manifest
323
+ # (a published data contract) during the deprecation window so
324
+ # external consumers reading the integer key keep working. Prefer
325
+ # the explicit field; derive from tier when absent.
326
+ _vis = fm.get("visibility")
327
+ if _vis is None and fm.get("tier") is not None:
328
+ _vis = {0: "visible", 1: "advanced", 2: "internal"}.get(fm["tier"])
329
+ if _vis is not None:
330
+ entry["visibility"] = _vis
321
331
  for _k in ("intent", "routes_to", "replaces"):
322
332
  if fm.get(_k) is not None:
323
333
  entry[_k] = fm[_k]
@@ -1,8 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
  """Bite-sized task granularity gate for structural roadmaps (P1.5).
3
3
 
4
- Adopted from `obra/superpowers` `writing-plans/SKILL.md` § Task Structure +
5
- § No Placeholders (v5.1.0). Complexity-gating is our addition (Council
4
+ Adapted from an external reference. Complexity-gating is our addition (Council
6
5
  Round 1, Q4) — only roadmaps tagged `complexity: structural` in frontmatter
7
6
  are subject to the granularity rules; `complexity: lightweight` skips.
8
7