@event4u/agent-config 3.3.0 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/README.md +2 -2
- package/.agent-src/commands/agent-handoff.md +31 -2
- package/.agent-src/commands/agent-status.md +5 -5
- package/.agent-src/commands/agents/audit.md +8 -8
- package/.agent-src/commands/agents/init.md +25 -1
- package/.agent-src/commands/agents/optimize.md +3 -3
- package/.agent-src/commands/agents/user.md +1 -1
- package/.agent-src/commands/agents.md +1 -1
- package/.agent-src/commands/analyze-reference-repo.md +1 -1
- package/.agent-src/commands/check-current-md.md +8 -8
- package/.agent-src/commands/{compress.md → condense.md} +55 -55
- package/.agent-src/commands/context/create.md +7 -4
- package/.agent-src/commands/context/refactor.md +3 -1
- package/.agent-src/commands/feature/dev.md +1 -1
- package/.agent-src/commands/feature/explore.md +1 -1
- package/.agent-src/commands/feature/plan.md +10 -8
- package/.agent-src/commands/feature/refactor.md +3 -1
- package/.agent-src/commands/feature/roadmap.md +7 -4
- package/.agent-src/commands/fix/portability.md +3 -3
- package/.agent-src/commands/fix/refs.md +4 -4
- package/.agent-src/commands/ghostwriter.md +2 -2
- package/.agent-src/commands/memory/learn-low-impact.md +3 -3
- package/.agent-src/commands/module/explore.md +34 -8
- package/.agent-src/commands/optimize/agents-dir.md +9 -7
- package/.agent-src/commands/optimize/augmentignore.md +2 -2
- package/.agent-src/commands/optimize/skills.md +9 -9
- package/.agent-src/commands/post-as.md +1 -1
- package/.agent-src/commands/project-analyze.md +2 -2
- package/.agent-src/commands/project-health.md +3 -2
- package/.agent-src/commands/research/deep.md +1 -1
- package/.agent-src/commands/research/report.md +1 -1
- package/.agent-src/commands/research.md +1 -1
- package/.agent-src/commands/roadmap/ai-council.md +1 -1
- package/.agent-src/commands/roadmap/create.md +9 -4
- package/.agent-src/commands/rule-compliance-audit.md +1 -1
- package/.agent-src/commands/upstream-contribute.md +14 -14
- package/.agent-src/commands/video/from-script.md +1 -1
- package/.agent-src/commands/video/scene.md +1 -1
- package/.agent-src/commands/video/stitch.md +1 -1
- package/.agent-src/commands/video/storyboard.md +1 -1
- package/.agent-src/commands/video.md +1 -1
- package/.agent-src/contexts/augment-infrastructure.md +1 -1
- package/.agent-src/contexts/authority/commit-mechanics.md +15 -0
- package/.agent-src/contexts/authority/kernel-rule-edits.md +3 -3
- package/.agent-src/contexts/authority/scope-mechanics.md +1 -1
- package/.agent-src/contexts/communication/rules-auto/augment-source-of-truth-mechanics.md +28 -28
- package/.agent-src/contexts/communication/rules-auto/skill-quality-mechanics.md +4 -4
- package/.agent-src/contexts/communication/rules-auto/think-before-action-mechanics.md +2 -2
- package/.agent-src/contexts/contracts/artifact-engagement-flow.md +6 -6
- package/.agent-src/contexts/contracts/command-suggestion-flow.md +3 -3
- package/.agent-src/contexts/contracts/emergency-triage-block.md +4 -4
- package/.agent-src/contexts/contracts/frugality-charter.md +3 -3
- package/.agent-src/contexts/documentation-hierarchy.md +14 -7
- package/.agent-src/contexts/execution/autonomy-examples.md +1 -1
- package/.agent-src/contexts/execution/cheap-question-mechanics.md +39 -2
- package/.agent-src/contexts/execution/roadmap-process-loop.md +28 -5
- package/.agent-src/contexts/override-system.md +5 -5
- package/.agent-src/ghostwriter/fictional-fixture-v1.md +1 -1
- package/.agent-src/personas/advisors/first-principles.md +1 -1
- package/.agent-src/personas/hollywood-director.md +1 -1
- package/.agent-src/rules/architecture.md +5 -1
- package/.agent-src/rules/augment-edit-discipline.md +5 -5
- package/.agent-src/rules/augment-source-of-truth.md +15 -15
- package/.agent-src/rules/commit-conventions.md +1 -1
- package/.agent-src/rules/commit-policy.md +10 -0
- package/.agent-src/rules/domain-adoption-policy.md +3 -3
- package/.agent-src/rules/fast-path-marker-visibility.md +3 -3
- package/.agent-src/rules/finance-safety-floor.md +1 -1
- package/.agent-src/rules/framework-neutrality-in-generic-skills.md +8 -8
- package/.agent-src/rules/git-history-discipline.md +1 -1
- package/.agent-src/rules/improve-before-implement.md +2 -2
- package/.agent-src/rules/language-and-tone.md +2 -2
- package/.agent-src/rules/media-governance-routing.md +5 -5
- package/.agent-src/rules/no-attribution-footers.md +1 -0
- package/.agent-src/rules/no-cheap-questions.md +3 -0
- package/.agent-src/rules/no-decorative-emojis-in-git-surfaces.md +111 -0
- package/.agent-src/rules/no-pr-progress-comments.md +118 -0
- package/.agent-src/rules/no-roadmap-references.md +3 -3
- package/.agent-src/rules/non-destructive-by-default.md +1 -1
- package/.agent-src/rules/persona-governance.md +3 -3
- package/.agent-src/rules/preservation-guard.md +15 -15
- package/.agent-src/rules/roadmap-ci-steps-policy.md +7 -3
- package/.agent-src/rules/rule-type-governance.md +1 -1
- package/.agent-src/rules/skill-quality.md +1 -1
- package/.agent-src/rules/{caveman-speak.md → telegraph-speak.md} +15 -15
- package/.agent-src/rules/token-optimizer-maintenance.md +6 -6
- package/.agent-src/skills/agent-docs-writing/SKILL.md +17 -11
- package/.agent-src/skills/agents-md-thin-root/SKILL.md +9 -9
- package/.agent-src/skills/check-refs/SKILL.md +2 -2
- package/.agent-src/skills/code-refactoring/SKILL.md +2 -2
- package/.agent-src/skills/command-writing/SKILL.md +19 -19
- package/.agent-src/skills/comp-banding/SKILL.md +1 -1
- package/.agent-src/skills/condense-memory/SKILL.md +131 -0
- package/.agent-src/skills/context-authoring/SKILL.md +2 -2
- package/.agent-src/skills/context-document/SKILL.md +5 -3
- package/.agent-src/skills/copilot-agents-optimization/SKILL.md +3 -3
- package/.agent-src/skills/description-assist/SKILL.md +2 -2
- package/.agent-src/skills/git-workflow/SKILL.md +1 -1
- package/.agent-src/skills/guideline-writing/SKILL.md +5 -5
- package/.agent-src/skills/learning-to-rule-or-skill/SKILL.md +4 -4
- package/.agent-src/skills/lint-skills/SKILL.md +3 -3
- package/.agent-src/skills/md-language-check/SKILL.md +2 -2
- package/.agent-src/skills/module-detect-on-the-fly/SKILL.md +138 -0
- package/.agent-src/skills/module-management/SKILL.md +166 -94
- package/.agent-src/skills/override-management/SKILL.md +1 -1
- package/.agent-src/skills/persona-writing/SKILL.md +5 -5
- package/.agent-src/skills/positioning-strategy/SKILL.md +1 -1
- package/.agent-src/skills/project-docs/SKILL.md +6 -4
- package/.agent-src/skills/readme-reviewer/SKILL.md +2 -2
- package/.agent-src/skills/roadmap-management/SKILL.md +13 -1
- package/.agent-src/skills/roadmap-writing/SKILL.md +4 -2
- package/.agent-src/skills/rule-refactor/SKILL.md +5 -5
- package/.agent-src/skills/rule-writing/SKILL.md +18 -18
- package/.agent-src/skills/script-writing/SKILL.md +1 -1
- package/.agent-src/skills/skill-improvement-pipeline/SKILL.md +6 -6
- package/.agent-src/skills/skill-management/SKILL.md +21 -21
- package/.agent-src/skills/skill-reviewer/SKILL.md +2 -2
- package/.agent-src/skills/skill-writing/SKILL.md +8 -8
- package/.agent-src/skills/skill-writing/evals/triggers.json +1 -1
- package/.agent-src/skills/token-optimizer/SKILL.md +4 -4
- package/.agent-src/skills/unit-economics-modeling/SKILL.md +1 -1
- package/.agent-src/skills/upstream-contribute/SKILL.md +17 -17
- package/.agent-src/templates/AGENTS.md +1 -0
- package/.agent-src/templates/agent-settings.md +24 -13
- package/.agent-src/templates/agents/agent-project-settings.example.yml +61 -2
- package/.agent-src/templates/command.md +5 -5
- package/.agent-src/templates/contexts.md +1 -1
- package/.agent-src/templates/copilot-instructions.md +8 -8
- package/.agent-src/templates/features.md +1 -1
- package/.agent-src/templates/hooks/pre-commit-frontmatter +2 -2
- package/.agent-src/templates/hooks/pre-commit-roadmap-progress +3 -3
- package/.agent-src/templates/persona.md +2 -2
- package/.agent-src/templates/roadmaps.md +1 -1
- package/.agent-src/templates/rule.md +13 -13
- package/.agent-src/templates/scripts/memory_lookup.py +1 -1
- package/.agent-src/templates/scripts/memory_status.py +2 -2
- package/.agent-src/templates/scripts/work_engine/_lib/agent_settings.py +195 -1
- package/.agent-src/templates/scripts/work_engine/orchestration.py +1 -1
- package/.agent-src/templates/skill-archive-note.md +5 -5
- package/.agent-src/templates/skill.md +1 -1
- package/.claude-plugin/marketplace.json +4 -4
- package/AGENTS.md +16 -17
- package/CHANGELOG.md +216 -3
- package/CONTRIBUTING.md +31 -12
- package/README.md +21 -12
- package/config/agent-settings.template.yml +22 -2
- package/config/discovery/unassigned-artefacts.yml +24 -24
- package/config/profiles/full.ini +1 -1
- package/dist/cli/agent-config.js +52 -3
- package/dist/cli/agent-config.js.map +1 -1
- package/dist/cli/commands/uiServe.js +9 -0
- package/dist/cli/commands/uiServe.js.map +1 -1
- package/dist/cli/registry.js +2 -1
- package/dist/cli/registry.js.map +1 -1
- package/dist/discovery/deprecation-report.md +1 -1
- package/dist/discovery/discovery-manifest.json +649 -606
- package/dist/discovery/discovery-manifest.json.sha256 +1 -1
- package/dist/discovery/discovery-manifest.summary.md +4 -4
- package/dist/discovery/orphan-report.md +1 -1
- package/dist/discovery/packs.json +439 -437
- package/dist/discovery/trust-report.md +5 -5
- package/dist/discovery/workspaces.json +450 -448
- package/dist/install/atomic.js +92 -0
- package/dist/install/atomic.js.map +1 -0
- package/dist/install/conflict.js +196 -0
- package/dist/install/conflict.js.map +1 -0
- package/dist/install/detect.js +218 -0
- package/dist/install/detect.js.map +1 -0
- package/dist/install/paths.js +82 -0
- package/dist/install/paths.js.map +1 -0
- package/dist/install/plan.js +157 -0
- package/dist/install/plan.js.map +1 -0
- package/dist/install/txlog.js +140 -0
- package/dist/install/txlog.js.map +1 -0
- package/dist/install/types.js +19 -0
- package/dist/install/types.js.map +1 -0
- package/dist/install/wizard-plan.js +184 -0
- package/dist/install/wizard-plan.js.map +1 -0
- package/dist/mcp/registry-manifest.json +4 -4
- package/dist/router.json +67 -19
- package/dist/server/app.js +6 -0
- package/dist/server/app.js.map +1 -1
- package/dist/server/routes/install.js +358 -0
- package/dist/server/routes/install.js.map +1 -0
- package/dist/server/routes/wizard.js +468 -32
- package/dist/server/routes/wizard.js.map +1 -1
- package/dist/server/routes/workspace.js +396 -0
- package/dist/server/routes/workspace.js.map +1 -0
- package/dist/server/schemas/settings.js +5 -3
- package/dist/server/schemas/settings.js.map +1 -1
- package/dist/ui/assets/index-BDAhhpDV.js +40 -0
- package/dist/ui/assets/index-BDAhhpDV.js.map +1 -0
- package/dist/ui/assets/index-BXZILUxe.css +1 -0
- package/dist/ui/index.html +2 -2
- package/docs/MIGRATION.md +1 -1
- package/docs/adrs/cost/0001-hard-stop-hook.md +1 -1
- package/docs/adrs/router/0001-three-tier-routing.md +4 -4
- package/docs/adrs/schema/0001-json-schema-frontmatter.md +1 -1
- package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +4 -4
- package/docs/adrs/{caveman → telegraph}/0001-default-off-until-bench.md +9 -9
- package/docs/adrs/telegraph/README.md +9 -0
- package/docs/architecture/augment-projection.md +4 -4
- package/docs/architecture/claude-bundle.md +1 -1
- package/docs/architecture/current-onboard-baseline.md +3 -3
- package/docs/architecture/multi-tool-projection.md +10 -10
- package/docs/architecture/source-projection.md +27 -27
- package/docs/architecture.md +19 -15
- package/docs/archive/CHANGELOG-pre-2.11.0.md +2 -2
- package/docs/archive/CHANGELOG-pre-2.15.0.md +3 -3
- package/docs/archive/CHANGELOG-pre-2.16.0.md +1 -1
- package/docs/archive/CHANGELOG-pre-2.2.0.md +70 -70
- package/docs/archive/CHANGELOG-pre-2.20.0.md +2 -2
- package/docs/archive/CHANGELOG-pre-2.25.0.md +15 -15
- package/docs/archive/CHANGELOG-pre-3.0.0.md +4 -4
- package/docs/archive/CHANGELOG-pre-3.1.0.md +2 -2
- package/docs/archive/CHANGELOG-pre-3.2.0.md +3 -3
- package/docs/benchmark.md +65 -0
- package/docs/benchmarks.md +16 -16
- package/docs/catalog.md +17 -15
- package/docs/contracts/CHANGELOG-conventions.md +1 -1
- package/docs/contracts/STABILITY.md +2 -2
- package/docs/contracts/adoption-signal-floor.md +110 -0
- package/docs/contracts/adr-chat-history-split.md +4 -4
- package/docs/contracts/adr-command-suggestion.md +4 -4
- package/docs/contracts/adr-gtm-context-spine.md +1 -1
- package/docs/contracts/adr-implement-ticket-runtime.md +4 -4
- package/docs/contracts/adr-install-user-type-axis.md +1 -1
- package/docs/contracts/adr-layout.md +2 -2
- package/docs/contracts/adr-product-ui-track.md +10 -10
- package/docs/contracts/adr-user-types-axis.md +3 -3
- package/docs/contracts/adr-wing4-context-spine.md +1 -1
- package/docs/contracts/agent-memory-contract.md +3 -3
- package/docs/contracts/agents-md-tech-stack.md +2 -2
- package/docs/contracts/ai-council-config.md +2 -2
- package/docs/contracts/at-rest-encryption.md +4 -0
- package/docs/contracts/audit-log-v1.md +1 -1
- package/docs/contracts/benchmark-ab-contract.md +101 -0
- package/docs/contracts/benchmark-corpus-spec.md +1 -1
- package/docs/contracts/branch-protection-policy.md +98 -0
- package/docs/contracts/ci-cost-budget.md +106 -0
- package/docs/contracts/ci-green-floor.md +141 -0
- package/docs/contracts/command-clusters.md +6 -6
- package/docs/contracts/command-surface-tiers.md +2 -2
- package/docs/contracts/command-taxonomy.md +2 -2
- package/docs/contracts/{compression-default-kill-criterion.md → condensation-default-kill-criterion.md} +29 -29
- package/docs/contracts/config-presets.md +1 -1
- package/docs/contracts/context-paths.md +3 -3
- package/docs/contracts/context-spine.md +1 -1
- package/docs/contracts/cost-summary-schema.md +12 -12
- package/docs/contracts/cross-wing-handoff.md +4 -4
- package/docs/contracts/daily-workspace.md +4 -0
- package/docs/contracts/decision-trace-v1.md +2 -2
- package/docs/contracts/discovery-manifest.md +4 -4
- package/docs/contracts/explain-modes.md +4 -0
- package/docs/contracts/file-ownership-matrix.json +3493 -3318
- package/docs/contracts/file-ownership-matrix.md +3 -3
- package/docs/contracts/frontmatter-contract.md +4 -4
- package/docs/contracts/ghostwriter-schema.md +3 -3
- package/docs/contracts/gui-wizard.md +110 -97
- package/docs/contracts/harness-expectations.md +123 -0
- package/docs/contracts/host-agent-protocol.md +4 -0
- package/docs/contracts/implement-ticket-flow.md +9 -9
- package/docs/contracts/install-scopes.md +77 -0
- package/docs/contracts/iron-law-overrides.txt +1 -1
- package/docs/contracts/kernel-membership.md +26 -26
- package/docs/contracts/linear-ai-rules-inclusion.md +1 -1
- package/docs/contracts/linter-structural-model.md +2 -2
- package/docs/contracts/load-context-budget-model.md +4 -4
- package/docs/contracts/load-context-schema.md +13 -13
- package/docs/contracts/local-analytics.md +4 -0
- package/docs/contracts/local-knowledge-ingestion.md +1 -1
- package/docs/contracts/mcp-cloud-scope.md +2 -2
- package/docs/contracts/mcp-phase-1-scope.md +3 -3
- package/docs/contracts/measurement-baseline.md +5 -5
- package/docs/contracts/mental-models.md +30 -30
- package/docs/contracts/multi-tool-projection-fidelity.md +4 -4
- package/docs/contracts/namespace.md +4 -4
- package/docs/contracts/orchestration-dsl-v1.md +7 -7
- package/docs/contracts/package-self-orientation.md +12 -12
- package/docs/contracts/persona-schema.md +6 -6
- package/docs/contracts/pilot/language-and-tone.md +1 -1
- package/docs/contracts/plain-language-surface.md +117 -0
- package/docs/contracts/profile-system.md +3 -3
- package/docs/contracts/release-pr-gating.md +103 -0
- package/docs/contracts/role-experience.md +3 -3
- package/docs/contracts/rule-classification.md +13 -13
- package/docs/contracts/rule-interactions.md +4 -4
- package/docs/contracts/rule-interactions.yml +30 -30
- package/docs/contracts/rule-priority-hierarchy.md +13 -13
- package/docs/contracts/rule-router.md +2 -2
- package/docs/contracts/safety-model.md +1 -1
- package/docs/contracts/skill-distribution-channels.md +61 -0
- package/docs/contracts/skill-domains.md +2 -2
- package/docs/contracts/smoke-contracts.md +5 -5
- package/docs/contracts/telegraph-telemetry.md +83 -0
- package/docs/contracts/trust-and-safety.md +5 -5
- package/docs/contracts/ui-stack-extension.md +7 -7
- package/docs/contracts/ui-track-flow.md +9 -9
- package/docs/contracts/user-type-schema.md +4 -4
- package/docs/contracts/workflow-packs.md +4 -4
- package/docs/contracts/workspace-documents.md +4 -0
- package/docs/customization.md +28 -8
- package/docs/decisions/ADR-001-kernel-swap-deferred.md +6 -6
- package/docs/decisions/ADR-002-kernel-bucket-overrides.md +11 -11
- package/docs/decisions/ADR-003-flat-cluster-subs-and-colon-syntax.md +2 -2
- package/docs/decisions/ADR-004-rule-governance-pruning.md +4 -4
- package/docs/decisions/ADR-005-subagent-worktrees.md +7 -7
- package/docs/decisions/ADR-011-domain-pack-readiness.md +6 -6
- package/docs/decisions/ADR-013-discovery-frontmatter-contract.md +3 -3
- package/docs/decisions/ADR-015-discovery-manifest-contract.md +3 -3
- package/docs/decisions/ADR-017-monorepo-physical-layout.md +10 -10
- package/docs/decisions/ADR-018-trust-and-safety-layer.md +6 -6
- package/docs/decisions/ADR-019-router-json-dist-location.md +2 -2
- package/docs/decisions/ADR-020-global-only-consumer-scope.md +2 -2
- package/docs/decisions/ADR-021-deployment-shape.md +3 -3
- package/docs/decisions/ADR-022-daily-workspace-decomposition.md +1 -1
- package/docs/decisions/ADR-027-changelog-machine-vs-manual.md +2 -2
- package/docs/decisions/ADR-028-root-layout.md +7 -7
- package/docs/decisions/ADR-029-multi-workspace-deferred.md +2 -2
- package/docs/decisions/ADR-rule-kernel-and-router.md +5 -5
- package/docs/deploy/connector-setup.md +2 -2
- package/docs/deploy/policy-cookbook.md +2 -2
- package/docs/deploy/team-deployment-posture.md +20 -0
- package/docs/development.md +17 -17
- package/docs/distribution/registries.md +32 -0
- package/docs/distribution/registry-submissions.md +85 -0
- package/docs/distribution/telemetry-schema.md +1 -1
- package/docs/getting-started-by-role.md +45 -3
- package/docs/getting-started.md +2 -2
- package/docs/guidelines/agent-infra/5w2h-analysis.md +3 -3
- package/docs/guidelines/agent-infra/ask-when-uncertain-demos.md +1 -1
- package/docs/guidelines/agent-infra/asking-and-brevity-examples.md +3 -3
- package/docs/guidelines/agent-infra/carve-out-predicates.md +3 -3
- package/docs/guidelines/agent-infra/critical-thinking.md +4 -4
- package/docs/guidelines/agent-infra/direct-answers-demos.md +1 -1
- package/docs/guidelines/agent-infra/first-principles.md +2 -2
- package/docs/guidelines/agent-infra/inversion-thinking.md +5 -5
- package/docs/guidelines/agent-infra/layered-settings.md +56 -2
- package/docs/guidelines/agent-infra/mental-models.md +3 -3
- package/docs/guidelines/agent-infra/roadmap-progress-mechanics.md +2 -2
- package/docs/guidelines/agent-infra/rule-type-governance.md +1 -1
- package/docs/guidelines/agent-infra/scqa-framework.md +5 -5
- package/docs/guidelines/agent-infra/self-improvement-pipeline.md +2 -2
- package/docs/guidelines/agent-infra/six-hats.md +3 -3
- package/docs/guidelines/agent-infra/skill-quality-checklist.md +5 -5
- package/docs/guidelines/agent-infra/systems-thinking.md +1 -1
- package/docs/guidelines/agent-infra/verify-before-complete-demos.md +1 -1
- package/docs/guidelines/augment-portability-patterns.md +4 -4
- package/docs/guidelines/cross-role-handoff.md +2 -2
- package/docs/guidelines/php/php-coding-patterns.md +1 -1
- package/docs/guidelines/prompt-templates.md +6 -6
- package/docs/maintainers/dev-mode.md +1 -1
- package/docs/mcp.md +1 -1
- package/docs/parity/bench.json +3 -3
- package/docs/parity/ruflo.md +2 -2
- package/docs/profiles.md +11 -11
- package/docs/quality.md +11 -11
- package/docs/safety.md +3 -3
- package/docs/setup/mcp-client-config.md +1 -1
- package/docs/setup/mcp-r2-bootstrap.md +1 -1
- package/docs/setup/mcp-server-docker.md +3 -3
- package/docs/setup/per-ide/windsurf.md +1 -1
- package/docs/skills-catalog.md +8 -7
- package/docs/troubleshooting.md +1 -1
- package/docs/walkthroughs/daily-workspace-a11y.md +87 -0
- package/llms.txt +7 -6
- package/package.json +1 -1
- package/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
- package/scripts/_archive/README.md +2 -2
- package/scripts/_archive/_backfill_skill_domains.py +3 -3
- package/scripts/_archive/_bootstrap_tier_frontmatter.py +3 -3
- package/scripts/_archive/_p43_bodies.py +10 -10
- package/scripts/_archive/{_p43_compress.py → _p43_condense.py} +5 -5
- package/scripts/_archive/_p4_migrate.py +7 -7
- package/scripts/_archive/_phase2_shim_helper.py +1 -1
- package/scripts/_archive/_pilot_council_question.py +5 -5
- package/scripts/_cli/explain_last/inputs.py +1 -1
- package/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- package/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
- package/scripts/_lib/agent_settings.py +195 -1
- package/scripts/_lib/agent_src.py +19 -19
- package/scripts/_lib/bench_ab_cache.py +162 -0
- package/scripts/_lib/bench_ab_scoring.py +209 -0
- package/scripts/_lib/{bench_caveman.py → bench_telegraph.py} +21 -21
- package/scripts/_lib/{bench_caveman_report.py → bench_telegraph_report.py} +21 -21
- package/scripts/_lib/claude_desktop_bundler.py +5 -5
- package/scripts/_lib/module_detection.py +223 -0
- package/scripts/_lib/scope_guard.sh +162 -0
- package/scripts/_phase4_bucket.py +3 -3
- package/scripts/_pilot_measure.py +4 -4
- package/scripts/_tmp_scan_framework_leakage.py +1 -1
- package/scripts/adoption_report.py +195 -0
- package/scripts/adoption_snapshot.py +219 -0
- package/scripts/adoption_status.py +166 -0
- package/scripts/ai-video/lib/parse-blueprint.sh +1 -1
- package/scripts/ai_council/advisors.py +5 -5
- package/scripts/ai_council/compile_corpus.py +1 -1
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_budget_v2_audit.py +3 -3
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_context_layer_v1_review.py +2 -2
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_inject_quiet_flag.py +1 -1
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_measure_v2.sh +1 -1
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_measure_verbosity.sh +1 -1
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_nondestructive_inline_audit.py +3 -3
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_per_task.sh +1 -1
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_phase6_trigger_jaccard.py +1 -1
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_phase_2a_budget_rebalance.py +6 -6
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_rebalancing_audit.py +1 -1
- package/scripts/ai_council/one_off_archive/2026-05/_one_off_tier_retrofit.py +6 -6
- package/scripts/annotate_discovery.py +13 -13
- package/scripts/apply_modules_config.py +290 -0
- package/scripts/audit_adr_coverage.py +2 -2
- package/scripts/audit_auto_rules.py +2 -2
- package/scripts/audit_cloud_compatibility.py +3 -3
- package/scripts/audit_command_surface.py +9 -9
- package/scripts/audit_likelihood.py +2 -2
- package/scripts/audit_user_type_axis.py +2 -2
- package/scripts/bench_ab_cache_dispatch.py +68 -0
- package/scripts/bench_ab_clone.py +170 -0
- package/scripts/bench_ab_diff.py +227 -0
- package/scripts/bench_ab_integrity.py +143 -0
- package/scripts/bench_ab_run.py +235 -0
- package/scripts/bench_ab_task_runner.py +369 -0
- package/scripts/bench_ab_tracka_run.py +202 -0
- package/scripts/{bench_compress_memory.py → bench_condense_memory.py} +16 -16
- package/scripts/bench_run.py +33 -33
- package/scripts/bench_runner.py +2 -2
- package/scripts/bootstrap.sh +99 -0
- package/scripts/build_cloud_bundle.py +6 -6
- package/scripts/build_discovery_manifest.py +7 -7
- package/scripts/build_linear_digest.py +3 -3
- package/scripts/build_rule_trigger_matrix.py +8 -8
- package/scripts/chat_history.py +5 -5
- package/scripts/check_always_budget.py +11 -5
- package/scripts/check_augment_description_cap.py +3 -3
- package/scripts/check_cluster_patterns.py +2 -2
- package/scripts/check_command_count_messaging.py +3 -3
- package/scripts/{check_compression.py → check_condensation.py} +34 -34
- package/scripts/{check_compressed_paths.py → check_condensed_paths.py} +8 -8
- package/scripts/check_context_paths.py +7 -7
- package/scripts/check_council_layout.py +2 -2
- package/scripts/check_council_references.py +9 -9
- package/scripts/check_iron_law_prominence.py +2 -2
- package/scripts/check_kernel_rule_bundle.py +2 -2
- package/scripts/check_module_management_neutral.py +149 -0
- package/scripts/check_no_roadmap_refs.py +9 -9
- package/scripts/check_portability.py +3 -3
- package/scripts/check_public_catalog_links.py +4 -4
- package/scripts/check_references.py +7 -6
- package/scripts/check_release_pr_shape.py +112 -0
- package/scripts/check_reply_consistency.py +3 -3
- package/scripts/check_safety_floor_untouched.py +1 -1
- package/scripts/check_template_pin_drift.py +5 -5
- package/scripts/check_token_optimizer_freshness.py +3 -3
- package/scripts/ci_status.py +301 -0
- package/scripts/ci_time_ratio.py +1 -1
- package/scripts/cleanup_other_scope.sh +146 -0
- package/scripts/compile_router.py +10 -10
- package/scripts/{compress.py → condense.py} +64 -64
- package/scripts/condense.sh +18 -0
- package/scripts/{compress_memory.py → condense_memory.py} +33 -33
- package/scripts/config/presets.py +2 -2
- package/scripts/config/profiles.py +1 -1
- package/scripts/cost_by_conversation.py +3 -3
- package/scripts/cost_summary.py +7 -7
- package/scripts/count_token_optimizer_usage.sh +1 -1
- package/scripts/gen_discovery_baseline.py +5 -5
- package/scripts/generate_index.py +6 -6
- package/scripts/generate_ownership_matrix.py +10 -10
- package/scripts/generate_pack_manifests.py +1 -1
- package/scripts/ghostwriter_fixture_allowlist.txt +1 -1
- package/scripts/install +3 -3
- package/scripts/install-hooks.sh +6 -6
- package/scripts/install.py +273 -45
- package/scripts/install.sh +187 -1
- package/scripts/inventory_frontmatter.py +2 -2
- package/scripts/iron_law_sha.py +3 -3
- package/scripts/lint_agents_layout.py +14 -7
- package/scripts/lint_agents_md.py +4 -4
- package/scripts/lint_archived_skills.py +3 -3
- package/scripts/lint_artefact_frontmatter.py +2 -2
- package/scripts/lint_bench_ab.py +172 -0
- package/scripts/lint_bench_corpus.py +1 -1
- package/scripts/lint_command_tiers.py +5 -5
- package/scripts/lint_context_spine_usage.py +1 -1
- package/scripts/lint_framework_leakage.py +7 -7
- package/scripts/lint_framework_leakage_allowlist.json +152 -84
- package/scripts/lint_ghostwriter_source.py +3 -3
- package/scripts/lint_handoffs.py +1 -1
- package/scripts/lint_load_context.py +11 -11
- package/scripts/lint_media_policy_linkage.py +5 -5
- package/scripts/lint_namespace.py +1 -1
- package/scripts/lint_no_new_atomic_commands.py +2 -2
- package/scripts/lint_orchestration_dsl.py +1 -1
- package/scripts/lint_pack_boundaries.py +2 -2
- package/scripts/lint_persona_governance.py +4 -4
- package/scripts/lint_role_experiences.py +237 -0
- package/scripts/lint_rule_interactions.py +2 -2
- package/scripts/lint_rule_tiers.py +1 -1
- package/scripts/lint_trust_coherence.py +2 -2
- package/scripts/mcp_registry_submit.sh +187 -0
- package/scripts/mcp_server/tools.py +1 -1
- package/scripts/measure_frugality_savings.py +10 -10
- package/scripts/measure_patterns.py +1 -1
- package/scripts/measure_projection_bytes.py +5 -5
- package/scripts/measure_rule_budget.py +3 -3
- package/scripts/measure_skill_reduction.py +1 -1
- package/scripts/memory_lookup.py +1 -1
- package/scripts/memory_status.py +2 -2
- package/scripts/migrate_command_suggestions.py +3 -3
- package/scripts/mine_session.py +1 -1
- package/scripts/move_artefact.py +3 -3
- package/scripts/new_skill.py +2 -2
- package/scripts/pack_mcp_content.py +9 -9
- package/scripts/plan_physical_move.py +6 -6
- package/scripts/print_required_checks.py +196 -0
- package/scripts/probe_skill_registration.py +413 -0
- package/scripts/propose_modules_config.py +145 -0
- package/scripts/prototype_lint_contradictions.py +1 -1
- package/scripts/recruit_preflight.sh +152 -0
- package/scripts/refine_ticket_detect.py +3 -3
- package/scripts/release.py +20 -0
- package/scripts/render_benchmark_md.py +308 -0
- package/scripts/roadmap_progress_hook.py +1 -1
- package/scripts/run_skill_evals.py +2 -2
- package/scripts/runtime_registry.py +4 -4
- package/scripts/schemas/command.schema.json +4 -4
- package/scripts/schemas/rule.schema.json +5 -5
- package/scripts/schemas/skill.schema.json +3 -3
- package/scripts/schemas/user-type.schema.json +1 -1
- package/scripts/score_skill_selection.py +1 -1
- package/scripts/skill_collision_clusters.py +2 -2
- package/scripts/skill_linter.py +81 -81
- package/scripts/skill_overlap.py +5 -5
- package/scripts/skill_tools/audit_persona_coverage.py +2 -2
- package/scripts/skill_tools/audit_user_type_coverage.py +2 -2
- package/scripts/skill_tools/run_block_d_eval.py +1 -1
- package/scripts/skill_tools/score_skill_relevance.py +1 -1
- package/scripts/skill_tools/suggest_skill_for_task.py +1 -1
- package/scripts/skill_trigger_eval.py +3 -3
- package/scripts/smoke/kernel.sh +7 -1
- package/scripts/smoke/router.sh +5 -5
- package/scripts/smoke/skills.sh +1 -1
- package/scripts/smoke_quickstart.py +1 -1
- package/scripts/snapshot_agent_outputs.py +3 -3
- package/scripts/spotcheck_thin_root.py +1 -1
- package/scripts/{caveman_stats.py → telegraph_stats.py} +18 -18
- package/scripts/update_counts.py +1 -1
- package/scripts/validate_decision_engine.py +1 -1
- package/scripts/validate_frontmatter.py +1 -1
- package/scripts/validate_safe_paths.py +3 -3
- package/scripts/{validate_caveman_carveouts.py → validate_telegraph_carveouts.py} +7 -7
- package/scripts/verify_roadmap_closure.py +6 -6
- package/templates/consumer-settings/ONBOARDING.md +41 -0
- package/.agent-src/commands/install-via-agent.md +0 -129
- package/.agent-src/skills/compress-memory/SKILL.md +0 -131
- package/dist/ui/assets/index-D-DY1ywI.js +0 -35
- package/dist/ui/assets/index-D-DY1ywI.js.map +0 -1
- package/dist/ui/assets/index-Dqfhmg-d.css +0 -1
- package/docs/adrs/caveman/README.md +0 -9
- package/docs/contracts/caveman-telemetry.md +0 -83
- package/scripts/compress.sh +0 -18
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Top-level orchestrator for the package-impact A/B bench.
|
|
3
|
+
|
|
4
|
+
Phase 2 Step 1 of `agents/roadmaps/road-to-package-impact-benchmark.md`.
|
|
5
|
+
|
|
6
|
+
A thin wrapper around the per-track runners (Track A behavioural eval,
|
|
7
|
+
Track B task corpus). Owns:
|
|
8
|
+
|
|
9
|
+
- the `--variant {with,without}` axis,
|
|
10
|
+
- the cache lookup that decides whether the `without` arm runs at all,
|
|
11
|
+
- the report-header convention (cache key, variant, corpus, timestamp),
|
|
12
|
+
- the report-path convention `internal/bench/reports/ab/{stamp}-{corpus}-{variant}.json`.
|
|
13
|
+
|
|
14
|
+
Track A's actual runner lands in Phase 3; Track B's in Phase 4. Until then
|
|
15
|
+
this script writes stub reports so the cache and diff plumbing can be
|
|
16
|
+
exercised end-to-end.
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import sys
|
|
23
|
+
import time
|
|
24
|
+
from datetime import datetime, timezone
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
28
|
+
sys.path.insert(0, str(REPO_ROOT / "scripts"))
|
|
29
|
+
|
|
30
|
+
from _lib import bench_ab_cache # type: ignore[import-not-found] # noqa: E402
|
|
31
|
+
|
|
32
|
+
# Locations of reports, corpora, and per-variant clones inside the repo.
REPORTS_DIR = REPO_ROOT.joinpath("internal", "bench", "reports", "ab")
CORPUS_DIR = REPO_ROOT.joinpath("internal", "bench", "corpora")
CLONES_DIR = REPO_ROOT.joinpath("internal", "bench", "ab", "clones")

# Supported corpora (created in Phases 3 + 4).
KNOWN_CORPORA = ("ab-tracka", "ab-trackb")

# Written into every report under the "schema" key.
REPORT_SCHEMA_VERSION = "ab-bench/0.1"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def utc_stamp() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH-MM-SSZ``.

    Hyphens replace the usual colons in the time part; the stamp is used
    as a report file-name prefix.
    """
    now = datetime.now(tz=timezone.utc)
    return f"{now:%Y-%m-%dT%H-%M-%SZ}"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def corpus_path(corpus: str) -> Path:
    """Return the path of the ``<corpus>.yaml`` file inside ``CORPUS_DIR``."""
    filename = f"{corpus}.yaml"
    return CORPUS_DIR.joinpath(filename)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def report_path(stamp: str, corpus: str, variant: str) -> Path:
    """Return the JSON report path ``REPORTS_DIR/{stamp}-{corpus}-{variant}.json``."""
    filename = f"{stamp}-{corpus}-{variant}.json"
    return REPORTS_DIR.joinpath(filename)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def ensure_clone(variant: str) -> Path:
    """Return the clone directory for *variant*, creating it when absent.

    An existing clone is returned untouched; refreshing is a user-driven
    choice, so the helper is always invoked with ``refresh=False``.

    Raises:
        RuntimeError: if ``scripts/bench_ab_clone.py`` cannot be loaded.
    """
    clone_dir = CLONES_DIR / variant
    if clone_dir.exists():
        return clone_dir

    # Imported lazily: only the "clone is missing" path needs the loader.
    from importlib import util as importlib_util

    helper_file = REPO_ROOT / "scripts" / "bench_ab_clone.py"
    helper_spec = importlib_util.spec_from_file_location("bench_ab_clone", helper_file)
    if helper_spec is None or helper_spec.loader is None:
        raise RuntimeError("cannot load bench_ab_clone helper")
    helper = importlib_util.module_from_spec(helper_spec)
    helper_spec.loader.exec_module(helper)
    helper.clone(variant, refresh=False)  # type: ignore[attr-defined]
    return clone_dir
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def run_track_stub(variant: str, corpus: str, clone_root: Path) -> dict[str, object]:
    """Build the placeholder results block used until the real runners land.

    The block names the corpus as its track, is marked ``"stub"``, and
    records the clone path relative to the repository root.

    Raises:
        ValueError: if *clone_root* is not located under ``REPO_ROOT``.
    """
    explanation = (
        "Phase 2 plumbing only. The real runner lands in Phase 3 (Track A) "
        "or Phase 4 (Track B). See road-to-package-impact-benchmark.md."
    )
    relative_clone = clone_root.relative_to(REPO_ROOT)
    return dict(
        track=corpus,
        status="stub",
        note=explanation,
        clone_root=str(relative_clone),
        variant=variant,
    )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def write_report(
    *,
    variant: str,
    corpus: str,
    stamp: str,
    cache_key: bench_ab_cache.CacheKey,
    results: dict[str, object],
    duration_seconds: float,
) -> Path:
    """Persist one bench arm as a JSON report plus a Markdown sibling.

    Args:
        variant: Which arm was run.
        corpus: Corpus name; part of the report file name.
        stamp: Timestamp string; prefix of the report file name.
        cache_key: Cache key stored in the report via ``to_dict()``.
        results: Runner results, embedded unchanged.
        duration_seconds: Wall time of the run; rounded to milliseconds.

    Returns:
        Path of the JSON report. The Markdown file sits next to it with a
        ``.md`` suffix.
    """
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report = {
        "schema": REPORT_SCHEMA_VERSION,
        "stamp": stamp,
        "variant": variant,
        "corpus": corpus,
        "cache_key": cache_key.to_dict(),
        "duration_seconds": round(duration_seconds, 3),
        "results": results,
    }
    json_path = report_path(stamp, corpus, variant)
    # Explicit UTF-8: the Markdown rendering contains non-ASCII punctuation,
    # and the locale default would make the bytes on disk depend on the host.
    json_path.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
    md_path = json_path.with_suffix(".md")
    md_path.write_text(render_markdown(report), encoding="utf-8")
    return json_path
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def render_markdown(report: dict[str, object]) -> str:
    """Render *report* as a short Markdown document.

    Layout: a title naming variant and corpus, stamp and duration bullets,
    one bullet per cache-key entry, and the results as a fenced JSON block.
    A missing or empty ``cache_key`` yields an empty section; a missing
    ``results`` renders as ``null``.

    Raises:
        KeyError: if ``variant``, ``corpus``, ``stamp`` or
            ``duration_seconds`` is absent from *report*.
    """
    header = [
        f"# A/B Bench Report — {report['variant']} · {report['corpus']}",
        "",
        f"- Stamp: `{report['stamp']}`",
        f"- Duration: {report['duration_seconds']}s",
        "",
        "## Cache key",
        "",
    ]
    cache_key = report.get("cache_key") or {}
    key_bullets = [
        f"- `{name}`: `{value}`"
        for name, value in cache_key.items()  # type: ignore[union-attr]
    ]
    results_json = json.dumps(report.get("results"), indent=2)
    footer = ["", "## Results", "", "```json", results_json, "```", ""]
    return "\n".join([*header, *key_bullets, *footer])
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the orchestrator's command line.

    ``--variant`` and ``--corpus`` are required; ``--non-interactive`` and
    ``--reuse-cache`` are boolean switches that default to off.
    """
    reuse_cache_help = (
        "If a fresh cached `without` report exists, skip re-running and "
        "exit 0 without writing a new report. Only meaningful for "
        "--variant without."
    )
    # Declared in the order they appear in --help output.
    option_specs = (
        (
            "--variant",
            {
                "choices": ("with", "without"),
                "required": True,
                "help": "Which target clone to run against.",
            },
        ),
        (
            "--corpus",
            {
                "choices": KNOWN_CORPORA,
                "required": True,
                "help": "Which corpus to execute.",
            },
        ),
        (
            "--non-interactive",
            {
                "action": "store_true",
                "help": "Never prompt; assume defaults on cache decisions.",
            },
        ),
        (
            "--reuse-cache",
            {"action": "store_true", "help": reuse_cache_help},
        ),
    )
    cli = argparse.ArgumentParser(
        description="Run one arm of the package-impact A/B bench."
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args(argv)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def main(argv: list[str] | None = None) -> int:
    """Run one arm of the A/B bench and write its report.

    Steps:
      1. Resolve the corpus file; a missing corpus is reported but not
         fatal, and a synthetic ``"missing-corpus"`` hash is used instead.
      2. For ``--variant without --reuse-cache`` with the corpus present,
         consult the cache. A fresh cached report ends the run; a stale one
         is kept only under ``--non-interactive``.
      3. Otherwise make sure the clone exists, run the stub track, and write
         the JSON and Markdown reports.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit code; every path visible here returns 0.
    """
    args = parse_args(argv if argv is not None else sys.argv[1:])

    corpus_file = corpus_path(args.corpus)
    # Checked once so the warning, the cache key, and the cache lookup all
    # act on the same answer.
    corpus_present = corpus_file.exists()
    if not corpus_present:
        sys.stdout.write(
            f"bench_ab_run: corpus '{args.corpus}' missing at {corpus_file} — "
            "Phase 3 (track A) or Phase 4 (track B) author it. Writing a "
            "placeholder run with the synthetic corpus hash so cache plumbing "
            "remains exercisable.\n"
        )

    cache_key_value = bench_ab_cache.CacheKey(
        corpus_hash=(
            bench_ab_cache.hash_file(corpus_file)
            if corpus_present
            else "missing-corpus"
        ),
        claude_cli_version=bench_ab_cache.claude_cli_version(),
        target_shape_hash=bench_ab_cache.target_shape_hash(),
    )

    if args.variant == "without" and args.reuse_cache and corpus_present:
        lookup = bench_ab_cache.lookup(corpus_file)
        if lookup.fresh and lookup.report_path is not None:
            sys.stdout.write(
                f"bench_ab_run: reusing fresh cached `without` report at "
                f"{lookup.report_path.relative_to(REPO_ROOT)}\n"
            )
            return 0
        if lookup.found and not lookup.fresh:
            sys.stdout.write(
                f"bench_ab_run: cached `without` report stale ({lookup.reason})\n"
            )
            if args.non_interactive:
                # NOTE(review): the message promises "flagging the run", but
                # this path writes no report; confirm where the flag is
                # meant to be recorded.
                sys.stdout.write(
                    "bench_ab_run: --non-interactive — reusing stale baseline "
                    "and flagging the run.\n"
                )
                return 0
            # This point is only reachable with --reuse-cache set, so the
            # hint names the flag combination that keeps the baseline.
            sys.stdout.write(
                "bench_ab_run: continuing with a fresh run "
                "(pass --non-interactive together with --reuse-cache to keep "
                "the stale baseline)\n"
            )

    clone_root = ensure_clone(args.variant)
    started = time.monotonic()
    results = run_track_stub(args.variant, args.corpus, clone_root)
    duration = time.monotonic() - started
    path = write_report(
        variant=args.variant,
        corpus=args.corpus,
        stamp=utc_stamp(),
        cache_key=cache_key_value,
        results=results,
        duration_seconds=duration,
    )
    sys.stdout.write(
        f"bench_ab_run: wrote {path.relative_to(REPO_ROOT)}\n"
    )
    return 0
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Track B — task runner for the package-impact A/B bench.
|
|
3
|
+
|
|
4
|
+
Phase 4 Step 2 of `agents/roadmaps/road-to-package-impact-benchmark.md`.
|
|
5
|
+
|
|
6
|
+
For each task in `internal/bench/corpora/ab-trackb.yaml`, in each variant:
|
|
7
|
+
|
|
8
|
+
1. Snapshot the variant clone's file tree.
|
|
9
|
+
2. Invoke the `claude` CLI with the task prompt — OR dry-run, depending
|
|
10
|
+
on `--mode`.
|
|
11
|
+
3. Capture the transcript, tool-call events, wall-time, and (if available)
|
|
12
|
+
token + cost counts.
|
|
13
|
+
4. Snapshot the post-run tree.
|
|
14
|
+
5. Score the task via scripts/_lib/bench_ab_scoring.py.
|
|
15
|
+
|
|
16
|
+
Modes:
|
|
17
|
+
|
|
18
|
+
- `dry-run` (default) — record the would-run shell command, write a stub
|
|
19
|
+
transcript naming the variant, score against the unchanged tree. The
|
|
20
|
+
result is structural-zero for every check that requires a file write,
|
|
21
|
+
but the scoring + reporting pipeline runs end-to-end. This is what the
|
|
22
|
+
bench produces in CI by default — fast, free, repeatable.
|
|
23
|
+
- `live` — actually invoke the `claude` CLI with `--print` (one-shot
|
|
24
|
+
mode) and the task prompt. Reads `CLAUDE_CLI` from env if set, falls
|
|
25
|
+
back to `claude` on PATH. Captures stdout and stderr as the transcript. Honors
|
|
26
|
+
`--samples N` for repeated runs (planned; the CLI does not define this flag yet).
|
|
27
|
+
|
|
28
|
+
The runner ALWAYS resets the clone to a clean state before each task and
|
|
29
|
+
ALWAYS records the mode in the report header so a reader can never mistake
|
|
30
|
+
a dry-run report for a real measurement.
|
|
31
|
+
"""
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import argparse
|
|
35
|
+
import hashlib
|
|
36
|
+
import json
|
|
37
|
+
import os
|
|
38
|
+
import shutil
|
|
39
|
+
import subprocess
|
|
40
|
+
import sys
|
|
41
|
+
import time
|
|
42
|
+
from datetime import datetime, timezone
|
|
43
|
+
from pathlib import Path
|
|
44
|
+
|
|
45
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
46
|
+
sys.path.insert(0, str(REPO_ROOT / "scripts"))
|
|
47
|
+
|
|
48
|
+
from _lib import bench_ab_cache # type: ignore[import-not-found] # noqa: E402
|
|
49
|
+
from _lib import bench_ab_scoring # type: ignore[import-not-found] # noqa: E402
|
|
50
|
+
|
|
51
|
+
try:
|
|
52
|
+
import yaml
|
|
53
|
+
except ImportError:
|
|
54
|
+
sys.stderr.write("bench_ab_task_runner: PyYAML required (pip install pyyaml)\n")
|
|
55
|
+
raise SystemExit(2)
|
|
56
|
+
|
|
57
|
+
# Track B corpus, per-variant clones, and report output directory.
CORPUS_PATH = REPO_ROOT.joinpath("internal", "bench", "corpora", "ab-trackb.yaml")
CLONES_DIR = REPO_ROOT.joinpath("internal", "bench", "ab", "clones")
REPORTS_DIR = REPO_ROOT.joinpath("internal", "bench", "reports", "ab")

# How far we descend into a clone when snapshotting. The fixture is shallow.
SNAPSHOT_MAX_DEPTH = 6
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def utc_stamp() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH-MM-SSZ``."""
    stamp_format = "%Y-%m-%dT%H-%M-%SZ"
    current = datetime.now(tz=timezone.utc)
    return current.strftime(stamp_format)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def snapshot_clone(clone_root: Path, *, max_depth: int = SNAPSHOT_MAX_DEPTH) -> dict[str, str]:
    """Map each fixture file under *clone_root* to a short content digest.

    Keys are POSIX-style paths relative to the clone; values are the first
    16 hex characters of the file's SHA-256.

    Left out of the snapshot:
      * anything under a top-level ``.claude`` or ``.augment`` directory,
      * the top-level files ``AGENTS.md``, ``CLAUDE.md`` and
        ``.bench-ab-manifest.json`` (the variant axis, not the task surface),
      * files whose relative path has more than *max_depth* components,
      * files that cannot be read (``OSError``).
    """
    ignored_dirs = frozenset({".claude", ".augment"})
    ignored_files = frozenset({"AGENTS.md", "CLAUDE.md", ".bench-ab-manifest.json"})
    digests: dict[str, str] = {}
    for candidate in sorted(clone_root.rglob("*")):
        if not candidate.is_file():
            continue
        relative = candidate.relative_to(clone_root)
        segments = relative.parts
        key = relative.as_posix()
        excluded = (
            (bool(segments) and segments[0] in ignored_dirs)
            # Compared against the full relative path, so only top-level
            # files with these names are excluded.
            or key in ignored_files
            or len(segments) > max_depth
        )
        if excluded:
            continue
        try:
            payload = candidate.read_bytes()
        except OSError:
            continue
        digests[key] = hashlib.sha256(payload).hexdigest()[:16]
    return digests
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def reset_clone(variant: str) -> Path:
    """Rebuild the clone for *variant* and return whatever the helper returns.

    The helper script ``scripts/bench_ab_clone.py`` is loaded from its file
    path on every call and invoked with ``refresh=True``, so each task starts
    from the same state.

    Raises:
        RuntimeError: if the helper script cannot be loaded.
    """
    from importlib import util as importlib_util

    helper_file = REPO_ROOT / "scripts" / "bench_ab_clone.py"
    helper_spec = importlib_util.spec_from_file_location("bench_ab_clone", helper_file)
    if helper_spec is None or helper_spec.loader is None:
        raise RuntimeError("cannot load bench_ab_clone helper")
    helper = importlib_util.module_from_spec(helper_spec)
    helper_spec.loader.exec_module(helper)
    rebuilt = helper.clone(variant, refresh=True)  # type: ignore[attr-defined]
    return rebuilt
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def claude_executable() -> str | None:
    """Resolve the claude CLI to invoke.

    A non-empty ``CLAUDE_CLI`` environment variable wins and is returned
    as-is. Otherwise ``"claude"`` is returned when it is found on ``PATH``,
    and ``None`` when it is not.
    """
    configured = os.environ.get("CLAUDE_CLI")
    if configured:
        return configured
    return "claude" if shutil.which("claude") is not None else None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def run_live(task: dict, clone_root: Path, *, timeout_s: int) -> dict:
    """Invoke claude in print/one-shot mode against the task prompt.

    Args:
        task: Corpus task; its ``prompt`` value is passed to the CLI.
        clone_root: Working directory for the CLI process.
        timeout_s: Seconds after which the process is abandoned.

    Returns:
        A dict with ``mode``, ``reason``, ``transcript``, ``exit_code`` and
        ``wall_time_seconds``. ``mode`` is ``"live-skipped"`` when the CLI
        cannot be found or launched, otherwise ``"live"``. A timeout yields
        ``exit_code`` -1 and a transcript ending in ``[TIMEOUT]``.
    """
    binary = claude_executable()
    if binary is None:
        return {
            "mode": "live-skipped",
            "reason": "claude CLI not found; set CLAUDE_CLI or install it",
            "transcript": "",
            "exit_code": None,
            "wall_time_seconds": 0.0,
        }
    # `or ""` also covers an explicit `prompt: null`, which would otherwise
    # put None into the argument list and make subprocess raise TypeError.
    prompt = task.get("prompt") or ""
    cmd = [binary, "--print", "--", prompt]
    started = time.monotonic()
    try:
        proc = subprocess.run(
            cmd,
            cwd=clone_root,
            capture_output=True,
            text=True,
            timeout=timeout_s,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        # The captured output on TimeoutExpired is bytes even with text=True,
        # so decode before concatenating with the str marker.
        partial = exc.stdout
        if isinstance(partial, bytes):
            partial = partial.decode("utf-8", errors="replace")
        return {
            "mode": "live",
            "reason": f"timeout after {timeout_s}s",
            "transcript": (partial or "") + "\n[TIMEOUT]",
            "exit_code": -1,
            "wall_time_seconds": round(time.monotonic() - started, 3),
        }
    except OSError as exc:
        # E.g. CLAUDE_CLI points at a missing or non-executable file. Report
        # it the same way as a missing CLI instead of aborting the whole run.
        return {
            "mode": "live-skipped",
            "reason": f"cannot launch claude CLI ({binary}): {exc}",
            "transcript": "",
            "exit_code": None,
            "wall_time_seconds": 0.0,
        }
    duration = time.monotonic() - started
    return {
        "mode": "live",
        "reason": "ok",
        "transcript": proc.stdout + "\n" + proc.stderr,
        "exit_code": proc.returncode,
        "wall_time_seconds": round(duration, 3),
    }
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def run_dry(task: dict, clone_root: Path, variant: str) -> dict:
    """Produce the deterministic result of a run that never calls claude.

    The stub transcript names only the variant, the clone path and the task
    id. The prompt is left out on purpose: echoing it would let
    transcript-keyword criteria match the prompt text rather than an agent
    response, so the stub stays inert for every `transcript_contains_*`
    criterion.
    """
    transcript_lines = (
        "[bench_ab_task_runner dry-run]",
        f"variant={variant}",
        f"clone={clone_root}",
        f"task_id={task.get('id')}",
        "[no claude invocation; --mode live to execute for real]",
    )
    transcript = "".join(f"{line}\n" for line in transcript_lines)
    return dict(
        mode="dry-run",
        reason="ok",
        transcript=transcript,
        exit_code=0,
        wall_time_seconds=0.0,
    )
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def count_ask_events(transcript: str) -> dict[str, int]:
    """Count "asking" versus "acting" phrases in *transcript*.

    Matching is a case-insensitive substring count over two fixed phrase
    lists. ``ratio`` is ``asked / (asked + acted)`` rounded to three places,
    or 0 when neither kind of phrase occurs. This is a crude heuristic.
    """
    if not transcript:
        return {"asked": 0, "acted_with_commit": 0, "ratio": 0}

    haystack = transcript.lower()

    def occurrences(markers: tuple[str, ...]) -> int:
        return sum(haystack.count(marker) for marker in markers)

    asked = occurrences(("should i", "do you want", "shall i", "soll ich", "möchtest du"))
    acted = occurrences(("git commit", "git push", "gh pr create", "gh pr merge"))
    denominator = asked + acted
    if denominator:
        ratio = round(asked / denominator, 3)
    else:
        ratio = 0
    return {"asked": asked, "acted_with_commit": acted, "ratio": ratio}
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def per_category_aggregate(per_task: list[dict]) -> dict[str, dict]:
    """Summarise per-task results by category.

    Args:
        per_task: Task result dicts. Each may carry ``category``,
            ``score`` (a dict with a ``passed`` flag) and
            ``wall_time_seconds``.

    Returns:
        ``{category: {"passed", "total", "completion_rate",
        "mean_wall_time"}}``. Entries whose category is missing, ``None``
        or empty are grouped under ``"unknown"``.
    """
    grouped: dict[str, list[dict]] = {}
    for entry in per_task:
        # `or` rather than a .get default: the key may be present with a
        # None value, which would otherwise become a None dict key.
        category = entry.get("category") or "unknown"
        grouped.setdefault(category, []).append(entry)

    summary: dict[str, dict] = {}
    for category, entries in grouped.items():
        # A group exists only because an entry was appended, so total >= 1.
        total = len(entries)
        passed = sum(1 for e in entries if e.get("score", {}).get("passed"))
        wall_time = sum(e.get("wall_time_seconds", 0) for e in entries)
        summary[category] = {
            "passed": passed,
            "total": total,
            "completion_rate": round(passed / total, 4),
            "mean_wall_time": round(wall_time / total, 3),
        }
    return summary
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def write_report(
    variant: str,
    *,
    mode: str,
    per_task: list[dict],
    duration: float,
) -> Path:
    """Write the Track B report for one variant as JSON plus Markdown.

    Args:
        variant: Variant the tasks were run against.
        mode: Run mode; stored in the results and shown in the Markdown title.
        per_task: Per-task result dicts, embedded unchanged under
            ``results.per_task``.
        duration: Wall time of the whole variant run, in seconds.

    Returns:
        Path of the JSON report; the Markdown summary sits next to it with
        a ``.md`` suffix.
    """
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    cache_key = bench_ab_cache.CacheKey(
        corpus_hash=bench_ab_cache.hash_file(CORPUS_PATH),
        claude_cli_version=bench_ab_cache.claude_cli_version(),
        target_shape_hash=bench_ab_cache.target_shape_hash(),
    )
    total = len(per_task)
    passed = sum(1 for e in per_task if e.get("score", {}).get("passed"))
    # Every aggregate falls back to 0 for an empty task list so the report
    # can still be written without dividing by zero.
    results = {
        "mode": mode,
        "completion_rate": round(passed / total, 4) if total else 0,
        "passed": passed,
        "total": total,
        "per_category": per_category_aggregate(per_task),
        "mean_wall_time": round(
            sum(e.get("wall_time_seconds", 0) for e in per_task) / total, 3
        )
        if total
        else 0,
        "ask_vs_act_ratio": round(
            sum(e.get("ask_events", {}).get("ratio", 0) for e in per_task) / total, 3
        )
        if total
        else 0,
        "per_task": per_task,
    }
    stamp = utc_stamp()
    payload = {
        "schema": "ab-bench/0.1",
        "stamp": stamp,
        "variant": variant,
        "corpus": "ab-trackb",
        "cache_key": cache_key.to_dict(),
        "duration_seconds": round(duration, 3),
        "results": results,
    }
    path = REPORTS_DIR / f"{stamp}-ab-trackb-{variant}.json"
    # Explicit UTF-8: the Markdown summary contains non-ASCII punctuation,
    # and the locale default would make the bytes on disk depend on the host.
    path.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    md = path.with_suffix(".md")
    md.write_text(
        f"# Track B · {variant} · {mode}\n\n"
        f"- Stamp: `{stamp}`\n"
        f"- Completion rate: **{results['completion_rate'] * 100:.1f}%**"
        f" ({passed}/{total})\n"
        f"- Mean wall-time: {results['mean_wall_time']}s\n"
        f"- Ask vs. act ratio: {results['ask_vs_act_ratio']}\n"
        f"\n## Per-category\n\n"
        + "\n".join(
            f"- `{cat}` — {info['passed']}/{info['total']} "
            f"({info['completion_rate'] * 100:.1f}%)"
            for cat, info in results["per_category"].items()
        )
        + "\n",
        encoding="utf-8",
    )
    return path
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def run_variant(variant: str, tasks: list[dict], *, mode: str, timeout_s: int) -> dict:
    """Run every task against one variant, score it, and write the report.

    For each task the clone is rebuilt, snapshotted, exercised (live or
    dry-run), snapshotted again and scored. A one-line summary goes to
    stdout once the report is written.

    Returns:
        ``{"path": <report path>, "per_task": <records>, "duration": <seconds>}``.
    """
    clock_start = time.monotonic()
    live = mode == "live"
    records: list[dict] = []
    for task in tasks:
        workdir = reset_clone(variant)
        before = snapshot_clone(workdir)
        outcome = (
            run_live(task, workdir, timeout_s=timeout_s)
            if live
            else run_dry(task, workdir, variant)
        )
        after = snapshot_clone(workdir)
        verdict = bench_ab_scoring.score_task(
            task,
            pre_snapshot=before,
            post_snapshot=after,
            clone_root=workdir,
            transcript=outcome.get("transcript", ""),
        )
        records.append(
            {
                "id": task.get("id"),
                "category": task.get("category"),
                "score": verdict,
                "wall_time_seconds": outcome.get("wall_time_seconds", 0.0),
                "exit_code": outcome.get("exit_code"),
                # The runner may report a more specific mode than requested.
                "mode": outcome.get("mode", mode),
                "reason": outcome.get("reason", ""),
                "ask_events": count_ask_events(outcome.get("transcript", "")),
            }
        )
    elapsed = time.monotonic() - clock_start
    report_file = write_report(variant, mode=mode, per_task=records, duration=elapsed)
    pass_count = sum(1 for record in records if record["score"]["passed"])
    sys.stdout.write(
        f"bench_ab_task_runner: {variant} ({mode}) → "
        f"{pass_count}/{len(records)} "
        f"passed — {report_file.relative_to(REPO_ROOT)}\n"
    )
    return {"path": report_file, "per_task": records, "duration": elapsed}
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the Track B runner's command line.

    Defaults: ``--variant both``, ``--mode dry-run``, ``--timeout 120``.
    """
    mode_help = (
        "dry-run: stub transcript, no CLI invocation (fast, free). "
        "live: invoke `claude --print` per task (cost-bearing)."
    )
    cli = argparse.ArgumentParser(description="Run Track B tasks per variant.")
    cli.add_argument(
        "--variant",
        default="both",
        choices=("with", "without", "both"),
        help="Which variant to run (default: both).",
    )
    cli.add_argument(
        "--mode",
        default="dry-run",
        choices=("dry-run", "live"),
        help=mode_help,
    )
    cli.add_argument(
        "--timeout",
        default=120,
        type=int,
        help="Live mode: per-task timeout in seconds (default 120).",
    )
    namespace = cli.parse_args(argv)
    return namespace
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def main(argv: list[str] | None = None) -> int:
    """Load the Track B corpus and run the requested variant(s).

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        0 after all requested variants ran; 1 when the corpus file is
        missing or contains no tasks.
    """
    args = parse_args(argv if argv is not None else sys.argv[1:])
    if not CORPUS_PATH.exists():
        sys.stderr.write(f"bench_ab_task_runner: corpus missing at {CORPUS_PATH}\n")
        return 1
    data = yaml.safe_load(CORPUS_PATH.read_text(encoding="utf-8"))
    # safe_load returns None for an empty document and may return a list or
    # scalar; only a mapping can carry a "tasks" key.
    tasks = (data.get("tasks") or []) if isinstance(data, dict) else []
    if not tasks:
        sys.stderr.write("bench_ab_task_runner: corpus has no tasks\n")
        return 1
    variants = ("with", "without") if args.variant == "both" else (args.variant,)
    for variant in variants:
        run_variant(variant, tasks, mode=args.mode, timeout_s=args.timeout)
    return 0
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|