@event4u/agent-config 6.0.0 → 6.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +5 -5
- package/CHANGELOG.md +167 -440
- package/README.md +3 -3
- package/dist/agent-src/commands/agent-handoff.md +5 -4
- package/dist/agent-src/commands/agent-status.md +1 -0
- package/dist/agent-src/commands/agents/audit.md +1 -0
- package/dist/agent-src/commands/agents/init.md +3 -0
- package/dist/agent-src/commands/agents/optimize.md +1 -0
- package/dist/agent-src/commands/agents/user/accept.md +1 -0
- package/dist/agent-src/commands/agents/user/init.md +1 -0
- package/dist/agent-src/commands/agents/user/review.md +1 -0
- package/dist/agent-src/commands/agents/user/show.md +1 -0
- package/dist/agent-src/commands/agents/user/update.md +1 -0
- package/dist/agent-src/commands/agents/user.md +1 -0
- package/dist/agent-src/commands/agents.md +1 -0
- package/dist/agent-src/commands/analytics/prune.md +3 -2
- package/dist/agent-src/commands/analytics/show.md +3 -2
- package/dist/agent-src/commands/analytics.md +3 -2
- package/dist/agent-src/commands/analyze-reference-repo.md +1 -0
- package/dist/agent-src/commands/bug-fix.md +1 -0
- package/dist/agent-src/commands/bug-investigate.md +1 -0
- package/dist/agent-src/commands/challenge-me/vision.md +3 -2
- package/dist/agent-src/commands/challenge-me/with-docs.md +3 -2
- package/dist/agent-src/commands/challenge-me.md +3 -2
- package/dist/agent-src/commands/chat-history/import.md +9 -9
- package/dist/agent-src/commands/chat-history.md +32 -30
- package/dist/agent-src/commands/check-current-md.md +1 -0
- package/dist/agent-src/commands/commit/in-chunks.md +1 -0
- package/dist/agent-src/commands/commit.md +1 -0
- package/dist/agent-src/commands/condense.md +1 -0
- package/dist/agent-src/commands/context/create.md +1 -0
- package/dist/agent-src/commands/context/refactor.md +1 -0
- package/dist/agent-src/commands/context.md +1 -0
- package/dist/agent-src/commands/cost-report.md +5 -4
- package/dist/agent-src/commands/council/analysis.md +3 -2
- package/dist/agent-src/commands/council/debate.md +5 -4
- package/dist/agent-src/commands/council/default.md +3 -2
- package/dist/agent-src/commands/council/design.md +3 -2
- package/dist/agent-src/commands/council/optimize.md +3 -2
- package/dist/agent-src/commands/council/pr.md +3 -2
- package/dist/agent-src/commands/council.md +4 -3
- package/dist/agent-src/commands/e2e-heal.md +1 -0
- package/dist/agent-src/commands/e2e-plan.md +1 -0
- package/dist/agent-src/commands/estimate-ticket.md +1 -0
- package/dist/agent-src/commands/feature/dev.md +1 -0
- package/dist/agent-src/commands/feature/explore.md +1 -0
- package/dist/agent-src/commands/feature/plan.md +6 -6
- package/dist/agent-src/commands/feature/refactor.md +1 -0
- package/dist/agent-src/commands/feature/roadmap.md +1 -0
- package/dist/agent-src/commands/feature.md +1 -0
- package/dist/agent-src/commands/fix/ci.md +1 -0
- package/dist/agent-src/commands/fix/portability.md +1 -0
- package/dist/agent-src/commands/fix/pr-comments.md +147 -15
- package/dist/agent-src/commands/fix/refs.md +1 -0
- package/dist/agent-src/commands/fix/seeder.md +1 -0
- package/dist/agent-src/commands/fix.md +8 -8
- package/dist/agent-src/commands/ghostwriter/delete.md +1 -0
- package/dist/agent-src/commands/ghostwriter/fetch.md +1 -0
- package/dist/agent-src/commands/ghostwriter/list.md +1 -0
- package/dist/agent-src/commands/ghostwriter/show.md +1 -0
- package/dist/agent-src/commands/ghostwriter/write.md +1 -0
- package/dist/agent-src/commands/ghostwriter.md +1 -0
- package/dist/agent-src/commands/grill-me.md +3 -2
- package/dist/agent-src/commands/image/analyse.md +1 -0
- package/dist/agent-src/commands/image/create.md +1 -0
- package/dist/agent-src/commands/image/verify.md +1 -0
- package/dist/agent-src/commands/image.md +1 -0
- package/dist/agent-src/commands/implement-ticket.md +1 -0
- package/dist/agent-src/commands/jira-ticket.md +1 -0
- package/dist/agent-src/commands/judge/on-diff.md +1 -0
- package/dist/agent-src/commands/judge/solo.md +1 -0
- package/dist/agent-src/commands/judge/steps.md +1 -0
- package/dist/agent-src/commands/judge.md +1 -0
- package/dist/agent-src/commands/knowledge/cross-repo.md +1 -0
- package/dist/agent-src/commands/knowledge/forget.md +1 -0
- package/dist/agent-src/commands/knowledge/ingest.md +1 -0
- package/dist/agent-src/commands/knowledge/list.md +1 -0
- package/dist/agent-src/commands/knowledge.md +1 -0
- package/dist/agent-src/commands/memory/add.md +8 -6
- package/dist/agent-src/commands/memory/learn-low-impact.md +3 -2
- package/dist/agent-src/commands/memory/load.md +7 -7
- package/dist/agent-src/commands/memory/mine-session.md +39 -12
- package/dist/agent-src/commands/memory/promote.md +3 -2
- package/dist/agent-src/commands/memory/propose.md +7 -6
- package/dist/agent-src/commands/memory.md +3 -2
- package/dist/agent-src/commands/mode.md +1 -0
- package/dist/agent-src/commands/module/create.md +1 -0
- package/dist/agent-src/commands/module/explore.md +1 -0
- package/dist/agent-src/commands/module.md +1 -0
- package/dist/agent-src/commands/optimize/agents-dir.md +1 -0
- package/dist/agent-src/commands/optimize/augmentignore.md +1 -0
- package/dist/agent-src/commands/optimize/rtk.md +1 -0
- package/dist/agent-src/commands/optimize/skills.md +1 -0
- package/dist/agent-src/commands/optimize-prompt.md +1 -0
- package/dist/agent-src/commands/optimize.md +1 -0
- package/dist/agent-src/commands/orchestrate.md +1 -0
- package/dist/agent-src/commands/override/create.md +1 -0
- package/dist/agent-src/commands/override/manage.md +1 -0
- package/dist/agent-src/commands/override.md +1 -0
- package/dist/agent-src/commands/package-reset.md +1 -0
- package/dist/agent-src/commands/package-test.md +1 -0
- package/dist/agent-src/commands/post-as/ghostwriter.md +1 -0
- package/dist/agent-src/commands/post-as/me.md +1 -0
- package/dist/agent-src/commands/post-as.md +1 -0
- package/dist/agent-src/commands/pr/create/description-only.md +1 -0
- package/dist/agent-src/commands/pr/create.md +25 -0
- package/dist/agent-src/commands/prediction-pool.md +1 -0
- package/dist/agent-src/commands/prepare-for-review.md +1 -0
- package/dist/agent-src/commands/profile/activate.md +1 -0
- package/dist/agent-src/commands/profile/deactivate.md +1 -0
- package/dist/agent-src/commands/profile/show.md +1 -0
- package/dist/agent-src/commands/profile.md +1 -0
- package/dist/agent-src/commands/project-analyze.md +1 -0
- package/dist/agent-src/commands/project-health.md +1 -0
- package/dist/agent-src/commands/quality-fix.md +1 -0
- package/dist/agent-src/commands/refine-ticket.md +1 -0
- package/dist/agent-src/commands/research/deep.md +1 -0
- package/dist/agent-src/commands/research/report.md +1 -0
- package/dist/agent-src/commands/research.md +1 -0
- package/dist/agent-src/commands/review-changes.md +1 -0
- package/dist/agent-src/commands/review-routing.md +1 -0
- package/dist/agent-src/commands/roadmap/ai-council.md +1 -0
- package/dist/agent-src/commands/roadmap/create.md +1 -0
- package/dist/agent-src/commands/roadmap/process-full.md +1 -0
- package/dist/agent-src/commands/roadmap/process-phase.md +1 -0
- package/dist/agent-src/commands/roadmap/process-step.md +1 -0
- package/dist/agent-src/commands/roadmap.md +1 -0
- package/dist/agent-src/commands/rule-compliance-audit.md +1 -0
- package/dist/agent-src/commands/security-audit-config.md +84 -0
- package/dist/agent-src/commands/set-cost-profile.md +1 -0
- package/dist/agent-src/commands/skill/preview.md +1 -0
- package/dist/agent-src/commands/skill.md +1 -0
- package/dist/agent-src/commands/skills/discover.md +1 -0
- package/dist/agent-src/commands/skills.md +1 -0
- package/dist/agent-src/commands/sync-agent-settings.md +1 -0
- package/dist/agent-src/commands/sync-gitignore/fix.md +1 -0
- package/dist/agent-src/commands/sync-gitignore.md +1 -0
- package/dist/agent-src/commands/tests/create.md +1 -0
- package/dist/agent-src/commands/tests/execute.md +1 -0
- package/dist/agent-src/commands/tests.md +1 -0
- package/dist/agent-src/commands/threat-model.md +1 -0
- package/dist/agent-src/commands/update-form-request-messages.md +1 -0
- package/dist/agent-src/commands/upstream-contribute.md +1 -0
- package/dist/agent-src/commands/video/from-script.md +1 -0
- package/dist/agent-src/commands/video/from-song.md +1 -0
- package/dist/agent-src/commands/video/scene.md +1 -0
- package/dist/agent-src/commands/video/stitch.md +1 -0
- package/dist/agent-src/commands/video/storyboard.md +1 -0
- package/dist/agent-src/commands/video.md +1 -0
- package/dist/agent-src/commands/work.md +1 -0
- package/dist/agent-src/contexts/augment-infrastructure.md +1 -1
- package/dist/agent-src/contexts/communication/rules-auto/skill-quality-mechanics.md +1 -1
- package/dist/agent-src/contexts/communication/rules-auto/slash-command-routing-policy-mechanics.md +2 -2
- package/dist/agent-src/contexts/communication/rules-auto/think-before-action-mechanics.md +6 -6
- package/dist/agent-src/contexts/contracts/consumer-agents-md-guide.md +2 -2
- package/dist/agent-src/contexts/execution/rdp-gate.md +75 -0
- package/dist/agent-src/contexts/subagent-configuration.md +1 -0
- package/dist/agent-src/personas/advisors/contrarian.md +1 -1
- package/dist/agent-src/personas/advisors/executor.md +1 -1
- package/dist/agent-src/personas/advisors/expansionist.md +1 -1
- package/dist/agent-src/personas/advisors/first-principles.md +1 -1
- package/dist/agent-src/personas/advisors/outsider.md +1 -1
- package/dist/agent-src/rules/autonomous-execution.md +12 -0
- package/dist/agent-src/rules/external-reference-deep-dive.md +1 -1
- package/dist/agent-src/rules/git-history-discipline.md +47 -1
- package/dist/agent-src/rules/improve-before-implement.md +12 -0
- package/dist/agent-src/rules/lethal-trifecta-guard.md +80 -0
- package/dist/agent-src/rules/no-pr-progress-comments.md +3 -4
- package/dist/agent-src/rules/notes-first-reasoning.md +71 -0
- package/dist/agent-src/rules/roadmap-progress-sync.md +48 -31
- package/dist/agent-src/rules/security-sensitive-stop.md +14 -1
- package/dist/agent-src/rules/source-confidentiality.md +97 -0
- package/dist/agent-src/rules/think-before-action.md +9 -1
- package/dist/agent-src/rules/untrusted-input-defense.md +76 -0
- package/dist/agent-src/scripts/archive_completed_roadmaps.py +171 -0
- package/dist/agent-src/skills/adversarial-review/SKILL.md +14 -0
- package/dist/agent-src/skills/agent-security-review/SKILL.md +113 -0
- package/dist/agent-src/skills/agent-security-review/evals/triggers.json +51 -0
- package/dist/agent-src/skills/ai-council/SKILL.md +3 -3
- package/dist/agent-src/skills/async-python-patterns/SKILL.md +1 -1
- package/dist/agent-src/skills/blast-radius-analyzer/SKILL.md +12 -11
- package/dist/agent-src/skills/command-routing/SKILL.md +1 -1
- package/dist/agent-src/skills/complexity-first-planning/SKILL.md +96 -0
- package/dist/agent-src/skills/complexity-first-planning/evals/triggers.json +16 -0
- package/dist/agent-src/skills/copilot-config/SKILL.md +3 -4
- package/dist/agent-src/skills/defense-in-depth/SKILL.md +1 -1
- package/dist/agent-src/skills/developer-like-execution/SKILL.md +5 -4
- package/dist/agent-src/skills/error-handling-patterns/SKILL.md +1 -1
- package/dist/agent-src/skills/feature-planning/SKILL.md +2 -2
- package/dist/agent-src/skills/mcp-builder/SKILL.md +1 -1
- package/dist/agent-src/skills/memory-consolidation/SKILL.md +63 -17
- package/dist/agent-src/skills/prompt-engineering-patterns/SKILL.md +1 -1
- package/dist/agent-src/skills/readme-writing-package/SKILL.md +1 -1
- package/dist/agent-src/skills/reasoning-orchestrator/SKILL.md +119 -0
- package/dist/agent-src/skills/reasoning-orchestrator/evals/triggers.json +16 -0
- package/dist/agent-src/skills/receiving-code-review/SKILL.md +6 -6
- package/dist/agent-src/skills/refine-prompt/SKILL.md +1 -1
- package/dist/agent-src/skills/refine-ticket/SKILL.md +1 -1
- package/dist/agent-src/skills/repomix-packer/SKILL.md +1 -1
- package/dist/agent-src/skills/secrets-management/SKILL.md +1 -1
- package/dist/agent-src/skills/subagent-orchestration/SKILL.md +10 -3
- package/dist/agent-src/skills/testing-anti-patterns/SKILL.md +1 -1
- package/dist/agent-src/skills/testing-anti-patterns/process-anti-patterns.md +1 -1
- package/dist/agent-src/skills/token-optimizer/SKILL.md +1 -1
- package/dist/agent-src/templates/agents/.gitattributes.fragment +0 -1
- package/dist/agent-src/templates/agents/agent-project-settings.example.yml +4 -4
- package/dist/agent-src/templates/scripts/check_memory.py +1 -2
- package/dist/agent-src/templates/scripts/check_memory_proposal.py +1 -1
- package/dist/agent-src/templates/scripts/memory_lookup.py +148 -289
- package/dist/agent-src/templates/scripts/memory_report.py +132 -2
- package/dist/agent-src/templates/scripts/memory_signal.py +7 -9
- package/dist/agent-src/templates/scripts/memory_status.py +25 -206
- package/dist/agent-src/templates/scripts/work_engine/directives/backend/memory.py +6 -6
- package/dist/agent-src/templates/scripts/work_engine/directives/ui/_passthrough.py +3 -3
- package/dist/agent-src/templates/scripts/work_engine/scoring/memory_visibility.py +0 -1
- package/dist/cli/agent-config.js +31 -300
- package/dist/cli/agent-config.js.map +1 -1
- package/dist/cli/commands/commands.js +10 -5
- package/dist/cli/commands/commands.js.map +1 -1
- package/dist/cli/discovery/loadManifest.js.map +1 -1
- package/dist/cli/main.js +309 -0
- package/dist/cli/main.js.map +1 -0
- package/dist/discovery/deprecation-report.md +1 -1
- package/dist/discovery/discovery-manifest.json +645 -342
- package/dist/discovery/discovery-manifest.json.sha256 +1 -1
- package/dist/discovery/discovery-manifest.summary.md +8 -5
- package/dist/discovery/orphan-report.md +1 -1
- package/dist/discovery/packs.json +149 -37
- package/dist/discovery/trust-report.md +3 -3
- package/dist/discovery/workspaces.json +61 -36
- package/dist/mcp/registry-manifest.json +4 -4
- package/dist/router.json +1 -1
- package/dist/server/routes/wizard.js +4 -3
- package/dist/server/routes/wizard.js.map +1 -1
- package/dist/server/schemas/settings.js +18 -0
- package/dist/server/schemas/settings.js.map +1 -1
- package/docs/MIGRATION.md +1 -1
- package/docs/adrs/cost/0001-hard-stop-hook.md +5 -5
- package/docs/adrs/memory/0001-consumer-side-snapshot.md +15 -7
- package/docs/adrs/memory/README.md +6 -5
- package/docs/adrs/router/0001-three-tier-routing.md +2 -2
- package/docs/adrs/schema/0001-json-schema-frontmatter.md +2 -2
- package/docs/adrs/smoke/0001-per-tier-smoke-scripts.md +5 -5
- package/docs/adrs/telegraph/0001-default-off-until-bench.md +3 -3
- package/docs/architecture.md +9 -9
- package/docs/archive/CHANGELOG-pre-2.2.0.md +30 -30
- package/docs/archive/CHANGELOG-pre-2.25.0.md +1 -1
- package/docs/archive/CHANGELOG-pre-4.5.0.md +1 -1
- package/docs/archive/CHANGELOG-pre-6.0.0.md +473 -0
- package/docs/benchmark.md +54 -53
- package/docs/benchmarks.md +2 -2
- package/docs/case-studies/{frontend-design-vs-ui-ux-pro-max.md → frontend-design-positioning.md} +4 -4
- package/docs/catalog.md +20 -13
- package/docs/command-flows.md +90 -92
- package/docs/contracts/adr-layout.md +2 -3
- package/docs/contracts/adr-level-6-productization.md +1 -1
- package/docs/contracts/ai-council-config.md +42 -7
- package/docs/contracts/command-clusters.md +1 -1
- package/docs/contracts/cost-enforcement.md +1 -1
- package/docs/contracts/cost-summary-schema.md +1 -1
- package/docs/contracts/daily-workspace.md +1 -0
- package/docs/contracts/discovery-manifest.schema.json +4 -2
- package/docs/contracts/explain-modes.md +1 -1
- package/docs/contracts/implement-ticket-flow.md +6 -7
- package/docs/contracts/mcp-tool-inventory.md +10 -10
- package/docs/contracts/measurement-baseline.md +1 -1
- package/docs/contracts/memory-visibility-v1.md +1 -5
- package/docs/contracts/namespace.md +1 -1
- package/docs/contracts/persona-schema.md +1 -1
- package/docs/contracts/rule-interactions.md +1 -1
- package/docs/contracts/smoke-contracts.md +1 -1
- package/docs/contracts/universal-skills.md +0 -1
- package/docs/contracts/workspace-boundary.md +84 -0
- package/docs/customization.md +3 -3
- package/docs/decisions/ADR-009-event4u-namespace.md +1 -1
- package/docs/decisions/ADR-013-discovery-frontmatter-contract.md +1 -1
- package/docs/decisions/ADR-026-explain-mode-translation.md +1 -1
- package/docs/decisions/ADR-088-no-external-runtime-federation.md +26 -27
- package/docs/decisions/ADR-090-visibility-command-frontmatter-field.md +95 -0
- package/docs/decisions/ADR-091-split-meta-capability-packs.md +113 -0
- package/docs/decisions/ADR-092-defer-command-tier-alias-removal.md +93 -0
- package/docs/decisions/ADR-093-ai-council-config-user-global.md +111 -0
- package/docs/decisions/ADR-094-agent-memory-layer-removal.md +94 -0
- package/docs/decisions/ADR-095-workspace-boundary-contract.md +108 -0
- package/docs/decisions/INDEX.md +6 -0
- package/docs/development.md +5 -7
- package/docs/getting-started.md +4 -4
- package/docs/guidelines/agent-infra/5w2h-analysis.md +1 -1
- package/docs/guidelines/agent-infra/comparison-matrix.md +1 -1
- package/docs/guidelines/agent-infra/corpus-grounding-authoring.md +1 -1
- package/docs/guidelines/agent-infra/critical-thinking.md +1 -1
- package/docs/guidelines/agent-infra/engineering-memory-data-format.md +1 -5
- package/docs/guidelines/agent-infra/first-principles.md +1 -1
- package/docs/guidelines/agent-infra/frontier-reasoning-operating-profile.md +164 -0
- package/docs/guidelines/agent-infra/inversion-thinking.md +1 -1
- package/docs/guidelines/agent-infra/ios-simulator-guide.md +9 -14
- package/docs/guidelines/agent-infra/mcp-request-signing.md +19 -22
- package/docs/guidelines/agent-infra/memory-access.md +25 -31
- package/docs/guidelines/agent-infra/mental-models.md +1 -1
- package/docs/guidelines/agent-infra/model-recommendation.md +29 -0
- package/docs/guidelines/agent-infra/scqa-framework.md +3 -3
- package/docs/guidelines/agent-infra/security-lint-containment.md +81 -0
- package/docs/guidelines/agent-infra/six-hats.md +1 -1
- package/docs/guidelines/agent-infra/systems-thinking.md +1 -1
- package/docs/guidelines/agent-infra/untrusted-input-spotlighting.md +72 -0
- package/docs/installation.md +1 -1
- package/docs/mcp.md +2 -2
- package/docs/parity/{bench-ruflo.json → bench-external.json} +10 -10
- package/docs/parity/{ruflo.md → external-runtime.md} +9 -9
- package/docs/quality.md +3 -3
- package/docs/safety.md +3 -3
- package/docs/skills-catalog.md +4 -1
- package/llms.txt +3 -0
- package/package.json +1 -1
- package/src/config/agent-settings.template.yml +65 -3
- package/src/config/discovery/packs.yml +29 -0
- package/src/config/discovery/workspaces.yml +3 -1
- package/src/config/gitignore-block.txt +6 -0
- package/src/scripts/__pycache__/validate_frontmatter.cpython-312.pyc +0 -0
- package/src/scripts/_cli/cmd_doctor.py +99 -13
- package/src/scripts/_lib/__pycache__/__init__.cpython-312.pyc +0 -0
- package/src/scripts/_lib/__pycache__/agent_src.cpython-312.pyc +0 -0
- package/src/scripts/_lib/bench_ab_scoring_v2.py +227 -0
- package/src/scripts/_lib/global_deploy_inventory.py +39 -9
- package/src/scripts/_lib/link_crypto.py +206 -0
- package/src/scripts/_lib/security_lint.py +228 -0
- package/src/scripts/ai_council/clients.py +2 -2
- package/src/scripts/ai_council/config.py +55 -0
- package/src/scripts/audit_adr_coverage.py +0 -2
- package/src/scripts/audit_command_surface.py +18 -5
- package/src/scripts/audit_mcp_tools.py +2 -2
- package/src/scripts/audit_skill_descriptions.py +2 -2
- package/src/scripts/bench_ab_clone.py +62 -12
- package/src/scripts/bench_ab_task_runner.py +475 -30
- package/src/scripts/bench_ab_v2_run.py +247 -0
- package/src/scripts/bench_ab_v2_stats.py +347 -0
- package/src/scripts/bench_run.py +1 -1
- package/src/scripts/build_discovery_manifest.py +10 -0
- package/src/scripts/check_bite_sized_granularity.py +1 -2
- package/src/scripts/check_memory.py +49 -63
- package/src/scripts/check_memory_proposal.py +1 -1
- package/src/scripts/check_no_external_sources.py +101 -0
- package/src/scripts/check_references.py +2 -0
- package/src/scripts/cost_by_conversation.py +1 -1
- package/src/scripts/council_cli.py +28 -14
- package/src/scripts/external_sources_denylist.json +91 -0
- package/src/scripts/hook_manifest.yaml +14 -6
- package/src/scripts/injection_scan_hook.py +145 -0
- package/src/scripts/install-hooks.sh +11 -0
- package/src/scripts/install.py +88 -13
- package/src/scripts/lint_agent_security.py +112 -0
- package/src/scripts/lint_bench_ab.py +5 -4
- package/src/scripts/lint_command_tiers.py +63 -22
- package/src/scripts/lint_discovery_vocabulary.py +2 -0
- package/src/scripts/lint_empty_roadmaps.py +80 -0
- package/src/scripts/lint_hidden_unicode.py +132 -0
- package/src/scripts/lint_instruction_smuggling.py +107 -0
- package/src/scripts/lint_marketplace.py +1 -1
- package/src/scripts/lint_mcp_config_security.py +124 -0
- package/src/scripts/lint_skill_frontmatter_safety.py +144 -0
- package/src/scripts/lint_workspace_boundary.py +122 -0
- package/src/scripts/mcp_server/consumer_tool_catalog.json +2 -3
- package/src/scripts/mcp_server/tools.py +8 -32
- package/src/scripts/memory_lookup.py +27 -296
- package/src/scripts/memory_report.py +1 -23
- package/src/scripts/memory_signal.py +6 -53
- package/src/scripts/memory_status.py +25 -206
- package/src/scripts/mine_session.py +118 -41
- package/src/scripts/pack_dependency_allowlist.json +2 -2
- package/src/scripts/render_benchmark_md.py +141 -52
- package/src/scripts/schemas/command.schema.json +6 -1
- package/src/scripts/security_audit_config.py +153 -0
- package/dist/agent-src/commands/chat-history/learn.md +0 -184
- package/dist/agent-src/commands/chat-history/show.md +0 -113
- package/dist/agent-src/commands/fix/pr-bot-comments.md +0 -157
- package/dist/agent-src/commands/fix/pr-developer-comments.md +0 -163
- package/dist/agent-src/templates/agents/memory/architecture-decisions.example.yml +0 -95
- package/docs/contracts/agent-memory-contract.md +0 -159
|
@@ -38,6 +38,7 @@ import os
|
|
|
38
38
|
import shutil
|
|
39
39
|
import subprocess
|
|
40
40
|
import sys
|
|
41
|
+
import threading
|
|
41
42
|
import time
|
|
42
43
|
from datetime import datetime, timezone
|
|
43
44
|
from pathlib import Path
|
|
@@ -61,6 +62,46 @@ REPORTS_DIR = REPO_ROOT / "internal" / "bench" / "reports" / "ab"
|
|
|
61
62
|
# How far we descend into a clone when snapshotting. The fixture is shallow.
|
|
62
63
|
SNAPSHOT_MAX_DEPTH = 6
|
|
63
64
|
|
|
65
|
+
# --- Activation (proven mechanism) ---
|
|
66
|
+
# agent-config is a GLOBAL Claude Code plugin (enabledPlugins in ~/.claude
|
|
67
|
+
# settings), so plain `claude --print` already runs WITH the package. The clean
|
|
68
|
+
# control is `--setting-sources project,local`, which excludes the user settings
|
|
69
|
+
# where `enabledPlugins` lives → plugin OFF, but auth survives. Measured proof:
|
|
70
|
+
# plain --print = ~35.5k input tokens; --setting-sources project,local = ~11.9k
|
|
71
|
+
# → the ~24k delta IS the package's always-on footprint. So:
|
|
72
|
+
# without = `--setting-sources project,local` (plugin OFF, base model)
|
|
73
|
+
# with = plain `--print` (the real installed plugin = package)
|
|
74
|
+
# with-rdp = plain `--print` + RDP rules injected (RDP not yet in the release plugin)
|
|
75
|
+
# (`--bare` is NOT used — it disables auth too.)
|
|
76
|
+
# Rule/context files whose text is injected for the `with-rdp` variant.
# NOTE(review): the first entry sits under src/rules/ while the second sits
# under src/agent-src/ — confirm the first path is not missing an
# "agent-src" segment; a path that does not exist is filtered out silently
# when the prompt is built, so a typo here would go unnoticed.
RDP_EXTRA_FILES = (
    REPO_ROOT / "src" / "rules" / "notes-first-reasoning.md",
    REPO_ROOT / "src" / "agent-src" / "contexts" / "execution" / "rdp-gate.md",
)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _concat_rules(paths) -> str:
    """Join the text of every readable rule file into one string.

    Args:
        paths: Iterable of path objects exposing ``read_text`` (the visible
            caller passes ``pathlib.Path`` entries).

    Returns:
        The UTF-8 contents of the readable files, in input order, separated
        by a ``---`` divider; ``""`` when nothing could be read.
    """
    parts: list[str] = []
    for p in paths:
        try:
            parts.append(p.read_text(encoding="utf-8"))
        except (OSError, UnicodeDecodeError):
            # Best-effort: a missing, unreadable, or non-UTF-8 file is
            # skipped instead of raising out of the helper.
            continue
    return "\n\n---\n\n".join(parts)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def system_prompt_for(variant: str) -> str | None:
|
|
93
|
+
"""Extra rules injected on top of the plugin. Only `with-rdp` injects (the RDP
|
|
94
|
+
artifacts aren't in the released plugin yet); `with` uses the real plugin,
|
|
95
|
+
`without` runs plugin-off."""
|
|
96
|
+
if variant == "with-rdp":
|
|
97
|
+
return _concat_rules([p for p in RDP_EXTRA_FILES if p.exists()])
|
|
98
|
+
return None
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def setting_sources_for(variant: str) -> str | None:
|
|
102
|
+
"""`without` excludes user settings to drop the global plugin (auth survives)."""
|
|
103
|
+
return "project,local" if variant == "without" else None
|
|
104
|
+
|
|
64
105
|
|
|
65
106
|
def utc_stamp() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH-MM-SSZ``."""
    now_utc = datetime.now(tz=timezone.utc)
    stamp_format = "%Y-%m-%dT%H-%M-%SZ"
    return now_utc.strftime(stamp_format)
|
|
@@ -106,7 +147,7 @@ def reset_clone(variant: str) -> Path:
|
|
|
106
147
|
raise RuntimeError("cannot load bench_ab_clone helper")
|
|
107
148
|
module = importlib.util.module_from_spec(spec)
|
|
108
149
|
spec.loader.exec_module(module)
|
|
109
|
-
return module.clone(variant, refresh=True) # type: ignore[attr-defined]
|
|
150
|
+
return module.clone(variant, refresh=True, quiet=True) # type: ignore[attr-defined]
|
|
110
151
|
|
|
111
152
|
|
|
112
153
|
def claude_executable() -> str | None:
|
|
@@ -114,13 +155,28 @@ def claude_executable() -> str | None:
|
|
|
114
155
|
override = os.environ.get("CLAUDE_CLI")
|
|
115
156
|
if override:
|
|
116
157
|
return override
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
158
|
+
# Resolve to an absolute path so the subprocess (run with cwd=clone_root)
|
|
159
|
+
# cannot miss it on a PATH/cwd quirk — the failure that showed up as a
|
|
160
|
+
# spurious "claude CLI not found" on a later arm of the first full run.
|
|
161
|
+
return shutil.which("claude")
|
|
120
162
|
|
|
121
163
|
|
|
122
|
-
def run_live(
|
|
123
|
-
|
|
164
|
+
def run_live(
|
|
165
|
+
task: dict,
|
|
166
|
+
clone_root: Path,
|
|
167
|
+
*,
|
|
168
|
+
timeout_s: int,
|
|
169
|
+
sysprompt_file: "Path | None" = None,
|
|
170
|
+
setting_sources: "str | None" = None,
|
|
171
|
+
max_budget: "float | None" = None,
|
|
172
|
+
model: "str | None" = None,
|
|
173
|
+
) -> dict:
|
|
174
|
+
"""Invoke claude in print/one-shot mode against the task prompt.
|
|
175
|
+
|
|
176
|
+
`setting_sources` (e.g. "project,local") drops the global plugin for the
|
|
177
|
+
`without` arm while keeping auth. `sysprompt_file` injects extra rules
|
|
178
|
+
(the `with-rdp` arm). `with` passes neither → the real installed plugin.
|
|
179
|
+
"""
|
|
124
180
|
binary = claude_executable()
|
|
125
181
|
if binary is None:
|
|
126
182
|
return {
|
|
@@ -129,9 +185,33 @@ def run_live(task: dict, clone_root: Path, *, timeout_s: int) -> dict:
|
|
|
129
185
|
"transcript": "",
|
|
130
186
|
"exit_code": None,
|
|
131
187
|
"wall_time_seconds": 0.0,
|
|
188
|
+
"tokens": 0,
|
|
189
|
+
"tokens_breakdown": {},
|
|
190
|
+
"errored": True,
|
|
132
191
|
}
|
|
133
192
|
prompt = task.get("prompt", "")
|
|
134
|
-
|
|
193
|
+
# --output-format json yields a `usage` block for token counts. The global
|
|
194
|
+
# plugin is dropped per-arm via --setting-sources (NOT --bare, which kills auth).
|
|
195
|
+
# bypassPermissions on EVERY arm: the clone is a throwaway fixture, and this
|
|
196
|
+
# equalizes file-edit capability across arms (else `without`, which excludes
|
|
197
|
+
# user settings, would lack edit perms and fail tasks for the wrong reason).
|
|
198
|
+
cmd = [binary, "--print", "--output-format", "json", "--permission-mode", "bypassPermissions"]
|
|
199
|
+
if model:
|
|
200
|
+
# Pin ONE model across every arm. The session default here is Opus-4.8-1M,
|
|
201
|
+
# whose ~$1.78 first-turn cache-creation trips any sane budget cap instantly
|
|
202
|
+
# and makes a full corpus run blow the account quota. Holding the model
|
|
203
|
+
# constant is also a validity requirement: the bench measures the package
|
|
204
|
+
# LIFT on a fixed host, not model-vs-model.
|
|
205
|
+
cmd += ["--model", model]
|
|
206
|
+
if max_budget:
|
|
207
|
+
# Caps per-task API spend so one runaway agentic loop can't exhaust the
|
|
208
|
+
# account quota (the failure mode that starved later arms on the first run).
|
|
209
|
+
cmd += ["--max-budget-usd", str(max_budget)]
|
|
210
|
+
if setting_sources:
|
|
211
|
+
cmd += ["--setting-sources", setting_sources]
|
|
212
|
+
if sysprompt_file is not None:
|
|
213
|
+
cmd += ["--append-system-prompt-file", str(sysprompt_file)]
|
|
214
|
+
cmd += ["--", prompt]
|
|
135
215
|
started = time.monotonic()
|
|
136
216
|
try:
|
|
137
217
|
proc = subprocess.run(
|
|
@@ -149,14 +229,82 @@ def run_live(task: dict, clone_root: Path, *, timeout_s: int) -> dict:
|
|
|
149
229
|
"transcript": (exc.stdout or "") + "\n[TIMEOUT]",
|
|
150
230
|
"exit_code": -1,
|
|
151
231
|
"wall_time_seconds": round(time.monotonic() - started, 3),
|
|
232
|
+
"tokens": 0,
|
|
233
|
+
"tokens_breakdown": {},
|
|
234
|
+
"errored": True,
|
|
152
235
|
}
|
|
153
236
|
duration = time.monotonic() - started
|
|
237
|
+
# Parse the JSON envelope: `result` is the model text; `usage` holds tokens.
|
|
238
|
+
transcript = proc.stdout
|
|
239
|
+
tokens = 0
|
|
240
|
+
is_error = False
|
|
241
|
+
err_reason = "ok"
|
|
242
|
+
num_turns = 0
|
|
243
|
+
subtype = ""
|
|
244
|
+
breakdown = {
|
|
245
|
+
"input_tokens": 0,
|
|
246
|
+
"output_tokens": 0,
|
|
247
|
+
"cache_read_input_tokens": 0,
|
|
248
|
+
"cache_creation_input_tokens": 0,
|
|
249
|
+
}
|
|
250
|
+
try:
|
|
251
|
+
obj = json.loads(proc.stdout)
|
|
252
|
+
is_error = bool(obj.get("is_error"))
|
|
253
|
+
transcript = obj.get("result") or obj.get("text") or proc.stdout
|
|
254
|
+
usage = obj.get("usage") or {}
|
|
255
|
+
breakdown = {
|
|
256
|
+
k: int(usage.get(k, 0) or 0)
|
|
257
|
+
for k in (
|
|
258
|
+
"input_tokens",
|
|
259
|
+
"output_tokens",
|
|
260
|
+
"cache_read_input_tokens",
|
|
261
|
+
"cache_creation_input_tokens",
|
|
262
|
+
)
|
|
263
|
+
}
|
|
264
|
+
tokens = sum(breakdown.values())
|
|
265
|
+
# The top-level `usage` block is zeroed on a budget-capped / errored run
|
|
266
|
+
# (and unreliable even on some completions). `modelUsage` carries the
|
|
267
|
+
# authoritative per-model counts — sum it as the fallback so token deltas
|
|
268
|
+
# survive even when a task hits its cap mid-flight.
|
|
269
|
+
if tokens == 0:
|
|
270
|
+
mu = obj.get("modelUsage") or {}
|
|
271
|
+
agg = {
|
|
272
|
+
"input_tokens": 0,
|
|
273
|
+
"output_tokens": 0,
|
|
274
|
+
"cache_read_input_tokens": 0,
|
|
275
|
+
"cache_creation_input_tokens": 0,
|
|
276
|
+
}
|
|
277
|
+
for stats in mu.values():
|
|
278
|
+
agg["input_tokens"] += int(stats.get("inputTokens", 0) or 0)
|
|
279
|
+
agg["output_tokens"] += int(stats.get("outputTokens", 0) or 0)
|
|
280
|
+
agg["cache_read_input_tokens"] += int(
|
|
281
|
+
stats.get("cacheReadInputTokens", 0) or 0
|
|
282
|
+
)
|
|
283
|
+
agg["cache_creation_input_tokens"] += int(
|
|
284
|
+
stats.get("cacheCreationInputTokens", 0) or 0
|
|
285
|
+
)
|
|
286
|
+
mu_total = sum(agg.values())
|
|
287
|
+
if mu_total > 0:
|
|
288
|
+
breakdown = agg
|
|
289
|
+
tokens = mu_total
|
|
290
|
+
num_turns = int(obj.get("num_turns", 0) or 0)
|
|
291
|
+
subtype = str(obj.get("subtype") or "")
|
|
292
|
+
# Surface WHY a task errored (budget cap vs. other) without leaking $.
|
|
293
|
+
if is_error:
|
|
294
|
+
err_reason = obj.get("subtype") or "error"
|
|
295
|
+
except (json.JSONDecodeError, AttributeError, ValueError):
|
|
296
|
+
transcript = proc.stdout
|
|
154
297
|
return {
|
|
155
298
|
"mode": "live",
|
|
156
|
-
"reason": "ok",
|
|
157
|
-
"transcript":
|
|
299
|
+
"reason": err_reason if is_error else ("ok" if proc.returncode == 0 else f"exit {proc.returncode}"),
|
|
300
|
+
"transcript": str(transcript) + "\n" + proc.stderr,
|
|
158
301
|
"exit_code": proc.returncode,
|
|
159
302
|
"wall_time_seconds": round(duration, 3),
|
|
303
|
+
"tokens": tokens,
|
|
304
|
+
"tokens_breakdown": breakdown,
|
|
305
|
+
"errored": is_error or proc.returncode != 0,
|
|
306
|
+
"num_turns": num_turns,
|
|
307
|
+
"subtype": subtype,
|
|
160
308
|
}
|
|
161
309
|
|
|
162
310
|
|
|
@@ -198,22 +346,184 @@ def count_ask_events(transcript: str) -> dict[str, int]:
|
|
|
198
346
|
return {"asked": asked, "acted_with_commit": acted, "ratio": ratio}
|
|
199
347
|
|
|
200
348
|
|
|
349
|
+
PROGRESS_PATH = REPORTS_DIR / ".progress.json"
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def _write_progress(state: dict) -> None:
|
|
353
|
+
"""Mirror live state to .progress.json for `task bench:ab:watch` (best-effort)."""
|
|
354
|
+
try:
|
|
355
|
+
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
|
|
356
|
+
PROGRESS_PATH.write_text(json.dumps(state, indent=2) + "\n")
|
|
357
|
+
except OSError:
|
|
358
|
+
pass
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
class Progress:
|
|
362
|
+
"""Live per-task progress. stdlib-only, TTY-aware, log-safe.
|
|
363
|
+
|
|
364
|
+
style: auto (bar if stderr is a TTY, else one plain line per task) | bar |
|
|
365
|
+
plain | none. Mirrors state to .progress.json regardless of style.
|
|
366
|
+
"""
|
|
367
|
+
|
|
368
|
+
BAR_WIDTH = 24
|
|
369
|
+
|
|
370
|
+
def __init__(self, total: int, *, mode: str, style: str = "auto", stream=sys.stderr) -> None:
|
|
371
|
+
self.total = max(total, 1)
|
|
372
|
+
self.mode = mode
|
|
373
|
+
self.stream = stream
|
|
374
|
+
self.done = 0
|
|
375
|
+
self.started = time.monotonic()
|
|
376
|
+
if style in ("bar", "plain", "none"):
|
|
377
|
+
self.kind = style
|
|
378
|
+
else: # auto
|
|
379
|
+
self.kind = "bar" if getattr(stream, "isatty", lambda: False)() else "plain"
|
|
380
|
+
self._cur = ""
|
|
381
|
+
self._task_started = 0.0
|
|
382
|
+
self._hb_stop: "threading.Event | None" = None
|
|
383
|
+
self._hb_thread: "threading.Thread | None" = None
|
|
384
|
+
|
|
385
|
+
def _elapsed(self, since: float) -> str:
|
|
386
|
+
s = int(time.monotonic() - since)
|
|
387
|
+
return f"{s // 60}m{s % 60:02d}s" if s >= 60 else f"{s}s"
|
|
388
|
+
|
|
389
|
+
def _bar(self) -> str:
|
|
390
|
+
filled = int(self.BAR_WIDTH * self.done / self.total)
|
|
391
|
+
return "█" * filled + "░" * (self.BAR_WIDTH - filled)
|
|
392
|
+
|
|
393
|
+
def _render_bar(self, suffix: str = "") -> None:
|
|
394
|
+
line = f"\r[{self._bar()}] {self.done}/{self.total} · {self._cur} · {self._elapsed(self.started)}{suffix}"
|
|
395
|
+
self.stream.write(line.ljust(90)[:160])
|
|
396
|
+
self.stream.flush()
|
|
397
|
+
|
|
398
|
+
def _start_heartbeat(self) -> None:
|
|
399
|
+
if self.kind != "bar" or self.mode != "live":
|
|
400
|
+
return
|
|
401
|
+
self._hb_stop = threading.Event()
|
|
402
|
+
|
|
403
|
+
def _tick() -> None:
|
|
404
|
+
assert self._hb_stop is not None
|
|
405
|
+
while not self._hb_stop.wait(1.0):
|
|
406
|
+
self._render_bar(suffix=f" · {self._elapsed(self._task_started)}…")
|
|
407
|
+
|
|
408
|
+
self._hb_thread = threading.Thread(target=_tick, daemon=True)
|
|
409
|
+
self._hb_thread.start()
|
|
410
|
+
|
|
411
|
+
def _stop_heartbeat(self) -> None:
|
|
412
|
+
if self._hb_stop is not None:
|
|
413
|
+
self._hb_stop.set()
|
|
414
|
+
if self._hb_thread is not None:
|
|
415
|
+
self._hb_thread.join(timeout=2.0)
|
|
416
|
+
self._hb_stop = self._hb_thread = None
|
|
417
|
+
|
|
418
|
+
def start_task(self, variant: str, idx: int, count: int, task_id: str) -> None:
|
|
419
|
+
self._cur = f"{variant} {idx}/{count} · {task_id}"
|
|
420
|
+
self._task_started = time.monotonic()
|
|
421
|
+
_write_progress({
|
|
422
|
+
"mode": self.mode, "variant": variant, "task_idx": idx, "task_count": count,
|
|
423
|
+
"total_done": self.done, "total": self.total, "current_id": task_id,
|
|
424
|
+
"started_at": utc_stamp(), "last_result": None,
|
|
425
|
+
})
|
|
426
|
+
if self.kind == "none":
|
|
427
|
+
return
|
|
428
|
+
if self.kind == "bar":
|
|
429
|
+
self._render_bar(suffix=" · running…" if self.mode == "live" else "")
|
|
430
|
+
self._start_heartbeat()
|
|
431
|
+
elif self.mode == "live": # plain: a start marker so a long task isn't mistaken for a hang
|
|
432
|
+
self.stream.write(f"[{self.done + 1}/{self.total}] ▶ {self._cur}\n")
|
|
433
|
+
self.stream.flush()
|
|
434
|
+
|
|
435
|
+
def end_task(self, *, passed: bool, wall: float, variant: str, task_id: str) -> None:
|
|
436
|
+
self._stop_heartbeat()
|
|
437
|
+
self.done += 1
|
|
438
|
+
mark = "✓" if passed else "✗"
|
|
439
|
+
_write_progress({
|
|
440
|
+
"mode": self.mode, "variant": variant, "total_done": self.done,
|
|
441
|
+
"total": self.total, "current_id": task_id, "updated_at": utc_stamp(),
|
|
442
|
+
"last_result": "pass" if passed else "fail",
|
|
443
|
+
})
|
|
444
|
+
if self.kind == "none":
|
|
445
|
+
return
|
|
446
|
+
if self.kind == "bar":
|
|
447
|
+
self._render_bar(suffix=f" · {mark}")
|
|
448
|
+
else:
|
|
449
|
+
self.stream.write(f"[{self.done}/{self.total}] {mark} {variant} · {task_id} · {wall:.1f}s\n")
|
|
450
|
+
self.stream.flush()
|
|
451
|
+
|
|
452
|
+
def variant_done(self, line: str) -> None:
|
|
453
|
+
"""Print a per-variant summary line without corrupting an active bar."""
|
|
454
|
+
if self.kind == "bar":
|
|
455
|
+
self.stream.write("\n")
|
|
456
|
+
self.stream.write(line if line.endswith("\n") else line + "\n")
|
|
457
|
+
self.stream.flush()
|
|
458
|
+
|
|
459
|
+
def finish(self) -> None:
|
|
460
|
+
if self.kind == "bar":
|
|
461
|
+
self.stream.write("\n")
|
|
462
|
+
if self.kind != "none":
|
|
463
|
+
self.stream.write(
|
|
464
|
+
f"bench progress: {self.done}/{self.total} tasks · total {self._elapsed(self.started)}\n"
|
|
465
|
+
)
|
|
466
|
+
self.stream.flush()
|
|
467
|
+
|
|
468
|
+
|
|
201
469
|
def per_category_aggregate(per_task: list[dict]) -> dict[str, dict]:
|
|
202
470
|
by_cat: dict[str, list[dict]] = {}
|
|
203
471
|
for entry in per_task:
|
|
204
472
|
by_cat.setdefault(entry.get("category", "unknown"), []).append(entry)
|
|
205
473
|
out: dict[str, dict] = {}
|
|
206
474
|
for cat, entries in by_cat.items():
|
|
207
|
-
|
|
475
|
+
done = [e for e in entries if not e.get("errored")]
|
|
476
|
+
passed = sum(1 for e in done if e.get("score", {}).get("passed"))
|
|
208
477
|
total = len(entries)
|
|
478
|
+
completed = len(done)
|
|
209
479
|
out[cat] = {
|
|
210
480
|
"passed": passed,
|
|
211
481
|
"total": total,
|
|
212
|
-
"
|
|
482
|
+
"completed": completed,
|
|
483
|
+
"errored": total - completed,
|
|
484
|
+
"completion_rate": round(passed / completed, 4) if completed else 0,
|
|
213
485
|
"mean_wall_time": round(
|
|
214
|
-
sum(e.get("wall_time_seconds", 0) for e in
|
|
486
|
+
sum(e.get("wall_time_seconds", 0) for e in done) / completed, 3
|
|
215
487
|
)
|
|
216
|
-
if
|
|
488
|
+
if completed
|
|
489
|
+
else 0,
|
|
490
|
+
"mean_tokens": round(sum(e.get("tokens", 0) for e in done) / completed)
|
|
491
|
+
if completed
|
|
492
|
+
else 0,
|
|
493
|
+
}
|
|
494
|
+
return out
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def per_cell_aggregate(per_task: list[dict]) -> dict[str, dict]:
|
|
498
|
+
"""Aggregate by the 2×2 (duration × cognitive) cell — the value-benchmark axis.
|
|
499
|
+
|
|
500
|
+
Compared across conditions this answers "are short tasks more expensive?"
|
|
501
|
+
(cell `short/mechanical`) and "do long tasks get cheaper / better?"
|
|
502
|
+
(cell `long/reasoning-heavy`). Cell key is `"<duration>/<cognitive>"`.
|
|
503
|
+
"""
|
|
504
|
+
by_cell: dict[str, list[dict]] = {}
|
|
505
|
+
for entry in per_task:
|
|
506
|
+
cell = f"{entry.get('duration', 'untagged')}/{entry.get('cognitive', 'untagged')}"
|
|
507
|
+
by_cell.setdefault(cell, []).append(entry)
|
|
508
|
+
out: dict[str, dict] = {}
|
|
509
|
+
for cell, entries in by_cell.items():
|
|
510
|
+
done = [e for e in entries if not e.get("errored")]
|
|
511
|
+
passed = sum(1 for e in done if e.get("score", {}).get("passed"))
|
|
512
|
+
total = len(entries)
|
|
513
|
+
completed = len(done)
|
|
514
|
+
out[cell] = {
|
|
515
|
+
"passed": passed,
|
|
516
|
+
"total": total,
|
|
517
|
+
"completed": completed,
|
|
518
|
+
"errored": total - completed,
|
|
519
|
+
"completion_rate": round(passed / completed, 4) if completed else 0,
|
|
520
|
+
"mean_wall_time": round(
|
|
521
|
+
sum(e.get("wall_time_seconds", 0) for e in done) / completed, 3
|
|
522
|
+
)
|
|
523
|
+
if completed
|
|
524
|
+
else 0,
|
|
525
|
+
"mean_tokens": round(sum(e.get("tokens", 0) for e in done) / completed)
|
|
526
|
+
if completed
|
|
217
527
|
else 0,
|
|
218
528
|
}
|
|
219
529
|
return out
|
|
@@ -233,22 +543,35 @@ def write_report(
|
|
|
233
543
|
target_shape_hash=bench_ab_cache.target_shape_hash(),
|
|
234
544
|
)
|
|
235
545
|
total = len(per_task)
|
|
236
|
-
|
|
546
|
+
done = [e for e in per_task if not e.get("errored")]
|
|
547
|
+
completed = len(done)
|
|
548
|
+
errored = total - completed
|
|
549
|
+
passed = sum(1 for e in done if e.get("score", {}).get("passed"))
|
|
237
550
|
results = {
|
|
238
551
|
"mode": mode,
|
|
239
|
-
|
|
552
|
+
# Hit-rate is over COMPLETED tasks only — errored (rate-limit / budget /
|
|
553
|
+
# timeout / CLI-fail) tasks are excluded so a transient quota trip does
|
|
554
|
+
# not read as a content failure of the package.
|
|
555
|
+
"completion_rate": round(passed / completed, 4) if completed else 0,
|
|
240
556
|
"passed": passed,
|
|
557
|
+
"completed": completed,
|
|
558
|
+
"errored": errored,
|
|
241
559
|
"total": total,
|
|
242
560
|
"per_category": per_category_aggregate(per_task),
|
|
561
|
+
"per_cell": per_cell_aggregate(per_task),
|
|
243
562
|
"mean_wall_time": round(
|
|
244
|
-
sum(e.get("wall_time_seconds", 0) for e in
|
|
563
|
+
sum(e.get("wall_time_seconds", 0) for e in done) / completed, 3
|
|
245
564
|
)
|
|
246
|
-
if
|
|
565
|
+
if completed
|
|
566
|
+
else 0,
|
|
567
|
+
"total_tokens": sum(e.get("tokens", 0) for e in done),
|
|
568
|
+
"mean_tokens": round(sum(e.get("tokens", 0) for e in done) / completed)
|
|
569
|
+
if completed
|
|
247
570
|
else 0,
|
|
248
571
|
"ask_vs_act_ratio": round(
|
|
249
|
-
sum(e.get("ask_events", {}).get("ratio", 0) for e in
|
|
572
|
+
sum(e.get("ask_events", {}).get("ratio", 0) for e in done) / completed, 3
|
|
250
573
|
)
|
|
251
|
-
if
|
|
574
|
+
if completed
|
|
252
575
|
else 0,
|
|
253
576
|
"per_task": per_task,
|
|
254
577
|
}
|
|
@@ -269,7 +592,7 @@ def write_report(
|
|
|
269
592
|
f"# Track B · {variant} · {mode}\n\n"
|
|
270
593
|
f"- Stamp: `{stamp}`\n"
|
|
271
594
|
f"- Completion rate: **{results['completion_rate'] * 100:.1f}%**"
|
|
272
|
-
f" ({passed}/{total})\n"
|
|
595
|
+
f" ({passed}/{completed} completed; {errored} errored of {total})\n"
|
|
273
596
|
f"- Mean wall-time: {results['mean_wall_time']}s\n"
|
|
274
597
|
f"- Ask vs. act ratio: {results['ask_vs_act_ratio']}\n"
|
|
275
598
|
f"\n## Per-category\n\n"
|
|
@@ -283,14 +606,43 @@ def write_report(
|
|
|
283
606
|
return path
|
|
284
607
|
|
|
285
608
|
|
|
286
|
-
def run_variant(
|
|
609
|
+
def run_variant(
|
|
610
|
+
variant: str,
|
|
611
|
+
tasks: list[dict],
|
|
612
|
+
*,
|
|
613
|
+
mode: str,
|
|
614
|
+
timeout_s: int,
|
|
615
|
+
max_budget: "float | None" = None,
|
|
616
|
+
model: "str | None" = None,
|
|
617
|
+
progress: "Progress | None" = None,
|
|
618
|
+
) -> dict:
|
|
287
619
|
started = time.monotonic()
|
|
620
|
+
# Build the injected rule corpus once per variant (live only).
|
|
621
|
+
sp_file: "Path | None" = None
|
|
622
|
+
if mode == "live":
|
|
623
|
+
sp_text = system_prompt_for(variant)
|
|
624
|
+
if sp_text:
|
|
625
|
+
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
|
|
626
|
+
sp_file = REPORTS_DIR / f".sysprompt-{variant}.txt"
|
|
627
|
+
sp_file.write_text(sp_text, encoding="utf-8")
|
|
288
628
|
per_task: list[dict] = []
|
|
289
|
-
for task in tasks:
|
|
290
|
-
|
|
629
|
+
for i, task in enumerate(tasks):
|
|
630
|
+
if progress is not None:
|
|
631
|
+
progress.start_task(variant, i + 1, len(tasks), str(task.get("id")))
|
|
632
|
+
# Fixture-only working dir, identical for every arm — the package is NOT
|
|
633
|
+
# in the clone files; activation is the injected system prompt (sp_file).
|
|
634
|
+
clone_root = reset_clone("without")
|
|
291
635
|
pre = snapshot_clone(clone_root)
|
|
292
636
|
if mode == "live":
|
|
293
|
-
run_result = run_live(
|
|
637
|
+
run_result = run_live(
|
|
638
|
+
task,
|
|
639
|
+
clone_root,
|
|
640
|
+
timeout_s=timeout_s,
|
|
641
|
+
sysprompt_file=sp_file,
|
|
642
|
+
setting_sources=setting_sources_for(variant),
|
|
643
|
+
max_budget=max_budget,
|
|
644
|
+
model=model,
|
|
645
|
+
)
|
|
294
646
|
else:
|
|
295
647
|
run_result = run_dry(task, clone_root, variant)
|
|
296
648
|
post = snapshot_clone(clone_root)
|
|
@@ -305,21 +657,42 @@ def run_variant(variant: str, tasks: list[dict], *, mode: str, timeout_s: int) -
|
|
|
305
657
|
{
|
|
306
658
|
"id": task.get("id"),
|
|
307
659
|
"category": task.get("category"),
|
|
660
|
+
"duration": task.get("duration"),
|
|
661
|
+
"cognitive": task.get("cognitive"),
|
|
308
662
|
"score": score,
|
|
663
|
+
# `errored` = the run did not complete on merit (rate-limit,
|
|
664
|
+
# budget-cap, timeout, CLI failure). Distinct from a content
|
|
665
|
+
# fail (`score.passed == False`). Errored tasks are excluded
|
|
666
|
+
# from the hit-rate so a transient quota trip can't masquerade
|
|
667
|
+
# as the package "not working".
|
|
668
|
+
"errored": bool(run_result.get("errored", False)),
|
|
309
669
|
"wall_time_seconds": run_result.get("wall_time_seconds", 0.0),
|
|
670
|
+
"tokens": run_result.get("tokens", 0),
|
|
671
|
+
"tokens_breakdown": run_result.get("tokens_breakdown", {}),
|
|
310
672
|
"exit_code": run_result.get("exit_code"),
|
|
311
673
|
"mode": run_result.get("mode", mode),
|
|
312
674
|
"reason": run_result.get("reason", ""),
|
|
313
675
|
"ask_events": count_ask_events(run_result.get("transcript", "")),
|
|
314
676
|
}
|
|
315
677
|
)
|
|
678
|
+
if progress is not None:
|
|
679
|
+
progress.end_task(
|
|
680
|
+
passed=bool(score.get("passed")),
|
|
681
|
+
wall=float(run_result.get("wall_time_seconds", 0.0) or 0.0),
|
|
682
|
+
variant=variant,
|
|
683
|
+
task_id=str(task.get("id")),
|
|
684
|
+
)
|
|
316
685
|
duration = time.monotonic() - started
|
|
317
686
|
path = write_report(variant, mode=mode, per_task=per_task, duration=duration)
|
|
318
|
-
|
|
687
|
+
summary = (
|
|
319
688
|
f"bench_ab_task_runner: {variant} ({mode}) → "
|
|
320
689
|
f"{sum(1 for e in per_task if e['score']['passed'])}/{len(per_task)} "
|
|
321
|
-
f"passed — {path.relative_to(REPO_ROOT)}
|
|
690
|
+
f"passed — {path.relative_to(REPO_ROOT)}"
|
|
322
691
|
)
|
|
692
|
+
if progress is not None:
|
|
693
|
+
progress.variant_done(summary)
|
|
694
|
+
else:
|
|
695
|
+
sys.stdout.write(summary + "\n")
|
|
323
696
|
return {"path": path, "per_task": per_task, "duration": duration}
|
|
324
697
|
|
|
325
698
|
|
|
@@ -327,9 +700,10 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
|
|
|
327
700
|
parser = argparse.ArgumentParser(description="Run Track B tasks per variant.")
|
|
328
701
|
parser.add_argument(
|
|
329
702
|
"--variant",
|
|
330
|
-
choices=("with", "without", "both"),
|
|
703
|
+
choices=("with", "without", "with-rdp", "both", "all"),
|
|
331
704
|
default="both",
|
|
332
|
-
help="
|
|
705
|
+
help="with | without | with-rdp | both (=with+without, back-compat "
|
|
706
|
+
"default) | all (=the 3-condition value-benchmark set).",
|
|
333
707
|
)
|
|
334
708
|
parser.add_argument(
|
|
335
709
|
"--mode",
|
|
@@ -346,6 +720,48 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
|
|
|
346
720
|
default=120,
|
|
347
721
|
help="Live mode: per-task timeout in seconds (default 120).",
|
|
348
722
|
)
|
|
723
|
+
parser.add_argument(
|
|
724
|
+
"--progress",
|
|
725
|
+
choices=("auto", "bar", "plain", "none"),
|
|
726
|
+
default="auto",
|
|
727
|
+
help="Live display: auto (TTY→bar, else plain line-per-task) | bar | plain | none.",
|
|
728
|
+
)
|
|
729
|
+
parser.add_argument(
|
|
730
|
+
"--limit",
|
|
731
|
+
type=int,
|
|
732
|
+
default=0,
|
|
733
|
+
help="Run only the first N tasks per variant (0 = all). For cheap smoke tests.",
|
|
734
|
+
)
|
|
735
|
+
parser.add_argument(
|
|
736
|
+
"--tasks",
|
|
737
|
+
default="",
|
|
738
|
+
help=(
|
|
739
|
+
"Comma-separated task IDs to run (e.g. trackb-bugfix-01,trackb-refactor-01). "
|
|
740
|
+
"Overrides --limit. Use to span the 2×2 cells in a bounded run instead of "
|
|
741
|
+
"taking the first-N in file order."
|
|
742
|
+
),
|
|
743
|
+
)
|
|
744
|
+
parser.add_argument(
|
|
745
|
+
"--model",
|
|
746
|
+
default="claude-sonnet-4-6",
|
|
747
|
+
help=(
|
|
748
|
+
"Pin ONE model across all arms (live mode). Default claude-sonnet-4-6 — "
|
|
749
|
+
"capable enough to complete the coding tasks, ~2.3x cheaper per turn than "
|
|
750
|
+
"the Opus-4.8-1M session default whose cache-creation blows the quota. "
|
|
751
|
+
"Empty string = inherit the session default (expensive)."
|
|
752
|
+
),
|
|
753
|
+
)
|
|
754
|
+
parser.add_argument(
|
|
755
|
+
"--budget",
|
|
756
|
+
type=float,
|
|
757
|
+
default=2.0,
|
|
758
|
+
help=(
|
|
759
|
+
"Live mode: per-task API spend cap in USD (passed to "
|
|
760
|
+
"`claude --max-budget-usd`). Stops a runaway agentic loop from "
|
|
761
|
+
"exhausting the account quota and starving later arms. 0 = uncapped. "
|
|
762
|
+
"Default 2.0."
|
|
763
|
+
),
|
|
764
|
+
)
|
|
349
765
|
return parser.parse_args(argv)
|
|
350
766
|
|
|
351
767
|
|
|
@@ -359,9 +775,38 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
359
775
|
if not tasks:
|
|
360
776
|
sys.stderr.write("bench_ab_task_runner: corpus has no tasks\n")
|
|
361
777
|
return 1
|
|
362
|
-
|
|
778
|
+
if args.tasks.strip():
|
|
779
|
+
wanted = [s.strip() for s in args.tasks.split(",") if s.strip()]
|
|
780
|
+
by_id = {t.get("id"): t for t in tasks}
|
|
781
|
+
missing = [w for w in wanted if w not in by_id]
|
|
782
|
+
if missing:
|
|
783
|
+
sys.stderr.write(
|
|
784
|
+
f"bench_ab_task_runner: unknown task id(s): {', '.join(missing)}\n"
|
|
785
|
+
)
|
|
786
|
+
return 1
|
|
787
|
+
tasks = [by_id[w] for w in wanted]
|
|
788
|
+
elif args.limit and args.limit > 0:
|
|
789
|
+
tasks = tasks[: args.limit]
|
|
790
|
+
if args.variant == "both":
|
|
791
|
+
variants = ("with", "without")
|
|
792
|
+
elif args.variant == "all":
|
|
793
|
+
variants = ("with", "without", "with-rdp")
|
|
794
|
+
else:
|
|
795
|
+
variants = (args.variant,)
|
|
796
|
+
max_budget = args.budget if args.budget and args.budget > 0 else None
|
|
797
|
+
model = args.model or None
|
|
798
|
+
progress = Progress(len(variants) * len(tasks), mode=args.mode, style=args.progress)
|
|
363
799
|
for variant in variants:
|
|
364
|
-
run_variant(
|
|
800
|
+
run_variant(
|
|
801
|
+
variant,
|
|
802
|
+
tasks,
|
|
803
|
+
mode=args.mode,
|
|
804
|
+
timeout_s=args.timeout,
|
|
805
|
+
max_budget=max_budget,
|
|
806
|
+
model=model,
|
|
807
|
+
progress=progress,
|
|
808
|
+
)
|
|
809
|
+
progress.finish()
|
|
365
810
|
return 0
|
|
366
811
|
|
|
367
812
|
|