@nexus-cortex/cli 4.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cortex/agents/AGENT_PROFILE_GUIDE.md +307 -0
- package/.cortex/agents/README.md +268 -0
- package/.cortex/agents/a-frontend-landing-page-designer.md +41 -0
- package/.cortex/agents/autoresearch-agent.md +49 -0
- package/.cortex/agents/code-reviewer.md +63 -0
- package/.cortex/agents/context-research.md +26 -0
- package/.cortex/agents/doc-writer.md +92 -0
- package/.cortex/agents/explore.md +63 -0
- package/.cortex/agents/new-model-api-integrator-analyst.md +41 -0
- package/.cortex/agents/plan.md +109 -0
- package/.cortex/agents/pr-architecture-reviewer.md +77 -0
- package/.cortex/agents/pr-code-quality.md +78 -0
- package/.cortex/agents/pr-implementer.md +50 -0
- package/.cortex/agents/pr-security-auditor.md +62 -0
- package/.cortex/agents/pr-test-writer.md +67 -0
- package/.cortex/agents/refactor.md +118 -0
- package/.cortex/agents/test-writer.md +72 -0
- package/.cortex/agents/web-researcher.md +72 -0
- package/.cortex/bench/tasks/sample-tasks.json +20 -0
- package/.cortex/commands/compare.md +14 -0
- package/.cortex/commands/deps.md +16 -0
- package/.cortex/commands/diff.md +14 -0
- package/.cortex/commands/explain.md +16 -0
- package/.cortex/commands/find-bug.md +13 -0
- package/.cortex/commands/profile.md +15 -0
- package/.cortex/commands/review.md +18 -0
- package/.cortex/commands/search.md +16 -0
- package/.cortex/commands/test.md +15 -0
- package/.cortex/permissions.dev.json +20 -0
- package/.cortex/permissions.example.json +71 -0
- package/.cortex/permissions.prod.json +63 -0
- package/.cortex/permissions.test.json +19 -0
- package/.cortex/skills/autoresearch/SKILL.md +77 -0
- package/.cortex/skills/autoresearch/personas/README.md +45 -0
- package/.cortex/skills/autoresearch/personas/aggressive-refactor.md +25 -0
- package/.cortex/skills/autoresearch/personas/creative.md +29 -0
- package/.cortex/skills/autoresearch/personas/perf-hunter.md +27 -0
- package/.cortex/skills/autoresearch/personas/precise.md +23 -0
- package/.cortex/skills/autoresearch/personas/root-cause.md +26 -0
- package/.cortex/skills/autoresearch/personas/security-auditor.md +29 -0
- package/.cortex/skills/autoresearch/personas/skeptic-reviewer.md +31 -0
- package/.cortex/skills/autoresearch/personas/test-first.md +25 -0
- package/.cortex/skills/best-of-n/SKILL.md +76 -0
- package/.cortex/skills/cortex/SKILL.md +834 -0
- package/.cortex/skills/cortex-bench/SKILL.md +354 -0
- package/.cortex/skills/docx/SKILL.md +83 -0
- package/.cortex/skills/pdf-documents/SKILL.md +297 -0
- package/.cortex/skills/pdf-documents/sections/01-image-acquisition.md +132 -0
- package/.cortex/skills/pdf-documents/sections/02-ai-image-generation.md +274 -0
- package/.cortex/skills/pdf-documents/sections/03-paper-sizes.md +89 -0
- package/.cortex/skills/pdf-documents/sections/04-design-system.md +549 -0
- package/.cortex/skills/pdf-documents/sections/05-css-print-rules.md +135 -0
- package/.cortex/skills/pdf-documents/sections/06-svg-charts.md +100 -0
- package/.cortex/skills/pdf-documents/sections/07-templates.md +224 -0
- package/.cortex/skills/pdf-documents/sections/08-scaled-output.md +164 -0
- package/.cortex/skills/pdf-documents/sections/09-preview-qa.md +66 -0
- package/.cortex/skills/pdf-documents/sections/10-reading-pdfs.md +499 -0
- package/.cortex/skills/pdf-documents/sections/11-form-filling.md +241 -0
- package/.cortex/skills/pptx/SKILL.md +90 -0
- package/.cortex/skills/resume-analyst/SKILL.md +373 -0
- package/.cortex/skills/verify-work/SKILL.md +74 -0
- package/.cortex/skills/xlsx/SKILL.md +101 -0
- package/.cortex/system-messages/messages/WORK_QUALITY.md +159 -0
- package/.cortex/system-messages/registry.json +18 -0
- package/LICENSE +202 -0
- package/NOTICE +2 -0
- package/README.md +13 -0
- package/bin/cortex.js +548 -0
- package/dist/agent-mode.d.ts +21 -0
- package/dist/agent-mode.d.ts.map +1 -0
- package/dist/agent-mode.js +511 -0
- package/dist/agent-mode.js.map +1 -0
- package/dist/client/CortexClient.d.ts +84 -0
- package/dist/client/CortexClient.d.ts.map +1 -0
- package/dist/client/CortexClient.js +163 -0
- package/dist/client/CortexClient.js.map +1 -0
- package/dist/commands/artifact/list.d.ts +15 -0
- package/dist/commands/artifact/list.d.ts.map +1 -0
- package/dist/commands/artifact/list.js +89 -0
- package/dist/commands/artifact/list.js.map +1 -0
- package/dist/commands/artifact/restart.d.ts +13 -0
- package/dist/commands/artifact/restart.d.ts.map +1 -0
- package/dist/commands/artifact/restart.js +56 -0
- package/dist/commands/artifact/restart.js.map +1 -0
- package/dist/commands/artifact/status.d.ts +13 -0
- package/dist/commands/artifact/status.d.ts.map +1 -0
- package/dist/commands/artifact/status.js +100 -0
- package/dist/commands/artifact/status.js.map +1 -0
- package/dist/commands/artifact/stop.d.ts +13 -0
- package/dist/commands/artifact/stop.d.ts.map +1 -0
- package/dist/commands/artifact/stop.js +50 -0
- package/dist/commands/artifact/stop.js.map +1 -0
- package/dist/commands/autoresearch/bench.d.ts +32 -0
- package/dist/commands/autoresearch/bench.d.ts.map +1 -0
- package/dist/commands/autoresearch/bench.js +123 -0
- package/dist/commands/autoresearch/bench.js.map +1 -0
- package/dist/commands/autoresearch/commandRunner.d.ts +35 -0
- package/dist/commands/autoresearch/commandRunner.d.ts.map +1 -0
- package/dist/commands/autoresearch/commandRunner.js +91 -0
- package/dist/commands/autoresearch/commandRunner.js.map +1 -0
- package/dist/commands/autoresearch/evaluate.d.ts +18 -0
- package/dist/commands/autoresearch/evaluate.d.ts.map +1 -0
- package/dist/commands/autoresearch/evaluate.js +117 -0
- package/dist/commands/autoresearch/evaluate.js.map +1 -0
- package/dist/commands/autoresearch/experiment.d.ts +38 -0
- package/dist/commands/autoresearch/experiment.d.ts.map +1 -0
- package/dist/commands/autoresearch/experiment.js +168 -0
- package/dist/commands/autoresearch/experiment.js.map +1 -0
- package/dist/commands/autoresearch/fix.d.ts +10 -0
- package/dist/commands/autoresearch/fix.d.ts.map +1 -0
- package/dist/commands/autoresearch/fix.js +86 -0
- package/dist/commands/autoresearch/fix.js.map +1 -0
- package/dist/commands/autoresearch/harnessProcess.d.ts +48 -0
- package/dist/commands/autoresearch/harnessProcess.d.ts.map +1 -0
- package/dist/commands/autoresearch/harnessProcess.js +140 -0
- package/dist/commands/autoresearch/harnessProcess.js.map +1 -0
- package/dist/commands/autoresearch/list.d.ts +6 -0
- package/dist/commands/autoresearch/list.d.ts.map +1 -0
- package/dist/commands/autoresearch/list.js +38 -0
- package/dist/commands/autoresearch/list.js.map +1 -0
- package/dist/commands/autoresearch/loop.d.ts +26 -0
- package/dist/commands/autoresearch/loop.d.ts.map +1 -0
- package/dist/commands/autoresearch/loop.js +242 -0
- package/dist/commands/autoresearch/loop.js.map +1 -0
- package/dist/commands/cache/metrics.d.ts +13 -0
- package/dist/commands/cache/metrics.d.ts.map +1 -0
- package/dist/commands/cache/metrics.js +77 -0
- package/dist/commands/cache/metrics.js.map +1 -0
- package/dist/commands/chat/AgenticChat.d.ts +39 -0
- package/dist/commands/chat/AgenticChat.d.ts.map +1 -0
- package/dist/commands/chat/AgenticChat.js +201 -0
- package/dist/commands/chat/AgenticChat.js.map +1 -0
- package/dist/commands/chat/renderers/CodeRenderer.d.ts +36 -0
- package/dist/commands/chat/renderers/CodeRenderer.d.ts.map +1 -0
- package/dist/commands/chat/renderers/CodeRenderer.js +85 -0
- package/dist/commands/chat/renderers/CodeRenderer.js.map +1 -0
- package/dist/commands/chat/renderers/ToolRenderer.d.ts +30 -0
- package/dist/commands/chat/renderers/ToolRenderer.d.ts.map +1 -0
- package/dist/commands/chat/renderers/ToolRenderer.js +93 -0
- package/dist/commands/chat/renderers/ToolRenderer.js.map +1 -0
- package/dist/commands/chat/single-message.d.ts +15 -0
- package/dist/commands/chat/single-message.d.ts.map +1 -0
- package/dist/commands/chat/single-message.js +85 -0
- package/dist/commands/chat/single-message.js.map +1 -0
- package/dist/commands/config/categories.d.ts +8 -0
- package/dist/commands/config/categories.d.ts.map +1 -0
- package/dist/commands/config/categories.js +75 -0
- package/dist/commands/config/categories.js.map +1 -0
- package/dist/commands/config/category.d.ts +8 -0
- package/dist/commands/config/category.d.ts.map +1 -0
- package/dist/commands/config/category.js +81 -0
- package/dist/commands/config/category.js.map +1 -0
- package/dist/commands/config/get.d.ts +9 -0
- package/dist/commands/config/get.d.ts.map +1 -0
- package/dist/commands/config/get.js +98 -0
- package/dist/commands/config/get.js.map +1 -0
- package/dist/commands/config/reset.d.ts +6 -0
- package/dist/commands/config/reset.d.ts.map +1 -0
- package/dist/commands/config/reset.js +68 -0
- package/dist/commands/config/reset.js.map +1 -0
- package/dist/commands/config/set.d.ts +6 -0
- package/dist/commands/config/set.d.ts.map +1 -0
- package/dist/commands/config/set.js +60 -0
- package/dist/commands/config/set.js.map +1 -0
- package/dist/commands/config/utils.d.ts +14 -0
- package/dist/commands/config/utils.d.ts.map +1 -0
- package/dist/commands/config/utils.js +54 -0
- package/dist/commands/config/utils.js.map +1 -0
- package/dist/commands/context/boundaries.d.ts +13 -0
- package/dist/commands/context/boundaries.d.ts.map +1 -0
- package/dist/commands/context/boundaries.js +45 -0
- package/dist/commands/context/boundaries.js.map +1 -0
- package/dist/commands/context/compact.d.ts +13 -0
- package/dist/commands/context/compact.d.ts.map +1 -0
- package/dist/commands/context/compact.js +41 -0
- package/dist/commands/context/compact.js.map +1 -0
- package/dist/commands/context/savings.d.ts +13 -0
- package/dist/commands/context/savings.d.ts.map +1 -0
- package/dist/commands/context/savings.js +49 -0
- package/dist/commands/context/savings.js.map +1 -0
- package/dist/commands/context/status.d.ts +13 -0
- package/dist/commands/context/status.d.ts.map +1 -0
- package/dist/commands/context/status.js +52 -0
- package/dist/commands/context/status.js.map +1 -0
- package/dist/commands/context/strategy.d.ts +13 -0
- package/dist/commands/context/strategy.d.ts.map +1 -0
- package/dist/commands/context/strategy.js +66 -0
- package/dist/commands/context/strategy.js.map +1 -0
- package/dist/commands/mcp/disable.d.ts +5 -0
- package/dist/commands/mcp/disable.d.ts.map +1 -0
- package/dist/commands/mcp/disable.js +26 -0
- package/dist/commands/mcp/disable.js.map +1 -0
- package/dist/commands/mcp/edit.d.ts +9 -0
- package/dist/commands/mcp/edit.d.ts.map +1 -0
- package/dist/commands/mcp/edit.js +62 -0
- package/dist/commands/mcp/edit.js.map +1 -0
- package/dist/commands/mcp/enable.d.ts +5 -0
- package/dist/commands/mcp/enable.d.ts.map +1 -0
- package/dist/commands/mcp/enable.js +27 -0
- package/dist/commands/mcp/enable.js.map +1 -0
- package/dist/commands/mcp/init.d.ts +9 -0
- package/dist/commands/mcp/init.d.ts.map +1 -0
- package/dist/commands/mcp/init.js +97 -0
- package/dist/commands/mcp/init.js.map +1 -0
- package/dist/commands/mcp/list.d.ts +6 -0
- package/dist/commands/mcp/list.d.ts.map +1 -0
- package/dist/commands/mcp/list.js +56 -0
- package/dist/commands/mcp/list.js.map +1 -0
- package/dist/commands/mcp/server.d.ts +6 -0
- package/dist/commands/mcp/server.d.ts.map +1 -0
- package/dist/commands/mcp/server.js +44 -0
- package/dist/commands/mcp/server.js.map +1 -0
- package/dist/commands/mcp/status.d.ts +6 -0
- package/dist/commands/mcp/status.d.ts.map +1 -0
- package/dist/commands/mcp/status.js +43 -0
- package/dist/commands/mcp/status.js.map +1 -0
- package/dist/commands/mcp/tools.d.ts +7 -0
- package/dist/commands/mcp/tools.d.ts.map +1 -0
- package/dist/commands/mcp/tools.js +82 -0
- package/dist/commands/mcp/tools.js.map +1 -0
- package/dist/commands/mcp/validate.d.ts +8 -0
- package/dist/commands/mcp/validate.d.ts.map +1 -0
- package/dist/commands/mcp/validate.js +121 -0
- package/dist/commands/mcp/validate.js.map +1 -0
- package/dist/commands/middleware/config.d.ts +13 -0
- package/dist/commands/middleware/config.d.ts.map +1 -0
- package/dist/commands/middleware/config.js +87 -0
- package/dist/commands/middleware/config.js.map +1 -0
- package/dist/commands/middleware/disable.d.ts +13 -0
- package/dist/commands/middleware/disable.d.ts.map +1 -0
- package/dist/commands/middleware/disable.js +50 -0
- package/dist/commands/middleware/disable.js.map +1 -0
- package/dist/commands/middleware/enable.d.ts +13 -0
- package/dist/commands/middleware/enable.d.ts.map +1 -0
- package/dist/commands/middleware/enable.js +50 -0
- package/dist/commands/middleware/enable.js.map +1 -0
- package/dist/commands/middleware/list.d.ts +13 -0
- package/dist/commands/middleware/list.d.ts.map +1 -0
- package/dist/commands/middleware/list.js +64 -0
- package/dist/commands/middleware/list.js.map +1 -0
- package/dist/commands/middleware/status.d.ts +13 -0
- package/dist/commands/middleware/status.d.ts.map +1 -0
- package/dist/commands/middleware/status.js +80 -0
- package/dist/commands/middleware/status.js.map +1 -0
- package/dist/commands/models/compare.d.ts +9 -0
- package/dist/commands/models/compare.d.ts.map +1 -0
- package/dist/commands/models/compare.js +76 -0
- package/dist/commands/models/compare.js.map +1 -0
- package/dist/commands/models/cost.d.ts +9 -0
- package/dist/commands/models/cost.d.ts.map +1 -0
- package/dist/commands/models/cost.js +64 -0
- package/dist/commands/models/cost.js.map +1 -0
- package/dist/commands/models/info.d.ts +9 -0
- package/dist/commands/models/info.d.ts.map +1 -0
- package/dist/commands/models/info.js +61 -0
- package/dist/commands/models/info.js.map +1 -0
- package/dist/commands/models/list.d.ts +6 -0
- package/dist/commands/models/list.d.ts.map +1 -0
- package/dist/commands/models/list.js +66 -0
- package/dist/commands/models/list.js.map +1 -0
- package/dist/commands/models/providers.d.ts +13 -0
- package/dist/commands/models/providers.d.ts.map +1 -0
- package/dist/commands/models/providers.js +45 -0
- package/dist/commands/models/providers.js.map +1 -0
- package/dist/commands/models/search.d.ts +10 -0
- package/dist/commands/models/search.d.ts.map +1 -0
- package/dist/commands/models/search.js +56 -0
- package/dist/commands/models/search.js.map +1 -0
- package/dist/commands/models/switch.d.ts +14 -0
- package/dist/commands/models/switch.d.ts.map +1 -0
- package/dist/commands/models/switch.js +67 -0
- package/dist/commands/models/switch.js.map +1 -0
- package/dist/commands/permissions/auto-approve.d.ts +13 -0
- package/dist/commands/permissions/auto-approve.d.ts.map +1 -0
- package/dist/commands/permissions/auto-approve.js +53 -0
- package/dist/commands/permissions/auto-approve.js.map +1 -0
- package/dist/commands/permissions/grant.d.ts +13 -0
- package/dist/commands/permissions/grant.d.ts.map +1 -0
- package/dist/commands/permissions/grant.js +46 -0
- package/dist/commands/permissions/grant.js.map +1 -0
- package/dist/commands/permissions/mode.d.ts +12 -0
- package/dist/commands/permissions/mode.d.ts.map +1 -0
- package/dist/commands/permissions/mode.js +61 -0
- package/dist/commands/permissions/mode.js.map +1 -0
- package/dist/commands/permissions/policies.d.ts +13 -0
- package/dist/commands/permissions/policies.d.ts.map +1 -0
- package/dist/commands/permissions/policies.js +47 -0
- package/dist/commands/permissions/policies.js.map +1 -0
- package/dist/commands/permissions/revoke.d.ts +13 -0
- package/dist/commands/permissions/revoke.d.ts.map +1 -0
- package/dist/commands/permissions/revoke.js +46 -0
- package/dist/commands/permissions/revoke.js.map +1 -0
- package/dist/commands/permissions/set.d.ts +13 -0
- package/dist/commands/permissions/set.d.ts.map +1 -0
- package/dist/commands/permissions/set.js +57 -0
- package/dist/commands/permissions/set.js.map +1 -0
- package/dist/commands/permissions/tools.d.ts +13 -0
- package/dist/commands/permissions/tools.d.ts.map +1 -0
- package/dist/commands/permissions/tools.js +50 -0
- package/dist/commands/permissions/tools.js.map +1 -0
- package/dist/commands/server/start.d.ts +11 -0
- package/dist/commands/server/start.d.ts.map +1 -0
- package/dist/commands/server/start.js +58 -0
- package/dist/commands/server/start.js.map +1 -0
- package/dist/commands/session/checkpoints.d.ts +6 -0
- package/dist/commands/session/checkpoints.d.ts.map +1 -0
- package/dist/commands/session/checkpoints.js +41 -0
- package/dist/commands/session/checkpoints.js.map +1 -0
- package/dist/commands/session/compact.d.ts +13 -0
- package/dist/commands/session/compact.d.ts.map +1 -0
- package/dist/commands/session/compact.js +56 -0
- package/dist/commands/session/compact.js.map +1 -0
- package/dist/commands/session/export.d.ts +6 -0
- package/dist/commands/session/export.d.ts.map +1 -0
- package/dist/commands/session/export.js +31 -0
- package/dist/commands/session/export.js.map +1 -0
- package/dist/commands/session/list.d.ts +7 -0
- package/dist/commands/session/list.d.ts.map +1 -0
- package/dist/commands/session/list.js +63 -0
- package/dist/commands/session/list.js.map +1 -0
- package/dist/commands/session/new.d.ts +8 -0
- package/dist/commands/session/new.d.ts.map +1 -0
- package/dist/commands/session/new.js +23 -0
- package/dist/commands/session/new.js.map +1 -0
- package/dist/commands/session/resume.d.ts +6 -0
- package/dist/commands/session/resume.d.ts.map +1 -0
- package/dist/commands/session/resume.js +32 -0
- package/dist/commands/session/resume.js.map +1 -0
- package/dist/commands/session/search.d.ts +10 -0
- package/dist/commands/session/search.d.ts.map +1 -0
- package/dist/commands/session/search.js +65 -0
- package/dist/commands/session/search.js.map +1 -0
- package/dist/commands/session/stats.d.ts +6 -0
- package/dist/commands/session/stats.d.ts.map +1 -0
- package/dist/commands/session/stats.js +58 -0
- package/dist/commands/session/stats.js.map +1 -0
- package/dist/commands/session/view.d.ts +6 -0
- package/dist/commands/session/view.d.ts.map +1 -0
- package/dist/commands/session/view.js +65 -0
- package/dist/commands/session/view.js.map +1 -0
- package/dist/commands/slash/CommandPalette.d.ts +60 -0
- package/dist/commands/slash/CommandPalette.d.ts.map +1 -0
- package/dist/commands/slash/CommandPalette.js +351 -0
- package/dist/commands/slash/CommandPalette.js.map +1 -0
- package/dist/commands/slash/SlashCommandParser.d.ts +11 -0
- package/dist/commands/slash/SlashCommandParser.d.ts.map +1 -0
- package/dist/commands/slash/SlashCommandParser.js +11 -0
- package/dist/commands/slash/SlashCommandParser.js.map +1 -0
- package/dist/commands/slash/SlashCommandRegistry.d.ts +11 -0
- package/dist/commands/slash/SlashCommandRegistry.d.ts.map +1 -0
- package/dist/commands/slash/SlashCommandRegistry.js +11 -0
- package/dist/commands/slash/SlashCommandRegistry.js.map +1 -0
- package/dist/commands/slash/index.d.ts +11 -0
- package/dist/commands/slash/index.d.ts.map +1 -0
- package/dist/commands/slash/index.js +13 -0
- package/dist/commands/slash/index.js.map +1 -0
- package/dist/commands/system-messages/list.d.ts +13 -0
- package/dist/commands/system-messages/list.d.ts.map +1 -0
- package/dist/commands/system-messages/list.js +54 -0
- package/dist/commands/system-messages/list.js.map +1 -0
- package/dist/commands/system-messages/reload.d.ts +13 -0
- package/dist/commands/system-messages/reload.d.ts.map +1 -0
- package/dist/commands/system-messages/reload.js +36 -0
- package/dist/commands/system-messages/reload.js.map +1 -0
- package/dist/commands/system-messages/view.d.ts +13 -0
- package/dist/commands/system-messages/view.d.ts.map +1 -0
- package/dist/commands/system-messages/view.js +52 -0
- package/dist/commands/system-messages/view.js.map +1 -0
- package/dist/commands/tmux/list.d.ts +13 -0
- package/dist/commands/tmux/list.d.ts.map +1 -0
- package/dist/commands/tmux/list.js +68 -0
- package/dist/commands/tmux/list.js.map +1 -0
- package/dist/commands/tools/info.d.ts +13 -0
- package/dist/commands/tools/info.d.ts.map +1 -0
- package/dist/commands/tools/info.js +82 -0
- package/dist/commands/tools/info.js.map +1 -0
- package/dist/commands/tools/list.d.ts +14 -0
- package/dist/commands/tools/list.d.ts.map +1 -0
- package/dist/commands/tools/list.js +67 -0
- package/dist/commands/tools/list.js.map +1 -0
- package/dist/config/ConfigManager.d.ts +40 -0
- package/dist/config/ConfigManager.d.ts.map +1 -0
- package/dist/config/ConfigManager.js +162 -0
- package/dist/config/ConfigManager.js.map +1 -0
- package/dist/config/extension.d.ts +12 -0
- package/dist/config/extension.d.ts.map +1 -0
- package/dist/config/extension.js +5 -0
- package/dist/config/extension.js.map +1 -0
- package/dist/config/settings.d.ts +42 -0
- package/dist/config/settings.d.ts.map +1 -0
- package/dist/config/settings.js +32 -0
- package/dist/config/settings.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +883 -0
- package/dist/index.js.map +1 -0
- package/dist/orchestrator/OrchestratorClient.d.ts +385 -0
- package/dist/orchestrator/OrchestratorClient.d.ts.map +1 -0
- package/dist/orchestrator/OrchestratorClient.js +1195 -0
- package/dist/orchestrator/OrchestratorClient.js.map +1 -0
- package/dist/themes/DefaultTheme.d.ts +9 -0
- package/dist/themes/DefaultTheme.d.ts.map +1 -0
- package/dist/themes/DefaultTheme.js +29 -0
- package/dist/themes/DefaultTheme.js.map +1 -0
- package/dist/themes/MinimalTheme.d.ts +9 -0
- package/dist/themes/MinimalTheme.d.ts.map +1 -0
- package/dist/themes/MinimalTheme.js +29 -0
- package/dist/themes/MinimalTheme.js.map +1 -0
- package/dist/themes/Theme.interface.d.ts +36 -0
- package/dist/themes/Theme.interface.d.ts.map +1 -0
- package/dist/themes/Theme.interface.js +5 -0
- package/dist/themes/Theme.interface.js.map +1 -0
- package/dist/themes/ThemeManager.d.ts +63 -0
- package/dist/themes/ThemeManager.d.ts.map +1 -0
- package/dist/themes/ThemeManager.js +257 -0
- package/dist/themes/ThemeManager.js.map +1 -0
- package/dist/themes/colors.d.ts +108 -0
- package/dist/themes/colors.d.ts.map +1 -0
- package/dist/themes/colors.js +284 -0
- package/dist/themes/colors.js.map +1 -0
- package/dist/themes/createTheme.d.ts +40 -0
- package/dist/themes/createTheme.d.ts.map +1 -0
- package/dist/themes/createTheme.js +114 -0
- package/dist/themes/createTheme.js.map +1 -0
- package/dist/themes/themeDefinitions.d.ts +27 -0
- package/dist/themes/themeDefinitions.d.ts.map +1 -0
- package/dist/themes/themeDefinitions.js +244 -0
- package/dist/themes/themeDefinitions.js.map +1 -0
- package/dist/utils/CodeDiffRenderer.d.ts +124 -0
- package/dist/utils/CodeDiffRenderer.d.ts.map +1 -0
- package/dist/utils/CodeDiffRenderer.js +257 -0
- package/dist/utils/CodeDiffRenderer.js.map +1 -0
- package/dist/utils/MarkdownRenderer.d.ts +74 -0
- package/dist/utils/MarkdownRenderer.d.ts.map +1 -0
- package/dist/utils/MarkdownRenderer.js +260 -0
- package/dist/utils/MarkdownRenderer.js.map +1 -0
- package/dist/utils/MessageRenderer.d.ts +200 -0
- package/dist/utils/MessageRenderer.d.ts.map +1 -0
- package/dist/utils/MessageRenderer.js +283 -0
- package/dist/utils/MessageRenderer.js.map +1 -0
- package/dist/utils/ToolFormatter.d.ts +103 -0
- package/dist/utils/ToolFormatter.d.ts.map +1 -0
- package/dist/utils/ToolFormatter.js +357 -0
- package/dist/utils/ToolFormatter.js.map +1 -0
- package/dist/utils/boxDrawing.d.ts +23 -0
- package/dist/utils/boxDrawing.d.ts.map +1 -0
- package/dist/utils/boxDrawing.js +78 -0
- package/dist/utils/boxDrawing.js.map +1 -0
- package/dist/utils/checks.d.ts +9 -0
- package/dist/utils/checks.d.ts.map +1 -0
- package/dist/utils/checks.js +11 -0
- package/dist/utils/checks.js.map +1 -0
- package/dist/utils/events.d.ts +24 -0
- package/dist/utils/events.d.ts.map +1 -0
- package/dist/utils/events.js +17 -0
- package/dist/utils/events.js.map +1 -0
- package/dist/utils/formatters.d.ts +255 -0
- package/dist/utils/formatters.d.ts.map +1 -0
- package/dist/utils/formatters.js +361 -0
- package/dist/utils/formatters.js.map +1 -0
- package/dist/utils/math.d.ts +11 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +13 -0
- package/dist/utils/math.js.map +1 -0
- package/package.json +82 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `cortex autoresearch bench` — run a task set through the harness, GRADE each
|
|
3
|
+
* output with the task's verifier, and write REAL scored records to
|
|
4
|
+
* router-matrix.jsonl. This is the grader the decision layer was missing:
|
|
5
|
+
* `evaluate` (the gate) only produces a meaningful keep/discard once arms carry
|
|
6
|
+
* real qualitativeScores, which this command supplies.
|
|
7
|
+
*
|
|
8
|
+
* One invocation benches ONE harness build (records auto-stamped with its git
|
|
9
|
+
* SHA, or --harness-ref). The swarm orchestrator runs it in the base worktree
|
|
10
|
+
* and the candidate worktree, then calls `evaluate --base … --candidate …`.
|
|
11
|
+
*/
|
|
12
|
+
import { readdirSync, readFileSync, statSync } from 'node:fs';
|
|
13
|
+
import { join, resolve } from 'node:path';
|
|
14
|
+
import { spawnSync } from 'node:child_process';
|
|
15
|
+
import { ModelRouterMatrix, runBench, parseTaskSet, ResearchBacklog, } from '@nexus-cortex/core';
|
|
16
|
+
import { ThemeManager } from '../../themes/ThemeManager.js';
|
|
17
|
+
import { findProjectRoot } from '../config/utils.js';
|
|
18
|
+
import { serverRunner } from './harnessProcess.js';
|
|
19
|
+
import { commandRunner } from './commandRunner.js';
|
|
20
|
+
/**
 * Load bench tasks from a single JSON file or from every `*.json` entry of a directory.
 *
 * Each file is read as UTF-8, parsed as JSON and handed to `parseTaskSet(raw, file)`;
 * whatever it returns is spread into one flat list. (The file may hold an array or a
 * single task object — `parseTaskSet` is what interprets the parsed value.)
 *
 * @param {string} taskSetPath - Path to a task file, or to a directory of task files.
 * @returns {Array} Tasks from all files, concatenated in file order. Directory
 *   entries are processed in sorted name order.
 * @throws {Error} If the path cannot be stat'ed or read, or if a file is not valid
 *   JSON. A JSON failure names the offending file and keeps the parse error as `cause`.
 */
function loadTasks(taskSetPath) {
    const files = statSync(taskSetPath).isDirectory()
        ? readdirSync(taskSetPath)
            .filter((name) => name.endsWith('.json'))
            // readdir does not promise any ordering; sort so the task order (and
            // therefore the order of recorded runs) is the same on every platform.
            .sort()
            .map((name) => join(taskSetPath, name))
        : [taskSetPath];
    const tasks = [];
    for (const file of files) {
        const text = readFileSync(file, 'utf8');
        let raw;
        try {
            raw = JSON.parse(text);
        }
        catch (err) {
            // A bare SyntaxError does not say which task file is broken, and the
            // caller only prints `error.message` — so put the path in the message.
            throw new Error(`invalid JSON in ${file}: ${err.message}`, { cause: err });
        }
        tasks.push(...parseTaskSet(raw, file));
    }
    return tasks;
}
|
|
33
|
+
/**
 * `cortex autoresearch bench` entry point: load a task set, run it through either a
 * shell-command target or a cortex server, and print the summary `runBench` returns.
 *
 * Validation failures and any thrown error are reported on stderr and terminate the
 * process with exit code 1.
 *
 * @param {object} options - Parsed CLI options. Fields read here:
 *   `taskSet` (required), `experimentTag` (required), `split` ('train' | 'holdout',
 *   default 'train'), `runs`, `model`, `json`, `runCmd`, `buildCmd`, `cwd`,
 *   `acceptExit`, `serverUrl`, `benchmarkSource`, `harnessRef`, `temperature`,
 *   `strategy`, `seedBacklog`.
 * @returns {Promise<void>} Resolves after the summary has been printed (JSON when
 *   `options.json` is set, otherwise a per-task table plus a next-step hint).
 */
export async function autoResearchBench(options) {
    const theme = ThemeManager.getTheme();
    const projectRoot = findProjectRoot();
    if (!options.taskSet) {
        console.error(theme.colors.error('Error: --task-set is required'));
        process.exit(1);
    }
    if (!options.experimentTag) {
        console.error(theme.colors.error('Error: --experiment-tag is required'));
        process.exit(1);
    }
    const split = options.split ?? 'train';
    if (split !== 'train' && split !== 'holdout') {
        console.error(theme.colors.error("Error: --split must be 'train' or 'holdout'"));
        process.exit(1);
    }
    // Parse --runs once so the banner and runBench agree, and reject values such as
    // "abc" (NaN) instead of forwarding them to the bench loop.
    const runs = options.runs ? Number(options.runs) : undefined;
    if (runs !== undefined && !(Number.isInteger(runs) && runs > 0)) {
        console.error(theme.colors.error('Error: --runs must be a positive integer'));
        process.exit(1);
    }
    try {
        const tasks = loadTasks(options.taskSet);
        if (tasks.length === 0) {
            console.error(theme.colors.error(`Error: no tasks found in ${options.taskSet}`));
            process.exit(1);
        }
        const matrix = new ModelRouterMatrix(projectRoot);
        const model = options.model ?? process.env.DEFAULT_MODEL_ID;
        // Progress lines are suppressed in JSON mode so stdout stays machine-readable.
        const log = (m) => {
            if (!options.json) {
                console.log(theme.colors.muted(`  ${m}`));
            }
        };
        let runner;
        let source;
        if (options.runCmd) {
            // Non-cortex command target: optionally build once, then grade a shell command per task.
            const cwd = options.cwd ? resolve(options.cwd) : projectRoot;
            const acceptExit = parseAcceptExitCodes(options.acceptExit ?? '0');
            if (options.buildCmd) {
                log(`Building target: ${options.buildCmd} (cwd ${cwd})`);
                const build = spawnSync('sh', ['-c', options.buildCmd], { cwd, stdio: options.json ? 'ignore' : 'inherit' });
                if (build.status !== 0) {
                    // status is null when the shell could not be spawned or was killed
                    // by a signal; say which, rather than printing "exit null".
                    let reason = `exit ${build.status}`;
                    if (build.error) {
                        reason = build.error.message;
                    }
                    else if (build.signal) {
                        reason = `signal ${build.signal}`;
                    }
                    console.error(theme.colors.error(`Error: build command failed (${reason})`));
                    process.exit(1);
                }
            }
            runner = commandRunner({ cwd, template: options.runCmd, acceptExitCodes: acceptExit, log });
            source = `cmd "${options.runCmd}" (cwd ${cwd})`;
        }
        else {
            const serverUrl = options.serverUrl ?? process.env.CORTEX_SERVER_URL ?? 'http://localhost:4000';
            runner = serverRunner(serverUrl, model);
            source = serverUrl;
        }
        if (!options.json) {
            console.log();
            // NOTE(review): 2 is the fallback the banner has always displayed; presumably
            // it mirrors runBench's own default run count — confirm against core.
            console.log(theme.colors.muted(`  Benching ${tasks.length} task(s) × ${runs ?? 2} run(s) via ${source} [${split}] tag=${options.experimentTag}`));
        }
        const summary = await runBench(tasks, runner, matrix, {
            experimentTag: options.experimentTag,
            runs,
            split,
            modelId: model,
            benchmarkSource: options.benchmarkSource,
            harnessRef: options.harnessRef,
            temperature: options.temperature !== undefined && Number.isFinite(Number(options.temperature)) ? Number(options.temperature) : undefined,
            strategy: options.strategy,
            // Backlog seeding is on unless the caller explicitly passes seedBacklog === false.
            backlog: options.seedBacklog === false ? undefined : new ResearchBacklog(projectRoot),
            discoveredRound: options.experimentTag,
            discoveredRef: options.harnessRef,
            onRun: options.json ? undefined : (info) => {
                const mark = info.pass ? theme.colors.success('[OK]') : theme.colors.error('[FAIL]');
                console.log(theme.colors.muted(`    ${mark} ${info.taskId} run ${info.run}: ${info.qualitativeScore}`));
            },
        });
        if (options.json) {
            console.log(JSON.stringify(summary, null, 2));
            return;
        }
        console.log();
        for (const t of summary.tasks) {
            console.log(`  ${theme.colors.highlight(t.taskId.padEnd(28))} ${theme.colors.secondary(t.taskType)}  mean ${t.meanScore}  pass ${Math.round(t.passRate * 100)}%`);
        }
        console.log();
        console.log(theme.colors.muted(`  ${summary.totalRuns} run(s) recorded → ${projectRoot}/.cortex/router-matrix.jsonl  (harnessRef ${summary.harnessRef ?? 'auto'})`));
        if (summary.seededDeficiencies > 0) {
            console.log(theme.colors.muted(`  ${summary.seededDeficiencies} deficiency(ies) seeded → ${projectRoot}/.cortex/research-backlog.jsonl  (ResearchBacklog list / next)`));
        }
        console.log(theme.colors.muted(`  Next: cortex autoresearch evaluate --experiment-tag ${options.experimentTag} --base <ref> --candidate <ref> --branch <wt>`));
        console.log();
    }
    catch (error) {
        // Anything can be thrown; only Error instances are guaranteed to carry .message.
        const message = error instanceof Error ? error.message : String(error);
        console.error(theme.colors.error(`Error: ${message}`));
        process.exit(1);
    }
}

/**
 * Parse the `--accept-exit` value: a comma-separated list of integer exit codes.
 *
 * Empty segments (e.g. a trailing comma) are skipped. They must not go through
 * `Number()`, because `Number('')` is 0 and would silently add exit code 0 to the
 * accepted set.
 *
 * @param {string|number} spec - Raw option value, e.g. `"0,2"`.
 * @returns {number[]} The listed codes in order; `[0]` when the list names none.
 * @throws {Error} If a non-empty segment is not an integer.
 */
function parseAcceptExitCodes(spec) {
    const codes = [];
    for (const segment of String(spec).split(',')) {
        const text = segment.trim();
        if (text === '') {
            continue;
        }
        const code = Number(text);
        if (!Number.isInteger(code)) {
            throw new Error(`--accept-exit expects comma-separated integers, got "${text}"`);
        }
        codes.push(code);
    }
    return codes.length > 0 ? codes : [0];
}
|
|
123
|
+
//# sourceMappingURL=bench.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bench.js","sourceRoot":"","sources":["../../../src/commands/autoresearch/bench.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AACH,OAAO,EAAE,WAAW,EAAE,YAAY,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAC;AAC9D,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAC1C,OAAO,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AAC/C,OAAO,EACL,iBAAiB,EACjB,QAAQ,EACR,YAAY,EACZ,eAAe,GAGhB,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EAAE,YAAY,EAAE,MAAM,8BAA8B,CAAC;AAC5D,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAC;AAiCnD,gFAAgF;AAChF,SAAS,SAAS,CAAC,WAAmB;IACpC,MAAM,EAAE,GAAG,QAAQ,CAAC,WAAW,CAAC,CAAC;IACjC,MAAM,KAAK,GAAG,EAAE,CAAC,WAAW,EAAE;QAC5B,CAAC,CAAC,WAAW,CAAC,WAAW,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC;QAC1F,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;IAClB,MAAM,KAAK,GAAe,EAAE,CAAC;IAC7B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC,CAAC;QAChD,KAAK,CAAC,IAAI,CAAC,GAAG,YAAY,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;IACtC,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,OAAiC;IACvE,MAAM,KAAK,GAAG,YAAY,CAAC,QAAQ,EAAE,CAAC;IACtC,MAAM,WAAW,GAAG,eAAe,EAAE,CAAC;IAEtC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;QAAC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,+BAA+B,CAAC,CAAC,CAAC;QAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAAC,CAAC;IAC9G,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,CAAC;QAAC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,qCAAqC,CAAC,CAAC,CAAC;QAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAAC,CAAC;IAE1H,MAAM,KAAK,GAAI,OAAO,CAAC,KAAyC,IAAI,OAAO,CAAC;IAC5E,IAAI,KAAK,KAAK,OAAO,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;QAC7C,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,6CAA6C,CAAC,CAAC,CAAC;QACjF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,SAAS,CAAC,OAAO,CAAC,OAAQ,CAAC,CAAC;QAC1C,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAAC,OAAO,CAAC,KAAK,CAAC,KAAK,
CAAC,MAAM,CAAC,KAAK,CAAC,4BAA4B,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;YAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAAC,CAAC;QAE9H,MAAM,MAAM,GAAG,IAAI,iBAAiB,CAAC,WAAW,CAAC,CAAC;QAClD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,IAAI,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC;QAC5D,MAAM,GAAG,GAAG,CAAC,CAAS,EAAE,EAAE,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI;YAAE,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAE5F,IAAI,MAAqB,CAAC;QAC1B,IAAI,MAAc,CAAC;QACnB,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;YACnB,yFAAyF;YACzF,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC;YAC7D,MAAM,UAAU,GAAG,CAAC,OAAO,CAAC,UAAU,IAAI,GAAG,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;YACrH,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;gBACrB,GAAG,CAAC,oBAAoB,OAAO,CAAC,QAAQ,UAAU,GAAG,GAAG,CAAC,CAAC;gBAC1D,MAAM,CAAC,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE,GAAG,EAAE,KAAK,EAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC;gBACzG,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;oBAAC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,qCAAqC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;oBAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;gBAAC,CAAC;YAC/H,CAAC;YACD,MAAM,GAAG,aAAa,CAAC,EAAE,GAAG,EAAE,QAAQ,EAAE,OAAO,CAAC,MAAM,EAAE,eAAe,EAAE,UAAU,EAAE,GAAG,EAAE,CAAC,CAAC;YAC5F,MAAM,GAAG,QAAQ,OAAO,CAAC,MAAM,UAAU,GAAG,GAAG,CAAC;QAClD,CAAC;aAAM,CAAC;YACN,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,GAAG,CAAC,iBAAiB,IAAI,uBAAuB,CAAC;YAChG,MAAM,GAAG,YAAY,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;YACxC,MAAM,GAAG,SAAS,CAAC;QACrB,CAAC;QAED,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;YAClB,OAAO,CAAC,GAAG,EAAE,CAAC;YACd,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,aAAa,KAAK,CAAC,MAAM,cAAc,OAAO,CAAC,IAAI,IAAI,CAAC,eAAe,MAAM,KAAK,KAAK,UAAU,OAAO,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC;QAC5J,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE;Y
ACpD,aAAa,EAAE,OAAO,CAAC,aAAc;YACrC,IAAI,EAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,SAAS;YACrD,KAAK;YACL,OAAO,EAAE,KAAK;YACd,eAAe,EAAE,OAAO,CAAC,eAAe;YACxC,UAAU,EAAE,OAAO,CAAC,UAAU;YAC9B,WAAW,EAAE,OAAO,CAAC,WAAW,KAAK,SAAS,IAAI,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,SAAS;YACxI,QAAQ,EAAE,OAAO,CAAC,QAAQ;YAC1B,OAAO,EAAE,OAAO,CAAC,WAAW,KAAK,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,eAAe,CAAC,WAAW,CAAC;YACrF,eAAe,EAAE,OAAO,CAAC,aAAa;YACtC,aAAa,EAAE,OAAO,CAAC,UAAU;YACjC,KAAK,EAAE,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE;gBACzC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;gBACrF,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,QAAQ,IAAI,CAAC,GAAG,KAAK,IAAI,CAAC,gBAAgB,EAAE,CAAC,CAAC,CAAC;YACvG,CAAC;SACF,CAAC,CAAC;QAEH,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;YAC9C,OAAO;QACT,CAAC;QAED,OAAO,CAAC,GAAG,EAAE,CAAC;QACd,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YAC9B,OAAO,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,SAAS,UAAU,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,QAAQ,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;QACnK,CAAC;QACD,OAAO,CAAC,GAAG,EAAE,CAAC;QACd,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,OAAO,CAAC,SAAS,sBAAsB,WAAW,6CAA6C,OAAO,CAAC,UAAU,IAAI,MAAM,GAAG,CAAC,CAAC,CAAC;QACpK,IAAI,OAAO,CAAC,kBAAkB,GAAG,CAAC,EAAE,CAAC;YACnC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,OAAO,CAAC,kBAAkB,6BAA6B,WAAW,gEAAgE,CAAC,CAAC,CAAC;QAC1K,CAAC;QACD,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,wDAAwD,OAAO,CAAC,aAAa,+CAA+C,CAAC,CAAC,CAAC;QAC9J,OAAO,CAAC,GAAG,EAAE,CAAC;IAEhB,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,OAAO,CAAC,KAA
K,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,UAAU,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAC7D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import type { HarnessRunner } from '@nexus-cortex/core';
|
|
2
|
+
import type { ExperimentTarget, PreparedArm, PrepareArmOptions } from './harnessProcess.js';
|
|
3
|
+
/** Options accepted by `commandRunner`. */
export interface CommandRunnerOptions {
    /** Working directory the command runs in (e.g. the candidate worktree). */
    cwd: string;
    /** Command template, e.g. `./eval.sh {prompt}` or `python eval.py --case {prompt}`.
     * If it contains no `{prompt}`/`{case}` placeholder, the prompt is appended as a
     * single quoted argument. */
    template: string;
    /** Exit codes whose stdout is accepted for grading. Default `[0]`. */
    acceptExitCodes?: number[];
    /** Per-run hard timeout in ms. Default 120000. */
    timeoutMs?: number;
    /** Progress/diagnostic sink (stderr + nonzero-exit notices). */
    log?: (message: string) => void;
}
|
|
17
|
+
export declare function commandRunner(opts: CommandRunnerOptions): HarnessRunner;
|
|
18
|
+
/**
 * CommandTarget — the non-cortex `ExperimentTarget`. An optional one-shot build command,
 * then grade a shell command per task via `commandRunner`. Nothing to serve, nothing to
 * tear down — so `cortex autoresearch experiment` can run base-vs-candidate on any project
 * (a library, CLI, test suite, backtest) through the same statistical gate as the harness.
 */
export declare class CommandTarget implements ExperimentTarget {
    /** Configuration captured by the constructor. */
    private readonly cfg;
    /** Target kind; always the literal `"command"`. */
    readonly kind = "command";
    /**
     * @param cfg `template`, `acceptExitCodes` and `timeoutMs` are forwarded to
     *   `commandRunner`; `buildCmd` is the optional build command run by `prepare`.
     */
    constructor(cfg: {
        template: string;
        buildCmd?: string;
        acceptExitCodes?: number[];
        timeoutMs?: number;
    });
    /**
     * Prepare one arm in `dir`: when `opts.build` is set and a `buildCmd` is configured,
     * run it through `sh -c` with `dir` as the working directory and reject if it does
     * not exit 0. Resolves with a `commandRunner` bound to `dir` and a no-op `stop`.
     */
    prepare(dir: string, opts: PrepareArmOptions): Promise<PreparedArm>;
}
|
|
35
|
+
//# sourceMappingURL=commandRunner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"commandRunner.d.ts","sourceRoot":"","sources":["../../../src/commands/autoresearch/commandRunner.ts"],"names":[],"mappings":"AAkBA,OAAO,KAAK,EAAE,aAAa,EAAoB,MAAM,oBAAoB,CAAC;AAC1E,OAAO,KAAK,EAAE,gBAAgB,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AAE5F,MAAM,WAAW,oBAAoB;IACnC,2EAA2E;IAC3E,GAAG,EAAE,MAAM,CAAC;IACZ;;kCAE8B;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,sEAAsE;IACtE,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAC3B,kDAAkD;IAClD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gEAAgE;IAChE,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;CACjC;AAOD,wBAAgB,aAAa,CAAC,IAAI,EAAE,oBAAoB,GAAG,aAAa,CAoCvE;AAED;;;;;GAKG;AACH,qBAAa,aAAc,YAAW,gBAAgB;IAGlD,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFtB,QAAQ,CAAC,IAAI,aAAa;gBAEP,GAAG,EAAE;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;QAAC,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;QAAC,SAAS,CAAC,EAAE,MAAM,CAAA;KAAE;IAGzG,OAAO,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,iBAAiB,GAAG,OAAO,CAAC,WAAW,CAAC;CAe1E"}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CommandRunner — a `HarnessRunner` that grades a SHELL COMMAND instead of an LLM
|
|
3
|
+
* endpoint. This is the non-cortex experiment path: run `template` (with `{prompt}` /
|
|
4
|
+
* `{case}` substituted) per task in `cwd`, capture stdout, and hand it to the task's
|
|
5
|
+
* verifier (typically a `numeric` verifier that extracts a metric). It lets the
|
|
6
|
+
* auto-research loop measure a library / CLI / test suite / backtest — anything with a
|
|
7
|
+
* build + run + metric — through the same statistical gate as the cortex harness.
|
|
8
|
+
*
|
|
9
|
+
* Exit-code contract: stdout is graded only when the exit code is in `acceptExitCodes`
|
|
10
|
+
* (default `[0]`); otherwise `text=''`, so every verifier fails — a crashed run is not a
|
|
11
|
+
* valid measurement, so it fails the bench and seeds the backlog. stderr (and the exit
|
|
12
|
+
* code) are surfaced via `log`, never graded.
|
|
13
|
+
*
|
|
14
|
+
* The substituted `{prompt}` value is single-quote-escaped before it reaches the shell,
|
|
15
|
+
* so a task prompt cannot inject shell syntax. The `template` itself is operator-supplied
|
|
16
|
+
* (the experiment spec) and trusted.
|
|
17
|
+
*/
|
|
18
|
+
import { spawn, spawnSync } from 'node:child_process';
|
|
19
|
+
/** POSIX single-quote escape: wrap in '…', and close/escape/reopen any embedded quote. */
function shQuote(s) {
    return `'${s.replace(/'/g, `'\\''`)}'`;
}
/**
 * Build a runner whose `run(prompt)` executes `opts.template` through `sh -c` in
 * `opts.cwd` and resolves with the captured stdout as `text`.
 *
 * - Every `{prompt}` / `{case}` placeholder in the template is replaced by the
 *   single-quoted prompt; a template without a placeholder gets the quoted prompt
 *   appended as one extra argument.
 * - `text` is the stdout only when the process exits with a code listed in
 *   `opts.acceptExitCodes` (default `[0]`) and did not time out; otherwise it is `''`
 *   and the reason plus the last 300 characters of stderr are passed to `opts.log`.
 * - After `opts.timeoutMs` (default 120000) the `sh` process is sent SIGKILL.
 *
 * The returned promise never rejects because of the command's outcome: spawn errors
 * are logged and resolve with an empty `text`.
 *
 * @param opts Runner options (`cwd`, `template`, `acceptExitCodes`, `timeoutMs`, `log`).
 * @returns An object with a `run(prompt)` method resolving to
 *   `{ text, modelId: 'command', inputTokens: 0, outputTokens: 0, toolCallCount: 0, latencyMs }`.
 */
export function commandRunner(opts) {
    const accept = opts.acceptExitCodes ?? [0];
    const timeoutMs = opts.timeoutMs ?? 120_000;
    return {
        run(prompt) {
            const quoted = shQuote(prompt);
            // The replacement MUST be a function: a replacement *string* is scanned for the
            // special patterns `$&`, `$$`, "$`" and `$'`, so a prompt containing them would
            // be rewritten (and could pull template text inside the quoted argument).
            const cmd = /\{prompt\}|\{case\}/.test(opts.template)
                ? opts.template.replace(/\{prompt\}|\{case\}/g, () => quoted)
                : `${opts.template} ${quoted}`;
            const start = Date.now();
            return new Promise((resolve) => {
                const proc = spawn('sh', ['-c', cmd], { cwd: opts.cwd, stdio: ['ignore', 'pipe', 'pipe'] });
                let out = '';
                let err = '';
                let timedOut = false;
                // 'error' and 'close' can both fire for one child; only the first one counts.
                let settled = false;
                const timer = setTimeout(() => {
                    timedOut = true;
                    try {
                        proc.kill('SIGKILL');
                    }
                    catch { /* already gone */ }
                }, timeoutMs);
                // Decode at the stream level so a multi-byte UTF-8 character that straddles
                // two chunks is not corrupted by per-chunk Buffer#toString().
                proc.stdout?.setEncoding('utf8').on('data', (d) => { out += d; });
                proc.stderr?.setEncoding('utf8').on('data', (d) => { err += d; });
                const done = (text, latencyMs) => ({ text, modelId: 'command', inputTokens: 0, outputTokens: 0, toolCallCount: 0, latencyMs });
                proc.on('close', (code) => {
                    if (settled)
                        return;
                    settled = true;
                    clearTimeout(timer);
                    const ok = !timedOut && code != null && accept.includes(code);
                    if (!ok) {
                        const reason = timedOut ? `timeout after ${timeoutMs}ms` : `exit ${code}`;
                        opts.log?.(`[command ${reason}] ${err.trim().slice(-300)}`);
                    }
                    resolve(done(ok ? out : '', Date.now() - start));
                });
                proc.on('error', (e) => {
                    if (settled)
                        return;
                    settled = true;
                    clearTimeout(timer);
                    opts.log?.(`[command error] ${e.message}`);
                    resolve(done('', Date.now() - start));
                });
            });
        },
    };
}
|
|
62
|
+
/**
 * CommandTarget — the non-cortex `ExperimentTarget`. An optional one-shot build command,
 * then grade a shell command per task via `commandRunner`. Nothing to serve, nothing to
 * tear down — so `cortex autoresearch experiment` can run base-vs-candidate on any project
 * (a library, CLI, test suite, backtest) through the same statistical gate as the harness.
 */
export class CommandTarget {
    cfg;
    kind = 'command';
    /**
     * @param cfg `{ template, buildCmd?, acceptExitCodes?, timeoutMs? }`; everything except
     *   `buildCmd` is forwarded to `commandRunner` by `prepare`.
     */
    constructor(cfg) {
        this.cfg = cfg;
    }
    /**
     * Optionally build, then return a command runner bound to `dir`.
     *
     * @param dir Working directory for both the build command and the graded command.
     * @param opts `build` enables the build step; `log` receives progress messages.
     * @returns `{ runner, stop }` where `stop` is a no-op (nothing is left running).
     * @throws {Error} When the build command cannot be spawned, is killed by a signal,
     *   or exits with a non-zero status.
     */
    async prepare(dir, opts) {
        if (opts.build && this.cfg.buildCmd) {
            opts.log(`build: ${this.cfg.buildCmd} (cwd ${dir})`);
            const b = spawnSync('sh', ['-c', this.cfg.buildCmd], { cwd: dir, stdio: ['ignore', 'ignore', 'inherit'] });
            // spawnSync reports a failed spawn via `error` and a signal death via `signal`;
            // in both cases `status` is null, so "exit null" would hide the real cause.
            if (b.error)
                throw new Error(`build command failed (${b.error.message}) in ${dir}`, { cause: b.error });
            if (b.status !== 0) {
                const how = b.status === null ? `signal ${b.signal}` : `exit ${b.status}`;
                throw new Error(`build command failed (${how}) in ${dir}`);
            }
        }
        const runner = commandRunner({
            cwd: dir,
            template: this.cfg.template,
            acceptExitCodes: this.cfg.acceptExitCodes,
            timeoutMs: this.cfg.timeoutMs,
            log: opts.log,
        });
        return { runner, stop: () => { } };
    }
}
|
|
91
|
+
//# sourceMappingURL=commandRunner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"commandRunner.js","sourceRoot":"","sources":["../../../src/commands/autoresearch/commandRunner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AACH,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AAmBtD,0FAA0F;AAC1F,SAAS,OAAO,CAAC,CAAS;IACxB,OAAO,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC,GAAG,CAAC;AACzC,CAAC;AAED,MAAM,UAAU,aAAa,CAAC,IAA0B;IACtD,MAAM,MAAM,GAAG,IAAI,CAAC,eAAe,IAAI,CAAC,CAAC,CAAC,CAAC;IAC3C,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,OAAO,CAAC;IAC5C,OAAO;QACL,GAAG,CAAC,MAAc;YAChB,MAAM,GAAG,GAAG,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC;gBACnD,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,sBAAsB,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;gBAChE,CAAC,CAAC,GAAG,IAAI,CAAC,QAAQ,IAAI,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;YAC1C,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACzB,OAAO,IAAI,OAAO,CAAmB,CAAC,OAAO,EAAE,EAAE;gBAC/C,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,KAAK,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC,CAAC;gBAC5F,IAAI,GAAG,GAAG,EAAE,CAAC;gBACb,IAAI,GAAG,GAAG,EAAE,CAAC;gBACb,IAAI,QAAQ,GAAG,KAAK,CAAC;gBACrB,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE,GAAG,QAAQ,GAAG,IAAI,CAAC,CAAC,IAAI,CAAC;oBAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBAAC,CAAC;gBAAC,MAAM,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;gBAC3H,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC,EAAE,EAAE,GAAG,GAAG,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;gBACxD,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC,EAAE,EAAE,GAAG,GAAG,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;gBACxD,MAAM,IAAI,GAAG,CAAC,IAAY,EAAE,SAAiB,EAAoB,EAAE,CACjE,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,aAAa,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC;gBAC/F,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;oBACxB,YAAY,CAAC,KAAK,CAAC,CAAC;oBACpB,MAAM,EAAE,GAAG,CAAC,QAAQ,IAAI,IAAI,IAAI,IAAI,IAAI,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;oBAC9D,IAAI,CAAC,EAAE,EAAE,CAAC;wBACR,MAAM,MAAM,GAAG,QAAQ,CAAC,CAAC,CAAC,iBAAiB,SAAS,IAAI,CAAC,CA
AC,CAAC,QAAQ,IAAI,EAAE,CAAC;wBAC1E,IAAI,CAAC,GAAG,EAAE,CAAC,YAAY,MAAM,KAAK,GAAG,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;oBAC9D,CAAC;oBACD,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,CAAC,CAAC;gBACnD,CAAC,CAAC,CAAC;gBACH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,CAAC,EAAE,EAAE;oBACrB,YAAY,CAAC,KAAK,CAAC,CAAC;oBACpB,IAAI,CAAC,GAAG,EAAE,CAAC,mBAAmB,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;oBAC3C,OAAO,CAAC,IAAI,CAAC,EAAE,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,CAAC,CAAC;gBACxC,CAAC,CAAC,CAAC;YACL,CAAC,CAAC,CAAC;QACL,CAAC;KACF,CAAC;AACJ,CAAC;AAED;;;;;GAKG;AACH,MAAM,OAAO,aAAa;IAGL;IAFV,IAAI,GAAG,SAAS,CAAC;IAC1B,YACmB,GAA4F;QAA5F,QAAG,GAAH,GAAG,CAAyF;IAC5G,CAAC;IAEJ,KAAK,CAAC,OAAO,CAAC,GAAW,EAAE,IAAuB;QAChD,IAAI,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,CAAC;YACpC,IAAI,CAAC,GAAG,CAAC,UAAU,IAAI,CAAC,GAAG,CAAC,QAAQ,UAAU,GAAG,GAAG,CAAC,CAAC;YACtD,MAAM,CAAC,GAAG,SAAS,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,EAAE,GAAG,EAAE,GAAG,EAAE,KAAK,EAAE,CAAC,QAAQ,EAAE,QAAQ,EAAE,SAAS,CAAC,EAAE,CAAC,CAAC;YAC3G,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC;gBAAE,MAAM,IAAI,KAAK,CAAC,8BAA8B,CAAC,CAAC,MAAM,QAAQ,GAAG,EAAE,CAAC,CAAC;QAC3F,CAAC;QACD,MAAM,MAAM,GAAG,aAAa,CAAC;YAC3B,GAAG,EAAE,GAAG;YACR,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,QAAQ;YAC3B,eAAe,EAAE,IAAI,CAAC,GAAG,CAAC,eAAe;YACzC,SAAS,EAAE,IAAI,CAAC,GAAG,CAAC,SAAS;YAC7B,GAAG,EAAE,IAAI,CAAC,GAAG;SACd,CAAC,CAAC;QACH,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,GAAG,EAAE,GAA8B,CAAC,EAAE,CAAC;IAChE,CAAC;CACF"}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
 * Options for `autoResearchEvaluate`. Numeric settings arrive as strings and are
 * converted with `Number`; values that are not finite numbers are treated as not given.
 */
export interface AutoResearchEvaluateOptions {
    /** Required (`--experiment-tag`). Forwarded as `experimentTag`. */
    experimentTag?: string;
    /** Required (`--base`). Forwarded as `baseRef`. */
    base?: string;
    /** Required (`--candidate`). Forwarded as `candidateRef`. */
    candidate?: string;
    /** Required (`--branch`). Forwarded as `branch`. */
    branch?: string;
    /** Forwarded unchanged to the experiment evaluation. */
    deficiencyId?: string;
    /** Forwarded unchanged to the evaluation and to the holdout verification. */
    benchmarkSource?: string;
    /** Forwarded unchanged to the evaluation and to the holdout verification. */
    modelId?: string;
    /** Numeric string forwarded as `nFamilyExperiments`; defaults to 1. */
    nFamily?: string;
    /** Numeric string forwarded as the gate's `alpha`. */
    alpha?: string;
    /** Numeric string forwarded as the gate's `seed`. */
    seed?: string;
    /** Numeric string forwarded as `epsilon`. */
    epsilon?: string;
    /** Numeric string forwarded as the gate's `minRunsPerArm`. */
    minRuns?: string;
    /** Also run the held-out verification; merge eligibility then requires it to say `keep`. */
    verifyHoldout?: boolean;
    /** Print the result as JSON instead of the human-readable report. */
    json?: boolean;
}
|
|
17
|
+
export declare function autoResearchEvaluate(options: AutoResearchEvaluateOptions): Promise<void>;
|
|
18
|
+
//# sourceMappingURL=evaluate.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluate.d.ts","sourceRoot":"","sources":["../../../src/commands/autoresearch/evaluate.ts"],"names":[],"mappings":"AA0BA,MAAM,WAAW,2BAA2B;IAC1C,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,IAAI,CAAC,EAAE,OAAO,CAAC;CAChB;AAQD,wBAAsB,oBAAoB,CAAC,OAAO,EAAE,2BAA2B,GAAG,OAAO,CAAC,IAAI,CAAC,CAoG9F"}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `cortex autoresearch evaluate` — headless invocation of the keep/discard gate.
|
|
3
|
+
*
|
|
4
|
+
* This is the entry point a swarm member (or the cortex-bench flow) calls AFTER
|
|
5
|
+
* it has recorded base-version and candidate-version benchmark runs to the
|
|
6
|
+
* matrix (router-matrix.jsonl) under the same experimentTag. It runs the full
|
|
7
|
+
* decision pipeline (regressionScan → Monte-Carlo gate → ledger.decide) and
|
|
8
|
+
* writes the audited verdict to `.cortex/experiments.jsonl`.
|
|
9
|
+
*
|
|
10
|
+
* The output JSONL is THE integration boundary: the nexus Layer-3 STDB module
|
|
11
|
+
* ingests it (header → `experiment`, results[] → `experiment_task_result`).
|
|
12
|
+
* Nothing downstream re-derives the statistics — the decision is final here.
|
|
13
|
+
*
|
|
14
|
+
* Overfitting guard: keep/discard reads split='train' records; `--verify-holdout`
|
|
15
|
+
* additionally runs the held-out gate (a candidate is only merge-eligible when
|
|
16
|
+
* kept-on-train AND verified-on-holdout).
|
|
17
|
+
*/
|
|
18
|
+
import { ModelRouterMatrix, ExperimentLedger, evaluateAutoResearchExperiment, verifyOnHoldout, } from '@nexus-cortex/core';
|
|
19
|
+
import { ThemeManager } from '../../themes/ThemeManager.js';
|
|
20
|
+
import { findProjectRoot } from '../config/utils.js';
|
|
21
|
+
/**
 * Parse an optional numeric CLI option.
 *
 * @param v Raw option value (normally a string) or `undefined`.
 * @returns The finite number `v` converts to, or `undefined` when `v` is missing,
 *   blank, or does not convert to a finite number.
 */
function numOrUndef(v) {
    if (v == null)
        return undefined;
    // Number('') and Number('   ') are 0, which would turn an empty option value
    // (e.g. an unset shell variable) into a real setting instead of "not provided".
    if (typeof v === 'string' && v.trim() === '')
        return undefined;
    const n = Number(v);
    return Number.isFinite(n) ? n : undefined;
}
|
|
27
|
+
/**
 * Handler for `cortex autoresearch evaluate`.
 *
 * Checks the four required options, runs `evaluateAutoResearchExperiment` (and, with
 * `options.verifyHoldout`, `verifyOnHoldout`), derives merge eligibility, then prints
 * either a JSON document (`options.json`) or a human-readable report. A missing
 * required option or any thrown error is reported on stderr followed by
 * `process.exit(1)`.
 *
 * @param options Parsed CLI options.
 * @returns A promise that resolves once the report has been printed.
 */
export async function autoResearchEvaluate(options) {
    const theme = ThemeManager.getTheme();
    const projectRoot = findProjectRoot();

    // Collect the flag of every required option that was not supplied.
    const missing = [];
    for (const [key, flag] of [
        ['experimentTag', '--experiment-tag'],
        ['base', '--base'],
        ['candidate', '--candidate'],
        ['branch', '--branch'],
    ]) {
        if (!options[key]) {
            missing.push(flag);
        }
    }
    if (missing.length > 0) {
        console.error(theme.colors.error(`Error: missing required option(s): ${missing.join(', ')}`));
        process.exit(1);
    }

    try {
        const matrix = new ModelRouterMatrix(projectRoot);
        const ledger = new ExperimentLedger(projectRoot);
        // The same gate object is shared by the train evaluation and the holdout check.
        const gate = {
            alpha: numOrUndef(options.alpha),
            seed: numOrUndef(options.seed),
            minRunsPerArm: numOrUndef(options.minRuns),
        };
        const familySize = numOrUndef(options.nFamily) ?? 1;
        const epsilon = numOrUndef(options.epsilon);

        const result = evaluateAutoResearchExperiment(matrix, ledger, {
            experimentTag: options.experimentTag,
            baseRef: options.base,
            candidateRef: options.candidate,
            branch: options.branch,
            deficiencyId: options.deficiencyId,
            benchmarkSource: options.benchmarkSource,
            modelId: options.modelId,
            nFamilyExperiments: familySize,
            gate,
            epsilon,
        });
        const { verdict, regressedTasks } = result;

        const holdout = options.verifyHoldout
            ? verifyOnHoldout(matrix, {
                baseRef: options.base,
                candidateRef: options.candidate,
                benchmarkSource: options.benchmarkSource,
                modelId: options.modelId,
                nFamilyExperiments: familySize,
                gate,
                epsilon,
            })
            : null;

        // Merge-eligibility: kept on train AND (if checked) verified on holdout.
        const keptOnTrain = verdict.decision === 'keep' && verdict.fwerAdjusted === true;
        const holdoutPassed = !options.verifyHoldout || holdout?.decision === 'keep';
        const mergeEligible = keptOnTrain && holdoutPassed;

        if (options.json) {
            const payload = {
                record: result.record,
                verdict,
                regressedTasks,
                holdoutVerdict: holdout,
                mergeEligible,
            };
            console.log(JSON.stringify(payload, null, 2));
            return;
        }

        // Pick the colour function for the decision (called unbound, as a plain function).
        let paintDecision;
        switch (verdict.decision) {
            case 'keep':
                paintDecision = theme.colors.success;
                break;
            case 'discard':
                paintDecision = theme.colors.error;
                break;
            default:
                paintDecision = theme.colors.muted;
        }

        console.log();
        console.log(` ${theme.colors.highlight('Experiment')} ${options.experimentTag} (${options.base} → ${options.candidate})`);
        console.log(` ${theme.colors.highlight('Decision')} ${paintDecision(verdict.decision.toUpperCase())}`);
        console.log(` ${theme.colors.highlight('Effect')} ${verdict.effect >= 0 ? '+' : ''}${verdict.effect} (95% CI [${verdict.ciLow ?? '—'}, ${verdict.ciHigh ?? '—'}])`);
        console.log(` ${theme.colors.highlight('p-value')} ${verdict.pValue ?? '—'} vs alpha_adj ${verdict.alphaAdjusted ?? '—'} (N=${familySize}, FWER ${verdict.fwerAdjusted ? 'on' : 'off'})`);
        console.log(` ${theme.colors.highlight('Runs/Tasks')} ${verdict.nRuns} runs over ${verdict.nTasks} task(s)`);
        if (regressedTasks.length > 0) {
            console.log(` ${theme.colors.error('Regressions')} ${regressedTasks.length} task(s): ${regressedTasks.join(', ')}`);
        }
        if (options.verifyHoldout) {
            let holdoutText;
            if (holdout) {
                holdoutText = `${holdout.decision.toUpperCase()} (effect ${holdout.effect >= 0 ? '+' : ''}${holdout.effect}, CI [${holdout.ciLow ?? '—'}, ${holdout.ciHigh ?? '—'}])`;
            }
            else {
                holdoutText = theme.colors.muted('no held-out evidence — unverifiable');
            }
            console.log(` ${theme.colors.highlight('Holdout')} ${holdoutText}`);
        }
        console.log(` ${theme.colors.highlight('Mergeable')} ${mergeEligible ? theme.colors.success('YES') : theme.colors.muted('no')}`);
        console.log(theme.colors.muted(` Recorded → ${projectRoot}/.cortex/experiments.jsonl`));
        console.log();
        console.log(theme.colors.muted(` ${verdict.reason}`));
        console.log();
    }
    catch (error) {
        console.error(theme.colors.error(`Error: ${error.message}`));
        process.exit(1);
    }
}
|
|
117
|
+
//# sourceMappingURL=evaluate.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluate.js","sourceRoot":"","sources":["../../../src/commands/autoresearch/evaluate.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AACH,OAAO,EACL,iBAAiB,EACjB,gBAAgB,EAChB,8BAA8B,EAC9B,eAAe,GAChB,MAAM,oBAAoB,CAAC;AAC5B,OAAO,EAAE,YAAY,EAAE,MAAM,8BAA8B,CAAC;AAC5D,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AAmBrD,SAAS,UAAU,CAAC,CAAqB;IACvC,IAAI,CAAC,KAAK,SAAS;QAAE,OAAO,SAAS,CAAC;IACtC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;IACpB,OAAO,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;AAC5C,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,OAAoC;IAC7E,MAAM,KAAK,GAAG,YAAY,CAAC,QAAQ,EAAE,CAAC;IACtC,MAAM,WAAW,GAAG,eAAe,EAAE,CAAC;IAEtC,MAAM,QAAQ,GAAuD;QACnE,CAAC,eAAe,EAAE,kBAAkB,CAAC;QACrC,CAAC,MAAM,EAAE,QAAQ,CAAC;QAClB,CAAC,WAAW,EAAE,aAAa,CAAC;QAC5B,CAAC,QAAQ,EAAE,UAAU,CAAC;KACvB,CAAC;IACF,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;IAC9E,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,sCAAsC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;QAC9F,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;IAED,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,iBAAiB,CAAC,WAAW,CAAC,CAAC;QAClD,MAAM,MAAM,GAAG,IAAI,gBAAgB,CAAC,WAAW,CAAC,CAAC;QAEjD,MAAM,IAAI,GAAG;YACX,KAAK,EAAE,UAAU,CAAC,OAAO,CAAC,KAAK,CAAC;YAChC,IAAI,EAAE,UAAU,CAAC,OAAO,CAAC,IAAI,CAAC;YAC9B,aAAa,EAAE,UAAU,CAAC,OAAO,CAAC,OAAO,CAAC;SAC3C,CAAC;QAEF,MAAM,MAAM,GAAG,8BAA8B,CAAC,MAAM,EAAE,MAAM,EAAE;YAC5D,aAAa,EAAE,OAAO,CAAC,aAAc;YACrC,OAAO,EAAE,OAAO,CAAC,IAAK;YACtB,YAAY,EAAE,OAAO,CAAC,SAAU;YAChC,MAAM,EAAE,OAAO,CAAC,MAAO;YACvB,YAAY,EAAE,OAAO,CAAC,YAAY;YAClC,eAAe,EAAE,OAAO,CAAC,eAAe;YACxC,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,kBAAkB,EAAE,UAAU,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC;YACpD,IAAI;YACJ,OAAO,EAAE,UAAU,CAAC,OAAO,CAAC,OAAO,CAAC;SACrC,CAAC,CAAC;QAEH,IAAI,OAAO,GAAG,IAAI,CAAC;QACnB,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;YAC1B,OAAO,GAAG,eAAe,CAAC,MAAM,
EAAE;gBAChC,OAAO,EAAE,OAAO,CAAC,IAAK;gBACtB,YAAY,EAAE,OAAO,CAAC,SAAU;gBAChC,eAAe,EAAE,OAAO,CAAC,eAAe;gBACxC,OAAO,EAAE,OAAO,CAAC,OAAO;gBACxB,kBAAkB,EAAE,UAAU,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC;gBACpD,IAAI;gBACJ,OAAO,EAAE,UAAU,CAAC,OAAO,CAAC,OAAO,CAAC;aACrC,CAAC,CAAC;QACL,CAAC;QAED,yEAAyE;QACzE,MAAM,aAAa,GACjB,MAAM,CAAC,OAAO,CAAC,QAAQ,KAAK,MAAM;YAClC,MAAM,CAAC,OAAO,CAAC,YAAY,KAAK,IAAI;YACpC,CAAC,CAAC,OAAO,CAAC,aAAa,IAAI,OAAO,EAAE,QAAQ,KAAK,MAAM,CAAC,CAAC;QAE3D,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC;YACjB,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC;gBACzB,MAAM,EAAE,MAAM,CAAC,MAAM;gBACrB,OAAO,EAAE,MAAM,CAAC,OAAO;gBACvB,cAAc,EAAE,MAAM,CAAC,cAAc;gBACrC,cAAc,EAAE,OAAO;gBACvB,aAAa;aACd,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC;YACb,OAAO;QACT,CAAC;QAED,MAAM,CAAC,GAAG,MAAM,CAAC,OAAO,CAAC;QACzB,MAAM,QAAQ,GACZ,CAAC,CAAC,QAAQ,KAAK,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO;YAC5C,CAAC,CAAC,CAAC,CAAC,QAAQ,KAAK,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK;gBAC/C,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC;QAEvB,OAAO,CAAC,GAAG,EAAE,CAAC;QACd,OAAO,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,MAAM,OAAO,CAAC,aAAa,MAAM,OAAO,CAAC,IAAI,MAAM,OAAO,CAAC,SAAS,GAAG,CAAC,CAAC;QAC7H,OAAO,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,UAAU,CAAC,QAAQ,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC,CAAC;QAChG,OAAO,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,MAAM,cAAc,CAAC,CAAC,KAAK,IAAI,GAAG,KAAK,CAAC,CAAC,MAAM,IAAI,GAAG,IAAI,CAAC,CAAC;QACnJ,OAAO,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC,MAAM,IAAI,GAAG,kBAAkB,CAAC,CAAC,aAAa,IAAI,GAAG,QAAQ,UAAU,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC;QACrM,OAAO,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,KAAK,cAAc,CAAC,CAAC,MAAM,UAAU,CAAC,CAAC;QACnG,IAAI,MAAM,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACrC,OAAO,CAAC,GAAG,CAAC,IAAI,KAA
K,CAAC,MAAM,CAAC,KAAK,CAAC,aAAa,CAAC,KAAK,MAAM,CAAC,cAAc,CAAC,MAAM,aAAa,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACrI,CAAC;QACD,IAAI,OAAO,CAAC,aAAa,EAAE,CAAC;YAC1B,MAAM,EAAE,GAAG,OAAO;gBAChB,CAAC,CAAC,GAAG,OAAO,CAAC,QAAQ,CAAC,WAAW,EAAE,YAAY,OAAO,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,GAAG,OAAO,CAAC,MAAM,SAAS,OAAO,CAAC,KAAK,IAAI,GAAG,KAAK,OAAO,CAAC,MAAM,IAAI,GAAG,IAAI;gBACzJ,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,qCAAqC,CAAC,CAAC;YAC9D,OAAO,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,SAAS,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;QAClE,CAAC;QACD,OAAO,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,CAAC,SAAS,CAAC,WAAW,CAAC,OAAO,aAAa,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACpI,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,eAAe,WAAW,4BAA4B,CAAC,CAAC,CAAC;QACxF,OAAO,CAAC,GAAG,EAAE,CAAC;QACd,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QAChD,OAAO,CAAC,GAAG,EAAE,CAAC;IAEhB,CAAC;IAAC,OAAO,KAAU,EAAE,CAAC;QACpB,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,UAAU,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAC7D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IAClB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
 * Options for `autoResearchExperiment` (`cortex autoresearch experiment`).
 * Numeric settings are carried as strings.
 */
export interface AutoResearchExperimentOptions {
    /** Required (`--experiment-tag`). */
    experimentTag?: string;
    /** Required (`--candidate-dir`). */
    candidateDir?: string;
    /** Defaults to the project root when omitted. */
    baseDir?: string;
    /** Required (`--task-set`). */
    taskSet?: string;
    holdoutSet?: string;
    branch?: string;
    nFamily?: string;
    runs?: string;
    model?: string;
    deficiencyId?: string;
    benchmarkSource?: string;
    baseRef?: string;
    candidateRef?: string;
    buildBase?: boolean;
    noBuild?: boolean;
    basePort?: string;
    candidatePort?: string;
    /** Defaults to the project root when omitted. */
    cortexDir?: string;
    seed?: string;
    alpha?: string;
    epsilon?: string;
    minRuns?: string;
    /** When set, the handler's progress log lines are not printed. */
    json?: boolean;
    /** Non-cortex target: grade a shell command per task (with --build-cmd/--accept-exit)
     * instead of building+serving a cortex server. Both arms use the same command; the
     * base/candidate difference is the worktree the command runs in. */
    runCmd?: string;
    buildCmd?: string;
    acceptExit?: string;
    /** Effectiveness-arm labels recorded on both base + candidate records (shared dispatch
     * config; the experiment isolates the harness-version variable). Fall back to the
     * CORTEX_SUBAGENT_TEMPERATURE / CORTEX_ARM_STRATEGY env stamp when omitted. */
    temperature?: string;
    strategy?: string;
}
|
|
37
|
+
export declare function autoResearchExperiment(options: AutoResearchExperimentOptions): Promise<void>;
|
|
38
|
+
//# sourceMappingURL=experiment.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"experiment.d.ts","sourceRoot":"","sources":["../../../src/commands/autoresearch/experiment.ts"],"names":[],"mappings":"AA+BA,MAAM,WAAW,6BAA6B;IAC5C,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,CAAC,EAAE,OAAO,CAAC;IACf;;yEAEqE;IACrE,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB;;oFAEgF;IAChF,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAkBD,wBAAsB,sBAAsB,CAAC,OAAO,EAAE,6BAA6B,GAAG,OAAO,CAAC,IAAI,CAAC,CAwHlG"}
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `cortex autoresearch experiment` — the full single-experiment lifecycle.
|
|
3
|
+
*
|
|
4
|
+
* Builds the candidate (and optionally base) checkout, serves each on its own
|
|
5
|
+
* isolated port, benches both arms (train + optional holdout) into ONE shared
|
|
6
|
+
* `.cortex` store, runs the keep/discard gate + held-out verification, and emits
|
|
7
|
+
* the audited verdict + the JSONL artifact a downstream ingest consumes.
|
|
8
|
+
*
|
|
9
|
+
* This is the piece that owns the "two builds, not one relabel" correctness: each
|
|
10
|
+
* arm is served by a server BUILT FROM ITS OWN CODE, so the comparison is real.
|
|
11
|
+
* Servers run with MODEL_ROUTER_RECORD off; only the bench's graded records land
|
|
12
|
+
* in the shared store. Teardown is guaranteed in `finally`.
|
|
13
|
+
*
|
|
14
|
+
* Scope: this is a HARNESS-CODE experiment (base build vs candidate build,
|
|
15
|
+
* compared by git SHA). Model/config experiments (same code, different --model)
|
|
16
|
+
* use the lower-level `bench` + `evaluate` directly.
|
|
17
|
+
*/
|
|
18
|
+
import { readdirSync, readFileSync, statSync } from 'node:fs';
|
|
19
|
+
import { join, basename } from 'node:path';
|
|
20
|
+
import { ModelRouterMatrix, ExperimentLedger, runExperiment, parseTaskSet, } from '@nexus-cortex/core';
|
|
21
|
+
import { ThemeManager } from '../../themes/ThemeManager.js';
|
|
22
|
+
import { findProjectRoot } from '../config/utils.js';
|
|
23
|
+
import { freePort, gitShortSha, CortexTarget } from './harnessProcess.js';
|
|
24
|
+
import { CommandTarget } from './commandRunner.js';
|
|
25
|
+
/**
 * Coerce a raw CLI option value to a finite number.
 *
 * @param {unknown} v - Raw option value (typically a string from the CLI parser).
 * @returns {number|undefined} The finite numeric value, or `undefined` when the
 *   value is absent (`undefined`/`null`), a blank string, or not a finite number.
 */
function num(v) {
    // `Number(null)` and `Number('')` both evaluate to 0. Treat "absent" and
    // "blank" as "not provided" so callers' `?? fallback` defaults apply instead
    // of silently receiving 0.
    if (v === undefined || v === null)
        return undefined;
    if (typeof v === 'string' && v.trim() === '')
        return undefined;
    const n = Number(v);
    return Number.isFinite(n) ? n : undefined;
}
|
|
31
|
+
/**
 * Load tasks from a single JSON file, or from every `*.json` entry directly
 * inside a directory (non-recursive).
 *
 * Each file is parsed as JSON and handed to `parseTaskSet(parsed, filePath)`;
 * the results are concatenated into one flat array.
 *
 * @param {string} p - Path to a JSON file or to a directory of JSON files.
 * @returns {Array} All tasks, in file-name order when `p` is a directory.
 * @throws {Error} Filesystem errors from `statSync`/`readdirSync`/`readFileSync`.
 * @throws {SyntaxError} When a file is not valid JSON; the message is prefixed
 *   with the offending file path.
 */
function loadTasks(p) {
    const files = statSync(p).isDirectory()
        ? readdirSync(p)
            .filter(name => name.endsWith('.json'))
            // Directory listing order is platform-dependent; sort so the task
            // order (and anything derived from it) is reproducible.
            .sort()
            .map(name => join(p, name))
        : [p];
    const tasks = [];
    for (const file of files) {
        let parsed;
        try {
            parsed = JSON.parse(readFileSync(file, 'utf8'));
        }
        catch (error) {
            // JSON.parse errors carry no file name; add it so the user can tell
            // which task file is malformed. The error type is preserved.
            if (error instanceof SyntaxError)
                error.message = `${file}: ${error.message}`;
            throw error;
        }
        for (const task of parseTaskSet(parsed, file))
            tasks.push(task);
    }
    return tasks;
}
|
|
41
|
+
/**
 * Parse a comma-separated list of exit codes (e.g. `"0,2"`) into integers.
 * Blank segments are dropped first because `Number('')` is 0, which would
 * otherwise make a trailing comma silently whitelist exit code 0.
 */
function parseAcceptExitCodes(spec) {
    return String(spec)
        .split(',')
        .map(s => s.trim())
        .filter(s => s !== '')
        .map(s => Number(s))
        .filter(n => Number.isInteger(n));
}
/** Best-effort human-readable message for any thrown value (Error or not). */
function describeError(error) {
    return error?.message ?? String(error);
}
/**
 * Stop every prepared arm, isolating failures so one arm's failing `stop()`
 * cannot prevent the remaining arms from being torn down.
 */
async function stopArms(arms, onError) {
    for (const arm of arms) {
        try {
            // `await` is a no-op for a synchronous stop() and surfaces rejections
            // if stop() returns a promise.
            await arm.stop();
        }
        catch (error) {
            onError(error);
        }
    }
}
/**
 * `cortex autoresearch experiment` entry point.
 *
 * Validates the required options, loads the train (and optional holdout) task
 * sets, prepares a base arm and a candidate arm through the selected target,
 * runs `runExperiment` against a matrix/ledger rooted at `cortexDir`, and
 * prints the result either as JSON (`options.json`) or as a themed summary.
 *
 * Options read: `json`, `experimentTag`, `candidateDir`, `taskSet` (required);
 * `baseDir`, `cortexDir` (both default to the project root), `holdoutSet`,
 * `baseRef`, `candidateRef`, `branch`, `model`, `runCmd`, `buildCmd`,
 * `acceptExit`, `basePort`, `candidatePort`, `noBuild`, `buildBase`, `runs`,
 * `nFamily`, `temperature`, `strategy`, `deficiencyId`, `benchmarkSource`,
 * `alpha`, `seed`, `minRuns`, `epsilon`.
 *
 * @param {object} options - Parsed CLI options (string-valued unless boolean).
 * @returns {Promise<void>} Resolves after output is written and arms are stopped.
 *   Calls `process.exit(1)` on invalid input; sets `process.exitCode = 1` when
 *   the experiment itself throws.
 */
export async function autoResearchExperiment(options) {
    const theme = ThemeManager.getTheme();
    const json = !!options.json;
    // Progress lines are suppressed in JSON mode so stdout stays machine-readable.
    const log = (m) => {
        if (!json)
            console.log(theme.colors.muted(` ${m}`));
    };
    const projectRoot = findProjectRoot();
    const candidateDir = options.candidateDir;
    const baseDir = options.baseDir ?? projectRoot;
    const cortexDir = options.cortexDir ?? projectRoot;
    const missing = [];
    if (!options.experimentTag)
        missing.push('--experiment-tag');
    if (!candidateDir)
        missing.push('--candidate-dir');
    if (!options.taskSet)
        missing.push('--task-set');
    if (missing.length) {
        console.error(theme.colors.error(`Error: missing ${missing.join(', ')}`));
        process.exit(1);
    }
    // Every arm that was successfully prepared is recorded here so the
    // `finally` block can stop it even when a later step throws.
    const arms = [];
    try {
        const trainTasks = loadTasks(options.taskSet);
        const holdoutTasks = options.holdoutSet ? loadTasks(options.holdoutSet) : undefined;
        if (trainTasks.length === 0) {
            console.error(theme.colors.error('Error: empty --task-set'));
            process.exit(1);
        }
        // Distinct arm labels: git SHA when the dir is a checkout, else its basename.
        const refFor = (dir, override) => {
            if (override)
                return override;
            const sha = gitShortSha(dir);
            return sha !== 'unknown' ? sha : basename(dir);
        };
        const baseRef = refFor(baseDir, options.baseRef);
        const candidateRef = refFor(candidateDir, options.candidateRef);
        if (baseRef === candidateRef) {
            console.error(theme.colors.error(`Error: base and candidate resolve to the same ref (${baseRef}) — an experiment needs two distinct arms. Pass --base-ref/--candidate-ref to label, or use a real candidate worktree.`));
            process.exit(1);
        }
        const model = options.model ?? process.env.DEFAULT_MODEL_ID;
        const branch = options.branch ?? candidateRef;
        const nFamily = num(options.nFamily) ?? 1;
        // Select the target: a shell-command target (any project) or the default cortex server.
        const target = options.runCmd
            ? new CommandTarget({
                template: options.runCmd,
                buildCmd: options.buildCmd,
                acceptExitCodes: parseAcceptExitCodes(options.acceptExit ?? '0'),
            })
            : new CortexTarget();
        if (!json) {
            console.log();
            console.log(` ${theme.colors.highlight('Experiment')} ${options.experimentTag} ${baseRef} → ${candidateRef} [${target.kind}]`);
        }
        // Prepare each arm (build if asked + start its runner). Candidate builds unless
        // --no-build; base builds only with --build-base. Each arm gets its own reserved port
        // (server targets bind it; command targets ignore it).
        const basePort = num(options.basePort) ?? await freePort();
        const candPort = num(options.candidatePort) ?? await freePort();
        const baseArm = await target.prepare(baseDir, { port: basePort, model, build: !options.noBuild && !!options.buildBase, log });
        arms.push(baseArm);
        const candArm = await target.prepare(candidateDir, { port: candPort, model, build: !options.noBuild, log });
        arms.push(candArm);
        // Bench both arms + gate (shared store at cortexDir/.cortex).
        const matrix = new ModelRouterMatrix(cortexDir);
        const ledger = new ExperimentLedger(cortexDir);
        const result = await runExperiment(matrix, ledger, {
            baseRunner: baseArm.runner,
            candidateRunner: candArm.runner,
        }, {
            experimentTag: options.experimentTag,
            baseRef, candidateRef,
            branch,
            trainTasks, holdoutTasks,
            runs: num(options.runs),
            nFamily,
            modelId: model,
            temperature: num(options.temperature),
            strategy: options.strategy,
            deficiencyId: options.deficiencyId,
            benchmarkSource: options.benchmarkSource,
            gate: { alpha: num(options.alpha), seed: num(options.seed), minRunsPerArm: num(options.minRuns) },
            epsilon: num(options.epsilon),
            onProgress: log,
        });
        const out = {
            experimentTag: options.experimentTag,
            baseRef, candidateRef, branch,
            verdict: result.verdict,
            holdoutVerdict: result.holdoutVerdict,
            regressedTasks: result.regressedTasks,
            mergeEligible: result.mergeEligible,
            benchSummaries: result.benchSummaries,
            cortexDir,
            jsonlPaths: {
                matrix: join(cortexDir, '.cortex', 'router-matrix.jsonl'),
                experiments: join(cortexDir, '.cortex', 'experiments.jsonl'),
                backlog: join(cortexDir, '.cortex', 'research-backlog.jsonl'),
            },
        };
        if (json) {
            console.log(JSON.stringify(out, null, 2));
            return;
        }
        const v = result.verdict;
        const dc = v.decision === 'keep' ? theme.colors.success : v.decision === 'discard' ? theme.colors.error : theme.colors.muted;
        console.log();
        console.log(` ${theme.colors.highlight('Decision')} ${dc(v.decision.toUpperCase())} effect ${v.effect >= 0 ? '+' : ''}${v.effect} CI [${v.ciLow ?? '—'}, ${v.ciHigh ?? '—'}] p=${v.pValue ?? '—'} vs ${v.alphaAdjusted ?? '—'} (N=${nFamily})`);
        console.log(` ${theme.colors.highlight('Holdout')} ${result.holdoutVerdict ? result.holdoutVerdict.decision.toUpperCase() + ` (effect ${result.holdoutVerdict.effect >= 0 ? '+' : ''}${result.holdoutVerdict.effect})` : theme.colors.muted('not provided → not verifiable')}`);
        if (result.regressedTasks.length)
            console.log(` ${theme.colors.error('Regressions')} ${result.regressedTasks.length}: ${result.regressedTasks.join(', ')}`);
        console.log(` ${theme.colors.highlight('Mergeable')} ${result.mergeEligible ? theme.colors.success('YES') : theme.colors.muted('no')}`);
        console.log(theme.colors.muted(` artifact → ${cortexDir}/.cortex/{router-matrix,experiments,research-backlog}.jsonl`));
        console.log();
    }
    catch (error) {
        // Thrown values are not guaranteed to be Error instances; never print "undefined".
        const message = describeError(error);
        if (json)
            console.log(JSON.stringify({ error: message }, null, 2));
        else
            console.error(theme.colors.error(`Error: ${message}`));
        process.exitCode = 1;
    }
    finally {
        // Teardown must reach every arm: report a failing stop() on stderr (keeps
        // JSON stdout clean) and continue with the next arm.
        await stopArms(arms, (error) => {
            console.error(theme.colors.error(`Warning: failed to stop arm: ${describeError(error)}`));
        });
    }
}
|
|
168
|
+
//# sourceMappingURL=experiment.js.map
|