@roackb2/heddle 0.0.36 → 0.0.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/examples/repo-investigator.js +1 -2
- package/dist/examples/repo-investigator.js.map +1 -1
- package/dist/src/cli/ask.d.ts.map +1 -1
- package/dist/src/cli/ask.js +11 -0
- package/dist/src/cli/ask.js.map +1 -1
- package/dist/src/cli/chat/App.d.ts.map +1 -1
- package/dist/src/cli/chat/App.js +123 -93
- package/dist/src/cli/chat/App.js.map +1 -1
- package/dist/src/cli/chat/components/ModelPickerPanel.d.ts +2 -1
- package/dist/src/cli/chat/components/ModelPickerPanel.d.ts.map +1 -1
- package/dist/src/cli/chat/components/ModelPickerPanel.js +8 -4
- package/dist/src/cli/chat/components/ModelPickerPanel.js.map +1 -1
- package/dist/src/cli/chat/components/PromptInput.d.ts +5 -0
- package/dist/src/cli/chat/components/PromptInput.d.ts.map +1 -1
- package/dist/src/cli/chat/components/PromptInput.js +131 -85
- package/dist/src/cli/chat/components/PromptInput.js.map +1 -1
- package/dist/src/cli/chat/debug/tui-debug-snapshot.d.ts +2 -1
- package/dist/src/cli/chat/debug/tui-debug-snapshot.d.ts.map +1 -1
- package/dist/src/cli/chat/debug/tui-debug-snapshot.js +14 -1
- package/dist/src/cli/chat/debug/tui-debug-snapshot.js.map +1 -1
- package/dist/src/cli/chat/hooks/tui-agent-turn-lifecycle.d.ts +6 -0
- package/dist/src/cli/chat/hooks/tui-agent-turn-lifecycle.d.ts.map +1 -0
- package/dist/src/cli/chat/hooks/tui-agent-turn-lifecycle.js +38 -0
- package/dist/src/cli/chat/hooks/tui-agent-turn-lifecycle.js.map +1 -0
- package/dist/src/cli/chat/hooks/tui-agent-turn-result.d.ts +48 -0
- package/dist/src/cli/chat/hooks/tui-agent-turn-result.d.ts.map +1 -0
- package/dist/src/cli/chat/hooks/tui-agent-turn-result.js +171 -0
- package/dist/src/cli/chat/hooks/tui-agent-turn-result.js.map +1 -0
- package/dist/src/cli/chat/hooks/tui-compaction-status.d.ts +24 -0
- package/dist/src/cli/chat/hooks/tui-compaction-status.d.ts.map +1 -0
- package/dist/src/cli/chat/hooks/tui-compaction-status.js +65 -0
- package/dist/src/cli/chat/hooks/tui-compaction-status.js.map +1 -0
- package/dist/src/cli/chat/hooks/tui-direct-shell-result.d.ts +28 -0
- package/dist/src/cli/chat/hooks/tui-direct-shell-result.d.ts.map +1 -0
- package/dist/src/cli/chat/hooks/tui-direct-shell-result.js +41 -0
- package/dist/src/cli/chat/hooks/tui-direct-shell-result.js.map +1 -0
- package/dist/src/cli/chat/hooks/tui-direct-shell.d.ts +22 -0
- package/dist/src/cli/chat/hooks/tui-direct-shell.d.ts.map +1 -0
- package/dist/src/cli/chat/hooks/tui-direct-shell.js +127 -0
- package/dist/src/cli/chat/hooks/tui-direct-shell.js.map +1 -0
- package/dist/src/cli/chat/hooks/tui-drift-observer.d.ts +13 -0
- package/dist/src/cli/chat/hooks/tui-drift-observer.d.ts.map +1 -0
- package/dist/src/cli/chat/hooks/tui-drift-observer.js +36 -0
- package/dist/src/cli/chat/hooks/tui-drift-observer.js.map +1 -0
- package/dist/src/cli/chat/hooks/tui-ordinary-turn.d.ts +31 -0
- package/dist/src/cli/chat/hooks/tui-ordinary-turn.d.ts.map +1 -0
- package/dist/src/cli/chat/hooks/tui-ordinary-turn.js +82 -0
- package/dist/src/cli/chat/hooks/tui-ordinary-turn.js.map +1 -0
- package/dist/src/cli/chat/hooks/tui-run-loop-events.d.ts +23 -0
- package/dist/src/cli/chat/hooks/tui-run-loop-events.d.ts.map +1 -0
- package/dist/src/cli/chat/hooks/tui-run-loop-events.js +74 -0
- package/dist/src/cli/chat/hooks/tui-run-loop-events.js.map +1 -0
- package/dist/src/cli/chat/hooks/tui-tool-approval.d.ts +8 -0
- package/dist/src/cli/chat/hooks/tui-tool-approval.d.ts.map +1 -0
- package/dist/src/cli/chat/hooks/tui-tool-approval.js +37 -0
- package/dist/src/cli/chat/hooks/tui-tool-approval.js.map +1 -0
- package/dist/src/cli/chat/hooks/useAgentRun.d.ts +1 -1
- package/dist/src/cli/chat/hooks/useAgentRun.d.ts.map +1 -1
- package/dist/src/cli/chat/hooks/useAgentRun.js +59 -551
- package/dist/src/cli/chat/hooks/useAgentRun.js.map +1 -1
- package/dist/src/cli/chat/hooks/useApprovalFlow.d.ts.map +1 -1
- package/dist/src/cli/chat/hooks/useApprovalFlow.js +3 -3
- package/dist/src/cli/chat/hooks/useApprovalFlow.js.map +1 -1
- package/dist/src/cli/chat/hooks/useChatPickers.d.ts +6 -3
- package/dist/src/cli/chat/hooks/useChatPickers.d.ts.map +1 -1
- package/dist/src/cli/chat/hooks/useChatPickers.js +17 -4
- package/dist/src/cli/chat/hooks/useChatPickers.js.map +1 -1
- package/dist/src/cli/chat/hooks/useChatSessions.d.ts +1 -1
- package/dist/src/cli/chat/hooks/useChatSessions.d.ts.map +1 -1
- package/dist/src/cli/chat/hooks/useChatSessions.js +23 -20
- package/dist/src/cli/chat/hooks/useChatSessions.js.map +1 -1
- package/dist/src/cli/chat/hooks/useChatStatusSummary.d.ts +58 -0
- package/dist/src/cli/chat/hooks/useChatStatusSummary.d.ts.map +1 -0
- package/dist/src/cli/chat/hooks/useChatStatusSummary.js +85 -0
- package/dist/src/cli/chat/hooks/useChatStatusSummary.js.map +1 -0
- package/dist/src/cli/chat/hooks/usePromptSubmission.d.ts +2 -1
- package/dist/src/cli/chat/hooks/usePromptSubmission.d.ts.map +1 -1
- package/dist/src/cli/chat/hooks/usePromptSubmission.js +5 -1
- package/dist/src/cli/chat/hooks/usePromptSubmission.js.map +1 -1
- package/dist/src/cli/chat/state/local-commands.d.ts +2 -0
- package/dist/src/cli/chat/state/local-commands.d.ts.map +1 -1
- package/dist/src/cli/chat/state/local-commands.js +17 -3
- package/dist/src/cli/chat/state/local-commands.js.map +1 -1
- package/dist/src/cli/chat/submit.d.ts.map +1 -1
- package/dist/src/cli/chat/submit.js +13 -1
- package/dist/src/cli/chat/submit.js.map +1 -1
- package/dist/src/cli/chat/utils/format.d.ts.map +1 -1
- package/dist/src/cli/chat/utils/format.js +21 -0
- package/dist/src/cli/chat/utils/format.js.map +1 -1
- package/dist/src/cli/chat/utils/runtime.d.ts.map +1 -1
- package/dist/src/cli/chat/utils/runtime.js +5 -3
- package/dist/src/cli/chat/utils/runtime.js.map +1 -1
- package/dist/src/cli/eval/index.d.ts +24 -0
- package/dist/src/cli/eval/index.d.ts.map +1 -0
- package/dist/src/cli/eval/index.js +232 -0
- package/dist/src/cli/eval/index.js.map +1 -0
- package/dist/src/cli/main.js +25 -3
- package/dist/src/cli/main.js.map +1 -1
- package/dist/src/cli/remote/control-plane-client.d.ts +5 -1
- package/dist/src/cli/remote/control-plane-client.d.ts.map +1 -1
- package/dist/src/core/agent/mutation-tracking.d.ts +0 -7
- package/dist/src/core/agent/mutation-tracking.d.ts.map +1 -1
- package/dist/src/core/agent/mutation-tracking.js +5 -63
- package/dist/src/core/agent/mutation-tracking.js.map +1 -1
- package/dist/src/core/agent/post-mutation.d.ts +2 -2
- package/dist/src/core/agent/post-mutation.d.ts.map +1 -1
- package/dist/src/core/agent/post-mutation.js +5 -20
- package/dist/src/core/agent/post-mutation.js.map +1 -1
- package/dist/src/core/agent/progress-reminders.d.ts +1 -4
- package/dist/src/core/agent/progress-reminders.d.ts.map +1 -1
- package/dist/src/core/agent/progress-reminders.js +4 -56
- package/dist/src/core/agent/progress-reminders.js.map +1 -1
- package/dist/src/core/agent/run-agent.d.ts.map +1 -1
- package/dist/src/core/agent/run-agent.js +3 -103
- package/dist/src/core/agent/run-agent.js.map +1 -1
- package/dist/src/core/agent/tool-dispatch.js +3 -2
- package/dist/src/core/agent/tool-dispatch.js.map +1 -1
- package/dist/src/core/chat/ordinary-turn.d.ts +34 -0
- package/dist/src/core/chat/ordinary-turn.d.ts.map +1 -0
- package/dist/src/core/chat/ordinary-turn.js +274 -0
- package/dist/src/core/chat/ordinary-turn.js.map +1 -0
- package/dist/src/core/chat/session-submit.d.ts +4 -4
- package/dist/src/core/chat/session-submit.d.ts.map +1 -1
- package/dist/src/core/chat/session-submit.js +22 -282
- package/dist/src/core/chat/session-submit.js.map +1 -1
- package/dist/src/core/chat/session-title.d.ts +15 -0
- package/dist/src/core/chat/session-title.d.ts.map +1 -0
- package/dist/src/core/chat/session-title.js +17 -0
- package/dist/src/core/chat/session-title.js.map +1 -0
- package/dist/src/core/chat/session-turn-preflight.d.ts +37 -0
- package/dist/src/core/chat/session-turn-preflight.d.ts.map +1 -0
- package/dist/src/core/chat/session-turn-preflight.js +43 -0
- package/dist/src/core/chat/session-turn-preflight.js.map +1 -0
- package/dist/src/core/chat/session-turn-result.d.ts +36 -0
- package/dist/src/core/chat/session-turn-result.d.ts.map +1 -0
- package/dist/src/core/chat/session-turn-result.js +60 -0
- package/dist/src/core/chat/session-turn-result.js.map +1 -0
- package/dist/src/core/chat/tool-approval-host.d.ts +22 -0
- package/dist/src/core/chat/tool-approval-host.d.ts.map +1 -0
- package/dist/src/core/chat/tool-approval-host.js +14 -0
- package/dist/src/core/chat/tool-approval-host.js.map +1 -0
- package/dist/src/core/chat/turn-host.d.ts +25 -0
- package/dist/src/core/chat/turn-host.d.ts.map +1 -0
- package/dist/src/core/chat/turn-host.js +2 -0
- package/dist/src/core/chat/turn-host.js.map +1 -0
- package/dist/src/core/chat/types.d.ts +1 -0
- package/dist/src/core/chat/types.d.ts.map +1 -1
- package/dist/src/core/config.d.ts +1 -1
- package/dist/src/core/config.d.ts.map +1 -1
- package/dist/src/core/config.js +1 -1
- package/dist/src/core/config.js.map +1 -1
- package/dist/src/core/eval/agent-runner.d.ts +24 -0
- package/dist/src/core/eval/agent-runner.d.ts.map +1 -0
- package/dist/src/core/eval/agent-runner.js +151 -0
- package/dist/src/core/eval/agent-runner.js.map +1 -0
- package/dist/src/core/eval/case-loader.d.ts +7 -0
- package/dist/src/core/eval/case-loader.d.ts.map +1 -0
- package/dist/src/core/eval/case-loader.js +34 -0
- package/dist/src/core/eval/case-loader.js.map +1 -0
- package/dist/src/core/eval/check-runner.d.ts +8 -0
- package/dist/src/core/eval/check-runner.d.ts.map +1 -0
- package/dist/src/core/eval/check-runner.js +33 -0
- package/dist/src/core/eval/check-runner.js.map +1 -0
- package/dist/src/core/eval/cleanup.d.ts +20 -0
- package/dist/src/core/eval/cleanup.d.ts.map +1 -0
- package/dist/src/core/eval/cleanup.js +42 -0
- package/dist/src/core/eval/cleanup.js.map +1 -0
- package/dist/src/core/eval/git-artifacts.d.ts +26 -0
- package/dist/src/core/eval/git-artifacts.d.ts.map +1 -0
- package/dist/src/core/eval/git-artifacts.js +211 -0
- package/dist/src/core/eval/git-artifacts.js.map +1 -0
- package/dist/src/core/eval/process.d.ts +22 -0
- package/dist/src/core/eval/process.d.ts.map +1 -0
- package/dist/src/core/eval/process.js +65 -0
- package/dist/src/core/eval/process.js.map +1 -0
- package/dist/src/core/eval/progress.d.ts +28 -0
- package/dist/src/core/eval/progress.d.ts.map +1 -0
- package/dist/src/core/eval/progress.js +94 -0
- package/dist/src/core/eval/progress.js.map +1 -0
- package/dist/src/core/eval/report-writer.d.ts +7 -0
- package/dist/src/core/eval/report-writer.d.ts.map +1 -0
- package/dist/src/core/eval/report-writer.js +159 -0
- package/dist/src/core/eval/report-writer.js.map +1 -0
- package/dist/src/core/eval/schema.d.ts +206 -0
- package/dist/src/core/eval/schema.d.ts.map +1 -0
- package/dist/src/core/eval/schema.js +104 -0
- package/dist/src/core/eval/schema.js.map +1 -0
- package/dist/src/core/eval/trace-analyzer.d.ts +6 -0
- package/dist/src/core/eval/trace-analyzer.d.ts.map +1 -0
- package/dist/src/core/eval/trace-analyzer.js +106 -0
- package/dist/src/core/eval/trace-analyzer.js.map +1 -0
- package/dist/src/core/eval/workspace-fixture.d.ts +14 -0
- package/dist/src/core/eval/workspace-fixture.d.ts.map +1 -0
- package/dist/src/core/eval/workspace-fixture.js +235 -0
- package/dist/src/core/eval/workspace-fixture.js.map +1 -0
- package/dist/src/core/llm/model-policy.d.ts +26 -0
- package/dist/src/core/llm/model-policy.d.ts.map +1 -1
- package/dist/src/core/llm/model-policy.js +47 -0
- package/dist/src/core/llm/model-policy.js.map +1 -1
- package/dist/src/core/memory/domain-prompt.d.ts.map +1 -1
- package/dist/src/core/memory/domain-prompt.js +48 -40
- package/dist/src/core/memory/domain-prompt.js.map +1 -1
- package/dist/src/core/prompts/system-prompt.d.ts +1 -1
- package/dist/src/core/prompts/system-prompt.d.ts.map +1 -1
- package/dist/src/core/prompts/system-prompt.js +19 -100
- package/dist/src/core/prompts/system-prompt.js.map +1 -1
- package/dist/src/core/runtime/agent-loop.d.ts.map +1 -1
- package/dist/src/core/runtime/agent-loop.js +9 -3
- package/dist/src/core/runtime/agent-loop.js.map +1 -1
- package/dist/src/core/runtime/default-tools.d.ts.map +1 -1
- package/dist/src/core/runtime/default-tools.js +1 -2
- package/dist/src/core/runtime/default-tools.js.map +1 -1
- package/dist/src/core/tools/file-edit-core.d.ts.map +1 -1
- package/dist/src/core/tools/file-edit-core.js +21 -2
- package/dist/src/core/tools/file-edit-core.js.map +1 -1
- package/dist/src/index.d.ts +0 -1
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +0 -1
- package/dist/src/index.js.map +1 -1
- package/dist/src/server/features/control-plane/router.d.ts +5 -1
- package/dist/src/server/features/control-plane/router.d.ts.map +1 -1
- package/dist/src/server/features/control-plane/router.js +16 -2
- package/dist/src/server/features/control-plane/router.js.map +1 -1
- package/dist/src/server/features/control-plane/services/chat-session-events.d.ts +26 -0
- package/dist/src/server/features/control-plane/services/chat-session-events.d.ts.map +1 -0
- package/dist/src/server/features/control-plane/services/chat-session-events.js +61 -0
- package/dist/src/server/features/control-plane/services/chat-session-events.js.map +1 -0
- package/dist/src/server/features/control-plane/services/chat-sessions.d.ts +2 -1
- package/dist/src/server/features/control-plane/services/chat-sessions.d.ts.map +1 -1
- package/dist/src/server/features/control-plane/services/chat-sessions.js +30 -42
- package/dist/src/server/features/control-plane/services/chat-sessions.js.map +1 -1
- package/dist/src/server/router.d.ts +5 -1
- package/dist/src/server/router.d.ts.map +1 -1
- package/dist/src/web/assets/{MonacoDiffViewer-DM8Cy5Xf.js → MonacoDiffViewer-DP7GeCEC.js} +1 -1
- package/dist/src/web/assets/index-CYd4sslC.css +2 -0
- package/dist/src/web/assets/index-PUxjg447.js +56 -0
- package/dist/src/web/index.html +2 -2
- package/package.json +7 -2
- package/dist/src/core/tools/report-state.d.ts +0 -3
- package/dist/src/core/tools/report-state.d.ts.map +0 -1
- package/dist/src/core/tools/report-state.js +0 -63
- package/dist/src/core/tools/report-state.js.map +0 -1
- package/dist/src/web/assets/index-BEeN-RT5.css +0 -2
- package/dist/src/web/assets/index-BKDg9H_-.js +0 -56
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"progress.js","sourceRoot":"","sources":["../../../../src/core/eval/progress.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AACpD,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAWpC,MAAM,OAAO,oBAAoB;IACtB,YAAY,CAAS;IACb,MAAM,CAAS;IACf,WAAW,CAAU;IAEtC,YAAY,IAIX;QACC,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC;QAC1B,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC;QACtC,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,IAAI,CAAC;QAC5C,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,YAAY,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,IAAI,CAAC,KAAa,EAAE,OAAe;QACjC,IAAI,CAAC,KAAK,CAAC;YACT,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,KAAK;YACL,MAAM,EAAE,MAAM;YACd,OAAO;SACR,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,KAAK,CAAI,IAMd;QACC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,IAAI,CAAC,KAAK,CAAC;YACT,SAAS,EAAE,IAAI,IAAI,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE;YAC5C,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,MAAM,EAAE,SAAS;YACjB,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,SAAS,EAAE,CAAC;SACb,CAAC,CAAC;QAEH,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,MAAM,CAAC;QAC/C,MAAM,QAAQ,GAAG,WAAW,GAAG,CAAC,CAAC,CAAC;YAChC,WAAW,CAAC,GAAG,EAAE;gBACf,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBACzC,IAAI,CAAC,KAAK,CAAC;oBACT,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;oBACnC,MAAM,EAAE,IAAI,CAAC,MAAM;oBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;oBACjB,MAAM,EAAE,WAAW;oBACnB,OAAO,EAAE,IAAI,CAAC,gBAAgB,IAAI,iBAAiB,IAAI,CAAC,OAAO,EAAE;oBACjE,SAAS;iBACV,CAAC,CAAC;YACL,CAAC,EAAE,WAAW,CAAC;YACjB,CAAC,CAAC,SAAS,CAAC;QAEZ,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,EAAE,CAAC;YAChC,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACzC,IAAI,CAAC,KAAK,CAAC;gBACT,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;gBACnC,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,WAAW;gBACnB,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,SAAS;aACV,CAAC,CAAC;YACH,OAAO,MAAM,CAAC;QAChB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YACzC,IAAI,CAAC,KAAK
,CAAC;gBACT,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;gBACnC,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,QAAQ;gBAChB,OAAO,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;gBAC/D,SAAS;aACV,CAAC,CAAC;YACH,MAAM,KAAK,CAAC;QACd,CAAC;gBAAS,CAAC;YACT,IAAI,QAAQ,EAAE,CAAC;gBACb,aAAa,CAAC,QAAQ,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,KAAwB;QACpC,cAAc,CAAC,IAAI,CAAC,YAAY,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;QACxE,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YACrB,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;CACF;AAED,SAAS,kBAAkB,CAAC,KAAwB;IAClD,MAAM,OAAO,GAAG,KAAK,CAAC,SAAS,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,aAAa,CAAC,KAAK,CAAC,SAAS,CAAC,GAAG,CAAC;IAC5F,OAAO,IAAI,KAAK,CAAC,MAAM,KAAK,KAAK,CAAC,MAAM,KAAK,KAAK,CAAC,OAAO,GAAG,OAAO,IAAI,CAAC;AAC3E,CAAC;AAED,SAAS,aAAa,CAAC,SAAiB;IACtC,IAAI,SAAS,GAAG,IAAI,EAAE,CAAC;QACrB,OAAO,GAAG,SAAS,IAAI,CAAC;IAC1B,CAAC;IACD,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC;AAC5C,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { EvalSuiteReport } from './schema.js';
|
|
2
|
+
/**
 * Writes an eval suite report and reports where the output landed.
 *
 * @param report - The suite report to write.
 * @returns An object carrying the `jsonPath` and `markdownPath` strings.
 */
export declare function writeEvalSuiteReport(report: EvalSuiteReport): {
|
|
3
|
+
jsonPath: string;
|
|
4
|
+
markdownPath: string;
|
|
5
|
+
};
|
|
6
|
+
/**
 * Formats an eval suite report as a single string (Markdown, per the name).
 *
 * @param report - The suite report to format.
 * @returns The formatted report text.
 */
export declare function formatEvalSuiteMarkdown(report: EvalSuiteReport): string;
|
|
7
|
+
//# sourceMappingURL=report-writer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"report-writer.d.ts","sourceRoot":"","sources":["../../../../src/core/eval/report-writer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAiB,eAAe,EAAE,MAAM,aAAa,CAAC;AAElE,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,eAAe,GAAG;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,YAAY,EAAE,MAAM,CAAA;CAAE,CAOxG;AAED,wBAAgB,uBAAuB,CAAC,MAAM,EAAE,eAAe,GAAG,MAAM,CAqBvE"}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync } from 'node:fs';
|
|
2
|
+
import { relative, join } from 'node:path';
|
|
3
|
+
/**
 * Persist an eval suite report to disk as both JSON and Markdown.
 *
 * Ensures `report.resultsDir` exists (created recursively), then writes
 * `report.json` (pretty-printed with a trailing newline) and `report.md`
 * (rendered by `formatEvalSuiteMarkdown`) inside that directory.
 *
 * @param {object} report - Suite report; `resultsDir` selects the output directory.
 * @returns {{ jsonPath: string, markdownPath: string }} Paths of the two files written.
 */
export function writeEvalSuiteReport(report) {
    const outputDir = report.resultsDir;
    mkdirSync(outputDir, { recursive: true });

    const written = {
        jsonPath: join(outputDir, 'report.json'),
        markdownPath: join(outputDir, 'report.md'),
    };

    // JSON goes first, then the Markdown rendering of the same report.
    const serialized = JSON.stringify(report, null, 2);
    writeFileSync(written.jsonPath, serialized + '\n', 'utf8');
    writeFileSync(written.markdownPath, formatEvalSuiteMarkdown(report), 'utf8');

    return written;
}
|
|
11
|
+
/**
 * Render a whole eval suite report as a Markdown document.
 *
 * The document has a header (target, timestamps, pass count), a summary
 * table with one row per result, and then one detail section per result.
 * Trailing whitespace is trimmed and the text ends with exactly one newline.
 *
 * @param {object} report - Suite report with `target`, `startedAt`,
 *   `finishedAt`, `resultsDir` and a `results` array.
 * @returns {string} The Markdown text.
 */
export function formatEvalSuiteMarkdown(report) {
    const { results, resultsDir } = report;

    let passedCount = 0;
    for (const entry of results) {
        if (entry.status === 'passed') {
            passedCount += 1;
        }
    }

    const header = [
        '# Heddle Agent Eval Report',
        '',
        `Target: ${report.target}`,
        `Started: ${report.startedAt}`,
        `Finished: ${report.finishedAt}`,
        `Results: ${passedCount}/${results.length} passed`,
        '',
        '| Case | Status | Model | Checks | Outcome | Turns | Mutations | Verification After Mutation |',
        '| --- | --- | --- | ---: | --- | ---: | ---: | ---: |',
    ];
    const summaryRows = results.map(formatSummaryRow);
    // Each detail section is followed by a blank separator line.
    const detailSections = results.flatMap((entry) => [...formatRunDetail(entry, resultsDir), '']);

    const body = [...header, ...summaryRows, '', ...detailSections].join('\n');
    return `${body.trimEnd()}\n`;
}
|
|
31
|
+
/**
 * Build the summary-table row for one eval result.
 *
 * Columns: case id, status, model (or `default`), passed/total checks,
 * outcome (or `exit <code>` when no outcome is recorded), assistant turns,
 * mutations, and verification commands after mutation. Every cell is passed
 * through `escapeCell`.
 *
 * @param {object} result - A single eval result.
 * @returns {string} One Markdown table row, including the outer pipes.
 */
function formatSummaryRow(result) {
    const { checks, metrics } = result;
    const passedChecks = checks.reduce((count, check) => (check.passed ? count + 1 : count), 0);

    const cells = [
        result.caseId,
        result.status,
        result.model ?? 'default',
        `${passedChecks}/${checks.length}`,
        metrics.outcome ?? `exit ${result.agent.exitCode ?? 'unknown'}`,
        String(metrics.assistantTurns),
        String(metrics.mutations),
        String(metrics.verificationCommandsAfterMutation),
    ];

    return `| ${cells.map(escapeCell).join(' | ')} |`;
}
|
|
44
|
+
/**
 * Build the Markdown detail section for a single eval result.
 *
 * Sections, in order: a field/value overview table, the milestone review
 * (from `formatReviewSection`), changed files, post-run checks, metrics,
 * agent verification commands, the rubric checklist, and the final summary
 * when one is present. Artifact paths are shown via `formatPath` against
 * `resultsDir`.
 *
 * @param {object} result - A single eval result.
 * @param {string} resultsDir - Base directory passed to `formatPath`.
 * @returns {string[]} The section as an array of Markdown lines.
 */
function formatRunDetail(result, resultsDir) {
    const { artifacts, metrics, review, agent } = result;

    // Small cell builders shared by the overview rows below.
    const codeSpan = (text) => `\`${escapeCell(text)}\``;
    const pathCell = (target) => codeSpan(formatPath(target, resultsDir));
    const optionalPathCell = (target) => (target ? pathCell(target) : 'none');

    const traceFiles = artifacts.traceFiles.map((tracePath) => formatPath(tracePath, resultsDir));
    const exitLabel = `${agent.exitCode ?? 'unknown'}${agent.timedOut ? ' (timed out)' : ''}`;
    const traceCell = traceFiles.length
        ? traceFiles.map((tracePath) => codeSpan(tracePath)).join('<br>')
        : 'none';

    const overview = [
        `## ${result.caseId}`,
        '',
        '| Field | Value |',
        '| --- | --- |',
        `| Status | ${escapeCell(result.status)} |`,
        `| Model | ${escapeCell(result.model ?? 'default')} |`,
        `| Max steps | ${escapeCell(String(result.maxSteps ?? 'default'))} |`,
        `| Agent exit | ${escapeCell(exitLabel)} |`,
        `| Fixture | ${escapeCell(formatFixture(result))} |`,
        `| Workspace | ${pathCell(result.workspaceRoot)} |`,
        `| Output | ${pathCell(result.outputDir)} |`,
        `| Diff | ${pathCell(artifacts.gitDiffPath)} |`,
        `| Diff stat | ${pathCell(artifacts.gitDiffStatPath)} |`,
        `| Changed files JSON | ${pathCell(artifacts.changedFilesPath)} |`,
        `| Git status | ${pathCell(artifacts.gitStatusPath)} |`,
        `| Progress | ${optionalPathCell(artifacts.progressPath)} |`,
        `| Session catalog | ${optionalPathCell(artifacts.sessionCatalogPath)} |`,
        `| Trace files | ${traceCell} |`,
        '',
        '### Milestone Review',
        '',
    ];

    const reviewSection = formatReviewSection(result);

    const changedFiles = artifacts.changedFiles.length === 0
        ? ['- none']
        : [
            '| File | Status | + | - |',
            '| --- | --- | ---: | ---: |',
            ...artifacts.changedFiles.map((file) => `| ${escapeCell(file.path)} | ${escapeCell(file.status)} | ${file.additions ?? ''} | ${file.deletions ?? ''} |`),
        ];

    const checkLines = result.checks.length === 0
        ? ['- none']
        : result.checks.map((check) => `- ${check.passed ? 'PASS' : 'FAIL'} ${check.name}: \`${check.command}\` (${check.exitCode ?? 'unknown'}, ${check.durationMs}ms)`);

    const metricRows = [
        ['Assistant turns', metrics.assistantTurns],
        ['Tool calls', metrics.toolCalls],
        ['Mutations', metrics.mutations],
        ['Verification after mutation', metrics.verificationCommandsAfterMutation],
        ['Approvals requested', metrics.approvalsRequested],
        ['Approvals resolved', metrics.approvalsResolved],
        ['Tool errors', metrics.toolErrors],
    ].map(([label, value]) => `| ${label} | ${value} |`);

    const verificationLines = metrics.verificationCommandDetails.length === 0
        ? ['- none detected after first mutation']
        : metrics.verificationCommandDetails.map((command) => `- ${codeSpan(command)}`);

    // Required outcomes come first, then the human questions, all as unchecked boxes.
    const rubricItems = [...review.requiredOutcomes, ...review.humanQuestions];
    const rubricLines = rubricItems.length === 0
        ? ['- none']
        : rubricItems.map((item) => `- [ ] ${item}`);

    const summaryLines = metrics.summary
        ? ['', '### Final Summary', '', metrics.summary]
        : [];

    return [
        ...overview,
        ...reviewSection,
        '',
        '### Changed Files',
        '',
        ...changedFiles,
        '',
        '### Post-Run Checks',
        '',
        ...checkLines,
        '',
        '### Metrics',
        '',
        '| Metric | Value |',
        '| --- | ---: |',
        ...metricRows,
        '',
        '### Agent Verification Commands',
        '',
        ...verificationLines,
        '',
        '### Rubric',
        '',
        ...rubricLines,
        ...summaryLines,
    ];
}
|
|
115
|
+
/**
 * Build the milestone-review lines for one eval result.
 *
 * Emits an optional `Milestone:` line, an optional intent paragraph
 * (preceded by a blank line), a blank line, and then a two-column table
 * listing required outcomes, allowed scope, out-of-scope items and human
 * questions.
 *
 * @param {object} result - A single eval result; only `result.review` is read.
 * @returns {string[]} Markdown lines for the review section.
 */
function formatReviewSection(result) {
    const { review } = result;

    const preamble = [];
    if (review.milestone) {
        preamble.push(`Milestone: ${review.milestone}`);
    }
    if (review.intent) {
        preamble.push('', review.intent);
    }

    const tableRows = [
        ['Required outcomes', review.requiredOutcomes],
        ['Allowed scope', review.allowedScope],
        ['Out of scope', review.outOfScope],
        ['Human questions', review.humanQuestions],
    ].map(([label, items]) => `| ${label} | ${formatListCell(items)} |`);

    return [...preamble, '', '| Review Field | Items |', '| --- | --- |', ...tableRows];
}
|
|
130
|
+
/**
 * Render a list of strings inside a single Markdown table cell.
 *
 * Each item becomes `- <escaped item>`; items are separated with `<br>` so
 * they stay within one cell. An empty list renders as `none`.
 *
 * @param {string[]} items - Items to show.
 * @returns {string} Cell content.
 */
function formatListCell(items) {
    if (!items.length) {
        return 'none';
    }
    const bullets = [];
    for (const item of items) {
        bullets.push(`- ${escapeCell(item)}`);
    }
    return bullets.join('<br>');
}
|
|
133
|
+
/**
 * Describe the fixture of an eval result in one short phrase.
 *
 * For `git-worktree` fixtures the phrase lists, when present, the ref, the
 * resolved commit and the baseline commit (the baseline is omitted when it
 * equals the resolved commit). Any other fixture type is reported as
 * `inline`, with the baseline commit appended when one exists. Commits are
 * abbreviated with `shortSha`.
 *
 * @param {object} result - A single eval result; only `result.fixture` is read.
 * @returns {string} Comma-separated fixture description.
 */
function formatFixture(result) {
    const { fixture } = result;

    if (fixture.type !== 'git-worktree') {
        return fixture.baselineCommit
            ? `inline, baseline ${shortSha(fixture.baselineCommit)}`
            : 'inline';
    }

    const parts = ['git-worktree'];
    if (fixture.ref) {
        parts.push(`ref ${fixture.ref}`);
    }
    if (fixture.resolvedRef) {
        parts.push(`commit ${shortSha(fixture.resolvedRef)}`);
    }
    if (fixture.baselineCommit && fixture.baselineCommit !== fixture.resolvedRef) {
        parts.push(`baseline ${shortSha(fixture.baselineCommit)}`);
    }
    return parts.join(', ');
}
|
|
146
|
+
/**
 * Abbreviate a commit identifier to its first 12 characters.
 *
 * @param {string} value - Identifier to shorten.
 * @returns {string} At most the first 12 characters of `value`.
 */
function shortSha(value) {
    const shortLength = 12;
    return value.slice(0, shortLength);
}
|
|
149
|
+
/**
 * Make a string safe to place inside one Markdown table cell.
 *
 * Pipes are backslash-escaped so they are not read as column separators,
 * and every line break is collapsed to a single space so the row stays on
 * one line. All three line-ending forms are handled: `\r\n`, a lone `\r`
 * and `\n`. Replacing only `\n` would leave a stray carriage return behind
 * for Windows-style text, and a lone `\r` still counts as a line ending in
 * Markdown.
 *
 * @param {string} value - Raw cell text.
 * @returns {string} The text with pipes escaped and line breaks flattened.
 */
function escapeCell(value) {
    return value.replaceAll('|', '\\|').replace(/\r\n|\r|\n/g, ' ');
}
|
|
152
|
+
/**
 * Show `path` relative to `basePath` when it lies inside that directory.
 *
 * Returns `.` when both refer to the same location, the relative path when
 * `path` is inside `basePath`, and `path` unchanged when it lies outside.
 *
 * A path counts as outside only when its relative form is exactly `..` or
 * starts with a `..` path segment. A plain `startsWith('..')` check would
 * also reject entries inside the base whose name merely begins with two
 * dots, such as `..cache/file`.
 *
 * @param {string} path - Path to display.
 * @param {string} basePath - Directory the path is shown relative to.
 * @returns {string} `.`, a relative path, or the original `path`.
 */
function formatPath(path, basePath) {
    if (path === basePath) {
        return '.';
    }
    const relativePath = relative(basePath, path);
    if (relativePath === '') {
        // Same location spelled differently (e.g. a trailing separator).
        return '.';
    }
    // Match ".." only as a whole leading segment, for either separator style.
    const escapesBase = /^\.\.(?:[\\/]|$)/.test(relativePath);
    return escapesBase ? path : relativePath;
}
|
|
159
|
+
//# sourceMappingURL=report-writer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"report-writer.js","sourceRoot":"","sources":["../../../../src/core/eval/report-writer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAG3C,MAAM,UAAU,oBAAoB,CAAC,MAAuB;IAC1D,SAAS,CAAC,MAAM,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAClD,MAAM,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,aAAa,CAAC,CAAC;IACxD,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC;IAC1D,aAAa,CAAC,QAAQ,EAAE,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACxE,aAAa,CAAC,YAAY,EAAE,uBAAuB,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,CAAC;IACrE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,CAAC;AACpC,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,MAAuB;IAC7D,MAAM,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,MAAM,CAAC;IACpF,MAAM,KAAK,GAAG;QACZ,4BAA4B;QAC5B,EAAE;QACF,WAAW,MAAM,CAAC,MAAM,EAAE;QAC1B,YAAY,MAAM,CAAC,SAAS,EAAE;QAC9B,aAAa,MAAM,CAAC,UAAU,EAAE;QAChC,YAAY,MAAM,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,SAAS;QACpD,EAAE;QACF,gGAAgG;QAChG,uDAAuD;QACvD,GAAG,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC;QACvC,EAAE;KACH,CAAC;IAEF,KAAK,MAAM,MAAM,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;QACpC,KAAK,CAAC,IAAI,CAAC,GAAG,eAAe,CAAC,MAAM,EAAE,MAAM,CAAC,UAAU,CAAC,EAAE,EAAE,CAAC,CAAC;IAChE,CAAC;IAED,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,OAAO,EAAE,IAAI,CAAC;AAC3C,CAAC;AAED,SAAS,gBAAgB,CAAC,MAAqB;IAC7C,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IAC1E,OAAO;QACL,MAAM,CAAC,MAAM;QACb,MAAM,CAAC,MAAM;QACb,MAAM,CAAC,KAAK,IAAI,SAAS;QACzB,GAAG,YAAY,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE;QACzC,MAAM,CAAC,OAAO,CAAC,OAAO,IAAI,QAAQ,MAAM,CAAC,KAAK,CAAC,QAAQ,IAAI,SAAS,EAAE;QACtE,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC;QACrC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC;QAChC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,iCAAiC,CAAC;KACzD,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,OAAO,CAAC,GAAG,EAAE,IAAI,CAAC
,CAAC;AACtE,CAAC;AAED,SAAS,eAAe,CAAC,MAAqB,EAAE,UAAkB;IAChE,MAAM,UAAU,GAAG,MAAM,CAAC,SAAS,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,UAAU,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC,CAAC;IAC3F,MAAM,KAAK,GAAG;QACZ,MAAM,MAAM,CAAC,MAAM,EAAE;QACrB,EAAE;QACF,mBAAmB;QACnB,eAAe;QACf,cAAc,UAAU,CAAC,MAAM,CAAC,MAAM,CAAC,IAAI;QAC3C,aAAa,UAAU,CAAC,MAAM,CAAC,KAAK,IAAI,SAAS,CAAC,IAAI;QACtD,iBAAiB,UAAU,CAAC,MAAM,CAAC,MAAM,CAAC,QAAQ,IAAI,SAAS,CAAC,CAAC,IAAI;QACrE,kBAAkB,UAAU,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,QAAQ,IAAI,SAAS,GAAG,MAAM,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI;QACvH,eAAe,UAAU,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,IAAI;QACpD,mBAAmB,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,aAAa,EAAE,UAAU,CAAC,CAAC,MAAM;QACjF,gBAAgB,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,EAAE,UAAU,CAAC,CAAC,MAAM;QAC1E,cAAc,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,WAAW,EAAE,UAAU,CAAC,CAAC,MAAM;QACpF,mBAAmB,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,eAAe,EAAE,UAAU,CAAC,CAAC,MAAM;QAC7F,4BAA4B,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,gBAAgB,EAAE,UAAU,CAAC,CAAC,MAAM;QACvG,oBAAoB,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,aAAa,EAAE,UAAU,CAAC,CAAC,MAAM;QAC5F,gBAAgB,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC,CAAC,KAAK,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,YAAY,EAAE,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,IAAI;QACvI,uBAAuB,MAAM,CAAC,SAAS,CAAC,kBAAkB,CAAC,CAAC,CAAC,KAAK,UAAU,CAAC,UAAU,CAAC,MAAM,CAAC,SAAS,CAAC,kBAAkB,EAAE,UAAU,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,IAAI;QAC1J,mBAAmB,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI;QACpH,EAAE;QACF,sBAAsB;QACtB,EAAE;KACH,CAAC;IAEF,KAAK,CAAC,IAAI,CAAC,GAAG,mBAAmB,CAAC,MAAM,CAAC,EAAE,EAAE,EAAE,mBAAmB,EAAE,EAAE,CAAC,CAAC;IACxE,IAAI,MAAM,CAAC,SAAS,CAAC,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/C,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvB,CAAC;SAAM,CAAC;QACN,KAAK,CAAC,IAAI,CAAC,2BAA2B,EAAE,6BAA6B,CAAC,CAAC;QACvE,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,SAAS,CAAC,YAAY,EAAE,CAAC;YACjD,KAAK,CAAC,IAAI
,CAAC,KAAK,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,IAAI,CAAC,SAAS,IAAI,EAAE,MAAM,IAAI,CAAC,SAAS,IAAI,EAAE,IAAI,CAAC,CAAC;QAC9H,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,qBAAqB,EAAE,EAAE,CAAC,CAAC;IAC1C,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvB,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;YAClC,KAAK,CAAC,IAAI,CAAC,KAAK,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,IAAI,KAAK,CAAC,IAAI,OAAO,KAAK,CAAC,OAAO,OAAO,KAAK,CAAC,QAAQ,IAAI,SAAS,KAAK,KAAK,CAAC,UAAU,KAAK,CAAC,CAAC;QAChJ,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CACR,EAAE,EACF,aAAa,EACb,EAAE,EACF,oBAAoB,EACpB,gBAAgB,EAChB,uBAAuB,MAAM,CAAC,OAAO,CAAC,cAAc,IAAI,EACxD,kBAAkB,MAAM,CAAC,OAAO,CAAC,SAAS,IAAI,EAC9C,iBAAiB,MAAM,CAAC,OAAO,CAAC,SAAS,IAAI,EAC7C,mCAAmC,MAAM,CAAC,OAAO,CAAC,iCAAiC,IAAI,EACvF,2BAA2B,MAAM,CAAC,OAAO,CAAC,kBAAkB,IAAI,EAChE,0BAA0B,MAAM,CAAC,OAAO,CAAC,iBAAiB,IAAI,EAC9D,mBAAmB,MAAM,CAAC,OAAO,CAAC,UAAU,IAAI,CACjD,CAAC;IAEF,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,iCAAiC,EAAE,EAAE,CAAC,CAAC;IACtD,IAAI,MAAM,CAAC,OAAO,CAAC,0BAA0B,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3D,KAAK,CAAC,IAAI,CAAC,sCAAsC,CAAC,CAAC;IACrD,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,OAAO,IAAI,MAAM,CAAC,OAAO,CAAC,0BAA0B,EAAE,CAAC;YAChE,KAAK,CAAC,IAAI,CAAC,OAAO,UAAU,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAC7C,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,YAAY,EAAE,EAAE,CAAC,CAAC;IACjC,IAAI,MAAM,CAAC,MAAM,CAAC,gBAAgB,CAAC,MAAM,KAAK,CAAC,IAAI,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC7F,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvB,CAAC;SAAM,CAAC;QACN,KAAK,MAAM,OAAO,IAAI,MAAM,CAAC,MAAM,CAAC,gBAAgB,EAAE,CAAC;YACrD,KAAK,CAAC,IAAI,CAAC,SAAS,OAAO,EAAE,CAAC,CAAC;QACjC,CAAC;QACD,KAAK,MAAM,QAAQ,IAAI,MAAM,CAAC,MAAM,CAAC,cAAc,EAAE,CAAC;YACpD,KAAK,CAAC,IAAI,CAAC,SAAS,QAAQ,EAAE,CAAC,CAAC;QAClC,CAAC;IACH,CAAC;IAED,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;QAC3B,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,mBAAmB,EAAE,EAAE,EAAE,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IAClE,CAAC;IAED,
OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,mBAAmB,CAAC,MAAqB;IAChD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC;QAC5B,KAAK,CAAC,IAAI,CAAC,cAAc,MAAM,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC,CAAC;IACtD,CAAC;IACD,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;QACzB,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;IACvC,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,EAAE,EAAE,0BAA0B,EAAE,eAAe,CAAC,CAAC;IAC5D,KAAK,CAAC,IAAI,CAAC,yBAAyB,cAAc,CAAC,MAAM,CAAC,MAAM,CAAC,gBAAgB,CAAC,IAAI,CAAC,CAAC;IACxF,KAAK,CAAC,IAAI,CAAC,qBAAqB,cAAc,CAAC,MAAM,CAAC,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;IAChF,KAAK,CAAC,IAAI,CAAC,oBAAoB,cAAc,CAAC,MAAM,CAAC,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IAC7E,KAAK,CAAC,IAAI,CAAC,uBAAuB,cAAc,CAAC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;IACpF,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,cAAc,CAAC,KAAe;IACrC,OAAO,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;AAC3F,CAAC;AAED,SAAS,aAAa,CAAC,MAAqB;IAC1C,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,KAAK,cAAc,EAAE,CAAC;QAC3C,OAAO;YACL,cAAc;YACd,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,MAAM,CAAC,OAAO,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,SAAS;YAC5D,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,UAAU,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS;YACzF,MAAM,CAAC,OAAO,CAAC,cAAc,IAAI,MAAM,CAAC,OAAO,CAAC,cAAc,KAAK,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;gBAC7F,YAAY,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE;gBACvD,CAAC,CAAC,SAAS;SACZ,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC/B,CAAC;IACD,OAAO,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,oBAAoB,QAAQ,CAAC,MAAM,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC;AAClH,CAAC;AAED,SAAS,QAAQ,CAAC,KAAa;IAC7B,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AAC5B,CAAC;AAED,SAAS,UAAU,CAAC,KAAa;IAC/B,OAAO,KAAK,CAAC,UAAU,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC,UAAU,CAAC,IAAI,EAAE,GAAG,CAAC,CAAC;AAC5D,CAAC;AAED,SAAS,UAAU,CAAC,IAAY,EAAE,QAAgB;IAChD,IAAI,IAAI,KAAK,QAAQ,EAAE,CAAC;QACtB,OA
AO,GAAG,CAAC;IACb,CAAC;IAED,MAAM,YAAY,GAAG,QAAQ,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC;IAC9C,OAAO,YAAY,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC;AAC9E,CAAC"}
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
export declare const evalCheckSchema: z.ZodObject<{
|
|
3
|
+
name: z.ZodOptional<z.ZodString>;
|
|
4
|
+
command: z.ZodString;
|
|
5
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
6
|
+
}, z.core.$strip>;
|
|
7
|
+
export declare const evalSetupSchema: z.ZodObject<{
|
|
8
|
+
copyFiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
9
|
+
files: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
10
|
+
commands: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
11
|
+
name: z.ZodOptional<z.ZodString>;
|
|
12
|
+
command: z.ZodString;
|
|
13
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
14
|
+
}, z.core.$strip>>>;
|
|
15
|
+
commitMessage: z.ZodOptional<z.ZodString>;
|
|
16
|
+
}, z.core.$strip>;
|
|
17
|
+
export declare const evalInlineFixtureSchema: z.ZodObject<{
|
|
18
|
+
type: z.ZodLiteral<"inline">;
|
|
19
|
+
}, z.core.$strip>;
|
|
20
|
+
export declare const evalGitWorktreeFixtureSchema: z.ZodObject<{
|
|
21
|
+
type: z.ZodLiteral<"git-worktree">;
|
|
22
|
+
repo: z.ZodDefault<z.ZodString>;
|
|
23
|
+
ref: z.ZodString;
|
|
24
|
+
}, z.core.$strip>;
|
|
25
|
+
export declare const evalFixtureSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
26
|
+
type: z.ZodLiteral<"inline">;
|
|
27
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
28
|
+
type: z.ZodLiteral<"git-worktree">;
|
|
29
|
+
repo: z.ZodDefault<z.ZodString>;
|
|
30
|
+
ref: z.ZodString;
|
|
31
|
+
}, z.core.$strip>], "type">;
|
|
32
|
+
export declare const evalMilestoneReviewSchema: z.ZodObject<{
|
|
33
|
+
milestone: z.ZodOptional<z.ZodString>;
|
|
34
|
+
intent: z.ZodOptional<z.ZodString>;
|
|
35
|
+
requiredOutcomes: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
36
|
+
allowedScope: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
37
|
+
outOfScope: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
38
|
+
humanQuestions: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
39
|
+
}, z.core.$strip>;
|
|
40
|
+
export declare const agentEvalCaseSchema: z.ZodObject<{
|
|
41
|
+
id: z.ZodString;
|
|
42
|
+
kind: z.ZodLiteral<"coding">;
|
|
43
|
+
description: z.ZodOptional<z.ZodString>;
|
|
44
|
+
prompt: z.ZodString;
|
|
45
|
+
model: z.ZodOptional<z.ZodString>;
|
|
46
|
+
maxSteps: z.ZodOptional<z.ZodNumber>;
|
|
47
|
+
setup: z.ZodDefault<z.ZodObject<{
|
|
48
|
+
copyFiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
49
|
+
files: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
50
|
+
commands: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
51
|
+
name: z.ZodOptional<z.ZodString>;
|
|
52
|
+
command: z.ZodString;
|
|
53
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
54
|
+
}, z.core.$strip>>>;
|
|
55
|
+
commitMessage: z.ZodOptional<z.ZodString>;
|
|
56
|
+
}, z.core.$strip>>;
|
|
57
|
+
fixture: z.ZodDefault<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
58
|
+
type: z.ZodLiteral<"inline">;
|
|
59
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
60
|
+
type: z.ZodLiteral<"git-worktree">;
|
|
61
|
+
repo: z.ZodDefault<z.ZodString>;
|
|
62
|
+
ref: z.ZodString;
|
|
63
|
+
}, z.core.$strip>], "type">>;
|
|
64
|
+
review: z.ZodDefault<z.ZodObject<{
|
|
65
|
+
milestone: z.ZodOptional<z.ZodString>;
|
|
66
|
+
intent: z.ZodOptional<z.ZodString>;
|
|
67
|
+
requiredOutcomes: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
68
|
+
allowedScope: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
69
|
+
outOfScope: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
70
|
+
humanQuestions: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
71
|
+
}, z.core.$strip>>;
|
|
72
|
+
checks: z.ZodDefault<z.ZodArray<z.ZodObject<{
|
|
73
|
+
name: z.ZodOptional<z.ZodString>;
|
|
74
|
+
command: z.ZodString;
|
|
75
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
76
|
+
}, z.core.$strip>>>;
|
|
77
|
+
rubric: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
78
|
+
tags: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
79
|
+
}, z.core.$strip>;
|
|
80
|
+
export declare const evalCaseSchema: z.ZodObject<{
|
|
81
|
+
id: z.ZodString;
|
|
82
|
+
kind: z.ZodLiteral<"coding">;
|
|
83
|
+
description: z.ZodOptional<z.ZodString>;
|
|
84
|
+
prompt: z.ZodString;
|
|
85
|
+
model: z.ZodOptional<z.ZodString>;
|
|
86
|
+
maxSteps: z.ZodOptional<z.ZodNumber>;
|
|
87
|
+
setup: z.ZodDefault<z.ZodObject<{
|
|
88
|
+
copyFiles: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
89
|
+
files: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
90
|
+
commands: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
91
|
+
name: z.ZodOptional<z.ZodString>;
|
|
92
|
+
command: z.ZodString;
|
|
93
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
94
|
+
}, z.core.$strip>>>;
|
|
95
|
+
commitMessage: z.ZodOptional<z.ZodString>;
|
|
96
|
+
}, z.core.$strip>>;
|
|
97
|
+
fixture: z.ZodDefault<z.ZodDiscriminatedUnion<[z.ZodObject<{
|
|
98
|
+
type: z.ZodLiteral<"inline">;
|
|
99
|
+
}, z.core.$strip>, z.ZodObject<{
|
|
100
|
+
type: z.ZodLiteral<"git-worktree">;
|
|
101
|
+
repo: z.ZodDefault<z.ZodString>;
|
|
102
|
+
ref: z.ZodString;
|
|
103
|
+
}, z.core.$strip>], "type">>;
|
|
104
|
+
review: z.ZodDefault<z.ZodObject<{
|
|
105
|
+
milestone: z.ZodOptional<z.ZodString>;
|
|
106
|
+
intent: z.ZodOptional<z.ZodString>;
|
|
107
|
+
requiredOutcomes: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
108
|
+
allowedScope: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
109
|
+
outOfScope: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
110
|
+
humanQuestions: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
111
|
+
}, z.core.$strip>>;
|
|
112
|
+
checks: z.ZodDefault<z.ZodArray<z.ZodObject<{
|
|
113
|
+
name: z.ZodOptional<z.ZodString>;
|
|
114
|
+
command: z.ZodString;
|
|
115
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
116
|
+
}, z.core.$strip>>>;
|
|
117
|
+
rubric: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
118
|
+
tags: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
119
|
+
}, z.core.$strip>;
|
|
120
|
+
export type EvalCheck = z.infer<typeof evalCheckSchema>;
|
|
121
|
+
export type EvalSetup = z.infer<typeof evalSetupSchema>;
|
|
122
|
+
export type EvalFixture = z.infer<typeof evalFixtureSchema>;
|
|
123
|
+
export type EvalMilestoneReview = z.infer<typeof evalMilestoneReviewSchema>;
|
|
124
|
+
export type AgentEvalCase = z.infer<typeof agentEvalCaseSchema>;
|
|
125
|
+
export type EvalCase = z.infer<typeof evalCaseSchema>;
|
|
126
|
+
export type EvalCheckResult = {
|
|
127
|
+
name: string;
|
|
128
|
+
command: string;
|
|
129
|
+
exitCode: number | null;
|
|
130
|
+
stdout: string;
|
|
131
|
+
stderr: string;
|
|
132
|
+
durationMs: number;
|
|
133
|
+
passed: boolean;
|
|
134
|
+
timedOut: boolean;
|
|
135
|
+
};
|
|
136
|
+
export type EvalChangedFile = {
|
|
137
|
+
path: string;
|
|
138
|
+
status: string;
|
|
139
|
+
additions?: number;
|
|
140
|
+
deletions?: number;
|
|
141
|
+
};
|
|
142
|
+
export type EvalTraceMetrics = {
|
|
143
|
+
assistantTurns: number;
|
|
144
|
+
toolCalls: number;
|
|
145
|
+
toolResults: number;
|
|
146
|
+
mutations: number;
|
|
147
|
+
approvalsRequested: number;
|
|
148
|
+
approvalsResolved: number;
|
|
149
|
+
toolErrors: number;
|
|
150
|
+
verificationCommandsAfterMutation: number;
|
|
151
|
+
firstMutationStep?: number;
|
|
152
|
+
outcome?: string;
|
|
153
|
+
summary?: string;
|
|
154
|
+
toolsByName: Record<string, number>;
|
|
155
|
+
readOrSearchBeforeMutation: string[];
|
|
156
|
+
verificationCommandDetails: string[];
|
|
157
|
+
};
|
|
158
|
+
export type EvalRunResult = {
|
|
159
|
+
caseId: string;
|
|
160
|
+
target: string;
|
|
161
|
+
status: 'passed' | 'failed';
|
|
162
|
+
workspaceRoot: string;
|
|
163
|
+
outputDir: string;
|
|
164
|
+
fixture: {
|
|
165
|
+
type: EvalFixture['type'];
|
|
166
|
+
repo?: string;
|
|
167
|
+
ref?: string;
|
|
168
|
+
resolvedRef?: string;
|
|
169
|
+
baselineCommit?: string;
|
|
170
|
+
};
|
|
171
|
+
startedAt: string;
|
|
172
|
+
finishedAt: string;
|
|
173
|
+
durationMs: number;
|
|
174
|
+
agent: {
|
|
175
|
+
command: string[];
|
|
176
|
+
exitCode: number | null;
|
|
177
|
+
stdoutPath: string;
|
|
178
|
+
stderrPath: string;
|
|
179
|
+
timedOut: boolean;
|
|
180
|
+
};
|
|
181
|
+
artifacts: {
|
|
182
|
+
gitStatusPath: string;
|
|
183
|
+
gitDiffPath: string;
|
|
184
|
+
gitDiffStatPath: string;
|
|
185
|
+
changedFilesPath: string;
|
|
186
|
+
progressPath?: string;
|
|
187
|
+
sessionCatalogPath?: string;
|
|
188
|
+
traceFiles: string[];
|
|
189
|
+
changedFiles: EvalChangedFile[];
|
|
190
|
+
};
|
|
191
|
+
checks: EvalCheckResult[];
|
|
192
|
+
metrics: EvalTraceMetrics;
|
|
193
|
+
review: EvalMilestoneReview;
|
|
194
|
+
model?: string;
|
|
195
|
+
maxSteps?: number;
|
|
196
|
+
};
|
|
197
|
+
export type EvalSuiteReport = {
|
|
198
|
+
version: 1;
|
|
199
|
+
target: string;
|
|
200
|
+
repoRoot: string;
|
|
201
|
+
startedAt: string;
|
|
202
|
+
finishedAt: string;
|
|
203
|
+
resultsDir: string;
|
|
204
|
+
results: EvalRunResult[];
|
|
205
|
+
};
|
|
206
|
+
//# sourceMappingURL=schema.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../../../src/core/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,eAAO,MAAM,eAAe;;;;iBASoG,CAAC;AAEjI,eAAO,MAAM,eAAe;;;;;;;;;iBAmB8E,CAAC;AAE3G,eAAO,MAAM,uBAAuB;;iBAG+C,CAAC;AAEpF,eAAO,MAAM,4BAA4B;;;;iBAQ8C,CAAC;AAExF,eAAO,MAAM,iBAAiB;;;;;;2BAGyC,CAAC;AAExE,eAAO,MAAM,yBAAyB;;;;;;;iBA2BoE,CAAC;AAE3G,eAAO,MAAM,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBA2CmF,CAAC;AAEpH,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAAsB,CAAC;AAElD,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC;AACxD,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,eAAe,CAAC,CAAC;AACxD,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,iBAAiB,CAAC,CAAC;AAC5D,MAAM,MAAM,mBAAmB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,yBAAyB,CAAC,CAAC;AAC5E,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAChE,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAEtD,MAAM,MAAM,eAAe,GAAG;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,OAAO,CAAC;IAChB,QAAQ,EAAE,OAAO,CAAC;CACnB,CAAC;AAEF,MAAM,MAAM,eAAe,GAAG;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG;IAC7B,cAAc,EAAE,MAAM,CAAC;IACvB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,kBAAkB,EAAE,MAAM,CAAC;IAC3B,iBAAiB,EAAE,MAAM,CAAC;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,iCAAiC,EAAE,MAAM,CAAC;IAC1C,iBAAiB,CAAC,EAAE,MAAM,CAAC;IAC3B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACpC,0BAA0B,EAAE,MAAM,EAAE,CAAC;IACrC,0BAA0B,EAAE,MAAM,EAAE,CAAC;CACtC,CAAC;AAEF,MAAM,MAAM,aAAa,GAAG;IAC1B,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,QAAQ,GAAG,QAAQ,CAAC;IAC5B,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE;QACP,IAAI,E
AAE,WAAW,CAAC,MAAM,CAAC,CAAC;QAC1B,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,cAAc,CAAC,EAAE,MAAM,CAAC;KACzB,CAAC;IACF,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE;QACL,OAAO,EAAE,MAAM,EAAE,CAAC;QAClB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;QACxB,UAAU,EAAE,MAAM,CAAC;QACnB,UAAU,EAAE,MAAM,CAAC;QACnB,QAAQ,EAAE,OAAO,CAAC;KACnB,CAAC;IACF,SAAS,EAAE;QACT,aAAa,EAAE,MAAM,CAAC;QACtB,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,CAAC;QACxB,gBAAgB,EAAE,MAAM,CAAC;QACzB,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,UAAU,EAAE,MAAM,EAAE,CAAC;QACrB,YAAY,EAAE,eAAe,EAAE,CAAC;KACjC,CAAC;IACF,MAAM,EAAE,eAAe,EAAE,CAAC;IAC1B,OAAO,EAAE,gBAAgB,CAAC;IAC1B,MAAM,EAAE,mBAAmB,CAAC;IAC5B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB,CAAC;AAEF,MAAM,MAAM,eAAe,GAAG;IAC5B,OAAO,EAAE,CAAC,CAAC;IACX,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,aAAa,EAAE,CAAC;CAC1B,CAAC"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
// Optional report label for a check; trimmed and non-empty when present.
const evalCheckNameField = z.string().trim().min(1)
    .describe('Human-readable label for this check in reports. Defaults to the command when omitted.')
    .optional();

// Required shell command; trimmed and non-empty.
const evalCheckCommandField = z.string().trim().min(1)
    .describe('Shell command to run inside the disposable eval workspace after the agent finishes. Exit code 0 means the check passed.');

// Optional per-command timeout; a positive integer number of milliseconds.
const evalCheckTimeoutField = z.number().int().positive()
    .describe('Optional timeout for this check command in milliseconds.')
    .optional();

/**
 * Schema for one shell command entry: an optional `name`, a required
 * `command`, and an optional `timeoutMs`.
 */
export const evalCheckSchema = z.object({
    name: evalCheckNameField,
    command: evalCheckCommandField,
    timeoutMs: evalCheckTimeoutField,
}).describe('A deterministic post-run command used to decide whether the agent-produced workspace passes objective validation.');
|
|
12
|
+
// Optional map of source path -> destination path; both sides are trimmed, non-empty strings.
const evalSetupCopyFilesField = z.record(
    z.string().trim().min(1).describe('Repository-relative source file path to copy from the eval harness repo.'),
    z.string().trim().min(1).describe('Workspace-relative destination file path to create before the eval starts.'),
)
    .describe('Fixture files to copy from the eval harness repo into the disposable workspace before committing the initial Git baseline.')
    .optional();

// Optional map of file path -> file contents; contents are taken verbatim (no trimming).
const evalSetupFilesField = z.record(
    z.string().trim().min(1).describe('Workspace-relative file path to create before the eval starts.'),
    z.string().describe('Exact UTF-8 file contents to write for the fixture file.'),
)
    .describe('Fixture files to write into the disposable workspace before committing the initial Git baseline.')
    .optional();

// Optional list of commands, each validated with the check-command schema.
const evalSetupCommandsField = z.array(evalCheckSchema)
    .describe('Setup commands to run after fixture files are written and before the initial Git baseline is committed.')
    .optional();

// Optional commit message; trimmed and non-empty when present.
const evalSetupCommitMessageField = z.string().trim().min(1)
    .describe('Commit message for the initial fixture Git baseline. Defaults to a generated eval fixture message.')
    .optional();

/**
 * Schema for the workspace setup section of an eval case. Every field is
 * optional, so an empty object is a valid setup.
 */
export const evalSetupSchema = z.object({
    copyFiles: evalSetupCopyFilesField,
    files: evalSetupFilesField,
    commands: evalSetupCommandsField,
    commitMessage: evalSetupCommitMessageField,
}).describe('Instructions for creating the disposable repository state that the agent will work against.');
|
|
26
|
+
// Discriminator value that selects the inline fixture variant.
const evalInlineFixtureTypeField = z.literal('inline')
    .describe('Create a small disposable Git repository from the case setup files and commands.');

/**
 * Schema for the `inline` fixture variant; it carries no fields other than
 * its `type` discriminator.
 */
export const evalInlineFixtureSchema = z.object({
    type: evalInlineFixtureTypeField,
}).describe('A synthetic disposable repository built from inline eval setup data.');
|
|
30
|
+
// Discriminator value that selects the git-worktree fixture variant.
const evalGitWorktreeTypeField = z.literal('git-worktree')
    .describe('Create a disposable Git worktree from an existing repository at a pinned ref.');

// Repository path; falls back to '.' when the field is omitted.
const evalGitWorktreeRepoField = z.string().trim().min(1)
    .describe('Repository path to create the worktree from. Relative paths are resolved from the Heddle repo root.')
    .default('.');

// Required ref; trimmed and non-empty.
const evalGitWorktreeRefField = z.string().trim().min(1)
    .describe('Pinned target ref for the worktree, such as a release tag or commit SHA. Avoid moving HEAD for comparable evals.');

/**
 * Schema for the `git-worktree` fixture variant: a `type` discriminator, a
 * `repo` path (default '.'), and a required `ref`.
 */
export const evalGitWorktreeFixtureSchema = z.object({
    type: evalGitWorktreeTypeField,
    repo: evalGitWorktreeRepoField,
    ref: evalGitWorktreeRefField,
}).describe('A realistic disposable repository fixture created from a pinned Git ref.');
|
|
39
|
+
// Fixture variants, told apart by their literal `type` field.
const evalFixtureVariants = [
    evalInlineFixtureSchema,
    evalGitWorktreeFixtureSchema,
];

/**
 * Union of the supported fixture variants, discriminated on `type`
 * ('inline' or 'git-worktree').
 */
export const evalFixtureSchema = z.discriminatedUnion('type', evalFixtureVariants)
    .describe('How to prepare the disposable workspace the agent edits.');
|
|
43
|
+
/**
 * Build a list-of-strings field for the milestone review schema: each entry is
 * a trimmed, non-empty string, and the list defaults to a fresh empty array.
 */
function evalMilestoneStringList(itemDescription, listDescription) {
    const entry = z.string().trim().min(1).describe(itemDescription);
    return z.array(entry).describe(listDescription).default([]);
}

// Optional short milestone name; trimmed and non-empty when present.
const evalMilestoneNameField = z.string().trim().min(1)
    .describe('Short name for the user-intended milestone this case evaluates.')
    .optional();

// Optional intent statement; trimmed and non-empty when present.
const evalMilestoneIntentField = z.string().trim().min(1)
    .describe('Human-readable statement of what the agent should accomplish beyond merely passing checks.')
    .optional();

/**
 * Schema for the human-review section of an eval case: two optional free-text
 * fields plus four string lists that each default to an empty array.
 */
export const evalMilestoneReviewSchema = z.object({
    milestone: evalMilestoneNameField,
    intent: evalMilestoneIntentField,
    requiredOutcomes: evalMilestoneStringList(
        'Observable outcome a human reviewer should look for in the final diff, trace, or answer.',
        'Milestone outcomes expected for a high-quality completion.',
    ),
    allowedScope: evalMilestoneStringList(
        'Files, modules, or behavior areas the agent is allowed or expected to touch.',
        'Expected implementation scope for judging whether the diff stayed on task.',
    ),
    outOfScope: evalMilestoneStringList(
        'Files, modules, or behavior areas that should not be changed for this case.',
        'Boundaries a human reviewer should use to spot unrelated churn.',
    ),
    humanQuestions: evalMilestoneStringList(
        'Question for human review after the run completes.',
        'Review prompts that help judge task completion quality beyond deterministic checks.',
    ),
}).describe('Human-review metadata for milestone-style eval cases where pass/fail checks are not enough.');
|
|
63
|
+
/**
 * Schema for one coding eval case.
 *
 * Required fields are `id`, `kind` (only 'coding') and `prompt`. `setup`,
 * `fixture`, `review`, `checks`, `rubric` and `tags` all fall back to empty
 * defaults, so a minimal case needs only the three required fields.
 */
export const agentEvalCaseSchema = z.object({
    // The id is used in result paths, so besides restricting the character set
    // the pattern also rejects the special directory names "." and "..", which
    // would otherwise resolve to the current or parent directory.
    id: z.string().trim().regex(/^(?!\.{1,2}$)[a-zA-Z0-9._-]+$/, 'Use a filesystem-safe case id.')
        .describe('Stable filesystem-safe case id used in result paths, filtering, and reports.'),
    kind: z.literal('coding')
        .describe('Eval case type. The first harness slice supports coding cases run through ask --new-session.'),
    // Unlike the other text fields, the description has no minimum length.
    description: z.string().trim()
        .describe('Optional short explanation of what behavior this case is meant to exercise.')
        .optional(),
    prompt: z.string().trim().min(1)
        .describe('User prompt sent to Heddle in the disposable workspace. This should ask for real coding work, not just Q&A.'),
    model: z.string().trim().min(1)
        .describe('Optional model override for this case. The CLI-level --model takes precedence when supplied.')
        .optional(),
    maxSteps: z.number().int().positive()
        .describe('Optional maximum agent loop steps for this case. The CLI-level --max-steps takes precedence when supplied.')
        .optional(),
    setup: evalSetupSchema
        .describe('Disposable workspace setup for this case.')
        .default({}),
    fixture: evalFixtureSchema
        .describe('Workspace fixture source. Defaults to an inline synthetic repository.')
        .default({ type: 'inline' }),
    review: evalMilestoneReviewSchema
        .describe('Optional milestone-completion review guidance included in reports.')
        .default({
            requiredOutcomes: [],
            allowedScope: [],
            outOfScope: [],
            humanQuestions: [],
        }),
    checks: z.array(evalCheckSchema)
        .describe('Deterministic post-agent commands that must pass for the case to be marked passed.')
        .default([]),
    rubric: z.array(z.string().trim().min(1).describe('Qualitative behavior criterion for human or future LLM judging.'))
        .describe('Non-deterministic quality criteria preserved in reports for manual or future judge review.')
        .default([]),
    tags: z.array(z.string().trim().min(1).describe('Free-form label for filtering or grouping eval cases.'))
        .describe('Case labels such as bugfix, refactor, verification, multi-file, or tui.')
        .default([]),
}).describe('A live coding-task eval case run in a disposable Git workspace through Heddle ask/session execution.');
|
|
103
|
+
// Alias: evalCaseSchema is the same schema object as agentEvalCaseSchema.
export const evalCaseSchema = agentEvalCaseSchema;
|
|
104
|
+
//# sourceMappingURL=schema.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"schema.js","sourceRoot":"","sources":["../../../../src/core/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC3B,QAAQ,CAAC,uFAAuF,CAAC;SACjG,QAAQ,EAAE;IACb,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC9B,QAAQ,CAAC,yHAAyH,CAAC;IACtI,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE;SACnC,QAAQ,CAAC,0DAA0D,CAAC;SACpE,QAAQ,EAAE;CACd,CAAC,CAAC,QAAQ,CAAC,mHAAmH,CAAC,CAAC;AAEjI,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IACtC,SAAS,EAAE,CAAC,CAAC,MAAM,CACjB,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,0EAA0E,CAAC,EAC7G,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,4EAA4E,CAAC,CAChH;SACE,QAAQ,CAAC,4HAA4H,CAAC;SACtI,QAAQ,EAAE;IACb,KAAK,EAAE,CAAC,CAAC,MAAM,CACb,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,gEAAgE,CAAC,EACnG,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,0DAA0D,CAAC,CAChF;SACE,QAAQ,CAAC,kGAAkG,CAAC;SAC5G,QAAQ,EAAE;IACb,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC;SAC/B,QAAQ,CAAC,yGAAyG,CAAC;SACnH,QAAQ,EAAE;IACb,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SACpC,QAAQ,CAAC,oGAAoG,CAAC;SAC9G,QAAQ,EAAE;CACd,CAAC,CAAC,QAAQ,CAAC,6FAA6F,CAAC,CAAC;AAE3G,MAAM,CAAC,MAAM,uBAAuB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC9C,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC;SACtB,QAAQ,CAAC,kFAAkF,CAAC;CAChG,CAAC,CAAC,QAAQ,CAAC,sEAAsE,CAAC,CAAC;AAEpF,MAAM,CAAC,MAAM,4BAA4B,GAAG,CAAC,CAAC,MAAM,CAAC;IACnD,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,cAAc,CAAC;SAC5B,QAAQ,CAAC,+EAA+E,CAAC;IAC5F,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC3B,QAAQ,CAAC,qGAAqG,CAAC;SAC/G,OAAO,CAAC,GAAG,CAAC;IACf,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC1B,QAAQ,CAAC,kHAAkH,CAAC;CAChI,CAAC,CAAC,QAAQ,CAAC,0EAA0E,CAAC,CAAC;AAExF,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAAC,CAAC,kBAAkB
,CAAC,MAAM,EAAE;IAC5D,uBAAuB;IACvB,4BAA4B;CAC7B,CAAC,CAAC,QAAQ,CAAC,0DAA0D,CAAC,CAAC;AAExE,MAAM,CAAC,MAAM,yBAAyB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAChC,QAAQ,CAAC,iEAAiE,CAAC;SAC3E,QAAQ,EAAE;IACb,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC7B,QAAQ,CAAC,4FAA4F,CAAC;SACtG,QAAQ,EAAE;IACb,gBAAgB,EAAE,CAAC,CAAC,KAAK,CACvB,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,0FAA0F,CAAC,CAC9H;SACE,QAAQ,CAAC,4DAA4D,CAAC;SACtE,OAAO,CAAC,EAAE,CAAC;IACd,YAAY,EAAE,CAAC,CAAC,KAAK,CACnB,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,8EAA8E,CAAC,CAClH;SACE,QAAQ,CAAC,4EAA4E,CAAC;SACtF,OAAO,CAAC,EAAE,CAAC;IACd,UAAU,EAAE,CAAC,CAAC,KAAK,CACjB,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,6EAA6E,CAAC,CACjH;SACE,QAAQ,CAAC,iEAAiE,CAAC;SAC3E,OAAO,CAAC,EAAE,CAAC;IACd,cAAc,EAAE,CAAC,CAAC,KAAK,CACrB,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,oDAAoD,CAAC,CACxF;SACE,QAAQ,CAAC,qFAAqF,CAAC;SAC/F,OAAO,CAAC,EAAE,CAAC;CACf,CAAC,CAAC,QAAQ,CAAC,6FAA6F,CAAC,CAAC;AAE3G,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC1C,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,mBAAmB,EAAE,gCAAgC,CAAC;SAC/E,QAAQ,CAAC,8EAA8E,CAAC;IAC3F,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC;SACtB,QAAQ,CAAC,8FAA8F,CAAC;IAC3G,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE;SAC3B,QAAQ,CAAC,6EAA6E,CAAC;SACvF,QAAQ,EAAE;IACb,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC7B,QAAQ,CAAC,6GAA6G,CAAC;IAC1H,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;SAC5B,QAAQ,CAAC,8FAA8F,CAAC;SACxG,QAAQ,EAAE;IACb,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE;SAClC,QAAQ,CAAC,4GAA4G,CAAC;SACtH,QAAQ,EAAE;IACb,KAAK,EAAE,eAAe;SACnB,QAAQ,CAAC,2CAA2C,CAAC;SACrD,OAAO,CAAC,EAAE,CAAC;IACd,OAAO,EAAE,iBAAiB;SACvB,QAAQ,CAAC,uEAAuE,CAAC;SACjF,OAAO,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;IAC9B,MAA
M,EAAE,yBAAyB;SAC9B,QAAQ,CAAC,oEAAoE,CAAC;SAC9E,OAAO,CAAC;QACP,gBAAgB,EAAE,EAAE;QACpB,YAAY,EAAE,EAAE;QAChB,UAAU,EAAE,EAAE;QACd,cAAc,EAAE,EAAE;KACnB,CAAC;IACJ,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC;SAC7B,QAAQ,CAAC,oFAAoF,CAAC;SAC9F,OAAO,CAAC,EAAE,CAAC;IACd,MAAM,EAAE,CAAC,CAAC,KAAK,CACb,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,iEAAiE,CAAC,CACrG;SACE,QAAQ,CAAC,4FAA4F,CAAC;SACtG,OAAO,CAAC,EAAE,CAAC;IACd,IAAI,EAAE,CAAC,CAAC,KAAK,CACX,CAAC,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,uDAAuD,CAAC,CAC3F;SACE,QAAQ,CAAC,yEAAyE,CAAC;SACnF,OAAO,CAAC,EAAE,CAAC;CACf,CAAC,CAAC,QAAQ,CAAC,sGAAsG,CAAC,CAAC;AAEpH,MAAM,CAAC,MAAM,cAAc,GAAG,mBAAmB,CAAC"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { TraceEvent } from '../types.js';
|
|
2
|
+
import type { EvalTraceMetrics } from './schema.js';
|
|
3
|
+
export declare function analyzeTraceFiles(paths: string[]): EvalTraceMetrics;
|
|
4
|
+
export declare function analyzeTrace(trace: TraceEvent[]): EvalTraceMetrics;
|
|
5
|
+
export declare function readTraceFile(path: string): TraceEvent[];
|
|
6
|
+
//# sourceMappingURL=trace-analyzer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"trace-analyzer.d.ts","sourceRoot":"","sources":["../../../../src/core/eval/trace-analyzer.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAKpD,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,gBAAgB,CAEnE;AAED,wBAAgB,YAAY,CAAC,KAAK,EAAE,UAAU,EAAE,GAAG,gBAAgB,CAiFlE;AAED,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,UAAU,EAAE,CAMxD"}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
2
|
+
// Tool names that analyzeTrace counts as mutations; the first such call also
// marks the point after which verification commands are tallied.
const MUTATION_TOOLS = new Set(['edit_file', 'delete_file', 'move_file', 'run_shell_mutate']);
|
|
3
|
+
// Tool names whose calls analyzeTrace records in readOrSearchBeforeMutation
// for as long as no mutation has been seen yet.
const READ_OR_SEARCH_TOOLS = new Set(['read_file', 'list_files', 'search_files', 'run_shell_inspect']);
|
|
4
|
+
/**
 * Load every trace file in `paths`, concatenate their events in order, and
 * compute metrics over the combined event list.
 *
 * @param {string[]} paths - Trace file paths to read.
 * @returns {object} The metrics object produced by analyzeTrace.
 */
export function analyzeTraceFiles(paths) {
    const combinedEvents = paths.flatMap((tracePath) => readTraceFile(tracePath));
    return analyzeTrace(combinedEvents);
}
|
|
7
|
+
/**
 * Fold a list of trace events into summary metrics in a single pass.
 *
 * Counted per event type:
 * - 'assistant.turn': one assistant turn plus one tool call per entry in
 *   `toolCalls`, tallied by tool name.
 * - 'tool.result': one tool result; results without a truthy `result.ok`
 *   (including events with no `result` at all) count as tool errors.
 * - 'tool.approval_requested' / 'tool.approval_resolved': approval counters.
 * - 'run.finished': the last one seen supplies `outcome` and `summary`.
 *
 * Read/search calls are recorded until the first mutating call is seen;
 * shell calls that look like verification commands are recorded from the
 * first mutating call onwards (a mutating shell call can count as both).
 * Both detail lists are de-duplicated in the returned object.
 *
 * @param {object[]} trace - Trace events; only `type` is assumed to be present.
 * @returns {object} Aggregated metrics. `firstMutationStep`, `outcome` and
 *   `summary` stay undefined when the trace never provides them.
 */
export function analyzeTrace(trace) {
    const toolsByName = {};
    const readOrSearchBeforeMutation = [];
    const verificationCommandDetails = [];
    let assistantTurns = 0;
    let toolCalls = 0;
    let toolResults = 0;
    let mutations = 0;
    let approvalsRequested = 0;
    let approvalsResolved = 0;
    let toolErrors = 0;
    let verificationCommandsAfterMutation = 0;
    // Tracked separately from firstMutationStep: a mutating turn may carry no
    // `step`, and that must not make later calls look like they precede it.
    let hasMutated = false;
    let firstMutationStep;
    let outcome;
    let summary;
    for (const event of trace) {
        switch (event.type) {
            case 'assistant.turn': {
                assistantTurns++;
                // Tolerate a missing or malformed toolCalls field.
                const calls = Array.isArray(event.toolCalls) ? event.toolCalls : [];
                for (const call of calls) {
                    toolCalls++;
                    toolsByName[call.tool] = (toolsByName[call.tool] ?? 0) + 1;
                    if (!hasMutated && READ_OR_SEARCH_TOOLS.has(call.tool)) {
                        readOrSearchBeforeMutation.push(summarizeToolInput(call.tool, call.input));
                    }
                    if (MUTATION_TOOLS.has(call.tool)) {
                        mutations++;
                        hasMutated = true;
                        // Keeps the first step value that is actually defined.
                        firstMutationStep ??= event.step;
                    }
                    const isShellCall = call.tool === 'run_shell_mutate' || call.tool === 'run_shell_inspect';
                    if (hasMutated && isShellCall && isVerificationCommand(call.input)) {
                        verificationCommandsAfterMutation++;
                        verificationCommandDetails.push(summarizeToolInput(call.tool, call.input));
                    }
                }
                break;
            }
            case 'tool.result':
                toolResults++;
                // A result event without a payload is not a success.
                if (!event.result?.ok) {
                    toolErrors++;
                }
                break;
            case 'tool.approval_requested':
                approvalsRequested++;
                break;
            case 'tool.approval_resolved':
                approvalsResolved++;
                break;
            case 'run.finished':
                outcome = event.outcome;
                summary = event.summary;
                break;
            default:
                break;
        }
    }
    return {
        assistantTurns,
        toolCalls,
        toolResults,
        mutations,
        approvalsRequested,
        approvalsResolved,
        toolErrors,
        verificationCommandsAfterMutation,
        verificationCommandDetails: [...new Set(verificationCommandDetails)],
        firstMutationStep,
        outcome,
        summary,
        toolsByName,
        readOrSearchBeforeMutation: [...new Set(readOrSearchBeforeMutation)],
    };
}
|
|
78
|
+
/**
 * Read a JSON trace file and return the entries that look like trace events.
 *
 * @param {string} path - Location of the trace file.
 * @returns {object[]} The array entries accepted by isTraceEvent; an empty
 *   array when the file does not exist or its top-level JSON value is not an
 *   array.
 * @throws {SyntaxError} When the file exists but is not valid JSON. The
 *   message names the file and the original parse error is kept as `cause`.
 */
export function readTraceFile(path) {
    if (!existsSync(path)) {
        return [];
    }
    const raw = readFileSync(path, 'utf8');
    let parsed;
    try {
        parsed = JSON.parse(raw);
    }
    catch (error) {
        // Several trace files are analysed together, so a bare JSON error
        // would not say which one is broken.
        throw new SyntaxError(`Failed to parse trace file ${path}: ${error.message}`, { cause: error });
    }
    return Array.isArray(parsed) ? parsed.filter(isTraceEvent) : [];
}
|
|
85
|
+
/**
 * Loose shape check for a parsed trace entry: accepts any non-null,
 * non-array object whose `type` property is a string.
 *
 * @param {unknown} value - Candidate entry from a parsed trace file.
 * @returns {boolean} True when the value passes the shape check.
 */
function isTraceEvent(value) {
    if (value === null || typeof value !== 'object') {
        return false;
    }
    if (Array.isArray(value)) {
        return false;
    }
    return typeof value.type === 'string';
}
|
|
88
|
+
/**
 * Build a short `tool:detail` label for a tool call.
 *
 * The detail is the first of `input.path`, `input.query`, `input.command`
 * that is a string (an empty string counts); it is empty when the input is
 * not a plain object or has none of them. The label is cut to 160 characters.
 *
 * @param {string} tool - Tool name used as the label prefix.
 * @param {unknown} input - Raw tool input from the trace.
 * @returns {string} The label, at most 160 characters long.
 */
function summarizeToolInput(tool, input) {
    const isRecord = Boolean(input) && typeof input === 'object' && !Array.isArray(input);
    let detail = '';
    if (isRecord) {
        for (const key of ['path', 'query', 'command']) {
            if (typeof input[key] === 'string') {
                detail = input[key];
                break;
            }
        }
    }
    return `${tool}:${detail}`.slice(0, 160);
}
|
|
95
|
+
/**
 * Heuristic: does this tool input look like a build/test/lint command?
 *
 * The input must be a plain object with a string `command`. The command must
 * mention both a known toolchain word and a known verification word. Runners
 * that are a verification action in their own right (vitest, jest, mocha,
 * tsc, eslint, pytest, ruff) appear in both lists, so invoking them directly
 * is enough.
 *
 * @param {unknown} input - Raw tool input from the trace.
 * @returns {boolean} True when the command matches both patterns.
 */
function isVerificationCommand(input) {
    if (!input || typeof input !== 'object' || Array.isArray(input)) {
        return false;
    }
    const command = input.command;
    if (typeof command !== 'string') {
        return false;
    }
    const mentionsToolchain = /\b(yarn|npm|pnpm|vitest|jest|mocha|tsc|eslint|cargo|go|pytest|python|ruff)\b/.test(command);
    // jest, mocha and eslint are listed here as well; without them a direct
    // invocation such as `npx jest` never matched, because `\btest\b` and
    // `\blint\b` do not match inside "jest" or "eslint".
    const mentionsVerification = /\b(test|build|lint|typecheck|check|vitest|jest|mocha|tsc|eslint|pytest|ruff)\b/.test(command);
    return mentionsToolchain && mentionsVerification;
}
|
|
106
|
+
//# sourceMappingURL=trace-analyzer.js.map
|