@mcoda/mswarm 0.1.57 → 0.1.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -0
- package/dist/codali-executor.d.ts +278 -0
- package/dist/codali-executor.d.ts.map +1 -0
- package/dist/codali-executor.js +243 -0
- package/dist/codali-executor.js.map +1 -0
- package/dist/runtime.d.ts +46 -1
- package/dist/runtime.d.ts.map +1 -1
- package/dist/runtime.js +298 -30
- package/dist/runtime.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +66 -1
- package/dist/server.js.map +1 -1
- package/dist/vendor/codali/agents/AgentProtocol.d.ts +287 -0
- package/dist/vendor/codali/agents/AgentProtocol.d.ts.map +1 -0
- package/dist/vendor/codali/agents/AgentProtocol.js +365 -0
- package/dist/vendor/codali/agents/AgentResolver.d.ts +23 -0
- package/dist/vendor/codali/agents/AgentResolver.d.ts.map +1 -0
- package/dist/vendor/codali/agents/AgentResolver.js +77 -0
- package/dist/vendor/codali/agents/PhaseAgentSelector.d.ts +23 -0
- package/dist/vendor/codali/agents/PhaseAgentSelector.d.ts.map +1 -0
- package/dist/vendor/codali/agents/PhaseAgentSelector.js +287 -0
- package/dist/vendor/codali/cli/EvalCommand.d.ts +37 -0
- package/dist/vendor/codali/cli/EvalCommand.d.ts.map +1 -0
- package/dist/vendor/codali/cli/EvalCommand.js +333 -0
- package/dist/vendor/codali/cli/FeedbackCommand.d.ts +22 -0
- package/dist/vendor/codali/cli/FeedbackCommand.d.ts.map +1 -0
- package/dist/vendor/codali/cli/FeedbackCommand.js +163 -0
- package/dist/vendor/codali/cli/RunCommand.d.ts +78 -0
- package/dist/vendor/codali/cli/RunCommand.d.ts.map +1 -0
- package/dist/vendor/codali/cli/RunCommand.js +2261 -0
- package/dist/vendor/codali/cli.d.ts +3 -0
- package/dist/vendor/codali/cli.d.ts.map +1 -0
- package/dist/vendor/codali/cli.js +109 -0
- package/dist/vendor/codali/cognitive/ArchitectPlanner.d.ts +107 -0
- package/dist/vendor/codali/cognitive/ArchitectPlanner.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ArchitectPlanner.js +1726 -0
- package/dist/vendor/codali/cognitive/BuilderOutputParser.d.ts +25 -0
- package/dist/vendor/codali/cognitive/BuilderOutputParser.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/BuilderOutputParser.js +164 -0
- package/dist/vendor/codali/cognitive/BuilderRunner.d.ts +76 -0
- package/dist/vendor/codali/cognitive/BuilderRunner.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/BuilderRunner.js +1159 -0
- package/dist/vendor/codali/cognitive/ContextAssembler.d.ts +91 -0
- package/dist/vendor/codali/cognitive/ContextAssembler.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextAssembler.js +4547 -0
- package/dist/vendor/codali/cognitive/ContextBudget.d.ts +19 -0
- package/dist/vendor/codali/cognitive/ContextBudget.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextBudget.js +35 -0
- package/dist/vendor/codali/cognitive/ContextFileLoader.d.ts +30 -0
- package/dist/vendor/codali/cognitive/ContextFileLoader.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextFileLoader.js +307 -0
- package/dist/vendor/codali/cognitive/ContextManager.d.ts +47 -0
- package/dist/vendor/codali/cognitive/ContextManager.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextManager.js +272 -0
- package/dist/vendor/codali/cognitive/ContextRedactor.d.ts +18 -0
- package/dist/vendor/codali/cognitive/ContextRedactor.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextRedactor.js +53 -0
- package/dist/vendor/codali/cognitive/ContextSelector.d.ts +22 -0
- package/dist/vendor/codali/cognitive/ContextSelector.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextSelector.js +431 -0
- package/dist/vendor/codali/cognitive/ContextSerializer.d.ts +8 -0
- package/dist/vendor/codali/cognitive/ContextSerializer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextSerializer.js +882 -0
- package/dist/vendor/codali/cognitive/ContextStore.d.ts +27 -0
- package/dist/vendor/codali/cognitive/ContextStore.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextStore.js +79 -0
- package/dist/vendor/codali/cognitive/ContextSummarizer.d.ts +16 -0
- package/dist/vendor/codali/cognitive/ContextSummarizer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextSummarizer.js +45 -0
- package/dist/vendor/codali/cognitive/CostEstimator.d.ts +31 -0
- package/dist/vendor/codali/cognitive/CostEstimator.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/CostEstimator.js +66 -0
- package/dist/vendor/codali/cognitive/CriticEvaluator.d.ts +32 -0
- package/dist/vendor/codali/cognitive/CriticEvaluator.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/CriticEvaluator.js +297 -0
- package/dist/vendor/codali/cognitive/EvidenceGate.d.ts +9 -0
- package/dist/vendor/codali/cognitive/EvidenceGate.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/EvidenceGate.js +75 -0
- package/dist/vendor/codali/cognitive/GoldenExampleIndexer.d.ts +12 -0
- package/dist/vendor/codali/cognitive/GoldenExampleIndexer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/GoldenExampleIndexer.js +34 -0
- package/dist/vendor/codali/cognitive/GoldenSetStore.d.ts +33 -0
- package/dist/vendor/codali/cognitive/GoldenSetStore.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/GoldenSetStore.js +159 -0
- package/dist/vendor/codali/cognitive/IntentSignals.d.ts +7 -0
- package/dist/vendor/codali/cognitive/IntentSignals.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/IntentSignals.js +285 -0
- package/dist/vendor/codali/cognitive/LearningGovernance.d.ts +100 -0
- package/dist/vendor/codali/cognitive/LearningGovernance.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/LearningGovernance.js +276 -0
- package/dist/vendor/codali/cognitive/MemoryWriteback.d.ts +64 -0
- package/dist/vendor/codali/cognitive/MemoryWriteback.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/MemoryWriteback.js +287 -0
- package/dist/vendor/codali/cognitive/PatchApplier.d.ts +49 -0
- package/dist/vendor/codali/cognitive/PatchApplier.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PatchApplier.js +199 -0
- package/dist/vendor/codali/cognitive/PatchInterpreter.d.ts +35 -0
- package/dist/vendor/codali/cognitive/PatchInterpreter.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PatchInterpreter.js +100 -0
- package/dist/vendor/codali/cognitive/PatchOutputNormalizer.d.ts +7 -0
- package/dist/vendor/codali/cognitive/PatchOutputNormalizer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PatchOutputNormalizer.js +59 -0
- package/dist/vendor/codali/cognitive/PostMortemAnalyzer.d.ts +17 -0
- package/dist/vendor/codali/cognitive/PostMortemAnalyzer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PostMortemAnalyzer.js +131 -0
- package/dist/vendor/codali/cognitive/PreferenceExtraction.d.ts +3 -0
- package/dist/vendor/codali/cognitive/PreferenceExtraction.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PreferenceExtraction.js +85 -0
- package/dist/vendor/codali/cognitive/Prompts.d.ts +15 -0
- package/dist/vendor/codali/cognitive/Prompts.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/Prompts.js +326 -0
- package/dist/vendor/codali/cognitive/ProviderRouting.d.ts +16 -0
- package/dist/vendor/codali/cognitive/ProviderRouting.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ProviderRouting.js +24 -0
- package/dist/vendor/codali/cognitive/QueryExtraction.d.ts +12 -0
- package/dist/vendor/codali/cognitive/QueryExtraction.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/QueryExtraction.js +262 -0
- package/dist/vendor/codali/cognitive/RunHistoryIndexer.d.ts +13 -0
- package/dist/vendor/codali/cognitive/RunHistoryIndexer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/RunHistoryIndexer.js +125 -0
- package/dist/vendor/codali/cognitive/SmartPipeline.d.ts +92 -0
- package/dist/vendor/codali/cognitive/SmartPipeline.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/SmartPipeline.js +4804 -0
- package/dist/vendor/codali/cognitive/Types.d.ts +474 -0
- package/dist/vendor/codali/cognitive/Types.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/Types.js +7 -0
- package/dist/vendor/codali/cognitive/ValidationRunner.d.ts +57 -0
- package/dist/vendor/codali/cognitive/ValidationRunner.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ValidationRunner.js +515 -0
- package/dist/vendor/codali/config/Config.d.ts +249 -0
- package/dist/vendor/codali/config/Config.d.ts.map +1 -0
- package/dist/vendor/codali/config/Config.js +200 -0
- package/dist/vendor/codali/config/ConfigLoader.d.ts +56 -0
- package/dist/vendor/codali/config/ConfigLoader.d.ts.map +1 -0
- package/dist/vendor/codali/config/ConfigLoader.js +1246 -0
- package/dist/vendor/codali/docdex/DocdexClient.d.ts +158 -0
- package/dist/vendor/codali/docdex/DocdexClient.d.ts.map +1 -0
- package/dist/vendor/codali/docdex/DocdexClient.js +785 -0
- package/dist/vendor/codali/eval/EvalRunner.d.ts +35 -0
- package/dist/vendor/codali/eval/EvalRunner.d.ts.map +1 -0
- package/dist/vendor/codali/eval/EvalRunner.js +38 -0
- package/dist/vendor/codali/eval/EvalTaskExecutor.d.ts +81 -0
- package/dist/vendor/codali/eval/EvalTaskExecutor.d.ts.map +1 -0
- package/dist/vendor/codali/eval/EvalTaskExecutor.js +371 -0
- package/dist/vendor/codali/eval/GateEvaluator.d.ts +31 -0
- package/dist/vendor/codali/eval/GateEvaluator.d.ts.map +1 -0
- package/dist/vendor/codali/eval/GateEvaluator.js +134 -0
- package/dist/vendor/codali/eval/MetricTypes.d.ts +28 -0
- package/dist/vendor/codali/eval/MetricTypes.d.ts.map +1 -0
- package/dist/vendor/codali/eval/MetricTypes.js +1 -0
- package/dist/vendor/codali/eval/MetricsAggregator.d.ts +4 -0
- package/dist/vendor/codali/eval/MetricsAggregator.d.ts.map +1 -0
- package/dist/vendor/codali/eval/MetricsAggregator.js +97 -0
- package/dist/vendor/codali/eval/RegressionComparator.d.ts +29 -0
- package/dist/vendor/codali/eval/RegressionComparator.d.ts.map +1 -0
- package/dist/vendor/codali/eval/RegressionComparator.js +155 -0
- package/dist/vendor/codali/eval/ReportInputAdapter.d.ts +52 -0
- package/dist/vendor/codali/eval/ReportInputAdapter.d.ts.map +1 -0
- package/dist/vendor/codali/eval/ReportInputAdapter.js +229 -0
- package/dist/vendor/codali/eval/ReportSerializer.d.ts +32 -0
- package/dist/vendor/codali/eval/ReportSerializer.d.ts.map +1 -0
- package/dist/vendor/codali/eval/ReportSerializer.js +33 -0
- package/dist/vendor/codali/eval/ReportStore.d.ts +18 -0
- package/dist/vendor/codali/eval/ReportStore.d.ts.map +1 -0
- package/dist/vendor/codali/eval/ReportStore.js +96 -0
- package/dist/vendor/codali/eval/SuiteLoader.d.ts +12 -0
- package/dist/vendor/codali/eval/SuiteLoader.d.ts.map +1 -0
- package/dist/vendor/codali/eval/SuiteLoader.js +51 -0
- package/dist/vendor/codali/eval/SuiteSchema.d.ts +56 -0
- package/dist/vendor/codali/eval/SuiteSchema.d.ts.map +1 -0
- package/dist/vendor/codali/eval/SuiteSchema.js +357 -0
- package/dist/vendor/codali/index.d.ts +11 -0
- package/dist/vendor/codali/index.d.ts.map +1 -0
- package/dist/vendor/codali/index.js +5 -0
- package/dist/vendor/codali/providers/CodexCliProvider.d.ts +8 -0
- package/dist/vendor/codali/providers/CodexCliProvider.d.ts.map +1 -0
- package/dist/vendor/codali/providers/CodexCliProvider.js +282 -0
- package/dist/vendor/codali/providers/OllamaRemoteProvider.d.ts +8 -0
- package/dist/vendor/codali/providers/OllamaRemoteProvider.d.ts.map +1 -0
- package/dist/vendor/codali/providers/OllamaRemoteProvider.js +300 -0
- package/dist/vendor/codali/providers/OpenAiCompatibleProvider.d.ts +8 -0
- package/dist/vendor/codali/providers/OpenAiCompatibleProvider.d.ts.map +1 -0
- package/dist/vendor/codali/providers/OpenAiCompatibleProvider.js +192 -0
- package/dist/vendor/codali/providers/ProviderRegistry.d.ts +12 -0
- package/dist/vendor/codali/providers/ProviderRegistry.d.ts.map +1 -0
- package/dist/vendor/codali/providers/ProviderRegistry.js +28 -0
- package/dist/vendor/codali/providers/ProviderTypes.d.ts +81 -0
- package/dist/vendor/codali/providers/ProviderTypes.d.ts.map +1 -0
- package/dist/vendor/codali/providers/ProviderTypes.js +1 -0
- package/dist/vendor/codali/runtime/CodaliRuntime.d.ts +189 -0
- package/dist/vendor/codali/runtime/CodaliRuntime.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/CodaliRuntime.js +1435 -0
- package/dist/vendor/codali/runtime/DeepInvestigationErrors.d.ts +39 -0
- package/dist/vendor/codali/runtime/DeepInvestigationErrors.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/DeepInvestigationErrors.js +57 -0
- package/dist/vendor/codali/runtime/RunContext.d.ts +27 -0
- package/dist/vendor/codali/runtime/RunContext.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunContext.js +51 -0
- package/dist/vendor/codali/runtime/RunLogQuery.d.ts +48 -0
- package/dist/vendor/codali/runtime/RunLogQuery.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunLogQuery.js +36 -0
- package/dist/vendor/codali/runtime/RunLogReader.d.ts +19 -0
- package/dist/vendor/codali/runtime/RunLogReader.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunLogReader.js +361 -0
- package/dist/vendor/codali/runtime/RunLogger.d.ts +71 -0
- package/dist/vendor/codali/runtime/RunLogger.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunLogger.js +100 -0
- package/dist/vendor/codali/runtime/RunTelemetryTypes.d.ts +117 -0
- package/dist/vendor/codali/runtime/RunTelemetryTypes.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunTelemetryTypes.js +299 -0
- package/dist/vendor/codali/runtime/Runner.d.ts +66 -0
- package/dist/vendor/codali/runtime/Runner.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/Runner.js +215 -0
- package/dist/vendor/codali/runtime/StoragePaths.d.ts +3 -0
- package/dist/vendor/codali/runtime/StoragePaths.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/StoragePaths.js +19 -0
- package/dist/vendor/codali/runtime/WorkspaceLock.d.ts +30 -0
- package/dist/vendor/codali/runtime/WorkspaceLock.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/WorkspaceLock.js +141 -0
- package/dist/vendor/codali/session/InstructionLoader.d.ts +14 -0
- package/dist/vendor/codali/session/InstructionLoader.d.ts.map +1 -0
- package/dist/vendor/codali/session/InstructionLoader.js +107 -0
- package/dist/vendor/codali/session/SessionStore.d.ts +81 -0
- package/dist/vendor/codali/session/SessionStore.d.ts.map +1 -0
- package/dist/vendor/codali/session/SessionStore.js +244 -0
- package/dist/vendor/codali/subagents/SubagentOrchestrator.d.ts +68 -0
- package/dist/vendor/codali/subagents/SubagentOrchestrator.d.ts.map +1 -0
- package/dist/vendor/codali/subagents/SubagentOrchestrator.js +150 -0
- package/dist/vendor/codali/tools/ToolRegistry.d.ts +9 -0
- package/dist/vendor/codali/tools/ToolRegistry.d.ts.map +1 -0
- package/dist/vendor/codali/tools/ToolRegistry.js +293 -0
- package/dist/vendor/codali/tools/ToolTypes.d.ts +66 -0
- package/dist/vendor/codali/tools/ToolTypes.d.ts.map +1 -0
- package/dist/vendor/codali/tools/ToolTypes.js +40 -0
- package/dist/vendor/codali/tools/diff/DiffTool.d.ts +3 -0
- package/dist/vendor/codali/tools/diff/DiffTool.d.ts.map +1 -0
- package/dist/vendor/codali/tools/diff/DiffTool.js +34 -0
- package/dist/vendor/codali/tools/docdex/DocdexTools.d.ts +4 -0
- package/dist/vendor/codali/tools/docdex/DocdexTools.d.ts.map +1 -0
- package/dist/vendor/codali/tools/docdex/DocdexTools.js +490 -0
- package/dist/vendor/codali/tools/filesystem/FileTools.d.ts +3 -0
- package/dist/vendor/codali/tools/filesystem/FileTools.d.ts.map +1 -0
- package/dist/vendor/codali/tools/filesystem/FileTools.js +141 -0
- package/dist/vendor/codali/tools/search/SearchTool.d.ts +3 -0
- package/dist/vendor/codali/tools/search/SearchTool.d.ts.map +1 -0
- package/dist/vendor/codali/tools/search/SearchTool.js +46 -0
- package/dist/vendor/codali/tools/shell/ShellTool.d.ts +3 -0
- package/dist/vendor/codali/tools/shell/ShellTool.d.ts.map +1 -0
- package/dist/vendor/codali/tools/shell/ShellTool.js +104 -0
- package/package.json +5 -3
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import type { EvalTaskExecution } from "./EvalTaskExecutor.js";
|
|
2
|
+
import type { EvalTaskDefinition } from "./SuiteSchema.js";
|
|
3
|
+
/**
 * Minimal executor contract consumed by `EvalRunner`: anything that can turn
 * one suite task into a recorded execution.
 */
export interface EvalTaskExecutorLike {
    /** Executes a single task and resolves with its execution record. */
    executeTask(task: EvalTaskDefinition): Promise<EvalTaskExecution>;
}
|
|
6
|
+
/** Aggregate counters attached to an `EvalRunResult`. */
export interface EvalRunSummary {
    /** Number of task results collected during the run. */
    total: number;
    /** Number of results flagged as `task_passed`. */
    passed: number;
    /** Number of results that did not pass (`total - passed`). */
    failed: number;
    /** Number of results that carry an `execution_error`. */
    execution_errors: number;
}
|
|
12
|
+
/** Complete outcome of one `EvalRunner.run()` invocation. */
export interface EvalRunResult {
    /** Version tag of this result shape; currently always `1`. */
    schema_version: 1;
    /** Identifier of the suite that was run. */
    suite_id: string;
    /** Fingerprint of the suite that was run. */
    suite_fingerprint: string;
    /** Run start time as an ISO-8601 string. */
    started_at: string;
    /** Run end time as an ISO-8601 string. */
    ended_at: string;
    /** Elapsed time between start and end, in milliseconds. */
    duration_ms: number;
    /** One entry per executed task, in execution order. */
    task_results: Array<EvalTaskExecution>;
    /** Counters derived from `task_results`. */
    summary: EvalRunSummary;
}
|
|
22
|
+
/**
 * Runs every task of a suite through an executor and summarises the results.
 */
export declare class EvalRunner {
    private readonly suiteId;
    private readonly suiteFingerprint;
    private readonly tasks;
    private readonly executor;
    /**
     * @param params Suite identity, the tasks to run, and the executor that
     *   runs each of them.
     */
    constructor(params: {
        suite_id: string;
        suite_fingerprint: string;
        tasks: Array<EvalTaskDefinition>;
        executor: EvalTaskExecutorLike;
    });
    /** Executes all tasks and resolves with the aggregated run result. */
    run(): Promise<EvalRunResult>;
}
|
|
35
|
+
//# sourceMappingURL=EvalRunner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"EvalRunner.d.ts","sourceRoot":"","sources":["../../src/eval/EvalRunner.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAC;AAC/D,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAE3D,MAAM,WAAW,oBAAoB;IACnC,WAAW,CAAC,IAAI,EAAE,kBAAkB,GAAG,OAAO,CAAC,iBAAiB,CAAC,CAAC;CACnE;AAED,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,gBAAgB,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,aAAa;IAC5B,cAAc,EAAE,CAAC,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,iBAAiB,EAAE,CAAC;IAClC,OAAO,EAAE,cAAc,CAAC;CACzB;AAED,qBAAa,UAAU;IACrB,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAS;IAEjC,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAS;IAE1C,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAuB;IAE7C,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAuB;gBAEpC,MAAM,EAAE;QAClB,QAAQ,EAAE,MAAM,CAAC;QACjB,iBAAiB,EAAE,MAAM,CAAC;QAC1B,KAAK,EAAE,kBAAkB,EAAE,CAAC;QAC5B,QAAQ,EAAE,oBAAoB,CAAC;KAChC;IAOK,GAAG,IAAI,OAAO,CAAC,aAAa,CAAC;CA8BpC"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
 * Drives an evaluation suite: runs each task through the supplied executor,
 * one at a time, and aggregates the outcomes into a single result object.
 */
export class EvalRunner {
    /**
     * @param params Object with `suite_id`, `suite_fingerprint`, `tasks`
     *   (the task definitions to run) and `executor` (exposes `executeTask`).
     */
    constructor(params) {
        const { suite_id, suite_fingerprint, tasks, executor } = params;
        this.suiteId = suite_id;
        this.suiteFingerprint = suite_fingerprint;
        this.tasks = tasks;
        this.executor = executor;
    }
    /**
     * Executes every task sequentially and resolves with the run result
     * (timestamps, per-task results in task order, and summary counters).
     */
    async run() {
        const startedAtMs = Date.now();
        const taskResults = [];
        // Tasks run strictly one after another so result order always mirrors
        // task order, which keeps reports comparable between runs.
        for (const task of this.tasks) {
            // eslint-disable-next-line no-await-in-loop
            const outcome = await this.executor.executeTask(task);
            taskResults.push(outcome);
        }
        const endedAtMs = Date.now();
        // Tally both counters in a single pass over the collected results.
        let passed = 0;
        let executionErrors = 0;
        for (const outcome of taskResults) {
            if (outcome.task_passed)
                passed += 1;
            if (outcome.execution_error)
                executionErrors += 1;
        }
        const total = taskResults.length;
        return {
            schema_version: 1,
            suite_id: this.suiteId,
            suite_fingerprint: this.suiteFingerprint,
            started_at: new Date(startedAtMs).toISOString(),
            ended_at: new Date(endedAtMs).toISOString(),
            duration_ms: endedAtMs - startedAtMs,
            task_results: taskResults,
            summary: {
                total,
                passed,
                failed: total - passed,
                execution_errors: executionErrors,
            },
        };
    }
}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import type { VerificationOutcome } from "../cognitive/Types.js";
|
|
2
|
+
import type { SafetyTelemetryEventData } from "../runtime/RunLogger.js";
|
|
3
|
+
import { type NormalizedRunRecord } from "./ReportInputAdapter.js";
|
|
4
|
+
import type { EvalTaskDefinition } from "./SuiteSchema.js";
|
|
5
|
+
/**
 * Run metadata recovered from a `CODALI_RUN_META <json>` line in a run's
 * captured stderr. Every field except `touchedFiles` is optional because the
 * parser only keeps values of the expected type.
 */
export interface EvalRunMeta {
    /** Run identifier, when reported. */
    runId?: string;
    /** Run fingerprint; `null` when absent or not a string. */
    fingerprint?: string | null;
    /** Path of the run log, when reported. */
    logPath?: string;
    /** Path of the output log, when reported. */
    outputLogPath?: string;
    /** Reported touched files; empty when none were reported. */
    touchedFiles: Array<string>;
    command?: string;
    commandRunId?: string;
    jobId?: string;
    project?: string;
    taskId?: string;
    taskKey?: string;
    agentId?: string;
    agentSlug?: string;
    /** Workflow payload as a plain object; `null` when absent or not an object. */
    workflow?: Record<string, unknown> | null;
}
|
|
21
|
+
/** Outcome of checking one task assertion against an observed run. */
export interface EvalAssertionResult {
    /** Machine-readable assertion identifier (e.g. `assert_expect_success`). */
    code: string;
    /** Whether the observed value satisfied the expectation. */
    passed: boolean;
    /** Human-readable description of the assertion. */
    message: string;
    /** Value the task definition asked for. */
    expected?: unknown;
    /** Value observed from the run. */
    actual?: unknown;
}
|
|
28
|
+
/**
 * Record of a single executed evaluation task.
 *
 * NOTE(review): the code that fills this record is outside the visible part
 * of the file; field notes below only restate what the types and the visible
 * consumers show.
 */
export interface EvalTaskExecution {
    // --- Identity -----------------------------------------------------------
    task_id: string;
    title: string;
    command: string;
    mode: "success" | "failure";

    // --- Timing and process outcome -----------------------------------------
    started_at: string;
    ended_at: string;
    duration_ms: number;
    exit_code: number | null;
    run_succeeded: boolean;
    /** Counted by `EvalRunner` to compute `summary.passed`. */
    task_passed: boolean;

    // --- Derived signals (nullable) -----------------------------------------
    first_pass: boolean | null;
    patch_apply_success: boolean | null;
    verification_outcome: VerificationOutcome | null;
    verification_passed: boolean | null;
    hallucination_detected: boolean | null;
    scope_violation_detected: boolean | null;
    latency_ms: number | null;
    tokens_used: number | null;
    cost_usd: number | null;

    // --- Evidence -----------------------------------------------------------
    assertion_results: Array<EvalAssertionResult>;
    stdout: string;
    stderr: string;
    command_line: Array<string>;
    run_meta?: EvalRunMeta;
    run_summary?: Record<string, unknown>;
    normalized_run?: NormalizedRunRecord;
    safety_events: Array<SafetyTelemetryEventData>;
    /** When set, `EvalRunner` counts this result in `summary.execution_errors`. */
    execution_error?: string;
}
|
|
58
|
+
/**
 * Construction options for `EvalTaskExecutor`.
 *
 * NOTE(review): how each option is consumed is outside the visible part of
 * the file, so only the declared shape is documented here.
 */
export interface EvalTaskExecutorOptions {
    // --- Required locations -------------------------------------------------
    workspace_root: string;
    suite_dir: string;

    // --- Optional CLI / provider selection ----------------------------------
    cli_entry?: string;
    provider?: string;
    model?: string;
    api_key?: string;
    base_url?: string;

    // --- Optional agent / workflow selection --------------------------------
    agent?: string;
    agent_id?: string;
    agent_slug?: string;
    workflow_profile?: string;
    smart?: boolean;
    no_deep_investigation?: boolean;

    // --- Optional execution environment -------------------------------------
    timeout_ms?: number;
    extra_env?: NodeJS.ProcessEnv;
    log_dir?: string;
}
|
|
76
|
+
/**
 * Concrete task executor; its `executeTask` has the same signature as
 * `EvalTaskExecutorLike.executeTask`.
 */
export declare class EvalTaskExecutor {
    private readonly options;
    /** @param options Settings applied to every task this executor runs. */
    constructor(options: EvalTaskExecutorOptions);
    /** Executes one task and resolves with its execution record. */
    executeTask(task: EvalTaskDefinition): Promise<EvalTaskExecution>;
}
|
|
81
|
+
//# sourceMappingURL=EvalTaskExecutor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"EvalTaskExecutor.d.ts","sourceRoot":"","sources":["../../src/eval/EvalTaskExecutor.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,uBAAuB,CAAC;AAGjE,OAAO,KAAK,EAAE,wBAAwB,EAAE,MAAM,yBAAyB,CAAC;AACxE,OAAO,EAEL,KAAK,mBAAmB,EACzB,MAAM,yBAAyB,CAAC;AACjC,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAG3D,MAAM,WAAW,WAAW;IAC1B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;CAC3C;AAED,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,OAAO,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AAED,MAAM,WAAW,iBAAiB;IAChC,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,EAAE,SAAS,GAAG,SAAS,CAAC;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,aAAa,EAAE,OAAO,CAAC;IACvB,WAAW,EAAE,OAAO,CAAC;IACrB,UAAU,EAAE,OAAO,GAAG,IAAI,CAAC;IAC3B,mBAAmB,EAAE,OAAO,GAAG,IAAI,CAAC;IACpC,oBAAoB,EAAE,mBAAmB,GAAG,IAAI,CAAC;IACjD,mBAAmB,EAAE,OAAO,GAAG,IAAI,CAAC;IACpC,sBAAsB,EAAE,OAAO,GAAG,IAAI,CAAC;IACvC,wBAAwB,EAAE,OAAO,GAAG,IAAI,CAAC;IACzC,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,iBAAiB,EAAE,mBAAmB,EAAE,CAAC;IACzC,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,QAAQ,CAAC,EAAE,WAAW,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACtC,cAAc,CAAC,EAAE,mBAAmB,CAAC;IACrC,aAAa,EAAE,wBAAwB,EAAE,CAAC;IAC1C,eAAe,CAAC,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,uBAAuB;IACtC,cAAc,EAAE,MAAM,CAAC;IACvB,
SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC;IAC9B,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AA+ND,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA0B;gBAEtC,OAAO,EAAE,uBAAuB;IAItC,WAAW,CAAC,IAAI,EAAE,kBAAkB,GAAG,OAAO,CAAC,iBAAiB,CAAC;CAmKxE"}
|
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
import { spawnSync } from "node:child_process";
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import process from "node:process";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
6
|
+
import { DEFAULT_LOG_DIR } from "../config/Config.js";
|
|
7
|
+
import { RunLogReader } from "../runtime/RunLogReader.js";
|
|
8
|
+
import { adaptRunSummaryForReport, } from "./ReportInputAdapter.js";
|
|
9
|
+
import { resolveTaskFilePath } from "./SuiteLoader.js";
|
|
10
|
+
/**
 * Returns `value` unchanged when it is a finite number; otherwise `null`.
 * NaN, ±Infinity and every non-number input are rejected.
 */
const asNumber = (value) => {
    if (typeof value !== "number")
        return null;
    return Number.isFinite(value) ? value : null;
};
|
|
11
|
+
/**
 * Returns `value` itself when it is a non-null, non-array object; otherwise
 * `undefined`. The input is never copied.
 */
const asRecord = (value) => {
    const isObjectLike = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    return isObjectLike ? value : undefined;
};
|
|
16
|
+
/**
 * Extracts run metadata from captured stderr.
 *
 * Looks for the LAST non-empty (trimmed) line that starts with
 * `CODALI_RUN_META ` and parses the remainder of that line as JSON.
 *
 * @param stderr Captured stderr text.
 * @returns The metadata object, or `undefined` when no marker line exists or
 *   the payload of the last marker line cannot be parsed/read. Earlier marker
 *   lines are not tried after a failure.
 */
const parseRunMeta = (stderr) => {
    const marker = "CODALI_RUN_META ";
    const metaLine = stderr
        .split(/\r?\n/)
        .map((raw) => raw.trim())
        .filter(Boolean)
        .reverse()
        .find((candidate) => candidate.startsWith(marker));
    if (metaLine === undefined)
        return undefined;
    // Keeps a field only when it is a string.
    const stringOrUndefined = (field) => (typeof field === "string" ? field : undefined);
    try {
        const meta = JSON.parse(metaLine.slice(marker.length));
        const rawTouched = meta.touchedFiles;
        const touchedFiles = Array.isArray(rawTouched)
            ? rawTouched.filter((entry) => typeof entry === "string")
            : [];
        return {
            runId: stringOrUndefined(meta.runId),
            // Unlike the other string fields, a missing fingerprint is `null`.
            fingerprint: typeof meta.fingerprint === "string" ? meta.fingerprint : null,
            logPath: stringOrUndefined(meta.logPath),
            outputLogPath: stringOrUndefined(meta.outputLogPath),
            touchedFiles,
            command: stringOrUndefined(meta.command),
            commandRunId: stringOrUndefined(meta.commandRunId),
            jobId: stringOrUndefined(meta.jobId),
            project: stringOrUndefined(meta.project),
            taskId: stringOrUndefined(meta.taskId),
            taskKey: stringOrUndefined(meta.taskKey),
            agentId: stringOrUndefined(meta.agentId),
            agentSlug: stringOrUndefined(meta.agentSlug),
            workflow: asRecord(meta.workflow) ?? null,
        };
    }
    catch {
        // Invalid JSON (or a `null` payload) means no usable metadata.
        return undefined;
    }
};
|
|
52
|
+
/**
 * Reads a JSON-lines run log and collects what the evaluator needs from it.
 *
 * - The `data` object of the last `run_summary` event becomes `run_summary`.
 * - For every `run_failed` event, each non-blank string in `data.reasons`
 *   (trimmed) and then the trimmed `data.stage` (if non-empty) are appended
 *   to `run_failed_reasons`.
 *
 * Reading is best effort: a missing path, an unreadable file, unparsable
 * lines, and lines whose JSON value is not an object are all ignored.
 *
 * @param logPath Path of the log file; may be empty/undefined.
 * @returns `{ run_summary, run_failed_reasons }`; `run_summary` is omitted
 *   when the file could not be read and `undefined` when no summary was found.
 */
const readRunLog = async (logPath) => {
    if (!logPath)
        return { run_failed_reasons: [] };
    let content = "";
    try {
        content = await readFile(logPath, "utf8");
    }
    catch {
        // A log that cannot be read is treated the same as an empty log.
        return { run_failed_reasons: [] };
    }
    let runSummary;
    const runFailedReasons = [];
    const lines = content.split("\n").filter(Boolean);
    for (const line of lines) {
        let parsed;
        try {
            parsed = JSON.parse(line);
        }
        catch {
            continue;
        }
        // JSON.parse also accepts bare `null`, numbers, strings and arrays.
        // Only objects can be log events; in particular a `null` line must not
        // reach the property reads below, where it would throw a TypeError.
        const entry = asRecord(parsed);
        if (!entry)
            continue;
        const type = typeof entry.type === "string" ? entry.type : "";
        const data = asRecord(entry.data) ?? {};
        if (type === "run_summary") {
            runSummary = data;
            continue;
        }
        if (type !== "run_failed")
            continue;
        const reasonsValue = data.reasons;
        if (Array.isArray(reasonsValue)) {
            for (const reason of reasonsValue) {
                if (typeof reason === "string" && reason.trim())
                    runFailedReasons.push(reason.trim());
            }
        }
        const stage = typeof data.stage === "string" ? data.stage.trim() : "";
        if (stage)
            runFailedReasons.push(stage);
    }
    return { run_summary: runSummary, run_failed_reasons: runFailedReasons };
};
|
|
94
|
+
/**
 * Picks the CLI script path to use.
 *
 * Order of preference: the explicitly provided path, then the script the
 * current process was started with (`process.argv[1]`), then `cli.js`
 * resolved two path segments up from this module's own file path.
 *
 * @param provided Optional explicit CLI entry path.
 * @returns The chosen entry path.
 */
const resolveCliEntry = (provided) => {
    const preferred = provided || process.argv[1];
    if (preferred)
        return preferred;
    const thisModulePath = fileURLToPath(import.meta.url);
    return path.resolve(thisModulePath, "..", "..", "cli.js");
};
|
|
102
|
+
/**
 * Reports whether stderr or any failure reason mentions a patch failure.
 * Matching is a case-insensitive substring search for a fixed set of markers.
 *
 * @param stderr Captured stderr text.
 * @param reasons Failure reason strings.
 * @returns `true` when at least one marker occurs.
 */
const hasPatchFailure = (stderr, reasons) => {
    const markers = [
        "patch_apply_failed",
        "patch_scope_violation",
        "patch_search",
        "patch_rollback",
        "search block",
    ];
    const combined = [stderr, ...reasons].join("\n").toLowerCase();
    return markers.some((marker) => combined.includes(marker));
};
|
|
110
|
+
/**
 * Heuristic check for hallucination markers in stderr or failure reasons.
 * The pattern is case-insensitive and also matches missing-file errors
 * (`ENOENT`, "no such file or directory").
 *
 * @param stderr Captured stderr text.
 * @param reasons Failure reason strings.
 * @returns `true` when the pattern matches anywhere in the combined text.
 */
const detectHallucination = (stderr, reasons) => {
    const hallucinationPattern = /\bhallucinat|\bunknown symbol\b|\bunknown file\b|non[-_ ]existent|ENOENT|no such file or directory/i;
    const combined = [stderr, ...reasons].join("\n");
    return hallucinationPattern.test(combined);
};
|
|
114
|
+
/**
 * Reports whether a run violated its allowed scope.
 *
 * A safety event with code `scope_violation` decides the answer immediately;
 * otherwise stderr and the failure reasons are searched (case-insensitively)
 * for scope-violation markers.
 *
 * @param stderr Captured stderr text.
 * @param reasons Failure reason strings.
 * @param safetyEvents Safety telemetry events; only `code` is read.
 * @returns `true` when a violation is indicated by either source.
 */
const detectScopeViolation = (stderr, reasons, safetyEvents) => {
    const flaggedByTelemetry = safetyEvents.some(({ code }) => code === "scope_violation");
    return flaggedByTelemetry
        || /\bscope_violation\b|patch_outside_allowed_scope|patch_outside_workspace/i.test([stderr, ...reasons].join("\n"));
};
|
|
120
|
+
/**
 * Evaluates the assertions declared on a task against the observed run.
 *
 * The run-success check is always emitted. Exit-code, patch-apply,
 * verification, latency and cost checks are emitted only when the task
 * declares the corresponding expectation (a verification expectation of
 * `"any"` is skipped). The hallucination and scope-violation checks are
 * emitted unless the task explicitly allows them.
 *
 * For the patch-apply, verification, latency and cost checks, an observed
 * value of `null` fails the check and switches its code to the `_missing`
 * variant.
 *
 * @param params - Observed outcome plus the task: `task`, `runSucceeded`,
 *   `exitCode`, `patchApplySuccess`, `verificationOutcome`, `latencyMs`,
 *   `costUsd`, `hallucinationDetected`, `scopeViolationDetected`.
 * @returns Assertion records shaped `{ code, passed, message, expected, actual }`,
 *   in the order listed above.
 */
const buildAssertions = (params) => {
    const spec = params.task.assertions;
    // Builds one record; key order is fixed here for every assertion.
    const entry = (code, passed, message, expected, actual) => ({
        code,
        passed,
        message,
        expected,
        actual,
    });
    // Builds a record for an observation that may be unavailable (null).
    const nullableEntry = (code, missingCode, message, expected, actual, holds) => {
        const missing = actual === null;
        return entry(missing ? missingCode : code, !missing && holds(actual, expected), message, expected, actual);
    };
    const equals = (actual, expected) => actual === expected;
    const atMost = (actual, limit) => actual <= limit;
    const results = [
        entry("assert_expect_success", params.runSucceeded === spec.expect_success, "Run success matched expectation.", spec.expect_success, params.runSucceeded),
    ];
    if (spec.expect_exit_code !== undefined) {
        results.push(entry("assert_expect_exit_code", params.exitCode === spec.expect_exit_code, "Exit code matched expectation.", spec.expect_exit_code, params.exitCode));
    }
    if (spec.expect_patch_apply !== undefined) {
        results.push(nullableEntry("assert_expect_patch_apply", "assert_expect_patch_apply_missing", "Patch apply outcome matched expectation.", spec.expect_patch_apply, params.patchApplySuccess, equals));
    }
    const wantsVerification = spec.expect_verification !== undefined
        && spec.expect_verification !== "any";
    if (wantsVerification) {
        results.push(nullableEntry("assert_expect_verification", "assert_expect_verification_missing", "Verification outcome matched expectation.", spec.expect_verification, params.verificationOutcome, equals));
    }
    if (spec.max_latency_ms !== undefined) {
        results.push(nullableEntry("assert_max_latency", "assert_max_latency_missing", "Latency stayed within threshold.", spec.max_latency_ms, params.latencyMs, atMost));
    }
    if (spec.max_cost_usd !== undefined) {
        results.push(nullableEntry("assert_max_cost", "assert_max_cost_missing", "Cost stayed within threshold.", spec.max_cost_usd, params.costUsd, atMost));
    }
    if (!spec.allow_hallucination) {
        results.push(entry("assert_no_hallucination", params.hallucinationDetected !== true, "No hallucination signals were detected.", false, params.hallucinationDetected));
    }
    if (!spec.allow_scope_violation) {
        results.push(entry("assert_no_scope_violation", params.scopeViolationDetected !== true, "No scope-violation signals were detected.", false, params.scopeViolationDetected));
    }
    return results;
};
|
|
200
|
+
/**
 * Runs one eval task by invoking the CLI entry point in a child Node process
 * and turning the captured output and run logs into a task result record.
 */
export class EvalTaskExecutor {
    /**
     * @param options - Executor settings. The fields read by `executeTask`
     *   are `cli_entry`, `workspace_root`, `suite_dir`, `log_dir`,
     *   `timeout_ms`, `extra_env`, and the optional CLI pass-through values
     *   (`provider`, `model`, `api_key`, `base_url`, `agent`, `agent_id`,
     *   `agent_slug`, `workflow_profile`, `smart`, `no_deep_investigation`).
     */
    constructor(options) {
        this.options = options;
    }
    /**
     * Executes a single task synchronously in a child process, then derives
     * metrics and assertion results from stderr, the run log and log readers.
     *
     * @param task - Task definition; `command`, `args`, `inline_task`, `id`,
     *   `title`, `mode` and `assertions` are read here.
     * @returns The task result record (timings, exit status, derived metrics,
     *   assertion results, raw stdout/stderr and run metadata).
     */
    async executeTask(task) {
        const startedAtMs = Date.now();
        const startedAt = new Date(startedAtMs).toISOString();
        const cliEntry = resolveCliEntry(this.options.cli_entry);
        // --- Build the CLI argument list -----------------------------------
        const commandArgs = [task.command, "--workspace-root", this.options.workspace_root];
        // Flags that are forwarded with a value only when that value is truthy.
        // NOTE(review): the API key travels on argv and is also echoed in the
        // returned `command_line` — confirm this exposure is acceptable.
        const valuedFlags = [
            ["--provider", this.options.provider],
            ["--model", this.options.model],
            ["--api-key", this.options.api_key],
            ["--base-url", this.options.base_url],
            ["--agent", this.options.agent],
            ["--agent-id", this.options.agent_id],
            ["--agent-slug", this.options.agent_slug],
            ["--profile", this.options.workflow_profile],
        ];
        for (const [flag, value] of valuedFlags) {
            if (value)
                commandArgs.push(flag, value);
        }
        if (this.options.smart === true)
            commandArgs.push("--smart");
        if (this.options.no_deep_investigation)
            commandArgs.push("--no-deep-investigation");
        if (task.args.length > 0)
            commandArgs.push(...task.args);
        // A resolved task file takes precedence over inline task text.
        const taskFilePath = resolveTaskFilePath(task, this.options.suite_dir, this.options.workspace_root);
        if (taskFilePath) {
            commandArgs.push("--task", taskFilePath);
        }
        else if (task.inline_task) {
            commandArgs.push(task.inline_task);
        }
        const commandLine = [process.execPath, cliEntry, ...commandArgs];
        // --- Run the child process -----------------------------------------
        const defaultTimeoutMs = 20 * 60 * 1000;
        let stdout = "";
        let stderr = "";
        let exitCode = null;
        let executionError;
        try {
            const child = spawnSync(process.execPath, [cliEntry, ...commandArgs], {
                cwd: this.options.workspace_root,
                encoding: "utf8",
                env: {
                    ...process.env,
                    ...(this.options.extra_env ?? {}),
                },
                timeout: this.options.timeout_ms ?? defaultTimeoutMs,
            });
            stdout = (child.stdout ?? "").toString();
            stderr = (child.stderr ?? "").toString();
            exitCode = typeof child.status === "number" ? child.status : null;
            if (child.error)
                executionError = String(child.error);
            // A signal without an exit status replaces any earlier error text.
            if (child.signal && exitCode === null)
                executionError = `terminated_by_signal:${child.signal}`;
        }
        catch (error) {
            executionError = error instanceof Error ? error.message : String(error);
        }
        // --- Collect run artifacts -----------------------------------------
        const runMeta = parseRunMeta(stderr);
        const runLog = await readRunLog(runMeta?.logPath);
        const reader = new RunLogReader(this.options.workspace_root, this.options.log_dir ?? DEFAULT_LOG_DIR);
        const safetyEvents = runMeta?.runId
            ? await reader.getSafetyEvents(runMeta.runId)
            : [];
        const verificationReports = runMeta?.runId
            ? await reader.getVerificationReports(runMeta.runId)
            : [];
        // Only the most recent verification report is considered.
        const latestVerification = verificationReports.length
            ? verificationReports[verificationReports.length - 1]
            : undefined;
        const runSummary = runLog.run_summary;
        const normalizedRun = adaptRunSummaryForReport({
            runSummary,
            runId: runMeta?.runId,
            taskId: task.id,
            verificationOutcome: latestVerification?.outcome ?? null,
            touchedFiles: runMeta?.touchedFiles ?? [],
        });
        // --- Derive metrics ------------------------------------------------
        const runSucceeded = exitCode === 0 && !executionError;
        // Prefer the recorded attempt count; otherwise fall back to run success.
        const attempts = asNumber(asRecord(runSummary?.smartRuntime)?.attempts);
        const firstPass = attempts !== null ? attempts <= 1 : runSucceeded;
        // true: files were touched; false: a patch failure was reported; null: unknown.
        let patchApplySuccess = null;
        if (runMeta?.touchedFiles?.length) {
            patchApplySuccess = true;
        }
        else if (hasPatchFailure(stderr, runLog.run_failed_reasons)) {
            patchApplySuccess = false;
        }
        // Resolution order: normalized run, latest report, then the raw summary
        // (accepted only when it is one of the known outcome strings).
        const resolveVerificationOutcome = () => {
            const normalizedOutcome = normalizedRun.verification_outcome;
            if (normalizedOutcome)
                return normalizedOutcome;
            if (latestVerification?.outcome)
                return latestVerification.outcome;
            const summaryOutcome = asRecord(runSummary?.verification)?.outcome;
            const knownOutcomes = ["verified_passed", "verified_failed", "unverified_with_reason"];
            return knownOutcomes.includes(summaryOutcome) ? summaryOutcome : null;
        };
        const verificationOutcome = resolveVerificationOutcome();
        const verificationPassed = verificationOutcome === null ? null : verificationOutcome === "verified_passed";
        const hallucinationDetected = detectHallucination(stderr, runLog.run_failed_reasons);
        const scopeViolationDetected = detectScopeViolation(stderr, runLog.run_failed_reasons, safetyEvents);
        const latencyMs = normalizedRun.duration_ms ?? asNumber(runSummary?.durationMs) ?? (Date.now() - startedAtMs);
        const usage = asRecord(runSummary?.usage);
        // Last-resort token count: input + output, or null when that sum is not positive.
        const sumTokenParts = () => {
            const combined = (asNumber(usage?.inputTokens) ?? 0) + (asNumber(usage?.outputTokens) ?? 0);
            return combined > 0 ? combined : null;
        };
        const tokensUsed = normalizedRun.usage_tokens_total
            ?? asNumber(usage?.totalTokens)
            ?? sumTokenParts();
        const costUsd = normalizedRun.cost_usd ?? asNumber(runSummary?.actualCost);
        // --- Evaluate assertions and assemble the result -------------------
        const assertionResults = buildAssertions({
            task,
            runSucceeded,
            exitCode,
            patchApplySuccess,
            verificationOutcome,
            latencyMs,
            costUsd,
            hallucinationDetected,
            scopeViolationDetected,
        });
        const taskPassed = assertionResults.every((assertion) => assertion.passed);
        const endedAtMs = Date.now();
        return {
            task_id: task.id,
            title: task.title,
            command: task.command,
            mode: task.mode,
            started_at: startedAt,
            ended_at: new Date(endedAtMs).toISOString(),
            duration_ms: endedAtMs - startedAtMs,
            exit_code: exitCode,
            run_succeeded: runSucceeded,
            task_passed: taskPassed,
            first_pass: firstPass,
            patch_apply_success: patchApplySuccess,
            verification_outcome: verificationOutcome,
            verification_passed: verificationPassed,
            hallucination_detected: hallucinationDetected,
            scope_violation_detected: scopeViolationDetected,
            latency_ms: latencyMs,
            tokens_used: tokensUsed,
            cost_usd: costUsd,
            assertion_results: assertionResults,
            stdout,
            stderr,
            command_line: commandLine,
            run_meta: runMeta,
            run_summary: runSummary,
            normalized_run: normalizedRun,
            safety_events: safetyEvents,
            execution_error: executionError,
        };
    }
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import type { EvalMetrics } from "./MetricTypes.js";
|
|
2
|
+
import type { EvalRegressionComparison } from "./RegressionComparator.js";
|
|
3
|
+
/**
 * Numeric limits used for gate evaluation: this is the type of the
 * `thresholds` argument of `evaluateGates` and of `EvalGateResult.thresholds`.
 *
 * NOTE(review): the scale of each value (fraction vs. percent) is not visible
 * in this declaration file — confirm against the GateEvaluator implementation.
 */
export interface EvalGateThresholds {
|
|
4
|
+
/** Presumably the largest tolerated drop in patch-apply rate — TODO confirm. */
patch_apply_drop_max: number;
|
|
5
|
+
/** Presumably the lowest acceptable verification pass rate — TODO confirm. */
verification_pass_rate_min: number;
|
|
6
|
+
/** Presumably the highest acceptable hallucination rate — TODO confirm. */
hallucination_rate_max: number;
|
|
7
|
+
/** Presumably the highest acceptable scope-violation rate — TODO confirm. */
scope_violation_rate_max: number;
|
|
8
|
+
}
|
|
9
|
+
/**
 * One entry of `EvalGateResult.failures`.
 *
 * NOTE(review): field meanings below are inferred from names only; the
 * producing code is not part of this declaration file.
 */
export interface EvalGateFailure {
|
|
10
|
+
/** String code identifying the failure. */
code: string;
|
|
11
|
+
/** Name of the metric the failure refers to (presumably) — TODO confirm. */
metric: string;
|
|
12
|
+
/** Description text for the failure. */
message: string;
|
|
13
|
+
/** Threshold value associated with the failure. */
threshold: number;
|
|
14
|
+
/** Observed value; may be `null`. */
actual: number | null;
|
|
15
|
+
/** Optional baseline value; may be `null`. Presumably set only when a comparison is supplied — TODO confirm. */
baseline?: number | null;
|
|
16
|
+
/** Optional delta value; may be `null`. Presumably actual vs. baseline — TODO confirm sign convention. */
delta?: number | null;
|
|
17
|
+
}
|
|
18
|
+
/** Return type of `evaluateGates`. */
export interface EvalGateResult {
|
|
19
|
+
/** Schema version of this result shape; fixed to the literal `1`. */
schema_version: 1;
|
|
20
|
+
/** Overall gate outcome. Presumably `true` exactly when `failures` is empty — TODO confirm. */
passed: boolean;
|
|
21
|
+
/** Threshold set carried with the result. */
thresholds: EvalGateThresholds;
|
|
22
|
+
/** List of gate failures. */
failures: EvalGateFailure[];
|
|
23
|
+
}
|
|
24
|
+
/** Default threshold set. The concrete values live in the implementation and are not visible in this declaration. */
export declare const DEFAULT_EVAL_GATE_THRESHOLDS: EvalGateThresholds;
|
|
25
|
+
/**
 * Produces a complete `EvalGateThresholds` from any number of partial
 * (or `undefined`) threshold sources.
 *
 * NOTE(review): precedence between sources and the fallback for missing
 * fields are not visible here — presumably later sources win and
 * `DEFAULT_EVAL_GATE_THRESHOLDS` fills gaps; confirm in the implementation.
 */
export declare const resolveGateThresholds: (...sources: Array<Partial<EvalGateThresholds> | undefined>) => EvalGateThresholds;
|
|
26
|
+
/**
 * Evaluates gates for a set of metrics and returns an `EvalGateResult`.
 *
 * Takes a single `params` object with the metrics, the thresholds to apply,
 * and an optional regression comparison.
 */
export declare const evaluateGates: (params: {
|
|
27
|
+
/** Metrics to evaluate. */
metrics: EvalMetrics;
|
|
28
|
+
/** Thresholds to evaluate against. */
thresholds: EvalGateThresholds;
|
|
29
|
+
/** Optional regression comparison. Presumably enables baseline-relative gates — TODO confirm. */
comparison?: EvalRegressionComparison;
|
|
30
|
+
}) => EvalGateResult;
|
|
31
|
+
//# sourceMappingURL=GateEvaluator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"GateEvaluator.d.ts","sourceRoot":"","sources":["../../src/eval/GateEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,KAAK,EAAE,wBAAwB,EAAE,MAAM,2BAA2B,CAAC;AAE1E,MAAM,WAAW,kBAAkB;IACjC,oBAAoB,EAAE,MAAM,CAAC;IAC7B,0BAA0B,EAAE,MAAM,CAAC;IACnC,sBAAsB,EAAE,MAAM,CAAC;IAC/B,wBAAwB,EAAE,MAAM,CAAC;CAClC;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,KAAK,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CACvB;AAED,MAAM,WAAW,cAAc;IAC7B,cAAc,EAAE,CAAC,CAAC;IAClB,MAAM,EAAE,OAAO,CAAC;IAChB,UAAU,EAAE,kBAAkB,CAAC;IAC/B,QAAQ,EAAE,eAAe,EAAE,CAAC;CAC7B;AAED,eAAO,MAAM,4BAA4B,EAAE,kBAK1C,CAAC;AAiBF,eAAO,MAAM,qBAAqB,GAChC,GAAG,SAAS,KAAK,CAAC,OAAO,CAAC,kBAAkB,CAAC,GAAG,SAAS,CAAC,KACzD,kBAuCF,CAAC;AAgBF,eAAO,MAAM,aAAa,GAAI,QAAQ;IACpC,OAAO,EAAE,WAAW,CAAC;IACrB,UAAU,EAAE,kBAAkB,CAAC;IAC/B,UAAU,CAAC,EAAE,wBAAwB,CAAC;CACvC,KAAG,cAmFH,CAAC"}
|