@mcoda/mswarm 0.1.57 → 0.1.60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. package/README.md +19 -0
  2. package/dist/codali-executor.d.ts +266 -0
  3. package/dist/codali-executor.d.ts.map +1 -0
  4. package/dist/codali-executor.js +227 -0
  5. package/dist/codali-executor.js.map +1 -0
  6. package/dist/runtime.d.ts +36 -1
  7. package/dist/runtime.d.ts.map +1 -1
  8. package/dist/runtime.js +219 -30
  9. package/dist/runtime.js.map +1 -1
  10. package/dist/server.d.ts.map +1 -1
  11. package/dist/server.js +54 -0
  12. package/dist/server.js.map +1 -1
  13. package/dist/vendor/codali/agents/AgentProtocol.d.ts +287 -0
  14. package/dist/vendor/codali/agents/AgentProtocol.d.ts.map +1 -0
  15. package/dist/vendor/codali/agents/AgentProtocol.js +365 -0
  16. package/dist/vendor/codali/agents/AgentResolver.d.ts +23 -0
  17. package/dist/vendor/codali/agents/AgentResolver.d.ts.map +1 -0
  18. package/dist/vendor/codali/agents/AgentResolver.js +77 -0
  19. package/dist/vendor/codali/agents/PhaseAgentSelector.d.ts +23 -0
  20. package/dist/vendor/codali/agents/PhaseAgentSelector.d.ts.map +1 -0
  21. package/dist/vendor/codali/agents/PhaseAgentSelector.js +287 -0
  22. package/dist/vendor/codali/cli/EvalCommand.d.ts +37 -0
  23. package/dist/vendor/codali/cli/EvalCommand.d.ts.map +1 -0
  24. package/dist/vendor/codali/cli/EvalCommand.js +333 -0
  25. package/dist/vendor/codali/cli/FeedbackCommand.d.ts +22 -0
  26. package/dist/vendor/codali/cli/FeedbackCommand.d.ts.map +1 -0
  27. package/dist/vendor/codali/cli/FeedbackCommand.js +163 -0
  28. package/dist/vendor/codali/cli/RunCommand.d.ts +78 -0
  29. package/dist/vendor/codali/cli/RunCommand.d.ts.map +1 -0
  30. package/dist/vendor/codali/cli/RunCommand.js +2261 -0
  31. package/dist/vendor/codali/cli.d.ts +3 -0
  32. package/dist/vendor/codali/cli.d.ts.map +1 -0
  33. package/dist/vendor/codali/cli.js +109 -0
  34. package/dist/vendor/codali/cognitive/ArchitectPlanner.d.ts +107 -0
  35. package/dist/vendor/codali/cognitive/ArchitectPlanner.d.ts.map +1 -0
  36. package/dist/vendor/codali/cognitive/ArchitectPlanner.js +1726 -0
  37. package/dist/vendor/codali/cognitive/BuilderOutputParser.d.ts +25 -0
  38. package/dist/vendor/codali/cognitive/BuilderOutputParser.d.ts.map +1 -0
  39. package/dist/vendor/codali/cognitive/BuilderOutputParser.js +164 -0
  40. package/dist/vendor/codali/cognitive/BuilderRunner.d.ts +76 -0
  41. package/dist/vendor/codali/cognitive/BuilderRunner.d.ts.map +1 -0
  42. package/dist/vendor/codali/cognitive/BuilderRunner.js +1159 -0
  43. package/dist/vendor/codali/cognitive/ContextAssembler.d.ts +91 -0
  44. package/dist/vendor/codali/cognitive/ContextAssembler.d.ts.map +1 -0
  45. package/dist/vendor/codali/cognitive/ContextAssembler.js +4547 -0
  46. package/dist/vendor/codali/cognitive/ContextBudget.d.ts +19 -0
  47. package/dist/vendor/codali/cognitive/ContextBudget.d.ts.map +1 -0
  48. package/dist/vendor/codali/cognitive/ContextBudget.js +35 -0
  49. package/dist/vendor/codali/cognitive/ContextFileLoader.d.ts +30 -0
  50. package/dist/vendor/codali/cognitive/ContextFileLoader.d.ts.map +1 -0
  51. package/dist/vendor/codali/cognitive/ContextFileLoader.js +307 -0
  52. package/dist/vendor/codali/cognitive/ContextManager.d.ts +47 -0
  53. package/dist/vendor/codali/cognitive/ContextManager.d.ts.map +1 -0
  54. package/dist/vendor/codali/cognitive/ContextManager.js +272 -0
  55. package/dist/vendor/codali/cognitive/ContextRedactor.d.ts +18 -0
  56. package/dist/vendor/codali/cognitive/ContextRedactor.d.ts.map +1 -0
  57. package/dist/vendor/codali/cognitive/ContextRedactor.js +53 -0
  58. package/dist/vendor/codali/cognitive/ContextSelector.d.ts +22 -0
  59. package/dist/vendor/codali/cognitive/ContextSelector.d.ts.map +1 -0
  60. package/dist/vendor/codali/cognitive/ContextSelector.js +431 -0
  61. package/dist/vendor/codali/cognitive/ContextSerializer.d.ts +8 -0
  62. package/dist/vendor/codali/cognitive/ContextSerializer.d.ts.map +1 -0
  63. package/dist/vendor/codali/cognitive/ContextSerializer.js +882 -0
  64. package/dist/vendor/codali/cognitive/ContextStore.d.ts +27 -0
  65. package/dist/vendor/codali/cognitive/ContextStore.d.ts.map +1 -0
  66. package/dist/vendor/codali/cognitive/ContextStore.js +79 -0
  67. package/dist/vendor/codali/cognitive/ContextSummarizer.d.ts +16 -0
  68. package/dist/vendor/codali/cognitive/ContextSummarizer.d.ts.map +1 -0
  69. package/dist/vendor/codali/cognitive/ContextSummarizer.js +45 -0
  70. package/dist/vendor/codali/cognitive/CostEstimator.d.ts +31 -0
  71. package/dist/vendor/codali/cognitive/CostEstimator.d.ts.map +1 -0
  72. package/dist/vendor/codali/cognitive/CostEstimator.js +66 -0
  73. package/dist/vendor/codali/cognitive/CriticEvaluator.d.ts +32 -0
  74. package/dist/vendor/codali/cognitive/CriticEvaluator.d.ts.map +1 -0
  75. package/dist/vendor/codali/cognitive/CriticEvaluator.js +297 -0
  76. package/dist/vendor/codali/cognitive/EvidenceGate.d.ts +9 -0
  77. package/dist/vendor/codali/cognitive/EvidenceGate.d.ts.map +1 -0
  78. package/dist/vendor/codali/cognitive/EvidenceGate.js +75 -0
  79. package/dist/vendor/codali/cognitive/GoldenExampleIndexer.d.ts +12 -0
  80. package/dist/vendor/codali/cognitive/GoldenExampleIndexer.d.ts.map +1 -0
  81. package/dist/vendor/codali/cognitive/GoldenExampleIndexer.js +34 -0
  82. package/dist/vendor/codali/cognitive/GoldenSetStore.d.ts +33 -0
  83. package/dist/vendor/codali/cognitive/GoldenSetStore.d.ts.map +1 -0
  84. package/dist/vendor/codali/cognitive/GoldenSetStore.js +159 -0
  85. package/dist/vendor/codali/cognitive/IntentSignals.d.ts +7 -0
  86. package/dist/vendor/codali/cognitive/IntentSignals.d.ts.map +1 -0
  87. package/dist/vendor/codali/cognitive/IntentSignals.js +285 -0
  88. package/dist/vendor/codali/cognitive/LearningGovernance.d.ts +100 -0
  89. package/dist/vendor/codali/cognitive/LearningGovernance.d.ts.map +1 -0
  90. package/dist/vendor/codali/cognitive/LearningGovernance.js +276 -0
  91. package/dist/vendor/codali/cognitive/MemoryWriteback.d.ts +64 -0
  92. package/dist/vendor/codali/cognitive/MemoryWriteback.d.ts.map +1 -0
  93. package/dist/vendor/codali/cognitive/MemoryWriteback.js +287 -0
  94. package/dist/vendor/codali/cognitive/PatchApplier.d.ts +49 -0
  95. package/dist/vendor/codali/cognitive/PatchApplier.d.ts.map +1 -0
  96. package/dist/vendor/codali/cognitive/PatchApplier.js +199 -0
  97. package/dist/vendor/codali/cognitive/PatchInterpreter.d.ts +35 -0
  98. package/dist/vendor/codali/cognitive/PatchInterpreter.d.ts.map +1 -0
  99. package/dist/vendor/codali/cognitive/PatchInterpreter.js +100 -0
  100. package/dist/vendor/codali/cognitive/PatchOutputNormalizer.d.ts +7 -0
  101. package/dist/vendor/codali/cognitive/PatchOutputNormalizer.d.ts.map +1 -0
  102. package/dist/vendor/codali/cognitive/PatchOutputNormalizer.js +59 -0
  103. package/dist/vendor/codali/cognitive/PostMortemAnalyzer.d.ts +17 -0
  104. package/dist/vendor/codali/cognitive/PostMortemAnalyzer.d.ts.map +1 -0
  105. package/dist/vendor/codali/cognitive/PostMortemAnalyzer.js +131 -0
  106. package/dist/vendor/codali/cognitive/PreferenceExtraction.d.ts +3 -0
  107. package/dist/vendor/codali/cognitive/PreferenceExtraction.d.ts.map +1 -0
  108. package/dist/vendor/codali/cognitive/PreferenceExtraction.js +85 -0
  109. package/dist/vendor/codali/cognitive/Prompts.d.ts +15 -0
  110. package/dist/vendor/codali/cognitive/Prompts.d.ts.map +1 -0
  111. package/dist/vendor/codali/cognitive/Prompts.js +326 -0
  112. package/dist/vendor/codali/cognitive/ProviderRouting.d.ts +16 -0
  113. package/dist/vendor/codali/cognitive/ProviderRouting.d.ts.map +1 -0
  114. package/dist/vendor/codali/cognitive/ProviderRouting.js +24 -0
  115. package/dist/vendor/codali/cognitive/QueryExtraction.d.ts +12 -0
  116. package/dist/vendor/codali/cognitive/QueryExtraction.d.ts.map +1 -0
  117. package/dist/vendor/codali/cognitive/QueryExtraction.js +262 -0
  118. package/dist/vendor/codali/cognitive/RunHistoryIndexer.d.ts +13 -0
  119. package/dist/vendor/codali/cognitive/RunHistoryIndexer.d.ts.map +1 -0
  120. package/dist/vendor/codali/cognitive/RunHistoryIndexer.js +125 -0
  121. package/dist/vendor/codali/cognitive/SmartPipeline.d.ts +92 -0
  122. package/dist/vendor/codali/cognitive/SmartPipeline.d.ts.map +1 -0
  123. package/dist/vendor/codali/cognitive/SmartPipeline.js +4804 -0
  124. package/dist/vendor/codali/cognitive/Types.d.ts +474 -0
  125. package/dist/vendor/codali/cognitive/Types.d.ts.map +1 -0
  126. package/dist/vendor/codali/cognitive/Types.js +7 -0
  127. package/dist/vendor/codali/cognitive/ValidationRunner.d.ts +57 -0
  128. package/dist/vendor/codali/cognitive/ValidationRunner.d.ts.map +1 -0
  129. package/dist/vendor/codali/cognitive/ValidationRunner.js +515 -0
  130. package/dist/vendor/codali/config/Config.d.ts +249 -0
  131. package/dist/vendor/codali/config/Config.d.ts.map +1 -0
  132. package/dist/vendor/codali/config/Config.js +200 -0
  133. package/dist/vendor/codali/config/ConfigLoader.d.ts +56 -0
  134. package/dist/vendor/codali/config/ConfigLoader.d.ts.map +1 -0
  135. package/dist/vendor/codali/config/ConfigLoader.js +1246 -0
  136. package/dist/vendor/codali/docdex/DocdexClient.d.ts +113 -0
  137. package/dist/vendor/codali/docdex/DocdexClient.d.ts.map +1 -0
  138. package/dist/vendor/codali/docdex/DocdexClient.js +524 -0
  139. package/dist/vendor/codali/eval/EvalRunner.d.ts +35 -0
  140. package/dist/vendor/codali/eval/EvalRunner.d.ts.map +1 -0
  141. package/dist/vendor/codali/eval/EvalRunner.js +38 -0
  142. package/dist/vendor/codali/eval/EvalTaskExecutor.d.ts +81 -0
  143. package/dist/vendor/codali/eval/EvalTaskExecutor.d.ts.map +1 -0
  144. package/dist/vendor/codali/eval/EvalTaskExecutor.js +371 -0
  145. package/dist/vendor/codali/eval/GateEvaluator.d.ts +31 -0
  146. package/dist/vendor/codali/eval/GateEvaluator.d.ts.map +1 -0
  147. package/dist/vendor/codali/eval/GateEvaluator.js +134 -0
  148. package/dist/vendor/codali/eval/MetricTypes.d.ts +28 -0
  149. package/dist/vendor/codali/eval/MetricTypes.d.ts.map +1 -0
  150. package/dist/vendor/codali/eval/MetricTypes.js +1 -0
  151. package/dist/vendor/codali/eval/MetricsAggregator.d.ts +4 -0
  152. package/dist/vendor/codali/eval/MetricsAggregator.d.ts.map +1 -0
  153. package/dist/vendor/codali/eval/MetricsAggregator.js +97 -0
  154. package/dist/vendor/codali/eval/RegressionComparator.d.ts +29 -0
  155. package/dist/vendor/codali/eval/RegressionComparator.d.ts.map +1 -0
  156. package/dist/vendor/codali/eval/RegressionComparator.js +155 -0
  157. package/dist/vendor/codali/eval/ReportInputAdapter.d.ts +52 -0
  158. package/dist/vendor/codali/eval/ReportInputAdapter.d.ts.map +1 -0
  159. package/dist/vendor/codali/eval/ReportInputAdapter.js +229 -0
  160. package/dist/vendor/codali/eval/ReportSerializer.d.ts +32 -0
  161. package/dist/vendor/codali/eval/ReportSerializer.d.ts.map +1 -0
  162. package/dist/vendor/codali/eval/ReportSerializer.js +33 -0
  163. package/dist/vendor/codali/eval/ReportStore.d.ts +18 -0
  164. package/dist/vendor/codali/eval/ReportStore.d.ts.map +1 -0
  165. package/dist/vendor/codali/eval/ReportStore.js +96 -0
  166. package/dist/vendor/codali/eval/SuiteLoader.d.ts +12 -0
  167. package/dist/vendor/codali/eval/SuiteLoader.d.ts.map +1 -0
  168. package/dist/vendor/codali/eval/SuiteLoader.js +51 -0
  169. package/dist/vendor/codali/eval/SuiteSchema.d.ts +56 -0
  170. package/dist/vendor/codali/eval/SuiteSchema.d.ts.map +1 -0
  171. package/dist/vendor/codali/eval/SuiteSchema.js +357 -0
  172. package/dist/vendor/codali/index.d.ts +11 -0
  173. package/dist/vendor/codali/index.d.ts.map +1 -0
  174. package/dist/vendor/codali/index.js +5 -0
  175. package/dist/vendor/codali/providers/CodexCliProvider.d.ts +8 -0
  176. package/dist/vendor/codali/providers/CodexCliProvider.d.ts.map +1 -0
  177. package/dist/vendor/codali/providers/CodexCliProvider.js +282 -0
  178. package/dist/vendor/codali/providers/OllamaRemoteProvider.d.ts +8 -0
  179. package/dist/vendor/codali/providers/OllamaRemoteProvider.d.ts.map +1 -0
  180. package/dist/vendor/codali/providers/OllamaRemoteProvider.js +300 -0
  181. package/dist/vendor/codali/providers/OpenAiCompatibleProvider.d.ts +8 -0
  182. package/dist/vendor/codali/providers/OpenAiCompatibleProvider.d.ts.map +1 -0
  183. package/dist/vendor/codali/providers/OpenAiCompatibleProvider.js +192 -0
  184. package/dist/vendor/codali/providers/ProviderRegistry.d.ts +12 -0
  185. package/dist/vendor/codali/providers/ProviderRegistry.d.ts.map +1 -0
  186. package/dist/vendor/codali/providers/ProviderRegistry.js +28 -0
  187. package/dist/vendor/codali/providers/ProviderTypes.d.ts +81 -0
  188. package/dist/vendor/codali/providers/ProviderTypes.d.ts.map +1 -0
  189. package/dist/vendor/codali/providers/ProviderTypes.js +1 -0
  190. package/dist/vendor/codali/runtime/CodaliRuntime.d.ts +183 -0
  191. package/dist/vendor/codali/runtime/CodaliRuntime.d.ts.map +1 -0
  192. package/dist/vendor/codali/runtime/CodaliRuntime.js +1363 -0
  193. package/dist/vendor/codali/runtime/DeepInvestigationErrors.d.ts +39 -0
  194. package/dist/vendor/codali/runtime/DeepInvestigationErrors.d.ts.map +1 -0
  195. package/dist/vendor/codali/runtime/DeepInvestigationErrors.js +57 -0
  196. package/dist/vendor/codali/runtime/RunContext.d.ts +27 -0
  197. package/dist/vendor/codali/runtime/RunContext.d.ts.map +1 -0
  198. package/dist/vendor/codali/runtime/RunContext.js +51 -0
  199. package/dist/vendor/codali/runtime/RunLogQuery.d.ts +48 -0
  200. package/dist/vendor/codali/runtime/RunLogQuery.d.ts.map +1 -0
  201. package/dist/vendor/codali/runtime/RunLogQuery.js +36 -0
  202. package/dist/vendor/codali/runtime/RunLogReader.d.ts +19 -0
  203. package/dist/vendor/codali/runtime/RunLogReader.d.ts.map +1 -0
  204. package/dist/vendor/codali/runtime/RunLogReader.js +361 -0
  205. package/dist/vendor/codali/runtime/RunLogger.d.ts +71 -0
  206. package/dist/vendor/codali/runtime/RunLogger.d.ts.map +1 -0
  207. package/dist/vendor/codali/runtime/RunLogger.js +100 -0
  208. package/dist/vendor/codali/runtime/RunTelemetryTypes.d.ts +117 -0
  209. package/dist/vendor/codali/runtime/RunTelemetryTypes.d.ts.map +1 -0
  210. package/dist/vendor/codali/runtime/RunTelemetryTypes.js +299 -0
  211. package/dist/vendor/codali/runtime/Runner.d.ts +66 -0
  212. package/dist/vendor/codali/runtime/Runner.d.ts.map +1 -0
  213. package/dist/vendor/codali/runtime/Runner.js +215 -0
  214. package/dist/vendor/codali/runtime/StoragePaths.d.ts +3 -0
  215. package/dist/vendor/codali/runtime/StoragePaths.d.ts.map +1 -0
  216. package/dist/vendor/codali/runtime/StoragePaths.js +19 -0
  217. package/dist/vendor/codali/runtime/WorkspaceLock.d.ts +30 -0
  218. package/dist/vendor/codali/runtime/WorkspaceLock.d.ts.map +1 -0
  219. package/dist/vendor/codali/runtime/WorkspaceLock.js +141 -0
  220. package/dist/vendor/codali/session/InstructionLoader.d.ts +14 -0
  221. package/dist/vendor/codali/session/InstructionLoader.d.ts.map +1 -0
  222. package/dist/vendor/codali/session/InstructionLoader.js +107 -0
  223. package/dist/vendor/codali/session/SessionStore.d.ts +81 -0
  224. package/dist/vendor/codali/session/SessionStore.d.ts.map +1 -0
  225. package/dist/vendor/codali/session/SessionStore.js +244 -0
  226. package/dist/vendor/codali/subagents/SubagentOrchestrator.d.ts +68 -0
  227. package/dist/vendor/codali/subagents/SubagentOrchestrator.d.ts.map +1 -0
  228. package/dist/vendor/codali/subagents/SubagentOrchestrator.js +150 -0
  229. package/dist/vendor/codali/tools/ToolRegistry.d.ts +9 -0
  230. package/dist/vendor/codali/tools/ToolRegistry.d.ts.map +1 -0
  231. package/dist/vendor/codali/tools/ToolRegistry.js +263 -0
  232. package/dist/vendor/codali/tools/ToolTypes.d.ts +66 -0
  233. package/dist/vendor/codali/tools/ToolTypes.d.ts.map +1 -0
  234. package/dist/vendor/codali/tools/ToolTypes.js +32 -0
  235. package/dist/vendor/codali/tools/diff/DiffTool.d.ts +3 -0
  236. package/dist/vendor/codali/tools/diff/DiffTool.d.ts.map +1 -0
  237. package/dist/vendor/codali/tools/diff/DiffTool.js +34 -0
  238. package/dist/vendor/codali/tools/docdex/DocdexTools.d.ts +4 -0
  239. package/dist/vendor/codali/tools/docdex/DocdexTools.d.ts.map +1 -0
  240. package/dist/vendor/codali/tools/docdex/DocdexTools.js +453 -0
  241. package/dist/vendor/codali/tools/filesystem/FileTools.d.ts +3 -0
  242. package/dist/vendor/codali/tools/filesystem/FileTools.d.ts.map +1 -0
  243. package/dist/vendor/codali/tools/filesystem/FileTools.js +141 -0
  244. package/dist/vendor/codali/tools/search/SearchTool.d.ts +3 -0
  245. package/dist/vendor/codali/tools/search/SearchTool.d.ts.map +1 -0
  246. package/dist/vendor/codali/tools/search/SearchTool.js +46 -0
  247. package/dist/vendor/codali/tools/shell/ShellTool.d.ts +3 -0
  248. package/dist/vendor/codali/tools/shell/ShellTool.d.ts.map +1 -0
  249. package/dist/vendor/codali/tools/shell/ShellTool.js +104 -0
  250. package/package.json +5 -3
@@ -0,0 +1,38 @@
1
/**
 * Runs every task of an evaluation suite through an executor and assembles
 * the suite-level report.
 */
export class EvalRunner {
    /**
     * @param {object} params
     * @param {*} params.suite_id - Echoed back as `suite_id` in the report.
     * @param {*} params.suite_fingerprint - Echoed back as `suite_fingerprint`.
     * @param {Iterable<*>} params.tasks - Tasks to execute, in order.
     * @param {{ executeTask: Function }} params.executor - Object whose
     *   `executeTask(task)` result is awaited for each task.
     */
    constructor(params) {
        const {
            suite_id: suiteId,
            suite_fingerprint: suiteFingerprint,
            tasks,
            executor,
        } = params;
        this.suiteId = suiteId;
        this.suiteFingerprint = suiteFingerprint;
        this.tasks = tasks;
        this.executor = executor;
    }

    /**
     * Executes the tasks one after another and returns the report object.
     *
     * @returns {Promise<object>} Report with timing fields, the per-task
     *   results in execution order, and pass/fail/error counts.
     */
    async run() {
        const startedAtMs = Date.now();
        const taskResults = [];
        // Tasks run strictly one at a time so that result ordering is
        // deterministic and reports stay comparable across repeated runs.
        for (const task of this.tasks) {
            // eslint-disable-next-line no-await-in-loop
            const outcome = await this.executor.executeTask(task);
            taskResults.push(outcome);
        }
        const endedAtMs = Date.now();

        let passed = 0;
        let executionErrors = 0;
        for (const outcome of taskResults) {
            if (outcome.execution_error) {
                executionErrors += 1;
            }
            if (outcome.task_passed) {
                passed += 1;
            }
        }
        const total = taskResults.length;

        return {
            schema_version: 1,
            suite_id: this.suiteId,
            suite_fingerprint: this.suiteFingerprint,
            started_at: new Date(startedAtMs).toISOString(),
            ended_at: new Date(endedAtMs).toISOString(),
            duration_ms: endedAtMs - startedAtMs,
            task_results: taskResults,
            summary: {
                total,
                passed,
                failed: total - passed,
                execution_errors: executionErrors,
            },
        };
    }
}
@@ -0,0 +1,81 @@
1
+ import type { VerificationOutcome } from "../cognitive/Types.js";
2
+ import type { SafetyTelemetryEventData } from "../runtime/RunLogger.js";
3
+ import { type NormalizedRunRecord } from "./ReportInputAdapter.js";
4
+ import type { EvalTaskDefinition } from "./SuiteSchema.js";
5
/**
 * Metadata describing one CLI run.
 *
 * The compiled executor builds this shape from the JSON payload of the last
 * `CODALI_RUN_META ` line found on the child's stderr. There, string fields
 * whose payload value is missing or not a string are left undefined,
 * `fingerprint` and `workflow` fall back to null, and `touchedFiles` keeps
 * only the string entries of the payload array (empty when absent).
 */
+ export interface EvalRunMeta {
6
+ runId?: string;
7
+ fingerprint?: string | null;
8
+ logPath?: string;
9
+ outputLogPath?: string;
10
+ touchedFiles: string[];
11
+ command?: string;
12
+ commandRunId?: string;
13
+ jobId?: string;
14
+ project?: string;
15
+ taskId?: string;
16
+ taskKey?: string;
17
+ agentId?: string;
18
+ agentSlug?: string;
19
+ workflow?: Record<string, unknown> | null;
20
+ }
21
/**
 * Outcome of a single assertion check on a task run.
 *
 * `code` identifies the check, `passed` is its verdict, and `expected` /
 * `actual` carry the compared values. In the compiled `buildAssertions`
 * helper the `message` text is fixed per check and does not change with the
 * verdict, so consumers should branch on `passed`, not on `message`.
 */
+ export interface EvalAssertionResult {
22
+ code: string;
23
+ passed: boolean;
24
+ message: string;
25
+ expected?: unknown;
26
+ actual?: unknown;
27
+ }
28
/**
 * Result record for one executed eval task; this is the resolved value of
 * `EvalTaskExecutor.executeTask`.
 *
 * The suite runner counts `task_passed` (truthy) as a pass and a truthy
 * `execution_error` as an execution error when it builds the suite summary.
 *
 * NOTE(review): the nullable metric fields presumably use null for "could not
 * be determined from this run" — confirm against EvalTaskExecutor.ts.
 */
+ export interface EvalTaskExecution {
29
+ task_id: string;
30
+ title: string;
31
+ command: string;
32
+ mode: "success" | "failure";
33
+ started_at: string;
34
+ ended_at: string;
35
+ duration_ms: number;
36
+ exit_code: number | null;
37
+ run_succeeded: boolean;
38
+ task_passed: boolean;
39
+ first_pass: boolean | null;
40
+ patch_apply_success: boolean | null;
41
+ verification_outcome: VerificationOutcome | null;
42
+ verification_passed: boolean | null;
43
+ hallucination_detected: boolean | null;
44
+ scope_violation_detected: boolean | null;
45
+ latency_ms: number | null;
46
+ tokens_used: number | null;
47
+ cost_usd: number | null;
48
+ assertion_results: EvalAssertionResult[];
49
+ stdout: string;
50
+ stderr: string;
51
+ command_line: string[];
52
+ run_meta?: EvalRunMeta;
53
+ run_summary?: Record<string, unknown>;
54
+ normalized_run?: NormalizedRunRecord;
55
+ safety_events: SafetyTelemetryEventData[];
56
+ execution_error?: string;
57
+ }
58
/**
 * Constructor options for `EvalTaskExecutor`.
 *
 * As used by the compiled `executeTask`:
 * - `workspace_root` is passed as `--workspace-root` and is the child's cwd.
 * - `suite_dir` and `workspace_root` are handed to `resolveTaskFilePath`.
 * - `cli_entry` overrides the CLI script path; when unset the executor falls
 *   back to `process.argv[1]`, then to `cli.js` two directories above the
 *   executor module.
 * - `provider`, `model`, `api_key`, `base_url`, `agent`, `agent_id` and
 *   `agent_slug` are forwarded as `--provider`, `--model`, `--api-key`,
 *   `--base-url`, `--agent`, `--agent-id` and `--agent-slug` when truthy.
 * - `workflow_profile` is forwarded as `--profile`.
 * - `smart` adds `--smart` only when it is exactly `true`.
 * - `no_deep_investigation` adds `--no-deep-investigation` when truthy.
 *
 * NOTE(review): `timeout_ms`, `extra_env` and `log_dir` are presumably applied
 * to the spawned process and log lookup — confirm in EvalTaskExecutor.ts.
 */
+ export interface EvalTaskExecutorOptions {
59
+ workspace_root: string;
60
+ suite_dir: string;
61
+ cli_entry?: string;
62
+ provider?: string;
63
+ model?: string;
64
+ api_key?: string;
65
+ base_url?: string;
66
+ agent?: string;
67
+ agent_id?: string;
68
+ agent_slug?: string;
69
+ workflow_profile?: string;
70
+ smart?: boolean;
71
+ no_deep_investigation?: boolean;
72
+ timeout_ms?: number;
73
+ extra_env?: NodeJS.ProcessEnv;
74
+ log_dir?: string;
75
+ }
76
/**
 * Executes one eval task at a time and reports the outcome.
 *
 * The compiled implementation builds a CLI argument list from the task and
 * the stored options, then runs it with `spawnSync` under `process.execPath`.
 */
+ export declare class EvalTaskExecutor {
77
+ private readonly options;
78
+ constructor(options: EvalTaskExecutorOptions);
/** Runs `task` and resolves with its execution record. */
79
+ executeTask(task: EvalTaskDefinition): Promise<EvalTaskExecution>;
80
+ }
81
+ //# sourceMappingURL=EvalTaskExecutor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"EvalTaskExecutor.d.ts","sourceRoot":"","sources":["../../src/eval/EvalTaskExecutor.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,uBAAuB,CAAC;AAGjE,OAAO,KAAK,EAAE,wBAAwB,EAAE,MAAM,yBAAyB,CAAC;AACxE,OAAO,EAEL,KAAK,mBAAmB,EACzB,MAAM,yBAAyB,CAAC;AACjC,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AAG3D,MAAM,WAAW,WAAW;IAC1B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,IAAI,CAAC;CAC3C;AAED,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,OAAO,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AAED,MAAM,WAAW,iBAAiB;IAChC,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,EAAE,SAAS,GAAG,SAAS,CAAC;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,aAAa,EAAE,OAAO,CAAC;IACvB,WAAW,EAAE,OAAO,CAAC;IACrB,UAAU,EAAE,OAAO,GAAG,IAAI,CAAC;IAC3B,mBAAmB,EAAE,OAAO,GAAG,IAAI,CAAC;IACpC,oBAAoB,EAAE,mBAAmB,GAAG,IAAI,CAAC;IACjD,mBAAmB,EAAE,OAAO,GAAG,IAAI,CAAC;IACpC,sBAAsB,EAAE,OAAO,GAAG,IAAI,CAAC;IACvC,wBAAwB,EAAE,OAAO,GAAG,IAAI,CAAC;IACzC,UAAU,EAAE,MAAM,GAAG,IAAI,CAAC;IAC1B,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,iBAAiB,EAAE,mBAAmB,EAAE,CAAC;IACzC,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,QAAQ,CAAC,EAAE,WAAW,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACtC,cAAc,CAAC,EAAE,mBAAmB,CAAC;IACrC,aAAa,EAAE,wBAAwB,EAAE,CAAC;IAC1C,eAAe,CAAC,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,uBAAuB;IACtC,cAAc,EAAE,MAAM,CAAC;IACv
B,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,qBAAqB,CAAC,EAAE,OAAO,CAAC;IAChC,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC;IAC9B,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AA+ND,qBAAa,gBAAgB;IAC3B,OAAO,CAAC,QAAQ,CAAC,OAAO,CAA0B;gBAEtC,OAAO,EAAE,uBAAuB;IAItC,WAAW,CAAC,IAAI,EAAE,kBAAkB,GAAG,OAAO,CAAC,iBAAiB,CAAC;CAmKxE"}
@@ -0,0 +1,371 @@
1
+ import { spawnSync } from "node:child_process";
2
+ import { readFile } from "node:fs/promises";
3
+ import path from "node:path";
4
+ import process from "node:process";
5
+ import { fileURLToPath } from "node:url";
6
+ import { DEFAULT_LOG_DIR } from "../config/Config.js";
7
+ import { RunLogReader } from "../runtime/RunLogReader.js";
8
+ import { adaptRunSummaryForReport, } from "./ReportInputAdapter.js";
9
+ import { resolveTaskFilePath } from "./SuiteLoader.js";
10
/**
 * Returns `value` when it is a finite number, otherwise `null`.
 * `Number.isFinite` never coerces, so non-number inputs are rejected too.
 */
const asNumber = (value) => {
    if (Number.isFinite(value)) {
        return value;
    }
    return null;
};
11
/**
 * Returns `value` unchanged when it is a non-null, non-array object;
 * otherwise returns `undefined`.
 */
const asRecord = (value) => {
    const isRecord = Boolean(value)
        && typeof value === "object"
        && !Array.isArray(value);
    return isRecord ? value : undefined;
};
16
/**
 * Extracts run metadata from the last `CODALI_RUN_META <json>` line in stderr.
 *
 * Lines are trimmed and blank lines ignored before searching. Only the last
 * marker line is considered: if its payload is unusable the function returns
 * `undefined` and does not fall back to an earlier marker line.
 *
 * @param {string} stderr - Captured stderr text of the CLI run.
 * @returns {object|undefined} Metadata object, or `undefined` when `stderr`
 *   is not a string, no marker line exists, the payload is not valid JSON,
 *   or the payload is not a JSON object.
 */
const parseRunMeta = (stderr) => {
    if (typeof stderr !== "string") {
        return undefined;
    }
    const marker = "CODALI_RUN_META ";
    const lines = stderr.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
    let payload;
    for (let index = lines.length - 1; index >= 0; index -= 1) {
        if (lines[index].startsWith(marker)) {
            payload = lines[index].slice(marker.length);
            break;
        }
    }
    if (payload === undefined) {
        return undefined;
    }
    let decoded;
    try {
        decoded = JSON.parse(payload);
    }
    catch {
        // A malformed payload is treated the same as missing metadata.
        return undefined;
    }
    // Valid JSON that is not an object (null, numbers, strings, arrays) carries
    // no metadata; reject it instead of returning an object of blank fields.
    const meta = asRecord(decoded);
    if (!meta) {
        return undefined;
    }
    const stringOrUndefined = (value) => (typeof value === "string" ? value : undefined);
    const touchedFiles = Array.isArray(meta.touchedFiles)
        ? meta.touchedFiles.filter((entry) => typeof entry === "string")
        : [];
    return {
        runId: stringOrUndefined(meta.runId),
        fingerprint: stringOrUndefined(meta.fingerprint) ?? null,
        logPath: stringOrUndefined(meta.logPath),
        outputLogPath: stringOrUndefined(meta.outputLogPath),
        touchedFiles,
        command: stringOrUndefined(meta.command),
        commandRunId: stringOrUndefined(meta.commandRunId),
        jobId: stringOrUndefined(meta.jobId),
        project: stringOrUndefined(meta.project),
        taskId: stringOrUndefined(meta.taskId),
        taskKey: stringOrUndefined(meta.taskKey),
        agentId: stringOrUndefined(meta.agentId),
        agentSlug: stringOrUndefined(meta.agentSlug),
        workflow: asRecord(meta.workflow) ?? null,
    };
};
52
/**
 * Reads a newline-delimited JSON run log and collects the run summary and
 * failure reasons from it.
 *
 * Each non-empty line is parsed on its own. Lines that are not valid JSON, or
 * whose JSON value is not an object (including the literal `null`), are
 * skipped. For `run_summary` entries the `data` object of the last one wins.
 * For `run_failed` entries every non-blank string in `data.reasons` is
 * collected (trimmed), followed by the trimmed `data.stage` when non-empty.
 *
 * @param {string|undefined} logPath - Path of the log file; falsy means "no log".
 * @returns {Promise<{run_summary?: object, run_failed_reasons: string[]}>}
 *   Never rejects for a missing or unreadable file: that case resolves to
 *   `{ run_failed_reasons: [] }`.
 */
const readRunLog = async (logPath) => {
    if (!logPath)
        return { run_failed_reasons: [] };
    let content = "";
    try {
        content = await readFile(logPath, "utf8");
    }
    catch {
        // Best effort: an absent or unreadable log just means no extra signals.
        return { run_failed_reasons: [] };
    }
    let runSummary;
    const runFailedReasons = [];
    const lines = content.split("\n").filter(Boolean);
    for (const line of lines) {
        let entry;
        try {
            entry = asRecord(JSON.parse(line));
        }
        catch {
            continue;
        }
        // JSON.parse can return null or a primitive; reading `.type` from null
        // would throw and abort the whole read, so such lines are skipped.
        if (!entry)
            continue;
        const type = typeof entry.type === "string" ? entry.type : "";
        const data = asRecord(entry.data) ?? {};
        if (type === "run_summary") {
            runSummary = data;
            continue;
        }
        if (type !== "run_failed")
            continue;
        const reasonsValue = data.reasons;
        if (Array.isArray(reasonsValue)) {
            for (const reason of reasonsValue) {
                if (typeof reason === "string" && reason.trim())
                    runFailedReasons.push(reason.trim());
            }
        }
        const stage = typeof data.stage === "string" ? data.stage.trim() : "";
        if (stage)
            runFailedReasons.push(stage);
    }
    return { run_summary: runSummary, run_failed_reasons: runFailedReasons };
};
94
/**
 * Picks the CLI script to launch.
 *
 * Preference order: the explicitly provided path, then the script the current
 * process was started with (`process.argv[1]`), then `cli.js` located two
 * path segments above this module's own file path.
 *
 * @param {string} [provided] - Explicit CLI entry path.
 * @returns {string} Path of the CLI entry script.
 */
const resolveCliEntry = (provided) => {
    const [, launchedScript] = process.argv;
    return provided
        || launchedScript
        || path.resolve(fileURLToPath(import.meta.url), "..", "..", "cli.js");
};
102
/**
 * Reports whether stderr or any failure reason mentions a patch failure.
 *
 * The check is a case-insensitive substring search over stderr and all
 * reasons joined by newlines.
 *
 * @param {string} stderr - Captured stderr text.
 * @param {string[]} reasons - Failure reasons gathered for the run.
 * @returns {boolean} True when any patch-failure marker is present.
 */
const hasPatchFailure = (stderr, reasons) => {
    const markers = [
        "patch_apply_failed",
        "patch_scope_violation",
        "patch_search",
        "patch_rollback",
        "search block",
    ];
    const combined = [stderr, ...reasons].join("\n");
    const lowered = combined.toLowerCase();
    return markers.some((marker) => lowered.includes(marker));
};
110
/**
 * Reports whether stderr or any failure reason contains a hallucination
 * signal: "hallucinat…", "unknown symbol", "unknown file", "non-existent"
 * (with `-`, `_` or a space), "ENOENT", or "no such file or directory".
 * Matching is case-insensitive.
 *
 * @param {string} stderr - Captured stderr text.
 * @param {string[]} reasons - Failure reasons gathered for the run.
 * @returns {boolean} True when any signal is found.
 */
const detectHallucination = (stderr, reasons) => {
    const signalPattern = /\bhallucinat|\bunknown symbol\b|\bunknown file\b|non[-_ ]existent|ENOENT|no such file or directory/i;
    const combined = [stderr, ...reasons].join("\n");
    return signalPattern.test(combined);
};
114
/**
 * Reports whether the run violated its allowed scope.
 *
 * A safety event with code `scope_violation` is sufficient on its own.
 * Otherwise stderr and the failure reasons are searched, case-insensitively,
 * for `scope_violation` as a whole word, `patch_outside_allowed_scope`, or
 * `patch_outside_workspace`.
 *
 * @param {string} stderr - Captured stderr text.
 * @param {string[]} reasons - Failure reasons gathered for the run.
 * @param {Array<{code: string}>} safetyEvents - Safety telemetry events.
 * @returns {boolean} True when a scope violation is indicated.
 */
const detectScopeViolation = (stderr, reasons, safetyEvents) => {
    for (const event of safetyEvents) {
        if (event.code === "scope_violation") {
            return true;
        }
    }
    const violationPattern = /\bscope_violation\b|patch_outside_allowed_scope|patch_outside_workspace/i;
    const combined = [stderr, ...reasons].join("\n");
    return violationPattern.test(combined);
};
120
/**
 * Builds the list of assertion results for one task run.
 *
 * The run-success check is always present. The exit-code, patch-apply,
 * verification, latency and cost checks are added only when the task
 * configures them (verification is also skipped when set to "any"). The
 * hallucination and scope-violation checks are added unless the task
 * explicitly allows them. Result order follows that listing.
 *
 * For checks that compare against an observed value which may be `null`, a
 * `null` observation fails the check and switches its code to the
 * `…_missing` variant.
 *
 * @param {object} params
 * @param {object} params.task - Task whose `assertions` object drives the checks.
 * @param {boolean} params.runSucceeded
 * @param {number|null} params.exitCode
 * @param {boolean|null} params.patchApplySuccess
 * @param {*} params.verificationOutcome
 * @param {number|null} params.latencyMs
 * @param {number|null} params.costUsd
 * @param {boolean|null} params.hallucinationDetected
 * @param {boolean|null} params.scopeViolationDetected
 * @returns {Array<{code: string, passed: boolean, message: string, expected: *, actual: *}>}
 */
const buildAssertions = (params) => {
    const rules = params.task.assertions;
    const results = [];

    // Records a check whose observed value may be null (meaning "not observed").
    const recordObserved = (code, missingCode, message, expected, actual, satisfies) => {
        const observed = actual !== null;
        results.push({
            code: observed ? code : missingCode,
            passed: observed && satisfies(actual, expected),
            message,
            expected,
            actual,
        });
    };
    const isEqual = (actual, expected) => actual === expected;
    const isAtMost = (actual, limit) => actual <= limit;

    results.push({
        code: "assert_expect_success",
        passed: params.runSucceeded === rules.expect_success,
        message: "Run success matched expectation.",
        expected: rules.expect_success,
        actual: params.runSucceeded,
    });

    if (rules.expect_exit_code !== undefined) {
        results.push({
            code: "assert_expect_exit_code",
            passed: params.exitCode === rules.expect_exit_code,
            message: "Exit code matched expectation.",
            expected: rules.expect_exit_code,
            actual: params.exitCode,
        });
    }

    if (rules.expect_patch_apply !== undefined) {
        recordObserved(
            "assert_expect_patch_apply",
            "assert_expect_patch_apply_missing",
            "Patch apply outcome matched expectation.",
            rules.expect_patch_apply,
            params.patchApplySuccess,
            isEqual,
        );
    }

    const wantsVerification = rules.expect_verification !== undefined
        && rules.expect_verification !== "any";
    if (wantsVerification) {
        recordObserved(
            "assert_expect_verification",
            "assert_expect_verification_missing",
            "Verification outcome matched expectation.",
            rules.expect_verification,
            params.verificationOutcome,
            isEqual,
        );
    }

    if (rules.max_latency_ms !== undefined) {
        recordObserved(
            "assert_max_latency",
            "assert_max_latency_missing",
            "Latency stayed within threshold.",
            rules.max_latency_ms,
            params.latencyMs,
            isAtMost,
        );
    }

    if (rules.max_cost_usd !== undefined) {
        recordObserved(
            "assert_max_cost",
            "assert_max_cost_missing",
            "Cost stayed within threshold.",
            rules.max_cost_usd,
            params.costUsd,
            isAtMost,
        );
    }

    if (!rules.allow_hallucination) {
        results.push({
            code: "assert_no_hallucination",
            passed: params.hallucinationDetected !== true,
            message: "No hallucination signals were detected.",
            expected: false,
            actual: params.hallucinationDetected,
        });
    }

    if (!rules.allow_scope_violation) {
        results.push({
            code: "assert_no_scope_violation",
            passed: params.scopeViolationDetected !== true,
            message: "No scope-violation signals were detected.",
            expected: false,
            actual: params.scopeViolationDetected,
        });
    }

    return results;
};
200
/**
 * Runs one eval task by spawning the CLI entry point as a child Node process,
 * then gathers run metadata, log-derived signals and assertion results into a
 * single per-task result record.
 */
export class EvalTaskExecutor {
    /**
     * @param {object} options - Executor configuration. Fields read by this class:
     *   `cli_entry`, `workspace_root`, `suite_dir`, `log_dir`, `timeout_ms`,
     *   `max_buffer_bytes`, `extra_env`, and the CLI pass-through values
     *   `provider`, `model`, `api_key`, `base_url`, `agent`, `agent_id`,
     *   `agent_slug`, `workflow_profile`, `smart`, `no_deep_investigation`.
     */
    constructor(options) {
        this.options = options;
    }

    /**
     * Executes a single task synchronously (the child process is run with
     * `spawnSync`, so this blocks until it exits or times out) and builds the
     * task result.
     *
     * @param {object} task - Task definition. This method reads `id`, `title`,
     *   `command`, `mode`, `args` and `inline_task`; `assertions` is consumed by
     *   `buildAssertions`.
     * @returns {Promise<object>} Result record with timing, exit status, derived
     *   quality signals, assertion results, captured stdout/stderr and the
     *   (secret-redacted) command line.
     */
    async executeTask(task) {
        const startedAtMs = Date.now();
        const startedAt = new Date(startedAtMs).toISOString();
        const { options } = this;
        const cliEntry = resolveCliEntry(options.cli_entry);
        const commandArgs = [task.command, "--workspace-root", options.workspace_root];

        // Flags that take a value; emitted only when the option is truthy,
        // in this exact order.
        const valueFlags = [
            ["--provider", options.provider],
            ["--model", options.model],
            ["--api-key", options.api_key],
            ["--base-url", options.base_url],
            ["--agent", options.agent],
            ["--agent-id", options.agent_id],
            ["--agent-slug", options.agent_slug],
            ["--profile", options.workflow_profile],
        ];
        // Position of the API key value inside commandArgs (-1 when absent), so
        // the secret can be masked in the reported command line.
        let apiKeyValueIndex = -1;
        for (const [flag, value] of valueFlags) {
            if (!value) {
                continue;
            }
            commandArgs.push(flag, value);
            if (flag === "--api-key") {
                apiKeyValueIndex = commandArgs.length - 1;
            }
        }
        if (options.smart === true) {
            commandArgs.push("--smart");
        }
        if (options.no_deep_investigation) {
            commandArgs.push("--no-deep-investigation");
        }
        if (task.args.length > 0) {
            commandArgs.push(...task.args);
        }
        // A task file takes precedence over an inline task description.
        const taskFilePath = resolveTaskFilePath(task, options.suite_dir, options.workspace_root);
        if (taskFilePath) {
            commandArgs.push("--task", taskFilePath);
        }
        else if (task.inline_task) {
            commandArgs.push(task.inline_task);
        }

        // The child receives the real arguments; the copy returned to callers
        // (and typically persisted in reports) must not contain the API key.
        const reportedArgs = commandArgs.map((arg, index) => (index === apiKeyValueIndex ? "[REDACTED]" : arg));
        const commandLine = [process.execPath, cliEntry, ...reportedArgs];

        let stdout = "";
        let stderr = "";
        let exitCode = null;
        let executionError;
        try {
            const result = spawnSync(process.execPath, [cliEntry, ...commandArgs], {
                cwd: options.workspace_root,
                encoding: "utf8",
                env: {
                    ...process.env,
                    ...(options.extra_env ?? {}),
                },
                timeout: options.timeout_ms ?? 20 * 60 * 1000,
                // Node's default of 1 MiB kills the child (ENOBUFS) and truncates
                // output once exceeded, which a long verbose run can easily hit.
                maxBuffer: options.max_buffer_bytes ?? 64 * 1024 * 1024,
            });
            stdout = (result.stdout ?? "").toString();
            stderr = (result.stderr ?? "").toString();
            exitCode = typeof result.status === "number" ? result.status : null;
            if (result.error) {
                executionError = String(result.error);
            }
            // A signal without an exit status takes precedence over result.error.
            if (result.signal && exitCode === null) {
                executionError = `terminated_by_signal:${result.signal}`;
            }
        }
        catch (error) {
            executionError = error instanceof Error ? error.message : String(error);
        }

        // Run metadata is parsed from the child's stderr; log lookups are
        // skipped when no run id was reported.
        const runMeta = parseRunMeta(stderr);
        const runLog = await readRunLog(runMeta?.logPath);
        const reader = new RunLogReader(options.workspace_root, options.log_dir ?? DEFAULT_LOG_DIR);
        const safetyEvents = runMeta?.runId
            ? await reader.getSafetyEvents(runMeta.runId)
            : [];
        const verificationReports = runMeta?.runId
            ? await reader.getVerificationReports(runMeta.runId)
            : [];
        const latestVerification = verificationReports.length
            ? verificationReports[verificationReports.length - 1]
            : undefined;
        const runSummary = runLog.run_summary;
        const normalizedRun = adaptRunSummaryForReport({
            runSummary,
            runId: runMeta?.runId,
            taskId: task.id,
            verificationOutcome: latestVerification?.outcome ?? null,
            touchedFiles: runMeta?.touchedFiles ?? [],
        });

        const runSucceeded = exitCode === 0 && !executionError;

        // First pass: at most one recorded attempt; without an attempt count,
        // fall back to whether the run succeeded.
        const attempts = asNumber(asRecord(runSummary?.smartRuntime)?.attempts);
        const firstPass = attempts !== null ? attempts <= 1 : runSucceeded;

        // true when files were touched, false on a detected patch failure,
        // null when neither signal is present.
        let patchApplySuccess = null;
        if (runMeta?.touchedFiles?.length) {
            patchApplySuccess = true;
        }
        else if (hasPatchFailure(stderr, runLog.run_failed_reasons)) {
            patchApplySuccess = false;
        }

        // Preference order: normalized run, latest verification report, then
        // the raw run summary (only the three known outcome strings).
        const verificationOutcome = (() => {
            if (normalizedRun.verification_outcome) {
                return normalizedRun.verification_outcome;
            }
            if (latestVerification?.outcome) {
                return latestVerification.outcome;
            }
            const outcome = asRecord(runSummary?.verification)?.outcome;
            const isKnownOutcome = outcome === "verified_passed"
                || outcome === "verified_failed"
                || outcome === "unverified_with_reason";
            return isKnownOutcome ? outcome : null;
        })();
        const verificationPassed = verificationOutcome === null ? null : verificationOutcome === "verified_passed";

        const hallucinationDetected = detectHallucination(stderr, runLog.run_failed_reasons);
        const scopeViolationDetected = detectScopeViolation(stderr, runLog.run_failed_reasons, safetyEvents);
        const latencyMs = normalizedRun.duration_ms ?? asNumber(runSummary?.durationMs) ?? (Date.now() - startedAtMs);

        const usage = asRecord(runSummary?.usage);
        const tokensUsed = normalizedRun.usage_tokens_total
            ?? asNumber(usage?.totalTokens)
            ?? (() => {
                // Last resort: sum input and output tokens; a zero total is
                // reported as unknown.
                const total = (asNumber(usage?.inputTokens) ?? 0) + (asNumber(usage?.outputTokens) ?? 0);
                return total > 0 ? total : null;
            })();
        const costUsd = normalizedRun.cost_usd ?? asNumber(runSummary?.actualCost);

        const assertionResults = buildAssertions({
            task,
            runSucceeded,
            exitCode,
            patchApplySuccess,
            verificationOutcome,
            latencyMs,
            costUsd,
            hallucinationDetected,
            scopeViolationDetected,
        });
        const taskPassed = assertionResults.every((assertion) => assertion.passed);
        const endedAtMs = Date.now();
        return {
            task_id: task.id,
            title: task.title,
            command: task.command,
            mode: task.mode,
            started_at: startedAt,
            ended_at: new Date(endedAtMs).toISOString(),
            duration_ms: endedAtMs - startedAtMs,
            exit_code: exitCode,
            run_succeeded: runSucceeded,
            task_passed: taskPassed,
            first_pass: firstPass,
            patch_apply_success: patchApplySuccess,
            verification_outcome: verificationOutcome,
            verification_passed: verificationPassed,
            hallucination_detected: hallucinationDetected,
            scope_violation_detected: scopeViolationDetected,
            latency_ms: latencyMs,
            tokens_used: tokensUsed,
            cost_usd: costUsd,
            assertion_results: assertionResults,
            stdout,
            stderr,
            command_line: commandLine,
            run_meta: runMeta,
            run_summary: runSummary,
            normalized_run: normalizedRun,
            safety_events: safetyEvents,
            execution_error: executionError,
        };
    }
}
@@ -0,0 +1,31 @@
1
+ import type { EvalMetrics } from "./MetricTypes.js";
2
+ import type { EvalRegressionComparison } from "./RegressionComparator.js";
3
/**
 * Numeric limits applied by the eval gates. `resolveGateThresholds` rejects
 * any value that is not a finite number between 0 and 1 (inclusive).
 */
+ export interface EvalGateThresholds {
4
/** Largest allowed drop (baseline minus current) of `m003_patch_apply_success_rate`; only checked when a regression comparison is supplied. */
+ patch_apply_drop_max: number;
5
/** Lowest acceptable `m004_verification_pass_rate`. */
+ verification_pass_rate_min: number;
6
/** Highest acceptable `m005_hallucination_rate`. */
+ hallucination_rate_max: number;
7
/** Highest acceptable `m006_scope_violation_rate`. */
+ scope_violation_rate_max: number;
8
+ }
9
/** One failed gate, as reported in `EvalGateResult.failures`. */
+ export interface EvalGateFailure {
10
/** Machine-readable failure code, e.g. `gate_patch_apply_drop_exceeded` or `gate_verification_rate_missing`. */
+ code: string;
11
/** Key of the metric the gate inspected, e.g. `m004_verification_pass_rate`. */
+ metric: string;
12
/** Human-readable description of the failure. */
+ message: string;
13
/** Threshold value the metric was compared against. */
+ threshold: number;
14
/** Observed metric value; `null` when the metric was unavailable. */
+ actual: number | null;
15
/** Baseline value from the regression comparison; set by the patch-apply drop gate only. */
+ baseline?: number | null;
16
/** Delta reported by the regression comparison; set by the patch-apply drop gate only. */
+ delta?: number | null;
17
+ }
18
/** Outcome of evaluating all gates for one eval run. */
+ export interface EvalGateResult {
19
/** Version tag of this result shape; the evaluator always emits `1`. */
+ schema_version: 1;
20
/** `true` exactly when `failures` is empty. */
+ passed: boolean;
21
/** The thresholds that were applied. */
+ thresholds: EvalGateThresholds;
22
/** Every gate that failed, in evaluation order (patch apply, verification, hallucination, scope violation). */
+ failures: EvalGateFailure[];
23
+ }
24
/**
 * Built-in thresholds used when no override is supplied: patch-apply drop max
 * 0.02, verification pass rate min 0.9, hallucination rate max 0.02,
 * scope-violation rate max 0.
 */
+ export declare const DEFAULT_EVAL_GATE_THRESHOLDS: EvalGateThresholds;
25
/**
 * Layers the given partial threshold sources over the defaults, left to right;
 * a later source's defined field overrides earlier ones and `undefined`
 * sources/fields are ignored. Throws an `Error` if any merged value is not a
 * finite number between 0 and 1.
 */
+ export declare const resolveGateThresholds: (...sources: Array<Partial<EvalGateThresholds> | undefined>) => EvalGateThresholds;
26
/**
 * Checks the metrics against the thresholds and returns the list of failed
 * gates. The verification, hallucination and scope-violation gates also fail
 * when their metric value is unavailable. The patch-apply drop gate runs only
 * when `comparison` has status `compared` and contains non-null baseline and
 * current values for `m003_patch_apply_success_rate`.
 */
+ export declare const evaluateGates: (params: {
27
/** Metrics of the current eval run. */
+ metrics: EvalMetrics;
28
/** Thresholds to apply (typically the output of `resolveGateThresholds`). */
+ thresholds: EvalGateThresholds;
29
/** Optional baseline comparison used for the patch-apply drop gate. */
+ comparison?: EvalRegressionComparison;
30
+ }) => EvalGateResult;
31
+ //# sourceMappingURL=GateEvaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"GateEvaluator.d.ts","sourceRoot":"","sources":["../../src/eval/GateEvaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,KAAK,EAAE,wBAAwB,EAAE,MAAM,2BAA2B,CAAC;AAE1E,MAAM,WAAW,kBAAkB;IACjC,oBAAoB,EAAE,MAAM,CAAC;IAC7B,0BAA0B,EAAE,MAAM,CAAC;IACnC,sBAAsB,EAAE,MAAM,CAAC;IAC/B,wBAAwB,EAAE,MAAM,CAAC;CAClC;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,KAAK,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CACvB;AAED,MAAM,WAAW,cAAc;IAC7B,cAAc,EAAE,CAAC,CAAC;IAClB,MAAM,EAAE,OAAO,CAAC;IAChB,UAAU,EAAE,kBAAkB,CAAC;IAC/B,QAAQ,EAAE,eAAe,EAAE,CAAC;CAC7B;AAED,eAAO,MAAM,4BAA4B,EAAE,kBAK1C,CAAC;AAiBF,eAAO,MAAM,qBAAqB,GAChC,GAAG,SAAS,KAAK,CAAC,OAAO,CAAC,kBAAkB,CAAC,GAAG,SAAS,CAAC,KACzD,kBAuCF,CAAC;AAgBF,eAAO,MAAM,aAAa,GAAI,QAAQ;IACpC,OAAO,EAAE,WAAW,CAAC;IACrB,UAAU,EAAE,kBAAkB,CAAC;IAC/B,UAAU,CAAC,EAAE,wBAAwB,CAAC;CACvC,KAAG,cAmFH,CAAC"}
@@ -0,0 +1,134 @@
1
/**
 * Built-in gate thresholds, used for every field that no override source
 * defines. All values are rates in the range [0, 1].
 */
export const DEFAULT_EVAL_GATE_THRESHOLDS = {
    patch_apply_drop_max: 0.02,
    verification_pass_rate_min: 0.9,
    hallucination_rate_max: 0.02,
    scope_violation_rate_max: 0,
};

// Threshold fields in the order they are merged, validated and returned.
const THRESHOLD_KEYS = [
    "patch_apply_drop_max",
    "verification_pass_rate_min",
    "hallucination_rate_max",
    "scope_violation_rate_max",
];

// Tolerance for the patch-apply drop check. `baseline - current` is computed
// in binary floating point, so a drop that is exactly at the limit (for
// example 0.92 - 0.90 against 0.02) can come out a few ulps above it.
const DROP_TOLERANCE = 1e-9;

// Gates that compare a single current metric value against a threshold.
// Order matters: failures are reported in this order.
const ABSOLUTE_GATES = [
    {
        metric: "m004_verification_pass_rate",
        thresholdKey: "verification_pass_rate_min",
        isViolated: (rate, limit) => rate < limit,
        missingCode: "gate_verification_rate_missing",
        missingMessage: "Verification pass rate is unavailable.",
        violatedCode: "gate_verification_rate_below_min",
        violatedMessage: "Verification pass rate is below threshold.",
    },
    {
        metric: "m005_hallucination_rate",
        thresholdKey: "hallucination_rate_max",
        isViolated: (rate, limit) => rate > limit,
        missingCode: "gate_hallucination_rate_missing",
        missingMessage: "Hallucination rate is unavailable.",
        violatedCode: "gate_hallucination_rate_exceeded",
        violatedMessage: "Hallucination rate is above threshold.",
    },
    {
        metric: "m006_scope_violation_rate",
        thresholdKey: "scope_violation_rate_max",
        isViolated: (rate, limit) => rate > limit,
        missingCode: "gate_scope_violation_rate_missing",
        missingMessage: "Scope-violation rate is unavailable.",
        violatedCode: "gate_scope_violation_rate_exceeded",
        violatedMessage: "Scope-violation rate is above threshold.",
    },
];

/** Returns `value` when it is a finite number, otherwise `null`. */
const asRate = (value) => (value !== null && Number.isFinite(value) ? value : null);

/**
 * Validates one threshold value.
 *
 * @param {number | undefined} value - Candidate threshold.
 * @param {number} fallback - Returned when `value` is `undefined`.
 * @param {string} label - Field name used in the error message.
 * @returns {number} The validated threshold.
 * @throws {Error} When `value` is not a finite number in [0, 1].
 */
const normalizeThreshold = (value, fallback, label) => {
    if (value === undefined) {
        return fallback;
    }
    if (!Number.isFinite(value) || value < 0 || value > 1) {
        throw new Error(`Invalid ${label}: expected a number between 0 and 1.`);
    }
    return value;
};

/**
 * Layers partial threshold sources over the defaults, left to right. A later
 * source's defined field wins; falsy sources and `undefined` fields are
 * skipped. Validation happens once, on the merged result.
 *
 * @param {...(object | undefined)} sources - Partial threshold overrides.
 * @returns {object} A new object holding all four validated thresholds.
 * @throws {Error} When a merged value is not a finite number in [0, 1].
 */
export const resolveGateThresholds = (...sources) => {
    const merged = { ...DEFAULT_EVAL_GATE_THRESHOLDS };
    for (const source of sources) {
        if (!source) {
            continue;
        }
        for (const key of THRESHOLD_KEYS) {
            if (source[key] !== undefined) {
                merged[key] = source[key];
            }
        }
    }
    const resolved = {};
    for (const key of THRESHOLD_KEYS) {
        resolved[key] = normalizeThreshold(merged[key], DEFAULT_EVAL_GATE_THRESHOLDS[key], key);
    }
    return resolved;
};

/**
 * Looks up the baseline/current/delta triple for one metric key in a
 * regression comparison.
 *
 * @returns {{baseline: *, current: *, delta: *} | undefined} `undefined` when
 *   there is no comparison, its status is not "compared", or the key is absent.
 */
const findDelta = (comparison, key) => {
    if (!comparison || comparison.status !== "compared") {
        return undefined;
    }
    const entry = comparison.deltas.find((delta) => delta.key === key);
    if (!entry) {
        return undefined;
    }
    const { baseline, current, delta } = entry;
    return { baseline, current, delta };
};

/**
 * Evaluates every gate and collects the failures.
 *
 * @param {object} params
 * @param {object} params.metrics - Current metrics; each gated metric is read
 *   through its `.value` property.
 * @param {object} params.thresholds - Thresholds to apply.
 * @param {object} [params.comparison] - Optional regression comparison used by
 *   the patch-apply drop gate.
 * @returns {{schema_version: 1, passed: boolean, thresholds: object, failures: object[]}}
 *   `passed` is true exactly when no gate failed.
 */
export const evaluateGates = (params) => {
    const { metrics, thresholds, comparison } = params;
    const failures = [];

    // Regression gate: only meaningful when both sides of the comparison are
    // usable numbers; otherwise it is skipped rather than failed.
    const patchDelta = findDelta(comparison, "m003_patch_apply_success_rate");
    const baseline = asRate(patchDelta?.baseline ?? null);
    const current = asRate(patchDelta?.current ?? null);
    if (baseline !== null && current !== null) {
        const drop = baseline - current;
        // Compare with a tolerance so floating-point noise in the subtraction
        // cannot fail a drop that is exactly at the allowed limit.
        if (drop - thresholds.patch_apply_drop_max > DROP_TOLERANCE) {
            failures.push({
                code: "gate_patch_apply_drop_exceeded",
                metric: "m003_patch_apply_success_rate",
                message: "Patch apply success rate dropped more than the allowed threshold.",
                threshold: thresholds.patch_apply_drop_max,
                actual: current,
                baseline,
                delta: patchDelta.delta,
            });
        }
    }

    // Absolute gates: an unavailable metric is itself a failure.
    for (const gate of ABSOLUTE_GATES) {
        const threshold = thresholds[gate.thresholdKey];
        const rate = asRate(metrics[gate.metric].value);
        if (rate === null) {
            failures.push({
                code: gate.missingCode,
                metric: gate.metric,
                message: gate.missingMessage,
                threshold,
                actual: null,
            });
        }
        else if (gate.isViolated(rate, threshold)) {
            failures.push({
                code: gate.violatedCode,
                metric: gate.metric,
                message: gate.violatedMessage,
                threshold,
                actual: rate,
            });
        }
    }

    return {
        schema_version: 1,
        passed: failures.length === 0,
        thresholds,
        failures,
    };
};