@mcoda/mswarm 0.1.57 → 0.1.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -0
- package/dist/codali-executor.d.ts +278 -0
- package/dist/codali-executor.d.ts.map +1 -0
- package/dist/codali-executor.js +243 -0
- package/dist/codali-executor.js.map +1 -0
- package/dist/runtime.d.ts +46 -1
- package/dist/runtime.d.ts.map +1 -1
- package/dist/runtime.js +298 -30
- package/dist/runtime.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +66 -1
- package/dist/server.js.map +1 -1
- package/dist/vendor/codali/agents/AgentProtocol.d.ts +287 -0
- package/dist/vendor/codali/agents/AgentProtocol.d.ts.map +1 -0
- package/dist/vendor/codali/agents/AgentProtocol.js +365 -0
- package/dist/vendor/codali/agents/AgentResolver.d.ts +23 -0
- package/dist/vendor/codali/agents/AgentResolver.d.ts.map +1 -0
- package/dist/vendor/codali/agents/AgentResolver.js +77 -0
- package/dist/vendor/codali/agents/PhaseAgentSelector.d.ts +23 -0
- package/dist/vendor/codali/agents/PhaseAgentSelector.d.ts.map +1 -0
- package/dist/vendor/codali/agents/PhaseAgentSelector.js +287 -0
- package/dist/vendor/codali/cli/EvalCommand.d.ts +37 -0
- package/dist/vendor/codali/cli/EvalCommand.d.ts.map +1 -0
- package/dist/vendor/codali/cli/EvalCommand.js +333 -0
- package/dist/vendor/codali/cli/FeedbackCommand.d.ts +22 -0
- package/dist/vendor/codali/cli/FeedbackCommand.d.ts.map +1 -0
- package/dist/vendor/codali/cli/FeedbackCommand.js +163 -0
- package/dist/vendor/codali/cli/RunCommand.d.ts +78 -0
- package/dist/vendor/codali/cli/RunCommand.d.ts.map +1 -0
- package/dist/vendor/codali/cli/RunCommand.js +2261 -0
- package/dist/vendor/codali/cli.d.ts +3 -0
- package/dist/vendor/codali/cli.d.ts.map +1 -0
- package/dist/vendor/codali/cli.js +109 -0
- package/dist/vendor/codali/cognitive/ArchitectPlanner.d.ts +107 -0
- package/dist/vendor/codali/cognitive/ArchitectPlanner.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ArchitectPlanner.js +1726 -0
- package/dist/vendor/codali/cognitive/BuilderOutputParser.d.ts +25 -0
- package/dist/vendor/codali/cognitive/BuilderOutputParser.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/BuilderOutputParser.js +164 -0
- package/dist/vendor/codali/cognitive/BuilderRunner.d.ts +76 -0
- package/dist/vendor/codali/cognitive/BuilderRunner.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/BuilderRunner.js +1159 -0
- package/dist/vendor/codali/cognitive/ContextAssembler.d.ts +91 -0
- package/dist/vendor/codali/cognitive/ContextAssembler.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextAssembler.js +4547 -0
- package/dist/vendor/codali/cognitive/ContextBudget.d.ts +19 -0
- package/dist/vendor/codali/cognitive/ContextBudget.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextBudget.js +35 -0
- package/dist/vendor/codali/cognitive/ContextFileLoader.d.ts +30 -0
- package/dist/vendor/codali/cognitive/ContextFileLoader.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextFileLoader.js +307 -0
- package/dist/vendor/codali/cognitive/ContextManager.d.ts +47 -0
- package/dist/vendor/codali/cognitive/ContextManager.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextManager.js +272 -0
- package/dist/vendor/codali/cognitive/ContextRedactor.d.ts +18 -0
- package/dist/vendor/codali/cognitive/ContextRedactor.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextRedactor.js +53 -0
- package/dist/vendor/codali/cognitive/ContextSelector.d.ts +22 -0
- package/dist/vendor/codali/cognitive/ContextSelector.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextSelector.js +431 -0
- package/dist/vendor/codali/cognitive/ContextSerializer.d.ts +8 -0
- package/dist/vendor/codali/cognitive/ContextSerializer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextSerializer.js +882 -0
- package/dist/vendor/codali/cognitive/ContextStore.d.ts +27 -0
- package/dist/vendor/codali/cognitive/ContextStore.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextStore.js +79 -0
- package/dist/vendor/codali/cognitive/ContextSummarizer.d.ts +16 -0
- package/dist/vendor/codali/cognitive/ContextSummarizer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextSummarizer.js +45 -0
- package/dist/vendor/codali/cognitive/CostEstimator.d.ts +31 -0
- package/dist/vendor/codali/cognitive/CostEstimator.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/CostEstimator.js +66 -0
- package/dist/vendor/codali/cognitive/CriticEvaluator.d.ts +32 -0
- package/dist/vendor/codali/cognitive/CriticEvaluator.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/CriticEvaluator.js +297 -0
- package/dist/vendor/codali/cognitive/EvidenceGate.d.ts +9 -0
- package/dist/vendor/codali/cognitive/EvidenceGate.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/EvidenceGate.js +75 -0
- package/dist/vendor/codali/cognitive/GoldenExampleIndexer.d.ts +12 -0
- package/dist/vendor/codali/cognitive/GoldenExampleIndexer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/GoldenExampleIndexer.js +34 -0
- package/dist/vendor/codali/cognitive/GoldenSetStore.d.ts +33 -0
- package/dist/vendor/codali/cognitive/GoldenSetStore.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/GoldenSetStore.js +159 -0
- package/dist/vendor/codali/cognitive/IntentSignals.d.ts +7 -0
- package/dist/vendor/codali/cognitive/IntentSignals.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/IntentSignals.js +285 -0
- package/dist/vendor/codali/cognitive/LearningGovernance.d.ts +100 -0
- package/dist/vendor/codali/cognitive/LearningGovernance.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/LearningGovernance.js +276 -0
- package/dist/vendor/codali/cognitive/MemoryWriteback.d.ts +64 -0
- package/dist/vendor/codali/cognitive/MemoryWriteback.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/MemoryWriteback.js +287 -0
- package/dist/vendor/codali/cognitive/PatchApplier.d.ts +49 -0
- package/dist/vendor/codali/cognitive/PatchApplier.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PatchApplier.js +199 -0
- package/dist/vendor/codali/cognitive/PatchInterpreter.d.ts +35 -0
- package/dist/vendor/codali/cognitive/PatchInterpreter.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PatchInterpreter.js +100 -0
- package/dist/vendor/codali/cognitive/PatchOutputNormalizer.d.ts +7 -0
- package/dist/vendor/codali/cognitive/PatchOutputNormalizer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PatchOutputNormalizer.js +59 -0
- package/dist/vendor/codali/cognitive/PostMortemAnalyzer.d.ts +17 -0
- package/dist/vendor/codali/cognitive/PostMortemAnalyzer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PostMortemAnalyzer.js +131 -0
- package/dist/vendor/codali/cognitive/PreferenceExtraction.d.ts +3 -0
- package/dist/vendor/codali/cognitive/PreferenceExtraction.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PreferenceExtraction.js +85 -0
- package/dist/vendor/codali/cognitive/Prompts.d.ts +15 -0
- package/dist/vendor/codali/cognitive/Prompts.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/Prompts.js +326 -0
- package/dist/vendor/codali/cognitive/ProviderRouting.d.ts +16 -0
- package/dist/vendor/codali/cognitive/ProviderRouting.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ProviderRouting.js +24 -0
- package/dist/vendor/codali/cognitive/QueryExtraction.d.ts +12 -0
- package/dist/vendor/codali/cognitive/QueryExtraction.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/QueryExtraction.js +262 -0
- package/dist/vendor/codali/cognitive/RunHistoryIndexer.d.ts +13 -0
- package/dist/vendor/codali/cognitive/RunHistoryIndexer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/RunHistoryIndexer.js +125 -0
- package/dist/vendor/codali/cognitive/SmartPipeline.d.ts +92 -0
- package/dist/vendor/codali/cognitive/SmartPipeline.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/SmartPipeline.js +4804 -0
- package/dist/vendor/codali/cognitive/Types.d.ts +474 -0
- package/dist/vendor/codali/cognitive/Types.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/Types.js +7 -0
- package/dist/vendor/codali/cognitive/ValidationRunner.d.ts +57 -0
- package/dist/vendor/codali/cognitive/ValidationRunner.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ValidationRunner.js +515 -0
- package/dist/vendor/codali/config/Config.d.ts +249 -0
- package/dist/vendor/codali/config/Config.d.ts.map +1 -0
- package/dist/vendor/codali/config/Config.js +200 -0
- package/dist/vendor/codali/config/ConfigLoader.d.ts +56 -0
- package/dist/vendor/codali/config/ConfigLoader.d.ts.map +1 -0
- package/dist/vendor/codali/config/ConfigLoader.js +1246 -0
- package/dist/vendor/codali/docdex/DocdexClient.d.ts +158 -0
- package/dist/vendor/codali/docdex/DocdexClient.d.ts.map +1 -0
- package/dist/vendor/codali/docdex/DocdexClient.js +785 -0
- package/dist/vendor/codali/eval/EvalRunner.d.ts +35 -0
- package/dist/vendor/codali/eval/EvalRunner.d.ts.map +1 -0
- package/dist/vendor/codali/eval/EvalRunner.js +38 -0
- package/dist/vendor/codali/eval/EvalTaskExecutor.d.ts +81 -0
- package/dist/vendor/codali/eval/EvalTaskExecutor.d.ts.map +1 -0
- package/dist/vendor/codali/eval/EvalTaskExecutor.js +371 -0
- package/dist/vendor/codali/eval/GateEvaluator.d.ts +31 -0
- package/dist/vendor/codali/eval/GateEvaluator.d.ts.map +1 -0
- package/dist/vendor/codali/eval/GateEvaluator.js +134 -0
- package/dist/vendor/codali/eval/MetricTypes.d.ts +28 -0
- package/dist/vendor/codali/eval/MetricTypes.d.ts.map +1 -0
- package/dist/vendor/codali/eval/MetricTypes.js +1 -0
- package/dist/vendor/codali/eval/MetricsAggregator.d.ts +4 -0
- package/dist/vendor/codali/eval/MetricsAggregator.d.ts.map +1 -0
- package/dist/vendor/codali/eval/MetricsAggregator.js +97 -0
- package/dist/vendor/codali/eval/RegressionComparator.d.ts +29 -0
- package/dist/vendor/codali/eval/RegressionComparator.d.ts.map +1 -0
- package/dist/vendor/codali/eval/RegressionComparator.js +155 -0
- package/dist/vendor/codali/eval/ReportInputAdapter.d.ts +52 -0
- package/dist/vendor/codali/eval/ReportInputAdapter.d.ts.map +1 -0
- package/dist/vendor/codali/eval/ReportInputAdapter.js +229 -0
- package/dist/vendor/codali/eval/ReportSerializer.d.ts +32 -0
- package/dist/vendor/codali/eval/ReportSerializer.d.ts.map +1 -0
- package/dist/vendor/codali/eval/ReportSerializer.js +33 -0
- package/dist/vendor/codali/eval/ReportStore.d.ts +18 -0
- package/dist/vendor/codali/eval/ReportStore.d.ts.map +1 -0
- package/dist/vendor/codali/eval/ReportStore.js +96 -0
- package/dist/vendor/codali/eval/SuiteLoader.d.ts +12 -0
- package/dist/vendor/codali/eval/SuiteLoader.d.ts.map +1 -0
- package/dist/vendor/codali/eval/SuiteLoader.js +51 -0
- package/dist/vendor/codali/eval/SuiteSchema.d.ts +56 -0
- package/dist/vendor/codali/eval/SuiteSchema.d.ts.map +1 -0
- package/dist/vendor/codali/eval/SuiteSchema.js +357 -0
- package/dist/vendor/codali/index.d.ts +11 -0
- package/dist/vendor/codali/index.d.ts.map +1 -0
- package/dist/vendor/codali/index.js +5 -0
- package/dist/vendor/codali/providers/CodexCliProvider.d.ts +8 -0
- package/dist/vendor/codali/providers/CodexCliProvider.d.ts.map +1 -0
- package/dist/vendor/codali/providers/CodexCliProvider.js +282 -0
- package/dist/vendor/codali/providers/OllamaRemoteProvider.d.ts +8 -0
- package/dist/vendor/codali/providers/OllamaRemoteProvider.d.ts.map +1 -0
- package/dist/vendor/codali/providers/OllamaRemoteProvider.js +300 -0
- package/dist/vendor/codali/providers/OpenAiCompatibleProvider.d.ts +8 -0
- package/dist/vendor/codali/providers/OpenAiCompatibleProvider.d.ts.map +1 -0
- package/dist/vendor/codali/providers/OpenAiCompatibleProvider.js +192 -0
- package/dist/vendor/codali/providers/ProviderRegistry.d.ts +12 -0
- package/dist/vendor/codali/providers/ProviderRegistry.d.ts.map +1 -0
- package/dist/vendor/codali/providers/ProviderRegistry.js +28 -0
- package/dist/vendor/codali/providers/ProviderTypes.d.ts +81 -0
- package/dist/vendor/codali/providers/ProviderTypes.d.ts.map +1 -0
- package/dist/vendor/codali/providers/ProviderTypes.js +1 -0
- package/dist/vendor/codali/runtime/CodaliRuntime.d.ts +189 -0
- package/dist/vendor/codali/runtime/CodaliRuntime.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/CodaliRuntime.js +1435 -0
- package/dist/vendor/codali/runtime/DeepInvestigationErrors.d.ts +39 -0
- package/dist/vendor/codali/runtime/DeepInvestigationErrors.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/DeepInvestigationErrors.js +57 -0
- package/dist/vendor/codali/runtime/RunContext.d.ts +27 -0
- package/dist/vendor/codali/runtime/RunContext.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunContext.js +51 -0
- package/dist/vendor/codali/runtime/RunLogQuery.d.ts +48 -0
- package/dist/vendor/codali/runtime/RunLogQuery.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunLogQuery.js +36 -0
- package/dist/vendor/codali/runtime/RunLogReader.d.ts +19 -0
- package/dist/vendor/codali/runtime/RunLogReader.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunLogReader.js +361 -0
- package/dist/vendor/codali/runtime/RunLogger.d.ts +71 -0
- package/dist/vendor/codali/runtime/RunLogger.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunLogger.js +100 -0
- package/dist/vendor/codali/runtime/RunTelemetryTypes.d.ts +117 -0
- package/dist/vendor/codali/runtime/RunTelemetryTypes.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunTelemetryTypes.js +299 -0
- package/dist/vendor/codali/runtime/Runner.d.ts +66 -0
- package/dist/vendor/codali/runtime/Runner.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/Runner.js +215 -0
- package/dist/vendor/codali/runtime/StoragePaths.d.ts +3 -0
- package/dist/vendor/codali/runtime/StoragePaths.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/StoragePaths.js +19 -0
- package/dist/vendor/codali/runtime/WorkspaceLock.d.ts +30 -0
- package/dist/vendor/codali/runtime/WorkspaceLock.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/WorkspaceLock.js +141 -0
- package/dist/vendor/codali/session/InstructionLoader.d.ts +14 -0
- package/dist/vendor/codali/session/InstructionLoader.d.ts.map +1 -0
- package/dist/vendor/codali/session/InstructionLoader.js +107 -0
- package/dist/vendor/codali/session/SessionStore.d.ts +81 -0
- package/dist/vendor/codali/session/SessionStore.d.ts.map +1 -0
- package/dist/vendor/codali/session/SessionStore.js +244 -0
- package/dist/vendor/codali/subagents/SubagentOrchestrator.d.ts +68 -0
- package/dist/vendor/codali/subagents/SubagentOrchestrator.d.ts.map +1 -0
- package/dist/vendor/codali/subagents/SubagentOrchestrator.js +150 -0
- package/dist/vendor/codali/tools/ToolRegistry.d.ts +9 -0
- package/dist/vendor/codali/tools/ToolRegistry.d.ts.map +1 -0
- package/dist/vendor/codali/tools/ToolRegistry.js +293 -0
- package/dist/vendor/codali/tools/ToolTypes.d.ts +66 -0
- package/dist/vendor/codali/tools/ToolTypes.d.ts.map +1 -0
- package/dist/vendor/codali/tools/ToolTypes.js +40 -0
- package/dist/vendor/codali/tools/diff/DiffTool.d.ts +3 -0
- package/dist/vendor/codali/tools/diff/DiffTool.d.ts.map +1 -0
- package/dist/vendor/codali/tools/diff/DiffTool.js +34 -0
- package/dist/vendor/codali/tools/docdex/DocdexTools.d.ts +4 -0
- package/dist/vendor/codali/tools/docdex/DocdexTools.d.ts.map +1 -0
- package/dist/vendor/codali/tools/docdex/DocdexTools.js +490 -0
- package/dist/vendor/codali/tools/filesystem/FileTools.d.ts +3 -0
- package/dist/vendor/codali/tools/filesystem/FileTools.d.ts.map +1 -0
- package/dist/vendor/codali/tools/filesystem/FileTools.js +141 -0
- package/dist/vendor/codali/tools/search/SearchTool.d.ts +3 -0
- package/dist/vendor/codali/tools/search/SearchTool.d.ts.map +1 -0
- package/dist/vendor/codali/tools/search/SearchTool.js +46 -0
- package/dist/vendor/codali/tools/shell/ShellTool.d.ts +3 -0
- package/dist/vendor/codali/tools/shell/ShellTool.d.ts.map +1 -0
- package/dist/vendor/codali/tools/shell/ShellTool.js +104 -0
- package/package.json +5 -3
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
/**
 * Baseline gate thresholds applied when no override supplies a value.
 * All four values are ratios in the range 0..1.
 */
export const DEFAULT_EVAL_GATE_THRESHOLDS = {
    // Largest tolerated fall (baseline minus current) in patch-apply success rate.
    patch_apply_drop_max: 0.02,
    // Lowest acceptable verification pass rate.
    verification_pass_rate_min: 0.9,
    // Highest acceptable hallucination rate.
    hallucination_rate_max: 0.02,
    // Highest acceptable scope-violation rate (zero tolerance by default).
    scope_violation_rate_max: 0,
};
|
|
7
|
+
/**
 * Passes a finite numeric rate through unchanged; anything else
 * (null, undefined, NaN, +/-Infinity, non-numbers) collapses to null.
 */
const asRate = (value) => {
    if (value === null || !Number.isFinite(value)) {
        return null;
    }
    return value;
};
|
|
8
|
+
/**
 * Validates a single threshold value.
 *
 * @param value    Candidate threshold; `undefined` means "not provided".
 * @param fallback Value returned when `value` is `undefined`.
 * @param label    Threshold name used in the error message.
 * @returns `fallback` when nothing was provided, otherwise `value` itself.
 * @throws Error when `value` is provided but is not a finite number within 0..1.
 */
const normalizeThreshold = (value, fallback, label) => {
    if (value === undefined) {
        return fallback;
    }
    const withinUnitInterval = Number.isFinite(value) && value >= 0 && value <= 1;
    if (withinUnitInterval) {
        return value;
    }
    throw new Error(`Invalid ${label}: expected a number between 0 and 1.`);
};
|
|
16
|
+
/**
 * Builds the effective gate thresholds from zero or more override sources.
 *
 * Sources are applied left to right on top of DEFAULT_EVAL_GATE_THRESHOLDS;
 * falsy sources are skipped and a field only overrides when it is not
 * `undefined`. The merged values are then validated one by one.
 *
 * @throws Error (from normalizeThreshold) when a merged value is not a finite
 *         number within 0..1.
 */
export const resolveGateThresholds = (...sources) => {
    // Keep the current value unless the override actually carries one.
    const override = (candidate, current) => (candidate === undefined ? current : candidate);
    const merged = sources.reduce((acc, source) => {
        if (!source) {
            return acc;
        }
        return {
            patch_apply_drop_max: override(source.patch_apply_drop_max, acc.patch_apply_drop_max),
            verification_pass_rate_min: override(source.verification_pass_rate_min, acc.verification_pass_rate_min),
            hallucination_rate_max: override(source.hallucination_rate_max, acc.hallucination_rate_max),
            scope_violation_rate_max: override(source.scope_violation_rate_max, acc.scope_violation_rate_max),
        };
    }, { ...DEFAULT_EVAL_GATE_THRESHOLDS });
    const defaults = DEFAULT_EVAL_GATE_THRESHOLDS;
    return {
        patch_apply_drop_max: normalizeThreshold(merged.patch_apply_drop_max, defaults.patch_apply_drop_max, "patch_apply_drop_max"),
        verification_pass_rate_min: normalizeThreshold(merged.verification_pass_rate_min, defaults.verification_pass_rate_min, "verification_pass_rate_min"),
        hallucination_rate_max: normalizeThreshold(merged.hallucination_rate_max, defaults.hallucination_rate_max, "hallucination_rate_max"),
        scope_violation_rate_max: normalizeThreshold(merged.scope_violation_rate_max, defaults.scope_violation_rate_max, "scope_violation_rate_max"),
    };
};
|
|
41
|
+
/**
 * Looks up one metric's delta entry in a regression comparison.
 *
 * @param comparison Comparison object; may be missing.
 * @param key        Metric key to search for in `comparison.deltas`.
 * @returns `{ baseline, current, delta }` copied from the matching entry, or
 *          `undefined` when there is no comparison, its status is not
 *          "compared", or no entry carries that key.
 */
const findDelta = (comparison, key) => {
    const match = comparison && comparison.status === "compared"
        ? comparison.deltas.find((candidate) => candidate.key === key)
        : undefined;
    if (!match) {
        return undefined;
    }
    const { baseline, current, delta } = match;
    return { baseline, current, delta };
};
|
|
53
|
+
/**
 * Evaluates the eval gates against aggregated metrics.
 *
 * Checks performed, in order:
 *  1. Patch-apply drop: only when a "compared" comparison has both a baseline
 *     and a current value for m003; fails when `baseline - current` exceeds
 *     `thresholds.patch_apply_drop_max`.
 *  2. Verification pass rate (m004): fails when unavailable or below the minimum.
 *  3. Hallucination rate (m005): fails when unavailable or above the maximum.
 *  4. Scope-violation rate (m006): fails when unavailable or above the maximum.
 *
 * @param params `{ metrics, thresholds, comparison }`; `comparison` may be absent.
 * @returns `{ schema_version: 1, passed, thresholds, failures }`, where `passed`
 *          is true only when no failure was recorded.
 */
export const evaluateGates = (params) => {
    // `baseline - current` is a floating-point subtraction, so a drop that is
    // exactly at the limit (e.g. 0.92 - 0.90 vs 0.02) can come out a few ulps
    // above it. Allow a tiny absolute tolerance so rounding noise alone never
    // fails the gate.
    const DROP_TOLERANCE = 1e-12;
    const failures = [];
    const { metrics, thresholds, comparison } = params;
    const patchDelta = findDelta(comparison, "m003_patch_apply_success_rate");
    if (patchDelta && patchDelta.baseline !== null && patchDelta.current !== null) {
        const drop = patchDelta.baseline - patchDelta.current;
        if (drop - thresholds.patch_apply_drop_max > DROP_TOLERANCE) {
            failures.push({
                code: "gate_patch_apply_drop_exceeded",
                metric: "m003_patch_apply_success_rate",
                message: "Patch apply success rate dropped more than the allowed threshold.",
                threshold: thresholds.patch_apply_drop_max,
                actual: patchDelta.current,
                baseline: patchDelta.baseline,
                delta: patchDelta.delta,
            });
        }
    }
    const verificationRate = asRate(metrics.m004_verification_pass_rate.value);
    if (verificationRate === null) {
        // A missing rate is treated as a failure rather than silently passing.
        failures.push({
            code: "gate_verification_rate_missing",
            metric: "m004_verification_pass_rate",
            message: "Verification pass rate is unavailable.",
            threshold: thresholds.verification_pass_rate_min,
            actual: null,
        });
    }
    else if (verificationRate < thresholds.verification_pass_rate_min) {
        failures.push({
            code: "gate_verification_rate_below_min",
            metric: "m004_verification_pass_rate",
            message: "Verification pass rate is below threshold.",
            threshold: thresholds.verification_pass_rate_min,
            actual: verificationRate,
        });
    }
    const hallucinationRate = asRate(metrics.m005_hallucination_rate.value);
    if (hallucinationRate === null) {
        failures.push({
            code: "gate_hallucination_rate_missing",
            metric: "m005_hallucination_rate",
            message: "Hallucination rate is unavailable.",
            threshold: thresholds.hallucination_rate_max,
            actual: null,
        });
    }
    else if (hallucinationRate > thresholds.hallucination_rate_max) {
        failures.push({
            code: "gate_hallucination_rate_exceeded",
            metric: "m005_hallucination_rate",
            message: "Hallucination rate is above threshold.",
            threshold: thresholds.hallucination_rate_max,
            actual: hallucinationRate,
        });
    }
    const scopeRate = asRate(metrics.m006_scope_violation_rate.value);
    if (scopeRate === null) {
        failures.push({
            code: "gate_scope_violation_rate_missing",
            metric: "m006_scope_violation_rate",
            message: "Scope-violation rate is unavailable.",
            threshold: thresholds.scope_violation_rate_max,
            actual: null,
        });
    }
    else if (scopeRate > thresholds.scope_violation_rate_max) {
        failures.push({
            code: "gate_scope_violation_rate_exceeded",
            metric: "m006_scope_violation_rate",
            message: "Scope-violation rate is above threshold.",
            threshold: thresholds.scope_violation_rate_max,
            actual: scopeRate,
        });
    }
    return {
        schema_version: 1,
        passed: failures.length === 0,
        thresholds,
        failures,
    };
};
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
 * A ratio metric together with the counts it was derived from.
 *
 * NOTE(review): the field notes below mirror how `computeRate` in
 * MetricsAggregator fills an object of this shape; confirm no other producer
 * uses different semantics.
 */
export interface RateMetric {
    /** Number of counted samples that were truthy. */
    numerator: number;
    /** Number of counted (non-null) samples. */
    denominator: number;
    /** Number of null samples left out of the ratio. */
    missing: number;
    /** `numerator / denominator`, or `null` when nothing was counted. */
    value: number | null;
}
|
|
7
|
+
/**
 * Median / 95th-percentile summary of a numeric sample set.
 *
 * NOTE(review): the field notes below mirror `computePercentiles` in
 * MetricsAggregator; confirm there is no other producer.
 */
export interface PercentileMetric {
    /** Number of non-null samples that were summarised. */
    sample_size: number;
    /** Number of null samples that were dropped. */
    missing: number;
    /** Median of the samples, or `null` when there were none. */
    median: number | null;
    /** 95th percentile of the samples, or `null` when there were none. */
    p95: number | null;
}
|
|
13
|
+
/**
 * Aggregated metrics for one eval run (schema version 1).
 *
 * Rate metrics (m001-m006) are `RateMetric`s; latency, token and cost metrics
 * (m007-m008) are `PercentileMetric`s.
 */
export interface EvalMetrics {
    /** Fixed schema marker. */
    schema_version: 1;
    /** Timestamp string for when the metrics were generated. */
    generated_at: string;
    /** Number of tasks in the run. */
    task_count: number;

    m001_task_success_rate: RateMetric;
    m002_first_pass_success_rate: RateMetric;
    m003_patch_apply_success_rate: RateMetric;
    m004_verification_pass_rate: RateMetric;
    m005_hallucination_rate: RateMetric;
    m006_scope_violation_rate: RateMetric;

    m007_latency_ms: PercentileMetric;
    m008_success_tokens: PercentileMetric;
    m008_success_cost_usd: PercentileMetric;
}
|
|
27
|
+
/**
 * Flat key naming a single comparable number inside `EvalMetrics`:
 * rate metrics by field name, percentile metrics as `<field>.median` / `<field>.p95`.
 */
export type EvalMetricKey =
    | "m001_task_success_rate"
    | "m002_first_pass_success_rate"
    | "m003_patch_apply_success_rate"
    | "m004_verification_pass_rate"
    | "m005_hallucination_rate"
    | "m006_scope_violation_rate"
    | "m007_latency_ms.median"
    | "m007_latency_ms.p95"
    | "m008_success_tokens.median"
    | "m008_success_tokens.p95"
    | "m008_success_cost_usd.median"
    | "m008_success_cost_usd.p95";
|
|
28
|
+
//# sourceMappingURL=MetricTypes.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"MetricTypes.d.ts","sourceRoot":"","sources":["../../src/eval/MetricTypes.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,UAAU;IACzB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;CACtB;AAED,MAAM,WAAW,gBAAgB;IAC/B,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC;CACpB;AAED,MAAM,WAAW,WAAW;IAC1B,cAAc,EAAE,CAAC,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB,EAAE,UAAU,CAAC;IACnC,4BAA4B,EAAE,UAAU,CAAC;IACzC,6BAA6B,EAAE,UAAU,CAAC;IAC1C,2BAA2B,EAAE,UAAU,CAAC;IACxC,uBAAuB,EAAE,UAAU,CAAC;IACpC,yBAAyB,EAAE,UAAU,CAAC;IACtC,eAAe,EAAE,gBAAgB,CAAC;IAClC,mBAAmB,EAAE,gBAAgB,CAAC;IACtC,qBAAqB,EAAE,gBAAgB,CAAC;CACzC;AAED,MAAM,MAAM,aAAa,GACrB,wBAAwB,GACxB,8BAA8B,GAC9B,+BAA+B,GAC/B,6BAA6B,GAC7B,yBAAyB,GACzB,2BAA2B,GAC3B,wBAAwB,GACxB,qBAAqB,GACrB,4BAA4B,GAC5B,yBAAyB,GACzB,8BAA8B,GAC9B,2BAA2B,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"MetricsAggregator.d.ts","sourceRoot":"","sources":["../../src/eval/MetricsAggregator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAgC,MAAM,kBAAkB,CAAC;AAClF,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AA0ErD,eAAO,MAAM,gBAAgB,GAAI,KAAK,aAAa,KAAG,WA0BrD,CAAC"}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
 * Reduces a list of tri-state samples to a rate.
 *
 * `null` samples are tallied as missing and excluded from the ratio; every
 * other sample counts toward the denominator, and truthy ones toward the
 * numerator.
 *
 * @returns `{ numerator, denominator, missing, value }`, where `value` is
 *          `numerator / denominator`, or `null` when nothing was counted.
 */
const computeRate = (samples) => {
    let missing = 0;
    let denominator = 0;
    let numerator = 0;
    for (const outcome of samples) {
        if (outcome === null) {
            missing += 1;
        }
        else {
            denominator += 1;
            numerator += outcome ? 1 : 0;
        }
    }
    const value = denominator > 0 ? numerator / denominator : null;
    return { numerator, denominator, missing, value };
};
|
|
21
|
+
/**
 * Linearly interpolated percentile of a numeric list.
 *
 * @param values   Numbers to summarise; the input is not mutated.
 * @param fraction Position within the sorted data (the callers in this file
 *                 pass 0.5 and 0.95).
 * @returns `null` for an empty list, the sole element for a single-item list,
 *          otherwise the value at rank `(length - 1) * fraction`, interpolated
 *          between neighbours when the rank is fractional.
 */
const percentile = (values, fraction) => {
    const count = values.length;
    if (count === 0) {
        return null;
    }
    const ordered = Array.from(values).sort((a, b) => a - b);
    if (count === 1) {
        return ordered[0];
    }
    const rank = (count - 1) * fraction;
    const below = Math.floor(rank);
    const above = Math.ceil(rank);
    if (below === above) {
        // Rank lands exactly on an element, so no interpolation is needed.
        return ordered[below];
    }
    const low = ordered[below];
    const high = ordered[above];
    return low + (high - low) * (rank - below);
};
|
|
37
|
+
/**
 * Summarises nullable numeric samples as median and p95.
 *
 * `null` entries are dropped before the percentiles are computed and are
 * reported through `missing`.
 *
 * @returns `{ sample_size, missing, median, p95 }`.
 */
const computePercentiles = (samples) => {
    const present = samples.filter((sample) => sample !== null);
    const missing = samples.length - present.length;
    return {
        sample_size: present.length,
        missing,
        median: percentile(present, 0.5),
        p95: percentile(present, 0.95),
    };
};
|
|
46
|
+
/**
 * First-pass sample for one task result.
 *
 * A task that did not pass is always `false`; otherwise the recorded
 * `first_pass` value is returned as-is, including `null` when unknown.
 */
const toFirstPassSample = (result) => (result.task_passed ? result.first_pass : false);
|
|
53
|
+
/**
 * Patch-apply sample for one task result.
 *
 * Returns the recorded `patch_apply_success` when it is not `null`. When it is
 * `null`, the task counts as a failure (`false`) if any assertion result has
 * code "assert_expect_patch_apply", and as missing (`null`) otherwise.
 */
const toPatchApplySample = (result) => {
    const recorded = result.patch_apply_success;
    if (recorded !== null) {
        return recorded;
    }
    const expectsPatch = result.assertion_results.some(({ code }) => code === "assert_expect_patch_apply");
    return expectsPatch ? false : null;
};
|
|
61
|
+
/** Verification sample for one task result: its `verification_passed` value, unchanged. */
const toVerificationSample = (result) => {
    return result.verification_passed;
};
|
|
62
|
+
/**
 * Hallucination sample for one task result: `null` when detection did not
 * report, otherwise the recorded `hallucination_detected` flag.
 */
const toHallucinationSample = (result) => {
    const flag = result.hallucination_detected;
    return flag === null ? null : flag;
};
|
|
67
|
+
/**
 * Scope-violation sample for one task result: `null` when detection did not
 * report, otherwise the recorded `scope_violation_detected` flag.
 */
const toScopeViolationSample = (result) => {
    const flag = result.scope_violation_detected;
    return flag === null ? null : flag;
};
|
|
72
|
+
/**
 * Aggregates per-task results of an eval run into the metrics object.
 *
 * Rates (m001-m006) are computed over all task results. Latency (m007) also
 * covers all tasks, while the token and cost percentiles (m008) only cover
 * tasks whose `task_passed` is truthy. `generated_at` is stamped with the
 * current time, so two calls on the same input differ in that field.
 *
 * @param run Object with `task_results` (array) and `summary.total`.
 */
export const aggregateMetrics = (run) => {
    const results = run.task_results;
    // Derive one sample per task with the given extractor, then fold to a rate.
    const rateOf = (toSample) => computeRate(results.map((result) => toSample(result)));
    const passedTasks = results.filter((result) => result.task_passed);
    return {
        schema_version: 1,
        generated_at: new Date().toISOString(),
        task_count: run.summary.total,
        m001_task_success_rate: rateOf((result) => result.task_passed),
        m002_first_pass_success_rate: rateOf(toFirstPassSample),
        m003_patch_apply_success_rate: rateOf(toPatchApplySample),
        m004_verification_pass_rate: rateOf(toVerificationSample),
        m005_hallucination_rate: rateOf(toHallucinationSample),
        m006_scope_violation_rate: rateOf(toScopeViolationSample),
        m007_latency_ms: computePercentiles(results.map((result) => result.latency_ms)),
        m008_success_tokens: computePercentiles(passedTasks.map((result) => result.tokens_used)),
        m008_success_cost_usd: computePercentiles(passedTasks.map((result) => result.cost_usd)),
    };
};
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { EvalMetricKey, EvalMetrics } from "./MetricTypes.js";
|
|
2
|
+
/**
 * Baseline-versus-current comparison for a single metric key.
 */
export interface EvalMetricDelta {
    /** Which metric this entry describes. */
    key: EvalMetricKey;
    /** Unit of the compared values. */
    unit: "ratio" | "ms" | "tokens" | "usd";
    /** Whether a larger value is the desirable direction for this metric. */
    higher_is_better: boolean;
    /** Baseline value, or `null` when unavailable. */
    baseline: number | null;
    /** Current value, or `null` when unavailable. */
    current: number | null;
    /**
     * Difference between the two values, or `null` when unavailable.
     * NOTE(review): presumably `current - baseline`; the computation is not
     * part of this declaration, so confirm the sign convention.
     */
    delta: number | null;
    /** Direction of change; "unknown" is one of the allowed values. */
    direction: "up" | "down" | "flat" | "unknown";
    /** True when the change is classified as a regression. */
    regression: boolean;
    /** True when the change is classified as an improvement. */
    improved: boolean;
}
|
|
13
|
+
/**
 * Result of comparing current eval metrics against a baseline (schema version 1).
 */
export interface EvalRegressionComparison {
    /** Fixed schema marker. */
    schema_version: 1;
    /** "compared" when a baseline was available, "baseline_missing" otherwise. */
    status: "baseline_missing" | "compared";
    /** Identifier of the baseline report, when known. */
    baseline_report_id?: string;
    /** Creation timestamp of the baseline report, when known. */
    baseline_created_at?: string;
    /** Per-metric comparison entries. */
    deltas: EvalMetricDelta[];
    /** Number of metrics counted as regressions. */
    regression_count: number;
    /** Number of metrics counted as improvements. */
    improved_count: number;
    /** Number of metrics counted as unchanged. */
    unchanged_count: number;
}
|
|
23
|
+
/**
 * Compares current metrics against an optional baseline.
 *
 * @param params.current             Metrics for the run under evaluation.
 * @param params.baseline            Baseline metrics; optional.
 * @param params.baseline_report_id  Optional identifier of the baseline report.
 * @param params.baseline_created_at Optional creation timestamp of the baseline report.
 * @returns The comparison result.
 */
export declare const compareAgainstBaseline: (params: {
    current: EvalMetrics;
    baseline?: EvalMetrics;
    baseline_report_id?: string;
    baseline_created_at?: string;
}) => EvalRegressionComparison;
|
|
29
|
+
//# sourceMappingURL=RegressionComparator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"RegressionComparator.d.ts","sourceRoot":"","sources":["../../src/eval/RegressionComparator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAEnE,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,aAAa,CAAC;IACnB,IAAI,EAAE,OAAO,GAAG,IAAI,GAAG,QAAQ,GAAG,KAAK,CAAC;IACxC,gBAAgB,EAAE,OAAO,CAAC;IAC1B,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,SAAS,EAAE,IAAI,GAAG,MAAM,GAAG,MAAM,GAAG,SAAS,CAAC;IAC9C,UAAU,EAAE,OAAO,CAAC;IACpB,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,wBAAwB;IACvC,cAAc,EAAE,CAAC,CAAC;IAClB,MAAM,EAAE,kBAAkB,GAAG,UAAU,CAAC;IACxC,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,MAAM,EAAE,eAAe,EAAE,CAAC;IAC1B,gBAAgB,EAAE,MAAM,CAAC;IACzB,cAAc,EAAE,MAAM,CAAC;IACvB,eAAe,EAAE,MAAM,CAAC;CACzB;AA0HD,eAAO,MAAM,sBAAsB,GAAI,QAAQ;IAC7C,OAAO,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,WAAW,CAAC;IACvB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B,KAAG,wBAiDH,CAAC"}
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
// Small absolute tolerance (1e-12). NOTE(review): its use sites are outside this excerpt — presumably for comparing floating-point metric deltas; confirm.
const EPSILON = 1e-12;
|
|
2
|
+
/**
 * Flattens a metrics object into an ordered list of comparable entries.
 *
 * Each entry has the shape `{ key, value, unit, higher_is_better }`. Rate metrics
 * read `.value`; latency, token and cost metrics contribute one entry each for
 * `.median` and `.p95`. The order of the returned list is fixed.
 */
const flattenMetrics = (metrics) => {
    // One row per entry: [key, value, unit, higher_is_better].
    const rows = [
        ["m001_task_success_rate", metrics.m001_task_success_rate.value, "ratio", true],
        ["m002_first_pass_success_rate", metrics.m002_first_pass_success_rate.value, "ratio", true],
        ["m003_patch_apply_success_rate", metrics.m003_patch_apply_success_rate.value, "ratio", true],
        ["m004_verification_pass_rate", metrics.m004_verification_pass_rate.value, "ratio", true],
        ["m005_hallucination_rate", metrics.m005_hallucination_rate.value, "ratio", false],
        ["m006_scope_violation_rate", metrics.m006_scope_violation_rate.value, "ratio", false],
        ["m007_latency_ms.median", metrics.m007_latency_ms.median, "ms", false],
        ["m007_latency_ms.p95", metrics.m007_latency_ms.p95, "ms", false],
        ["m008_success_tokens.median", metrics.m008_success_tokens.median, "tokens", false],
        ["m008_success_tokens.p95", metrics.m008_success_tokens.p95, "tokens", false],
        ["m008_success_cost_usd.median", metrics.m008_success_cost_usd.median, "usd", false],
        ["m008_success_cost_usd.p95", metrics.m008_success_cost_usd.p95, "usd", false],
    ];
    return rows.map(([key, value, unit, higher_is_better]) => ({
        key,
        value,
        unit,
        higher_is_better,
    }));
};
|
|
76
|
+
/**
 * Builds the delta record for one metric by comparing its current value with
 * its baseline value.
 *
 * @param current - Flattened entry of the current run (`key`, `value`, `unit`,
 * `higher_is_better`); its descriptive fields are copied into the result.
 * @param baseline - Flattened entry for the same key taken from the baseline.
 * @returns A record with `delta`, `direction`, `regression` and `improved`.
 * When either value is missing (null, undefined or NaN) or the difference is
 * not a number, `delta` is null, `direction` is "unknown" and both flags are false.
 */
const compareMetric = (current, baseline) => {
    // A field that is absent (undefined) or NaN is treated like null, so it can
    // never produce a NaN delta that would be reported as a "down" movement.
    const usable = (value) => (value === undefined || Number.isNaN(value) ? null : value);
    const currentValue = usable(current.value);
    const baselineValue = usable(baseline.value);
    const described = {
        key: current.key,
        unit: current.unit,
        higher_is_better: current.higher_is_better,
        baseline: baselineValue,
        current: currentValue,
    };
    const rawDelta = currentValue === null || baselineValue === null
        ? Number.NaN
        : currentValue - baselineValue;
    if (Number.isNaN(rawDelta)) {
        return {
            ...described,
            delta: null,
            direction: "unknown",
            regression: false,
            improved: false,
        };
    }
    // Differences within the tolerance count as no change and are reported as 0.
    const flat = Math.abs(rawDelta) <= EPSILON;
    const direction = flat ? "flat" : (rawDelta > 0 ? "up" : "down");
    // A regression is a move against the metric's preferred direction.
    const regression = current.higher_is_better ? rawDelta < -EPSILON : rawDelta > EPSILON;
    const improved = current.higher_is_better ? rawDelta > EPSILON : rawDelta < -EPSILON;
    return {
        ...described,
        delta: flat ? 0 : rawDelta,
        direction,
        regression,
        improved,
    };
};
|
|
106
|
+
/**
 * Compares the current metrics with an optional baseline.
 *
 * Without `params.baseline` the result has status "baseline_missing": every
 * current entry is listed with a null baseline and direction "unknown", and all
 * entries are counted as unchanged. Otherwise each current entry is matched to
 * the baseline entry with the same key and the regression / improved /
 * unchanged counts are derived from the resulting deltas.
 */
export const compareAgainstBaseline = (params) => {
    const currentEntries = flattenMetrics(params.current);
    // Leading fields shared by both result shapes, in output order.
    const describeBaseline = (status) => ({
        schema_version: 1,
        status,
        baseline_report_id: params.baseline_report_id,
        baseline_created_at: params.baseline_created_at,
    });
    if (!params.baseline) {
        const placeholders = currentEntries.map((entry) => ({
            key: entry.key,
            unit: entry.unit,
            higher_is_better: entry.higher_is_better,
            baseline: null,
            current: entry.value,
            delta: null,
            direction: "unknown",
            regression: false,
            improved: false,
        }));
        return {
            ...describeBaseline("baseline_missing"),
            deltas: placeholders,
            regression_count: 0,
            improved_count: 0,
            unchanged_count: currentEntries.length,
        };
    }
    const baselineByKey = new Map();
    flattenMetrics(params.baseline).forEach((entry) => {
        baselineByKey.set(entry.key, entry);
    });
    const deltas = [];
    let regressionCount = 0;
    let improvedCount = 0;
    for (const entry of currentEntries) {
        const match = baselineByKey.get(entry.key);
        if (match) {
            const compared = compareMetric(entry, match);
            deltas.push(compared);
            if (compared.regression) {
                regressionCount += 1;
            }
            if (compared.improved) {
                improvedCount += 1;
            }
        }
    }
    return {
        ...describeBaseline("compared"),
        deltas,
        regression_count: regressionCount,
        improved_count: improvedCount,
        // Whatever is neither a regression nor an improvement, including "unknown" entries.
        unchanged_count: deltas.length - regressionCount - improvedCount,
    };
};
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import type { VerificationOutcome } from "../cognitive/Types.js";
|
|
2
|
+
/** Allowed values of `NormalizedRunRecord.final_status`. */
export type NormalizedRunStatus = "pass" | "fail" | "degraded" | "unknown";
|
|
3
|
+
/** Allowed values of `NormalizedPhaseOutcome.status`. */
export type NormalizedPhaseStatus = "available" | "missing" | "degraded";
|
|
4
|
+
/** Shape of one entry in `NormalizedRunRecord.artifact_references`. */
export interface NormalizedArtifactReference {
    phase: string;
    kind: string;
    /** Either `"present"` or `"missing"`. */
    status: "present" | "missing";
    /** Nullable; NOTE(review): presumably null when the artifact is missing — confirm in the adapter. */
    path: string | null;
    /** Nullable; NOTE(review): presumably set only for missing artifacts — confirm in the adapter. */
    reason_code: string | null;
}
|
|
11
|
+
/**
 * Shape of one entry in `NormalizedRunRecord.phase_outcomes`.
 * Every field except `phase` and `status` is nullable.
 */
export interface NormalizedPhaseOutcome {
    phase: string;
    status: NormalizedPhaseStatus;
    duration_ms: number | null;
    provider: string | null;
    model: string | null;
    input_tokens: number | null;
    output_tokens: number | null;
    total_tokens: number | null;
    cost_usd: number | null;
    cost_source: string | null;
    /** NOTE(review): presumably explains why the token fields are null — confirm in the adapter. */
    missing_usage_reason: string | null;
    /** NOTE(review): presumably explains why `cost_usd` is null — confirm in the adapter. */
    missing_cost_reason: string | null;
}
|
|
25
|
+
/**
 * Normalized description of a single run; this is the return type of
 * `adaptRunSummaryForReport`. Scalar fields are nullable, list fields are
 * always arrays.
 */
export interface NormalizedRunRecord {
    /** Fixed at the literal `1`. */
    schema_version: 1;
    run_id: string | null;
    task_id: string | null;
    fingerprint: string | null;
    duration_ms: number | null;
    final_status: NormalizedRunStatus;
    failure_class: string | null;
    reason_codes: string[];
    retryable: boolean | null;
    verification_outcome: VerificationOutcome | null;
    touched_files: string[];
    artifact_references: NormalizedArtifactReference[];
    missing_artifacts: string[];
    phase_outcomes: NormalizedPhaseOutcome[];
    usage_tokens_total: number | null;
    cost_usd: number | null;
    missing_data_markers: string[];
}
|
|
44
|
+
/**
 * Argument object accepted by `adaptRunSummaryForReport`.
 * Every field is optional.
 */
export interface AdaptRunSummaryInput {
    /** Typed `unknown`, so the declaration places no constraint on its shape. */
    runSummary?: unknown;
    runId?: string;
    taskId?: string;
    /** May be passed as `null` as well as omitted. */
    verificationOutcome?: VerificationOutcome | null;
    touchedFiles?: string[];
}
|
|
51
|
+
/**
 * Produces a `NormalizedRunRecord` from an optional `AdaptRunSummaryInput`.
 * The `input` argument may be omitted entirely.
 */
export declare const adaptRunSummaryForReport: (input?: AdaptRunSummaryInput) => NormalizedRunRecord;
|
|
52
|
+
//# sourceMappingURL=ReportInputAdapter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ReportInputAdapter.d.ts","sourceRoot":"","sources":["../../src/eval/ReportInputAdapter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,uBAAuB,CAAC;AAEjE,MAAM,MAAM,mBAAmB,GAAG,MAAM,GAAG,MAAM,GAAG,UAAU,GAAG,SAAS,CAAC;AAC3E,MAAM,MAAM,qBAAqB,GAAG,WAAW,GAAG,SAAS,GAAG,UAAU,CAAC;AAEzE,MAAM,WAAW,2BAA2B;IAC1C,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,SAAS,GAAG,SAAS,CAAC;IAC9B,IAAI,EAAE,MAAM,GAAG,IAAI,CAAC;IACpB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;CAC5B;AAED,MAAM,WAAW,sBAAsB;IACrC,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,qBAAqB,CAAC;IAC9B,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,oBAAoB,EAAE,MAAM,GAAG,IAAI,CAAC;IACpC,mBAAmB,EAAE,MAAM,GAAG,IAAI,CAAC;CACpC;AAED,MAAM,WAAW,mBAAmB;IAClC,cAAc,EAAE,CAAC,CAAC;IAClB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,YAAY,EAAE,mBAAmB,CAAC;IAClC,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,EAAE,OAAO,GAAG,IAAI,CAAC;IAC1B,oBAAoB,EAAE,mBAAmB,GAAG,IAAI,CAAC;IACjD,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,mBAAmB,EAAE,2BAA2B,EAAE,CAAC;IACnD,iBAAiB,EAAE,MAAM,EAAE,CAAC;IAC5B,cAAc,EAAE,sBAAsB,EAAE,CAAC;IACzC,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,oBAAoB,EAAE,MAAM,EAAE,CAAC;CAChC;AAED,MAAM,WAAW,oBAAoB;IACnC,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,mBAAmB,CAAC,EAAE,mBAAmB,GAAG,IAAI,CAAC;IACjD,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;CACzB;AAmMD,eAAO,MAAM,wBAAwB,GAAI,QAAO,oBAAyB,KAAG,mBAsE3E,CAAC"}
|