@mcoda/codali 0.1.87 → 0.1.89
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/EvalCommand.d.ts +8 -0
- package/dist/cli/EvalCommand.d.ts.map +1 -1
- package/dist/cli/EvalCommand.js +93 -1
- package/dist/cli.d.ts.map +1 -1
- package/dist/cli.js +1 -0
- package/dist/docdex/DocdexClient.d.ts +8 -1
- package/dist/docdex/DocdexClient.d.ts.map +1 -1
- package/dist/docdex/DocdexClient.js +126 -33
- package/dist/eval/CodaliGatewayLiveHarness.d.ts +169 -0
- package/dist/eval/CodaliGatewayLiveHarness.d.ts.map +1 -0
- package/dist/eval/CodaliGatewayLiveHarness.js +824 -0
- package/dist/eval/GatewayEvalSuite.d.ts +202 -0
- package/dist/eval/GatewayEvalSuite.d.ts.map +1 -0
- package/dist/eval/GatewayEvalSuite.js +673 -0
- package/dist/gateway/AgentTierResolver.d.ts +74 -0
- package/dist/gateway/AgentTierResolver.d.ts.map +1 -0
- package/dist/gateway/AgentTierResolver.js +576 -0
- package/dist/gateway/AppToolGatewayDispatcher.d.ts +88 -0
- package/dist/gateway/AppToolGatewayDispatcher.d.ts.map +1 -0
- package/dist/gateway/AppToolGatewayDispatcher.js +381 -0
- package/dist/gateway/CodaliGateway.d.ts +73 -0
- package/dist/gateway/CodaliGateway.d.ts.map +1 -0
- package/dist/gateway/CodaliGateway.js +824 -0
- package/dist/gateway/CodaliGatewaySchemas.d.ts +21 -0
- package/dist/gateway/CodaliGatewaySchemas.d.ts.map +1 -0
- package/dist/gateway/CodaliGatewaySchemas.js +874 -0
- package/dist/gateway/CodaliGatewayStore.d.ts +157 -0
- package/dist/gateway/CodaliGatewayStore.d.ts.map +1 -0
- package/dist/gateway/CodaliGatewayStore.js +206 -0
- package/dist/gateway/CodaliGatewayTypes.d.ts +336 -0
- package/dist/gateway/CodaliGatewayTypes.d.ts.map +1 -0
- package/dist/gateway/CodaliGatewayTypes.js +1 -0
- package/dist/gateway/ContextPackBuilder.d.ts +43 -0
- package/dist/gateway/ContextPackBuilder.d.ts.map +1 -0
- package/dist/gateway/ContextPackBuilder.js +317 -0
- package/dist/gateway/EvidenceNormalizer.d.ts +42 -0
- package/dist/gateway/EvidenceNormalizer.d.ts.map +1 -0
- package/dist/gateway/EvidenceNormalizer.js +488 -0
- package/dist/gateway/GatewayPlanner.d.ts +195 -0
- package/dist/gateway/GatewayPlanner.d.ts.map +1 -0
- package/dist/gateway/GatewayPlanner.js +379 -0
- package/dist/gateway/GatewayPolicyCompiler.d.ts +30 -0
- package/dist/gateway/GatewayPolicyCompiler.d.ts.map +1 -0
- package/dist/gateway/GatewayPolicyCompiler.js +114 -0
- package/dist/gateway/GatewaySecurityPolicy.d.ts +14 -0
- package/dist/gateway/GatewaySecurityPolicy.d.ts.map +1 -0
- package/dist/gateway/GatewaySecurityPolicy.js +350 -0
- package/dist/gateway/GatewayStateMachine.d.ts +165 -0
- package/dist/gateway/GatewayStateMachine.d.ts.map +1 -0
- package/dist/gateway/GatewayStateMachine.js +790 -0
- package/dist/gateway/GatewayTraceReplay.d.ts +120 -0
- package/dist/gateway/GatewayTraceReplay.d.ts.map +1 -0
- package/dist/gateway/GatewayTraceReplay.js +273 -0
- package/dist/gateway/ToolCapabilityCompiler.d.ts +50 -0
- package/dist/gateway/ToolCapabilityCompiler.d.ts.map +1 -0
- package/dist/gateway/ToolCapabilityCompiler.js +442 -0
- package/dist/index.d.ts +33 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +16 -0
- package/dist/runtime/CodaliJobRuntime.d.ts +211 -0
- package/dist/runtime/CodaliJobRuntime.d.ts.map +1 -0
- package/dist/runtime/CodaliJobRuntime.js +590 -0
- package/dist/runtime/CodaliRuntime.d.ts +81 -1
- package/dist/runtime/CodaliRuntime.d.ts.map +1 -1
- package/dist/runtime/CodaliRuntime.js +619 -4
- package/dist/tools/ToolRegistry.d.ts.map +1 -1
- package/dist/tools/ToolRegistry.js +4 -0
- package/dist/tools/ToolTypes.d.ts +1 -1
- package/dist/tools/ToolTypes.d.ts.map +1 -1
- package/dist/tools/ToolTypes.js +5 -1
- package/package.json +3 -3
|
@@ -0,0 +1,673 @@
|
|
|
1
|
+
import { randomUUID } from "node:crypto";
|
|
2
|
+
/**
 * Default suite-level gate thresholds for the gateway eval.
 *
 * Keys ending in `Min` are lower bounds and keys ending in `Max` are upper
 * bounds; they are compared against the aggregated metrics by the gate
 * evaluation in this module.
 */
export const DEFAULT_CODALI_GATEWAY_EVAL_THRESHOLDS = {
    // Rate gates, expressed as fractions in [0, 1].
    plannerSchemaValidityMin: 1,
    evidencePrecisionMin: 1,
    citationSourceCorrectnessMin: 1,
    disabledToolLeakageMax: 0,
    finalAnswerDirectnessMin: 1,
    finalLargeModelRateMin: 1,
    budgetComplianceMin: 1,

    // Absolute p95 ceilings (milliseconds / US dollars).
    latencyP95MsMax: 120000,
    costP95UsdMax: 10,

    // Largest tolerated relative p95 increase versus a baseline.
    latencyRegressionRatioMax: 0.25,
    costRegressionRatioMax: 0.25,
};
|
|
15
|
+
/**
 * Built-in gateway eval cases.
 *
 * Each case carries an `id`, a task `type`, a `prompt`, and an
 * `expectations` object listing the tool allow/deny/required lists, the
 * evidence, image and model-tier requirements, and the per-case budget
 * ceilings (`max*`) that a run record is checked against.
 */
export const CODALI_GATEWAY_EVAL_CASES = [
    // Plain question: no tools may be called at all.
    {
        id: "gateway-generic-question",
        type: "generic_question",
        prompt: "Answer a simple product-neutral question without calling tools.",
        expectations: {
            allowedTools: [],
            deniedTools: ["github_search", "jira_search", "microsoft_graph", "smartclick_search"],
            requiresFinalLargeModel: true,
            maxLatencyMs: 2000,
            maxTokens: 800,
            maxToolCalls: 0,
            maxModelCalls: 3,
            maxCostUsd: 0.05,
        },
    },
    // Repository question answered from Docdex evidence.
    {
        id: "gateway-code-repo-question",
        type: "code_repo_question",
        prompt: "Answer a repository question using Docdex evidence from the tenant repo.",
        expectations: {
            allowedTools: ["docdex_search", "docdex_open"],
            requiredTools: ["docdex_search"],
            requiredSourceTypes: ["docdex"],
            deniedTools: ["shell", "write_file", "git_push"],
            requiresEvidence: true,
            requiresFinalLargeModel: true,
            maxLatencyMs: 5000,
            maxTokens: 2000,
            maxToolCalls: 3,
            maxModelCalls: 4,
            maxCostUsd: 0.15,
        },
    },
    // Tenant-scoped encrypted Docdex search; the local fallback tool is denied.
    {
        id: "gateway-encrypted-docdex-search-question",
        type: "encrypted_docdex_search_question",
        prompt: "Answer a tenant-scoped encrypted Docdex search question.",
        expectations: {
            allowedTools: ["docdex_search", "docdex_open"],
            requiredTools: ["docdex_search"],
            requiredSourceTypes: ["encrypted_docdex"],
            deniedTools: ["shell", "write_file", "local_docdex_fallback"],
            requiresEvidence: true,
            requiresFinalLargeModel: true,
            maxLatencyMs: 8000,
            maxTokens: 2200,
            maxToolCalls: 4,
            maxModelCalls: 4,
            maxCostUsd: 0.2,
        },
    },
    // Question answered through the runtime-provided app tool gateway.
    {
        id: "gateway-product-tool-question",
        type: "product_tool_question",
        prompt: "Answer using a runtime-provided read-only app tool contract.",
        expectations: {
            allowedTools: ["app_tool_gateway"],
            requiredTools: ["app_tool_gateway"],
            requiredSourceTypes: ["app_tool_gateway"],
            deniedTools: ["shell", "write_file", "github_issue_create"],
            requiresEvidence: true,
            requiresFinalLargeModel: true,
            maxLatencyMs: 6000,
            maxTokens: 2000,
            maxToolCalls: 2,
            maxModelCalls: 4,
            maxCostUsd: 0.2,
        },
    },
    // Prompt mentions disabled integrations; none of their tools may be called.
    {
        id: "gateway-disabled-integration-question",
        type: "disabled_integration_question",
        prompt: "Handle a question that mentions disabled SmartClick/GitHub/Jira/Microsoft integrations.",
        expectations: {
            allowedTools: ["docdex_search"],
            deniedTools: ["smartclick_search", "github_search", "jira_search", "microsoft_graph"],
            requiresFinalLargeModel: true,
            maxLatencyMs: 4000,
            maxTokens: 1000,
            maxToolCalls: 1,
            maxModelCalls: 3,
            maxCostUsd: 0.1,
        },
    },
    // Image request: an image artifact must be present on the record.
    {
        id: "gateway-image-generation-question",
        type: "image_generation_question",
        prompt: "Route an image request to the image worker and preserve artifact metadata.",
        expectations: {
            allowedTools: ["image_generate"],
            requiredTools: ["image_generate"],
            deniedTools: ["shell", "write_file"],
            requiresImageArtifact: true,
            requiresFinalLargeModel: true,
            maxLatencyMs: 15000,
            maxTokens: 2500,
            maxToolCalls: 1,
            maxModelCalls: 5,
            maxCostUsd: 0.5,
        },
    },
    // Insufficient evidence: the record must flag that it handled the gap.
    {
        id: "gateway-missing-evidence-question",
        type: "missing_evidence_question",
        prompt: "Handle a repo question with insufficient evidence without fabricating.",
        expectations: {
            allowedTools: ["docdex_search"],
            requiredTools: ["docdex_search"],
            deniedTools: ["shell", "write_file"],
            expectsMissingEvidence: true,
            requiresFinalLargeModel: true,
            maxLatencyMs: 4000,
            maxTokens: 1200,
            maxToolCalls: 2,
            maxModelCalls: 4,
            maxCostUsd: 0.15,
        },
    },
];
|
|
145
|
+
/**
 * Builds a rate record from raw counts.
 *
 * @param {number} numerator - Count of matching results.
 * @param {number} denominator - Count of results the rate applies to.
 * @param {number} [missing=0] - Count of results left out of the rate.
 * @returns {{numerator: number, denominator: number, missing: number, value: (number|null)}}
 *   `value` is `null` when the denominator is zero, otherwise the ratio.
 */
const rateMetric = (numerator, denominator, missing = 0) => {
    // A zero denominator has no defined ratio; report null rather than NaN/Infinity.
    const value = denominator === 0 ? null : numerator / denominator;
    return { numerator, denominator, missing, value };
};
|
|
151
|
+
/**
 * Summarises a list of numeric samples as median and p95.
 *
 * Entries that are `undefined` or not finite numbers are dropped and counted
 * as `missing`. Both statistics pick an existing sample by rank; no
 * interpolation between samples is performed.
 *
 * @param {Array<number|undefined>} values - Raw per-case values.
 * @returns {{sample_size: number, missing: number, median: (number|null), p95: (number|null)}}
 */
const percentileMetric = (values) => {
    const usable = [];
    for (const candidate of values) {
        if (candidate !== undefined && Number.isFinite(candidate)) {
            usable.push(candidate);
        }
    }
    usable.sort((a, b) => a - b);

    const count = usable.length;
    const missing = values.length - count;
    if (count === 0) {
        return { sample_size: 0, missing, median: null, p95: null };
    }

    // Lower-middle element for even counts.
    const medianRank = Math.floor((count - 1) / 2);
    // Nearest-rank p95, clamped to the last sample.
    const p95Rank = Math.min(count - 1, Math.ceil(count * 0.95) - 1);
    return {
        sample_size: count,
        missing,
        median: usable[medianRank] ?? null,
        p95: usable[p95Rank] ?? null,
    };
};
|
|
166
|
+
/**
 * Checks that every expected entry occurs in `actual`.
 *
 * @param {Iterable<*>} actual - Values that were observed.
 * @param {Array<*>} [expected] - Values that must all be present.
 * @returns {boolean} `true` when `expected` is missing or empty, or when
 *   none of its entries is absent from `actual`.
 */
const includesAll = (actual, expected) => {
    const nothingExpected = !expected || expected.length === 0;
    if (nothingExpected) {
        return true;
    }
    const observed = new Set(actual);
    const hasGap = expected.some((entry) => !observed.has(entry));
    return !hasGap;
};
|
|
172
|
+
/**
 * Checks whether any denied entry occurs in `actual`.
 *
 * @param {Iterable<*>} actual - Values that were observed.
 * @param {Array<*>} [denied] - Values that must not be present.
 * @returns {boolean} `false` when `denied` is missing or empty; otherwise
 *   `true` as soon as one denied entry is found in `actual`.
 */
const intersects = (actual, denied) => {
    const nothingDenied = !denied || denied.length === 0;
    if (nothingDenied) {
        return false;
    }
    const observed = new Set(actual);
    const allAbsent = denied.every((entry) => !observed.has(entry));
    return !allAbsent;
};
|
|
178
|
+
/**
 * Detects calls to tools outside the allow-list.
 *
 * @param {string[]} calledTools - Tools the run actually called.
 * @param {string[]} [allowedTools] - Allow-list; when falsy no restriction
 *   is applied. An empty list means no tool may be called.
 * @returns {boolean} `true` when at least one called tool is not allowed.
 */
const usedUnallowedTool = (calledTools, allowedTools) => {
    if (!allowedTools) {
        return false;
    }
    const permitted = new Set(allowedTools);
    const everyCallPermitted = calledTools.every((tool) => permitted.has(tool));
    return !everyCallPermitted;
};
|
|
184
|
+
/**
 * Computes the share of a record's evidence items that count as relevant.
 *
 * An item counts as relevant unless its `relevant` field is exactly `false`.
 *
 * @param {{evidence: Array<{relevant?: boolean}>}} record - Run record.
 * @param {{expectations: {requiresEvidence?: boolean}}} evalCase - Eval case.
 * @returns {number|null} `null` when the case does not require evidence,
 *   `0` when evidence is required but none was returned, otherwise the
 *   relevant fraction.
 */
const calculateEvidencePrecision = (record, evalCase) => {
    if (!evalCase.expectations.requiresEvidence) {
        return null;
    }
    const { evidence } = record;
    const total = evidence.length;
    if (total === 0) {
        return 0;
    }
    let relevantCount = 0;
    for (const item of evidence) {
        if (item.relevant !== false) {
            relevantCount += 1;
        }
    }
    return relevantCount / total;
};
|
|
192
|
+
/**
 * Decides whether a citation can be tied back to collected evidence.
 *
 * A citation matches when its `evidenceId` is a key of `evidenceById`, or
 * when its `sourceType` is a member of `evidenceSourceTypes`.
 *
 * @param {{evidenceId?: string, sourceType?: string}} citation
 * @param {Map<string, *>} evidenceById - Evidence items keyed by id.
 * @param {Set<string>} evidenceSourceTypes - Source types present in the evidence.
 * @returns {boolean}
 */
const citationMatchesEvidence = (citation, evidenceById, evidenceSourceTypes) => {
    const { evidenceId, sourceType } = citation;
    const matchesById = Boolean(evidenceId) && evidenceById.has(evidenceId);
    return matchesById || (Boolean(sourceType) && evidenceSourceTypes.has(sourceType));
};
|
|
199
|
+
/**
 * Validates the citations of a run record against its evidence and the
 * source types the eval case requires.
 *
 * @param {{evidence: Array<object>, citations: Array<object>}} record - Run record.
 * @param {{expectations: {requiresEvidence?: boolean, requiredSourceTypes?: string[]}}} evalCase
 * @returns {boolean} `true` when the case neither requires evidence nor
 *   lists required source types. Otherwise `true` only if the record has at
 *   least one citation, every citation matches the evidence, and all
 *   required source types are among the cited ones.
 */
const hasCorrectCitations = (record, evalCase) => {
    const { expectations } = evalCase;
    const requiredSourceTypes = expectations.requiredSourceTypes ?? [];
    const citationsApply = expectations.requiresEvidence || requiredSourceTypes.length > 0;
    if (!citationsApply) {
        return true;
    }
    if (record.citations.length === 0) {
        return false;
    }

    // Index the evidence once: by id, by source type, and by "cited" flag.
    const evidenceById = new Map();
    const evidenceSourceTypes = new Set();
    const citedSourceTypes = new Set();
    for (const item of record.evidence) {
        evidenceById.set(item.id, item);
        evidenceSourceTypes.add(item.sourceType);
        if (item.cited) {
            citedSourceTypes.add(item.sourceType);
        }
    }

    for (const citation of record.citations) {
        // A single citation that cannot be tied to evidence fails the whole record.
        if (!citationMatchesEvidence(citation, evidenceById, evidenceSourceTypes)) {
            return false;
        }
        if (citation.sourceType) {
            citedSourceTypes.add(citation.sourceType);
        }
        const linked = citation.evidenceId ? evidenceById.get(citation.evidenceId) : undefined;
        if (linked) {
            citedSourceTypes.add(linked.sourceType);
        }
    }
    return includesAll(citedSourceTypes, requiredSourceTypes);
};
|
|
226
|
+
/**
 * Checks a run record against the per-case budget ceilings.
 *
 * A ceiling that is `undefined` is skipped. When a ceiling is set, the
 * matching record value must be defined and less than or equal to it.
 *
 * @param {object} record - Run record; the tool-call count falls back to
 *   `calledTools.length` when `toolCallCount` is nullish.
 * @param {{expectations: object}} evalCase - Case holding the `max*` ceilings.
 * @returns {boolean} `true` when every configured ceiling is respected.
 */
const isBudgetCompliant = (record, evalCase) => {
    const limits = evalCase.expectations;
    const withinLimit = (actual, max) => {
        if (max === undefined) {
            return true;
        }
        return actual !== undefined && actual <= max;
    };
    // Resolved up front so a record without calledTools fails the same way regardless of other checks.
    const toolCalls = record.toolCallCount ?? record.calledTools.length;
    return withinLimit(record.latencyMs, limits.maxLatencyMs)
        && withinLimit(record.tokensUsed, limits.maxTokens)
        && withinLimit(toolCalls, limits.maxToolCalls)
        && withinLimit(record.modelCallCount, limits.maxModelCalls)
        && withinLimit(record.costUsd, limits.maxCostUsd);
};
|
|
237
|
+
/**
 * Scores one run record against its eval case.
 *
 * @param {object} evalCase - Case definition (`id`, `type`, `expectations`).
 * @param {object} record - Record produced by a runner for that case.
 * @param {object} [thresholds=DEFAULT_CODALI_GATEWAY_EVAL_THRESHOLDS] -
 *   Suite thresholds; only `evidencePrecisionMin` is read here. A partial
 *   object is accepted and the default minimum fills the gap.
 * @returns {object} Per-case result: the individual check outcomes, the
 *   list of failure codes, copies of the record's warnings/errors, and the
 *   record's budget figures. `passed` is `true` only when no failure code
 *   was collected.
 */
export const evaluateCodaliGatewayEvalCase = (evalCase, record, thresholds = DEFAULT_CODALI_GATEWAY_EVAL_THRESHOLDS) => {
    const failures = [];
    const warnings = [...(record.warnings ?? [])];
    const errors = [...(record.errors ?? [])];
    const expectations = evalCase.expectations;
    // Coerced so the result always carries a real boolean, never undefined/0/"".
    const plannerSchemaValid = Boolean(record.plannerSchemaValid
        && record.taskType === evalCase.type
        && (!record.selectedTaskType || record.selectedTaskType === evalCase.type));
    const evidencePrecision = calculateEvidencePrecision(record, evalCase);
    // Per-case override first, then the caller's thresholds, then the module
    // default, so a partial thresholds object cannot yield an undefined minimum
    // (which would make every comparison below false).
    const minEvidencePrecision = expectations.minEvidencePrecision
        ?? thresholds?.evidencePrecisionMin
        ?? DEFAULT_CODALI_GATEWAY_EVAL_THRESHOLDS.evidencePrecisionMin;
    const evidencePrecisionPassed = !expectations.requiresEvidence
        || (evidencePrecision !== null && evidencePrecision >= minEvidencePrecision);
    const citationSourceCorrect = hasCorrectCitations(record, evalCase);
    const disabledToolLeakageDetected = intersects(record.calledTools, expectations.deniedTools);
    const unallowedToolCalled = usedUnallowedTool(record.calledTools, expectations.allowedTools);
    // A missing or non-string answer is reported as "not direct" instead of
    // throwing from `.trim()`.
    const finalAnswerDirect = record.finalAnswerDirect === true
        && typeof record.finalAnswer === "string"
        && record.finalAnswer.trim().length > 0;
    const finalLargeModelUsed = !expectations.requiresFinalLargeModel
        || record.finalModelTier === "large";
    const budgetCompliant = isBudgetCompliant(record, evalCase);
    // Either a uri or an id is accepted as proof of an image artifact.
    const imageArtifactPresent = !expectations.requiresImageArtifact
        || !!record.imageArtifact?.uri
        || !!record.imageArtifact?.id;
    const missingEvidenceHandled = !expectations.expectsMissingEvidence
        || record.missingEvidenceHandled === true;
    if (record.status !== "passed")
        failures.push("gateway_eval_runner_status_failed");
    if (!plannerSchemaValid)
        failures.push("gateway_planner_schema_or_task_type_invalid");
    if (!includesAll(record.calledTools, expectations.requiredTools)) {
        failures.push("gateway_required_tool_not_called");
    }
    if (unallowedToolCalled)
        failures.push("gateway_unallowed_tool_called");
    if (!evidencePrecisionPassed)
        failures.push("gateway_evidence_precision_below_threshold");
    if (!citationSourceCorrect)
        failures.push("gateway_citation_source_incorrect");
    if (disabledToolLeakageDetected)
        failures.push("gateway_disabled_tool_leakage_detected");
    if (!finalAnswerDirect)
        failures.push("gateway_final_answer_not_direct");
    if (!finalLargeModelUsed)
        failures.push("gateway_final_large_model_missing");
    if (!budgetCompliant)
        failures.push("gateway_budget_exceeded");
    if (!imageArtifactPresent)
        failures.push("gateway_image_artifact_missing");
    if (!missingEvidenceHandled)
        failures.push("gateway_missing_evidence_not_handled");
    return {
        caseId: evalCase.id,
        taskType: evalCase.type,
        status: failures.length === 0 ? "passed" : "failed",
        passed: failures.length === 0,
        plannerSchemaValid,
        evidencePrecision,
        evidencePrecisionPassed,
        citationSourceCorrect,
        disabledToolLeakageDetected,
        finalAnswerDirect,
        finalLargeModelUsed,
        budgetCompliant,
        imageArtifactPresent,
        missingEvidenceHandled,
        calledTools: [...record.calledTools],
        failures,
        warnings,
        errors,
        latencyMs: record.latencyMs,
        tokensUsed: record.tokensUsed,
        costUsd: record.costUsd,
        toolCallCount: record.toolCallCount ?? record.calledTools.length,
        modelCallCount: record.modelCallCount,
    };
};
|
|
312
|
+
/**
 * Rolls per-case results up into suite-level metrics.
 *
 * @param {Array<object>} cases - Per-case results as produced by the case
 *   evaluation in this module.
 * @returns {object} Metrics object with `schemaVersion`, a `generatedAt`
 *   ISO timestamp taken at call time, `taskCount`, one rate record per
 *   check, and median/p95 summaries of the budget figures.
 */
export const aggregateCodaliGatewayEvalMetrics = (cases) => {
    const taskCount = cases.length;
    const countWhere = (list, predicate) => list.filter(predicate).length;
    const column = (field) => cases.map((result) => result[field]);

    // Only cases with a computed evidence precision count toward the evidence rate.
    const hasEvidenceScore = (result) => result.evidencePrecision !== null;
    const evidenceCases = cases.filter(hasEvidenceScore);
    // NOTE(review): the citation cohort uses the same predicate as the evidence
    // cohort, so a case with a null evidence precision is left out of the
    // citation rate even if its citations were checked — confirm this is intended.
    const citationCases = cases.filter(hasEvidenceScore);
    // A case belongs to the large-model cohort if the check passed or explicitly failed.
    const finalLargeCases = cases.filter((result) => (
        result.finalLargeModelUsed || result.failures.includes("gateway_final_large_model_missing")
    ));

    return {
        schemaVersion: 1,
        generatedAt: new Date().toISOString(),
        taskCount,
        plannerSchemaValidityRate: rateMetric(countWhere(cases, (result) => result.plannerSchemaValid), taskCount),
        evidencePrecisionRate: rateMetric(
            countWhere(evidenceCases, (result) => result.evidencePrecisionPassed),
            evidenceCases.length,
            taskCount - evidenceCases.length,
        ),
        citationSourceCorrectnessRate: rateMetric(
            countWhere(citationCases, (result) => result.citationSourceCorrect),
            citationCases.length,
            taskCount - citationCases.length,
        ),
        disabledToolLeakageRate: rateMetric(countWhere(cases, (result) => result.disabledToolLeakageDetected), taskCount),
        finalAnswerDirectnessRate: rateMetric(countWhere(cases, (result) => result.finalAnswerDirect), taskCount),
        finalLargeModelRate: rateMetric(
            countWhere(finalLargeCases, (result) => result.finalLargeModelUsed),
            finalLargeCases.length,
            taskCount - finalLargeCases.length,
        ),
        budgetComplianceRate: rateMetric(countWhere(cases, (result) => result.budgetCompliant), taskCount),
        latencyMs: percentileMetric(column("latencyMs")),
        tokensUsed: percentileMetric(column("tokensUsed")),
        costUsd: percentileMetric(column("costUsd")),
        toolCallCount: percentileMetric(column("toolCallCount")),
        modelCallCount: percentileMetric(column("modelCallCount")),
    };
};
|
|
335
|
+
/**
 * Normalises a baseline argument into `{ metrics, reportId, generatedAt }`.
 *
 * Two shapes are accepted: an object with a `metrics` key (a full report),
 * or a bare metrics object.
 *
 * @param {object} [baseline] - Baseline report or metrics; may be omitted.
 * @returns {{metrics?: object, reportId?: string, generatedAt?: string}}
 *   An empty object when no baseline is given.
 */
const comparisonInput = (baseline) => {
    if (!baseline) {
        return {};
    }
    if ("metrics" in baseline) {
        return {
            metrics: baseline.metrics,
            reportId: baseline.reportId,
            // A report whose `metrics` is null/undefined must not throw here.
            generatedAt: baseline.metrics?.generatedAt,
        };
    }
    return { metrics: baseline, generatedAt: baseline.generatedAt };
};
|
|
347
|
+
/**
 * Builds a delta record comparing a current value with a baseline value.
 *
 * @param {string} key - Metric identifier copied into the record.
 * @param {string} unit - Unit label copied into the record.
 * @param {number|null} current - Current value.
 * @param {number|null} baseline - Baseline value.
 * @returns {object} Record with `delta` (current minus baseline),
 *   `relativeDelta` (delta divided by baseline, `null` for a zero baseline)
 *   and `regression` (`true` when delta is positive). When either input is
 *   `null`, both deltas are `null` and `regression` is `false`.
 */
const metricDelta = (key, unit, current, baseline) => {
    const comparable = current !== null && baseline !== null;
    const delta = comparable ? current - baseline : null;
    // A zero baseline has no meaningful relative change.
    const relativeDelta = comparable && baseline !== 0 ? delta / baseline : null;
    const regression = comparable && delta > 0;
    return { key, unit, baseline, current, delta, relativeDelta, regression };
};
|
|
371
|
+
/**
 * Compares current suite metrics against an optional baseline.
 *
 * @param {{current: object, baseline?: object}} params - `current` is a
 *   metrics object; `baseline` is either a full report or a metrics object.
 * @returns {object} Regression summary: `status` is `"compared"` when
 *   baseline metrics exist and `"baseline_missing"` otherwise; `deltas`
 *   holds the p95 deltas for latency, cost and tokens; `regressionCount` is
 *   the number of deltas flagged as regressions.
 */
export const compareCodaliGatewayEvalBaseline = (params) => {
    const baseline = comparisonInput(params.baseline);
    // A baseline that lacks one of the percentile sections (for example a
    // partial or older report) is treated as "no baseline value" for that
    // metric instead of throwing.
    const baselineP95 = (field) => baseline.metrics?.[field]?.p95 ?? null;
    const deltas = [
        metricDelta("latency_ms.p95", "ms", params.current.latencyMs.p95, baselineP95("latencyMs")),
        metricDelta("cost_usd.p95", "usd", params.current.costUsd.p95, baselineP95("costUsd")),
        metricDelta("tokens_used.p95", "tokens", params.current.tokensUsed.p95, baselineP95("tokensUsed")),
    ];
    return {
        schemaVersion: 1,
        status: baseline.metrics ? "compared" : "baseline_missing",
        baselineReportId: baseline.reportId,
        baselineGeneratedAt: baseline.generatedAt,
        deltas,
        regressionCount: deltas.filter((delta) => delta.regression).length,
    };
};
|
|
387
|
+
/**
 * Tells whether a metric value violates a lower bound.
 *
 * @param {number|null} value - Metric value; `null` counts as a violation.
 * @param {number} minimum - Lower bound.
 * @returns {boolean}
 */
const valueBelowMinimum = (value, minimum) => {
    if (value === null) {
        return true;
    }
    return value < minimum;
};
|
|
388
|
+
/**
 * Tells whether a metric value violates an upper bound.
 *
 * @param {number|null} value - Metric value; `null` counts as a violation.
 * @param {number} maximum - Upper bound.
 * @returns {boolean}
 */
const valueAboveMaximum = (value, maximum) => {
    if (value === null) {
        return true;
    }
    return value > maximum;
};
|
|
389
|
+
/**
 * Appends `params` to `failures` when `params.actual` violates the lower
 * bound `params.threshold`.
 *
 * @param {Array<object>} failures - Collector, mutated in place.
 * @param {{actual: (number|null), threshold: number}} params - Gate
 *   descriptor; pushed as-is when the gate fails.
 * @returns {void}
 */
const addRateMinimumFailure = (failures, params) => {
    const { actual, threshold } = params;
    if (!valueBelowMinimum(actual, threshold)) {
        return;
    }
    failures.push(params);
};
|
|
394
|
+
/**
 * Appends `params` to `failures` when `params.actual` violates the upper
 * bound `params.threshold`.
 *
 * @param {Array<object>} failures - Collector, mutated in place.
 * @param {{actual: (number|null), threshold: number}} params - Gate
 *   descriptor; pushed as-is when the gate fails.
 * @returns {void}
 */
const addRateMaximumFailure = (failures, params) => {
    const { actual, threshold } = params;
    if (!valueAboveMaximum(actual, threshold)) {
        return;
    }
    failures.push(params);
};
|
|
399
|
+
/**
 * Looks up a delta record by key in a regression summary.
 *
 * @param {{status: string, deltas: Array<{key: string}>}} [regression]
 * @param {string} key - Delta key to find.
 * @returns {object|undefined} The first delta with that key, or `undefined`
 *   when the summary is missing, its status is not `"compared"`, or no
 *   delta matches.
 */
const findRegressionDelta = (regression, key) => {
    const comparable = Boolean(regression) && regression.status === "compared";
    if (!comparable) {
        return undefined;
    }
    for (const candidate of regression.deltas) {
        if (candidate.key === key) {
            return candidate;
        }
    }
    return undefined;
};
|
|
404
|
+
/**
 * Applies the suite-level gates to aggregated metrics.
 *
 * @param {{metrics: object, thresholds?: object, regression?: object}} params
 *   `thresholds` is merged over the module defaults; `regression` is the
 *   baseline comparison summary, if any.
 * @returns {{schemaVersion: number, passed: boolean, thresholds: object, failures: Array<object>}}
 *   `failures` lists one descriptor per violated gate, in evaluation order;
 *   `passed` is `true` when that list is empty.
 */
export const evaluateCodaliGatewayEvalGates = (params) => {
    const thresholds = Object.assign({}, DEFAULT_CODALI_GATEWAY_EVAL_THRESHOLDS, params.thresholds ?? {});
    const { metrics } = params;
    const failures = [];

    // Rate gates, evaluated in this order: [checker, code, metric, message, threshold].
    const rateGates = [
        [addRateMinimumFailure, "gateway_planner_schema_validity_below_min", "plannerSchemaValidityRate",
            "Planner schema validity is below threshold.", thresholds.plannerSchemaValidityMin],
        [addRateMinimumFailure, "gateway_evidence_precision_below_min", "evidencePrecisionRate",
            "Evidence precision is below threshold.", thresholds.evidencePrecisionMin],
        [addRateMinimumFailure, "gateway_citation_source_correctness_below_min", "citationSourceCorrectnessRate",
            "Citation/source correctness is below threshold.", thresholds.citationSourceCorrectnessMin],
        [addRateMaximumFailure, "gateway_disabled_tool_leakage_exceeded", "disabledToolLeakageRate",
            "Disabled-tool leakage exceeded threshold.", thresholds.disabledToolLeakageMax],
        [addRateMinimumFailure, "gateway_final_answer_directness_below_min", "finalAnswerDirectnessRate",
            "Final-answer directness is below threshold.", thresholds.finalAnswerDirectnessMin],
        [addRateMinimumFailure, "gateway_final_large_model_rate_below_min", "finalLargeModelRate",
            "Final answers were not produced by the required large tier.", thresholds.finalLargeModelRateMin],
        [addRateMinimumFailure, "gateway_budget_compliance_below_min", "budgetComplianceRate",
            "Gateway budget compliance is below threshold.", thresholds.budgetComplianceMin],
    ];
    for (const [addFailure, code, metric, message, threshold] of rateGates) {
        addFailure(failures, { code, metric, message, threshold, actual: metrics[metric].value });
    }

    // Absolute p95 ceilings; a ceiling explicitly set to undefined disables its gate.
    if (thresholds.latencyP95MsMax !== undefined) {
        addRateMaximumFailure(failures, {
            code: "gateway_latency_p95_exceeded",
            metric: "latencyMs.p95",
            message: "Gateway p95 latency exceeded threshold.",
            threshold: thresholds.latencyP95MsMax,
            actual: metrics.latencyMs.p95,
        });
    }
    if (thresholds.costP95UsdMax !== undefined) {
        addRateMaximumFailure(failures, {
            code: "gateway_cost_p95_exceeded",
            metric: "costUsd.p95",
            message: "Gateway p95 cost exceeded threshold.",
            threshold: thresholds.costP95UsdMax,
            actual: metrics.costUsd.p95,
        });
    }

    // Regression gates fire only when a relative delta exists and exceeds the allowed ratio.
    const checkRegression = (deltaKey, maxRatio, code, message) => {
        const entry = findRegressionDelta(params.regression, deltaKey);
        const ratio = entry?.relativeDelta;
        if (ratio === null || ratio === undefined || !(ratio > maxRatio)) {
            return;
        }
        failures.push({
            code,
            metric: deltaKey,
            message,
            threshold: maxRatio,
            actual: entry.current,
            baseline: entry.baseline,
            delta: ratio,
        });
    };
    checkRegression("latency_ms.p95", thresholds.latencyRegressionRatioMax,
        "gateway_latency_regression_exceeded", "Gateway p95 latency regressed beyond the allowed ratio.");
    checkRegression("cost_usd.p95", thresholds.costRegressionRatioMax,
        "gateway_cost_regression_exceeded", "Gateway p95 cost regressed beyond the allowed ratio.");

    return {
        schemaVersion: 1,
        passed: failures.length === 0,
        thresholds,
        failures,
    };
};
|
|
513
|
+
/**
 * Produces the synthetic evidence list used by the default runner.
 *
 * @param {{id: string, expectations: {requiredSourceTypes?: string[]}}} evalCase
 * @returns {Array<object>} An empty list when the case names no required
 *   source type; otherwise a single cited, relevant evidence item built
 *   from the first required source type.
 */
const defaultEvidenceForCase = (evalCase) => {
    const sourceTypes = evalCase.expectations.requiredSourceTypes ?? [];
    const [primaryType] = sourceTypes;
    if (!primaryType) {
        return [];
    }
    const item = {
        id: `${evalCase.id}-evidence-1`,
        sourceType: primaryType,
        sourceId: `${primaryType}:tenant-scope-1`,
        cited: true,
        relevant: true,
    };
    return [item];
};
|
|
525
|
+
/**
 * Creates the built-in runner, which fabricates a record from the case's
 * own expectations instead of executing anything.
 *
 * The record reports status `"passed"`, calls exactly the required tools,
 * cites the synthetic evidence, and keeps every budget figure at or below
 * both the case ceiling and a small fixed value.
 *
 * @returns {(evalCase: object) => Promise<object>} Async runner.
 */
export const createDefaultCodaliGatewayEvalRunner = () => {
    // Smaller of the case ceiling (or the fallback when unset) and the fallback itself.
    const capped = (ceiling, fallback) => Math.min(ceiling ?? fallback, fallback);

    return async (evalCase) => {
        const { expectations } = evalCase;
        const evidence = defaultEvidenceForCase(evalCase);
        const calledTools = expectations.requiredTools ? [...expectations.requiredTools] : [];
        const citations = evidence.map(({ id, sourceType, sourceId }) => ({
            evidenceId: id,
            sourceType,
            sourceId,
        }));

        let finalAnswer = "Tenant-scoped evidence supports this direct answer.";
        if (expectations.expectsMissingEvidence) {
            finalAnswer = "I do not have enough tenant-scoped evidence to answer that directly.";
        }

        let imageArtifact;
        if (expectations.requiresImageArtifact) {
            imageArtifact = {
                id: `${evalCase.id}-artifact`,
                uri: "artifact://codali-gateway-eval/image.png",
                mimeType: "image/png",
            };
        }

        return {
            caseId: evalCase.id,
            taskType: evalCase.type,
            status: "passed",
            plannerSchemaValid: true,
            selectedTaskType: evalCase.type,
            calledTools,
            evidence,
            citations,
            finalAnswer,
            finalAnswerDirect: true,
            finalModelTier: expectations.requiresFinalLargeModel ? "large" : "medium",
            latencyMs: capped(expectations.maxLatencyMs, 1000),
            tokensUsed: capped(expectations.maxTokens, 500),
            costUsd: capped(expectations.maxCostUsd, 0.01),
            toolCallCount: calledTools.length,
            modelCallCount: capped(expectations.maxModelCalls, 2),
            imageArtifact,
            missingEvidenceHandled: expectations.expectsMissingEvidence ? true : undefined,
            warnings: [],
            errors: [],
        };
    };
};
|
|
562
|
+
/**
 * Runs the gateway eval suite and assembles the report.
 *
 * Cases are executed one after another. A case whose runner or evaluation
 * throws is recorded as a failed result with the code
 * `gateway_eval_runner_threw` instead of aborting the suite.
 *
 * @param {object} [options]
 * @param {string} [options.runId] - Defaults to a random UUID.
 * @param {string} [options.reportId] - Defaults to a random UUID.
 * @param {Array<object>} [options.cases] - Defaults to the built-in cases.
 * @param {Function} [options.runner] - Defaults to the built-in runner.
 * @param {object} [options.thresholds] - Merged over the default thresholds.
 * @param {object} [options.baseline] - Baseline report or metrics to compare with.
 * @returns {Promise<object>} Report with per-case results, metrics,
 *   regression summary, gate outcome, summary counts, and the collected
 *   warnings/errors prefixed with their case id.
 */
export const runCodaliGatewayEvalSuite = async (options = {}) => {
    const runId = options.runId ?? randomUUID();
    const reportId = options.reportId ?? randomUUID();
    const evalCases = options.cases ?? CODALI_GATEWAY_EVAL_CASES;
    const runner = options.runner ?? createDefaultCodaliGatewayEvalRunner();
    const thresholds = Object.assign({}, DEFAULT_CODALI_GATEWAY_EVAL_THRESHOLDS, options.thresholds ?? {});

    // Result recorded for a case that threw: every check that needs a record is
    // marked failed; the image / missing-evidence checks fail only when the
    // case actually requires them.
    const crashedResult = (evalCase, message) => ({
        caseId: evalCase.id,
        taskType: evalCase.type,
        status: "failed",
        passed: false,
        plannerSchemaValid: false,
        evidencePrecision: null,
        evidencePrecisionPassed: false,
        citationSourceCorrect: false,
        disabledToolLeakageDetected: false,
        finalAnswerDirect: false,
        finalLargeModelUsed: false,
        budgetCompliant: false,
        imageArtifactPresent: !evalCase.expectations.requiresImageArtifact,
        missingEvidenceHandled: !evalCase.expectations.expectsMissingEvidence,
        calledTools: [],
        failures: ["gateway_eval_runner_threw"],
        warnings: [],
        errors: [message],
    });

    const startedAtMs = Date.now();
    const startedAt = new Date(startedAtMs).toISOString();
    const results = [];
    const warnings = [];
    const errors = [];

    for (const evalCase of evalCases) {
        const tag = (text) => `${evalCase.id}:${text}`;
        try {
            const record = await runner(evalCase);
            const result = evaluateCodaliGatewayEvalCase(evalCase, record, thresholds);
            results.push(result);
            warnings.push(...result.warnings.map((warning) => tag(warning)));
            errors.push(...result.errors.map((error) => tag(error)));
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            errors.push(tag(message));
            results.push(crashedResult(evalCase, message));
        }
    }

    const metrics = aggregateCodaliGatewayEvalMetrics(results);
    const regression = compareCodaliGatewayEvalBaseline({ current: metrics, baseline: options.baseline });
    const gates = evaluateCodaliGatewayEvalGates({ metrics, thresholds, regression });
    const endedAtMs = Date.now();

    const passed = results.filter((result) => result.passed).length;
    const failed = results.length - passed;
    // The suite passes only when every case passed and every gate held.
    const suitePassed = failed === 0 && gates.passed;

    return {
        schemaVersion: 1,
        reportId,
        runId,
        runtime: "codali_gateway_eval",
        mode: "gateway_smoke",
        startedAt,
        endedAt: new Date(endedAtMs).toISOString(),
        durationMs: endedAtMs - startedAtMs,
        cases: results,
        metrics,
        regression,
        gates,
        summary: {
            status: suitePassed ? "passed" : "failed",
            total: results.length,
            passed,
            failed,
        },
        warnings,
        errors,
    };
};
|
|
645
|
+
/**
 * Renders a rate record as a percentage with two decimals.
 *
 * @param {{value: (number|null)}} metric - Rate record.
 * @returns {string} `"n/a"` when the value is `null`, e.g. `"50.00%"` otherwise.
 */
const formatRate = (metric) => {
    const { value } = metric;
    if (value === null) {
        return "n/a";
    }
    const percent = (value * 100).toFixed(2);
    return `${percent}%`;
};
|
|
646
|
+
/**
 * Renders a numeric value with two decimals and an optional suffix.
 *
 * @param {number|null} value - Value to render.
 * @param {string} [suffix=""] - Text appended directly after the number.
 * @returns {string} `"n/a"` when the value is `null`.
 */
const formatValue = (value, suffix = "") => {
    if (value === null) {
        return "n/a";
    }
    return `${value.toFixed(2)}${suffix}`;
};
|
|
647
|
+
/**
 * Renders a suite report as plain text, one line per fact.
 *
 * The output starts with the overall status, run id and pass count,
 * followed by each rate, the median/p95 figures, the regression status and
 * the gate outcome, and ends with one line per case (failure codes in
 * parentheses when present).
 *
 * @param {object} report - Report as returned by the suite runner.
 * @returns {string} Lines joined with `"\n"`.
 */
export const formatCodaliGatewayEvalTextReport = (report) => {
    const { summary, metrics, gates } = report;

    let gateSummary = "passed";
    if (!gates.passed) {
        const codes = gates.failures.map((failure) => failure.code).join(", ");
        gateSummary = `failed (${codes})`;
    }

    const headerLines = [
        `Codali gateway eval smoke: ${summary.status}`,
        `Run: ${report.runId}`,
        `Cases: ${summary.passed}/${summary.total} passed`,
        `Planner schema validity: ${formatRate(metrics.plannerSchemaValidityRate)}`,
        `Evidence precision: ${formatRate(metrics.evidencePrecisionRate)}`,
        `Citation/source correctness: ${formatRate(metrics.citationSourceCorrectnessRate)}`,
        `Disabled-tool leakage: ${formatRate(metrics.disabledToolLeakageRate)}`,
        `Final-answer directness: ${formatRate(metrics.finalAnswerDirectnessRate)}`,
        `Final large-model usage: ${formatRate(metrics.finalLargeModelRate)}`,
        `Budget compliance: ${formatRate(metrics.budgetComplianceRate)}`,
        `Latency median/p95: ${formatValue(metrics.latencyMs.median, "ms")}/${formatValue(metrics.latencyMs.p95, "ms")}`,
        `Tokens median/p95: ${formatValue(metrics.tokensUsed.median)}/${formatValue(metrics.tokensUsed.p95)}`,
        `Cost median/p95: ${formatValue(metrics.costUsd.median, " USD")}/${formatValue(metrics.costUsd.p95, " USD")}`,
        `Regression: ${report.regression.status}`,
        `Gates: ${gateSummary}`,
    ];

    const caseLines = report.cases.map((result) => {
        const detail = result.failures.length > 0 ? ` (${result.failures.join(", ")})` : "";
        return `Case ${result.caseId}: ${result.status}${detail}`;
    });

    return [...headerLines, ...caseLines].join("\n");
};
|