@salesforce/plugin-agent 1.30.10 → 1.31.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -19
- package/lib/commands/agent/test/run-eval.d.ts +33 -0
- package/lib/commands/agent/test/run-eval.js +221 -0
- package/lib/commands/agent/test/run-eval.js.map +1 -0
- package/lib/evalFormatter.d.ts +30 -0
- package/lib/evalFormatter.js +263 -0
- package/lib/evalFormatter.js.map +1 -0
- package/lib/evalNormalizer.d.ts +57 -0
- package/lib/evalNormalizer.js +421 -0
- package/lib/evalNormalizer.js.map +1 -0
- package/lib/yamlSpecTranslator.d.ts +20 -0
- package/lib/yamlSpecTranslator.js +217 -0
- package/lib/yamlSpecTranslator.js.map +1 -0
- package/messages/agent.test.run-eval.md +91 -0
- package/oclif.manifest.json +303 -175
- package/package.json +6 -6
- package/schemas/agent-test-run__eval.json +52 -0
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2026, Salesforce, Inc.
|
|
3
|
+
*
|
|
4
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
* you may not use this file except in compliance with the License.
|
|
6
|
+
* You may obtain a copy of the License at
|
|
7
|
+
*
|
|
8
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
*
|
|
10
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
* See the License for the specific language governing permissions and
|
|
14
|
+
* limitations under the License.
|
|
15
|
+
*/
|
|
16
|
+
/**
 * Render evaluation results in the requested output format.
 *
 * @param results Evaluation results object handed to the individual formatters.
 * @param format One of 'human', 'json', 'junit' or 'tap'.
 * @returns The rendered report as a single string.
 */
export function formatResults(results, format) {
    if (format === 'json') {
        return JSON.stringify(results, null, 2);
    }
    if (format === 'junit') {
        return formatJunit(results);
    }
    if (format === 'tap') {
        return formatTap(results);
    }
    // 'human' and every unrecognized format share the human-readable rendering.
    return formatHuman(results);
}
|
|
30
|
+
// --- formatHuman helpers ---
|
|
31
|
+
/**
 * Convert an arbitrary value into display text.
 * null/undefined become '', objects and arrays are JSON-encoded (rather than
 * rendering as "[object Object]"), everything else goes through String().
 */
function toDisplayString(value) {
    if (value == null) {
        return '';
    }
    if (typeof value === 'object') {
        try {
            const json = JSON.stringify(value);
            return typeof json === 'string' ? json : String(value);
        }
        catch {
            // JSON.stringify throws on circular structures; fall back to String().
            return String(value);
        }
    }
    return String(value);
}
/**
 * Build the Markdown bullet lines describing each step output of a test.
 *
 * Handles three step types: 'agent.create_session', 'agent.send_message' and
 * 'agent.get_state'. Outputs of any other type produce no line.
 *
 * @param outputs Iterable of step output objects (reads `type`, `id`, `session_id`, `response`).
 * @returns Array of Markdown lines, in the order the outputs were given.
 */
function formatOutputLines(outputs) {
    const lines = [];
    for (const output of outputs) {
        const stepType = output.type ?? '';
        const stepId = output.id ?? '';
        if (stepType === 'agent.create_session') {
            const sessionId = output.session_id ?? 'N/A';
            lines.push(`- **Create Session**: ${sessionId}`);
        }
        else if (stepType === 'agent.send_message') {
            let agentMsg = output.response;
            if (agentMsg !== null && typeof agentMsg === 'object' && !Array.isArray(agentMsg)) {
                // Prefer the first message's text; otherwise show the whole response object.
                agentMsg = agentMsg.messages?.[0]?.message ?? agentMsg;
            }
            const msgStr = toDisplayString(agentMsg);
            // Keep long responses readable by truncating to 200 characters.
            const displayMsg = msgStr.length > 200 ? msgStr.substring(0, 200) + '...' : msgStr;
            lines.push(`- **Agent Response** (${stepId}): ${displayMsg}`);
        }
        else if (stepType === 'agent.get_state') {
            const respData = output.response;
            if (respData !== null && typeof respData === 'object') {
                const lastExec = respData.planner_response?.lastExecution;
                const topic = lastExec?.topic ?? 'N/A';
                const latency = lastExec?.latency;
                lines.push(`- **Topic Selected**: ${String(topic)}`);
                // Only append the unit when a latency value exists (avoids "N/Ams").
                lines.push(`- **Response Latency**: ${latency != null ? `${String(latency)}ms` : 'N/A'}`);
            }
            else {
                lines.push(`- **State**: ${String(respData).substring(0, 200)}`);
            }
        }
    }
    return lines;
}
|
|
69
|
+
/**
 * Build a Markdown table summarizing evaluation results.
 *
 * Cell values are truncated (60 characters for actual/expected, 80 for error
 * messages) and then escaped so that pipes and line breaks inside a value
 * cannot break the table layout.
 *
 * @param evalResults Array of evaluation result objects (reads `id`, `score`,
 *   `is_pass`, `actual_value`, `expected_value`, `error_message`).
 * @returns Array of Markdown lines; empty when there are no evaluation results.
 */
function formatEvaluationTable(evalResults) {
    if (evalResults.length === 0) {
        return [];
    }
    // Coerce to text, truncate, then neutralize characters that are structural in a Markdown table row.
    const toCell = (value, maxLength = Number.POSITIVE_INFINITY) => String(value ?? '')
        .substring(0, maxLength)
        .replace(/\r\n|\r|\n/g, ' ')
        .replace(/\|/g, '\\|');
    const lines = [
        '### Evaluation Results\n',
        '| Metric | Score | Pass | Actual | Expected |',
        '|--------|-------|------|--------|----------|',
    ];
    for (const evalR of evalResults) {
        const metricId = toCell(evalR.id ?? 'unknown');
        const error = evalR.error_message;
        if (error) {
            // An errored evaluation shows the (truncated) error text in place of the actual value.
            lines.push(`| ${metricId} | ERROR | - | ${toCell(error, 80)} | - |`);
            continue;
        }
        const score = evalR.score;
        // toFixed only exists on numbers; fall back to plain text for any other non-null score.
        const scoreStr = typeof score === 'number' ? score.toFixed(3) : score != null ? toCell(score) : 'N/A';
        const isPass = evalR.is_pass;
        const passStr = isPass === true ? 'PASS' : isPass === false ? 'FAIL' : 'N/A';
        const actual = toCell(evalR.actual_value, 60);
        const expected = toCell(evalR.expected_value, 60);
        lines.push(`| ${metricId} | ${scoreStr} | ${passStr} | ${actual} | ${expected} |`);
    }
    lines.push('');
    return lines;
}
|
|
95
|
+
/**
 * Build the Markdown "Errors" section for a test.
 *
 * Each entry is rendered as `- **<id>**: <message>`. When an entry has no
 * `error_message`, the entry itself is shown: objects are JSON-encoded (so they
 * do not render as "[object Object]") and primitives are converted with String().
 *
 * @param errors Array of error entries (reads `id` and `error_message`).
 * @returns Array of Markdown lines; empty when there are no errors.
 */
function formatErrorLines(errors) {
    if (errors.length === 0) {
        return [];
    }
    const lines = ['### Errors\n'];
    for (const error of errors) {
        // Optional chaining keeps a null/primitive entry from throwing.
        const errorId = error?.id ?? 'unknown';
        let errorMsg = error?.error_message;
        if (errorMsg == null) {
            errorMsg = error !== null && typeof error === 'object' ? JSON.stringify(error) : String(error);
        }
        lines.push(`- **${errorId}**: ${errorMsg}`);
    }
    lines.push('');
    return lines;
}
|
|
108
|
+
/**
 * Build the per-test summary lines: total evaluation count, then optional
 * pass/fail, score-only and error counts, followed by a blank line.
 *
 * @param evalResults Array of evaluation result objects (reads `is_pass` and `score`).
 * @param errors Array of error entries; only its length is used.
 * @returns Array of Markdown lines.
 */
function formatTestSummaryLines(evalResults, errors) {
    let passCount = 0;
    let failCount = 0;
    let scoreOnlyCount = 0;
    for (const entry of evalResults) {
        if (entry.is_pass === true) {
            passCount += 1;
        }
        else if (entry.is_pass === false) {
            failCount += 1;
        }
        else if (entry.score != null && entry.is_pass == null) {
            // Has a score but no pass/fail verdict.
            scoreOnlyCount += 1;
        }
    }
    const summary = [`**Summary**: ${evalResults.length} evaluations`];
    if (passCount > 0 || failCount > 0) {
        summary.push(` - Passed: ${passCount}, Failed: ${failCount}`);
    }
    if (scoreOnlyCount > 0) {
        summary.push(` - Scored (no threshold): ${scoreOnlyCount}`);
    }
    const errorCount = errors.length;
    if (errorCount > 0) {
        summary.push(` - Errors: ${errorCount}`);
    }
    summary.push('');
    return summary;
}
|
|
127
|
+
/**
 * Render evaluation results as a human-readable Markdown report.
 *
 * For every test in `results.results` the report contains a heading, the step
 * output lines, a blank line, the evaluation table, the errors section and the
 * summary lines, in that order.
 *
 * @param results Evaluation results object (reads `results`, and per test `id`,
 *   `errors`, `evaluation_results`, `outputs`).
 * @returns The report joined with newlines.
 */
function formatHuman(results) {
    const report = ['# Agent Evaluation Results\n'];
    for (const entry of results.results ?? []) {
        const heading = `## Test: ${entry.id ?? 'unknown'}\n`;
        const failures = entry.errors ?? [];
        const evaluations = entry.evaluation_results ?? [];
        const stepOutputs = entry.outputs ?? [];
        report.push(heading);
        report.push(...formatOutputLines(stepOutputs), '');
        report.push(...formatEvaluationTable(evaluations));
        report.push(...formatErrorLines(failures));
        report.push(...formatTestSummaryLines(evaluations, failures));
    }
    return report.join('\n');
}
|
|
143
|
+
/**
 * Render evaluation results as a JUnit XML report with a single test suite.
 *
 * Every evaluation result and every test-level error becomes one <testcase>
 * named `<testId>.<stepId>`. An evaluation carrying an error message is
 * reported as <error>; otherwise one whose `is_pass` is false is reported as
 * <failure>. The two states are mutually exclusive, so the suite's `failures`
 * and `errors` attributes match the child elements that are actually emitted.
 *
 * @param results Evaluation results object (reads `results`, and per test `id`,
 *   `evaluation_results`, `errors`).
 * @returns The XML document as a newline-joined string.
 */
function formatJunit(results) {
    // Messages are passed to escapeXml, which needs a string.
    const asText = (value) => (typeof value === 'string' ? value : JSON.stringify(value) ?? String(value));
    const allTests = [];
    for (const testResult of results.results ?? []) {
        const testId = testResult.id ?? 'unknown';
        for (const evalR of testResult.evaluation_results ?? []) {
            const stepId = evalR.id ?? 'unknown';
            const score = evalR.score;
            const error = evalR.error_message;
            const errored = !!error;
            // An errored evaluation is emitted as <error> only, so it must not also count as a failure.
            const failed = !errored && evalR.is_pass === false;
            let message = '';
            if (errored) {
                message = asText(error);
            }
            else if (failed) {
                message = `Expected ${String(evalR.expected_value ?? '')} but got ${String(evalR.actual_value ?? '')}`;
            }
            allTests.push({
                name: `${testId}.${stepId}`,
                classname: 'agent-eval-labs',
                failed,
                errored,
                message,
                score: score != null ? score.toFixed(3) : 'N/A',
            });
        }
        for (const err of testResult.errors ?? []) {
            const stepId = err.id ?? 'unknown';
            allTests.push({
                name: `${testId}.${stepId}`,
                classname: 'agent-eval-labs',
                failed: false,
                errored: true,
                message: asText(err.error_message ?? 'Unknown error'),
                score: 'N/A',
            });
        }
    }
    const totalTests = allTests.length;
    const failures = allTests.filter((t) => t.failed).length;
    const errors = allTests.filter((t) => t.errored).length;
    const lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<testsuites>',
        ` <testsuite name="agent-eval-labs" tests="${totalTests}" failures="${failures}" errors="${errors}">`,
    ];
    for (const tc of allTests) {
        lines.push(` <testcase name="${escapeXml(tc.name)}" classname="${escapeXml(tc.classname)}">`);
        if (tc.errored) {
            lines.push(` <error message="${escapeXml(tc.message)}">${escapeXml(tc.message)}</error>`);
        }
        else if (tc.failed) {
            lines.push(` <failure message="${escapeXml(tc.message)}">Score: ${tc.score}</failure>`);
        }
        lines.push(' </testcase>');
    }
    lines.push(' </testsuite>');
    lines.push('</testsuites>');
    return lines.join('\n');
}
|
|
200
|
+
/**
 * Flatten evaluation results into the list of entries used for TAP output.
 *
 * Each evaluation result yields an entry with `ok`, `name`, `score`,
 * `expected`, `actual` and `error`; each test-level error yields a failing
 * entry with `ok`, `name`, `score` and `error`. Names are `<testId>.<stepId>`.
 *
 * @param results Evaluation results object (reads `results`, and per test `id`,
 *   `evaluation_results`, `errors`).
 * @returns Array of entries: per test, its evaluations first and then its errors.
 */
function buildTapEntries(results) {
    const qualifiedName = (suiteId, item) => `${suiteId}.${item.id ?? 'unknown'}`;
    const optionalText = (value) => (value == null ? undefined : String(value));
    const collected = [];
    for (const suite of results.results ?? []) {
        const suiteId = suite.id ?? 'unknown';
        for (const evaluation of suite.evaluation_results ?? []) {
            const verdict = evaluation.is_pass;
            const problem = evaluation.error_message;
            const rawScore = evaluation.score;
            collected.push({
                // Passing means: not explicitly failed and no error message.
                ok: !(verdict === false || problem),
                name: qualifiedName(suiteId, evaluation),
                score: rawScore == null ? 'N/A' : rawScore.toFixed(3),
                expected: optionalText(evaluation.expected_value),
                actual: optionalText(evaluation.actual_value),
                error: problem == null ? undefined : problem,
            });
        }
        for (const failure of suite.errors ?? []) {
            collected.push({
                ok: false,
                name: qualifiedName(suiteId, failure),
                score: 'N/A',
                error: failure.error_message ?? 'Unknown error',
            });
        }
    }
    return collected;
}
|
|
231
|
+
/**
 * Render evaluation results as a TAP version 13 report.
 *
 * Emits the version line, the plan (`1..N`) and one `ok` / `not ok` line per
 * entry from buildTapEntries. Failing entries are followed by a YAML
 * diagnostics block with `expected`, `actual` and `error` where available.
 *
 * Diagnostic values are written with JSON.stringify so that embedded quotes,
 * backslashes and line breaks are escaped and the YAML block stays parseable.
 * For plain text this produces the same `"value"` output as before.
 *
 * @param results Evaluation results object, passed through to buildTapEntries.
 * @returns The TAP document as a newline-joined string.
 */
function formatTap(results) {
    const entries = buildTapEntries(results);
    const lines = ['TAP version 13', `1..${entries.length}`];
    entries.forEach((entry, index) => {
        const status = entry.ok ? 'ok' : 'not ok';
        lines.push(`${status} ${index + 1} - ${entry.name} (score: ${entry.score})`);
        if (entry.ok) {
            return;
        }
        lines.push(' ---');
        if (entry.expected !== undefined) {
            lines.push(` expected: ${JSON.stringify(entry.expected)}`);
        }
        if (entry.actual !== undefined) {
            lines.push(` actual: ${JSON.stringify(entry.actual)}`);
        }
        if (entry.error) {
            lines.push(` error: ${JSON.stringify(entry.error)}`);
        }
        lines.push(' ...');
    });
    return lines.join('\n');
}
|
|
255
|
+
/**
 * Escape the five XML special characters so text can be placed safely in
 * element content or attribute values.
 *
 * The ampersand is replaced first so the entities produced by the later
 * replacements are not escaped a second time.
 *
 * @param str Text to escape; non-string values are converted with String(), null/undefined become ''.
 * @returns The escaped text.
 */
function escapeXml(str) {
    return String(str ?? '')
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&apos;');
}
|
|
263
|
+
//# sourceMappingURL=evalFormatter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evalFormatter.js","sourceRoot":"","sources":["../src/evalFormatter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAoCH,MAAM,UAAU,aAAa,CAAC,OAAwB,EAAE,MAAoB;IAC1E,QAAQ,MAAM,EAAE,CAAC;QACf,KAAK,OAAO;YACV,OAAO,WAAW,CAAC,OAAO,CAAC,CAAC;QAC9B,KAAK,MAAM;YACT,OAAO,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;QAC1C,KAAK,OAAO;YACV,OAAO,WAAW,CAAC,OAAO,CAAC,CAAC;QAC9B,KAAK,KAAK;YACR,OAAO,SAAS,CAAC,OAAO,CAAC,CAAC;QAC5B;YACE,OAAO,WAAW,CAAC,OAAO,CAAC,CAAC;IAChC,CAAC;AACH,CAAC;AAED,8BAA8B;AAE9B,SAAS,iBAAiB,CAAC,OAAqB;IAC9C,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC;QAE/B,IAAI,QAAQ,KAAK,sBAAsB,EAAE,CAAC;YACxC,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,IAAI,KAAK,CAAC;YAC7C,KAAK,CAAC,IAAI,CAAC,yBAAyB,SAAS,EAAE,CAAC,CAAC;QACnD,CAAC;aAAM,IAAI,QAAQ,KAAK,oBAAoB,EAAE,CAAC;YAC7C,IAAI,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;YAC/B,IAAI,QAAQ,KAAK,IAAI,IAAI,OAAO,QAAQ,KAAK,QAAQ,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAClF,MAAM,MAAM,GAAG,QAAmC,CAAC;gBACnD,MAAM,IAAI,GAAG,MAAM,CAAC,QAAsD,CAAC;gBAC3E,QAAQ,GAAG,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,IAAI,MAAM,CAAC,QAAQ,CAAC,CAAC;YACpD,CAAC;YACD,MAAM,MAAM,GAAG,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;YACtC,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC;YACnF,KAAK,CAAC,IAAI,CAAC,yBAAyB,MAAM,MAAM,UAAU,EAAE,CAAC,CAAC;QAChE,CAAC;aAAM,IAAI,QAAQ,KAAK,iBAAiB,EAAE,CAAC;YAC1C,MAAM,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;YACjC,IAAI,QAAQ,KAAK,IAAI,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE,CAAC;gBACtD,MAAM,IAAI,GAAG,QAAmC,CAAC;gBACjD,MAAM,OAAO,GAAG,IAAI,CAAC,gBAAuD,CAAC;gBAC7E,MAAM,QAAQ,GAAG,OAAO,EAAE,aAAoD,CAAC;gBAC/E,MAAM,KAAK,GAAG,QAAQ,EAAE,KAAK,IAAI,KAAK,CAAC;gBACvC,MAAM,OAAO,GAAG,QAAQ,EAAE,OAAO,IAAI,KAAK,CAAC;gBAC3C,KAAK,CAAC,IAAI,CAAC,yBAAyB,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;gBACrD,KAAK,CAAC,IAAI,CAAC,2BAA2B,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC7D,CAAC;iBAA
M,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,gBAAgB,MAAM,CAAC,QAAQ,CAAC,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;YACnE,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,qBAAqB,CAAC,WAAyB;IACtD,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,KAAK,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QACvC,KAAK,CAAC,IAAI,CAAC,+CAA+C,CAAC,CAAC;QAC5D,KAAK,CAAC,IAAI,CAAC,+CAA+C,CAAC,CAAC;QAE5D,KAAK,MAAM,KAAK,IAAI,WAAW,EAAE,CAAC;YAChC,MAAM,QAAQ,GAAG,KAAK,CAAC,EAAE,IAAI,SAAS,CAAC;YACvC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;YAC1B,MAAM,QAAQ,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;YAC1D,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC;YAC7B,MAAM,OAAO,GAAG,MAAM,KAAK,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,KAAK,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC;YAC7E,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,YAAY,IAAI,EAAE,CAAC,CAAC,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACjE,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,cAAc,IAAI,EAAE,CAAC,CAAC,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACrE,MAAM,KAAK,GAAG,KAAK,CAAC,aAAa,CAAC;YAElC,IAAI,KAAK,EAAE,CAAC;gBACV,KAAK,CAAC,IAAI,CAAC,KAAK,QAAQ,kBAAkB,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC;YAC5E,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,KAAK,QAAQ,MAAM,QAAQ,MAAM,OAAO,MAAM,MAAM,MAAM,QAAQ,IAAI,CAAC,CAAC;YACrF,CAAC;QACH,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,gBAAgB,CAAC,MAAmB;IAC3C,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAC3B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,MAAM,OAAO,GAAG,KAAK,CAAC,EAAE,IAAI,SAAS,CAAC;YACtC,MAAM,QAAQ,GAAG,KAAK,CAAC,aAAa,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;YACtD,KAAK,CAAC,IAAI,CAAC,OAAO,OAAO,OAAO,QAAQ,EAAE,CAAC,CAAC;QAC9C,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,sBAAsB,CAAC,WAAyB,EAAE,MAAmB;IAC5E,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,CAAC;IACtC,MAAM,MAAM,GAAG,WAAW,CAAC,MAA
M,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;IACpE,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,KAAK,CAAC,CAAC,MAAM,CAAC;IACrE,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,IAAI,CAAC,CAAC,OAAO,IAAI,IAAI,CAAC,CAAC,MAAM,CAAC;IAEtF,KAAK,CAAC,IAAI,CAAC,gBAAgB,UAAU,cAAc,CAAC,CAAC;IACrD,IAAI,MAAM,IAAI,MAAM,EAAE,CAAC;QACrB,KAAK,CAAC,IAAI,CAAC,eAAe,MAAM,aAAa,MAAM,EAAE,CAAC,CAAC;IACzD,CAAC;IACD,IAAI,MAAM,EAAE,CAAC;QACX,KAAK,CAAC,IAAI,CAAC,8BAA8B,MAAM,EAAE,CAAC,CAAC;IACrD,CAAC;IACD,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,KAAK,CAAC,IAAI,CAAC,eAAe,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;IAC7C,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,WAAW,CAAC,OAAwB;IAC3C,MAAM,KAAK,GAAa,CAAC,8BAA8B,CAAC,CAAC;IAEzD,KAAK,MAAM,UAAU,IAAI,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,CAAC;QAC/C,MAAM,MAAM,GAAG,UAAU,CAAC,EAAE,IAAI,SAAS,CAAC;QAC1C,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,IAAI,EAAE,CAAC;QACvC,MAAM,WAAW,GAAG,UAAU,CAAC,kBAAkB,IAAI,EAAE,CAAC;QACxD,MAAM,OAAO,GAAG,UAAU,CAAC,OAAO,IAAI,EAAE,CAAC;QAEzC,KAAK,CAAC,IAAI,CAAC,YAAY,MAAM,IAAI,CAAC,CAAC;QAEnC,KAAK,CAAC,IAAI,CAAC,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC,CAAC;QAC1C,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,GAAG,qBAAqB,CAAC,WAAW,CAAC,CAAC,CAAC;QAClD,KAAK,CAAC,IAAI,CAAC,GAAG,gBAAgB,CAAC,MAAM,CAAC,CAAC,CAAC;QACxC,KAAK,CAAC,IAAI,CAAC,GAAG,sBAAsB,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,WAAW,CAAC,OAAwB;IAC3C,MAAM,QAAQ,GAOT,EAAE,CAAC;IAER,KAAK,MAAM,UAAU,IAAI,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,CAAC;QAC/C,MAAM,MAAM,GAAG,UAAU,CAAC,EAAE,IAAI,SAAS,CAAC;QAE1C,KAAK,MAAM,KAAK,IAAI,UAAU,CAAC,kBAAkB,IAAI,EAAE,EAAE,CAAC;YACxD,MAAM,MAAM,GAAG,KAAK,CAAC,EAAE,IAAI,SAAS,CAAC;YACrC,MAAM,IAAI,GAAG,GAAG,MAAM,IAAI,MAAM,EAAE,CAAC;YACnC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;YAC1B,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC;YAC7B,MAAM,KAAK,GAAG,KAAK,CAAC,aAAa,CAAC;YAElC,QAAQ,CAAC,IAAI,CA
AC;gBACZ,IAAI;gBACJ,SAAS,EAAE,iBAAiB;gBAC5B,MAAM,EAAE,MAAM,KAAK,KAAK;gBACxB,OAAO,EAAE,CAAC,CAAC,KAAK;gBAChB,OAAO,EAAE,KAAK;oBACZ,CAAC,CAAC,KAAK;oBACP,CAAC,CAAC,MAAM,KAAK,KAAK;wBAClB,CAAC,CAAC,YAAY,MAAM,CAAC,KAAK,CAAC,cAAc,IAAI,EAAE,CAAC,YAAY,MAAM,CAAC,KAAK,CAAC,YAAY,IAAI,EAAE,CAAC,EAAE;wBAC9F,CAAC,CAAC,EAAE;gBACN,KAAK,EAAE,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK;aAChD,CAAC,CAAC;QACL,CAAC;QAED,KAAK,MAAM,GAAG,IAAI,UAAU,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;YAC1C,MAAM,MAAM,GAAG,GAAG,CAAC,EAAE,IAAI,SAAS,CAAC;YACnC,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,GAAG,MAAM,IAAI,MAAM,EAAE;gBAC3B,SAAS,EAAE,iBAAiB;gBAC5B,MAAM,EAAE,KAAK;gBACb,OAAO,EAAE,IAAI;gBACb,OAAO,EAAE,GAAG,CAAC,aAAa,IAAI,eAAe;gBAC7C,KAAK,EAAE,KAAK;aACb,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC;IACnC,MAAM,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACzD,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;IAExD,MAAM,KAAK,GAAa;QACtB,wCAAwC;QACxC,cAAc;QACd,8CAA8C,UAAU,eAAe,QAAQ,aAAa,MAAM,IAAI;KACvG,CAAC;IAEF,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,KAAK,CAAC,IAAI,CAAC,uBAAuB,SAAS,CAAC,EAAE,CAAC,IAAI,CAAC,gBAAgB,SAAS,CAAC,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;QACjG,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,yBAAyB,SAAS,CAAC,EAAE,CAAC,OAAO,CAAC,KAAK,SAAS,CAAC,EAAE,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;QACjG,CAAC;aAAM,IAAI,EAAE,CAAC,MAAM,EAAE,CAAC;YACrB,KAAK,CAAC,IAAI,CAAC,2BAA2B,SAAS,CAAC,EAAE,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC,KAAK,YAAY,CAAC,CAAC;QAC/F,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;IAChC,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;IAC7B,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAE5B,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAaD,SAAS,eAAe,CAAC,OAAwB;IAC/C,MAAM,OAAO,GAAe,EAAE,CAAC;IAE/B,KAAK,MAAM,UAAU,IAAI,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,CAAC;QAC/C,MAAM,MAAM,GAAG,UAAU,CAAC,EAAE,IAAI,SAAS,CAAC;QAE1C,KAAK,MAAM,KAAK,IAAI,UAAU,CAAC,kBAAkB,IAAI,EAAE,EAAE,CAAC;YAC
xD,MAAM,MAAM,GAAG,KAAK,CAAC,EAAE,IAAI,SAAS,CAAC;YACrC,MAAM,IAAI,GAAG,GAAG,MAAM,IAAI,MAAM,EAAE,CAAC;YACnC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;YAC1B,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC;YAC7B,MAAM,KAAK,GAAG,KAAK,CAAC,aAAa,CAAC;YAElC,OAAO,CAAC,IAAI,CAAC;gBACX,EAAE,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC,KAAK;gBAC9B,IAAI;gBACJ,KAAK,EAAE,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK;gBAC/C,QAAQ,EAAE,KAAK,CAAC,cAAc,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,SAAS;gBACjF,MAAM,EAAE,KAAK,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC3E,KAAK,EAAE,KAAK,IAAI,SAAS;aAC1B,CAAC,CAAC;QACL,CAAC;QAED,KAAK,MAAM,GAAG,IAAI,UAAU,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;YAC1C,MAAM,MAAM,GAAG,GAAG,CAAC,EAAE,IAAI,SAAS,CAAC;YACnC,OAAO,CAAC,IAAI,CAAC;gBACX,EAAE,EAAE,KAAK;gBACT,IAAI,EAAE,GAAG,MAAM,IAAI,MAAM,EAAE;gBAC3B,KAAK,EAAE,KAAK;gBACZ,KAAK,EAAE,GAAG,CAAC,aAAa,IAAI,eAAe;aAC5C,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,SAAS,CAAC,OAAwB;IACzC,MAAM,OAAO,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAEzC,MAAM,KAAK,GAAa,CAAC,gBAAgB,EAAE,MAAM,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IAEnE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,MAAM,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QACrB,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;QAClB,MAAM,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC;QACtC,KAAK,CAAC,IAAI,CAAC,GAAG,MAAM,IAAI,GAAG,MAAM,CAAC,CAAC,IAAI,YAAY,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC;QAE/D,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;YACV,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACpB,IAAI,CAAC,CAAC,QAAQ,KAAK,SAAS,EAAE,CAAC;gBAC7B,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC;YAC5C,CAAC;YACD,IAAI,CAAC,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;gBAC3B,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YACxC,CAAC;YACD,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC;gBACZ,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC;YACtC,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAED,OAAO
,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,SAAS,CAAC,GAAW;IAC5B,OAAO,GAAG;SACP,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC;SACtB,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;SACrB,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;SACrB,OAAO,CAAC,IAAI,EAAE,QAAQ,CAAC;SACvB,OAAO,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;AAC7B,CAAC"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/** Evaluation payload: a list of tests. */
export type EvalPayload = {
    tests: EvalTest[];
};
/** A single test: its identifier and the steps it contains. */
export type EvalTest = {
    id: string;
    steps: EvalStep[];
};
/** A single step. `type` and `id` are declared as strings; any other key is allowed with an unknown value. */
export type EvalStep = {
    type: string;
    id: string;
    [key: string]: unknown;
};
/**
 * Run every normalization pass over a test payload.
 * Pass order: mcp-shorthand -> auto-correct -> camelCase -> evaluator fields -> shorthand refs -> defaults -> strip.
 */
export declare function normalizePayload(payload: EvalPayload): EvalPayload;
/**
 * Translate MCP shorthand steps into the raw Eval API format.
 * MCP writes type="evaluator" plus evaluator_type, whereas the raw API expects type="evaluator.xxx".
 * `field` is also mapped to `actual` with a proper JSONPath, and missing `id` fields are auto-generated.
 */
export declare function normalizeMcpShorthand(steps: EvalStep[]): EvalStep[];
/**
 * Fix common field-name mistakes by mapping wrong names to the correct ones
 * (agentId->agent_id, text->utterance, etc.).
 */
export declare function autoCorrectFields(steps: EvalStep[]): EvalStep[];
/**
 * Rename camelCase agent fields to snake_case
 * (useAgentApi->use_agent_api, plannerDefinitionId->planner_id, etc.).
 */
export declare function normalizeCamelCase(steps: EvalStep[]): EvalStep[];
/**
 * Normalize evaluator field names according to the evaluator category,
 * mapping actual/expected <-> generated_output/reference_answer.
 * Operator values are also lowercased and metric_name is injected automatically.
 */
export declare function normalizeEvaluatorFields(steps: EvalStep[]): EvalStep[];
/**
 * Rewrite {step_id.field} shorthand references as JSONPath $.outputs[N].field.
 * The step_id->index mapping is built from the non-evaluator steps.
 */
export declare function convertShorthandRefs(steps: EvalStep[]): EvalStep[];
/**
 * Inject default values:
 * - use_agent_api=true on agent.create_session when neither use_agent_api nor planner_id is present
 */
export declare function injectDefaults(steps: EvalStep[]): EvalStep[];
/**
 * Remove fields that are not on the whitelist for the step's type.
 */
export declare function stripUnrecognizedFields(steps: EvalStep[]): EvalStep[];
/**
 * Split the tests array into chunks of batchSize.
 */
export declare function splitIntoBatches(tests: EvalTest[], batchSize: number): EvalTest[][];
|