@salesforce/agents 1.5.1 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/agentEvalRunner.d.ts +26 -0
- package/lib/agentEvalRunner.js +102 -0
- package/lib/agentEvalRunner.js.map +1 -0
- package/lib/evalFormatter.d.ts +44 -0
- package/lib/evalFormatter.js +267 -0
- package/lib/evalFormatter.js.map +1 -0
- package/lib/evalNormalizer.d.ts +57 -0
- package/lib/evalNormalizer.js +442 -0
- package/lib/evalNormalizer.js.map +1 -0
- package/lib/index.d.ts +6 -2
- package/lib/index.js +25 -1
- package/lib/index.js.map +1 -1
- package/lib/utils.d.ts +21 -0
- package/lib/utils.js +55 -1
- package/lib/utils.js.map +1 -1
- package/lib/yamlSpecTranslator.d.ts +20 -0
- package/lib/yamlSpecTranslator.js +234 -0
- package/lib/yamlSpecTranslator.js.map +1 -0
- package/package.json +1 -1
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { Org } from '@salesforce/core';
|
|
2
|
+
import type { EvalPayload } from './evalNormalizer.js';
|
|
3
|
+
import type { EvalApiResponse, EvalResult, EvalOutput, TestResult } from './evalFormatter.js';
|
|
4
|
+
/**
 * Aggregated outcome of an evaluation run, as produced by `buildResultSummary`.
 */
export type AgentEvalRunResult = {
    /** One entry per test in the merged API response. */
    tests: {
        /** Test id, or `'unknown'` when the response carried none. */
        id: string;
        /** `'failed'` when any evaluation failed or any error was reported, otherwise `'passed'`. */
        status: string;
        /** Evaluation results of the test, passed through unchanged. */
        evaluations: EvalResult[];
        /** Step outputs of the test, passed through unchanged. */
        outputs: EvalOutput[];
    }[];
    /** Counters accumulated over every test in the run. */
    summary: {
        /** Evaluations with `is_pass === true`. */
        passed: number;
        /** Evaluations with `is_pass === false`. */
        failed: number;
        /** Evaluations that carry a score but no pass/fail verdict. */
        scored: number;
        /** Test-level errors. */
        errors: number;
    };
};
/**
 * Look up an agent by its API name (the `DeveloperName` of a `BotDefinition`)
 * and return its id together with the id of its highest-numbered version.
 */
export declare function resolveAgent(org: Org, apiName: string): Promise<{
    agentId: string;
    versionId: string;
}>;
/**
 * Send every batch of tests to the evaluation API and return all test results
 * as one flat list. `log`, when given, receives a progress message if more than
 * one batch is run.
 */
export declare function executeBatches(org: Org, batches: EvalPayload['tests'][], log?: (msg: string) => void): Promise<TestResult[]>;
/**
 * Derive run-wide counters and per-test summaries from a merged API response.
 */
export declare function buildResultSummary(mergedResponse: EvalApiResponse): {
    summary: AgentEvalRunResult['summary'];
    testSummaries: AgentEvalRunResult['tests'];
};
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/*
|
|
3
|
+
* Copyright 2026, Salesforce, Inc.
|
|
4
|
+
*
|
|
5
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
* you may not use this file except in compliance with the License.
|
|
7
|
+
* You may obtain a copy of the License at
|
|
8
|
+
*
|
|
9
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
*
|
|
11
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
* See the License for the specific language governing permissions and
|
|
15
|
+
* limitations under the License.
|
|
16
|
+
*/
|
|
17
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
18
|
+
exports.resolveAgent = resolveAgent;
|
|
19
|
+
exports.executeBatches = executeBatches;
|
|
20
|
+
exports.buildResultSummary = buildResultSummary;
|
|
21
|
+
/* eslint-disable camelcase */
|
|
22
|
+
const core_1 = require("@salesforce/core");
|
|
23
|
+
const utils_1 = require("./utils");
|
|
24
|
+
/**
 * Collect the org and user identifiers that callEvalApi places in its request headers.
 *
 * Asks the org's `/services/oauth2/userinfo` endpoint who the connection is
 * authenticated as and combines that user id with the org id and instance URL.
 *
 * @param org Org providing the connection and org id.
 * @returns `{ orgId, userId, instanceUrl }`.
 */
async function getApiHeaders(org) {
    const connection = org.getConnection();
    const userInfoUrl = `${connection.instanceUrl}/services/oauth2/userinfo`;
    const identity = await connection.request(userInfoUrl);
    const orgId = org.getOrgId();
    const userId = identity.user_id;
    return { orgId, userId, instanceUrl: connection.instanceUrl };
}
|
|
33
|
+
/**
 * POST an evaluation payload to the Einstein evaluation tests endpoint.
 *
 * The request is delegated to `requestWithEndpointFallback` from `./utils`
 * with a retry option of at most 3 retries.
 * NOTE(review): the exact fallback and retry semantics live in that helper — confirm there.
 *
 * @param org     Org whose connection issues the request.
 * @param payload Object serialized with JSON.stringify as the request body.
 * @param headers `{ orgId, userId, instanceUrl }` as returned by getApiHeaders.
 * @returns Whatever `requestWithEndpointFallback` resolves to.
 */
async function callEvalApi(org, payload, headers) {
    const connection = org.getConnection();
    const requestHeaders = {
        'Content-Type': 'application/json',
        'x-sfdc-core-tenant-id': `core/prod/${headers.orgId}`,
        'x-org-id': headers.orgId,
        'x-sfdc-core-instance-url': headers.instanceUrl,
        'x-sfdc-user-id': headers.userId,
        'x-client-feature-id': 'AIPlatformEvaluation',
        'x-sfdc-app-context': 'EinsteinGPT',
    };
    const request = {
        url: 'https://api.salesforce.com/einstein/evaluation/v1/tests',
        method: 'POST',
        headers: requestHeaders,
        body: JSON.stringify(payload),
    };
    const retryOptions = { retry: { maxRetries: 3 } };
    // Detach the helper from its module object so it is invoked without a `this` binding.
    const sendRequest = utils_1.requestWithEndpointFallback;
    return sendRequest(connection, request, retryOptions);
}
|
|
50
|
+
/**
 * Look up an agent by API name and return its id plus the id of its newest version.
 *
 * Runs two SOQL queries through the org's connection: one for the
 * `BotDefinition` whose `DeveloperName` equals `apiName`, and one for the
 * `BotVersion` of that definition with the highest `VersionNumber`.
 *
 * @param org     Org whose connection runs the queries.
 * @param apiName DeveloperName of the agent's BotDefinition.
 * @returns `{ agentId, versionId }` taken from the first record of each query.
 * @throws SfError when no BotDefinition matches or the definition has no BotVersion.
 */
async function resolveAgent(org, apiName) {
    const conn = org.getConnection();
    // SOQL string literals escape with a backslash (\' and \\); doubling the quote,
    // as SQL does, is not valid SOQL. Backslashes are escaped first so the
    // backslash added in front of a quote is not itself escaped again.
    const escapedApiName = apiName.replace(/\\/g, '\\\\').replace(/'/g, "\\'");
    const botResult = await conn.query(`SELECT Id FROM BotDefinition WHERE DeveloperName = '${escapedApiName}'`);
    if (!botResult.records.length) {
        throw new core_1.SfError(`Agent '${apiName}' not found. Verify the DeveloperName exists in BotDefinition in the target org.`);
    }
    const agentId = botResult.records[0].Id;
    // agentId comes straight from the previous query result, not from caller input.
    const versionResult = await conn.query(`SELECT Id FROM BotVersion WHERE BotDefinitionId = '${agentId}' ORDER BY VersionNumber DESC LIMIT 1`);
    if (!versionResult.records.length) {
        throw new core_1.SfError(`No published version found for agent '${apiName}'. Ensure the agent has been saved and versioned in the target org.`);
    }
    const versionId = versionResult.records[0].Id;
    return { agentId, versionId };
}
|
|
65
|
+
/**
 * Run every batch of tests against the evaluation API concurrently.
 *
 * The API headers are resolved once and shared by all batches. If any batch
 * request rejects, the whole call rejects (Promise.all semantics).
 *
 * @param org     Org used to resolve headers and issue the requests.
 * @param batches Array of test arrays; each inner array is sent as one `{ tests }` payload.
 * @param log     Optional callback that receives a message when more than one batch runs.
 * @returns All `results` entries of every response, flattened in batch order.
 */
async function executeBatches(org, batches, log) {
    const apiHeaders = await getApiHeaders(org);
    if (batches.length > 1 && log != null) {
        log(`Running ${batches.length} batches in parallel`);
    }
    const runBatch = async (tests) => {
        const response = await callEvalApi(org, { tests }, apiHeaders);
        // A response without `results` contributes nothing.
        return response.results ?? [];
    };
    const pending = batches.map((tests) => runBatch(tests));
    const resultsPerBatch = await Promise.all(pending);
    return resultsPerBatch.flat();
}
|
|
78
|
+
/**
 * Reduce a merged evaluation response to run-wide counters and per-test summaries.
 *
 * An evaluation counts as passed (`is_pass === true`), failed
 * (`is_pass === false`), or scored (has a score but a null/undefined
 * `is_pass`). A test is `'failed'` when it has at least one failed evaluation
 * or at least one error, otherwise `'passed'`.
 *
 * @param mergedResponse Object with an optional `results` array of test results.
 * @returns `{ summary, testSummaries }`.
 */
function buildResultSummary(mergedResponse) {
    const totals = { passed: 0, failed: 0, scored: 0, errors: 0 };
    const perTest = [];
    for (const result of mergedResponse.results ?? []) {
        const evaluations = result.evaluation_results ?? [];
        const errorCount = (result.errors ?? []).length;
        let passCount = 0;
        let failCount = 0;
        let scoreOnlyCount = 0;
        for (const evaluation of evaluations) {
            const verdict = evaluation.is_pass;
            if (verdict === true) {
                passCount += 1;
            }
            else if (verdict === false) {
                failCount += 1;
            }
            else if (verdict == null && evaluation.score != null) {
                scoreOnlyCount += 1;
            }
        }
        totals.passed += passCount;
        totals.failed += failCount;
        totals.scored += scoreOnlyCount;
        totals.errors += errorCount;
        const hasProblem = failCount > 0 || errorCount > 0;
        perTest.push({
            id: result.id ?? 'unknown',
            status: hasProblem ? 'failed' : 'passed',
            evaluations,
            outputs: result.outputs ?? [],
        });
    }
    return { summary: totals, testSummaries: perTest };
}
|
|
102
|
+
//# sourceMappingURL=agentEvalRunner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"agentEvalRunner.js","sourceRoot":"","sources":["../src/agentEvalRunner.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;GAcG;;AAsDH,oCA0BC;AAED,wCAmBC;AAED,gDA+BC;AApID,8BAA8B;AAE9B,2CAAgD;AAChD,mCAAsD;AAetD,KAAK,UAAU,aAAa,CAAC,GAAQ;IACnC,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,EAAE,CAAC;IACjC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,OAAO,CAAsB,GAAG,IAAI,CAAC,WAAW,2BAA2B,CAAC,CAAC;IAEzG,OAAO;QACL,KAAK,EAAE,GAAG,CAAC,QAAQ,EAAE;QACrB,MAAM,EAAE,QAAQ,CAAC,OAAO;QACxB,WAAW,EAAE,IAAI,CAAC,WAAW;KAC9B,CAAC;AACJ,CAAC;AAED,KAAK,UAAU,WAAW,CAAC,GAAQ,EAAE,OAAoB,EAAE,OAAmB;IAC5E,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,EAAE,CAAC;IAEjC,OAAO,IAAA,mCAA2B,EAChC,IAAI,EACJ;QACE,GAAG,EAAE,yDAAyD;QAC9D,MAAM,EAAE,MAAM;QACd,OAAO,EAAE;YACP,cAAc,EAAE,kBAAkB;YAClC,uBAAuB,EAAE,aAAa,OAAO,CAAC,KAAK,EAAE;YACrD,UAAU,EAAE,OAAO,CAAC,KAAK;YACzB,0BAA0B,EAAE,OAAO,CAAC,WAAW;YAC/C,gBAAgB,EAAE,OAAO,CAAC,MAAM;YAChC,qBAAqB,EAAE,sBAAsB;YAC7C,oBAAoB,EAAE,aAAa;SACpC;QACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC;KAC9B,EACD,EAAE,KAAK,EAAE,EAAE,UAAU,EAAE,CAAC,EAAE,EAAE,CAC7B,CAAC;AACJ,CAAC;AAEM,KAAK,UAAU,YAAY,CAAC,GAAQ,EAAE,OAAe;IAC1D,MAAM,IAAI,GAAG,GAAG,CAAC,aAAa,EAAE,CAAC;IAEjC,MAAM,cAAc,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IAEnD,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,KAAK,CAChC,uDAAuD,cAAc,GAAG,CACzE,CAAC;IACF,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;QAC9B,MAAM,IAAI,cAAO,CACf,UAAU,OAAO,kFAAkF,CACpG,CAAC;IACJ,CAAC;IACD,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAExC,MAAM,aAAa,GAAG,MAAM,IAAI,CAAC,KAAK,CACpC,sDAAsD,OAAO,uCAAuC,CACrG,CAAC;IACF,IAAI,CAAC,aAAa,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC;QAClC,MAAM,IAAI,cAAO,CACf,yCAAyC,OAAO,qEAAqE,CACtH,CAAC;IACJ,CAAC;IACD,MAAM,SAAS,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAE9C,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC;AAChC,CAAC;AAEM,KAAK,UAAU,cAAc,CAClC,GAAQ,EACR,OAAoC,EACpC,GAA2B;IAE3B,MAAM,OAAO,GAAG,MAAM,aAAa,CAAC,GAAG,CAAC,CAAC;IAEzC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,GAAG,EAAE,CAAC,WAAW,OAAO,CAAC,MAAM,sBAAsB,CAAC,CAAC;IACzD,CAAC;IAED,MAAM,aAAa,GAAG,
OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAChD,MAAM,YAAY,GAAgB,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;QACnD,MAAM,SAAS,GAAG,MAAM,WAAW,CAAC,GAAG,EAAE,YAAY,EAAE,OAAO,CAAC,CAAC;QAChE,OAAO,SAAS,CAAC,OAAO,IAAI,EAAE,CAAC;IACjC,CAAC,CAAC,CAAC;IAEH,MAAM,YAAY,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC,CAAC;IACtD,OAAO,YAAY,CAAC,IAAI,EAAE,CAAC;AAC7B,CAAC;AAED,SAAgB,kBAAkB,CAAC,cAA+B;IAIhE,MAAM,OAAO,GAAG,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;IAC/D,MAAM,aAAa,GAAgC,EAAE,CAAC;IAEtD,KAAK,MAAM,UAAU,IAAI,cAAc,CAAC,OAAO,IAAI,EAAE,EAAE,CAAC;QACtD,MAAM,MAAM,GAAG,UAAU,CAAC,EAAE,IAAI,SAAS,CAAC;QAC1C,MAAM,WAAW,GAAiB,UAAU,CAAC,kBAAkB,IAAI,EAAE,CAAC;QACtE,MAAM,UAAU,GAAgB,UAAU,CAAC,MAAM,IAAI,EAAE,CAAC;QAExD,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC;QACpE,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,KAAK,CAAC,CAAC,MAAM,CAAC;QACrE,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,IAAI,CAAC,CAAC,OAAO,IAAI,IAAI,CAAC,CAAC,MAAM,CAAC;QAEtF,OAAO,CAAC,MAAM,IAAI,MAAM,CAAC;QACzB,OAAO,CAAC,MAAM,IAAI,MAAM,CAAC;QACzB,OAAO,CAAC,MAAM,IAAI,MAAM,CAAC;QACzB,OAAO,CAAC,MAAM,IAAI,UAAU,CAAC,MAAM,CAAC;QAEpC,MAAM,OAAO,GAAiB,UAAU,CAAC,OAAO,IAAI,EAAE,CAAC;QACvD,aAAa,CAAC,IAAI,CAAC;YACjB,EAAE,EAAE,MAAM;YACV,MAAM,EAAE,MAAM,GAAG,CAAC,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ;YACjE,WAAW,EAAE,WAAW;YACxB,OAAO;SACR,CAAC,CAAC;IACL,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,aAAa,EAAE,CAAC;AACpC,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/** Output formats accepted by `formatResults`. */
export type ResultFormat = 'human' | 'json' | 'junit' | 'tap';
/** Response of an `agent.send_message` step: plain text, or an object carrying a list of messages. */
type SendMessageResponse = string | {
    messages: {
        message: string;
    }[];
};
/** Response of an `agent.get_state` step. */
type GetStateResponse = {
    planner_response?: {
        lastExecution?: {
            topic?: string;
            latency?: number;
            invokedActions?: string[];
        };
    };
};
/** Output recorded for one step of a test. */
export type EvalOutput = {
    /** Step type, e.g. `agent.create_session`, `agent.send_message`, `agent.get_state`. */
    type?: string;
    /** Step id. */
    id?: string;
    /** Session id, reported for `agent.create_session` steps. */
    session_id?: string;
    /** Step response; its shape depends on the step type. */
    response?: SendMessageResponse | GetStateResponse;
};
/** Result of a single evaluation within a test. */
export type EvalResult = {
    id?: string;
    /** Numeric score, when the evaluation produces one. */
    score?: number | null;
    /** Pass/fail verdict; null or absent when the evaluation is score-only. */
    is_pass?: boolean | null;
    actual_value?: string;
    expected_value?: string;
    /** Set when the evaluation itself errored. */
    error_message?: string;
};
/** Error reported at test level. */
export type TestError = {
    id?: string;
    error_message?: string;
};
/** Everything the API returned for one test. */
export type TestResult = {
    id?: string;
    outputs?: Array<EvalOutput>;
    evaluation_results?: Array<EvalResult>;
    errors?: Array<TestError>;
};
/** Top-level shape of an evaluation API response. */
export type EvalApiResponse = {
    results?: Array<TestResult>;
};
/**
 * Render an evaluation response in the requested format.
 * Formats other than 'json', 'junit' and 'tap' are rendered as 'human'.
 */
export declare function formatResults(results: EvalApiResponse, format: ResultFormat): string;
export {};
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/*
|
|
3
|
+
* Copyright 2026, Salesforce, Inc.
|
|
4
|
+
*
|
|
5
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
* you may not use this file except in compliance with the License.
|
|
7
|
+
* You may obtain a copy of the License at
|
|
8
|
+
*
|
|
9
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
*
|
|
11
|
+
* Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
* See the License for the specific language governing permissions and
|
|
15
|
+
* limitations under the License.
|
|
16
|
+
*/
|
|
17
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
18
|
+
exports.formatResults = formatResults;
|
|
19
|
+
/**
 * Render an evaluation response in the requested output format.
 *
 * @param results Evaluation API response to render.
 * @param format  'human', 'json', 'junit' or 'tap'; anything else falls back to 'human'.
 * @returns The rendered report as a single string.
 */
function formatResults(results, format) {
    if (format === 'json') {
        return JSON.stringify(results, null, 2);
    }
    if (format === 'junit') {
        return formatJunit(results);
    }
    if (format === 'tap') {
        return formatTap(results);
    }
    // 'human' and every unrecognized value share the human-readable report.
    return formatHuman(results);
}
|
|
33
|
+
// --- formatHuman helpers ---
|
|
34
|
+
/**
 * Turn an arbitrary response value into display text.
 * Strings pass through, null/undefined become '', objects are JSON-encoded
 * (instead of the useless "[object Object]"), anything else goes through String().
 */
function describeEvalResponse(value) {
    if (typeof value === 'string') {
        return value;
    }
    if (value == null) {
        return '';
    }
    if (typeof value === 'object') {
        try {
            // JSON.stringify returns undefined for some inputs (e.g. toJSON returning undefined).
            return JSON.stringify(value) ?? String(value);
        }
        catch {
            // Circular structures and similar: fall back to the default conversion.
            return String(value);
        }
    }
    return String(value);
}
/**
 * Build the Markdown bullet lines describing the step outputs of one test.
 *
 * Only `agent.create_session`, `agent.send_message` and `agent.get_state`
 * outputs produce lines; other step types are skipped. Agent responses and
 * raw state are truncated to 200 characters.
 *
 * @param outputs Step outputs of a test.
 * @returns Lines for the human-readable report, in output order.
 */
function formatOutputLines(outputs) {
    const lines = [];
    for (const output of outputs) {
        const stepType = output.type ?? '';
        const stepId = output.id ?? '';
        const resp = output.response;
        const respIsObject = resp !== null && typeof resp === 'object';
        if (stepType === 'agent.create_session') {
            lines.push(`- **Create Session**: ${output.session_id ?? 'N/A'}`);
        }
        else if (stepType === 'agent.send_message') {
            // Prefer the first message of a structured response; otherwise show the response itself.
            const msgStr = respIsObject && 'messages' in resp
                ? describeEvalResponse(resp.messages?.[0]?.message)
                : describeEvalResponse(resp);
            const displayMsg = msgStr.length > 200 ? msgStr.substring(0, 200) + '...' : msgStr;
            lines.push(`- **Agent Response** (${stepId}): ${displayMsg}`);
        }
        else if (stepType === 'agent.get_state') {
            if (respIsObject && 'planner_response' in resp) {
                const lastExec = resp.planner_response?.lastExecution;
                const latency = lastExec?.latency;
                lines.push(`- **Topic Selected**: ${lastExec?.topic ?? 'N/A'}`);
                // Only append the unit when there is a value, so a missing latency reads "N/A", not "N/Ams".
                lines.push(`- **Response Latency**: ${latency != null ? `${latency}ms` : 'N/A'}`);
            }
            else {
                const stateText = describeEvalResponse(resp) || 'N/A';
                lines.push(`- **State**: ${stateText.substring(0, 200)}`);
            }
        }
    }
    return lines;
}
|
|
73
|
+
/**
 * Render the evaluation results of one test as a Markdown table.
 *
 * Evaluations with an `error_message` get an ERROR row showing the first 80
 * characters of the message; all others show score (3 decimals), PASS/FAIL
 * and the first 60 characters of the actual and expected values.
 *
 * @param evalResults Evaluation results of a test.
 * @returns Table lines followed by a blank line, or an empty array when there are no results.
 */
function formatEvaluationTable(evalResults) {
    const lines = [];
    if (evalResults.length === 0) {
        return lines;
    }
    // A literal "|" or a line break inside a cell would split or end the table row,
    // so line breaks become spaces and pipes are backslash-escaped.
    const toCell = (text) => String(text)
        .replace(/\r\n|\r|\n/g, ' ')
        .replace(/\|/g, '\\|');
    lines.push('### Evaluation Results\n');
    lines.push('| Metric | Score | Pass | Actual | Expected |');
    lines.push('|--------|-------|------|--------|----------|');
    for (const evalR of evalResults) {
        const metricId = toCell(evalR.id ?? 'unknown');
        const error = evalR.error_message;
        if (error) {
            lines.push(`| ${metricId} | ERROR | - | ${toCell(String(error).substring(0, 80))} | - |`);
            continue;
        }
        const score = evalR.score;
        const scoreStr = score != null ? score.toFixed(3) : 'N/A';
        const isPass = evalR.is_pass;
        const passStr = isPass === true ? 'PASS' : isPass === false ? 'FAIL' : 'N/A';
        // Truncate first, then escape, so the visible text budget matches the raw value.
        const actual = toCell(String(evalR.actual_value ?? '').substring(0, 60));
        const expected = toCell(String(evalR.expected_value ?? '').substring(0, 60));
        lines.push(`| ${metricId} | ${scoreStr} | ${passStr} | ${actual} | ${expected} |`);
    }
    lines.push('');
    return lines;
}
|
|
99
|
+
/**
 * Render the test-level errors of one test as a Markdown bullet list.
 *
 * @param errors Errors reported for a test.
 * @returns An "Errors" heading, one bullet per error and a trailing blank line,
 *          or an empty array when there are no errors.
 */
function formatErrorLines(errors) {
    const lines = [];
    if (errors.length > 0) {
        lines.push('### Errors\n');
        for (const error of errors) {
            // Optional chaining keeps a null/undefined entry from crashing the report.
            const errorId = error?.id ?? 'unknown';
            let errorMsg = error?.error_message;
            if (errorMsg == null) {
                // No message field: show the raw entry. Objects are JSON-encoded because
                // String(object) would only yield "[object Object]".
                errorMsg = error !== null && typeof error === 'object' ? JSON.stringify(error) : String(error);
            }
            lines.push(`- **${errorId}**: ${errorMsg}`);
        }
        lines.push('');
    }
    return lines;
}
|
|
112
|
+
/**
 * Build the per-test summary lines of the human-readable report.
 *
 * The pass/fail line is omitted when both counts are zero, and the scored and
 * errors lines are omitted when their count is zero.
 *
 * @param evalResults Evaluation results of a test.
 * @param errors      Test-level errors of the same test.
 * @returns Summary lines, always ending with a blank line.
 */
function formatTestSummaryLines(evalResults, errors) {
    const countWhere = (matches) => evalResults.filter(matches).length;
    const passCount = countWhere((evaluation) => evaluation.is_pass === true);
    const failCount = countWhere((evaluation) => evaluation.is_pass === false);
    // Score-only evaluations: a score is present but there is no verdict.
    const scoreOnlyCount = countWhere((evaluation) => evaluation.score != null && evaluation.is_pass == null);
    const summary = [`**Summary**: ${evalResults.length} evaluations`];
    if (passCount > 0 || failCount > 0) {
        summary.push(` - Passed: ${passCount}, Failed: ${failCount}`);
    }
    if (scoreOnlyCount > 0) {
        summary.push(` - Scored (no threshold): ${scoreOnlyCount}`);
    }
    if (errors.length > 0) {
        summary.push(` - Errors: ${errors.length}`);
    }
    summary.push('');
    return summary;
}
|
|
131
|
+
/**
 * Render an evaluation response as a Markdown report.
 *
 * Each test contributes a heading, its step outputs, a blank line, the
 * evaluation table, the error list and a summary, in that order.
 *
 * @param results Evaluation API response.
 * @returns The report lines joined with newlines.
 */
function formatHuman(results) {
    const report = ['# Agent Evaluation Results\n'];
    for (const test of results.results ?? []) {
        const testErrors = test.errors ?? [];
        const evaluations = test.evaluation_results ?? [];
        const stepOutputs = test.outputs ?? [];
        report.push(
            `## Test: ${test.id ?? 'unknown'}\n`,
            ...formatOutputLines(stepOutputs),
            '',
            ...formatEvaluationTable(evaluations),
            ...formatErrorLines(testErrors),
            ...formatTestSummaryLines(evaluations, testErrors),
        );
    }
    return report.join('\n');
}
|
|
147
|
+
/**
 * Render an evaluation response as a JUnit XML report with a single test suite.
 *
 * Every evaluation result and every test-level error becomes one `<testcase>`
 * named `<testId>.<stepId>`. A case with an error message is reported as
 * `<error>`; otherwise a case with `is_pass === false` is reported as `<failure>`.
 *
 * @param results Evaluation API response.
 * @returns The XML document as a string.
 */
function formatJunit(results) {
    const allTests = [];
    for (const testResult of results.results ?? []) {
        const testId = testResult.id ?? 'unknown';
        for (const evalR of testResult.evaluation_results ?? []) {
            const stepId = evalR.id ?? 'unknown';
            const name = `${testId}.${stepId}`;
            const score = evalR.score;
            const isPass = evalR.is_pass;
            const error = evalR.error_message;
            allTests.push({
                name,
                classname: 'agent-eval-labs',
                failed: isPass === false,
                errored: !!error,
                message: error
                    ? error
                    : isPass === false
                        ? `Expected ${String(evalR.expected_value ?? '')} but got ${String(evalR.actual_value ?? '')}`
                        : '',
                score: score != null ? score.toFixed(3) : 'N/A',
            });
        }
        for (const err of testResult.errors ?? []) {
            const stepId = err.id ?? 'unknown';
            allTests.push({
                name: `${testId}.${stepId}`,
                classname: 'agent-eval-labs',
                failed: false,
                errored: true,
                message: err.error_message ?? 'Unknown error',
                score: 'N/A',
            });
        }
    }
    const totalTests = allTests.length;
    // A case that is both failed and errored is rendered as <error> only (see the loop
    // below), so it must not also be counted as a failure; otherwise the suite's
    // failures/errors attributes disagree with the elements it contains.
    const failures = allTests.filter((t) => t.failed && !t.errored).length;
    const errors = allTests.filter((t) => t.errored).length;
    const lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<testsuites>',
        ` <testsuite name="agent-eval-labs" tests="${totalTests}" failures="${failures}" errors="${errors}">`,
    ];
    for (const tc of allTests) {
        lines.push(` <testcase name="${escapeXml(tc.name)}" classname="${escapeXml(tc.classname)}">`);
        if (tc.errored) {
            lines.push(` <error message="${escapeXml(tc.message)}">${escapeXml(tc.message)}</error>`);
        }
        else if (tc.failed) {
            // tc.score is either a toFixed() string or 'N/A', so it needs no escaping.
            lines.push(` <failure message="${escapeXml(tc.message)}">Score: ${tc.score}</failure>`);
        }
        lines.push(' </testcase>');
    }
    lines.push(' </testsuite>');
    lines.push('</testsuites>');
    return lines.join('\n');
}
|
|
204
|
+
/**
 * Flatten an evaluation response into the entries rendered by formatTap.
 *
 * For each test, its evaluation results come first, followed by its
 * test-level errors. An evaluation is `ok` unless `is_pass === false` or it
 * carries an error message; test-level errors are never `ok`.
 *
 * @param results Evaluation API response.
 * @returns One entry per evaluation result and per test-level error.
 */
function buildTapEntries(results) {
    const textOrUndefined = (value) => (value != null ? String(value) : undefined);
    const fromEvaluation = (testId, evaluation) => {
        const errorMessage = evaluation.error_message;
        const rawScore = evaluation.score;
        return {
            ok: !(evaluation.is_pass === false || errorMessage),
            name: `${testId}.${evaluation.id ?? 'unknown'}`,
            score: rawScore != null ? rawScore.toFixed(3) : 'N/A',
            expected: textOrUndefined(evaluation.expected_value),
            actual: textOrUndefined(evaluation.actual_value),
            error: errorMessage ?? undefined,
        };
    };
    const fromTestError = (testId, testError) => ({
        ok: false,
        name: `${testId}.${testError.id ?? 'unknown'}`,
        score: 'N/A',
        error: testError.error_message ?? 'Unknown error',
    });
    const collected = [];
    for (const test of results.results ?? []) {
        const testId = test.id ?? 'unknown';
        for (const evaluation of test.evaluation_results ?? []) {
            collected.push(fromEvaluation(testId, evaluation));
        }
        for (const testError of test.errors ?? []) {
            collected.push(fromTestError(testId, testError));
        }
    }
    return collected;
}
|
|
235
|
+
/**
 * Render an evaluation response as a TAP version 13 stream.
 *
 * Every entry from buildTapEntries becomes one `ok` / `not ok` line. Failing
 * entries are followed by a diagnostic block (between `---` and `...`)
 * listing whichever of expected, actual and error are present.
 *
 * @param results Evaluation API response.
 * @returns The TAP document as a string.
 */
function formatTap(results) {
    const entries = buildTapEntries(results);
    const lines = ['TAP version 13', `1..${entries.length}`];
    // JSON string syntax is also a valid YAML double-quoted scalar. Encoding the values
    // with JSON.stringify escapes embedded quotes, backslashes and line breaks, which
    // would otherwise corrupt the diagnostic block. Plain values render exactly as before.
    const quote = (value) => JSON.stringify(String(value));
    entries.forEach((entry, index) => {
        const status = entry.ok ? 'ok' : 'not ok';
        lines.push(`${status} ${index + 1} - ${entry.name} (score: ${entry.score})`);
        if (entry.ok) {
            return;
        }
        lines.push(' ---');
        if (entry.expected !== undefined) {
            lines.push(` expected: ${quote(entry.expected)}`);
        }
        if (entry.actual !== undefined) {
            lines.push(` actual: ${quote(entry.actual)}`);
        }
        if (entry.error) {
            lines.push(` error: ${quote(entry.error)}`);
        }
        lines.push(' ...');
    });
    return lines.join('\n');
}
|
|
259
|
+
/**
 * Names of the five predefined XML entities, keyed by the character each one stands for.
 */
const XML_ENTITY_NAMES = new Map([
    ['&', 'amp'],
    ['<', 'lt'],
    ['>', 'gt'],
    ['"', 'quot'],
    ["'", 'apos'],
]);
/**
 * Escape a string for use in XML text content or attribute values.
 *
 * Each of the characters & < > " ' is replaced by its predefined entity
 * reference (ampersand + name + semicolon). The replacement is done in a single
 * pass, so the ampersands introduced by the escaping are never escaped again.
 *
 * @param str Text to escape.
 * @returns The escaped text.
 */
function escapeXml(str) {
    return str.replace(/[&<>"']/g, (ch) => `&${XML_ENTITY_NAMES.get(ch)};`);
}
|
|
267
|
+
//# sourceMappingURL=evalFormatter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evalFormatter.js","sourceRoot":"","sources":["../src/evalFormatter.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;GAcG;;AAgDH,sCAaC;AAbD,SAAgB,aAAa,CAAC,OAAwB,EAAE,MAAoB;IAC1E,QAAQ,MAAM,EAAE,CAAC;QACf,KAAK,OAAO;YACV,OAAO,WAAW,CAAC,OAAO,CAAC,CAAC;QAC9B,KAAK,MAAM;YACT,OAAO,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;QAC1C,KAAK,OAAO;YACV,OAAO,WAAW,CAAC,OAAO,CAAC,CAAC;QAC9B,KAAK,KAAK;YACR,OAAO,SAAS,CAAC,OAAO,CAAC,CAAC;QAC5B;YACE,OAAO,WAAW,CAAC,OAAO,CAAC,CAAC;IAChC,CAAC;AACH,CAAC;AAED,8BAA8B;AAE9B,SAAS,iBAAiB,CAAC,OAAqB;IAC9C,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,KAAK,MAAM,MAAM,IAAI,OAAO,EAAE,CAAC;QAC7B,MAAM,QAAQ,GAAG,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC;QAE/B,IAAI,QAAQ,KAAK,sBAAsB,EAAE,CAAC;YACxC,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,IAAI,KAAK,CAAC;YAC7C,KAAK,CAAC,IAAI,CAAC,yBAAyB,SAAS,EAAE,CAAC,CAAC;QACnD,CAAC;aAAM,IAAI,QAAQ,KAAK,oBAAoB,EAAE,CAAC;YAC7C,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC;YAC7B,IAAI,MAAc,CAAC;YACnB,IAAI,OAAO,IAAI,KAAK,QAAQ,EAAE,CAAC;gBAC7B,MAAM,GAAG,IAAI,CAAC;YAChB,CAAC;iBAAM,IAAI,IAAI,KAAK,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,UAAU,IAAI,IAAI,EAAE,CAAC;gBAC3E,MAAM,GAAI,IAAiD,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,OAAO,IAAI,EAAE,CAAC;YAC3F,CAAC;iBAAM,CAAC;gBACN,MAAM,GAAG,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC;YAC9B,CAAC;YACD,MAAM,UAAU,GAAG,MAAM,CAAC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC;YACnF,KAAK,CAAC,IAAI,CAAC,yBAAyB,MAAM,MAAM,UAAU,EAAE,CAAC,CAAC;QAChE,CAAC;aAAM,IAAI,QAAQ,KAAK,iBAAiB,EAAE,CAAC;YAC1C,MAAM,IAAI,GAAG,MAAM,CAAC,QAAQ,CAAC;YAC7B,IAAI,IAAI,KAAK,IAAI,IAAI,OAAO,IAAI,KAAK,QAAQ,IAAI,kBAAkB,IAAI,IAAI,EAAE,CAAC;gBAC5E,MAAM,EAAE,gBAAgB,EAAE,WAAW,EAAE,GAAG,IAAmE,CAAC;gBAC9G,MAAM,QAAQ,GAAG,WAAW,EAAE,aAAa,CAAC;gBAC5C,KAAK,CAAC,IAAI,CAAC,yBAAyB,QAAQ,EAAE,KAAK,IAAI,KAAK,EAAE,CAAC,CAAC;gBAChE,KAAK,CAAC,IAAI,CAAC,2BAA2B,QAAQ,EAAE,OAAO,IAAI,KAAK,IAAI,CAAC,CAAC;YACxE,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,gBAAgB,MAAM,CAAC,IAAI,CAAC,CAAC,SAAS
,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;YAC/D,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,qBAAqB,CAAC,WAAyB;IACtD,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC3B,KAAK,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;QACvC,KAAK,CAAC,IAAI,CAAC,+CAA+C,CAAC,CAAC;QAC5D,KAAK,CAAC,IAAI,CAAC,+CAA+C,CAAC,CAAC;QAE5D,KAAK,MAAM,KAAK,IAAI,WAAW,EAAE,CAAC;YAChC,MAAM,QAAQ,GAAG,KAAK,CAAC,EAAE,IAAI,SAAS,CAAC;YACvC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;YAC1B,MAAM,QAAQ,GAAG,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC;YAC1D,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC;YAC7B,MAAM,OAAO,GAAG,MAAM,KAAK,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,KAAK,KAAK,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,KAAK,CAAC;YAC7E,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,YAAY,IAAI,EAAE,CAAC,CAAC,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACjE,MAAM,QAAQ,GAAG,MAAM,CAAC,KAAK,CAAC,cAAc,IAAI,EAAE,CAAC,CAAC,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACrE,MAAM,KAAK,GAAG,KAAK,CAAC,aAAa,CAAC;YAElC,IAAI,KAAK,EAAE,CAAC;gBACV,KAAK,CAAC,IAAI,CAAC,KAAK,QAAQ,kBAAkB,KAAK,CAAC,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,CAAC;YAC5E,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,KAAK,QAAQ,MAAM,QAAQ,MAAM,OAAO,MAAM,MAAM,MAAM,QAAQ,IAAI,CAAC,CAAC;YACrF,CAAC;QACH,CAAC;QAED,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,gBAAgB,CAAC,MAAmB;IAC3C,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;QAC3B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,MAAM,OAAO,GAAG,KAAK,CAAC,EAAE,IAAI,SAAS,CAAC;YACtC,MAAM,QAAQ,GAAG,KAAK,CAAC,aAAa,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;YACtD,KAAK,CAAC,IAAI,CAAC,OAAO,OAAO,OAAO,QAAQ,EAAE,CAAC,CAAC;QAC9C,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjB,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,sBAAsB,CAAC,WAAyB,EAAE,MAAmB;IAC5E,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,MAAM,UAAU,GAAG,WAAW,CAAC,MAAM,CAAC;IACtC,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,IAAI,CAAC,CAAC,MA
AM,CAAC;IACpE,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,KAAK,KAAK,CAAC,CAAC,MAAM,CAAC;IACrE,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,IAAI,IAAI,IAAI,CAAC,CAAC,OAAO,IAAI,IAAI,CAAC,CAAC,MAAM,CAAC;IAEtF,KAAK,CAAC,IAAI,CAAC,gBAAgB,UAAU,cAAc,CAAC,CAAC;IACrD,IAAI,MAAM,IAAI,MAAM,EAAE,CAAC;QACrB,KAAK,CAAC,IAAI,CAAC,eAAe,MAAM,aAAa,MAAM,EAAE,CAAC,CAAC;IACzD,CAAC;IACD,IAAI,MAAM,EAAE,CAAC;QACX,KAAK,CAAC,IAAI,CAAC,8BAA8B,MAAM,EAAE,CAAC,CAAC;IACrD,CAAC;IACD,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtB,KAAK,CAAC,IAAI,CAAC,eAAe,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC;IAC7C,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAEf,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,WAAW,CAAC,OAAwB;IAC3C,MAAM,KAAK,GAAa,CAAC,8BAA8B,CAAC,CAAC;IAEzD,KAAK,MAAM,UAAU,IAAI,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,CAAC;QAC/C,MAAM,MAAM,GAAG,UAAU,CAAC,EAAE,IAAI,SAAS,CAAC;QAC1C,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,IAAI,EAAE,CAAC;QACvC,MAAM,WAAW,GAAG,UAAU,CAAC,kBAAkB,IAAI,EAAE,CAAC;QACxD,MAAM,OAAO,GAAG,UAAU,CAAC,OAAO,IAAI,EAAE,CAAC;QAEzC,KAAK,CAAC,IAAI,CAAC,YAAY,MAAM,IAAI,CAAC,CAAC;QAEnC,KAAK,CAAC,IAAI,CAAC,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC,CAAC;QAC1C,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACf,KAAK,CAAC,IAAI,CAAC,GAAG,qBAAqB,CAAC,WAAW,CAAC,CAAC,CAAC;QAClD,KAAK,CAAC,IAAI,CAAC,GAAG,gBAAgB,CAAC,MAAM,CAAC,CAAC,CAAC;QACxC,KAAK,CAAC,IAAI,CAAC,GAAG,sBAAsB,CAAC,WAAW,EAAE,MAAM,CAAC,CAAC,CAAC;IAC7D,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,WAAW,CAAC,OAAwB;IAC3C,MAAM,QAAQ,GAOT,EAAE,CAAC;IAER,KAAK,MAAM,UAAU,IAAI,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,CAAC;QAC/C,MAAM,MAAM,GAAG,UAAU,CAAC,EAAE,IAAI,SAAS,CAAC;QAE1C,KAAK,MAAM,KAAK,IAAI,UAAU,CAAC,kBAAkB,IAAI,EAAE,EAAE,CAAC;YACxD,MAAM,MAAM,GAAG,KAAK,CAAC,EAAE,IAAI,SAAS,CAAC;YACrC,MAAM,IAAI,GAAG,GAAG,MAAM,IAAI,MAAM,EAAE,CAAC;YACnC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;YAC1B,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC;YAC7B,MAAM,KAAK,GAAG,KAAK,CAAC,aAAa,CAAC;YAElC,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI;gBACJ,SAAS,EAAE,iBAAiB;gBAC5B,MAAM,EAAE,MAAM,KAAK,KAAK;
gBACxB,OAAO,EAAE,CAAC,CAAC,KAAK;gBAChB,OAAO,EAAE,KAAK;oBACZ,CAAC,CAAC,KAAK;oBACP,CAAC,CAAC,MAAM,KAAK,KAAK;wBAClB,CAAC,CAAC,YAAY,MAAM,CAAC,KAAK,CAAC,cAAc,IAAI,EAAE,CAAC,YAAY,MAAM,CAAC,KAAK,CAAC,YAAY,IAAI,EAAE,CAAC,EAAE;wBAC9F,CAAC,CAAC,EAAE;gBACN,KAAK,EAAE,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK;aAChD,CAAC,CAAC;QACL,CAAC;QAED,KAAK,MAAM,GAAG,IAAI,UAAU,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;YAC1C,MAAM,MAAM,GAAG,GAAG,CAAC,EAAE,IAAI,SAAS,CAAC;YACnC,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,GAAG,MAAM,IAAI,MAAM,EAAE;gBAC3B,SAAS,EAAE,iBAAiB;gBAC5B,MAAM,EAAE,KAAK;gBACb,OAAO,EAAE,IAAI;gBACb,OAAO,EAAE,GAAG,CAAC,aAAa,IAAI,eAAe;gBAC7C,KAAK,EAAE,KAAK;aACb,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,QAAQ,CAAC,MAAM,CAAC;IACnC,MAAM,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;IACzD,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;IAExD,MAAM,KAAK,GAAa;QACtB,wCAAwC;QACxC,cAAc;QACd,8CAA8C,UAAU,eAAe,QAAQ,aAAa,MAAM,IAAI;KACvG,CAAC;IAEF,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,KAAK,CAAC,IAAI,CAAC,uBAAuB,SAAS,CAAC,EAAE,CAAC,IAAI,CAAC,gBAAgB,SAAS,CAAC,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;QACjG,IAAI,EAAE,CAAC,OAAO,EAAE,CAAC;YACf,KAAK,CAAC,IAAI,CAAC,yBAAyB,SAAS,CAAC,EAAE,CAAC,OAAO,CAAC,KAAK,SAAS,CAAC,EAAE,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;QACjG,CAAC;aAAM,IAAI,EAAE,CAAC,MAAM,EAAE,CAAC;YACrB,KAAK,CAAC,IAAI,CAAC,2BAA2B,SAAS,CAAC,EAAE,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC,KAAK,YAAY,CAAC,CAAC;QAC/F,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC;IAChC,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC;IAC7B,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAE5B,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAaD,SAAS,eAAe,CAAC,OAAwB;IAC/C,MAAM,OAAO,GAAe,EAAE,CAAC;IAE/B,KAAK,MAAM,UAAU,IAAI,OAAO,CAAC,OAAO,IAAI,EAAE,EAAE,CAAC;QAC/C,MAAM,MAAM,GAAG,UAAU,CAAC,EAAE,IAAI,SAAS,CAAC;QAE1C,KAAK,MAAM,KAAK,IAAI,UAAU,CAAC,kBAAkB,IAAI,EAAE,EAAE,CAAC;YACxD,MAAM,MAAM,GAAG,KAAK,CAAC,EAAE,IAAI,SAAS,CAAC;YACrC,MAAM,IAAI,GAAG,
GAAG,MAAM,IAAI,MAAM,EAAE,CAAC;YACnC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;YAC1B,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,CAAC;YAC7B,MAAM,KAAK,GAAG,KAAK,CAAC,aAAa,CAAC;YAElC,OAAO,CAAC,IAAI,CAAC;gBACX,EAAE,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC,KAAK;gBAC9B,IAAI;gBACJ,KAAK,EAAE,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK;gBAC/C,QAAQ,EAAE,KAAK,CAAC,cAAc,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,SAAS;gBACjF,MAAM,EAAE,KAAK,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,SAAS;gBAC3E,KAAK,EAAE,KAAK,IAAI,SAAS;aAC1B,CAAC,CAAC;QACL,CAAC;QAED,KAAK,MAAM,GAAG,IAAI,UAAU,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;YAC1C,MAAM,MAAM,GAAG,GAAG,CAAC,EAAE,IAAI,SAAS,CAAC;YACnC,OAAO,CAAC,IAAI,CAAC;gBACX,EAAE,EAAE,KAAK;gBACT,IAAI,EAAE,GAAG,MAAM,IAAI,MAAM,EAAE;gBAC3B,KAAK,EAAE,KAAK;gBACZ,KAAK,EAAE,GAAG,CAAC,aAAa,IAAI,eAAe;aAC5C,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,SAAS,CAAC,OAAwB;IACzC,MAAM,OAAO,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAEzC,MAAM,KAAK,GAAa,CAAC,gBAAgB,EAAE,MAAM,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;IAEnE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,MAAM,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QACrB,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC;QAClB,MAAM,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC;QACtC,KAAK,CAAC,IAAI,CAAC,GAAG,MAAM,IAAI,GAAG,MAAM,CAAC,CAAC,IAAI,YAAY,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC;QAE/D,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;YACV,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACpB,IAAI,CAAC,CAAC,QAAQ,KAAK,SAAS,EAAE,CAAC;gBAC7B,KAAK,CAAC,IAAI,CAAC,gBAAgB,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC;YAC5C,CAAC;YACD,IAAI,CAAC,CAAC,MAAM,KAAK,SAAS,EAAE,CAAC;gBAC3B,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;YACxC,CAAC;YACD,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC;gBACZ,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC;YACtC,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC;AAED,SAAS,SAAS,CAAC,GA
AW;IAC5B,OAAO,GAAG;SACP,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC;SACtB,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;SACrB,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC;SACrB,OAAO,CAAC,IAAI,EAAE,QAAQ,CAAC;SACvB,OAAO,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;AAC7B,CAAC"}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
export type EvalPayload = {
|
|
2
|
+
tests: EvalTest[];
|
|
3
|
+
};
|
|
4
|
+
export type EvalTest = {
|
|
5
|
+
id: string;
|
|
6
|
+
steps: EvalStep[];
|
|
7
|
+
};
|
|
8
|
+
export type EvalStep = {
|
|
9
|
+
[key: string]: unknown;
|
|
10
|
+
type: string;
|
|
11
|
+
id: string;
|
|
12
|
+
};
|
|
13
|
+
/**
|
|
14
|
+
* Apply all normalizations to a test payload.
|
|
15
|
+
* Passes run in order: mcp-shorthand -> auto-correct -> camelCase -> evaluator fields -> shorthand refs -> defaults -> strip.
|
|
16
|
+
*/
|
|
17
|
+
export declare function normalizePayload(payload: EvalPayload): EvalPayload;
|
|
18
|
+
/**
|
|
19
|
+
* Convert MCP shorthand format to raw Eval API format.
|
|
20
|
+
* MCP uses type="evaluator" + evaluator_type, raw API uses type="evaluator.xxx".
|
|
21
|
+
* Also maps `field` to `actual` with proper JSONPath and auto-generates missing `id` fields.
|
|
22
|
+
*/
|
|
23
|
+
export declare function normalizeMcpShorthand(steps: EvalStep[]): EvalStep[];
|
|
24
|
+
/**
|
|
25
|
+
* Auto-correct common field name mistakes.
|
|
26
|
+
* Maps wrong field names to correct ones (agentId->agent_id, text->utterance, etc.)
|
|
27
|
+
*/
|
|
28
|
+
export declare function autoCorrectFields(steps: EvalStep[]): EvalStep[];
|
|
29
|
+
/**
|
|
30
|
+
* Normalize camelCase agent field names to snake_case.
|
|
31
|
+
* useAgentApi->use_agent_api, plannerDefinitionId->planner_id, etc.
|
|
32
|
+
*/
|
|
33
|
+
export declare function normalizeCamelCase(steps: EvalStep[]): EvalStep[];
|
|
34
|
+
/**
|
|
35
|
+
* Normalize evaluator field names based on evaluator category.
|
|
36
|
+
* Maps actual/expected <-> generated_output/reference_answer.
|
|
37
|
+
* Also auto-lowercases operator values and auto-injects metric_name.
|
|
38
|
+
*/
|
|
39
|
+
export declare function normalizeEvaluatorFields(steps: EvalStep[]): EvalStep[];
|
|
40
|
+
/**
|
|
41
|
+
* Convert {step_id.field} shorthand references to JSONPath $.outputs[N].field.
|
|
42
|
+
* Builds step_id->index mapping from non-evaluator steps.
|
|
43
|
+
*/
|
|
44
|
+
export declare function convertShorthandRefs(steps: EvalStep[]): EvalStep[];
|
|
45
|
+
/**
|
|
46
|
+
* Inject default values:
|
|
47
|
+
* - use_agent_api=true on agent.create_session if neither use_agent_api nor planner_id present
|
|
48
|
+
*/
|
|
49
|
+
export declare function injectDefaults(steps: EvalStep[]): EvalStep[];
|
|
50
|
+
/**
|
|
51
|
+
* Strip unrecognized fields from steps based on type-specific whitelists.
|
|
52
|
+
*/
|
|
53
|
+
export declare function stripUnrecognizedFields(steps: EvalStep[]): EvalStep[];
|
|
54
|
+
/**
|
|
55
|
+
* Split tests array into chunks of batchSize.
|
|
56
|
+
*/
|
|
57
|
+
export declare function splitIntoBatches(tests: EvalTest[], batchSize: number): EvalTest[][];
|