@agentv/core 3.10.3 → 3.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-VCFYWLFV.js → chunk-AVTN5AB7.js} +17 -12
- package/dist/chunk-AVTN5AB7.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +173 -135
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -5
- package/dist/index.d.ts +11 -5
- package/dist/index.js +158 -125
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-VCFYWLFV.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
readTextFile,
|
|
20
20
|
resolveFileReference,
|
|
21
21
|
resolveTargetDefinition
|
|
22
|
-
} from "./chunk-VCFYWLFV.js";
|
|
22
|
+
} from "./chunk-AVTN5AB7.js";
|
|
23
23
|
import {
|
|
24
24
|
AgentvProvider
|
|
25
25
|
} from "./chunk-W5YDZWT4.js";
|
|
@@ -154,6 +154,64 @@ import path7 from "node:path";
|
|
|
154
154
|
import micromatch2 from "micromatch";
|
|
155
155
|
import { parse as parse2 } from "yaml";
|
|
156
156
|
|
|
157
|
+
// src/evaluation/input-message-utils.ts
|
|
158
|
+
function flattenInputMessages(messages) {
|
|
159
|
+
return messages.flatMap((message) => extractContentSegments(message.content));
|
|
160
|
+
}
|
|
161
|
+
function collectResolvedInputFilePaths(messages) {
|
|
162
|
+
const filePaths = [];
|
|
163
|
+
for (const message of messages) {
|
|
164
|
+
if (!Array.isArray(message.content)) {
|
|
165
|
+
continue;
|
|
166
|
+
}
|
|
167
|
+
for (const segment of message.content) {
|
|
168
|
+
if (isJsonObject(segment) && segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
169
|
+
filePaths.push(segment.resolvedPath);
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
return filePaths;
|
|
174
|
+
}
|
|
175
|
+
function extractContentSegments(content) {
|
|
176
|
+
if (typeof content === "string") {
|
|
177
|
+
return content.trim().length > 0 ? [{ type: "text", value: content }] : [];
|
|
178
|
+
}
|
|
179
|
+
if (isJsonObject(content)) {
|
|
180
|
+
const rendered = JSON.stringify(content, null, 2);
|
|
181
|
+
return rendered.trim().length > 0 ? [{ type: "text", value: rendered }] : [];
|
|
182
|
+
}
|
|
183
|
+
if (!Array.isArray(content)) {
|
|
184
|
+
return [];
|
|
185
|
+
}
|
|
186
|
+
const segments = [];
|
|
187
|
+
for (const segment of content) {
|
|
188
|
+
if (!isJsonObject(segment)) {
|
|
189
|
+
continue;
|
|
190
|
+
}
|
|
191
|
+
segments.push(cloneJsonObject(segment));
|
|
192
|
+
}
|
|
193
|
+
return segments;
|
|
194
|
+
}
|
|
195
|
+
function cloneJsonObject(source) {
|
|
196
|
+
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
197
|
+
return Object.fromEntries(entries);
|
|
198
|
+
}
|
|
199
|
+
function cloneJsonValue(value) {
|
|
200
|
+
if (value === null) {
|
|
201
|
+
return null;
|
|
202
|
+
}
|
|
203
|
+
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
204
|
+
return value;
|
|
205
|
+
}
|
|
206
|
+
if (Array.isArray(value)) {
|
|
207
|
+
return value.map((item) => cloneJsonValue(item));
|
|
208
|
+
}
|
|
209
|
+
if (typeof value === "object") {
|
|
210
|
+
return cloneJsonObject(value);
|
|
211
|
+
}
|
|
212
|
+
return value;
|
|
213
|
+
}
|
|
214
|
+
|
|
157
215
|
// src/evaluation/loaders/agent-skills-parser.ts
|
|
158
216
|
import { readFile } from "node:fs/promises";
|
|
159
217
|
import path from "node:path";
|
|
@@ -222,7 +280,6 @@ function parseAgentSkillsEvals(parsed, source = "evals.json", baseDir) {
|
|
|
222
280
|
id: String(id),
|
|
223
281
|
question: prompt,
|
|
224
282
|
input: [{ role: "user", content: prompt }],
|
|
225
|
-
input_segments: [{ type: "text", value: prompt }],
|
|
226
283
|
expected_output: evalCase.expected_output ? [{ role: "assistant", content: evalCase.expected_output }] : [],
|
|
227
284
|
reference_answer: evalCase.expected_output,
|
|
228
285
|
file_paths: filePaths,
|
|
@@ -357,7 +414,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
357
414
|
}
|
|
358
415
|
try {
|
|
359
416
|
const rawConfig = await readFile2(configPath, "utf8");
|
|
360
|
-
const parsed = parse(rawConfig);
|
|
417
|
+
const parsed = interpolateEnv(parse(rawConfig), process.env);
|
|
361
418
|
if (!isJsonObject(parsed)) {
|
|
362
419
|
logWarning(`Invalid .agentv/config.yaml format at ${configPath}`);
|
|
363
420
|
continue;
|
|
@@ -575,6 +632,27 @@ function parseExecutionDefaults(raw, configPath) {
|
|
|
575
632
|
} else if (otelFile !== void 0) {
|
|
576
633
|
logWarning(`Invalid execution.otel_file in ${configPath}, expected non-empty string`);
|
|
577
634
|
}
|
|
635
|
+
if (typeof obj.export_otel === "boolean") {
|
|
636
|
+
result.export_otel = obj.export_otel;
|
|
637
|
+
} else if (obj.export_otel !== void 0) {
|
|
638
|
+
logWarning(`Invalid execution.export_otel in ${configPath}, expected boolean`);
|
|
639
|
+
}
|
|
640
|
+
const otelBackend = obj.otel_backend;
|
|
641
|
+
if (typeof otelBackend === "string" && otelBackend.trim().length > 0) {
|
|
642
|
+
result.otel_backend = otelBackend.trim();
|
|
643
|
+
} else if (otelBackend !== void 0) {
|
|
644
|
+
logWarning(`Invalid execution.otel_backend in ${configPath}, expected non-empty string`);
|
|
645
|
+
}
|
|
646
|
+
if (typeof obj.otel_capture_content === "boolean") {
|
|
647
|
+
result.otel_capture_content = obj.otel_capture_content;
|
|
648
|
+
} else if (obj.otel_capture_content !== void 0) {
|
|
649
|
+
logWarning(`Invalid execution.otel_capture_content in ${configPath}, expected boolean`);
|
|
650
|
+
}
|
|
651
|
+
if (typeof obj.otel_group_turns === "boolean") {
|
|
652
|
+
result.otel_group_turns = obj.otel_group_turns;
|
|
653
|
+
} else if (obj.otel_group_turns !== void 0) {
|
|
654
|
+
logWarning(`Invalid execution.otel_group_turns in ${configPath}, expected boolean`);
|
|
655
|
+
}
|
|
578
656
|
if (typeof obj.pool_workspaces === "boolean") {
|
|
579
657
|
result.pool_workspaces = obj.pool_workspaces;
|
|
580
658
|
} else if (obj.pool_workspaces !== void 0) {
|
|
@@ -2045,27 +2123,28 @@ var ANSI_YELLOW4 = "\x1B[33m";
|
|
|
2045
2123
|
var ANSI_RESET5 = "\x1B[0m";
|
|
2046
2124
|
async function processMessages(options) {
|
|
2047
2125
|
const { messages, searchRoots, repoRootPath, textParts, messageType, verbose } = options;
|
|
2048
|
-
const segments = [];
|
|
2126
|
+
const processedMessages = [];
|
|
2049
2127
|
for (const message of messages) {
|
|
2050
2128
|
const content = message.content;
|
|
2051
2129
|
if (typeof content === "string") {
|
|
2052
|
-
segments.push({ type: "text", value: content });
|
|
2053
2130
|
if (textParts) {
|
|
2054
2131
|
textParts.push(content);
|
|
2055
2132
|
}
|
|
2133
|
+
processedMessages.push({ ...message, content });
|
|
2056
2134
|
continue;
|
|
2057
2135
|
}
|
|
2058
2136
|
if (isJsonObject(content)) {
|
|
2059
2137
|
const rendered = JSON.stringify(content, null, 2);
|
|
2060
|
-
segments.push({ type: "text", value: rendered });
|
|
2061
2138
|
if (textParts) {
|
|
2062
2139
|
textParts.push(rendered);
|
|
2063
2140
|
}
|
|
2141
|
+
processedMessages.push({ ...message, content: cloneJsonObject(content) });
|
|
2064
2142
|
continue;
|
|
2065
2143
|
}
|
|
2066
2144
|
if (!Array.isArray(content)) {
|
|
2067
2145
|
continue;
|
|
2068
2146
|
}
|
|
2147
|
+
const processedContent = [];
|
|
2069
2148
|
for (const rawSegment of content) {
|
|
2070
2149
|
if (!isJsonObject(rawSegment)) {
|
|
2071
2150
|
continue;
|
|
@@ -2088,8 +2167,8 @@ async function processMessages(options) {
|
|
|
2088
2167
|
}
|
|
2089
2168
|
try {
|
|
2090
2169
|
const fileContent = (await readFile4(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
|
|
2091
|
-
|
|
2092
|
-
|
|
2170
|
+
processedContent.push({
|
|
2171
|
+
...cloneJsonObject(rawSegment),
|
|
2093
2172
|
path: displayPath,
|
|
2094
2173
|
text: fileContent,
|
|
2095
2174
|
resolvedPath: path5.resolve(resolvedPath)
|
|
@@ -2106,37 +2185,19 @@ async function processMessages(options) {
|
|
|
2106
2185
|
continue;
|
|
2107
2186
|
}
|
|
2108
2187
|
const clonedSegment = cloneJsonObject(rawSegment);
|
|
2109
|
-
|
|
2188
|
+
processedContent.push(clonedSegment);
|
|
2110
2189
|
const inlineValue = clonedSegment.value;
|
|
2111
2190
|
if (typeof inlineValue === "string" && textParts) {
|
|
2112
2191
|
textParts.push(inlineValue);
|
|
2113
2192
|
}
|
|
2114
2193
|
}
|
|
2194
|
+
processedMessages.push({ ...message, content: processedContent });
|
|
2115
2195
|
}
|
|
2116
|
-
return segments;
|
|
2196
|
+
return processedMessages;
|
|
2117
2197
|
}
|
|
2118
2198
|
function asString3(value) {
|
|
2119
2199
|
return typeof value === "string" ? value : void 0;
|
|
2120
2200
|
}
|
|
2121
|
-
function cloneJsonObject(source) {
|
|
2122
|
-
const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
|
|
2123
|
-
return Object.fromEntries(entries);
|
|
2124
|
-
}
|
|
2125
|
-
function cloneJsonValue(value) {
|
|
2126
|
-
if (value === null) {
|
|
2127
|
-
return null;
|
|
2128
|
-
}
|
|
2129
|
-
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
|
|
2130
|
-
return value;
|
|
2131
|
-
}
|
|
2132
|
-
if (Array.isArray(value)) {
|
|
2133
|
-
return value.map((item) => cloneJsonValue(item));
|
|
2134
|
-
}
|
|
2135
|
-
if (typeof value === "object") {
|
|
2136
|
-
return cloneJsonObject(value);
|
|
2137
|
-
}
|
|
2138
|
-
return value;
|
|
2139
|
-
}
|
|
2140
2201
|
function logWarning3(message, details) {
|
|
2141
2202
|
if (details && details.length > 0) {
|
|
2142
2203
|
const detailBlock = details.join("\n");
|
|
@@ -2385,10 +2446,10 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2385
2446
|
);
|
|
2386
2447
|
}
|
|
2387
2448
|
}
|
|
2388
|
-
const
|
|
2449
|
+
const rawInputMessages = resolveInputMessages(evalcase);
|
|
2389
2450
|
const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
|
|
2390
2451
|
const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
|
|
2391
|
-
if (!id || !hasEvaluationSpec || !
|
|
2452
|
+
if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
|
|
2392
2453
|
logError2(
|
|
2393
2454
|
`Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
|
|
2394
2455
|
);
|
|
@@ -2396,8 +2457,8 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2396
2457
|
}
|
|
2397
2458
|
const hasExpectedMessages = expectedMessages.length > 0;
|
|
2398
2459
|
const inputTextParts = [];
|
|
2399
|
-
const
|
|
2400
|
-
messages:
|
|
2460
|
+
const inputMessages = await processMessages({
|
|
2461
|
+
messages: rawInputMessages,
|
|
2401
2462
|
searchRoots,
|
|
2402
2463
|
repoRootPath,
|
|
2403
2464
|
textParts: inputTextParts,
|
|
@@ -2443,19 +2504,13 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
2443
2504
|
}
|
|
2444
2505
|
}
|
|
2445
2506
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
2446
|
-
const userFilePaths = [];
|
|
2447
|
-
for (const segment of inputSegments) {
|
|
2448
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
2449
|
-
userFilePaths.push(segment.resolvedPath);
|
|
2450
|
-
}
|
|
2451
|
-
}
|
|
2507
|
+
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
2452
2508
|
const testCase = {
|
|
2453
2509
|
id,
|
|
2454
2510
|
eval_set: evalSetName,
|
|
2455
2511
|
conversation_id: conversationId,
|
|
2456
2512
|
question,
|
|
2457
2513
|
input: inputMessages,
|
|
2458
|
-
input_segments: inputSegments,
|
|
2459
2514
|
expected_output: outputSegments,
|
|
2460
2515
|
reference_answer: referenceAnswer,
|
|
2461
2516
|
file_paths: userFilePaths,
|
|
@@ -2521,50 +2576,9 @@ function parseMetadata(suite) {
|
|
|
2521
2576
|
|
|
2522
2577
|
// src/evaluation/formatting/prompt-builder.ts
|
|
2523
2578
|
async function buildPromptInputs(testCase, mode = "lm") {
|
|
2524
|
-
const segmentsByMessage = [];
|
|
2525
|
-
|
|
2526
|
-
|
|
2527
|
-
if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
|
|
2528
|
-
fileContentsByPath.set(segment.path, segment.text);
|
|
2529
|
-
}
|
|
2530
|
-
}
|
|
2531
|
-
for (const message of testCase.input) {
|
|
2532
|
-
const messageSegments = [];
|
|
2533
|
-
if (typeof message.content === "string") {
|
|
2534
|
-
if (message.content.trim().length > 0) {
|
|
2535
|
-
messageSegments.push({ type: "text", value: message.content });
|
|
2536
|
-
}
|
|
2537
|
-
} else if (Array.isArray(message.content)) {
|
|
2538
|
-
for (const segment of message.content) {
|
|
2539
|
-
if (typeof segment === "string") {
|
|
2540
|
-
if (segment.trim().length > 0) {
|
|
2541
|
-
messageSegments.push({ type: "text", value: segment });
|
|
2542
|
-
}
|
|
2543
|
-
} else if (isJsonObject(segment)) {
|
|
2544
|
-
const type = asString5(segment.type);
|
|
2545
|
-
if (type === "file") {
|
|
2546
|
-
const value = asString5(segment.value);
|
|
2547
|
-
if (!value) continue;
|
|
2548
|
-
const fileText = fileContentsByPath.get(value);
|
|
2549
|
-
if (fileText !== void 0) {
|
|
2550
|
-
messageSegments.push({ type: "file", text: fileText, path: value });
|
|
2551
|
-
}
|
|
2552
|
-
} else if (type === "text") {
|
|
2553
|
-
const textValue = asString5(segment.value);
|
|
2554
|
-
if (textValue && textValue.trim().length > 0) {
|
|
2555
|
-
messageSegments.push({ type: "text", value: textValue });
|
|
2556
|
-
}
|
|
2557
|
-
}
|
|
2558
|
-
}
|
|
2559
|
-
}
|
|
2560
|
-
} else if (isJsonObject(message.content)) {
|
|
2561
|
-
const rendered = JSON.stringify(message.content, null, 2);
|
|
2562
|
-
if (rendered.trim().length > 0) {
|
|
2563
|
-
messageSegments.push({ type: "text", value: rendered });
|
|
2564
|
-
}
|
|
2565
|
-
}
|
|
2566
|
-
segmentsByMessage.push(messageSegments);
|
|
2567
|
-
}
|
|
2579
|
+
const segmentsByMessage = testCase.input.map(
|
|
2580
|
+
(message) => extractContentSegments(message.content)
|
|
2581
|
+
);
|
|
2568
2582
|
const useRoleMarkers = needsRoleMarkers(testCase.input, segmentsByMessage);
|
|
2569
2583
|
let question;
|
|
2570
2584
|
if (useRoleMarkers) {
|
|
@@ -2592,7 +2606,7 @@ ${messageContent}`);
|
|
|
2592
2606
|
question = messageParts.join("\n\n");
|
|
2593
2607
|
} else {
|
|
2594
2608
|
const questionParts = [];
|
|
2595
|
-
for (const segment of testCase.input_segments) {
|
|
2609
|
+
for (const segment of flattenInputMessages(testCase.input)) {
|
|
2596
2610
|
const formattedContent = formatSegment(segment, mode);
|
|
2597
2611
|
if (formattedContent) {
|
|
2598
2612
|
questionParts.push(formattedContent);
|
|
@@ -2679,9 +2693,6 @@ function buildChatPromptFromSegments(options) {
|
|
|
2679
2693
|
}
|
|
2680
2694
|
return chatPrompt.length > 0 ? chatPrompt : void 0;
|
|
2681
2695
|
}
|
|
2682
|
-
function asString5(value) {
|
|
2683
|
-
return typeof value === "string" ? value : void 0;
|
|
2684
|
-
}
|
|
2685
2696
|
|
|
2686
2697
|
// src/evaluation/yaml-parser.ts
|
|
2687
2698
|
var ANSI_YELLOW6 = "\x1B[33m";
|
|
@@ -2764,7 +2775,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2764
2775
|
throw new Error(`Invalid test file format: ${evalFilePath}`);
|
|
2765
2776
|
}
|
|
2766
2777
|
const suite = interpolated;
|
|
2767
|
-
const evalSetNameFromSuite =
|
|
2778
|
+
const evalSetNameFromSuite = asString5(suite.name)?.trim();
|
|
2768
2779
|
const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
2769
2780
|
const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
|
|
2770
2781
|
const rawTestcases = resolveTests(suite);
|
|
@@ -2783,7 +2794,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2783
2794
|
const suiteInputMessages = expandInputShorthand(suite.input);
|
|
2784
2795
|
const suiteInputFiles = suite.input_files;
|
|
2785
2796
|
const rawGlobalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
|
|
2786
|
-
const _globalTarget =
|
|
2797
|
+
const _globalTarget = asString5(rawGlobalExecution?.target) ?? asString5(suite.target);
|
|
2787
2798
|
const suiteAssertions = suite.assertions ?? suite.assert;
|
|
2788
2799
|
if (suite.assert !== void 0 && suite.assertions === void 0) {
|
|
2789
2800
|
logWarning5("'assert' is deprecated at the suite level. Use 'assertions' instead.");
|
|
@@ -2796,17 +2807,17 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2796
2807
|
continue;
|
|
2797
2808
|
}
|
|
2798
2809
|
const evalcase = rawEvalcase;
|
|
2799
|
-
const id =
|
|
2810
|
+
const id = asString5(evalcase.id);
|
|
2800
2811
|
if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
|
|
2801
2812
|
continue;
|
|
2802
2813
|
}
|
|
2803
|
-
const conversationId =
|
|
2804
|
-
let outcome =
|
|
2814
|
+
const conversationId = asString5(evalcase.conversation_id);
|
|
2815
|
+
let outcome = asString5(evalcase.criteria);
|
|
2805
2816
|
if (!outcome && evalcase.expected_outcome !== void 0) {
|
|
2806
|
-
outcome =
|
|
2817
|
+
outcome = asString5(evalcase.expected_outcome);
|
|
2807
2818
|
if (outcome) {
|
|
2808
2819
|
logWarning5(
|
|
2809
|
-
`Test '${
|
|
2820
|
+
`Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
|
|
2810
2821
|
);
|
|
2811
2822
|
}
|
|
2812
2823
|
}
|
|
@@ -2823,10 +2834,9 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2823
2834
|
continue;
|
|
2824
2835
|
}
|
|
2825
2836
|
const effectiveSuiteInputMessages = suiteInputMessages && !skipDefaults ? suiteInputMessages : void 0;
|
|
2826
|
-
const inputMessages = effectiveSuiteInputMessages ? [...effectiveSuiteInputMessages, ...testInputMessages] : testInputMessages;
|
|
2827
2837
|
const hasExpectedMessages = expectedMessages.length > 0;
|
|
2828
2838
|
const inputTextParts = [];
|
|
2829
|
-
const
|
|
2839
|
+
const suiteResolvedInputMessages = effectiveSuiteInputMessages ? await processMessages({
|
|
2830
2840
|
messages: effectiveSuiteInputMessages,
|
|
2831
2841
|
searchRoots,
|
|
2832
2842
|
repoRootPath,
|
|
@@ -2834,7 +2844,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2834
2844
|
messageType: "input",
|
|
2835
2845
|
verbose
|
|
2836
2846
|
}) : [];
|
|
2837
|
-
const
|
|
2847
|
+
const testResolvedInputMessages = await processMessages({
|
|
2838
2848
|
messages: testInputMessages,
|
|
2839
2849
|
searchRoots,
|
|
2840
2850
|
repoRootPath,
|
|
@@ -2842,7 +2852,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2842
2852
|
messageType: "input",
|
|
2843
2853
|
verbose
|
|
2844
2854
|
});
|
|
2845
|
-
const
|
|
2855
|
+
const inputMessages = [...suiteResolvedInputMessages, ...testResolvedInputMessages];
|
|
2846
2856
|
const outputSegments = hasExpectedMessages ? await processExpectedMessages({
|
|
2847
2857
|
messages: expectedMessages,
|
|
2848
2858
|
searchRoots,
|
|
@@ -2880,12 +2890,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2880
2890
|
}
|
|
2881
2891
|
}
|
|
2882
2892
|
warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
|
|
2883
|
-
const userFilePaths = [];
|
|
2884
|
-
for (const segment of inputSegments) {
|
|
2885
|
-
if (segment.type === "file" && typeof segment.resolvedPath === "string") {
|
|
2886
|
-
userFilePaths.push(segment.resolvedPath);
|
|
2887
|
-
}
|
|
2888
|
-
}
|
|
2893
|
+
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
2889
2894
|
const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
|
|
2890
2895
|
const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
|
|
2891
2896
|
const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
|
|
@@ -2896,7 +2901,6 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
2896
2901
|
conversation_id: conversationId,
|
|
2897
2902
|
question,
|
|
2898
2903
|
input: inputMessages,
|
|
2899
|
-
input_segments: inputSegments,
|
|
2900
2904
|
expected_output: outputSegments,
|
|
2901
2905
|
reference_answer: referenceAnswer,
|
|
2902
2906
|
file_paths: userFilePaths,
|
|
@@ -3105,7 +3109,7 @@ function mergeWorkspaceConfigs(suiteLevel, caseLevel) {
|
|
|
3105
3109
|
path: caseLevel.path ?? suiteLevel.path
|
|
3106
3110
|
};
|
|
3107
3111
|
}
|
|
3108
|
-
function
|
|
3112
|
+
function asString5(value) {
|
|
3109
3113
|
return typeof value === "string" ? value : void 0;
|
|
3110
3114
|
}
|
|
3111
3115
|
function logWarning5(message, details) {
|
|
@@ -6857,7 +6861,7 @@ var PiAgentSdkProvider = class {
|
|
|
6857
6861
|
const { Agent, getModel, getEnvApiKey } = await loadPiModules();
|
|
6858
6862
|
const startTimeIso = (/* @__PURE__ */ new Date()).toISOString();
|
|
6859
6863
|
const startMs = Date.now();
|
|
6860
|
-
const providerName = this.config.
|
|
6864
|
+
const providerName = this.config.subprovider ?? "anthropic";
|
|
6861
6865
|
const modelId = this.config.model ?? "claude-sonnet-4-20250514";
|
|
6862
6866
|
const model = getModel(providerName, modelId);
|
|
6863
6867
|
const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
|
|
@@ -6969,7 +6973,7 @@ var PiAgentSdkProvider = class {
|
|
|
6969
6973
|
messages: agentMessages,
|
|
6970
6974
|
systemPrompt,
|
|
6971
6975
|
model: this.config.model,
|
|
6972
|
-
|
|
6976
|
+
subprovider: this.config.subprovider
|
|
6973
6977
|
},
|
|
6974
6978
|
output,
|
|
6975
6979
|
tokenUsage,
|
|
@@ -7205,8 +7209,8 @@ var PiCodingAgentProvider = class {
|
|
|
7205
7209
|
}
|
|
7206
7210
|
buildPiArgs(prompt, inputFiles, _captureFileChanges) {
|
|
7207
7211
|
const args = [];
|
|
7208
|
-
if (this.config.
|
|
7209
|
-
args.push("--provider", this.config.
|
|
7212
|
+
if (this.config.subprovider) {
|
|
7213
|
+
args.push("--provider", this.config.subprovider);
|
|
7210
7214
|
}
|
|
7211
7215
|
if (this.config.model) {
|
|
7212
7216
|
args.push("--model", this.config.model);
|
|
@@ -7264,7 +7268,7 @@ ${prompt}` : prompt;
|
|
|
7264
7268
|
buildEnv() {
|
|
7265
7269
|
const env = { ...process.env };
|
|
7266
7270
|
if (this.config.apiKey) {
|
|
7267
|
-
const provider = this.config.
|
|
7271
|
+
const provider = this.config.subprovider?.toLowerCase() ?? "google";
|
|
7268
7272
|
switch (provider) {
|
|
7269
7273
|
case "google":
|
|
7270
7274
|
case "gemini":
|
|
@@ -10110,7 +10114,8 @@ var freeformEvaluationSchema = z3.object({
|
|
|
10110
10114
|
passed: z3.boolean().describe("Whether this aspect was satisfied"),
|
|
10111
10115
|
evidence: z3.string().describe("Concise evidence (1-2 sentences)").optional()
|
|
10112
10116
|
})
|
|
10113
|
-
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
|
|
10117
|
+
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional(),
|
|
10118
|
+
details: z3.record(z3.unknown()).describe("Optional structured metadata for domain-specific metrics").optional()
|
|
10114
10119
|
});
|
|
10115
10120
|
var rubricCheckResultSchema = z3.object({
|
|
10116
10121
|
id: z3.string().describe("The ID of the rubric item being checked"),
|
|
@@ -10172,7 +10177,7 @@ var LlmGraderEvaluator = class {
|
|
|
10172
10177
|
async evaluateFreeform(context, graderProvider) {
|
|
10173
10178
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
10174
10179
|
const variables = {
|
|
10175
|
-
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.
|
|
10180
|
+
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input, null, 2),
|
|
10176
10181
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
|
|
10177
10182
|
context.evalCase.expected_output,
|
|
10178
10183
|
null,
|
|
@@ -10215,6 +10220,7 @@ ${context.fileChanges}`;
|
|
|
10215
10220
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10216
10221
|
evaluatorRawRequest,
|
|
10217
10222
|
graderTarget: graderProvider.targetName,
|
|
10223
|
+
details: data.details,
|
|
10218
10224
|
tokenUsage
|
|
10219
10225
|
};
|
|
10220
10226
|
} catch (e) {
|
|
@@ -10634,7 +10640,7 @@ ${outputSchema}`;
|
|
|
10634
10640
|
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10635
10641
|
evaluatorRawRequest,
|
|
10636
10642
|
graderTarget,
|
|
10637
|
-
details
|
|
10643
|
+
details: data.details && Object.keys(data.details).length > 0 ? { ...details, ...data.details } : details
|
|
10638
10644
|
};
|
|
10639
10645
|
} catch {
|
|
10640
10646
|
return {
|
|
@@ -10781,7 +10787,8 @@ function buildOutputSchema() {
|
|
|
10781
10787
|
' "passed": <boolean>,',
|
|
10782
10788
|
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
|
|
10783
10789
|
" }",
|
|
10784
|
-
" ]",
|
|
10790
|
+
" ],",
|
|
10791
|
+
' "details": {<optional object with domain-specific structured metrics>}',
|
|
10785
10792
|
"}"
|
|
10786
10793
|
].join("\n");
|
|
10787
10794
|
}
|
|
@@ -12145,7 +12152,7 @@ function assembleLlmGraderPrompt(input) {
|
|
|
12145
12152
|
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
|
|
12146
12153
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
12147
12154
|
const variables = {
|
|
12148
|
-
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.
|
|
12155
|
+
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
|
|
12149
12156
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
|
|
12150
12157
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
|
|
12151
12158
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
@@ -14426,6 +14433,18 @@ var QUALITY_PASS_THRESHOLD = 0.8;
|
|
|
14426
14433
|
function classifyQualityStatus(score) {
|
|
14427
14434
|
return score >= QUALITY_PASS_THRESHOLD ? "ok" : "quality_failure";
|
|
14428
14435
|
}
|
|
14436
|
+
function buildSkippedEvaluatorError(scores) {
|
|
14437
|
+
const skippedScores = scores?.filter((score) => score.verdict === "skip") ?? [];
|
|
14438
|
+
if (skippedScores.length === 0) {
|
|
14439
|
+
return void 0;
|
|
14440
|
+
}
|
|
14441
|
+
const messages = skippedScores.map((score) => {
|
|
14442
|
+
const label = score.name || score.type;
|
|
14443
|
+
const assertionMessage = score.assertions.find((assertion) => !assertion.passed)?.text ?? "Evaluator skipped";
|
|
14444
|
+
return `${label}: ${assertionMessage}`;
|
|
14445
|
+
});
|
|
14446
|
+
return messages.length === 1 ? messages[0] : `Evaluators skipped: ${messages.join(" | ")}`;
|
|
14447
|
+
}
|
|
14429
14448
|
function usesFileReferencePrompt(provider) {
|
|
14430
14449
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
14431
14450
|
}
|
|
@@ -15690,7 +15709,8 @@ async function runEvalCase(options) {
|
|
|
15690
15709
|
durationMs: totalDurationMs,
|
|
15691
15710
|
...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
|
|
15692
15711
|
};
|
|
15693
|
-
const
|
|
15712
|
+
const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
|
|
15713
|
+
const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score);
|
|
15694
15714
|
const finalResult = providerError ? {
|
|
15695
15715
|
...result,
|
|
15696
15716
|
evalRun,
|
|
@@ -15702,7 +15722,26 @@ async function runEvalCase(options) {
|
|
|
15702
15722
|
beforeAllOutput,
|
|
15703
15723
|
beforeEachOutput,
|
|
15704
15724
|
afterEachOutput
|
|
15705
|
-
} :
|
|
15725
|
+
} : skippedEvaluatorError ? {
|
|
15726
|
+
...result,
|
|
15727
|
+
score: 0,
|
|
15728
|
+
evalRun,
|
|
15729
|
+
error: skippedEvaluatorError,
|
|
15730
|
+
executionStatus,
|
|
15731
|
+
failureStage: "evaluator",
|
|
15732
|
+
failureReasonCode: "evaluator_error",
|
|
15733
|
+
executionError: { message: skippedEvaluatorError, stage: "evaluator" },
|
|
15734
|
+
beforeAllOutput,
|
|
15735
|
+
beforeEachOutput,
|
|
15736
|
+
afterEachOutput
|
|
15737
|
+
} : {
|
|
15738
|
+
...result,
|
|
15739
|
+
evalRun,
|
|
15740
|
+
executionStatus,
|
|
15741
|
+
beforeAllOutput,
|
|
15742
|
+
beforeEachOutput,
|
|
15743
|
+
afterEachOutput
|
|
15744
|
+
};
|
|
15706
15745
|
const isFailure = !!finalResult.error || finalResult.score < 0.5;
|
|
15707
15746
|
if (workspacePath && !isSharedWorkspace) {
|
|
15708
15747
|
if (forceCleanup) {
|
|
@@ -16447,11 +16486,6 @@ async function evaluate(config) {
|
|
|
16447
16486
|
evalCases = (config.tests ?? []).map((test) => {
|
|
16448
16487
|
const input = typeof test.input === "string" ? [{ role: "user", content: test.input }] : test.input;
|
|
16449
16488
|
const question = typeof test.input === "string" ? test.input : test.input.find((m) => m.role === "user")?.content ?? "";
|
|
16450
|
-
const inputSegments = input.map((m) => ({
|
|
16451
|
-
type: "text",
|
|
16452
|
-
value: typeof m.content === "string" ? m.content : JSON.stringify(m.content),
|
|
16453
|
-
messageIndex: 0
|
|
16454
|
-
}));
|
|
16455
16489
|
const expectedOutputValue = test.expectedOutput ?? test.expected_output;
|
|
16456
16490
|
const expectedOutput = expectedOutputValue ? [
|
|
16457
16491
|
{ role: "assistant", content: expectedOutputValue }
|
|
@@ -16480,7 +16514,6 @@ async function evaluate(config) {
|
|
|
16480
16514
|
criteria: test.criteria ?? "",
|
|
16481
16515
|
question: String(question),
|
|
16482
16516
|
input,
|
|
16483
|
-
input_segments: inputSegments,
|
|
16484
16517
|
expected_output: expectedOutput,
|
|
16485
16518
|
reference_answer: expectedOutputValue,
|
|
16486
16519
|
file_paths: [],
|