@agentv/core 3.14.6 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-HP5PFOVK.js → chunk-PXYYRDHH.js} +142 -148
- package/dist/chunk-PXYYRDHH.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +9 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +569 -257
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +135 -93
- package/dist/index.d.ts +135 -93
- package/dist/index.js +459 -141
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-HP5PFOVK.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1315,12 +1315,12 @@ function serializeAttributeValue(value) {
|
|
|
1315
1315
|
if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
|
|
1316
1316
|
return { stringValue: String(value) };
|
|
1317
1317
|
}
|
|
1318
|
-
var import_promises35,
|
|
1318
|
+
var import_promises35, import_node_path52, OtlpJsonFileExporter;
|
|
1319
1319
|
var init_otlp_json_file_exporter = __esm({
|
|
1320
1320
|
"src/observability/otlp-json-file-exporter.ts"() {
|
|
1321
1321
|
"use strict";
|
|
1322
1322
|
import_promises35 = require("fs/promises");
|
|
1323
|
-
|
|
1323
|
+
import_node_path52 = require("path");
|
|
1324
1324
|
OtlpJsonFileExporter = class {
|
|
1325
1325
|
// biome-ignore lint/suspicious/noExplicitAny: serialized span data
|
|
1326
1326
|
spans = [];
|
|
@@ -1359,7 +1359,7 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1359
1359
|
}
|
|
1360
1360
|
async flush() {
|
|
1361
1361
|
if (this.spans.length === 0) return;
|
|
1362
|
-
await (0, import_promises35.mkdir)((0,
|
|
1362
|
+
await (0, import_promises35.mkdir)((0, import_node_path52.dirname)(this.filePath), { recursive: true });
|
|
1363
1363
|
const otlpJson = {
|
|
1364
1364
|
resourceSpans: [
|
|
1365
1365
|
{
|
|
@@ -1383,9 +1383,11 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1383
1383
|
// src/index.ts
|
|
1384
1384
|
var index_exports = {};
|
|
1385
1385
|
__export(index_exports, {
|
|
1386
|
+
COMMON_TARGET_SETTINGS: () => COMMON_TARGET_SETTINGS,
|
|
1386
1387
|
CodeEvaluator: () => CodeEvaluator,
|
|
1387
1388
|
CompositeEvaluator: () => CompositeEvaluator,
|
|
1388
1389
|
CostEvaluator: () => CostEvaluator,
|
|
1390
|
+
DEFAULT_CATEGORY: () => DEFAULT_CATEGORY,
|
|
1389
1391
|
DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
|
|
1390
1392
|
DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
|
|
1391
1393
|
DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
|
|
@@ -1439,6 +1441,7 @@ __export(index_exports, {
|
|
|
1439
1441
|
createTempWorkspace: () => createTempWorkspace,
|
|
1440
1442
|
deepEqual: () => deepEqual,
|
|
1441
1443
|
defineConfig: () => defineConfig,
|
|
1444
|
+
deriveCategory: () => deriveCategory,
|
|
1442
1445
|
detectFormat: () => detectFormat,
|
|
1443
1446
|
discoverAssertions: () => discoverAssertions,
|
|
1444
1447
|
discoverCopilotSessions: () => discoverCopilotSessions,
|
|
@@ -1452,7 +1455,9 @@ __export(index_exports, {
|
|
|
1452
1455
|
explorationRatio: () => explorationRatio,
|
|
1453
1456
|
extractCacheConfig: () => extractCacheConfig,
|
|
1454
1457
|
extractFailOnError: () => extractFailOnError,
|
|
1458
|
+
extractImageBlocks: () => extractImageBlocks,
|
|
1455
1459
|
extractJsonBlob: () => extractJsonBlob,
|
|
1460
|
+
extractLastAssistantContent: () => extractLastAssistantContent,
|
|
1456
1461
|
extractTargetFromSuite: () => extractTargetFromSuite,
|
|
1457
1462
|
extractTargetsFromSuite: () => extractTargetsFromSuite,
|
|
1458
1463
|
extractTargetsFromTestCase: () => extractTargetsFromTestCase,
|
|
@@ -1466,12 +1471,15 @@ __export(index_exports, {
|
|
|
1466
1471
|
getAgentvHome: () => getAgentvHome,
|
|
1467
1472
|
getOutputFilenames: () => getOutputFilenames,
|
|
1468
1473
|
getSubagentsRoot: () => getSubagentsRoot,
|
|
1474
|
+
getTextContent: () => getTextContent,
|
|
1469
1475
|
getTraceStateRoot: () => getTraceStateRoot,
|
|
1470
1476
|
getWorkspacePath: () => getWorkspacePath,
|
|
1471
1477
|
getWorkspacePoolRoot: () => getWorkspacePoolRoot,
|
|
1472
1478
|
getWorkspacesRoot: () => getWorkspacesRoot,
|
|
1473
1479
|
initializeBaseline: () => initializeBaseline,
|
|
1474
1480
|
isAgentSkillsFormat: () => isAgentSkillsFormat,
|
|
1481
|
+
isContent: () => isContent,
|
|
1482
|
+
isContentArray: () => isContentArray,
|
|
1475
1483
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
1476
1484
|
isJsonObject: () => isJsonObject,
|
|
1477
1485
|
isJsonValue: () => isJsonValue,
|
|
@@ -1533,6 +1541,29 @@ __export(index_exports, {
|
|
|
1533
1541
|
});
|
|
1534
1542
|
module.exports = __toCommonJS(index_exports);
|
|
1535
1543
|
|
|
1544
|
+
// src/evaluation/content.ts
|
|
1545
|
+
/** Discriminator values accepted for a structured content block. */
var CONTENT_TYPES = /* @__PURE__ */ new Set(["text", "image", "file"]);

/**
 * Type guard for a single structured content block.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value` is a non-null object whose `type`
 *   property is a string listed in CONTENT_TYPES.
 */
function isContent(value) {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const kind = value.type;
  return typeof kind === "string" ? CONTENT_TYPES.has(kind) : false;
}

/**
 * Type guard for a list of structured content blocks.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` only for a non-empty array in which every element
 *   passes `isContent`. An empty array is rejected.
 */
function isContentArray(value) {
  if (!Array.isArray(value) || value.length === 0) {
    return false;
  }
  return value.every((entry) => isContent(entry));
}
|
|
1554
|
+
/**
 * Flattens message content into plain text.
 *
 * @param {unknown} content - A string, an array of content blocks, or nullish.
 * @returns {string} The string itself, the newline-joined `text` of every
 *   `type: "text"` block in an array, or `""` for nullish / unsupported input.
 */
function getTextContent(content) {
  if (content == null) return "";
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";
  const parts = [];
  for (const block of content) {
    // Skip null/undefined elements (reading `.type` on them would throw) and
    // text blocks without a string payload, so malformed entries cannot crash
    // the caller or leak non-string values into the joined output.
    if (block == null || block.type !== "text" || typeof block.text !== "string") {
      continue;
    }
    parts.push(block.text);
  }
  return parts.join("\n");
}
|
|
1566
|
+
|
|
1536
1567
|
// src/evaluation/types.ts
|
|
1537
1568
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
1538
1569
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
@@ -2411,15 +2442,23 @@ var TEMPLATE_VARIABLES = {
|
|
|
2411
2442
|
INPUT: "input",
|
|
2412
2443
|
OUTPUT: "output",
|
|
2413
2444
|
FILE_CHANGES: "file_changes",
|
|
2445
|
+
/** @deprecated Use INPUT instead — resolves to the same text value. */
|
|
2414
2446
|
INPUT_TEXT: "input_text",
|
|
2447
|
+
/** @deprecated Use OUTPUT instead — resolves to the same text value. */
|
|
2415
2448
|
OUTPUT_TEXT: "output_text",
|
|
2449
|
+
/** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
|
|
2416
2450
|
EXPECTED_OUTPUT_TEXT: "expected_output_text"
|
|
2417
2451
|
};
|
|
2418
2452
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
2419
2453
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
2420
|
-
TEMPLATE_VARIABLES.
|
|
2454
|
+
TEMPLATE_VARIABLES.OUTPUT,
|
|
2421
2455
|
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
2422
2456
|
]);
|
|
2457
|
+
var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
|
|
2458
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
|
|
2459
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
|
|
2460
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
|
|
2461
|
+
]);
|
|
2423
2462
|
|
|
2424
2463
|
// src/evaluation/validation/prompt-validator.ts
|
|
2425
2464
|
var ANSI_YELLOW3 = "\x1B[33m";
|
|
@@ -2441,16 +2480,29 @@ function validateTemplateVariables(content, source) {
|
|
|
2441
2480
|
}
|
|
2442
2481
|
match = variablePattern.exec(content);
|
|
2443
2482
|
}
|
|
2444
|
-
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
2445
|
-
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
|
|
2483
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
2484
|
+
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT);
|
|
2446
2485
|
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
|
|
2447
2486
|
if (!hasRequiredFields) {
|
|
2448
2487
|
throw new Error(
|
|
2449
2488
|
`Missing required fields. Must include at least one of:
|
|
2450
|
-
- {{ ${TEMPLATE_VARIABLES.
|
|
2489
|
+
- {{ ${TEMPLATE_VARIABLES.OUTPUT} }}
|
|
2451
2490
|
- {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
|
|
2452
2491
|
);
|
|
2453
2492
|
}
|
|
2493
|
+
const deprecatedUsed = [];
|
|
2494
|
+
for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
|
|
2495
|
+
if (foundVariables.has(deprecated)) {
|
|
2496
|
+
deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
|
|
2497
|
+
}
|
|
2498
|
+
}
|
|
2499
|
+
if (deprecatedUsed.length > 0) {
|
|
2500
|
+
console.warn(
|
|
2501
|
+
`${ANSI_YELLOW3}Warning: Template at ${source} uses deprecated variable names:
|
|
2502
|
+
${deprecatedUsed.join("\n ")}
|
|
2503
|
+
These still work but will be removed in a future version.${ANSI_RESET4}`
|
|
2504
|
+
);
|
|
2505
|
+
}
|
|
2454
2506
|
if (invalidVariables.length > 0) {
|
|
2455
2507
|
const warningMessage = `${ANSI_YELLOW3}Warning: Custom evaluator template at ${source}
|
|
2456
2508
|
Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
|
|
@@ -3868,6 +3920,19 @@ function asString2(value) {
|
|
|
3868
3920
|
}
|
|
3869
3921
|
|
|
3870
3922
|
// src/evaluation/loaders/message-processor.ts
|
|
3923
|
+
/** Maps a lower-case file extension (leading dot included) to its image media type. */
var IMAGE_MEDIA_TYPES = {
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".gif": "image/gif",
  ".webp": "image/webp",
  ".svg": "image/svg+xml",
  ".bmp": "image/bmp"
};

/**
 * Looks up the image media type for a path by its extension, ignoring case.
 *
 * @param {string} filePath - Path or file name to inspect.
 * @returns {string | undefined} The media type, or `undefined` when the
 *   extension is not a key of IMAGE_MEDIA_TYPES.
 */
function detectImageMediaType(filePath) {
  const normalizedExt = import_node_path6.default.extname(filePath).toLowerCase();
  return IMAGE_MEDIA_TYPES[normalizedExt];
}
|
|
3871
3936
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
3872
3937
|
var ANSI_RESET6 = "\x1B[0m";
|
|
3873
3938
|
async function processMessages(options) {
|
|
@@ -3933,6 +3998,47 @@ async function processMessages(options) {
|
|
|
3933
3998
|
}
|
|
3934
3999
|
continue;
|
|
3935
4000
|
}
|
|
4001
|
+
if (segmentType === "image") {
|
|
4002
|
+
const rawValue = asString3(rawSegment.value);
|
|
4003
|
+
if (!rawValue) {
|
|
4004
|
+
continue;
|
|
4005
|
+
}
|
|
4006
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
4007
|
+
rawValue,
|
|
4008
|
+
searchRoots
|
|
4009
|
+
);
|
|
4010
|
+
if (!resolvedPath) {
|
|
4011
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
4012
|
+
const context2 = messageType === "input" ? "" : " in expected_output";
|
|
4013
|
+
logWarning3(`Image file not found${context2}: ${displayPath}`, attempts);
|
|
4014
|
+
continue;
|
|
4015
|
+
}
|
|
4016
|
+
const mediaType = detectImageMediaType(resolvedPath);
|
|
4017
|
+
if (!mediaType) {
|
|
4018
|
+
logWarning3(
|
|
4019
|
+
`Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
|
|
4020
|
+
);
|
|
4021
|
+
continue;
|
|
4022
|
+
}
|
|
4023
|
+
try {
|
|
4024
|
+
const imageBuffer = await (0, import_promises6.readFile)(resolvedPath);
|
|
4025
|
+
const base64 = imageBuffer.toString("base64");
|
|
4026
|
+
processedContent.push({
|
|
4027
|
+
type: "image",
|
|
4028
|
+
media_type: mediaType,
|
|
4029
|
+
source: `data:${mediaType};base64,${base64}`
|
|
4030
|
+
});
|
|
4031
|
+
if (verbose) {
|
|
4032
|
+
const label = messageType === "input" ? "[Image]" : "[Expected Output Image]";
|
|
4033
|
+
console.log(` ${label} Found: ${displayPath}`);
|
|
4034
|
+
console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
|
|
4035
|
+
}
|
|
4036
|
+
} catch (error) {
|
|
4037
|
+
const context2 = messageType === "input" ? "" : " expected output";
|
|
4038
|
+
logWarning3(`Could not read${context2} image ${resolvedPath}: ${error.message}`);
|
|
4039
|
+
}
|
|
4040
|
+
continue;
|
|
4041
|
+
}
|
|
3936
4042
|
const clonedSegment = cloneJsonObject(rawSegment);
|
|
3937
4043
|
processedContent.push(clonedSegment);
|
|
3938
4044
|
const inlineValue = clonedSegment.value;
|
|
@@ -4010,6 +4116,46 @@ async function processExpectedMessages(options) {
|
|
|
4010
4116
|
}
|
|
4011
4117
|
continue;
|
|
4012
4118
|
}
|
|
4119
|
+
if (segmentType === "image") {
|
|
4120
|
+
const rawValue = asString3(rawSegment.value);
|
|
4121
|
+
if (!rawValue) {
|
|
4122
|
+
continue;
|
|
4123
|
+
}
|
|
4124
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
4125
|
+
rawValue,
|
|
4126
|
+
searchRoots
|
|
4127
|
+
);
|
|
4128
|
+
if (!resolvedPath) {
|
|
4129
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
4130
|
+
logWarning3(`Image file not found in expected_output: ${displayPath}`, attempts);
|
|
4131
|
+
continue;
|
|
4132
|
+
}
|
|
4133
|
+
const mediaType = detectImageMediaType(resolvedPath);
|
|
4134
|
+
if (!mediaType) {
|
|
4135
|
+
logWarning3(
|
|
4136
|
+
`Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
|
|
4137
|
+
);
|
|
4138
|
+
continue;
|
|
4139
|
+
}
|
|
4140
|
+
try {
|
|
4141
|
+
const imageBuffer = await (0, import_promises6.readFile)(resolvedPath);
|
|
4142
|
+
const base64 = imageBuffer.toString("base64");
|
|
4143
|
+
processedContent.push({
|
|
4144
|
+
type: "image",
|
|
4145
|
+
media_type: mediaType,
|
|
4146
|
+
source: `data:${mediaType};base64,${base64}`
|
|
4147
|
+
});
|
|
4148
|
+
if (verbose) {
|
|
4149
|
+
console.log(` [Expected Output Image] Found: ${displayPath}`);
|
|
4150
|
+
console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
|
|
4151
|
+
}
|
|
4152
|
+
} catch (error) {
|
|
4153
|
+
logWarning3(
|
|
4154
|
+
`Could not read expected output image ${resolvedPath}: ${error.message}`
|
|
4155
|
+
);
|
|
4156
|
+
}
|
|
4157
|
+
continue;
|
|
4158
|
+
}
|
|
4013
4159
|
processedContent.push(cloneJsonObject(rawSegment));
|
|
4014
4160
|
}
|
|
4015
4161
|
segment.content = processedContent;
|
|
@@ -4256,7 +4402,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4256
4402
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
4257
4403
|
const testCase = {
|
|
4258
4404
|
id,
|
|
4259
|
-
|
|
4405
|
+
dataset: evalSetName,
|
|
4260
4406
|
conversation_id: conversationId,
|
|
4261
4407
|
question,
|
|
4262
4408
|
input: inputMessages,
|
|
@@ -4527,7 +4673,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4527
4673
|
}
|
|
4528
4674
|
const suite = interpolated;
|
|
4529
4675
|
const evalSetNameFromSuite = asString5(suite.name)?.trim();
|
|
4530
|
-
const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
4676
|
+
const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
|
|
4531
4677
|
const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
|
|
4532
4678
|
const rawTestcases = resolveTests(suite);
|
|
4533
4679
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
|
|
@@ -4648,7 +4794,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4648
4794
|
const caseTargets = extractTargetsFromTestCase(evalcase);
|
|
4649
4795
|
const testCase = {
|
|
4650
4796
|
id,
|
|
4651
|
-
|
|
4797
|
+
dataset: evalSetName,
|
|
4798
|
+
category: options?.category,
|
|
4652
4799
|
conversation_id: conversationId,
|
|
4653
4800
|
question,
|
|
4654
4801
|
input: inputMessages,
|
|
@@ -5690,6 +5837,49 @@ var import_node_fs4 = require("fs");
|
|
|
5690
5837
|
var import_promises10 = require("fs/promises");
|
|
5691
5838
|
var import_node_path12 = __toESM(require("path"), 1);
|
|
5692
5839
|
|
|
5840
|
+
// src/evaluation/providers/claude-content.ts
|
|
5841
|
+
/**
 * Converts a provider message's content parts into normalized content blocks.
 *
 * Text parts become `{ type: "text", text }`. Image parts that carry a `source`
 * object become `{ type: "image", media_type, source }`, where `source` is a
 * base64 data URI or a URL string. Every other part type (tool_use,
 * tool_result, unknown) is ignored here.
 *
 * @param {unknown} content - Raw message content.
 * @returns {Array<object> | undefined} The normalized blocks, or `undefined`
 *   when `content` is not an array or yields no image block (text-only content
 *   is left to the plain-text extraction path).
 */
function toContentArray(content) {
  if (!Array.isArray(content)) return void 0;
  const blocks = [];
  let hasNonText = false;
  for (const part of content) {
    if (!part || typeof part !== "object") continue;
    if (part.type === "text" && typeof part.text === "string") {
      blocks.push({ type: "text", text: part.text });
      continue;
    }
    if (part.type !== "image" || typeof part.source !== "object" || part.source === null) {
      continue;
    }
    const src = part.source;
    // The block-level media type wins over the one nested in `source`.
    const mediaType = typeof part.media_type === "string" ? part.media_type : typeof src.media_type === "string" ? src.media_type : "application/octet-stream";
    let source = "";
    if (typeof src.data === "string" && src.data !== "") {
      source = `data:${mediaType};base64,${src.data}`;
    } else if (typeof part.url === "string" && part.url !== "") {
      source = part.url;
    } else if (typeof src.url === "string" && src.url !== "") {
      // NOTE(review): URL-sourced image blocks may nest the URL inside `source`
      // rather than on the block itself — confirm against the provider payload.
      source = src.url;
    }
    if (!source) continue;
    blocks.push({ type: "image", media_type: mediaType, source });
    hasNonText = true;
  }
  return hasNonText ? blocks : void 0;
}
|
|
5863
|
+
/**
 * Extracts the plain text from message content.
 *
 * @param {unknown} content - A string or an array of content parts.
 * @returns {string | undefined} The string itself; for an array, the `text` of
 *   every `type: "text"` part joined with newlines; `undefined` when the input
 *   is neither, or when the array holds no text part.
 */
function extractTextContent(content) {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return void 0;
  const texts = content
    .filter((piece) => Boolean(piece) && typeof piece === "object")
    .filter((piece) => piece.type === "text" && typeof piece.text === "string")
    .map((piece) => piece.text);
  if (texts.length === 0) return void 0;
  return texts.join("\n");
}
|
|
5882
|
+
|
|
5693
5883
|
// src/evaluation/providers/claude-log-tracker.ts
|
|
5694
5884
|
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeLogs");
|
|
5695
5885
|
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeLogSubscribers");
|
|
@@ -5855,11 +6045,12 @@ var ClaudeCliProvider = class {
|
|
|
5855
6045
|
if (betaMessage && typeof betaMessage === "object") {
|
|
5856
6046
|
const msg = betaMessage;
|
|
5857
6047
|
const content = msg.content;
|
|
6048
|
+
const structuredContent = toContentArray(content);
|
|
5858
6049
|
const textContent = extractTextContent(content);
|
|
5859
6050
|
const toolCalls = extractToolCalls(content);
|
|
5860
6051
|
const outputMsg = {
|
|
5861
6052
|
role: "assistant",
|
|
5862
|
-
content: textContent,
|
|
6053
|
+
content: structuredContent ?? textContent,
|
|
5863
6054
|
toolCalls: toolCalls.length > 0 ? toolCalls : void 0
|
|
5864
6055
|
};
|
|
5865
6056
|
output.push(outputMsg);
|
|
@@ -6198,25 +6389,6 @@ function summarizeEvent(event) {
|
|
|
6198
6389
|
return void 0;
|
|
6199
6390
|
}
|
|
6200
6391
|
}
|
|
6201
|
-
function extractTextContent(content) {
|
|
6202
|
-
if (typeof content === "string") {
|
|
6203
|
-
return content;
|
|
6204
|
-
}
|
|
6205
|
-
if (!Array.isArray(content)) {
|
|
6206
|
-
return void 0;
|
|
6207
|
-
}
|
|
6208
|
-
const textParts = [];
|
|
6209
|
-
for (const part of content) {
|
|
6210
|
-
if (!part || typeof part !== "object") {
|
|
6211
|
-
continue;
|
|
6212
|
-
}
|
|
6213
|
-
const p = part;
|
|
6214
|
-
if (p.type === "text" && typeof p.text === "string") {
|
|
6215
|
-
textParts.push(p.text);
|
|
6216
|
-
}
|
|
6217
|
-
}
|
|
6218
|
-
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
6219
|
-
}
|
|
6220
6392
|
function extractToolCalls(content) {
|
|
6221
6393
|
if (!Array.isArray(content)) {
|
|
6222
6394
|
return [];
|
|
@@ -6389,11 +6561,12 @@ var ClaudeSdkProvider = class {
|
|
|
6389
6561
|
if (betaMessage && typeof betaMessage === "object") {
|
|
6390
6562
|
const msg = betaMessage;
|
|
6391
6563
|
const content = msg.content;
|
|
6392
|
-
const
|
|
6564
|
+
const structuredContent = toContentArray(content);
|
|
6565
|
+
const textContent = extractTextContent(content);
|
|
6393
6566
|
const toolCalls = extractToolCalls2(content);
|
|
6394
6567
|
const outputMsg = {
|
|
6395
6568
|
role: "assistant",
|
|
6396
|
-
content: textContent,
|
|
6569
|
+
content: structuredContent ?? textContent,
|
|
6397
6570
|
toolCalls: toolCalls.length > 0 ? toolCalls : void 0
|
|
6398
6571
|
};
|
|
6399
6572
|
output.push(outputMsg);
|
|
@@ -6511,25 +6684,6 @@ var ClaudeSdkProvider = class {
|
|
|
6511
6684
|
}
|
|
6512
6685
|
}
|
|
6513
6686
|
};
|
|
6514
|
-
function extractTextContent2(content) {
|
|
6515
|
-
if (typeof content === "string") {
|
|
6516
|
-
return content;
|
|
6517
|
-
}
|
|
6518
|
-
if (!Array.isArray(content)) {
|
|
6519
|
-
return void 0;
|
|
6520
|
-
}
|
|
6521
|
-
const textParts = [];
|
|
6522
|
-
for (const part of content) {
|
|
6523
|
-
if (!part || typeof part !== "object") {
|
|
6524
|
-
continue;
|
|
6525
|
-
}
|
|
6526
|
-
const p = part;
|
|
6527
|
-
if (p.type === "text" && typeof p.text === "string") {
|
|
6528
|
-
textParts.push(p.text);
|
|
6529
|
-
}
|
|
6530
|
-
}
|
|
6531
|
-
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
6532
|
-
}
|
|
6533
6687
|
function extractToolCalls2(content) {
|
|
6534
6688
|
if (!Array.isArray(content)) {
|
|
6535
6689
|
return [];
|
|
@@ -6753,7 +6907,7 @@ function convertMessages(messages) {
|
|
|
6753
6907
|
return messages.map((msg) => ({
|
|
6754
6908
|
role: msg.role,
|
|
6755
6909
|
name: msg.name,
|
|
6756
|
-
content: msg.content,
|
|
6910
|
+
content: isContentArray(msg.content) ? msg.content : typeof msg.content === "string" ? msg.content : void 0,
|
|
6757
6911
|
toolCalls: msg.tool_calls?.map((tc) => ({
|
|
6758
6912
|
tool: tc.tool,
|
|
6759
6913
|
input: tc.input,
|
|
@@ -9007,6 +9161,35 @@ function extractPiTextContent(content) {
|
|
|
9007
9161
|
}
|
|
9008
9162
|
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
9009
9163
|
}
|
|
9164
|
+
/**
 * Converts a Pi agent message's content parts into normalized content blocks.
 *
 * Text parts become `{ type: "text", text }`. Image parts become
 * `{ type: "image", media_type, source }`, where `source` is a base64 data URI
 * built from `part.source.data`, or else the block-level `part.url`. Every
 * other part type (tool_use, tool_result, unknown) is ignored here.
 *
 * @param {unknown} content - Raw message content.
 * @returns {Array<object> | undefined} The normalized blocks, or `undefined`
 *   when `content` is not an array or yields no image block.
 */
function toPiContentArray(content) {
  if (!Array.isArray(content)) return void 0;
  const blocks = [];
  let hasNonText = false;
  for (const part of content) {
    if (!part || typeof part !== "object") continue;
    if (part.type === "text" && typeof part.text === "string") {
      blocks.push({ type: "text", text: part.text });
      continue;
    }
    if (part.type !== "image") continue;
    const src = typeof part.source === "object" && part.source !== null ? part.source : void 0;
    // Resolve the media type once so the reported `media_type` always agrees
    // with the type embedded in the data URI. The nested `source.media_type`
    // keeps the precedence it already had when building the data URI.
    const mediaType = typeof src?.media_type === "string" ? src.media_type : typeof part.media_type === "string" ? part.media_type : "application/octet-stream";
    let source = "";
    // Empty `data` is treated as missing so the URL fallback can apply instead
    // of emitting a payload-less data URI.
    if (src && typeof src.data === "string" && src.data !== "") {
      source = `data:${mediaType};base64,${src.data}`;
    }
    if (!source && typeof part.url === "string") {
      source = part.url;
    }
    if (!source) continue;
    blocks.push({ type: "image", media_type: mediaType, source });
    hasNonText = true;
  }
  return hasNonText ? blocks : void 0;
}
|
|
9010
9193
|
function toFiniteNumber(value) {
|
|
9011
9194
|
if (typeof value === "number" && Number.isFinite(value)) return value;
|
|
9012
9195
|
return void 0;
|
|
@@ -10178,7 +10361,8 @@ function convertAgentMessage(message, toolTrackers, completedToolResults) {
|
|
|
10178
10361
|
}
|
|
10179
10362
|
const msg = message;
|
|
10180
10363
|
const role = typeof msg.role === "string" ? msg.role : "unknown";
|
|
10181
|
-
const
|
|
10364
|
+
const structuredContent = toPiContentArray(msg.content);
|
|
10365
|
+
const content = structuredContent ?? extractPiTextContent(msg.content);
|
|
10182
10366
|
const toolCalls = extractToolCalls4(msg.content, toolTrackers, completedToolResults);
|
|
10183
10367
|
const startTimeVal = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
10184
10368
|
let msgTokenUsage;
|
|
@@ -10440,6 +10624,12 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
|
10440
10624
|
"FILES",
|
|
10441
10625
|
"OUTPUT_FILE"
|
|
10442
10626
|
]);
|
|
10627
|
+
/**
 * Setting keys listed as common to targets; each appears in both its
 * snake_case and camelCase spelling.
 */
var COMMON_TARGET_SETTINGS = ["provider_batching", "providerBatching", "subagent_mode_allowed", "subagentModeAllowed"];
|
|
10443
10633
|
var BASE_TARGET_SCHEMA = import_zod3.z.object({
|
|
10444
10634
|
name: import_zod3.z.string().min(1, "target name is required"),
|
|
10445
10635
|
provider: import_zod3.z.string().min(1, "provider is required"),
|
|
@@ -10448,7 +10638,8 @@ var BASE_TARGET_SCHEMA = import_zod3.z.object({
|
|
|
10448
10638
|
// backward compat
|
|
10449
10639
|
workers: import_zod3.z.number().int().min(1).optional(),
|
|
10450
10640
|
workspace_template: import_zod3.z.string().optional(),
|
|
10451
|
-
workspaceTemplate: import_zod3.z.string().optional()
|
|
10641
|
+
workspaceTemplate: import_zod3.z.string().optional(),
|
|
10642
|
+
subagent_mode_allowed: import_zod3.z.boolean().optional()
|
|
10452
10643
|
}).passthrough();
|
|
10453
10644
|
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
10454
10645
|
var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
|
|
@@ -10511,42 +10702,40 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10511
10702
|
const providerBatching = resolveOptionalBoolean(
|
|
10512
10703
|
parsed.provider_batching ?? parsed.providerBatching
|
|
10513
10704
|
);
|
|
10705
|
+
const subagentModeAllowed = resolveOptionalBoolean(
|
|
10706
|
+
parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
|
|
10707
|
+
);
|
|
10708
|
+
const base = {
|
|
10709
|
+
name: parsed.name,
|
|
10710
|
+
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10711
|
+
workers: parsed.workers,
|
|
10712
|
+
providerBatching,
|
|
10713
|
+
subagentModeAllowed
|
|
10714
|
+
};
|
|
10514
10715
|
switch (provider) {
|
|
10515
10716
|
case "openai":
|
|
10516
10717
|
return {
|
|
10517
10718
|
kind: "openai",
|
|
10518
|
-
|
|
10519
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10520
|
-
workers: parsed.workers,
|
|
10521
|
-
providerBatching,
|
|
10719
|
+
...base,
|
|
10522
10720
|
config: resolveOpenAIConfig(parsed, env)
|
|
10523
10721
|
};
|
|
10524
10722
|
case "openrouter":
|
|
10525
10723
|
return {
|
|
10526
10724
|
kind: "openrouter",
|
|
10527
|
-
|
|
10528
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10529
|
-
workers: parsed.workers,
|
|
10530
|
-
providerBatching,
|
|
10725
|
+
...base,
|
|
10531
10726
|
config: resolveOpenRouterConfig(parsed, env)
|
|
10532
10727
|
};
|
|
10533
10728
|
case "azure":
|
|
10534
10729
|
case "azure-openai":
|
|
10535
10730
|
return {
|
|
10536
10731
|
kind: "azure",
|
|
10537
|
-
|
|
10538
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10539
|
-
workers: parsed.workers,
|
|
10540
|
-
providerBatching,
|
|
10732
|
+
...base,
|
|
10541
10733
|
config: resolveAzureConfig(parsed, env)
|
|
10542
10734
|
};
|
|
10543
10735
|
case "anthropic":
|
|
10544
10736
|
return {
|
|
10545
10737
|
kind: "anthropic",
|
|
10546
|
-
|
|
10547
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10548
|
-
workers: parsed.workers,
|
|
10549
|
-
providerBatching,
|
|
10738
|
+
...base,
|
|
10550
10739
|
config: resolveAnthropicConfig(parsed, env)
|
|
10551
10740
|
};
|
|
10552
10741
|
case "gemini":
|
|
@@ -10554,68 +10743,47 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10554
10743
|
case "google-gemini":
|
|
10555
10744
|
return {
|
|
10556
10745
|
kind: "gemini",
|
|
10557
|
-
|
|
10558
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10559
|
-
workers: parsed.workers,
|
|
10560
|
-
providerBatching,
|
|
10746
|
+
...base,
|
|
10561
10747
|
config: resolveGeminiConfig(parsed, env)
|
|
10562
10748
|
};
|
|
10563
10749
|
case "codex":
|
|
10564
10750
|
case "codex-cli":
|
|
10565
10751
|
return {
|
|
10566
10752
|
kind: "codex",
|
|
10567
|
-
|
|
10568
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10569
|
-
workers: parsed.workers,
|
|
10570
|
-
providerBatching,
|
|
10753
|
+
...base,
|
|
10571
10754
|
config: resolveCodexConfig(parsed, env, evalFilePath)
|
|
10572
10755
|
};
|
|
10573
10756
|
case "copilot-sdk":
|
|
10574
10757
|
case "copilot_sdk":
|
|
10575
10758
|
return {
|
|
10576
10759
|
kind: "copilot-sdk",
|
|
10577
|
-
|
|
10578
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10579
|
-
workers: parsed.workers,
|
|
10580
|
-
providerBatching,
|
|
10760
|
+
...base,
|
|
10581
10761
|
config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
|
|
10582
10762
|
};
|
|
10583
10763
|
case "copilot":
|
|
10584
10764
|
case "copilot-cli":
|
|
10585
10765
|
return {
|
|
10586
10766
|
kind: "copilot-cli",
|
|
10587
|
-
|
|
10588
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10589
|
-
workers: parsed.workers,
|
|
10590
|
-
providerBatching,
|
|
10767
|
+
...base,
|
|
10591
10768
|
config: resolveCopilotCliConfig(parsed, env, evalFilePath)
|
|
10592
10769
|
};
|
|
10593
10770
|
case "copilot-log":
|
|
10594
10771
|
return {
|
|
10595
10772
|
kind: "copilot-log",
|
|
10596
|
-
|
|
10597
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10598
|
-
workers: parsed.workers,
|
|
10599
|
-
providerBatching,
|
|
10773
|
+
...base,
|
|
10600
10774
|
config: resolveCopilotLogConfig(parsed, env)
|
|
10601
10775
|
};
|
|
10602
10776
|
case "pi":
|
|
10603
10777
|
case "pi-coding-agent":
|
|
10604
10778
|
return {
|
|
10605
10779
|
kind: "pi-coding-agent",
|
|
10606
|
-
|
|
10607
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10608
|
-
workers: parsed.workers,
|
|
10609
|
-
providerBatching,
|
|
10780
|
+
...base,
|
|
10610
10781
|
config: resolvePiCodingAgentConfig(parsed, env, evalFilePath)
|
|
10611
10782
|
};
|
|
10612
10783
|
case "pi-cli":
|
|
10613
10784
|
return {
|
|
10614
10785
|
kind: "pi-cli",
|
|
10615
|
-
|
|
10616
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10617
|
-
workers: parsed.workers,
|
|
10618
|
-
providerBatching,
|
|
10786
|
+
...base,
|
|
10619
10787
|
config: resolvePiCliConfig(parsed, env, evalFilePath)
|
|
10620
10788
|
};
|
|
10621
10789
|
case "claude":
|
|
@@ -10623,38 +10791,26 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10623
10791
|
case "claude-cli":
|
|
10624
10792
|
return {
|
|
10625
10793
|
kind: "claude-cli",
|
|
10626
|
-
|
|
10627
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10628
|
-
workers: parsed.workers,
|
|
10629
|
-
providerBatching,
|
|
10794
|
+
...base,
|
|
10630
10795
|
config: resolveClaudeConfig(parsed, env, evalFilePath)
|
|
10631
10796
|
};
|
|
10632
10797
|
case "claude-sdk":
|
|
10633
10798
|
return {
|
|
10634
10799
|
kind: "claude-sdk",
|
|
10635
|
-
|
|
10636
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10637
|
-
workers: parsed.workers,
|
|
10638
|
-
providerBatching,
|
|
10800
|
+
...base,
|
|
10639
10801
|
config: resolveClaudeConfig(parsed, env, evalFilePath)
|
|
10640
10802
|
};
|
|
10641
10803
|
case "mock":
|
|
10642
10804
|
return {
|
|
10643
10805
|
kind: "mock",
|
|
10644
|
-
|
|
10645
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10646
|
-
workers: parsed.workers,
|
|
10647
|
-
providerBatching,
|
|
10806
|
+
...base,
|
|
10648
10807
|
config: resolveMockConfig(parsed)
|
|
10649
10808
|
};
|
|
10650
10809
|
case "vscode":
|
|
10651
10810
|
case "vscode-insiders":
|
|
10652
10811
|
return {
|
|
10653
10812
|
kind: provider,
|
|
10654
|
-
|
|
10655
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10656
|
-
workers: parsed.workers,
|
|
10657
|
-
providerBatching,
|
|
10813
|
+
...base,
|
|
10658
10814
|
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders", evalFilePath)
|
|
10659
10815
|
};
|
|
10660
10816
|
case "agentv": {
|
|
@@ -10667,29 +10823,21 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10667
10823
|
const temperature = typeof parsed.temperature === "number" ? parsed.temperature : 0;
|
|
10668
10824
|
return {
|
|
10669
10825
|
kind: "agentv",
|
|
10670
|
-
|
|
10671
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10826
|
+
...base,
|
|
10672
10827
|
workers: typeof parsed.workers === "number" ? parsed.workers : void 0,
|
|
10673
|
-
providerBatching,
|
|
10674
10828
|
config: { model, temperature }
|
|
10675
10829
|
};
|
|
10676
10830
|
}
|
|
10677
10831
|
case "cli":
|
|
10678
10832
|
return {
|
|
10679
10833
|
kind: "cli",
|
|
10680
|
-
|
|
10681
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10682
|
-
workers: parsed.workers,
|
|
10683
|
-
providerBatching,
|
|
10834
|
+
...base,
|
|
10684
10835
|
config: resolveCliConfig(parsed, env, evalFilePath)
|
|
10685
10836
|
};
|
|
10686
10837
|
default:
|
|
10687
10838
|
return {
|
|
10688
10839
|
kind: "cli",
|
|
10689
|
-
|
|
10690
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10691
|
-
workers: parsed.workers,
|
|
10692
|
-
providerBatching,
|
|
10840
|
+
...base,
|
|
10693
10841
|
config: resolveDiscoveredProviderConfig(parsed, provider, env, evalFilePath)
|
|
10694
10842
|
};
|
|
10695
10843
|
}
|
|
@@ -11317,8 +11465,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
11317
11465
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
11318
11466
|
if (!parseResult.success) {
|
|
11319
11467
|
const firstError = parseResult.error.errors[0];
|
|
11320
|
-
const
|
|
11321
|
-
const prefix =
|
|
11468
|
+
const path51 = firstError?.path.join(".") || "";
|
|
11469
|
+
const prefix = path51 ? `${target.name} ${path51}: ` : `${target.name}: `;
|
|
11322
11470
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
11323
11471
|
}
|
|
11324
11472
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -13007,6 +13155,41 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
13007
13155
|
}
|
|
13008
13156
|
}
|
|
13009
13157
|
|
|
13158
|
+
// src/evaluation/providers/types.ts
|
|
13159
|
+
var AGENT_PROVIDER_KINDS = [
|
|
13160
|
+
"codex",
|
|
13161
|
+
"copilot-sdk",
|
|
13162
|
+
"copilot-cli",
|
|
13163
|
+
"pi-coding-agent",
|
|
13164
|
+
"pi-cli",
|
|
13165
|
+
"claude",
|
|
13166
|
+
"claude-cli",
|
|
13167
|
+
"claude-sdk",
|
|
13168
|
+
"vscode",
|
|
13169
|
+
"vscode-insiders"
|
|
13170
|
+
];
|
|
13171
|
+
function extractLastAssistantContent(messages) {
|
|
13172
|
+
if (!messages || messages.length === 0) {
|
|
13173
|
+
return "";
|
|
13174
|
+
}
|
|
13175
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
13176
|
+
const msg = messages[i];
|
|
13177
|
+
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
13178
|
+
if (typeof msg.content === "string") {
|
|
13179
|
+
return msg.content;
|
|
13180
|
+
}
|
|
13181
|
+
if (isContentArray(msg.content)) {
|
|
13182
|
+
return getTextContent(msg.content);
|
|
13183
|
+
}
|
|
13184
|
+
return JSON.stringify(msg.content);
|
|
13185
|
+
}
|
|
13186
|
+
}
|
|
13187
|
+
return "";
|
|
13188
|
+
}
|
|
13189
|
+
function isAgentProvider(provider) {
|
|
13190
|
+
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
13191
|
+
}
|
|
13192
|
+
|
|
13010
13193
|
// src/evaluation/providers/targets-file.ts
|
|
13011
13194
|
var import_node_fs11 = require("fs");
|
|
13012
13195
|
var import_promises27 = require("fs/promises");
|
|
@@ -13319,13 +13502,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
13319
13502
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
13320
13503
|
const { mkdir: mkdir17, readFile: readFile16, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
|
|
13321
13504
|
const { tmpdir: tmpdir3 } = await import("os");
|
|
13322
|
-
const
|
|
13505
|
+
const path51 = await import("path");
|
|
13323
13506
|
const { randomUUID: randomUUID10 } = await import("crypto");
|
|
13324
|
-
const dir =
|
|
13507
|
+
const dir = path51.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
|
|
13325
13508
|
await mkdir17(dir, { recursive: true });
|
|
13326
|
-
const stdinPath =
|
|
13327
|
-
const stdoutPath =
|
|
13328
|
-
const stderrPath =
|
|
13509
|
+
const stdinPath = path51.join(dir, "stdin.txt");
|
|
13510
|
+
const stdoutPath = path51.join(dir, "stdout.txt");
|
|
13511
|
+
const stderrPath = path51.join(dir, "stderr.txt");
|
|
13329
13512
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
13330
13513
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
13331
13514
|
const { spawn: spawn5 } = await import("child_process");
|
|
@@ -13457,7 +13640,7 @@ async function createTargetProxy(options) {
|
|
|
13457
13640
|
totalOutputTokens += response.tokenUsage.output;
|
|
13458
13641
|
}
|
|
13459
13642
|
const output = response.output ?? [];
|
|
13460
|
-
const rawText =
|
|
13643
|
+
const rawText = extractLastAssistantContent2(output);
|
|
13461
13644
|
const result = {
|
|
13462
13645
|
output,
|
|
13463
13646
|
rawText,
|
|
@@ -13515,7 +13698,7 @@ async function createTargetProxy(options) {
|
|
|
13515
13698
|
const output = response.output ?? [];
|
|
13516
13699
|
responses.push({
|
|
13517
13700
|
output,
|
|
13518
|
-
rawText:
|
|
13701
|
+
rawText: extractLastAssistantContent2(output),
|
|
13519
13702
|
tokenUsage: response.tokenUsage
|
|
13520
13703
|
});
|
|
13521
13704
|
} catch (error) {
|
|
@@ -13572,7 +13755,7 @@ function readBody(req) {
|
|
|
13572
13755
|
req.on("error", reject);
|
|
13573
13756
|
});
|
|
13574
13757
|
}
|
|
13575
|
-
function
|
|
13758
|
+
function extractLastAssistantContent2(messages) {
|
|
13576
13759
|
for (let i = messages.length - 1; i >= 0; i--) {
|
|
13577
13760
|
const msg = messages[i];
|
|
13578
13761
|
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
@@ -13641,6 +13824,56 @@ function toCamelCaseDeep(obj) {
|
|
|
13641
13824
|
|
|
13642
13825
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
13643
13826
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
13827
|
+
var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
|
|
13828
|
+
async function materializeContentForGrader(messages, getWorkDir) {
|
|
13829
|
+
if (!messages || messages.length === 0) return messages ?? null;
|
|
13830
|
+
let hasAnyImage = false;
|
|
13831
|
+
for (const msg of messages) {
|
|
13832
|
+
if (isContentArray(msg.content)) {
|
|
13833
|
+
for (const block of msg.content) {
|
|
13834
|
+
if (block.type === "image") {
|
|
13835
|
+
hasAnyImage = true;
|
|
13836
|
+
break;
|
|
13837
|
+
}
|
|
13838
|
+
}
|
|
13839
|
+
}
|
|
13840
|
+
if (hasAnyImage) break;
|
|
13841
|
+
}
|
|
13842
|
+
if (!hasAnyImage) return messages;
|
|
13843
|
+
let counter = 0;
|
|
13844
|
+
const result = [];
|
|
13845
|
+
for (const msg of messages) {
|
|
13846
|
+
if (!isContentArray(msg.content)) {
|
|
13847
|
+
result.push(msg);
|
|
13848
|
+
continue;
|
|
13849
|
+
}
|
|
13850
|
+
if (!msg.content.some((b) => b.type === "image")) {
|
|
13851
|
+
result.push(msg);
|
|
13852
|
+
continue;
|
|
13853
|
+
}
|
|
13854
|
+
const blocks = [];
|
|
13855
|
+
for (const block of msg.content) {
|
|
13856
|
+
if (block.type !== "image") {
|
|
13857
|
+
blocks.push({ ...block });
|
|
13858
|
+
continue;
|
|
13859
|
+
}
|
|
13860
|
+
const img = block;
|
|
13861
|
+
const match = DATA_URI_RE.exec(img.source);
|
|
13862
|
+
if (match) {
|
|
13863
|
+
const [, mediaType, base64Data] = match;
|
|
13864
|
+
const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
|
|
13865
|
+
const dir = await getWorkDir();
|
|
13866
|
+
const filePath = (0, import_node_path38.join)(dir, `img-${counter++}.${ext}`);
|
|
13867
|
+
await (0, import_promises28.writeFile)(filePath, Buffer.from(base64Data, "base64"));
|
|
13868
|
+
blocks.push({ type: "image", media_type: img.media_type, path: filePath });
|
|
13869
|
+
} else {
|
|
13870
|
+
blocks.push({ type: "image", media_type: img.media_type, path: img.source });
|
|
13871
|
+
}
|
|
13872
|
+
}
|
|
13873
|
+
result.push({ ...msg, content: blocks });
|
|
13874
|
+
}
|
|
13875
|
+
return result;
|
|
13876
|
+
}
|
|
13644
13877
|
var CodeEvaluator = class {
|
|
13645
13878
|
kind = "code-grader";
|
|
13646
13879
|
command;
|
|
@@ -13656,7 +13889,18 @@ var CodeEvaluator = class {
|
|
|
13656
13889
|
this.target = options.target;
|
|
13657
13890
|
}
|
|
13658
13891
|
async evaluate(context2) {
|
|
13659
|
-
let
|
|
13892
|
+
let imageTmpDir;
|
|
13893
|
+
const getImageDir = async () => {
|
|
13894
|
+
if (!imageTmpDir) {
|
|
13895
|
+
imageTmpDir = await (0, import_promises28.mkdtemp)((0, import_node_path38.join)((0, import_node_os7.tmpdir)(), "agentv-img-"));
|
|
13896
|
+
}
|
|
13897
|
+
return imageTmpDir;
|
|
13898
|
+
};
|
|
13899
|
+
const materializedOutput = await materializeContentForGrader(
|
|
13900
|
+
context2.output,
|
|
13901
|
+
getImageDir
|
|
13902
|
+
);
|
|
13903
|
+
let outputForPayload = materializedOutput;
|
|
13660
13904
|
let outputPath;
|
|
13661
13905
|
if (outputForPayload) {
|
|
13662
13906
|
const serialized = JSON.stringify(outputForPayload);
|
|
@@ -13669,12 +13913,17 @@ var CodeEvaluator = class {
|
|
|
13669
13913
|
}
|
|
13670
13914
|
const payload = {
|
|
13671
13915
|
criteria: context2.evalCase.criteria,
|
|
13672
|
-
expectedOutput:
|
|
13673
|
-
|
|
13916
|
+
expectedOutput: await materializeContentForGrader(
|
|
13917
|
+
context2.evalCase.expected_output,
|
|
13918
|
+
getImageDir
|
|
13919
|
+
),
|
|
13674
13920
|
output: outputForPayload,
|
|
13675
13921
|
outputPath,
|
|
13676
13922
|
inputFiles: context2.evalCase.file_paths,
|
|
13677
|
-
input:
|
|
13923
|
+
input: await materializeContentForGrader(
|
|
13924
|
+
context2.evalCase.input,
|
|
13925
|
+
getImageDir
|
|
13926
|
+
),
|
|
13678
13927
|
trace: context2.trace ?? null,
|
|
13679
13928
|
tokenUsage: context2.tokenUsage ?? null,
|
|
13680
13929
|
costUsd: context2.costUsd ?? null,
|
|
@@ -13683,9 +13932,7 @@ var CodeEvaluator = class {
|
|
|
13683
13932
|
endTime: context2.endTime ?? null,
|
|
13684
13933
|
fileChanges: context2.fileChanges ?? null,
|
|
13685
13934
|
workspacePath: context2.workspacePath ?? null,
|
|
13686
|
-
config: this.config ?? null
|
|
13687
|
-
inputText: context2.evalCase.question,
|
|
13688
|
-
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
13935
|
+
config: this.config ?? null
|
|
13689
13936
|
};
|
|
13690
13937
|
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
13691
13938
|
let proxyEnv;
|
|
@@ -13775,6 +14022,10 @@ var CodeEvaluator = class {
|
|
|
13775
14022
|
await (0, import_promises28.rm)((0, import_node_path38.dirname)(outputPath), { recursive: true, force: true }).catch(() => {
|
|
13776
14023
|
});
|
|
13777
14024
|
}
|
|
14025
|
+
if (imageTmpDir) {
|
|
14026
|
+
await (0, import_promises28.rm)(imageTmpDir, { recursive: true, force: true }).catch(() => {
|
|
14027
|
+
});
|
|
14028
|
+
}
|
|
13778
14029
|
}
|
|
13779
14030
|
}
|
|
13780
14031
|
};
|
|
@@ -13802,38 +14053,6 @@ ${tail}`;
|
|
|
13802
14053
|
// src/evaluation/evaluators/composite.ts
|
|
13803
14054
|
var import_ai3 = require("ai");
|
|
13804
14055
|
|
|
13805
|
-
// src/evaluation/providers/types.ts
|
|
13806
|
-
var AGENT_PROVIDER_KINDS = [
|
|
13807
|
-
"codex",
|
|
13808
|
-
"copilot-sdk",
|
|
13809
|
-
"copilot-cli",
|
|
13810
|
-
"pi-coding-agent",
|
|
13811
|
-
"pi-cli",
|
|
13812
|
-
"claude",
|
|
13813
|
-
"claude-cli",
|
|
13814
|
-
"claude-sdk",
|
|
13815
|
-
"vscode",
|
|
13816
|
-
"vscode-insiders"
|
|
13817
|
-
];
|
|
13818
|
-
function extractLastAssistantContent2(messages) {
|
|
13819
|
-
if (!messages || messages.length === 0) {
|
|
13820
|
-
return "";
|
|
13821
|
-
}
|
|
13822
|
-
for (let i = messages.length - 1; i >= 0; i--) {
|
|
13823
|
-
const msg = messages[i];
|
|
13824
|
-
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
13825
|
-
if (typeof msg.content === "string") {
|
|
13826
|
-
return msg.content;
|
|
13827
|
-
}
|
|
13828
|
-
return JSON.stringify(msg.content);
|
|
13829
|
-
}
|
|
13830
|
-
}
|
|
13831
|
-
return "";
|
|
13832
|
-
}
|
|
13833
|
-
function isAgentProvider(provider) {
|
|
13834
|
-
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
13835
|
-
}
|
|
13836
|
-
|
|
13837
14056
|
// src/evaluation/evaluators/llm-grader.ts
|
|
13838
14057
|
var import_promises29 = __toESM(require("fs/promises"), 1);
|
|
13839
14058
|
var import_node_path39 = __toESM(require("path"), 1);
|
|
@@ -13884,13 +14103,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
13884
14103
|
{{${TEMPLATE_VARIABLES.CRITERIA}}}
|
|
13885
14104
|
|
|
13886
14105
|
[[ ## question ## ]]
|
|
13887
|
-
{{${TEMPLATE_VARIABLES.
|
|
14106
|
+
{{${TEMPLATE_VARIABLES.INPUT}}}
|
|
13888
14107
|
|
|
13889
14108
|
[[ ## reference_answer ## ]]
|
|
13890
|
-
{{${TEMPLATE_VARIABLES.
|
|
14109
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}}
|
|
13891
14110
|
|
|
13892
14111
|
[[ ## answer ## ]]
|
|
13893
|
-
{{${TEMPLATE_VARIABLES.
|
|
14112
|
+
{{${TEMPLATE_VARIABLES.OUTPUT}}}`;
|
|
13894
14113
|
var freeformEvaluationSchema = import_zod4.z.object({
|
|
13895
14114
|
score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
13896
14115
|
assertions: import_zod4.z.array(
|
|
@@ -13962,21 +14181,19 @@ var LlmGraderEvaluator = class {
|
|
|
13962
14181
|
async evaluateFreeform(context2, graderProvider) {
|
|
13963
14182
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
13964
14183
|
const variables = {
|
|
13965
|
-
[TEMPLATE_VARIABLES.INPUT]:
|
|
13966
|
-
[TEMPLATE_VARIABLES.
|
|
13967
|
-
|
|
13968
|
-
null,
|
|
13969
|
-
2
|
|
13970
|
-
),
|
|
13971
|
-
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
|
|
14184
|
+
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
14185
|
+
[TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
|
|
14186
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13972
14187
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
13973
14188
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
14189
|
+
// Deprecated aliases — same values as the primary variables above
|
|
13974
14190
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
13975
14191
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
13976
14192
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
13977
14193
|
};
|
|
13978
14194
|
const systemPrompt = buildOutputSchema();
|
|
13979
14195
|
const evaluatorTemplate = context2.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
14196
|
+
warnDeprecatedTemplateVars(evaluatorTemplate);
|
|
13980
14197
|
let userPrompt = substituteVariables(evaluatorTemplate, variables);
|
|
13981
14198
|
if (context2.fileChanges && !context2.evaluatorTemplateOverride && !this.evaluatorTemplate) {
|
|
13982
14199
|
userPrompt += `
|
|
@@ -13988,13 +14205,15 @@ ${context2.fileChanges}`;
|
|
|
13988
14205
|
userPrompt,
|
|
13989
14206
|
systemPrompt
|
|
13990
14207
|
};
|
|
14208
|
+
const images = context2.output ? extractImageBlocks(context2.output) : [];
|
|
13991
14209
|
try {
|
|
13992
14210
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
13993
14211
|
context: context2,
|
|
13994
14212
|
graderProvider,
|
|
13995
14213
|
systemPrompt,
|
|
13996
14214
|
userPrompt,
|
|
13997
|
-
schema: freeformEvaluationSchema
|
|
14215
|
+
schema: freeformEvaluationSchema,
|
|
14216
|
+
images
|
|
13998
14217
|
});
|
|
13999
14218
|
const score = clampScore(data.score);
|
|
14000
14219
|
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
@@ -14038,13 +14257,15 @@ ${context2.fileChanges}`;
|
|
|
14038
14257
|
userPrompt: prompt,
|
|
14039
14258
|
systemPrompt
|
|
14040
14259
|
};
|
|
14260
|
+
const images = context2.output ? extractImageBlocks(context2.output) : [];
|
|
14041
14261
|
try {
|
|
14042
14262
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
14043
14263
|
context: context2,
|
|
14044
14264
|
graderProvider,
|
|
14045
14265
|
systemPrompt,
|
|
14046
14266
|
userPrompt: prompt,
|
|
14047
|
-
schema: rubricEvaluationSchema
|
|
14267
|
+
schema: rubricEvaluationSchema,
|
|
14268
|
+
images
|
|
14048
14269
|
});
|
|
14049
14270
|
const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
|
|
14050
14271
|
return {
|
|
@@ -14081,13 +14302,15 @@ ${context2.fileChanges}`;
|
|
|
14081
14302
|
userPrompt: prompt,
|
|
14082
14303
|
systemPrompt
|
|
14083
14304
|
};
|
|
14305
|
+
const images = context2.output ? extractImageBlocks(context2.output) : [];
|
|
14084
14306
|
try {
|
|
14085
14307
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
14086
14308
|
context: context2,
|
|
14087
14309
|
graderProvider,
|
|
14088
14310
|
systemPrompt,
|
|
14089
14311
|
userPrompt: prompt,
|
|
14090
|
-
schema: scoreRangeEvaluationSchema
|
|
14312
|
+
schema: scoreRangeEvaluationSchema,
|
|
14313
|
+
images
|
|
14091
14314
|
});
|
|
14092
14315
|
const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
|
|
14093
14316
|
return {
|
|
@@ -14217,7 +14440,7 @@ ${context2.fileChanges}`;
|
|
|
14217
14440
|
evalCaseId: context2.evalCase.id,
|
|
14218
14441
|
attempt: context2.attempt
|
|
14219
14442
|
});
|
|
14220
|
-
const assistantContent =
|
|
14443
|
+
const assistantContent = extractLastAssistantContent(response.output);
|
|
14221
14444
|
if (!assistantContent) {
|
|
14222
14445
|
return {
|
|
14223
14446
|
score: 0,
|
|
@@ -14294,12 +14517,17 @@ ${context2.fileChanges}`;
|
|
|
14294
14517
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
14295
14518
|
const variables = {
|
|
14296
14519
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
14520
|
+
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
14521
|
+
[TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
|
|
14522
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
14523
|
+
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
14524
|
+
// Deprecated aliases
|
|
14297
14525
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
14298
14526
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
14299
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
14300
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
14527
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
14301
14528
|
};
|
|
14302
14529
|
if (this.evaluatorTemplate) {
|
|
14530
|
+
warnDeprecatedTemplateVars(this.evaluatorTemplate);
|
|
14303
14531
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
14304
14532
|
}
|
|
14305
14533
|
const config = context2.evaluator;
|
|
@@ -14350,11 +14578,16 @@ ${context2.fileChanges}`;
|
|
|
14350
14578
|
if (this.evaluatorTemplate) {
|
|
14351
14579
|
const variables = {
|
|
14352
14580
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
14581
|
+
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
14582
|
+
[TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
|
|
14583
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
14584
|
+
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
14585
|
+
// Deprecated aliases
|
|
14353
14586
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
14354
14587
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
14355
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
14356
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
14588
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
14357
14589
|
};
|
|
14590
|
+
warnDeprecatedTemplateVars(this.evaluatorTemplate);
|
|
14358
14591
|
const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
|
|
14359
14592
|
const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
|
|
14360
14593
|
return `${customPrompt}
|
|
@@ -14525,18 +14758,35 @@ ${outputSchema}`;
|
|
|
14525
14758
|
// LLM mode retry logic
|
|
14526
14759
|
// ---------------------------------------------------------------------------
|
|
14527
14760
|
async runWithRetry(options) {
|
|
14528
|
-
const { context: context2, graderProvider, systemPrompt, userPrompt, schema } = options;
|
|
14761
|
+
const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
|
|
14529
14762
|
let lastError;
|
|
14530
14763
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
14531
14764
|
try {
|
|
14532
14765
|
const model = graderProvider.asLanguageModel?.();
|
|
14533
14766
|
if (model) {
|
|
14534
|
-
const
|
|
14767
|
+
const modelOptions = {
|
|
14768
|
+
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
14769
|
+
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
14770
|
+
};
|
|
14771
|
+
const hasImages = images && images.length > 0;
|
|
14772
|
+
const result = hasImages ? await (0, import_ai2.generateText)({
|
|
14773
|
+
model,
|
|
14774
|
+
system: systemPrompt,
|
|
14775
|
+
messages: [
|
|
14776
|
+
{
|
|
14777
|
+
role: "user",
|
|
14778
|
+
content: [
|
|
14779
|
+
{ type: "text", text: userPrompt },
|
|
14780
|
+
...toAiSdkImageParts(images)
|
|
14781
|
+
]
|
|
14782
|
+
}
|
|
14783
|
+
],
|
|
14784
|
+
...modelOptions
|
|
14785
|
+
}) : await (0, import_ai2.generateText)({
|
|
14535
14786
|
model,
|
|
14536
14787
|
system: systemPrompt,
|
|
14537
14788
|
prompt: userPrompt,
|
|
14538
|
-
...
|
|
14539
|
-
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
14789
|
+
...modelOptions
|
|
14540
14790
|
});
|
|
14541
14791
|
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
14542
14792
|
const rawUsage = result.usage;
|
|
@@ -14551,7 +14801,7 @@ ${outputSchema}`;
|
|
|
14551
14801
|
maxOutputTokens: this.maxOutputTokens,
|
|
14552
14802
|
temperature: this.temperature
|
|
14553
14803
|
});
|
|
14554
|
-
const data = schema.parse(parseJsonFromText(
|
|
14804
|
+
const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
|
|
14555
14805
|
return { data, providerResponse: response, tokenUsage: response.tokenUsage };
|
|
14556
14806
|
} catch (e) {
|
|
14557
14807
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
@@ -14596,6 +14846,26 @@ function substituteVariables(template, variables) {
|
|
|
14596
14846
|
return variables[varName] ?? match;
|
|
14597
14847
|
});
|
|
14598
14848
|
}
|
|
14849
|
+
var ANSI_YELLOW8 = "\x1B[33m";
|
|
14850
|
+
var ANSI_RESET9 = "\x1B[0m";
|
|
14851
|
+
var warnedTemplateStrings = /* @__PURE__ */ new Set();
|
|
14852
|
+
function warnDeprecatedTemplateVars(template) {
|
|
14853
|
+
if (warnedTemplateStrings.has(template)) return;
|
|
14854
|
+
const used = [];
|
|
14855
|
+
for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
|
|
14856
|
+
if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
|
|
14857
|
+
used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
|
|
14858
|
+
}
|
|
14859
|
+
}
|
|
14860
|
+
if (used.length > 0) {
|
|
14861
|
+
warnedTemplateStrings.add(template);
|
|
14862
|
+
console.warn(
|
|
14863
|
+
`${ANSI_YELLOW8}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
|
|
14864
|
+
${used.join("\n ")}
|
|
14865
|
+
Update your custom evaluator template to use the new names.${ANSI_RESET9}`
|
|
14866
|
+
);
|
|
14867
|
+
}
|
|
14868
|
+
}
|
|
14599
14869
|
function calculateRubricScore(result, rubrics) {
|
|
14600
14870
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
14601
14871
|
const assertions = [];
|
|
@@ -14690,6 +14960,26 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
14690
14960
|
}
|
|
14691
14961
|
};
|
|
14692
14962
|
}
|
|
14963
|
+
function extractImageBlocks(messages) {
|
|
14964
|
+
const images = [];
|
|
14965
|
+
for (const msg of messages) {
|
|
14966
|
+
if (msg.role !== "assistant") continue;
|
|
14967
|
+
if (!isContentArray(msg.content)) continue;
|
|
14968
|
+
for (const block of msg.content) {
|
|
14969
|
+
if (block.type === "image") {
|
|
14970
|
+
images.push(block);
|
|
14971
|
+
}
|
|
14972
|
+
}
|
|
14973
|
+
}
|
|
14974
|
+
return images;
|
|
14975
|
+
}
|
|
14976
|
+
function toAiSdkImageParts(images) {
|
|
14977
|
+
return images.map((img) => ({
|
|
14978
|
+
type: "image",
|
|
14979
|
+
image: img.source,
|
|
14980
|
+
mediaType: img.media_type || void 0
|
|
14981
|
+
}));
|
|
14982
|
+
}
|
|
14693
14983
|
function resolveSandboxed(basePath, relativePath) {
|
|
14694
14984
|
const resolved = import_node_path39.default.resolve(basePath, relativePath);
|
|
14695
14985
|
if (!resolved.startsWith(basePath + import_node_path39.default.sep) && resolved !== basePath) {
|
|
@@ -15075,7 +15365,7 @@ var CompositeEvaluator = class {
|
|
|
15075
15365
|
attempt: context2.attempt
|
|
15076
15366
|
});
|
|
15077
15367
|
const data = freeformEvaluationSchema.parse(
|
|
15078
|
-
parseJsonFromText(
|
|
15368
|
+
parseJsonFromText(extractLastAssistantContent(response.output))
|
|
15079
15369
|
);
|
|
15080
15370
|
const score = clampScore(data.score);
|
|
15081
15371
|
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
@@ -15431,115 +15721,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
15431
15721
|
* Evaluate a single field against the expected value.
|
|
15432
15722
|
*/
|
|
15433
15723
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
15434
|
-
const { path:
|
|
15435
|
-
const candidateValue = resolvePath(candidateData,
|
|
15436
|
-
const expectedValue = resolvePath(expectedData,
|
|
15724
|
+
const { path: path51, match, required = true, weight = 1 } = fieldConfig;
|
|
15725
|
+
const candidateValue = resolvePath(candidateData, path51);
|
|
15726
|
+
const expectedValue = resolvePath(expectedData, path51);
|
|
15437
15727
|
if (expectedValue === void 0) {
|
|
15438
15728
|
return {
|
|
15439
|
-
path:
|
|
15729
|
+
path: path51,
|
|
15440
15730
|
score: 1,
|
|
15441
15731
|
// No expected value means no comparison needed
|
|
15442
15732
|
weight,
|
|
15443
15733
|
hit: true,
|
|
15444
|
-
message: `${
|
|
15734
|
+
message: `${path51}: no expected value`
|
|
15445
15735
|
};
|
|
15446
15736
|
}
|
|
15447
15737
|
if (candidateValue === void 0) {
|
|
15448
15738
|
if (required) {
|
|
15449
15739
|
return {
|
|
15450
|
-
path:
|
|
15740
|
+
path: path51,
|
|
15451
15741
|
score: 0,
|
|
15452
15742
|
weight,
|
|
15453
15743
|
hit: false,
|
|
15454
|
-
message: `${
|
|
15744
|
+
message: `${path51} (required, missing)`
|
|
15455
15745
|
};
|
|
15456
15746
|
}
|
|
15457
15747
|
return {
|
|
15458
|
-
path:
|
|
15748
|
+
path: path51,
|
|
15459
15749
|
score: 1,
|
|
15460
15750
|
// Don't penalize missing optional fields
|
|
15461
15751
|
weight: 0,
|
|
15462
15752
|
// Zero weight means it won't affect the score
|
|
15463
15753
|
hit: true,
|
|
15464
|
-
message: `${
|
|
15754
|
+
message: `${path51}: optional field missing`
|
|
15465
15755
|
};
|
|
15466
15756
|
}
|
|
15467
15757
|
switch (match) {
|
|
15468
15758
|
case "exact":
|
|
15469
|
-
return this.compareExact(
|
|
15759
|
+
return this.compareExact(path51, candidateValue, expectedValue, weight);
|
|
15470
15760
|
case "numeric_tolerance":
|
|
15471
15761
|
return this.compareNumericTolerance(
|
|
15472
|
-
|
|
15762
|
+
path51,
|
|
15473
15763
|
candidateValue,
|
|
15474
15764
|
expectedValue,
|
|
15475
15765
|
fieldConfig,
|
|
15476
15766
|
weight
|
|
15477
15767
|
);
|
|
15478
15768
|
case "date":
|
|
15479
|
-
return this.compareDate(
|
|
15769
|
+
return this.compareDate(path51, candidateValue, expectedValue, fieldConfig, weight);
|
|
15480
15770
|
default:
|
|
15481
15771
|
return {
|
|
15482
|
-
path:
|
|
15772
|
+
path: path51,
|
|
15483
15773
|
score: 0,
|
|
15484
15774
|
weight,
|
|
15485
15775
|
hit: false,
|
|
15486
|
-
message: `${
|
|
15776
|
+
message: `${path51}: unknown match type "${match}"`
|
|
15487
15777
|
};
|
|
15488
15778
|
}
|
|
15489
15779
|
}
|
|
15490
15780
|
/**
|
|
15491
15781
|
* Exact equality comparison.
|
|
15492
15782
|
*/
|
|
15493
|
-
compareExact(
|
|
15783
|
+
compareExact(path51, candidateValue, expectedValue, weight) {
|
|
15494
15784
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
15495
15785
|
return {
|
|
15496
|
-
path:
|
|
15786
|
+
path: path51,
|
|
15497
15787
|
score: 1,
|
|
15498
15788
|
weight,
|
|
15499
15789
|
hit: true,
|
|
15500
|
-
message:
|
|
15790
|
+
message: path51
|
|
15501
15791
|
};
|
|
15502
15792
|
}
|
|
15503
15793
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
15504
15794
|
return {
|
|
15505
|
-
path:
|
|
15795
|
+
path: path51,
|
|
15506
15796
|
score: 0,
|
|
15507
15797
|
weight,
|
|
15508
15798
|
hit: false,
|
|
15509
|
-
message: `${
|
|
15799
|
+
message: `${path51} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
15510
15800
|
};
|
|
15511
15801
|
}
|
|
15512
15802
|
return {
|
|
15513
|
-
path:
|
|
15803
|
+
path: path51,
|
|
15514
15804
|
score: 0,
|
|
15515
15805
|
weight,
|
|
15516
15806
|
hit: false,
|
|
15517
|
-
message: `${
|
|
15807
|
+
message: `${path51} (value mismatch)`
|
|
15518
15808
|
};
|
|
15519
15809
|
}
|
|
15520
15810
|
/**
|
|
15521
15811
|
* Numeric comparison with absolute or relative tolerance.
|
|
15522
15812
|
*/
|
|
15523
|
-
compareNumericTolerance(
|
|
15813
|
+
compareNumericTolerance(path51, candidateValue, expectedValue, fieldConfig, weight) {
|
|
15524
15814
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
15525
15815
|
const candidateNum = toNumber(candidateValue);
|
|
15526
15816
|
const expectedNum = toNumber(expectedValue);
|
|
15527
15817
|
if (candidateNum === null || expectedNum === null) {
|
|
15528
15818
|
return {
|
|
15529
|
-
path:
|
|
15819
|
+
path: path51,
|
|
15530
15820
|
score: 0,
|
|
15531
15821
|
weight,
|
|
15532
15822
|
hit: false,
|
|
15533
|
-
message: `${
|
|
15823
|
+
message: `${path51} (non-numeric value)`
|
|
15534
15824
|
};
|
|
15535
15825
|
}
|
|
15536
15826
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
15537
15827
|
return {
|
|
15538
|
-
path:
|
|
15828
|
+
path: path51,
|
|
15539
15829
|
score: 0,
|
|
15540
15830
|
weight,
|
|
15541
15831
|
hit: false,
|
|
15542
|
-
message: `${
|
|
15832
|
+
message: `${path51} (invalid numeric value)`
|
|
15543
15833
|
};
|
|
15544
15834
|
}
|
|
15545
15835
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -15552,61 +15842,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
15552
15842
|
}
|
|
15553
15843
|
if (withinTolerance) {
|
|
15554
15844
|
return {
|
|
15555
|
-
path:
|
|
15845
|
+
path: path51,
|
|
15556
15846
|
score: 1,
|
|
15557
15847
|
weight,
|
|
15558
15848
|
hit: true,
|
|
15559
|
-
message: `${
|
|
15849
|
+
message: `${path51} (within tolerance: diff=${diff.toFixed(2)})`
|
|
15560
15850
|
};
|
|
15561
15851
|
}
|
|
15562
15852
|
return {
|
|
15563
|
-
path:
|
|
15853
|
+
path: path51,
|
|
15564
15854
|
score: 0,
|
|
15565
15855
|
weight,
|
|
15566
15856
|
hit: false,
|
|
15567
|
-
message: `${
|
|
15857
|
+
message: `${path51} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
15568
15858
|
};
|
|
15569
15859
|
}
|
|
15570
15860
|
/**
|
|
15571
15861
|
* Date comparison with format normalization.
|
|
15572
15862
|
*/
|
|
15573
|
-
compareDate(
|
|
15863
|
+
compareDate(path51, candidateValue, expectedValue, fieldConfig, weight) {
|
|
15574
15864
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
15575
15865
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
15576
15866
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
15577
15867
|
if (candidateDate === null) {
|
|
15578
15868
|
return {
|
|
15579
|
-
path:
|
|
15869
|
+
path: path51,
|
|
15580
15870
|
score: 0,
|
|
15581
15871
|
weight,
|
|
15582
15872
|
hit: false,
|
|
15583
|
-
message: `${
|
|
15873
|
+
message: `${path51} (unparseable candidate date)`
|
|
15584
15874
|
};
|
|
15585
15875
|
}
|
|
15586
15876
|
if (expectedDate === null) {
|
|
15587
15877
|
return {
|
|
15588
|
-
path:
|
|
15878
|
+
path: path51,
|
|
15589
15879
|
score: 0,
|
|
15590
15880
|
weight,
|
|
15591
15881
|
hit: false,
|
|
15592
|
-
message: `${
|
|
15882
|
+
message: `${path51} (unparseable expected date)`
|
|
15593
15883
|
};
|
|
15594
15884
|
}
|
|
15595
15885
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
15596
15886
|
return {
|
|
15597
|
-
path:
|
|
15887
|
+
path: path51,
|
|
15598
15888
|
score: 1,
|
|
15599
15889
|
weight,
|
|
15600
15890
|
hit: true,
|
|
15601
|
-
message:
|
|
15891
|
+
message: path51
|
|
15602
15892
|
};
|
|
15603
15893
|
}
|
|
15604
15894
|
return {
|
|
15605
|
-
path:
|
|
15895
|
+
path: path51,
|
|
15606
15896
|
score: 0,
|
|
15607
15897
|
weight,
|
|
15608
15898
|
hit: false,
|
|
15609
|
-
message: `${
|
|
15899
|
+
message: `${path51} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
15610
15900
|
};
|
|
15611
15901
|
}
|
|
15612
15902
|
/**
|
|
@@ -15639,11 +15929,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
15639
15929
|
};
|
|
15640
15930
|
}
|
|
15641
15931
|
};
|
|
15642
|
-
function resolvePath(obj,
|
|
15643
|
-
if (!
|
|
15932
|
+
function resolvePath(obj, path51) {
|
|
15933
|
+
if (!path51 || !obj) {
|
|
15644
15934
|
return void 0;
|
|
15645
15935
|
}
|
|
15646
|
-
const parts =
|
|
15936
|
+
const parts = path51.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
15647
15937
|
let current = obj;
|
|
15648
15938
|
for (const part of parts) {
|
|
15649
15939
|
if (current === null || current === void 0) {
|
|
@@ -15935,11 +16225,12 @@ function assembleLlmGraderPrompt(input) {
|
|
|
15935
16225
|
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
|
|
15936
16226
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
15937
16227
|
const variables = {
|
|
15938
|
-
[TEMPLATE_VARIABLES.INPUT]:
|
|
15939
|
-
[TEMPLATE_VARIABLES.
|
|
15940
|
-
[TEMPLATE_VARIABLES.
|
|
16228
|
+
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
16229
|
+
[TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
|
|
16230
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
|
|
15941
16231
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
15942
16232
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
16233
|
+
// Deprecated aliases
|
|
15943
16234
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
15944
16235
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
15945
16236
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
@@ -16126,8 +16417,8 @@ var TokenUsageEvaluator = class {
|
|
|
16126
16417
|
};
|
|
16127
16418
|
|
|
16128
16419
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
16129
|
-
function getNestedValue(obj,
|
|
16130
|
-
const parts =
|
|
16420
|
+
function getNestedValue(obj, path51) {
|
|
16421
|
+
const parts = path51.split(".");
|
|
16131
16422
|
let current = obj;
|
|
16132
16423
|
for (const part of parts) {
|
|
16133
16424
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -16996,16 +17287,13 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
|
16996
17287
|
const payload = {
|
|
16997
17288
|
criteria: context2.evalCase.criteria,
|
|
16998
17289
|
expectedOutput: context2.evalCase.expected_output,
|
|
16999
|
-
outputText: context2.candidate,
|
|
17000
17290
|
output: context2.output ?? null,
|
|
17001
17291
|
inputFiles: context2.evalCase.file_paths,
|
|
17002
17292
|
input: context2.evalCase.input,
|
|
17003
17293
|
trace: context2.trace ?? null,
|
|
17004
17294
|
fileChanges: context2.fileChanges ?? null,
|
|
17005
17295
|
workspacePath: context2.workspacePath ?? null,
|
|
17006
|
-
config: config ?? context2.config ?? null
|
|
17007
|
-
inputText: context2.evalCase.question,
|
|
17008
|
-
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
17296
|
+
config: config ?? context2.config ?? null
|
|
17009
17297
|
};
|
|
17010
17298
|
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
17011
17299
|
const scriptPath = script[script.length - 1];
|
|
@@ -18685,7 +18973,8 @@ async function runEvaluation(options) {
|
|
|
18685
18973
|
const budgetResult = {
|
|
18686
18974
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
18687
18975
|
testId: evalCase.id,
|
|
18688
|
-
|
|
18976
|
+
dataset: evalCase.dataset,
|
|
18977
|
+
category: evalCase.category,
|
|
18689
18978
|
score: 0,
|
|
18690
18979
|
assertions: [],
|
|
18691
18980
|
output: [],
|
|
@@ -18721,7 +19010,8 @@ async function runEvaluation(options) {
|
|
|
18721
19010
|
const haltResult = {
|
|
18722
19011
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
18723
19012
|
testId: evalCase.id,
|
|
18724
|
-
|
|
19013
|
+
dataset: evalCase.dataset,
|
|
19014
|
+
category: evalCase.category,
|
|
18725
19015
|
score: 0,
|
|
18726
19016
|
assertions: [],
|
|
18727
19017
|
output: [],
|
|
@@ -19004,7 +19294,7 @@ async function runBatchEvaluation(options) {
|
|
|
19004
19294
|
const tokenUsage = merged?.tokenUsage;
|
|
19005
19295
|
const startTime = merged?.startTime;
|
|
19006
19296
|
const endTime = merged?.endTime;
|
|
19007
|
-
const candidate =
|
|
19297
|
+
const candidate = extractLastAssistantContent(output);
|
|
19008
19298
|
const providerError = extractProviderError(providerResponse);
|
|
19009
19299
|
let result;
|
|
19010
19300
|
try {
|
|
@@ -19412,7 +19702,7 @@ async function runEvalCase(options) {
|
|
|
19412
19702
|
const tokenUsage = merged?.tokenUsage;
|
|
19413
19703
|
const startTime = merged?.startTime;
|
|
19414
19704
|
const endTime = merged?.endTime;
|
|
19415
|
-
const candidate =
|
|
19705
|
+
const candidate = extractLastAssistantContent(output);
|
|
19416
19706
|
let fileChanges;
|
|
19417
19707
|
if (baselineCommit && workspacePath) {
|
|
19418
19708
|
try {
|
|
@@ -19720,7 +20010,8 @@ async function evaluateCandidate(options) {
|
|
|
19720
20010
|
return {
|
|
19721
20011
|
timestamp: completedAt.toISOString(),
|
|
19722
20012
|
testId: evalCase.id,
|
|
19723
|
-
|
|
20013
|
+
dataset: evalCase.dataset,
|
|
20014
|
+
category: evalCase.category,
|
|
19724
20015
|
conversationId: evalCase.conversation_id,
|
|
19725
20016
|
score: score.score,
|
|
19726
20017
|
assertions: score.assertions,
|
|
@@ -20070,7 +20361,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
20070
20361
|
return {
|
|
20071
20362
|
timestamp: timestamp.toISOString(),
|
|
20072
20363
|
testId: evalCase.id,
|
|
20073
|
-
|
|
20364
|
+
dataset: evalCase.dataset,
|
|
20365
|
+
category: evalCase.category,
|
|
20074
20366
|
conversationId: evalCase.conversation_id,
|
|
20075
20367
|
score: 0,
|
|
20076
20368
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
@@ -20643,6 +20935,18 @@ function trimBaselineResult(result) {
|
|
|
20643
20935
|
return trimmed;
|
|
20644
20936
|
}
|
|
20645
20937
|
|
|
20938
|
+
// src/evaluation/category.ts
|
|
20939
|
+
var import_node_path51 = __toESM(require("path"), 1);
|
|
20940
|
+
var DEFAULT_CATEGORY = "Uncategorized";
|
|
20941
|
+
function deriveCategory(relativePath) {
|
|
20942
|
+
const parts = relativePath.split(import_node_path51.default.sep);
|
|
20943
|
+
if (parts.length <= 1) {
|
|
20944
|
+
return DEFAULT_CATEGORY;
|
|
20945
|
+
}
|
|
20946
|
+
const dirs = parts.slice(0, -1).filter((d) => d !== "evals");
|
|
20947
|
+
return dirs.length > 0 ? dirs.join("/") : DEFAULT_CATEGORY;
|
|
20948
|
+
}
|
|
20949
|
+
|
|
20646
20950
|
// src/observability/otel-exporter.ts
|
|
20647
20951
|
var OTEL_BACKEND_PRESETS = {
|
|
20648
20952
|
langfuse: {
|
|
@@ -20766,7 +21070,7 @@ var OtelTraceExporter = class {
|
|
|
20766
21070
|
rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
20767
21071
|
rootSpan.setAttribute("agentv.test_id", result.testId);
|
|
20768
21072
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
20769
|
-
if (result.
|
|
21073
|
+
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
20770
21074
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
20771
21075
|
if (captureContent && result.output.length > 0) {
|
|
20772
21076
|
const lastMsg = result.output[result.output.length - 1];
|
|
@@ -20975,7 +21279,7 @@ var OtelStreamingObserver = class {
|
|
|
20975
21279
|
this.rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
20976
21280
|
this.rootSpan.setAttribute("agentv.test_id", testId);
|
|
20977
21281
|
this.rootSpan.setAttribute("agentv.target", target);
|
|
20978
|
-
if (evalSet) this.rootSpan.setAttribute("agentv.
|
|
21282
|
+
if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
|
|
20979
21283
|
this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
|
|
20980
21284
|
}
|
|
20981
21285
|
/** Create and immediately export a tool span */
|
|
@@ -21151,9 +21455,11 @@ function createAgentKernel() {
|
|
|
21151
21455
|
}
|
|
21152
21456
|
// Annotate the CommonJS export names for ESM import in node:
|
|
21153
21457
|
0 && (module.exports = {
|
|
21458
|
+
COMMON_TARGET_SETTINGS,
|
|
21154
21459
|
CodeEvaluator,
|
|
21155
21460
|
CompositeEvaluator,
|
|
21156
21461
|
CostEvaluator,
|
|
21462
|
+
DEFAULT_CATEGORY,
|
|
21157
21463
|
DEFAULT_EVALUATOR_TEMPLATE,
|
|
21158
21464
|
DEFAULT_EVAL_PATTERNS,
|
|
21159
21465
|
DEFAULT_EXPLORATION_TOOLS,
|
|
@@ -21207,6 +21513,7 @@ function createAgentKernel() {
|
|
|
21207
21513
|
createTempWorkspace,
|
|
21208
21514
|
deepEqual,
|
|
21209
21515
|
defineConfig,
|
|
21516
|
+
deriveCategory,
|
|
21210
21517
|
detectFormat,
|
|
21211
21518
|
discoverAssertions,
|
|
21212
21519
|
discoverCopilotSessions,
|
|
@@ -21220,7 +21527,9 @@ function createAgentKernel() {
|
|
|
21220
21527
|
explorationRatio,
|
|
21221
21528
|
extractCacheConfig,
|
|
21222
21529
|
extractFailOnError,
|
|
21530
|
+
extractImageBlocks,
|
|
21223
21531
|
extractJsonBlob,
|
|
21532
|
+
extractLastAssistantContent,
|
|
21224
21533
|
extractTargetFromSuite,
|
|
21225
21534
|
extractTargetsFromSuite,
|
|
21226
21535
|
extractTargetsFromTestCase,
|
|
@@ -21234,12 +21543,15 @@ function createAgentKernel() {
|
|
|
21234
21543
|
getAgentvHome,
|
|
21235
21544
|
getOutputFilenames,
|
|
21236
21545
|
getSubagentsRoot,
|
|
21546
|
+
getTextContent,
|
|
21237
21547
|
getTraceStateRoot,
|
|
21238
21548
|
getWorkspacePath,
|
|
21239
21549
|
getWorkspacePoolRoot,
|
|
21240
21550
|
getWorkspacesRoot,
|
|
21241
21551
|
initializeBaseline,
|
|
21242
21552
|
isAgentSkillsFormat,
|
|
21553
|
+
isContent,
|
|
21554
|
+
isContentArray,
|
|
21243
21555
|
isEvaluatorKind,
|
|
21244
21556
|
isJsonObject,
|
|
21245
21557
|
isJsonValue,
|