@agentv/core 3.14.5 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-HP5PFOVK.js → chunk-PXYYRDHH.js} +142 -148
- package/dist/chunk-PXYYRDHH.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +9 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +3 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +567 -256
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +135 -93
- package/dist/index.d.ts +135 -93
- package/dist/index.js +457 -140
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-HP5PFOVK.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1315,12 +1315,12 @@ function serializeAttributeValue(value) {
|
|
|
1315
1315
|
if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
|
|
1316
1316
|
return { stringValue: String(value) };
|
|
1317
1317
|
}
|
|
1318
|
-
var import_promises35,
|
|
1318
|
+
var import_promises35, import_node_path52, OtlpJsonFileExporter;
|
|
1319
1319
|
var init_otlp_json_file_exporter = __esm({
|
|
1320
1320
|
"src/observability/otlp-json-file-exporter.ts"() {
|
|
1321
1321
|
"use strict";
|
|
1322
1322
|
import_promises35 = require("fs/promises");
|
|
1323
|
-
|
|
1323
|
+
import_node_path52 = require("path");
|
|
1324
1324
|
OtlpJsonFileExporter = class {
|
|
1325
1325
|
// biome-ignore lint/suspicious/noExplicitAny: serialized span data
|
|
1326
1326
|
spans = [];
|
|
@@ -1359,7 +1359,7 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1359
1359
|
}
|
|
1360
1360
|
async flush() {
|
|
1361
1361
|
if (this.spans.length === 0) return;
|
|
1362
|
-
await (0, import_promises35.mkdir)((0,
|
|
1362
|
+
await (0, import_promises35.mkdir)((0, import_node_path52.dirname)(this.filePath), { recursive: true });
|
|
1363
1363
|
const otlpJson = {
|
|
1364
1364
|
resourceSpans: [
|
|
1365
1365
|
{
|
|
@@ -1383,9 +1383,11 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1383
1383
|
// src/index.ts
|
|
1384
1384
|
var index_exports = {};
|
|
1385
1385
|
__export(index_exports, {
|
|
1386
|
+
COMMON_TARGET_SETTINGS: () => COMMON_TARGET_SETTINGS,
|
|
1386
1387
|
CodeEvaluator: () => CodeEvaluator,
|
|
1387
1388
|
CompositeEvaluator: () => CompositeEvaluator,
|
|
1388
1389
|
CostEvaluator: () => CostEvaluator,
|
|
1390
|
+
DEFAULT_CATEGORY: () => DEFAULT_CATEGORY,
|
|
1389
1391
|
DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
|
|
1390
1392
|
DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
|
|
1391
1393
|
DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
|
|
@@ -1439,6 +1441,7 @@ __export(index_exports, {
|
|
|
1439
1441
|
createTempWorkspace: () => createTempWorkspace,
|
|
1440
1442
|
deepEqual: () => deepEqual,
|
|
1441
1443
|
defineConfig: () => defineConfig,
|
|
1444
|
+
deriveCategory: () => deriveCategory,
|
|
1442
1445
|
detectFormat: () => detectFormat,
|
|
1443
1446
|
discoverAssertions: () => discoverAssertions,
|
|
1444
1447
|
discoverCopilotSessions: () => discoverCopilotSessions,
|
|
@@ -1452,7 +1455,9 @@ __export(index_exports, {
|
|
|
1452
1455
|
explorationRatio: () => explorationRatio,
|
|
1453
1456
|
extractCacheConfig: () => extractCacheConfig,
|
|
1454
1457
|
extractFailOnError: () => extractFailOnError,
|
|
1458
|
+
extractImageBlocks: () => extractImageBlocks,
|
|
1455
1459
|
extractJsonBlob: () => extractJsonBlob,
|
|
1460
|
+
extractLastAssistantContent: () => extractLastAssistantContent,
|
|
1456
1461
|
extractTargetFromSuite: () => extractTargetFromSuite,
|
|
1457
1462
|
extractTargetsFromSuite: () => extractTargetsFromSuite,
|
|
1458
1463
|
extractTargetsFromTestCase: () => extractTargetsFromTestCase,
|
|
@@ -1466,12 +1471,15 @@ __export(index_exports, {
|
|
|
1466
1471
|
getAgentvHome: () => getAgentvHome,
|
|
1467
1472
|
getOutputFilenames: () => getOutputFilenames,
|
|
1468
1473
|
getSubagentsRoot: () => getSubagentsRoot,
|
|
1474
|
+
getTextContent: () => getTextContent,
|
|
1469
1475
|
getTraceStateRoot: () => getTraceStateRoot,
|
|
1470
1476
|
getWorkspacePath: () => getWorkspacePath,
|
|
1471
1477
|
getWorkspacePoolRoot: () => getWorkspacePoolRoot,
|
|
1472
1478
|
getWorkspacesRoot: () => getWorkspacesRoot,
|
|
1473
1479
|
initializeBaseline: () => initializeBaseline,
|
|
1474
1480
|
isAgentSkillsFormat: () => isAgentSkillsFormat,
|
|
1481
|
+
isContent: () => isContent,
|
|
1482
|
+
isContentArray: () => isContentArray,
|
|
1475
1483
|
isEvaluatorKind: () => isEvaluatorKind,
|
|
1476
1484
|
isJsonObject: () => isJsonObject,
|
|
1477
1485
|
isJsonValue: () => isJsonValue,
|
|
@@ -1533,6 +1541,29 @@ __export(index_exports, {
|
|
|
1533
1541
|
});
|
|
1534
1542
|
module.exports = __toCommonJS(index_exports);
|
|
1535
1543
|
|
|
1544
|
+
// src/evaluation/content.ts
|
|
1545
|
+
// Discriminator values that mark an object as a structured content block.
var CONTENT_TYPES = /* @__PURE__ */ new Set(["text", "image", "file"]);

/**
 * Type guard for a single structured content block.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value` is a non-null object whose `type`
 *   property is a string contained in `CONTENT_TYPES`; `false` otherwise.
 */
function isContent(value) {
  // Reject primitives and null up front; only objects can carry a `type`.
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const candidate = value;
  if (typeof candidate.type !== "string") {
    return false;
  }
  return CONTENT_TYPES.has(candidate.type);
}
|
|
1551
|
+
/**
 * Type guard for a non-empty array made up entirely of content blocks.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` only when `value` is an array with at least one
 *   element and every element satisfies `isContent`.
 */
function isContentArray(value) {
  if (!Array.isArray(value)) {
    return false;
  }
  // An empty array is deliberately not treated as structured content.
  if (value.length === 0) {
    return false;
  }
  return value.every((entry) => isContent(entry));
}
|
|
1554
|
+
/**
 * Flattens message content to plain text.
 *
 * @param {unknown} content - A string, an array of content blocks, or
 *   null/undefined.
 * @returns {string} The string itself; for arrays, the `text` of every
 *   `type: "text"` block joined with newlines; `""` for null, undefined,
 *   non-array objects, and arrays without text blocks.
 */
function getTextContent(content) {
  if (content == null) return "";
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";
  const parts = [];
  for (const block of content) {
    // Skip malformed entries (null, primitives) instead of throwing on
    // `block.type`; matches the guards used by extractTextContent.
    if (!block || typeof block !== "object") continue;
    // Only string payloads contribute; a text block without a string `text`
    // would otherwise add an empty segment and a stray newline.
    if (block.type === "text" && typeof block.text === "string") {
      parts.push(block.text);
    }
  }
  return parts.join("\n");
}
|
|
1566
|
+
|
|
1536
1567
|
// src/evaluation/types.ts
|
|
1537
1568
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
1538
1569
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
@@ -2411,15 +2442,23 @@ var TEMPLATE_VARIABLES = {
|
|
|
2411
2442
|
INPUT: "input",
|
|
2412
2443
|
OUTPUT: "output",
|
|
2413
2444
|
FILE_CHANGES: "file_changes",
|
|
2445
|
+
/** @deprecated Use INPUT instead — resolves to the same text value. */
|
|
2414
2446
|
INPUT_TEXT: "input_text",
|
|
2447
|
+
/** @deprecated Use OUTPUT instead — resolves to the same text value. */
|
|
2415
2448
|
OUTPUT_TEXT: "output_text",
|
|
2449
|
+
/** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
|
|
2416
2450
|
EXPECTED_OUTPUT_TEXT: "expected_output_text"
|
|
2417
2451
|
};
|
|
2418
2452
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
2419
2453
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
2420
|
-
TEMPLATE_VARIABLES.
|
|
2454
|
+
TEMPLATE_VARIABLES.OUTPUT,
|
|
2421
2455
|
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
2422
2456
|
]);
|
|
2457
|
+
var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
|
|
2458
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
|
|
2459
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
|
|
2460
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
|
|
2461
|
+
]);
|
|
2423
2462
|
|
|
2424
2463
|
// src/evaluation/validation/prompt-validator.ts
|
|
2425
2464
|
var ANSI_YELLOW3 = "\x1B[33m";
|
|
@@ -2441,16 +2480,29 @@ function validateTemplateVariables(content, source) {
|
|
|
2441
2480
|
}
|
|
2442
2481
|
match = variablePattern.exec(content);
|
|
2443
2482
|
}
|
|
2444
|
-
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
2483
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
2445
2484
|
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
|
|
2446
2485
|
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
|
|
2447
2486
|
if (!hasRequiredFields) {
|
|
2448
2487
|
throw new Error(
|
|
2449
2488
|
`Missing required fields. Must include at least one of:
|
|
2450
|
-
- {{ ${TEMPLATE_VARIABLES.
|
|
2489
|
+
- {{ ${TEMPLATE_VARIABLES.OUTPUT} }}
|
|
2451
2490
|
- {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
|
|
2452
2491
|
);
|
|
2453
2492
|
}
|
|
2493
|
+
const deprecatedUsed = [];
|
|
2494
|
+
for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
|
|
2495
|
+
if (foundVariables.has(deprecated)) {
|
|
2496
|
+
deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
|
|
2497
|
+
}
|
|
2498
|
+
}
|
|
2499
|
+
if (deprecatedUsed.length > 0) {
|
|
2500
|
+
console.warn(
|
|
2501
|
+
`${ANSI_YELLOW3}Warning: Template at ${source} uses deprecated variable names:
|
|
2502
|
+
${deprecatedUsed.join("\n ")}
|
|
2503
|
+
These still work but will be removed in a future version.${ANSI_RESET4}`
|
|
2504
|
+
);
|
|
2505
|
+
}
|
|
2454
2506
|
if (invalidVariables.length > 0) {
|
|
2455
2507
|
const warningMessage = `${ANSI_YELLOW3}Warning: Custom evaluator template at ${source}
|
|
2456
2508
|
Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
|
|
@@ -3868,6 +3920,19 @@ function asString2(value) {
|
|
|
3868
3920
|
}
|
|
3869
3921
|
|
|
3870
3922
|
// src/evaluation/loaders/message-processor.ts
|
|
3923
|
+
// Lower-case file extension (with leading dot) -> image MIME type.
// Key order is kept as-is because the keys are listed in warning messages.
var IMAGE_MEDIA_TYPES = {
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".gif": "image/gif",
  ".webp": "image/webp",
  ".svg": "image/svg+xml",
  ".bmp": "image/bmp"
};

/**
 * Looks up the image MIME type for a path based on its file extension.
 *
 * @param {string} filePath - Path whose extension is inspected.
 * @returns {string | undefined} The MIME type from `IMAGE_MEDIA_TYPES`
 *   (extension matched case-insensitively), or `undefined` when the
 *   extension is not listed.
 */
function detectImageMediaType(filePath) {
  const extension = import_node_path6.default.extname(filePath);
  const lookupKey = extension.toLowerCase();
  return IMAGE_MEDIA_TYPES[lookupKey];
}
|
|
3871
3936
|
var ANSI_YELLOW5 = "\x1B[33m";
|
|
3872
3937
|
var ANSI_RESET6 = "\x1B[0m";
|
|
3873
3938
|
async function processMessages(options) {
|
|
@@ -3933,6 +3998,47 @@ async function processMessages(options) {
|
|
|
3933
3998
|
}
|
|
3934
3999
|
continue;
|
|
3935
4000
|
}
|
|
4001
|
+
if (segmentType === "image") {
|
|
4002
|
+
const rawValue = asString3(rawSegment.value);
|
|
4003
|
+
if (!rawValue) {
|
|
4004
|
+
continue;
|
|
4005
|
+
}
|
|
4006
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
4007
|
+
rawValue,
|
|
4008
|
+
searchRoots
|
|
4009
|
+
);
|
|
4010
|
+
if (!resolvedPath) {
|
|
4011
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
4012
|
+
const context2 = messageType === "input" ? "" : " in expected_output";
|
|
4013
|
+
logWarning3(`Image file not found${context2}: ${displayPath}`, attempts);
|
|
4014
|
+
continue;
|
|
4015
|
+
}
|
|
4016
|
+
const mediaType = detectImageMediaType(resolvedPath);
|
|
4017
|
+
if (!mediaType) {
|
|
4018
|
+
logWarning3(
|
|
4019
|
+
`Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
|
|
4020
|
+
);
|
|
4021
|
+
continue;
|
|
4022
|
+
}
|
|
4023
|
+
try {
|
|
4024
|
+
const imageBuffer = await (0, import_promises6.readFile)(resolvedPath);
|
|
4025
|
+
const base64 = imageBuffer.toString("base64");
|
|
4026
|
+
processedContent.push({
|
|
4027
|
+
type: "image",
|
|
4028
|
+
media_type: mediaType,
|
|
4029
|
+
source: `data:${mediaType};base64,${base64}`
|
|
4030
|
+
});
|
|
4031
|
+
if (verbose) {
|
|
4032
|
+
const label = messageType === "input" ? "[Image]" : "[Expected Output Image]";
|
|
4033
|
+
console.log(` ${label} Found: ${displayPath}`);
|
|
4034
|
+
console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
|
|
4035
|
+
}
|
|
4036
|
+
} catch (error) {
|
|
4037
|
+
const context2 = messageType === "input" ? "" : " expected output";
|
|
4038
|
+
logWarning3(`Could not read${context2} image ${resolvedPath}: ${error.message}`);
|
|
4039
|
+
}
|
|
4040
|
+
continue;
|
|
4041
|
+
}
|
|
3936
4042
|
const clonedSegment = cloneJsonObject(rawSegment);
|
|
3937
4043
|
processedContent.push(clonedSegment);
|
|
3938
4044
|
const inlineValue = clonedSegment.value;
|
|
@@ -4010,6 +4116,46 @@ async function processExpectedMessages(options) {
|
|
|
4010
4116
|
}
|
|
4011
4117
|
continue;
|
|
4012
4118
|
}
|
|
4119
|
+
if (segmentType === "image") {
|
|
4120
|
+
const rawValue = asString3(rawSegment.value);
|
|
4121
|
+
if (!rawValue) {
|
|
4122
|
+
continue;
|
|
4123
|
+
}
|
|
4124
|
+
const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
|
|
4125
|
+
rawValue,
|
|
4126
|
+
searchRoots
|
|
4127
|
+
);
|
|
4128
|
+
if (!resolvedPath) {
|
|
4129
|
+
const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
|
|
4130
|
+
logWarning3(`Image file not found in expected_output: ${displayPath}`, attempts);
|
|
4131
|
+
continue;
|
|
4132
|
+
}
|
|
4133
|
+
const mediaType = detectImageMediaType(resolvedPath);
|
|
4134
|
+
if (!mediaType) {
|
|
4135
|
+
logWarning3(
|
|
4136
|
+
`Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
|
|
4137
|
+
);
|
|
4138
|
+
continue;
|
|
4139
|
+
}
|
|
4140
|
+
try {
|
|
4141
|
+
const imageBuffer = await (0, import_promises6.readFile)(resolvedPath);
|
|
4142
|
+
const base64 = imageBuffer.toString("base64");
|
|
4143
|
+
processedContent.push({
|
|
4144
|
+
type: "image",
|
|
4145
|
+
media_type: mediaType,
|
|
4146
|
+
source: `data:${mediaType};base64,${base64}`
|
|
4147
|
+
});
|
|
4148
|
+
if (verbose) {
|
|
4149
|
+
console.log(` [Expected Output Image] Found: ${displayPath}`);
|
|
4150
|
+
console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
|
|
4151
|
+
}
|
|
4152
|
+
} catch (error) {
|
|
4153
|
+
logWarning3(
|
|
4154
|
+
`Could not read expected output image ${resolvedPath}: ${error.message}`
|
|
4155
|
+
);
|
|
4156
|
+
}
|
|
4157
|
+
continue;
|
|
4158
|
+
}
|
|
4013
4159
|
processedContent.push(cloneJsonObject(rawSegment));
|
|
4014
4160
|
}
|
|
4015
4161
|
segment.content = processedContent;
|
|
@@ -4256,7 +4402,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
|
|
|
4256
4402
|
const userFilePaths = collectResolvedInputFilePaths(inputMessages);
|
|
4257
4403
|
const testCase = {
|
|
4258
4404
|
id,
|
|
4259
|
-
|
|
4405
|
+
dataset: evalSetName,
|
|
4260
4406
|
conversation_id: conversationId,
|
|
4261
4407
|
question,
|
|
4262
4408
|
input: inputMessages,
|
|
@@ -4527,7 +4673,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4527
4673
|
}
|
|
4528
4674
|
const suite = interpolated;
|
|
4529
4675
|
const evalSetNameFromSuite = asString5(suite.name)?.trim();
|
|
4530
|
-
const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
|
|
4676
|
+
const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
|
|
4531
4677
|
const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
|
|
4532
4678
|
const rawTestcases = resolveTests(suite);
|
|
4533
4679
|
const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
|
|
@@ -4648,7 +4794,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
|
|
|
4648
4794
|
const caseTargets = extractTargetsFromTestCase(evalcase);
|
|
4649
4795
|
const testCase = {
|
|
4650
4796
|
id,
|
|
4651
|
-
|
|
4797
|
+
dataset: evalSetName,
|
|
4798
|
+
category: options?.category,
|
|
4652
4799
|
conversation_id: conversationId,
|
|
4653
4800
|
question,
|
|
4654
4801
|
input: inputMessages,
|
|
@@ -5690,6 +5837,48 @@ var import_node_fs4 = require("fs");
|
|
|
5690
5837
|
var import_promises10 = require("fs/promises");
|
|
5691
5838
|
var import_node_path12 = __toESM(require("path"), 1);
|
|
5692
5839
|
|
|
5840
|
+
// src/evaluation/providers/claude-content.ts
|
|
5841
|
+
/**
 * Converts a provider message `content` array into structured content
 * blocks, but only when it carries something other than text.
 *
 * Text parts become `{ type: "text", text }`. Image parts whose `source` is
 * an object become `{ type: "image", media_type, source }`, where `source`
 * is a data URI built from `source.data`, or else a URL. All other parts
 * (including `tool_use` / `tool_result`) are ignored.
 *
 * @param {unknown} content - Raw message content.
 * @returns {Array<object> | undefined} The converted blocks when at least
 *   one image block was produced; `undefined` for non-arrays and for
 *   text-only content.
 */
function toContentArray(content) {
  if (!Array.isArray(content)) return void 0;
  let hasNonText = false;
  const blocks = [];
  for (const part of content) {
    if (!part || typeof part !== "object") continue;
    const p = part;
    if (p.type === "text" && typeof p.text === "string") {
      blocks.push({ type: "text", text: p.text });
    } else if (p.type === "image" && typeof p.source === "object" && p.source !== null) {
      const src = p.source;
      const mediaType = typeof p.media_type === "string" ? p.media_type : typeof src.media_type === "string" ? src.media_type : "application/octet-stream";
      let data = "";
      if (typeof src.data === "string") {
        data = `data:${mediaType};base64,${src.data}`;
      } else if (typeof p.url === "string") {
        data = p.url;
      } else if (typeof src.url === "string") {
        // URL-sourced images keep the URL on the source object
        // (`source: { type: "url", url }`); without this fallback the block
        // would be emitted with an empty source.
        data = src.url;
      }
      blocks.push({ type: "image", media_type: mediaType, source: data });
      hasNonText = true;
    }
  }
  return hasNonText && blocks.length > 0 ? blocks : void 0;
}
|
|
5862
|
+
/**
 * Pulls the plain text out of provider message content.
 *
 * @param {unknown} content - A string or an array of content parts.
 * @returns {string | undefined} The string unchanged; for arrays, the `text`
 *   of every `{ type: "text", text: string }` part joined with newlines;
 *   `undefined` for any other input or when no text part is present.
 */
function extractTextContent(content) {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return void 0;
  const texts = content
    .filter(
      (part) => Boolean(part) && typeof part === "object" && part.type === "text" && typeof part.text === "string"
    )
    .map((part) => part.text);
  if (texts.length === 0) {
    return void 0;
  }
  return texts.join("\n");
}
|
|
5881
|
+
|
|
5693
5882
|
// src/evaluation/providers/claude-log-tracker.ts
|
|
5694
5883
|
var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeLogs");
|
|
5695
5884
|
var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeLogSubscribers");
|
|
@@ -5855,11 +6044,12 @@ var ClaudeCliProvider = class {
|
|
|
5855
6044
|
if (betaMessage && typeof betaMessage === "object") {
|
|
5856
6045
|
const msg = betaMessage;
|
|
5857
6046
|
const content = msg.content;
|
|
6047
|
+
const structuredContent = toContentArray(content);
|
|
5858
6048
|
const textContent = extractTextContent(content);
|
|
5859
6049
|
const toolCalls = extractToolCalls(content);
|
|
5860
6050
|
const outputMsg = {
|
|
5861
6051
|
role: "assistant",
|
|
5862
|
-
content: textContent,
|
|
6052
|
+
content: structuredContent ?? textContent,
|
|
5863
6053
|
toolCalls: toolCalls.length > 0 ? toolCalls : void 0
|
|
5864
6054
|
};
|
|
5865
6055
|
output.push(outputMsg);
|
|
@@ -6198,25 +6388,6 @@ function summarizeEvent(event) {
|
|
|
6198
6388
|
return void 0;
|
|
6199
6389
|
}
|
|
6200
6390
|
}
|
|
6201
|
-
function extractTextContent(content) {
|
|
6202
|
-
if (typeof content === "string") {
|
|
6203
|
-
return content;
|
|
6204
|
-
}
|
|
6205
|
-
if (!Array.isArray(content)) {
|
|
6206
|
-
return void 0;
|
|
6207
|
-
}
|
|
6208
|
-
const textParts = [];
|
|
6209
|
-
for (const part of content) {
|
|
6210
|
-
if (!part || typeof part !== "object") {
|
|
6211
|
-
continue;
|
|
6212
|
-
}
|
|
6213
|
-
const p = part;
|
|
6214
|
-
if (p.type === "text" && typeof p.text === "string") {
|
|
6215
|
-
textParts.push(p.text);
|
|
6216
|
-
}
|
|
6217
|
-
}
|
|
6218
|
-
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
6219
|
-
}
|
|
6220
6391
|
function extractToolCalls(content) {
|
|
6221
6392
|
if (!Array.isArray(content)) {
|
|
6222
6393
|
return [];
|
|
@@ -6389,11 +6560,12 @@ var ClaudeSdkProvider = class {
|
|
|
6389
6560
|
if (betaMessage && typeof betaMessage === "object") {
|
|
6390
6561
|
const msg = betaMessage;
|
|
6391
6562
|
const content = msg.content;
|
|
6392
|
-
const
|
|
6563
|
+
const structuredContent = toContentArray(content);
|
|
6564
|
+
const textContent = extractTextContent(content);
|
|
6393
6565
|
const toolCalls = extractToolCalls2(content);
|
|
6394
6566
|
const outputMsg = {
|
|
6395
6567
|
role: "assistant",
|
|
6396
|
-
content: textContent,
|
|
6568
|
+
content: structuredContent ?? textContent,
|
|
6397
6569
|
toolCalls: toolCalls.length > 0 ? toolCalls : void 0
|
|
6398
6570
|
};
|
|
6399
6571
|
output.push(outputMsg);
|
|
@@ -6511,25 +6683,6 @@ var ClaudeSdkProvider = class {
|
|
|
6511
6683
|
}
|
|
6512
6684
|
}
|
|
6513
6685
|
};
|
|
6514
|
-
function extractTextContent2(content) {
|
|
6515
|
-
if (typeof content === "string") {
|
|
6516
|
-
return content;
|
|
6517
|
-
}
|
|
6518
|
-
if (!Array.isArray(content)) {
|
|
6519
|
-
return void 0;
|
|
6520
|
-
}
|
|
6521
|
-
const textParts = [];
|
|
6522
|
-
for (const part of content) {
|
|
6523
|
-
if (!part || typeof part !== "object") {
|
|
6524
|
-
continue;
|
|
6525
|
-
}
|
|
6526
|
-
const p = part;
|
|
6527
|
-
if (p.type === "text" && typeof p.text === "string") {
|
|
6528
|
-
textParts.push(p.text);
|
|
6529
|
-
}
|
|
6530
|
-
}
|
|
6531
|
-
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
6532
|
-
}
|
|
6533
6686
|
function extractToolCalls2(content) {
|
|
6534
6687
|
if (!Array.isArray(content)) {
|
|
6535
6688
|
return [];
|
|
@@ -6753,7 +6906,7 @@ function convertMessages(messages) {
|
|
|
6753
6906
|
return messages.map((msg) => ({
|
|
6754
6907
|
role: msg.role,
|
|
6755
6908
|
name: msg.name,
|
|
6756
|
-
content: msg.content,
|
|
6909
|
+
content: isContentArray(msg.content) ? msg.content : typeof msg.content === "string" ? msg.content : void 0,
|
|
6757
6910
|
toolCalls: msg.tool_calls?.map((tc) => ({
|
|
6758
6911
|
tool: tc.tool,
|
|
6759
6912
|
input: tc.input,
|
|
@@ -9007,6 +9160,35 @@ function extractPiTextContent(content) {
|
|
|
9007
9160
|
}
|
|
9008
9161
|
return textParts.length > 0 ? textParts.join("\n") : void 0;
|
|
9009
9162
|
}
|
|
9163
|
+
/**
 * Converts a Pi message `content` array into structured content blocks, but
 * only when it carries at least one usable image.
 *
 * Text parts become `{ type: "text", text }`. Image parts become
 * `{ type: "image", media_type, source }`, where `source` is a data URI
 * built from `source.data`, or else the part's `url`. Images with no
 * resolvable source are dropped. All other parts (including `tool_use` /
 * `tool_result`) are ignored.
 *
 * @param {unknown} content - Raw message content.
 * @returns {Array<object> | undefined} The converted blocks when at least
 *   one image block was produced; `undefined` for non-arrays and for
 *   content without a usable image.
 */
function toPiContentArray(content) {
  if (!Array.isArray(content)) return void 0;
  let hasNonText = false;
  const blocks = [];
  for (const part of content) {
    if (!part || typeof part !== "object") continue;
    const p = part;
    if (p.type === "text" && typeof p.text === "string") {
      blocks.push({ type: "text", text: p.text });
      continue;
    }
    if (p.type !== "image") continue;
    const src = typeof p.source === "object" && p.source !== null ? p.source : void 0;
    const partMediaType = typeof p.media_type === "string" ? p.media_type : void 0;
    const srcMediaType = src && typeof src.media_type === "string" ? src.media_type : void 0;
    // Resolve the reported type as toContentArray does: part-level first,
    // then the source's, then a generic default. Previously a type present
    // only on the source reached the data URI but not `media_type`.
    const mediaType = partMediaType ?? srcMediaType ?? "application/octet-stream";
    let source = "";
    if (src && typeof src.data === "string") {
      source = `data:${srcMediaType ?? mediaType};base64,${src.data}`;
    }
    if (!source && typeof p.url === "string") {
      source = p.url;
    }
    if (source) {
      blocks.push({ type: "image", media_type: mediaType, source });
      hasNonText = true;
    }
  }
  return hasNonText && blocks.length > 0 ? blocks : void 0;
}
|
|
9010
9192
|
function toFiniteNumber(value) {
|
|
9011
9193
|
if (typeof value === "number" && Number.isFinite(value)) return value;
|
|
9012
9194
|
return void 0;
|
|
@@ -10178,7 +10360,8 @@ function convertAgentMessage(message, toolTrackers, completedToolResults) {
|
|
|
10178
10360
|
}
|
|
10179
10361
|
const msg = message;
|
|
10180
10362
|
const role = typeof msg.role === "string" ? msg.role : "unknown";
|
|
10181
|
-
const
|
|
10363
|
+
const structuredContent = toPiContentArray(msg.content);
|
|
10364
|
+
const content = structuredContent ?? extractPiTextContent(msg.content);
|
|
10182
10365
|
const toolCalls = extractToolCalls4(msg.content, toolTrackers, completedToolResults);
|
|
10183
10366
|
const startTimeVal = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
|
|
10184
10367
|
let msgTokenUsage;
|
|
@@ -10440,6 +10623,12 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
|
|
|
10440
10623
|
"FILES",
|
|
10441
10624
|
"OUTPUT_FILE"
|
|
10442
10625
|
]);
|
|
10626
|
+
var COMMON_TARGET_SETTINGS = [
|
|
10627
|
+
"provider_batching",
|
|
10628
|
+
"providerBatching",
|
|
10629
|
+
"subagent_mode_allowed",
|
|
10630
|
+
"subagentModeAllowed"
|
|
10631
|
+
];
|
|
10443
10632
|
var BASE_TARGET_SCHEMA = import_zod3.z.object({
|
|
10444
10633
|
name: import_zod3.z.string().min(1, "target name is required"),
|
|
10445
10634
|
provider: import_zod3.z.string().min(1, "provider is required"),
|
|
@@ -10448,7 +10637,8 @@ var BASE_TARGET_SCHEMA = import_zod3.z.object({
|
|
|
10448
10637
|
// backward compat
|
|
10449
10638
|
workers: import_zod3.z.number().int().min(1).optional(),
|
|
10450
10639
|
workspace_template: import_zod3.z.string().optional(),
|
|
10451
|
-
workspaceTemplate: import_zod3.z.string().optional()
|
|
10640
|
+
workspaceTemplate: import_zod3.z.string().optional(),
|
|
10641
|
+
subagent_mode_allowed: import_zod3.z.boolean().optional()
|
|
10452
10642
|
}).passthrough();
|
|
10453
10643
|
var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
|
|
10454
10644
|
var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
|
|
@@ -10511,42 +10701,40 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10511
10701
|
const providerBatching = resolveOptionalBoolean(
|
|
10512
10702
|
parsed.provider_batching ?? parsed.providerBatching
|
|
10513
10703
|
);
|
|
10704
|
+
const subagentModeAllowed = resolveOptionalBoolean(
|
|
10705
|
+
parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
|
|
10706
|
+
);
|
|
10707
|
+
const base = {
|
|
10708
|
+
name: parsed.name,
|
|
10709
|
+
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10710
|
+
workers: parsed.workers,
|
|
10711
|
+
providerBatching,
|
|
10712
|
+
subagentModeAllowed
|
|
10713
|
+
};
|
|
10514
10714
|
switch (provider) {
|
|
10515
10715
|
case "openai":
|
|
10516
10716
|
return {
|
|
10517
10717
|
kind: "openai",
|
|
10518
|
-
|
|
10519
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10520
|
-
workers: parsed.workers,
|
|
10521
|
-
providerBatching,
|
|
10718
|
+
...base,
|
|
10522
10719
|
config: resolveOpenAIConfig(parsed, env)
|
|
10523
10720
|
};
|
|
10524
10721
|
case "openrouter":
|
|
10525
10722
|
return {
|
|
10526
10723
|
kind: "openrouter",
|
|
10527
|
-
|
|
10528
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10529
|
-
workers: parsed.workers,
|
|
10530
|
-
providerBatching,
|
|
10724
|
+
...base,
|
|
10531
10725
|
config: resolveOpenRouterConfig(parsed, env)
|
|
10532
10726
|
};
|
|
10533
10727
|
case "azure":
|
|
10534
10728
|
case "azure-openai":
|
|
10535
10729
|
return {
|
|
10536
10730
|
kind: "azure",
|
|
10537
|
-
|
|
10538
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10539
|
-
workers: parsed.workers,
|
|
10540
|
-
providerBatching,
|
|
10731
|
+
...base,
|
|
10541
10732
|
config: resolveAzureConfig(parsed, env)
|
|
10542
10733
|
};
|
|
10543
10734
|
case "anthropic":
|
|
10544
10735
|
return {
|
|
10545
10736
|
kind: "anthropic",
|
|
10546
|
-
|
|
10547
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10548
|
-
workers: parsed.workers,
|
|
10549
|
-
providerBatching,
|
|
10737
|
+
...base,
|
|
10550
10738
|
config: resolveAnthropicConfig(parsed, env)
|
|
10551
10739
|
};
|
|
10552
10740
|
case "gemini":
|
|
@@ -10554,68 +10742,47 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10554
10742
|
case "google-gemini":
|
|
10555
10743
|
return {
|
|
10556
10744
|
kind: "gemini",
|
|
10557
|
-
|
|
10558
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10559
|
-
workers: parsed.workers,
|
|
10560
|
-
providerBatching,
|
|
10745
|
+
...base,
|
|
10561
10746
|
config: resolveGeminiConfig(parsed, env)
|
|
10562
10747
|
};
|
|
10563
10748
|
case "codex":
|
|
10564
10749
|
case "codex-cli":
|
|
10565
10750
|
return {
|
|
10566
10751
|
kind: "codex",
|
|
10567
|
-
|
|
10568
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10569
|
-
workers: parsed.workers,
|
|
10570
|
-
providerBatching,
|
|
10752
|
+
...base,
|
|
10571
10753
|
config: resolveCodexConfig(parsed, env, evalFilePath)
|
|
10572
10754
|
};
|
|
10573
10755
|
case "copilot-sdk":
|
|
10574
10756
|
case "copilot_sdk":
|
|
10575
10757
|
return {
|
|
10576
10758
|
kind: "copilot-sdk",
|
|
10577
|
-
|
|
10578
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10579
|
-
workers: parsed.workers,
|
|
10580
|
-
providerBatching,
|
|
10759
|
+
...base,
|
|
10581
10760
|
config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
|
|
10582
10761
|
};
|
|
10583
10762
|
case "copilot":
|
|
10584
10763
|
case "copilot-cli":
|
|
10585
10764
|
return {
|
|
10586
10765
|
kind: "copilot-cli",
|
|
10587
|
-
|
|
10588
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10589
|
-
workers: parsed.workers,
|
|
10590
|
-
providerBatching,
|
|
10766
|
+
...base,
|
|
10591
10767
|
config: resolveCopilotCliConfig(parsed, env, evalFilePath)
|
|
10592
10768
|
};
|
|
10593
10769
|
case "copilot-log":
|
|
10594
10770
|
return {
|
|
10595
10771
|
kind: "copilot-log",
|
|
10596
|
-
|
|
10597
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10598
|
-
workers: parsed.workers,
|
|
10599
|
-
providerBatching,
|
|
10772
|
+
...base,
|
|
10600
10773
|
config: resolveCopilotLogConfig(parsed, env)
|
|
10601
10774
|
};
|
|
10602
10775
|
case "pi":
|
|
10603
10776
|
case "pi-coding-agent":
|
|
10604
10777
|
return {
|
|
10605
10778
|
kind: "pi-coding-agent",
|
|
10606
|
-
|
|
10607
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10608
|
-
workers: parsed.workers,
|
|
10609
|
-
providerBatching,
|
|
10779
|
+
...base,
|
|
10610
10780
|
config: resolvePiCodingAgentConfig(parsed, env, evalFilePath)
|
|
10611
10781
|
};
|
|
10612
10782
|
case "pi-cli":
|
|
10613
10783
|
return {
|
|
10614
10784
|
kind: "pi-cli",
|
|
10615
|
-
|
|
10616
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10617
|
-
workers: parsed.workers,
|
|
10618
|
-
providerBatching,
|
|
10785
|
+
...base,
|
|
10619
10786
|
config: resolvePiCliConfig(parsed, env, evalFilePath)
|
|
10620
10787
|
};
|
|
10621
10788
|
case "claude":
|
|
@@ -10623,38 +10790,26 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10623
10790
|
case "claude-cli":
|
|
10624
10791
|
return {
|
|
10625
10792
|
kind: "claude-cli",
|
|
10626
|
-
|
|
10627
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10628
|
-
workers: parsed.workers,
|
|
10629
|
-
providerBatching,
|
|
10793
|
+
...base,
|
|
10630
10794
|
config: resolveClaudeConfig(parsed, env, evalFilePath)
|
|
10631
10795
|
};
|
|
10632
10796
|
case "claude-sdk":
|
|
10633
10797
|
return {
|
|
10634
10798
|
kind: "claude-sdk",
|
|
10635
|
-
|
|
10636
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10637
|
-
workers: parsed.workers,
|
|
10638
|
-
providerBatching,
|
|
10799
|
+
...base,
|
|
10639
10800
|
config: resolveClaudeConfig(parsed, env, evalFilePath)
|
|
10640
10801
|
};
|
|
10641
10802
|
case "mock":
|
|
10642
10803
|
return {
|
|
10643
10804
|
kind: "mock",
|
|
10644
|
-
|
|
10645
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10646
|
-
workers: parsed.workers,
|
|
10647
|
-
providerBatching,
|
|
10805
|
+
...base,
|
|
10648
10806
|
config: resolveMockConfig(parsed)
|
|
10649
10807
|
};
|
|
10650
10808
|
case "vscode":
|
|
10651
10809
|
case "vscode-insiders":
|
|
10652
10810
|
return {
|
|
10653
10811
|
kind: provider,
|
|
10654
|
-
|
|
10655
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10656
|
-
workers: parsed.workers,
|
|
10657
|
-
providerBatching,
|
|
10812
|
+
...base,
|
|
10658
10813
|
config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders", evalFilePath)
|
|
10659
10814
|
};
|
|
10660
10815
|
case "agentv": {
|
|
@@ -10667,29 +10822,21 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
|
|
|
10667
10822
|
const temperature = typeof parsed.temperature === "number" ? parsed.temperature : 0;
|
|
10668
10823
|
return {
|
|
10669
10824
|
kind: "agentv",
|
|
10670
|
-
|
|
10671
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10825
|
+
...base,
|
|
10672
10826
|
workers: typeof parsed.workers === "number" ? parsed.workers : void 0,
|
|
10673
|
-
providerBatching,
|
|
10674
10827
|
config: { model, temperature }
|
|
10675
10828
|
};
|
|
10676
10829
|
}
|
|
10677
10830
|
case "cli":
|
|
10678
10831
|
return {
|
|
10679
10832
|
kind: "cli",
|
|
10680
|
-
|
|
10681
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10682
|
-
workers: parsed.workers,
|
|
10683
|
-
providerBatching,
|
|
10833
|
+
...base,
|
|
10684
10834
|
config: resolveCliConfig(parsed, env, evalFilePath)
|
|
10685
10835
|
};
|
|
10686
10836
|
default:
|
|
10687
10837
|
return {
|
|
10688
10838
|
kind: "cli",
|
|
10689
|
-
|
|
10690
|
-
graderTarget: parsed.grader_target ?? parsed.judge_target,
|
|
10691
|
-
workers: parsed.workers,
|
|
10692
|
-
providerBatching,
|
|
10839
|
+
...base,
|
|
10693
10840
|
config: resolveDiscoveredProviderConfig(parsed, provider, env, evalFilePath)
|
|
10694
10841
|
};
|
|
10695
10842
|
}
|
|
@@ -11317,8 +11464,8 @@ function resolveCliConfig(target, env, evalFilePath) {
|
|
|
11317
11464
|
const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
|
|
11318
11465
|
if (!parseResult.success) {
|
|
11319
11466
|
const firstError = parseResult.error.errors[0];
|
|
11320
|
-
const
|
|
11321
|
-
const prefix =
|
|
11467
|
+
const path51 = firstError?.path.join(".") || "";
|
|
11468
|
+
const prefix = path51 ? `${target.name} ${path51}: ` : `${target.name}: `;
|
|
11322
11469
|
throw new Error(`${prefix}${firstError?.message}`);
|
|
11323
11470
|
}
|
|
11324
11471
|
const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
|
|
@@ -13007,6 +13154,41 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
|
|
|
13007
13154
|
}
|
|
13008
13155
|
}
|
|
13009
13156
|
|
|
13157
|
+
// src/evaluation/providers/types.ts
|
|
13158
|
+
var AGENT_PROVIDER_KINDS = [
|
|
13159
|
+
"codex",
|
|
13160
|
+
"copilot-sdk",
|
|
13161
|
+
"copilot-cli",
|
|
13162
|
+
"pi-coding-agent",
|
|
13163
|
+
"pi-cli",
|
|
13164
|
+
"claude",
|
|
13165
|
+
"claude-cli",
|
|
13166
|
+
"claude-sdk",
|
|
13167
|
+
"vscode",
|
|
13168
|
+
"vscode-insiders"
|
|
13169
|
+
];
|
|
13170
|
+
function extractLastAssistantContent(messages) {
|
|
13171
|
+
if (!messages || messages.length === 0) {
|
|
13172
|
+
return "";
|
|
13173
|
+
}
|
|
13174
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
13175
|
+
const msg = messages[i];
|
|
13176
|
+
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
13177
|
+
if (typeof msg.content === "string") {
|
|
13178
|
+
return msg.content;
|
|
13179
|
+
}
|
|
13180
|
+
if (isContentArray(msg.content)) {
|
|
13181
|
+
return getTextContent(msg.content);
|
|
13182
|
+
}
|
|
13183
|
+
return JSON.stringify(msg.content);
|
|
13184
|
+
}
|
|
13185
|
+
}
|
|
13186
|
+
return "";
|
|
13187
|
+
}
|
|
13188
|
+
function isAgentProvider(provider) {
|
|
13189
|
+
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
13190
|
+
}
|
|
13191
|
+
|
|
13010
13192
|
// src/evaluation/providers/targets-file.ts
|
|
13011
13193
|
var import_node_fs11 = require("fs");
|
|
13012
13194
|
var import_promises27 = require("fs/promises");
|
|
@@ -13319,13 +13501,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
13319
13501
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
13320
13502
|
const { mkdir: mkdir17, readFile: readFile16, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
|
|
13321
13503
|
const { tmpdir: tmpdir3 } = await import("os");
|
|
13322
|
-
const
|
|
13504
|
+
const path51 = await import("path");
|
|
13323
13505
|
const { randomUUID: randomUUID10 } = await import("crypto");
|
|
13324
|
-
const dir =
|
|
13506
|
+
const dir = path51.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
|
|
13325
13507
|
await mkdir17(dir, { recursive: true });
|
|
13326
|
-
const stdinPath =
|
|
13327
|
-
const stdoutPath =
|
|
13328
|
-
const stderrPath =
|
|
13508
|
+
const stdinPath = path51.join(dir, "stdin.txt");
|
|
13509
|
+
const stdoutPath = path51.join(dir, "stdout.txt");
|
|
13510
|
+
const stderrPath = path51.join(dir, "stderr.txt");
|
|
13329
13511
|
await writeFile9(stdinPath, stdinPayload, "utf8");
|
|
13330
13512
|
const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
|
|
13331
13513
|
const { spawn: spawn5 } = await import("child_process");
|
|
@@ -13457,7 +13639,7 @@ async function createTargetProxy(options) {
|
|
|
13457
13639
|
totalOutputTokens += response.tokenUsage.output;
|
|
13458
13640
|
}
|
|
13459
13641
|
const output = response.output ?? [];
|
|
13460
|
-
const rawText =
|
|
13642
|
+
const rawText = extractLastAssistantContent2(output);
|
|
13461
13643
|
const result = {
|
|
13462
13644
|
output,
|
|
13463
13645
|
rawText,
|
|
@@ -13515,7 +13697,7 @@ async function createTargetProxy(options) {
|
|
|
13515
13697
|
const output = response.output ?? [];
|
|
13516
13698
|
responses.push({
|
|
13517
13699
|
output,
|
|
13518
|
-
rawText:
|
|
13700
|
+
rawText: extractLastAssistantContent2(output),
|
|
13519
13701
|
tokenUsage: response.tokenUsage
|
|
13520
13702
|
});
|
|
13521
13703
|
} catch (error) {
|
|
@@ -13572,7 +13754,7 @@ function readBody(req) {
|
|
|
13572
13754
|
req.on("error", reject);
|
|
13573
13755
|
});
|
|
13574
13756
|
}
|
|
13575
|
-
function
|
|
13757
|
+
function extractLastAssistantContent2(messages) {
|
|
13576
13758
|
for (let i = messages.length - 1; i >= 0; i--) {
|
|
13577
13759
|
const msg = messages[i];
|
|
13578
13760
|
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
@@ -13641,6 +13823,56 @@ function toCamelCaseDeep(obj) {
|
|
|
13641
13823
|
|
|
13642
13824
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
13643
13825
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
13826
|
+
var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
|
|
13827
|
+
async function materializeContentForGrader(messages, getWorkDir) {
|
|
13828
|
+
if (!messages || messages.length === 0) return messages ?? null;
|
|
13829
|
+
let hasAnyImage = false;
|
|
13830
|
+
for (const msg of messages) {
|
|
13831
|
+
if (isContentArray(msg.content)) {
|
|
13832
|
+
for (const block of msg.content) {
|
|
13833
|
+
if (block.type === "image") {
|
|
13834
|
+
hasAnyImage = true;
|
|
13835
|
+
break;
|
|
13836
|
+
}
|
|
13837
|
+
}
|
|
13838
|
+
}
|
|
13839
|
+
if (hasAnyImage) break;
|
|
13840
|
+
}
|
|
13841
|
+
if (!hasAnyImage) return messages;
|
|
13842
|
+
let counter = 0;
|
|
13843
|
+
const result = [];
|
|
13844
|
+
for (const msg of messages) {
|
|
13845
|
+
if (!isContentArray(msg.content)) {
|
|
13846
|
+
result.push(msg);
|
|
13847
|
+
continue;
|
|
13848
|
+
}
|
|
13849
|
+
if (!msg.content.some((b) => b.type === "image")) {
|
|
13850
|
+
result.push(msg);
|
|
13851
|
+
continue;
|
|
13852
|
+
}
|
|
13853
|
+
const blocks = [];
|
|
13854
|
+
for (const block of msg.content) {
|
|
13855
|
+
if (block.type !== "image") {
|
|
13856
|
+
blocks.push({ ...block });
|
|
13857
|
+
continue;
|
|
13858
|
+
}
|
|
13859
|
+
const img = block;
|
|
13860
|
+
const match = DATA_URI_RE.exec(img.source);
|
|
13861
|
+
if (match) {
|
|
13862
|
+
const [, mediaType, base64Data] = match;
|
|
13863
|
+
const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
|
|
13864
|
+
const dir = await getWorkDir();
|
|
13865
|
+
const filePath = (0, import_node_path38.join)(dir, `img-${counter++}.${ext}`);
|
|
13866
|
+
await (0, import_promises28.writeFile)(filePath, Buffer.from(base64Data, "base64"));
|
|
13867
|
+
blocks.push({ type: "image", media_type: img.media_type, path: filePath });
|
|
13868
|
+
} else {
|
|
13869
|
+
blocks.push({ type: "image", media_type: img.media_type, path: img.source });
|
|
13870
|
+
}
|
|
13871
|
+
}
|
|
13872
|
+
result.push({ ...msg, content: blocks });
|
|
13873
|
+
}
|
|
13874
|
+
return result;
|
|
13875
|
+
}
|
|
13644
13876
|
var CodeEvaluator = class {
|
|
13645
13877
|
kind = "code-grader";
|
|
13646
13878
|
command;
|
|
@@ -13656,7 +13888,18 @@ var CodeEvaluator = class {
|
|
|
13656
13888
|
this.target = options.target;
|
|
13657
13889
|
}
|
|
13658
13890
|
async evaluate(context2) {
|
|
13659
|
-
let
|
|
13891
|
+
let imageTmpDir;
|
|
13892
|
+
const getImageDir = async () => {
|
|
13893
|
+
if (!imageTmpDir) {
|
|
13894
|
+
imageTmpDir = await (0, import_promises28.mkdtemp)((0, import_node_path38.join)((0, import_node_os7.tmpdir)(), "agentv-img-"));
|
|
13895
|
+
}
|
|
13896
|
+
return imageTmpDir;
|
|
13897
|
+
};
|
|
13898
|
+
const materializedOutput = await materializeContentForGrader(
|
|
13899
|
+
context2.output,
|
|
13900
|
+
getImageDir
|
|
13901
|
+
);
|
|
13902
|
+
let outputForPayload = materializedOutput;
|
|
13660
13903
|
let outputPath;
|
|
13661
13904
|
if (outputForPayload) {
|
|
13662
13905
|
const serialized = JSON.stringify(outputForPayload);
|
|
@@ -13669,12 +13912,17 @@ var CodeEvaluator = class {
|
|
|
13669
13912
|
}
|
|
13670
13913
|
const payload = {
|
|
13671
13914
|
criteria: context2.evalCase.criteria,
|
|
13672
|
-
expectedOutput:
|
|
13673
|
-
|
|
13915
|
+
expectedOutput: await materializeContentForGrader(
|
|
13916
|
+
context2.evalCase.expected_output,
|
|
13917
|
+
getImageDir
|
|
13918
|
+
),
|
|
13674
13919
|
output: outputForPayload,
|
|
13675
13920
|
outputPath,
|
|
13676
13921
|
inputFiles: context2.evalCase.file_paths,
|
|
13677
|
-
input:
|
|
13922
|
+
input: await materializeContentForGrader(
|
|
13923
|
+
context2.evalCase.input,
|
|
13924
|
+
getImageDir
|
|
13925
|
+
),
|
|
13678
13926
|
trace: context2.trace ?? null,
|
|
13679
13927
|
tokenUsage: context2.tokenUsage ?? null,
|
|
13680
13928
|
costUsd: context2.costUsd ?? null,
|
|
@@ -13683,9 +13931,7 @@ var CodeEvaluator = class {
|
|
|
13683
13931
|
endTime: context2.endTime ?? null,
|
|
13684
13932
|
fileChanges: context2.fileChanges ?? null,
|
|
13685
13933
|
workspacePath: context2.workspacePath ?? null,
|
|
13686
|
-
config: this.config ?? null
|
|
13687
|
-
inputText: context2.evalCase.question,
|
|
13688
|
-
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
13934
|
+
config: this.config ?? null
|
|
13689
13935
|
};
|
|
13690
13936
|
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
13691
13937
|
let proxyEnv;
|
|
@@ -13775,6 +14021,10 @@ var CodeEvaluator = class {
|
|
|
13775
14021
|
await (0, import_promises28.rm)((0, import_node_path38.dirname)(outputPath), { recursive: true, force: true }).catch(() => {
|
|
13776
14022
|
});
|
|
13777
14023
|
}
|
|
14024
|
+
if (imageTmpDir) {
|
|
14025
|
+
await (0, import_promises28.rm)(imageTmpDir, { recursive: true, force: true }).catch(() => {
|
|
14026
|
+
});
|
|
14027
|
+
}
|
|
13778
14028
|
}
|
|
13779
14029
|
}
|
|
13780
14030
|
};
|
|
@@ -13802,38 +14052,6 @@ ${tail}`;
|
|
|
13802
14052
|
// src/evaluation/evaluators/composite.ts
|
|
13803
14053
|
var import_ai3 = require("ai");
|
|
13804
14054
|
|
|
13805
|
-
// src/evaluation/providers/types.ts
|
|
13806
|
-
var AGENT_PROVIDER_KINDS = [
|
|
13807
|
-
"codex",
|
|
13808
|
-
"copilot-sdk",
|
|
13809
|
-
"copilot-cli",
|
|
13810
|
-
"pi-coding-agent",
|
|
13811
|
-
"pi-cli",
|
|
13812
|
-
"claude",
|
|
13813
|
-
"claude-cli",
|
|
13814
|
-
"claude-sdk",
|
|
13815
|
-
"vscode",
|
|
13816
|
-
"vscode-insiders"
|
|
13817
|
-
];
|
|
13818
|
-
function extractLastAssistantContent2(messages) {
|
|
13819
|
-
if (!messages || messages.length === 0) {
|
|
13820
|
-
return "";
|
|
13821
|
-
}
|
|
13822
|
-
for (let i = messages.length - 1; i >= 0; i--) {
|
|
13823
|
-
const msg = messages[i];
|
|
13824
|
-
if (msg.role === "assistant" && msg.content !== void 0) {
|
|
13825
|
-
if (typeof msg.content === "string") {
|
|
13826
|
-
return msg.content;
|
|
13827
|
-
}
|
|
13828
|
-
return JSON.stringify(msg.content);
|
|
13829
|
-
}
|
|
13830
|
-
}
|
|
13831
|
-
return "";
|
|
13832
|
-
}
|
|
13833
|
-
function isAgentProvider(provider) {
|
|
13834
|
-
return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
|
|
13835
|
-
}
|
|
13836
|
-
|
|
13837
14055
|
// src/evaluation/evaluators/llm-grader.ts
|
|
13838
14056
|
var import_promises29 = __toESM(require("fs/promises"), 1);
|
|
13839
14057
|
var import_node_path39 = __toESM(require("path"), 1);
|
|
@@ -13884,13 +14102,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
13884
14102
|
{{${TEMPLATE_VARIABLES.CRITERIA}}}
|
|
13885
14103
|
|
|
13886
14104
|
[[ ## question ## ]]
|
|
13887
|
-
{{${TEMPLATE_VARIABLES.
|
|
14105
|
+
{{${TEMPLATE_VARIABLES.INPUT}}}
|
|
13888
14106
|
|
|
13889
14107
|
[[ ## reference_answer ## ]]
|
|
13890
|
-
{{${TEMPLATE_VARIABLES.
|
|
14108
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}}
|
|
13891
14109
|
|
|
13892
14110
|
[[ ## answer ## ]]
|
|
13893
|
-
{{${TEMPLATE_VARIABLES.
|
|
14111
|
+
{{${TEMPLATE_VARIABLES.OUTPUT}}}`;
|
|
13894
14112
|
var freeformEvaluationSchema = import_zod4.z.object({
|
|
13895
14113
|
score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
13896
14114
|
assertions: import_zod4.z.array(
|
|
@@ -13962,21 +14180,19 @@ var LlmGraderEvaluator = class {
|
|
|
13962
14180
|
async evaluateFreeform(context2, graderProvider) {
|
|
13963
14181
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
13964
14182
|
const variables = {
|
|
13965
|
-
[TEMPLATE_VARIABLES.INPUT]:
|
|
13966
|
-
[TEMPLATE_VARIABLES.
|
|
13967
|
-
|
|
13968
|
-
null,
|
|
13969
|
-
2
|
|
13970
|
-
),
|
|
13971
|
-
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
|
|
14183
|
+
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
14184
|
+
[TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
|
|
14185
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13972
14186
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
13973
14187
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
14188
|
+
// Deprecated aliases — same values as the primary variables above
|
|
13974
14189
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
13975
14190
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
13976
14191
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
13977
14192
|
};
|
|
13978
14193
|
const systemPrompt = buildOutputSchema();
|
|
13979
14194
|
const evaluatorTemplate = context2.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
14195
|
+
warnDeprecatedTemplateVars(evaluatorTemplate);
|
|
13980
14196
|
let userPrompt = substituteVariables(evaluatorTemplate, variables);
|
|
13981
14197
|
if (context2.fileChanges && !context2.evaluatorTemplateOverride && !this.evaluatorTemplate) {
|
|
13982
14198
|
userPrompt += `
|
|
@@ -13988,13 +14204,15 @@ ${context2.fileChanges}`;
|
|
|
13988
14204
|
userPrompt,
|
|
13989
14205
|
systemPrompt
|
|
13990
14206
|
};
|
|
14207
|
+
const images = context2.output ? extractImageBlocks(context2.output) : [];
|
|
13991
14208
|
try {
|
|
13992
14209
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
13993
14210
|
context: context2,
|
|
13994
14211
|
graderProvider,
|
|
13995
14212
|
systemPrompt,
|
|
13996
14213
|
userPrompt,
|
|
13997
|
-
schema: freeformEvaluationSchema
|
|
14214
|
+
schema: freeformEvaluationSchema,
|
|
14215
|
+
images
|
|
13998
14216
|
});
|
|
13999
14217
|
const score = clampScore(data.score);
|
|
14000
14218
|
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
@@ -14038,13 +14256,15 @@ ${context2.fileChanges}`;
|
|
|
14038
14256
|
userPrompt: prompt,
|
|
14039
14257
|
systemPrompt
|
|
14040
14258
|
};
|
|
14259
|
+
const images = context2.output ? extractImageBlocks(context2.output) : [];
|
|
14041
14260
|
try {
|
|
14042
14261
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
14043
14262
|
context: context2,
|
|
14044
14263
|
graderProvider,
|
|
14045
14264
|
systemPrompt,
|
|
14046
14265
|
userPrompt: prompt,
|
|
14047
|
-
schema: rubricEvaluationSchema
|
|
14266
|
+
schema: rubricEvaluationSchema,
|
|
14267
|
+
images
|
|
14048
14268
|
});
|
|
14049
14269
|
const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
|
|
14050
14270
|
return {
|
|
@@ -14081,13 +14301,15 @@ ${context2.fileChanges}`;
|
|
|
14081
14301
|
userPrompt: prompt,
|
|
14082
14302
|
systemPrompt
|
|
14083
14303
|
};
|
|
14304
|
+
const images = context2.output ? extractImageBlocks(context2.output) : [];
|
|
14084
14305
|
try {
|
|
14085
14306
|
const { data, tokenUsage } = await this.runWithRetry({
|
|
14086
14307
|
context: context2,
|
|
14087
14308
|
graderProvider,
|
|
14088
14309
|
systemPrompt,
|
|
14089
14310
|
userPrompt: prompt,
|
|
14090
|
-
schema: scoreRangeEvaluationSchema
|
|
14311
|
+
schema: scoreRangeEvaluationSchema,
|
|
14312
|
+
images
|
|
14091
14313
|
});
|
|
14092
14314
|
const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
|
|
14093
14315
|
return {
|
|
@@ -14217,7 +14439,7 @@ ${context2.fileChanges}`;
|
|
|
14217
14439
|
evalCaseId: context2.evalCase.id,
|
|
14218
14440
|
attempt: context2.attempt
|
|
14219
14441
|
});
|
|
14220
|
-
const assistantContent =
|
|
14442
|
+
const assistantContent = extractLastAssistantContent(response.output);
|
|
14221
14443
|
if (!assistantContent) {
|
|
14222
14444
|
return {
|
|
14223
14445
|
score: 0,
|
|
@@ -14294,12 +14516,17 @@ ${context2.fileChanges}`;
|
|
|
14294
14516
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
14295
14517
|
const variables = {
|
|
14296
14518
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
14519
|
+
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
14520
|
+
[TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
|
|
14521
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
14522
|
+
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
14523
|
+
// Deprecated aliases
|
|
14297
14524
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
14298
14525
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
14299
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
14300
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
14526
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
14301
14527
|
};
|
|
14302
14528
|
if (this.evaluatorTemplate) {
|
|
14529
|
+
warnDeprecatedTemplateVars(this.evaluatorTemplate);
|
|
14303
14530
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
14304
14531
|
}
|
|
14305
14532
|
const config = context2.evaluator;
|
|
@@ -14350,11 +14577,16 @@ ${context2.fileChanges}`;
|
|
|
14350
14577
|
if (this.evaluatorTemplate) {
|
|
14351
14578
|
const variables = {
|
|
14352
14579
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
14580
|
+
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
14581
|
+
[TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
|
|
14582
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
14583
|
+
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
14584
|
+
// Deprecated aliases
|
|
14353
14585
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
14354
14586
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
14355
|
-
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
14356
|
-
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
14587
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
14357
14588
|
};
|
|
14589
|
+
warnDeprecatedTemplateVars(this.evaluatorTemplate);
|
|
14358
14590
|
const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
|
|
14359
14591
|
const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
|
|
14360
14592
|
return `${customPrompt}
|
|
@@ -14525,18 +14757,35 @@ ${outputSchema}`;
|
|
|
14525
14757
|
// LLM mode retry logic
|
|
14526
14758
|
// ---------------------------------------------------------------------------
|
|
14527
14759
|
async runWithRetry(options) {
|
|
14528
|
-
const { context: context2, graderProvider, systemPrompt, userPrompt, schema } = options;
|
|
14760
|
+
const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
|
|
14529
14761
|
let lastError;
|
|
14530
14762
|
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
14531
14763
|
try {
|
|
14532
14764
|
const model = graderProvider.asLanguageModel?.();
|
|
14533
14765
|
if (model) {
|
|
14534
|
-
const
|
|
14766
|
+
const modelOptions = {
|
|
14767
|
+
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
14768
|
+
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
14769
|
+
};
|
|
14770
|
+
const hasImages = images && images.length > 0;
|
|
14771
|
+
const result = hasImages ? await (0, import_ai2.generateText)({
|
|
14772
|
+
model,
|
|
14773
|
+
system: systemPrompt,
|
|
14774
|
+
messages: [
|
|
14775
|
+
{
|
|
14776
|
+
role: "user",
|
|
14777
|
+
content: [
|
|
14778
|
+
{ type: "text", text: userPrompt },
|
|
14779
|
+
...toAiSdkImageParts(images)
|
|
14780
|
+
]
|
|
14781
|
+
}
|
|
14782
|
+
],
|
|
14783
|
+
...modelOptions
|
|
14784
|
+
}) : await (0, import_ai2.generateText)({
|
|
14535
14785
|
model,
|
|
14536
14786
|
system: systemPrompt,
|
|
14537
14787
|
prompt: userPrompt,
|
|
14538
|
-
...
|
|
14539
|
-
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
14788
|
+
...modelOptions
|
|
14540
14789
|
});
|
|
14541
14790
|
const data2 = schema.parse(parseJsonFromText(result.text));
|
|
14542
14791
|
const rawUsage = result.usage;
|
|
@@ -14551,7 +14800,7 @@ ${outputSchema}`;
|
|
|
14551
14800
|
maxOutputTokens: this.maxOutputTokens,
|
|
14552
14801
|
temperature: this.temperature
|
|
14553
14802
|
});
|
|
14554
|
-
const data = schema.parse(parseJsonFromText(
|
|
14803
|
+
const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
|
|
14555
14804
|
return { data, providerResponse: response, tokenUsage: response.tokenUsage };
|
|
14556
14805
|
} catch (e) {
|
|
14557
14806
|
lastError = e instanceof Error ? e : new Error(String(e));
|
|
@@ -14596,6 +14845,26 @@ function substituteVariables(template, variables) {
|
|
|
14596
14845
|
return variables[varName] ?? match;
|
|
14597
14846
|
});
|
|
14598
14847
|
}
|
|
14848
|
+
var ANSI_YELLOW8 = "\x1B[33m";
|
|
14849
|
+
var ANSI_RESET9 = "\x1B[0m";
|
|
14850
|
+
var warnedTemplateStrings = /* @__PURE__ */ new Set();
|
|
14851
|
+
function warnDeprecatedTemplateVars(template) {
|
|
14852
|
+
if (warnedTemplateStrings.has(template)) return;
|
|
14853
|
+
const used = [];
|
|
14854
|
+
for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
|
|
14855
|
+
if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
|
|
14856
|
+
used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
|
|
14857
|
+
}
|
|
14858
|
+
}
|
|
14859
|
+
if (used.length > 0) {
|
|
14860
|
+
warnedTemplateStrings.add(template);
|
|
14861
|
+
console.warn(
|
|
14862
|
+
`${ANSI_YELLOW8}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
|
|
14863
|
+
${used.join("\n ")}
|
|
14864
|
+
Update your custom evaluator template to use the new names.${ANSI_RESET9}`
|
|
14865
|
+
);
|
|
14866
|
+
}
|
|
14867
|
+
}
|
|
14599
14868
|
function calculateRubricScore(result, rubrics) {
|
|
14600
14869
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
14601
14870
|
const assertions = [];
|
|
@@ -14690,6 +14959,26 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
14690
14959
|
}
|
|
14691
14960
|
};
|
|
14692
14961
|
}
|
|
14962
|
+
function extractImageBlocks(messages) {
|
|
14963
|
+
const images = [];
|
|
14964
|
+
for (const msg of messages) {
|
|
14965
|
+
if (msg.role !== "assistant") continue;
|
|
14966
|
+
if (!isContentArray(msg.content)) continue;
|
|
14967
|
+
for (const block of msg.content) {
|
|
14968
|
+
if (block.type === "image") {
|
|
14969
|
+
images.push(block);
|
|
14970
|
+
}
|
|
14971
|
+
}
|
|
14972
|
+
}
|
|
14973
|
+
return images;
|
|
14974
|
+
}
|
|
14975
|
+
function toAiSdkImageParts(images) {
|
|
14976
|
+
return images.map((img) => ({
|
|
14977
|
+
type: "image",
|
|
14978
|
+
image: img.source,
|
|
14979
|
+
mediaType: img.media_type || void 0
|
|
14980
|
+
}));
|
|
14981
|
+
}
|
|
14693
14982
|
function resolveSandboxed(basePath, relativePath) {
|
|
14694
14983
|
const resolved = import_node_path39.default.resolve(basePath, relativePath);
|
|
14695
14984
|
if (!resolved.startsWith(basePath + import_node_path39.default.sep) && resolved !== basePath) {
|
|
@@ -15075,7 +15364,7 @@ var CompositeEvaluator = class {
|
|
|
15075
15364
|
attempt: context2.attempt
|
|
15076
15365
|
});
|
|
15077
15366
|
const data = freeformEvaluationSchema.parse(
|
|
15078
|
-
parseJsonFromText(
|
|
15367
|
+
parseJsonFromText(extractLastAssistantContent(response.output))
|
|
15079
15368
|
);
|
|
15080
15369
|
const score = clampScore(data.score);
|
|
15081
15370
|
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
@@ -15431,115 +15720,115 @@ var FieldAccuracyEvaluator = class {
|
|
|
15431
15720
|
* Evaluate a single field against the expected value.
|
|
15432
15721
|
*/
|
|
15433
15722
|
evaluateField(fieldConfig, candidateData, expectedData) {
|
|
15434
|
-
const { path:
|
|
15435
|
-
const candidateValue = resolvePath(candidateData,
|
|
15436
|
-
const expectedValue = resolvePath(expectedData,
|
|
15723
|
+
const { path: path51, match, required = true, weight = 1 } = fieldConfig;
|
|
15724
|
+
const candidateValue = resolvePath(candidateData, path51);
|
|
15725
|
+
const expectedValue = resolvePath(expectedData, path51);
|
|
15437
15726
|
if (expectedValue === void 0) {
|
|
15438
15727
|
return {
|
|
15439
|
-
path:
|
|
15728
|
+
path: path51,
|
|
15440
15729
|
score: 1,
|
|
15441
15730
|
// No expected value means no comparison needed
|
|
15442
15731
|
weight,
|
|
15443
15732
|
hit: true,
|
|
15444
|
-
message: `${
|
|
15733
|
+
message: `${path51}: no expected value`
|
|
15445
15734
|
};
|
|
15446
15735
|
}
|
|
15447
15736
|
if (candidateValue === void 0) {
|
|
15448
15737
|
if (required) {
|
|
15449
15738
|
return {
|
|
15450
|
-
path:
|
|
15739
|
+
path: path51,
|
|
15451
15740
|
score: 0,
|
|
15452
15741
|
weight,
|
|
15453
15742
|
hit: false,
|
|
15454
|
-
message: `${
|
|
15743
|
+
message: `${path51} (required, missing)`
|
|
15455
15744
|
};
|
|
15456
15745
|
}
|
|
15457
15746
|
return {
|
|
15458
|
-
path:
|
|
15747
|
+
path: path51,
|
|
15459
15748
|
score: 1,
|
|
15460
15749
|
// Don't penalize missing optional fields
|
|
15461
15750
|
weight: 0,
|
|
15462
15751
|
// Zero weight means it won't affect the score
|
|
15463
15752
|
hit: true,
|
|
15464
|
-
message: `${
|
|
15753
|
+
message: `${path51}: optional field missing`
|
|
15465
15754
|
};
|
|
15466
15755
|
}
|
|
15467
15756
|
switch (match) {
|
|
15468
15757
|
case "exact":
|
|
15469
|
-
return this.compareExact(
|
|
15758
|
+
return this.compareExact(path51, candidateValue, expectedValue, weight);
|
|
15470
15759
|
case "numeric_tolerance":
|
|
15471
15760
|
return this.compareNumericTolerance(
|
|
15472
|
-
|
|
15761
|
+
path51,
|
|
15473
15762
|
candidateValue,
|
|
15474
15763
|
expectedValue,
|
|
15475
15764
|
fieldConfig,
|
|
15476
15765
|
weight
|
|
15477
15766
|
);
|
|
15478
15767
|
case "date":
|
|
15479
|
-
return this.compareDate(
|
|
15768
|
+
return this.compareDate(path51, candidateValue, expectedValue, fieldConfig, weight);
|
|
15480
15769
|
default:
|
|
15481
15770
|
return {
|
|
15482
|
-
path:
|
|
15771
|
+
path: path51,
|
|
15483
15772
|
score: 0,
|
|
15484
15773
|
weight,
|
|
15485
15774
|
hit: false,
|
|
15486
|
-
message: `${
|
|
15775
|
+
message: `${path51}: unknown match type "${match}"`
|
|
15487
15776
|
};
|
|
15488
15777
|
}
|
|
15489
15778
|
}
|
|
15490
15779
|
/**
|
|
15491
15780
|
* Exact equality comparison.
|
|
15492
15781
|
*/
|
|
15493
|
-
compareExact(
|
|
15782
|
+
compareExact(path51, candidateValue, expectedValue, weight) {
|
|
15494
15783
|
if (deepEqual(candidateValue, expectedValue)) {
|
|
15495
15784
|
return {
|
|
15496
|
-
path:
|
|
15785
|
+
path: path51,
|
|
15497
15786
|
score: 1,
|
|
15498
15787
|
weight,
|
|
15499
15788
|
hit: true,
|
|
15500
|
-
message:
|
|
15789
|
+
message: path51
|
|
15501
15790
|
};
|
|
15502
15791
|
}
|
|
15503
15792
|
if (typeof candidateValue !== typeof expectedValue) {
|
|
15504
15793
|
return {
|
|
15505
|
-
path:
|
|
15794
|
+
path: path51,
|
|
15506
15795
|
score: 0,
|
|
15507
15796
|
weight,
|
|
15508
15797
|
hit: false,
|
|
15509
|
-
message: `${
|
|
15798
|
+
message: `${path51} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
|
|
15510
15799
|
};
|
|
15511
15800
|
}
|
|
15512
15801
|
return {
|
|
15513
|
-
path:
|
|
15802
|
+
path: path51,
|
|
15514
15803
|
score: 0,
|
|
15515
15804
|
weight,
|
|
15516
15805
|
hit: false,
|
|
15517
|
-
message: `${
|
|
15806
|
+
message: `${path51} (value mismatch)`
|
|
15518
15807
|
};
|
|
15519
15808
|
}
|
|
15520
15809
|
/**
|
|
15521
15810
|
* Numeric comparison with absolute or relative tolerance.
|
|
15522
15811
|
*/
|
|
15523
|
-
compareNumericTolerance(
|
|
15812
|
+
compareNumericTolerance(path51, candidateValue, expectedValue, fieldConfig, weight) {
|
|
15524
15813
|
const { tolerance = 0, relative = false } = fieldConfig;
|
|
15525
15814
|
const candidateNum = toNumber(candidateValue);
|
|
15526
15815
|
const expectedNum = toNumber(expectedValue);
|
|
15527
15816
|
if (candidateNum === null || expectedNum === null) {
|
|
15528
15817
|
return {
|
|
15529
|
-
path:
|
|
15818
|
+
path: path51,
|
|
15530
15819
|
score: 0,
|
|
15531
15820
|
weight,
|
|
15532
15821
|
hit: false,
|
|
15533
|
-
message: `${
|
|
15822
|
+
message: `${path51} (non-numeric value)`
|
|
15534
15823
|
};
|
|
15535
15824
|
}
|
|
15536
15825
|
if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
|
|
15537
15826
|
return {
|
|
15538
|
-
path:
|
|
15827
|
+
path: path51,
|
|
15539
15828
|
score: 0,
|
|
15540
15829
|
weight,
|
|
15541
15830
|
hit: false,
|
|
15542
|
-
message: `${
|
|
15831
|
+
message: `${path51} (invalid numeric value)`
|
|
15543
15832
|
};
|
|
15544
15833
|
}
|
|
15545
15834
|
const diff = Math.abs(candidateNum - expectedNum);
|
|
@@ -15552,61 +15841,61 @@ var FieldAccuracyEvaluator = class {
|
|
|
15552
15841
|
}
|
|
15553
15842
|
if (withinTolerance) {
|
|
15554
15843
|
return {
|
|
15555
|
-
path:
|
|
15844
|
+
path: path51,
|
|
15556
15845
|
score: 1,
|
|
15557
15846
|
weight,
|
|
15558
15847
|
hit: true,
|
|
15559
|
-
message: `${
|
|
15848
|
+
message: `${path51} (within tolerance: diff=${diff.toFixed(2)})`
|
|
15560
15849
|
};
|
|
15561
15850
|
}
|
|
15562
15851
|
return {
|
|
15563
|
-
path:
|
|
15852
|
+
path: path51,
|
|
15564
15853
|
score: 0,
|
|
15565
15854
|
weight,
|
|
15566
15855
|
hit: false,
|
|
15567
|
-
message: `${
|
|
15856
|
+
message: `${path51} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
|
|
15568
15857
|
};
|
|
15569
15858
|
}
|
|
15570
15859
|
/**
|
|
15571
15860
|
* Date comparison with format normalization.
|
|
15572
15861
|
*/
|
|
15573
|
-
compareDate(
|
|
15862
|
+
compareDate(path51, candidateValue, expectedValue, fieldConfig, weight) {
|
|
15574
15863
|
const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
|
|
15575
15864
|
const candidateDate = parseDate(String(candidateValue), formats);
|
|
15576
15865
|
const expectedDate = parseDate(String(expectedValue), formats);
|
|
15577
15866
|
if (candidateDate === null) {
|
|
15578
15867
|
return {
|
|
15579
|
-
path:
|
|
15868
|
+
path: path51,
|
|
15580
15869
|
score: 0,
|
|
15581
15870
|
weight,
|
|
15582
15871
|
hit: false,
|
|
15583
|
-
message: `${
|
|
15872
|
+
message: `${path51} (unparseable candidate date)`
|
|
15584
15873
|
};
|
|
15585
15874
|
}
|
|
15586
15875
|
if (expectedDate === null) {
|
|
15587
15876
|
return {
|
|
15588
|
-
path:
|
|
15877
|
+
path: path51,
|
|
15589
15878
|
score: 0,
|
|
15590
15879
|
weight,
|
|
15591
15880
|
hit: false,
|
|
15592
|
-
message: `${
|
|
15881
|
+
message: `${path51} (unparseable expected date)`
|
|
15593
15882
|
};
|
|
15594
15883
|
}
|
|
15595
15884
|
if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
|
|
15596
15885
|
return {
|
|
15597
|
-
path:
|
|
15886
|
+
path: path51,
|
|
15598
15887
|
score: 1,
|
|
15599
15888
|
weight,
|
|
15600
15889
|
hit: true,
|
|
15601
|
-
message:
|
|
15890
|
+
message: path51
|
|
15602
15891
|
};
|
|
15603
15892
|
}
|
|
15604
15893
|
return {
|
|
15605
|
-
path:
|
|
15894
|
+
path: path51,
|
|
15606
15895
|
score: 0,
|
|
15607
15896
|
weight,
|
|
15608
15897
|
hit: false,
|
|
15609
|
-
message: `${
|
|
15898
|
+
message: `${path51} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
|
|
15610
15899
|
};
|
|
15611
15900
|
}
|
|
15612
15901
|
/**
|
|
@@ -15639,11 +15928,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
15639
15928
|
};
|
|
15640
15929
|
}
|
|
15641
15930
|
};
|
|
15642
|
-
function resolvePath(obj,
|
|
15643
|
-
if (!
|
|
15931
|
+
function resolvePath(obj, path51) {
|
|
15932
|
+
if (!path51 || !obj) {
|
|
15644
15933
|
return void 0;
|
|
15645
15934
|
}
|
|
15646
|
-
const parts =
|
|
15935
|
+
const parts = path51.split(/\.|\[|\]/).filter((p) => p.length > 0);
|
|
15647
15936
|
let current = obj;
|
|
15648
15937
|
for (const part of parts) {
|
|
15649
15938
|
if (current === null || current === void 0) {
|
|
@@ -15935,11 +16224,12 @@ function assembleLlmGraderPrompt(input) {
|
|
|
15935
16224
|
function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
|
|
15936
16225
|
const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
|
|
15937
16226
|
const variables = {
|
|
15938
|
-
[TEMPLATE_VARIABLES.INPUT]:
|
|
15939
|
-
[TEMPLATE_VARIABLES.
|
|
15940
|
-
[TEMPLATE_VARIABLES.
|
|
16227
|
+
[TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
|
|
16228
|
+
[TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
|
|
16229
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
|
|
15941
16230
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
15942
16231
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
16232
|
+
// Deprecated aliases
|
|
15943
16233
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
15944
16234
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
15945
16235
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
@@ -16126,8 +16416,8 @@ var TokenUsageEvaluator = class {
|
|
|
16126
16416
|
};
|
|
16127
16417
|
|
|
16128
16418
|
// src/evaluation/evaluators/tool-trajectory.ts
|
|
16129
|
-
function getNestedValue(obj,
|
|
16130
|
-
const parts =
|
|
16419
|
+
function getNestedValue(obj, path51) {
|
|
16420
|
+
const parts = path51.split(".");
|
|
16131
16421
|
let current = obj;
|
|
16132
16422
|
for (const part of parts) {
|
|
16133
16423
|
if (current === null || current === void 0 || typeof current !== "object") {
|
|
@@ -16996,16 +17286,13 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
|
16996
17286
|
const payload = {
|
|
16997
17287
|
criteria: context2.evalCase.criteria,
|
|
16998
17288
|
expectedOutput: context2.evalCase.expected_output,
|
|
16999
|
-
outputText: context2.candidate,
|
|
17000
17289
|
output: context2.output ?? null,
|
|
17001
17290
|
inputFiles: context2.evalCase.file_paths,
|
|
17002
17291
|
input: context2.evalCase.input,
|
|
17003
17292
|
trace: context2.trace ?? null,
|
|
17004
17293
|
fileChanges: context2.fileChanges ?? null,
|
|
17005
17294
|
workspacePath: context2.workspacePath ?? null,
|
|
17006
|
-
config: config ?? context2.config ?? null
|
|
17007
|
-
inputText: context2.evalCase.question,
|
|
17008
|
-
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
17295
|
+
config: config ?? context2.config ?? null
|
|
17009
17296
|
};
|
|
17010
17297
|
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
17011
17298
|
const scriptPath = script[script.length - 1];
|
|
@@ -18685,7 +18972,8 @@ async function runEvaluation(options) {
|
|
|
18685
18972
|
const budgetResult = {
|
|
18686
18973
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
18687
18974
|
testId: evalCase.id,
|
|
18688
|
-
|
|
18975
|
+
dataset: evalCase.dataset,
|
|
18976
|
+
category: evalCase.category,
|
|
18689
18977
|
score: 0,
|
|
18690
18978
|
assertions: [],
|
|
18691
18979
|
output: [],
|
|
@@ -18721,7 +19009,8 @@ async function runEvaluation(options) {
|
|
|
18721
19009
|
const haltResult = {
|
|
18722
19010
|
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
18723
19011
|
testId: evalCase.id,
|
|
18724
|
-
|
|
19012
|
+
dataset: evalCase.dataset,
|
|
19013
|
+
category: evalCase.category,
|
|
18725
19014
|
score: 0,
|
|
18726
19015
|
assertions: [],
|
|
18727
19016
|
output: [],
|
|
@@ -19004,7 +19293,7 @@ async function runBatchEvaluation(options) {
|
|
|
19004
19293
|
const tokenUsage = merged?.tokenUsage;
|
|
19005
19294
|
const startTime = merged?.startTime;
|
|
19006
19295
|
const endTime = merged?.endTime;
|
|
19007
|
-
const candidate =
|
|
19296
|
+
const candidate = extractLastAssistantContent(output);
|
|
19008
19297
|
const providerError = extractProviderError(providerResponse);
|
|
19009
19298
|
let result;
|
|
19010
19299
|
try {
|
|
@@ -19412,7 +19701,7 @@ async function runEvalCase(options) {
|
|
|
19412
19701
|
const tokenUsage = merged?.tokenUsage;
|
|
19413
19702
|
const startTime = merged?.startTime;
|
|
19414
19703
|
const endTime = merged?.endTime;
|
|
19415
|
-
const candidate =
|
|
19704
|
+
const candidate = extractLastAssistantContent(output);
|
|
19416
19705
|
let fileChanges;
|
|
19417
19706
|
if (baselineCommit && workspacePath) {
|
|
19418
19707
|
try {
|
|
@@ -19720,7 +20009,8 @@ async function evaluateCandidate(options) {
|
|
|
19720
20009
|
return {
|
|
19721
20010
|
timestamp: completedAt.toISOString(),
|
|
19722
20011
|
testId: evalCase.id,
|
|
19723
|
-
|
|
20012
|
+
dataset: evalCase.dataset,
|
|
20013
|
+
category: evalCase.category,
|
|
19724
20014
|
conversationId: evalCase.conversation_id,
|
|
19725
20015
|
score: score.score,
|
|
19726
20016
|
assertions: score.assertions,
|
|
@@ -20070,7 +20360,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
20070
20360
|
return {
|
|
20071
20361
|
timestamp: timestamp.toISOString(),
|
|
20072
20362
|
testId: evalCase.id,
|
|
20073
|
-
|
|
20363
|
+
dataset: evalCase.dataset,
|
|
20364
|
+
category: evalCase.category,
|
|
20074
20365
|
conversationId: evalCase.conversation_id,
|
|
20075
20366
|
score: 0,
|
|
20076
20367
|
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
@@ -20643,6 +20934,18 @@ function trimBaselineResult(result) {
|
|
|
20643
20934
|
return trimmed;
|
|
20644
20935
|
}
|
|
20645
20936
|
|
|
20937
|
+
// src/evaluation/category.ts
|
|
20938
|
+
var import_node_path51 = __toESM(require("path"), 1);
|
|
20939
|
+
var DEFAULT_CATEGORY = "Uncategorized";
|
|
20940
|
+
function deriveCategory(relativePath) {
|
|
20941
|
+
const parts = relativePath.split(import_node_path51.default.sep);
|
|
20942
|
+
if (parts.length <= 1) {
|
|
20943
|
+
return DEFAULT_CATEGORY;
|
|
20944
|
+
}
|
|
20945
|
+
const dirs = parts.slice(0, -1).filter((d) => d !== "evals");
|
|
20946
|
+
return dirs.length > 0 ? dirs.join("/") : DEFAULT_CATEGORY;
|
|
20947
|
+
}
|
|
20948
|
+
|
|
20646
20949
|
// src/observability/otel-exporter.ts
|
|
20647
20950
|
var OTEL_BACKEND_PRESETS = {
|
|
20648
20951
|
langfuse: {
|
|
@@ -20766,7 +21069,7 @@ var OtelTraceExporter = class {
|
|
|
20766
21069
|
rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
20767
21070
|
rootSpan.setAttribute("agentv.test_id", result.testId);
|
|
20768
21071
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
20769
|
-
if (result.
|
|
21072
|
+
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
20770
21073
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
20771
21074
|
if (captureContent && result.output.length > 0) {
|
|
20772
21075
|
const lastMsg = result.output[result.output.length - 1];
|
|
@@ -20975,7 +21278,7 @@ var OtelStreamingObserver = class {
|
|
|
20975
21278
|
this.rootSpan.setAttribute("gen_ai.system", "agentv");
|
|
20976
21279
|
this.rootSpan.setAttribute("agentv.test_id", testId);
|
|
20977
21280
|
this.rootSpan.setAttribute("agentv.target", target);
|
|
20978
|
-
if (evalSet) this.rootSpan.setAttribute("agentv.
|
|
21281
|
+
if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
|
|
20979
21282
|
this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
|
|
20980
21283
|
}
|
|
20981
21284
|
/** Create and immediately export a tool span */
|
|
@@ -21151,9 +21454,11 @@ function createAgentKernel() {
|
|
|
21151
21454
|
}
|
|
21152
21455
|
// Annotate the CommonJS export names for ESM import in node:
|
|
21153
21456
|
0 && (module.exports = {
|
|
21457
|
+
COMMON_TARGET_SETTINGS,
|
|
21154
21458
|
CodeEvaluator,
|
|
21155
21459
|
CompositeEvaluator,
|
|
21156
21460
|
CostEvaluator,
|
|
21461
|
+
DEFAULT_CATEGORY,
|
|
21157
21462
|
DEFAULT_EVALUATOR_TEMPLATE,
|
|
21158
21463
|
DEFAULT_EVAL_PATTERNS,
|
|
21159
21464
|
DEFAULT_EXPLORATION_TOOLS,
|
|
@@ -21207,6 +21512,7 @@ function createAgentKernel() {
|
|
|
21207
21512
|
createTempWorkspace,
|
|
21208
21513
|
deepEqual,
|
|
21209
21514
|
defineConfig,
|
|
21515
|
+
deriveCategory,
|
|
21210
21516
|
detectFormat,
|
|
21211
21517
|
discoverAssertions,
|
|
21212
21518
|
discoverCopilotSessions,
|
|
@@ -21220,7 +21526,9 @@ function createAgentKernel() {
|
|
|
21220
21526
|
explorationRatio,
|
|
21221
21527
|
extractCacheConfig,
|
|
21222
21528
|
extractFailOnError,
|
|
21529
|
+
extractImageBlocks,
|
|
21223
21530
|
extractJsonBlob,
|
|
21531
|
+
extractLastAssistantContent,
|
|
21224
21532
|
extractTargetFromSuite,
|
|
21225
21533
|
extractTargetsFromSuite,
|
|
21226
21534
|
extractTargetsFromTestCase,
|
|
@@ -21234,12 +21542,15 @@ function createAgentKernel() {
|
|
|
21234
21542
|
getAgentvHome,
|
|
21235
21543
|
getOutputFilenames,
|
|
21236
21544
|
getSubagentsRoot,
|
|
21545
|
+
getTextContent,
|
|
21237
21546
|
getTraceStateRoot,
|
|
21238
21547
|
getWorkspacePath,
|
|
21239
21548
|
getWorkspacePoolRoot,
|
|
21240
21549
|
getWorkspacesRoot,
|
|
21241
21550
|
initializeBaseline,
|
|
21242
21551
|
isAgentSkillsFormat,
|
|
21552
|
+
isContent,
|
|
21553
|
+
isContentArray,
|
|
21243
21554
|
isEvaluatorKind,
|
|
21244
21555
|
isJsonObject,
|
|
21245
21556
|
isJsonValue,
|