@agentv/core 3.12.0 → 3.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-4XWPXNQM.js → chunk-ZB3AUPES.js} +1 -3
- package/dist/chunk-ZB3AUPES.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +0 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +63 -177
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +15 -55
- package/dist/index.d.ts +15 -55
- package/dist/index.js +62 -49
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-3G2KXH7N.js +0 -120
- package/dist/chunk-3G2KXH7N.js.map +0 -1
- package/dist/chunk-4XWPXNQM.js.map +0 -1
- package/dist/simple-trace-file-exporter-CRIO5HDZ.js +0 -7
- package/dist/simple-trace-file-exporter-CRIO5HDZ.js.map +0 -1
package/dist/index.cjs
CHANGED
|
@@ -1380,132 +1380,6 @@ var init_otlp_json_file_exporter = __esm({
|
|
|
1380
1380
|
}
|
|
1381
1381
|
});
|
|
1382
1382
|
|
|
1383
|
-
// src/observability/simple-trace-file-exporter.ts
|
|
1384
|
-
var simple_trace_file_exporter_exports = {};
|
|
1385
|
-
__export(simple_trace_file_exporter_exports, {
|
|
1386
|
-
SimpleTraceFileExporter: () => SimpleTraceFileExporter
|
|
1387
|
-
});
|
|
1388
|
-
function hrTimeDiffMs(start, end) {
|
|
1389
|
-
const diffSec = end[0] - start[0];
|
|
1390
|
-
const diffNano = end[1] - start[1];
|
|
1391
|
-
return Math.round(diffSec * 1e3 + diffNano / 1e6);
|
|
1392
|
-
}
|
|
1393
|
-
var import_node_fs16, import_promises34, import_node_path50, SimpleTraceFileExporter;
|
|
1394
|
-
var init_simple_trace_file_exporter = __esm({
|
|
1395
|
-
"src/observability/simple-trace-file-exporter.ts"() {
|
|
1396
|
-
"use strict";
|
|
1397
|
-
import_node_fs16 = require("fs");
|
|
1398
|
-
import_promises34 = require("fs/promises");
|
|
1399
|
-
import_node_path50 = require("path");
|
|
1400
|
-
SimpleTraceFileExporter = class {
|
|
1401
|
-
stream = null;
|
|
1402
|
-
filePath;
|
|
1403
|
-
streamReady = null;
|
|
1404
|
-
pendingWrites = [];
|
|
1405
|
-
_shuttingDown = false;
|
|
1406
|
-
spansByTraceId = /* @__PURE__ */ new Map();
|
|
1407
|
-
constructor(filePath) {
|
|
1408
|
-
this.filePath = filePath;
|
|
1409
|
-
}
|
|
1410
|
-
async ensureStream() {
|
|
1411
|
-
if (!this.streamReady) {
|
|
1412
|
-
this.streamReady = (async () => {
|
|
1413
|
-
await (0, import_promises34.mkdir)((0, import_node_path50.dirname)(this.filePath), { recursive: true });
|
|
1414
|
-
this.stream = (0, import_node_fs16.createWriteStream)(this.filePath, { flags: "w" });
|
|
1415
|
-
return this.stream;
|
|
1416
|
-
})();
|
|
1417
|
-
}
|
|
1418
|
-
return this.streamReady;
|
|
1419
|
-
}
|
|
1420
|
-
export(spans, resultCallback) {
|
|
1421
|
-
if (this._shuttingDown) {
|
|
1422
|
-
resultCallback({ code: 0 });
|
|
1423
|
-
return;
|
|
1424
|
-
}
|
|
1425
|
-
const rootSpans = [];
|
|
1426
|
-
for (const span of spans) {
|
|
1427
|
-
const traceId = span.spanContext().traceId;
|
|
1428
|
-
const existing = this.spansByTraceId.get(traceId) ?? [];
|
|
1429
|
-
existing.push(span);
|
|
1430
|
-
this.spansByTraceId.set(traceId, existing);
|
|
1431
|
-
if (span.name === "agentv.eval") {
|
|
1432
|
-
rootSpans.push(span);
|
|
1433
|
-
}
|
|
1434
|
-
}
|
|
1435
|
-
const writePromise = this.ensureStream().then((stream) => {
|
|
1436
|
-
for (const root of rootSpans) {
|
|
1437
|
-
const traceId = root.spanContext().traceId;
|
|
1438
|
-
const traceSpans = this.spansByTraceId.get(traceId) ?? [root];
|
|
1439
|
-
const children = traceSpans.filter(
|
|
1440
|
-
(span) => span.spanContext().spanId !== root.spanContext().spanId
|
|
1441
|
-
);
|
|
1442
|
-
const record = this.buildSimpleRecord(root, children);
|
|
1443
|
-
stream.write(`${JSON.stringify(record)}
|
|
1444
|
-
`);
|
|
1445
|
-
this.spansByTraceId.delete(traceId);
|
|
1446
|
-
}
|
|
1447
|
-
});
|
|
1448
|
-
this.pendingWrites.push(writePromise);
|
|
1449
|
-
resultCallback({ code: 0 });
|
|
1450
|
-
}
|
|
1451
|
-
async shutdown() {
|
|
1452
|
-
this._shuttingDown = true;
|
|
1453
|
-
await Promise.all(this.pendingWrites);
|
|
1454
|
-
this.pendingWrites = [];
|
|
1455
|
-
this.spansByTraceId.clear();
|
|
1456
|
-
return new Promise((resolve) => {
|
|
1457
|
-
if (this.stream) {
|
|
1458
|
-
this.stream.end(() => resolve());
|
|
1459
|
-
} else {
|
|
1460
|
-
resolve();
|
|
1461
|
-
}
|
|
1462
|
-
});
|
|
1463
|
-
}
|
|
1464
|
-
async forceFlush() {
|
|
1465
|
-
await Promise.all(this.pendingWrites);
|
|
1466
|
-
this.pendingWrites = [];
|
|
1467
|
-
}
|
|
1468
|
-
buildSimpleRecord(root, children) {
|
|
1469
|
-
const attrs = root.attributes || {};
|
|
1470
|
-
const durationMs = typeof attrs["agentv.trace.duration_ms"] === "number" ? attrs["agentv.trace.duration_ms"] : hrTimeDiffMs(root.startTime, root.endTime);
|
|
1471
|
-
let inputTokens = 0;
|
|
1472
|
-
let outputTokens = 0;
|
|
1473
|
-
for (const child of children) {
|
|
1474
|
-
const ca = child.attributes || {};
|
|
1475
|
-
if (ca["gen_ai.usage.input_tokens"]) inputTokens += ca["gen_ai.usage.input_tokens"];
|
|
1476
|
-
if (ca["gen_ai.usage.output_tokens"]) outputTokens += ca["gen_ai.usage.output_tokens"];
|
|
1477
|
-
}
|
|
1478
|
-
const rootInputTokens = typeof attrs["agentv.trace.token_input"] === "number" ? attrs["agentv.trace.token_input"] : 0;
|
|
1479
|
-
const rootOutputTokens = typeof attrs["agentv.trace.token_output"] === "number" ? attrs["agentv.trace.token_output"] : 0;
|
|
1480
|
-
const rootCachedTokens = typeof attrs["agentv.trace.token_cached"] === "number" ? attrs["agentv.trace.token_cached"] : void 0;
|
|
1481
|
-
const llmSpans = children.filter((s) => s.attributes?.["gen_ai.operation.name"] === "chat").map((s) => ({
|
|
1482
|
-
type: "llm",
|
|
1483
|
-
name: s.name,
|
|
1484
|
-
duration_ms: hrTimeDiffMs(s.startTime, s.endTime)
|
|
1485
|
-
}));
|
|
1486
|
-
const toolSpans = children.filter((s) => s.attributes?.["gen_ai.tool.name"]).map((s) => ({
|
|
1487
|
-
type: "tool",
|
|
1488
|
-
name: s.attributes["gen_ai.tool.name"],
|
|
1489
|
-
duration_ms: hrTimeDiffMs(s.startTime, s.endTime)
|
|
1490
|
-
}));
|
|
1491
|
-
return {
|
|
1492
|
-
test_id: attrs["agentv.test_id"],
|
|
1493
|
-
target: attrs["agentv.target"],
|
|
1494
|
-
score: attrs["agentv.score"],
|
|
1495
|
-
duration_ms: durationMs,
|
|
1496
|
-
cost_usd: attrs["agentv.trace.cost_usd"],
|
|
1497
|
-
token_usage: inputTokens || outputTokens || rootInputTokens || rootOutputTokens || rootCachedTokens ? {
|
|
1498
|
-
input: inputTokens || rootInputTokens,
|
|
1499
|
-
output: outputTokens || rootOutputTokens,
|
|
1500
|
-
...rootCachedTokens ? { cached: rootCachedTokens } : {}
|
|
1501
|
-
} : void 0,
|
|
1502
|
-
spans: [...llmSpans, ...toolSpans].length > 0 ? [...llmSpans, ...toolSpans] : void 0
|
|
1503
|
-
};
|
|
1504
|
-
}
|
|
1505
|
-
};
|
|
1506
|
-
}
|
|
1507
|
-
});
|
|
1508
|
-
|
|
1509
1383
|
// src/index.ts
|
|
1510
1384
|
var index_exports = {};
|
|
1511
1385
|
__export(index_exports, {
|
|
@@ -1529,7 +1403,6 @@ __export(index_exports, {
|
|
|
1529
1403
|
ProviderRegistry: () => ProviderRegistry,
|
|
1530
1404
|
RepoManager: () => RepoManager,
|
|
1531
1405
|
ResponseCache: () => ResponseCache,
|
|
1532
|
-
SimpleTraceFileExporter: () => SimpleTraceFileExporter,
|
|
1533
1406
|
SkillTriggerEvaluator: () => SkillTriggerEvaluator,
|
|
1534
1407
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
1535
1408
|
TemplateNotDirectoryError: () => TemplateNotDirectoryError,
|
|
@@ -1708,8 +1581,6 @@ function isTestMessage(value) {
|
|
|
1708
1581
|
var EVALUATOR_KIND_VALUES = [
|
|
1709
1582
|
"code-grader",
|
|
1710
1583
|
"llm-grader",
|
|
1711
|
-
"code-judge",
|
|
1712
|
-
"llm-judge",
|
|
1713
1584
|
"rubric",
|
|
1714
1585
|
"composite",
|
|
1715
1586
|
"tool-trajectory",
|
|
@@ -2460,12 +2331,6 @@ function parseExecutionDefaults(raw, configPath) {
|
|
|
2460
2331
|
} else if (obj.verbose !== void 0) {
|
|
2461
2332
|
logWarning(`Invalid execution.verbose in ${configPath}, expected boolean`);
|
|
2462
2333
|
}
|
|
2463
|
-
const traceFile = obj.trace_file;
|
|
2464
|
-
if (typeof traceFile === "string" && traceFile.trim().length > 0) {
|
|
2465
|
-
result.trace_file = traceFile.trim();
|
|
2466
|
-
} else if (traceFile !== void 0) {
|
|
2467
|
-
logWarning(`Invalid execution.trace_file in ${configPath}, expected non-empty string`);
|
|
2468
|
-
}
|
|
2469
2334
|
if (typeof obj.keep_workspaces === "boolean") {
|
|
2470
2335
|
result.keep_workspaces = obj.keep_workspaces;
|
|
2471
2336
|
} else if (obj.keep_workspaces !== void 0) {
|
|
@@ -2582,6 +2447,9 @@ var ANSI_RESET5 = "\x1B[0m";
|
|
|
2582
2447
|
function normalizeEvaluatorType(type) {
|
|
2583
2448
|
return type.replace(/_/g, "-");
|
|
2584
2449
|
}
|
|
2450
|
+
function isDeprecatedJudgeType(type) {
|
|
2451
|
+
return type === "code-judge" || type === "llm-judge";
|
|
2452
|
+
}
|
|
2585
2453
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
2586
2454
|
const execution = rawEvalCase.execution;
|
|
2587
2455
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -2644,6 +2512,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2644
2512
|
const rawName = asString(rawEvaluator.name);
|
|
2645
2513
|
const rawType = rawEvaluator.type;
|
|
2646
2514
|
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
2515
|
+
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
2516
|
+
logWarning2(
|
|
2517
|
+
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
2518
|
+
);
|
|
2519
|
+
continue;
|
|
2520
|
+
}
|
|
2647
2521
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
2648
2522
|
if (typeof typeValue !== "string") {
|
|
2649
2523
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -2676,7 +2550,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2676
2550
|
});
|
|
2677
2551
|
continue;
|
|
2678
2552
|
}
|
|
2679
|
-
if (typeValue === "code-grader"
|
|
2553
|
+
if (typeValue === "code-grader") {
|
|
2680
2554
|
let command;
|
|
2681
2555
|
if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
|
|
2682
2556
|
console.warn(
|
|
@@ -2786,7 +2660,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2786
2660
|
continue;
|
|
2787
2661
|
}
|
|
2788
2662
|
const aggregatorType = asString(rawAggregator.type);
|
|
2789
|
-
|
|
2663
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
|
|
2664
|
+
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
2665
|
+
logWarning2(
|
|
2666
|
+
`Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
2667
|
+
);
|
|
2668
|
+
continue;
|
|
2669
|
+
}
|
|
2670
|
+
if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
|
|
2790
2671
|
logWarning2(
|
|
2791
2672
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
2792
2673
|
);
|
|
@@ -2821,7 +2702,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2821
2702
|
continue;
|
|
2822
2703
|
}
|
|
2823
2704
|
let aggregator;
|
|
2824
|
-
if (
|
|
2705
|
+
if (normalizedAggregatorType === "weighted_average") {
|
|
2825
2706
|
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
2826
2707
|
const parsedWeights = {};
|
|
2827
2708
|
if (weights) {
|
|
@@ -2835,7 +2716,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2835
2716
|
type: "weighted_average",
|
|
2836
2717
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
2837
2718
|
};
|
|
2838
|
-
} else if (
|
|
2719
|
+
} else if (normalizedAggregatorType === "code-grader") {
|
|
2839
2720
|
const aggregatorPath = asString(rawAggregator.path);
|
|
2840
2721
|
if (!aggregatorPath) {
|
|
2841
2722
|
logWarning2(
|
|
@@ -2848,7 +2729,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
2848
2729
|
path: aggregatorPath,
|
|
2849
2730
|
cwd: searchRoots[0]
|
|
2850
2731
|
};
|
|
2851
|
-
} else if (
|
|
2732
|
+
} else if (normalizedAggregatorType === "threshold") {
|
|
2852
2733
|
const thresholdValue = rawAggregator.threshold;
|
|
2853
2734
|
if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
|
|
2854
2735
|
logWarning2(
|
|
@@ -3596,10 +3477,15 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
3596
3477
|
return void 0;
|
|
3597
3478
|
}
|
|
3598
3479
|
const normalized = normalizeEvaluatorType(candidate);
|
|
3480
|
+
if (isDeprecatedJudgeType(normalized)) {
|
|
3481
|
+
throw new Error(
|
|
3482
|
+
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
3483
|
+
);
|
|
3484
|
+
}
|
|
3599
3485
|
if (isEvaluatorKind(normalized)) {
|
|
3600
3486
|
return normalized;
|
|
3601
3487
|
}
|
|
3602
|
-
logWarning2(`Unknown
|
|
3488
|
+
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
3603
3489
|
return void 0;
|
|
3604
3490
|
}
|
|
3605
3491
|
function asString(value) {
|
|
@@ -5032,9 +4918,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
5032
4918
|
case "ends_with":
|
|
5033
4919
|
return `Output ends with '${entry.value}'`;
|
|
5034
4920
|
case "llm-grader":
|
|
5035
|
-
case "llm_grader":
|
|
5036
|
-
case "llm-judge":
|
|
5037
|
-
case "llm_judge": {
|
|
4921
|
+
case "llm_grader": {
|
|
5038
4922
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
5039
4923
|
return null;
|
|
5040
4924
|
}
|
|
@@ -5047,9 +4931,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
5047
4931
|
return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
|
|
5048
4932
|
}
|
|
5049
4933
|
case "code-grader":
|
|
5050
|
-
case "code_grader":
|
|
5051
|
-
case "code-judge":
|
|
5052
|
-
case "code_judge": {
|
|
4934
|
+
case "code_grader": {
|
|
5053
4935
|
const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
|
|
5054
4936
|
const desc = typeof entry.description === "string" ? entry.description : void 0;
|
|
5055
4937
|
return codeGraderInstruction(graderName, desc);
|
|
@@ -5080,7 +4962,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
5080
4962
|
}
|
|
5081
4963
|
}
|
|
5082
4964
|
function assertionToNaturalLanguageList(entry) {
|
|
5083
|
-
if (entry.type === "llm-grader" || entry.type === "llm_grader"
|
|
4965
|
+
if (entry.type === "llm-grader" || entry.type === "llm_grader") {
|
|
5084
4966
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
5085
4967
|
return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
|
|
5086
4968
|
}
|
|
@@ -12810,10 +12692,26 @@ function extractJsonBlob(text) {
|
|
|
12810
12692
|
const match = text.match(/\{[\s\S]*\}/);
|
|
12811
12693
|
return match?.[0];
|
|
12812
12694
|
}
|
|
12695
|
+
function repairSchemaNearBooleanFields(text) {
|
|
12696
|
+
return text.replace(
|
|
12697
|
+
/("passed"\s*:\s*)(?:"([^"]+)"|([A-Za-z_][A-Za-z0-9_-]*))/gi,
|
|
12698
|
+
(_match, prefix, quotedValue, bareValue) => {
|
|
12699
|
+
const value = (quotedValue ?? bareValue ?? "").trim().toLowerCase();
|
|
12700
|
+
if (value === "true") {
|
|
12701
|
+
return `${prefix}true`;
|
|
12702
|
+
}
|
|
12703
|
+
if (value === "false") {
|
|
12704
|
+
return `${prefix}false`;
|
|
12705
|
+
}
|
|
12706
|
+
return `${prefix}false`;
|
|
12707
|
+
}
|
|
12708
|
+
);
|
|
12709
|
+
}
|
|
12813
12710
|
function parseJsonFromText(text) {
|
|
12814
12711
|
const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
|
|
12815
12712
|
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
12816
|
-
|
|
12713
|
+
const repaired = repairSchemaNearBooleanFields(blob);
|
|
12714
|
+
return JSON.parse(repaired);
|
|
12817
12715
|
}
|
|
12818
12716
|
function isNonEmptyString(value) {
|
|
12819
12717
|
return typeof value === "string" && value.trim().length > 0;
|
|
@@ -12960,12 +12858,12 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
|
|
|
12960
12858
|
});
|
|
12961
12859
|
}
|
|
12962
12860
|
async function execShellWithStdin(command, stdinPayload, options = {}) {
|
|
12963
|
-
const { mkdir:
|
|
12861
|
+
const { mkdir: mkdir17, readFile: readFile14, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
|
|
12964
12862
|
const { tmpdir: tmpdir3 } = await import("os");
|
|
12965
12863
|
const path48 = await import("path");
|
|
12966
12864
|
const { randomUUID: randomUUID10 } = await import("crypto");
|
|
12967
12865
|
const dir = path48.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
|
|
12968
|
-
await
|
|
12866
|
+
await mkdir17(dir, { recursive: true });
|
|
12969
12867
|
const stdinPath = path48.join(dir, "stdin.txt");
|
|
12970
12868
|
const stdoutPath = path48.join(dir, "stdout.txt");
|
|
12971
12869
|
const stderrPath = path48.join(dir, "stderr.txt");
|
|
@@ -13285,7 +13183,7 @@ function toCamelCaseDeep(obj) {
|
|
|
13285
13183
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
13286
13184
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
13287
13185
|
var CodeEvaluator = class {
|
|
13288
|
-
kind = "code-
|
|
13186
|
+
kind = "code-grader";
|
|
13289
13187
|
command;
|
|
13290
13188
|
cwd;
|
|
13291
13189
|
agentTimeoutMs;
|
|
@@ -13304,7 +13202,7 @@ var CodeEvaluator = class {
|
|
|
13304
13202
|
if (outputForPayload) {
|
|
13305
13203
|
const serialized = JSON.stringify(outputForPayload);
|
|
13306
13204
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
13307
|
-
const tmpDir = await (0, import_promises26.mkdtemp)((0, import_node_path36.join)((0, import_node_os5.tmpdir)(), "agentv-
|
|
13205
|
+
const tmpDir = await (0, import_promises26.mkdtemp)((0, import_node_path36.join)((0, import_node_os5.tmpdir)(), "agentv-grader-"));
|
|
13308
13206
|
outputPath = (0, import_node_path36.join)(tmpDir, "output.json");
|
|
13309
13207
|
await (0, import_promises26.writeFile)(outputPath, serialized);
|
|
13310
13208
|
outputForPayload = null;
|
|
@@ -13594,7 +13492,7 @@ var LlmGraderEvaluator = class {
|
|
|
13594
13492
|
return this.evaluateWithDelegatedAgent(context2, graderProvider);
|
|
13595
13493
|
}
|
|
13596
13494
|
const config = context2.evaluator;
|
|
13597
|
-
if (
|
|
13495
|
+
if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
|
|
13598
13496
|
return this.evaluateWithRubrics(context2, graderProvider, config.rubrics);
|
|
13599
13497
|
}
|
|
13600
13498
|
return this.evaluateFreeform(context2, graderProvider);
|
|
@@ -13779,7 +13677,7 @@ ${context2.fileChanges}`;
|
|
|
13779
13677
|
const systemPrompt = this.buildAgentSystemPrompt(context2);
|
|
13780
13678
|
const userPrompt = this.buildAgentUserPrompt(context2);
|
|
13781
13679
|
const config = context2.evaluator;
|
|
13782
|
-
const rubrics = config?.type === "llm-grader"
|
|
13680
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13783
13681
|
const fsTools = createFilesystemTools(workspacePath);
|
|
13784
13682
|
const evaluatorRawRequest = {
|
|
13785
13683
|
mode: "built-in",
|
|
@@ -13875,7 +13773,7 @@ ${context2.fileChanges}`;
|
|
|
13875
13773
|
};
|
|
13876
13774
|
}
|
|
13877
13775
|
const config = context2.evaluator;
|
|
13878
|
-
const rubrics = config?.type === "llm-grader"
|
|
13776
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13879
13777
|
const details = {
|
|
13880
13778
|
mode: modeLabel,
|
|
13881
13779
|
grader_target: provider.targetName
|
|
@@ -13915,7 +13813,7 @@ ${context2.fileChanges}`;
|
|
|
13915
13813
|
*/
|
|
13916
13814
|
buildAgentSystemPrompt(context2) {
|
|
13917
13815
|
const config = context2.evaluator;
|
|
13918
|
-
const rubrics = config?.type === "llm-grader"
|
|
13816
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13919
13817
|
const parts = [
|
|
13920
13818
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
13921
13819
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -13946,7 +13844,7 @@ ${context2.fileChanges}`;
|
|
|
13946
13844
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
13947
13845
|
}
|
|
13948
13846
|
const config = context2.evaluator;
|
|
13949
|
-
const rubrics = config?.type === "llm-grader"
|
|
13847
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13950
13848
|
const parts = [
|
|
13951
13849
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
13952
13850
|
"",
|
|
@@ -13989,7 +13887,7 @@ ${context2.fileChanges}`;
|
|
|
13989
13887
|
buildDelegatedPrompt(context2) {
|
|
13990
13888
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
13991
13889
|
const config = context2.evaluator;
|
|
13992
|
-
const rubrics = config?.type === "llm-grader"
|
|
13890
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
13993
13891
|
if (this.evaluatorTemplate) {
|
|
13994
13892
|
const variables = {
|
|
13995
13893
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
@@ -14486,10 +14384,8 @@ var CompositeEvaluator = class {
|
|
|
14486
14384
|
const aggregator = this.config.aggregator;
|
|
14487
14385
|
switch (aggregator.type) {
|
|
14488
14386
|
case "code-grader":
|
|
14489
|
-
case "code-judge":
|
|
14490
14387
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
14491
14388
|
case "llm-grader":
|
|
14492
|
-
case "llm-judge":
|
|
14493
14389
|
return this.runLlmAggregator(results, context2, aggregator);
|
|
14494
14390
|
case "threshold":
|
|
14495
14391
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -16911,7 +16807,7 @@ var endsWithFactory = (config) => {
|
|
|
16911
16807
|
};
|
|
16912
16808
|
function createBuiltinRegistry() {
|
|
16913
16809
|
const registry = new EvaluatorRegistry();
|
|
16914
|
-
registry.register("llm-grader", llmGraderFactory).register("
|
|
16810
|
+
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
16915
16811
|
const fn = config[INLINE_ASSERT_FN];
|
|
16916
16812
|
if (!fn) {
|
|
16917
16813
|
throw new Error(
|
|
@@ -19629,7 +19525,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
19629
19525
|
return evalCases.filter((evalCase) => import_micromatch3.default.isMatch(evalCase.id, filter));
|
|
19630
19526
|
}
|
|
19631
19527
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
19632
|
-
const llmGrader = overrides?.["llm-grader"] ??
|
|
19528
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
19633
19529
|
resolveGraderProvider: async (context2) => {
|
|
19634
19530
|
if (context2.graderProvider) {
|
|
19635
19531
|
return context2.graderProvider;
|
|
@@ -20061,8 +19957,6 @@ var AgentVConfigSchema = import_zod5.z.object({
|
|
|
20061
19957
|
agentTimeoutMs: import_zod5.z.number().int().min(0).optional(),
|
|
20062
19958
|
/** Enable verbose logging */
|
|
20063
19959
|
verbose: import_zod5.z.boolean().optional(),
|
|
20064
|
-
/** Write human-readable trace JSONL to this path (supports {timestamp} placeholder) */
|
|
20065
|
-
traceFile: import_zod5.z.string().optional(),
|
|
20066
19960
|
/** Always keep temp workspaces after eval */
|
|
20067
19961
|
keepWorkspaces: import_zod5.z.boolean().optional(),
|
|
20068
19962
|
/** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
|
|
@@ -20362,12 +20256,6 @@ var OtelTraceExporter = class {
|
|
|
20362
20256
|
new SimpleSpanProcessor(new OtlpJsonFileExporter2(this.options.otlpFilePath))
|
|
20363
20257
|
);
|
|
20364
20258
|
}
|
|
20365
|
-
if (this.options.traceFilePath) {
|
|
20366
|
-
const { SimpleTraceFileExporter: SimpleTraceFileExporter2 } = await Promise.resolve().then(() => (init_simple_trace_file_exporter(), simple_trace_file_exporter_exports));
|
|
20367
|
-
processors.push(
|
|
20368
|
-
new SimpleSpanProcessor(new SimpleTraceFileExporter2(this.options.traceFilePath))
|
|
20369
|
-
);
|
|
20370
|
-
}
|
|
20371
20259
|
if (processors.length === 0) {
|
|
20372
20260
|
return false;
|
|
20373
20261
|
}
|
|
@@ -20481,10 +20369,10 @@ var OtelTraceExporter = class {
|
|
|
20481
20369
|
}
|
|
20482
20370
|
if (result.scores) {
|
|
20483
20371
|
for (const score of result.scores) {
|
|
20484
|
-
rootSpan.addEvent(`agentv.
|
|
20485
|
-
"agentv.
|
|
20486
|
-
"agentv.
|
|
20487
|
-
...score.verdict ? { "agentv.
|
|
20372
|
+
rootSpan.addEvent(`agentv.grader.${score.name}`, {
|
|
20373
|
+
"agentv.grader.score": score.score,
|
|
20374
|
+
"agentv.grader.type": score.type,
|
|
20375
|
+
...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
|
|
20488
20376
|
});
|
|
20489
20377
|
}
|
|
20490
20378
|
}
|
|
@@ -20795,7 +20683,6 @@ function toHrTime(iso) {
|
|
|
20795
20683
|
|
|
20796
20684
|
// src/observability/index.ts
|
|
20797
20685
|
init_otlp_json_file_exporter();
|
|
20798
|
-
init_simple_trace_file_exporter();
|
|
20799
20686
|
|
|
20800
20687
|
// src/index.ts
|
|
20801
20688
|
function createAgentKernel() {
|
|
@@ -20823,7 +20710,6 @@ function createAgentKernel() {
|
|
|
20823
20710
|
ProviderRegistry,
|
|
20824
20711
|
RepoManager,
|
|
20825
20712
|
ResponseCache,
|
|
20826
|
-
SimpleTraceFileExporter,
|
|
20827
20713
|
SkillTriggerEvaluator,
|
|
20828
20714
|
TEST_MESSAGE_ROLES,
|
|
20829
20715
|
TemplateNotDirectoryError,
|