agentv 3.12.0 → 3.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -10
- package/dist/{chunk-UYBLUYHN.js → chunk-K747KGDP.js} +64 -49
- package/dist/chunk-K747KGDP.js.map +1 -0
- package/dist/{chunk-VLOFRXH4.js → chunk-LSXO22CF.js} +17 -43
- package/dist/chunk-LSXO22CF.js.map +1 -0
- package/dist/{chunk-2ELQ6F3C.js → chunk-UK7UMQOX.js} +29 -34
- package/dist/chunk-UK7UMQOX.js.map +1 -0
- package/dist/cli.js +3 -4
- package/dist/cli.js.map +1 -1
- package/dist/{dist-L6R5HJ72.js → dist-LCZDS36N.js} +2 -6
- package/dist/index.js +3 -4
- package/dist/{interactive-5X62YEEX.js → interactive-76ZJVPI7.js} +3 -4
- package/dist/{interactive-5X62YEEX.js.map → interactive-76ZJVPI7.js.map} +1 -1
- package/package.json +1 -1
- package/dist/chunk-2ELQ6F3C.js.map +0 -1
- package/dist/chunk-NR7QVL75.js +0 -122
- package/dist/chunk-NR7QVL75.js.map +0 -1
- package/dist/chunk-UYBLUYHN.js.map +0 -1
- package/dist/chunk-VLOFRXH4.js.map +0 -1
- package/dist/simple-trace-file-exporter-CRIO5HDZ-QYYT2QQT.js +0 -9
- package/dist/simple-trace-file-exporter-CRIO5HDZ-QYYT2QQT.js.map +0 -1
- /package/dist/{dist-L6R5HJ72.js.map → dist-LCZDS36N.js.map} +0 -0
package/README.md
CHANGED
|
@@ -209,8 +209,8 @@ agentv eval evals/my-eval.yaml
|
|
|
209
209
|
# Self-contained HTML dashboard (opens in any browser, no server needed)
|
|
210
210
|
agentv eval evals/my-eval.yaml -o report.html
|
|
211
211
|
|
|
212
|
-
# Explicit JSONL
|
|
213
|
-
agentv eval evals/my-eval.yaml -o
|
|
212
|
+
# Explicit JSONL output
|
|
213
|
+
agentv eval evals/my-eval.yaml -o output.jsonl
|
|
214
214
|
|
|
215
215
|
# Multiple formats simultaneously
|
|
216
216
|
agentv eval evals/my-eval.yaml -o report.html
|
|
@@ -221,14 +221,13 @@ agentv eval evals/my-eval.yaml -o results.xml
|
|
|
221
221
|
|
|
222
222
|
The HTML report auto-refreshes every 2 seconds during a live run, then locks once the run completes.
|
|
223
223
|
|
|
224
|
-
By default, `agentv eval`
|
|
225
|
-
with `index.jsonl` as the
|
|
226
|
-
is still written alongside it for legacy tooling during the deprecation window.
|
|
224
|
+
By default, `agentv eval` creates a run workspace under `.agentv/results/runs/<run>/`
|
|
225
|
+
with `index.jsonl` as the machine-facing manifest.
|
|
227
226
|
|
|
228
|
-
You can also convert an existing manifest
|
|
227
|
+
You can also convert an existing manifest to HTML after the fact:
|
|
229
228
|
|
|
230
229
|
```bash
|
|
231
|
-
agentv convert .agentv/results/
|
|
230
|
+
agentv convert .agentv/results/runs/eval_<timestamp>/index.jsonl -o report.html
|
|
232
231
|
```
|
|
233
232
|
|
|
234
233
|
#### Timeouts
|
|
@@ -359,7 +358,7 @@ agentv create eval my-eval # → evals/my-eval.eval.yaml + .cases.jsonl
|
|
|
359
358
|
Compare a combined results file across all targets (N-way matrix):
|
|
360
359
|
|
|
361
360
|
```bash
|
|
362
|
-
agentv compare .agentv/results/
|
|
361
|
+
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl
|
|
363
362
|
```
|
|
364
363
|
|
|
365
364
|
```
|
|
@@ -380,8 +379,8 @@ Pairwise Summary:
|
|
|
380
379
|
Designate a baseline for CI regression gating, or compare two specific targets:
|
|
381
380
|
|
|
382
381
|
```bash
|
|
383
|
-
agentv compare .agentv/results/
|
|
384
|
-
agentv compare .agentv/results/
|
|
382
|
+
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1
|
|
383
|
+
agentv compare .agentv/results/runs/eval_<timestamp>/index.jsonl --baseline gpt-4.1 --candidate gpt-5-mini
|
|
385
384
|
agentv compare before.jsonl after.jsonl # two-file pairwise
|
|
386
385
|
```
|
|
387
386
|
|
|
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
|
|
|
301
301
|
}
|
|
302
302
|
});
|
|
303
303
|
|
|
304
|
-
// ../../packages/core/dist/chunk-
|
|
304
|
+
// ../../packages/core/dist/chunk-ZB3AUPES.js
|
|
305
305
|
import { constants } from "node:fs";
|
|
306
306
|
import { access, readFile } from "node:fs/promises";
|
|
307
307
|
import path from "node:path";
|
|
@@ -419,7 +419,7 @@ __export(external_exports2, {
|
|
|
419
419
|
void: () => voidType
|
|
420
420
|
});
|
|
421
421
|
|
|
422
|
-
// ../../packages/core/dist/chunk-
|
|
422
|
+
// ../../packages/core/dist/chunk-ZB3AUPES.js
|
|
423
423
|
import { readFile as readFile2 } from "node:fs/promises";
|
|
424
424
|
import path3 from "node:path";
|
|
425
425
|
import fg from "fast-glob";
|
|
@@ -473,8 +473,6 @@ function isTestMessage(value) {
|
|
|
473
473
|
var EVALUATOR_KIND_VALUES = [
|
|
474
474
|
"code-grader",
|
|
475
475
|
"llm-grader",
|
|
476
|
-
"code-judge",
|
|
477
|
-
"llm-judge",
|
|
478
476
|
"rubric",
|
|
479
477
|
"composite",
|
|
480
478
|
"tool-trajectory",
|
|
@@ -14856,12 +14854,6 @@ function parseExecutionDefaults(raw, configPath) {
|
|
|
14856
14854
|
} else if (obj.verbose !== void 0) {
|
|
14857
14855
|
logWarning(`Invalid execution.verbose in ${configPath}, expected boolean`);
|
|
14858
14856
|
}
|
|
14859
|
-
const traceFile = obj.trace_file;
|
|
14860
|
-
if (typeof traceFile === "string" && traceFile.trim().length > 0) {
|
|
14861
|
-
result.trace_file = traceFile.trim();
|
|
14862
|
-
} else if (traceFile !== void 0) {
|
|
14863
|
-
logWarning(`Invalid execution.trace_file in ${configPath}, expected non-empty string`);
|
|
14864
|
-
}
|
|
14865
14857
|
if (typeof obj.keep_workspaces === "boolean") {
|
|
14866
14858
|
result.keep_workspaces = obj.keep_workspaces;
|
|
14867
14859
|
} else if (obj.keep_workspaces !== void 0) {
|
|
@@ -14966,6 +14958,9 @@ var ANSI_RESET4 = "\x1B[0m";
|
|
|
14966
14958
|
function normalizeEvaluatorType(type) {
|
|
14967
14959
|
return type.replace(/_/g, "-");
|
|
14968
14960
|
}
|
|
14961
|
+
function isDeprecatedJudgeType(type) {
|
|
14962
|
+
return type === "code-judge" || type === "llm-judge";
|
|
14963
|
+
}
|
|
14969
14964
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
14970
14965
|
const execution = rawEvalCase.execution;
|
|
14971
14966
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -15028,6 +15023,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15028
15023
|
const rawName = asString(rawEvaluator.name);
|
|
15029
15024
|
const rawType = rawEvaluator.type;
|
|
15030
15025
|
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
15026
|
+
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
15027
|
+
logWarning2(
|
|
15028
|
+
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
15029
|
+
);
|
|
15030
|
+
continue;
|
|
15031
|
+
}
|
|
15031
15032
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
15032
15033
|
if (typeof typeValue !== "string") {
|
|
15033
15034
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -15060,7 +15061,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15060
15061
|
});
|
|
15061
15062
|
continue;
|
|
15062
15063
|
}
|
|
15063
|
-
if (typeValue === "code-grader"
|
|
15064
|
+
if (typeValue === "code-grader") {
|
|
15064
15065
|
let command;
|
|
15065
15066
|
if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
|
|
15066
15067
|
console.warn(
|
|
@@ -15170,7 +15171,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15170
15171
|
continue;
|
|
15171
15172
|
}
|
|
15172
15173
|
const aggregatorType = asString(rawAggregator.type);
|
|
15173
|
-
|
|
15174
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
|
|
15175
|
+
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
15176
|
+
logWarning2(
|
|
15177
|
+
`Skipping composite evaluator '${name21}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
15178
|
+
);
|
|
15179
|
+
continue;
|
|
15180
|
+
}
|
|
15181
|
+
if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
|
|
15174
15182
|
logWarning2(
|
|
15175
15183
|
`Skipping composite evaluator '${name21}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
15176
15184
|
);
|
|
@@ -15205,7 +15213,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15205
15213
|
continue;
|
|
15206
15214
|
}
|
|
15207
15215
|
let aggregator;
|
|
15208
|
-
if (
|
|
15216
|
+
if (normalizedAggregatorType === "weighted_average") {
|
|
15209
15217
|
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
15210
15218
|
const parsedWeights = {};
|
|
15211
15219
|
if (weights) {
|
|
@@ -15219,7 +15227,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15219
15227
|
type: "weighted_average",
|
|
15220
15228
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
15221
15229
|
};
|
|
15222
|
-
} else if (
|
|
15230
|
+
} else if (normalizedAggregatorType === "code-grader") {
|
|
15223
15231
|
const aggregatorPath = asString(rawAggregator.path);
|
|
15224
15232
|
if (!aggregatorPath) {
|
|
15225
15233
|
logWarning2(
|
|
@@ -15232,7 +15240,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
15232
15240
|
path: aggregatorPath,
|
|
15233
15241
|
cwd: searchRoots[0]
|
|
15234
15242
|
};
|
|
15235
|
-
} else if (
|
|
15243
|
+
} else if (normalizedAggregatorType === "threshold") {
|
|
15236
15244
|
const thresholdValue = rawAggregator.threshold;
|
|
15237
15245
|
if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
|
|
15238
15246
|
logWarning2(
|
|
@@ -15980,10 +15988,15 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
15980
15988
|
return void 0;
|
|
15981
15989
|
}
|
|
15982
15990
|
const normalized = normalizeEvaluatorType(candidate);
|
|
15991
|
+
if (isDeprecatedJudgeType(normalized)) {
|
|
15992
|
+
throw new Error(
|
|
15993
|
+
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
15994
|
+
);
|
|
15995
|
+
}
|
|
15983
15996
|
if (isEvaluatorKind(normalized)) {
|
|
15984
15997
|
return normalized;
|
|
15985
15998
|
}
|
|
15986
|
-
logWarning2(`Unknown
|
|
15999
|
+
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
15987
16000
|
return void 0;
|
|
15988
16001
|
}
|
|
15989
16002
|
function asString(value) {
|
|
@@ -17386,9 +17399,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
17386
17399
|
case "ends_with":
|
|
17387
17400
|
return `Output ends with '${entry.value}'`;
|
|
17388
17401
|
case "llm-grader":
|
|
17389
|
-
case "llm_grader":
|
|
17390
|
-
case "llm-judge":
|
|
17391
|
-
case "llm_judge": {
|
|
17402
|
+
case "llm_grader": {
|
|
17392
17403
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
17393
17404
|
return null;
|
|
17394
17405
|
}
|
|
@@ -17401,9 +17412,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
17401
17412
|
return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
|
|
17402
17413
|
}
|
|
17403
17414
|
case "code-grader":
|
|
17404
|
-
case "code_grader":
|
|
17405
|
-
case "code-judge":
|
|
17406
|
-
case "code_judge": {
|
|
17415
|
+
case "code_grader": {
|
|
17407
17416
|
const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
|
|
17408
17417
|
const desc = typeof entry.description === "string" ? entry.description : void 0;
|
|
17409
17418
|
return codeGraderInstruction(graderName, desc);
|
|
@@ -17434,7 +17443,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
17434
17443
|
}
|
|
17435
17444
|
}
|
|
17436
17445
|
function assertionToNaturalLanguageList(entry) {
|
|
17437
|
-
if (entry.type === "llm-grader" || entry.type === "llm_grader"
|
|
17446
|
+
if (entry.type === "llm-grader" || entry.type === "llm_grader") {
|
|
17438
17447
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
17439
17448
|
return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
|
|
17440
17449
|
}
|
|
@@ -23614,10 +23623,26 @@ function extractJsonBlob(text2) {
|
|
|
23614
23623
|
const match = text2.match(/\{[\s\S]*\}/);
|
|
23615
23624
|
return match?.[0];
|
|
23616
23625
|
}
|
|
23626
|
+
function repairSchemaNearBooleanFields(text2) {
|
|
23627
|
+
return text2.replace(
|
|
23628
|
+
/("passed"\s*:\s*)(?:"([^"]+)"|([A-Za-z_][A-Za-z0-9_-]*))/gi,
|
|
23629
|
+
(_match, prefix, quotedValue, bareValue) => {
|
|
23630
|
+
const value = (quotedValue ?? bareValue ?? "").trim().toLowerCase();
|
|
23631
|
+
if (value === "true") {
|
|
23632
|
+
return `${prefix}true`;
|
|
23633
|
+
}
|
|
23634
|
+
if (value === "false") {
|
|
23635
|
+
return `${prefix}false`;
|
|
23636
|
+
}
|
|
23637
|
+
return `${prefix}false`;
|
|
23638
|
+
}
|
|
23639
|
+
);
|
|
23640
|
+
}
|
|
23617
23641
|
function parseJsonFromText(text2) {
|
|
23618
23642
|
const cleaned = typeof text2 === "string" ? text2.replace(/```json\n?|```/g, "").trim() : "";
|
|
23619
23643
|
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
23620
|
-
|
|
23644
|
+
const repaired = repairSchemaNearBooleanFields(blob);
|
|
23645
|
+
return JSON.parse(repaired);
|
|
23621
23646
|
}
|
|
23622
23647
|
function isNonEmptyString(value) {
|
|
23623
23648
|
return typeof value === "string" && value.trim().length > 0;
|
|
@@ -24074,7 +24099,7 @@ function toCamelCaseDeep(obj) {
|
|
|
24074
24099
|
}
|
|
24075
24100
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
24076
24101
|
var CodeEvaluator = class {
|
|
24077
|
-
kind = "code-
|
|
24102
|
+
kind = "code-grader";
|
|
24078
24103
|
command;
|
|
24079
24104
|
cwd;
|
|
24080
24105
|
agentTimeoutMs;
|
|
@@ -24093,7 +24118,7 @@ var CodeEvaluator = class {
|
|
|
24093
24118
|
if (outputForPayload) {
|
|
24094
24119
|
const serialized = JSON.stringify(outputForPayload);
|
|
24095
24120
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
24096
|
-
const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-
|
|
24121
|
+
const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
|
|
24097
24122
|
outputPath = join(tmpDir, "output.json");
|
|
24098
24123
|
await writeFile6(outputPath, serialized);
|
|
24099
24124
|
outputForPayload = null;
|
|
@@ -24342,7 +24367,7 @@ var LlmGraderEvaluator = class {
|
|
|
24342
24367
|
return this.evaluateWithDelegatedAgent(context2, graderProvider);
|
|
24343
24368
|
}
|
|
24344
24369
|
const config = context2.evaluator;
|
|
24345
|
-
if (
|
|
24370
|
+
if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
|
|
24346
24371
|
return this.evaluateWithRubrics(context2, graderProvider, config.rubrics);
|
|
24347
24372
|
}
|
|
24348
24373
|
return this.evaluateFreeform(context2, graderProvider);
|
|
@@ -24527,7 +24552,7 @@ ${context2.fileChanges}`;
|
|
|
24527
24552
|
const systemPrompt = this.buildAgentSystemPrompt(context2);
|
|
24528
24553
|
const userPrompt = this.buildAgentUserPrompt(context2);
|
|
24529
24554
|
const config = context2.evaluator;
|
|
24530
|
-
const rubrics = config?.type === "llm-grader"
|
|
24555
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24531
24556
|
const fsTools = createFilesystemTools(workspacePath);
|
|
24532
24557
|
const evaluatorRawRequest = {
|
|
24533
24558
|
mode: "built-in",
|
|
@@ -24623,7 +24648,7 @@ ${context2.fileChanges}`;
|
|
|
24623
24648
|
};
|
|
24624
24649
|
}
|
|
24625
24650
|
const config = context2.evaluator;
|
|
24626
|
-
const rubrics = config?.type === "llm-grader"
|
|
24651
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24627
24652
|
const details = {
|
|
24628
24653
|
mode: modeLabel,
|
|
24629
24654
|
grader_target: provider.targetName
|
|
@@ -24663,7 +24688,7 @@ ${context2.fileChanges}`;
|
|
|
24663
24688
|
*/
|
|
24664
24689
|
buildAgentSystemPrompt(context2) {
|
|
24665
24690
|
const config = context2.evaluator;
|
|
24666
|
-
const rubrics = config?.type === "llm-grader"
|
|
24691
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24667
24692
|
const parts = [
|
|
24668
24693
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
24669
24694
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -24694,7 +24719,7 @@ ${context2.fileChanges}`;
|
|
|
24694
24719
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
24695
24720
|
}
|
|
24696
24721
|
const config = context2.evaluator;
|
|
24697
|
-
const rubrics = config?.type === "llm-grader"
|
|
24722
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24698
24723
|
const parts = [
|
|
24699
24724
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
24700
24725
|
"",
|
|
@@ -24737,7 +24762,7 @@ ${context2.fileChanges}`;
|
|
|
24737
24762
|
buildDelegatedPrompt(context2) {
|
|
24738
24763
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
24739
24764
|
const config = context2.evaluator;
|
|
24740
|
-
const rubrics = config?.type === "llm-grader"
|
|
24765
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
24741
24766
|
if (this.evaluatorTemplate) {
|
|
24742
24767
|
const variables = {
|
|
24743
24768
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
@@ -25232,10 +25257,8 @@ var CompositeEvaluator = class {
|
|
|
25232
25257
|
const aggregator = this.config.aggregator;
|
|
25233
25258
|
switch (aggregator.type) {
|
|
25234
25259
|
case "code-grader":
|
|
25235
|
-
case "code-judge":
|
|
25236
25260
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
25237
25261
|
case "llm-grader":
|
|
25238
|
-
case "llm-judge":
|
|
25239
25262
|
return this.runLlmAggregator(results, context2, aggregator);
|
|
25240
25263
|
case "threshold":
|
|
25241
25264
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -27620,7 +27643,7 @@ var endsWithFactory = (config) => {
|
|
|
27620
27643
|
};
|
|
27621
27644
|
function createBuiltinRegistry() {
|
|
27622
27645
|
const registry = new EvaluatorRegistry();
|
|
27623
|
-
registry.register("llm-grader", llmGraderFactory).register("
|
|
27646
|
+
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
27624
27647
|
const fn = config[INLINE_ASSERT_FN];
|
|
27625
27648
|
if (!fn) {
|
|
27626
27649
|
throw new Error(
|
|
@@ -30296,7 +30319,7 @@ function filterEvalCases(evalCases, filter2) {
|
|
|
30296
30319
|
return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter2));
|
|
30297
30320
|
}
|
|
30298
30321
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
30299
|
-
const llmGrader = overrides?.["llm-grader"] ??
|
|
30322
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
30300
30323
|
resolveGraderProvider: async (context2) => {
|
|
30301
30324
|
if (context2.graderProvider) {
|
|
30302
30325
|
return context2.graderProvider;
|
|
@@ -30717,8 +30740,6 @@ var AgentVConfigSchema = external_exports2.object({
|
|
|
30717
30740
|
agentTimeoutMs: external_exports2.number().int().min(0).optional(),
|
|
30718
30741
|
/** Enable verbose logging */
|
|
30719
30742
|
verbose: external_exports2.boolean().optional(),
|
|
30720
|
-
/** Write human-readable trace JSONL to this path (supports {timestamp} placeholder) */
|
|
30721
|
-
traceFile: external_exports2.string().optional(),
|
|
30722
30743
|
/** Always keep temp workspaces after eval */
|
|
30723
30744
|
keepWorkspaces: external_exports2.boolean().optional(),
|
|
30724
30745
|
/** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
|
|
@@ -31006,12 +31027,6 @@ var OtelTraceExporter = class {
|
|
|
31006
31027
|
new SimpleSpanProcessor(new OtlpJsonFileExporter2(this.options.otlpFilePath))
|
|
31007
31028
|
);
|
|
31008
31029
|
}
|
|
31009
|
-
if (this.options.traceFilePath) {
|
|
31010
|
-
const { SimpleTraceFileExporter: SimpleTraceFileExporter2 } = await import("./simple-trace-file-exporter-CRIO5HDZ-QYYT2QQT.js");
|
|
31011
|
-
processors.push(
|
|
31012
|
-
new SimpleSpanProcessor(new SimpleTraceFileExporter2(this.options.traceFilePath))
|
|
31013
|
-
);
|
|
31014
|
-
}
|
|
31015
31030
|
if (processors.length === 0) {
|
|
31016
31031
|
return false;
|
|
31017
31032
|
}
|
|
@@ -31125,10 +31140,10 @@ var OtelTraceExporter = class {
|
|
|
31125
31140
|
}
|
|
31126
31141
|
if (result.scores) {
|
|
31127
31142
|
for (const score of result.scores) {
|
|
31128
|
-
rootSpan.addEvent(`agentv.
|
|
31129
|
-
"agentv.
|
|
31130
|
-
"agentv.
|
|
31131
|
-
...score.verdict ? { "agentv.
|
|
31143
|
+
rootSpan.addEvent(`agentv.grader.${score.name}`, {
|
|
31144
|
+
"agentv.grader.score": score.score,
|
|
31145
|
+
"agentv.grader.type": score.type,
|
|
31146
|
+
...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
|
|
31132
31147
|
});
|
|
31133
31148
|
}
|
|
31134
31149
|
}
|
|
@@ -31588,4 +31603,4 @@ export {
|
|
|
31588
31603
|
OtelStreamingObserver,
|
|
31589
31604
|
createAgentKernel
|
|
31590
31605
|
};
|
|
31591
|
-
//# sourceMappingURL=chunk-
|
|
31606
|
+
//# sourceMappingURL=chunk-K747KGDP.js.map
|