@agentv/core 3.4.0 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-NFFLXG5M.js +7 -0
- package/dist/{chunk-JO4HIAEF.js → chunk-EFR4JHPL.js} +1 -5
- package/dist/chunk-EFR4JHPL.js.map +1 -0
- package/dist/{chunk-Q52FQPKQ.js → chunk-W5YDZWT4.js} +2 -2
- package/dist/chunk-W5YDZWT4.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +382 -436
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +55 -46
- package/dist/index.d.ts +55 -46
- package/dist/index.js +384 -435
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/agentv-provider-HDSAUUEF.js +0 -7
- package/dist/chunk-JO4HIAEF.js.map +0 -1
- package/dist/chunk-Q52FQPKQ.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF.js.map → agentv-provider-NFFLXG5M.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -5,7 +5,6 @@ import {
|
|
|
5
5
|
extractLastAssistantContent,
|
|
6
6
|
fileExists,
|
|
7
7
|
findGitRoot,
|
|
8
|
-
getHitCount,
|
|
9
8
|
isAgentProvider,
|
|
10
9
|
isEvaluatorKind,
|
|
11
10
|
isJsonObject,
|
|
@@ -17,10 +16,10 @@ import {
|
|
|
17
16
|
readTextFile,
|
|
18
17
|
resolveFileReference,
|
|
19
18
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-JO4HIAEF.js";
|
|
19
|
+
} from "./chunk-EFR4JHPL.js";
|
|
21
20
|
import {
|
|
22
21
|
AgentvProvider
|
|
23
|
-
} from "./chunk-Q52FQPKQ.js";
|
|
22
|
+
} from "./chunk-W5YDZWT4.js";
|
|
24
23
|
import {
|
|
25
24
|
OtlpJsonFileExporter
|
|
26
25
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -3752,7 +3751,7 @@ var AzureProvider = class {
|
|
|
3752
3751
|
};
|
|
3753
3752
|
this.retryConfig = config.retry;
|
|
3754
3753
|
const azure = createAzure(buildAzureOptions(config));
|
|
3755
|
-
this.model = azure(config.deploymentName);
|
|
3754
|
+
this.model = azure.chat(config.deploymentName);
|
|
3756
3755
|
}
|
|
3757
3756
|
id;
|
|
3758
3757
|
kind = "azure";
|
|
@@ -9784,9 +9783,11 @@ function negateScore(score) {
|
|
|
9784
9783
|
...score,
|
|
9785
9784
|
score: negatedScore,
|
|
9786
9785
|
verdict: negatedVerdict,
|
|
9787
|
-
|
|
9788
|
-
|
|
9789
|
-
|
|
9786
|
+
assertions: score.assertions.map((a) => ({
|
|
9787
|
+
...a,
|
|
9788
|
+
passed: !a.passed,
|
|
9789
|
+
evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
|
|
9790
|
+
}))
|
|
9790
9791
|
};
|
|
9791
9792
|
}
|
|
9792
9793
|
|
|
@@ -10301,9 +10302,13 @@ var CodeEvaluator = class {
|
|
|
10301
10302
|
);
|
|
10302
10303
|
const parsed = parseJsonSafe(stdout);
|
|
10303
10304
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
10304
|
-
const
|
|
10305
|
-
|
|
10306
|
-
|
|
10305
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
10306
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
10307
|
+
).map((a) => ({
|
|
10308
|
+
text: String(a.text),
|
|
10309
|
+
passed: Boolean(a.passed),
|
|
10310
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
10311
|
+
})) : [];
|
|
10307
10312
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
10308
10313
|
const proxyUsage = getProxyUsage?.();
|
|
10309
10314
|
const evaluatorRawRequest = {
|
|
@@ -10319,10 +10324,8 @@ var CodeEvaluator = class {
|
|
|
10319
10324
|
return {
|
|
10320
10325
|
score,
|
|
10321
10326
|
verdict: scoreToVerdict(score),
|
|
10322
|
-
|
|
10323
|
-
|
|
10324
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
10325
|
-
reasoning,
|
|
10327
|
+
assertions,
|
|
10328
|
+
expectedAspectCount: assertions.length || 1,
|
|
10326
10329
|
evaluatorRawRequest,
|
|
10327
10330
|
...details ? { details } : {},
|
|
10328
10331
|
tokenUsage: proxyUsage?.tokenUsage
|
|
@@ -10333,10 +10336,8 @@ var CodeEvaluator = class {
|
|
|
10333
10336
|
return {
|
|
10334
10337
|
score: 0,
|
|
10335
10338
|
verdict: "fail",
|
|
10336
|
-
|
|
10337
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
10339
|
+
assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
|
|
10338
10340
|
expectedAspectCount: 1,
|
|
10339
|
-
reasoning: message,
|
|
10340
10341
|
evaluatorRawRequest: {
|
|
10341
10342
|
command: this.command,
|
|
10342
10343
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
@@ -10444,9 +10445,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
10444
10445
|
{{${TEMPLATE_VARIABLES.ANSWER}}}`;
|
|
10445
10446
|
var freeformEvaluationSchema = z3.object({
|
|
10446
10447
|
score: z3.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
10447
|
-
|
|
10448
|
-
|
|
10449
|
-
|
|
10448
|
+
assertions: z3.array(
|
|
10449
|
+
z3.object({
|
|
10450
|
+
text: z3.string().describe("Brief description of what was checked"),
|
|
10451
|
+
passed: z3.boolean().describe("Whether this aspect was satisfied"),
|
|
10452
|
+
evidence: z3.string().describe("Concise evidence (1-2 sentences)").optional()
|
|
10453
|
+
})
|
|
10454
|
+
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
|
|
10450
10455
|
});
|
|
10451
10456
|
var rubricCheckResultSchema = z3.object({
|
|
10452
10457
|
id: z3.string().describe("The ID of the rubric item being checked"),
|
|
@@ -10548,17 +10553,12 @@ ${context.fileChanges}`;
|
|
|
10548
10553
|
schema: freeformEvaluationSchema
|
|
10549
10554
|
});
|
|
10550
10555
|
const score = clampScore(data.score);
|
|
10551
|
-
const
|
|
10552
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
10553
|
-
const reasoning = data.reasoning;
|
|
10554
|
-
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
10556
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
10555
10557
|
return {
|
|
10556
10558
|
score,
|
|
10557
10559
|
verdict: scoreToVerdict(score),
|
|
10558
|
-
|
|
10559
|
-
|
|
10560
|
-
expectedAspectCount,
|
|
10561
|
-
reasoning,
|
|
10560
|
+
assertions,
|
|
10561
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10562
10562
|
evaluatorRawRequest,
|
|
10563
10563
|
tokenUsage
|
|
10564
10564
|
};
|
|
@@ -10569,10 +10569,8 @@ ${context.fileChanges}`;
|
|
|
10569
10569
|
return {
|
|
10570
10570
|
score: 0,
|
|
10571
10571
|
verdict: "skip",
|
|
10572
|
-
|
|
10573
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
10572
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10574
10573
|
expectedAspectCount: 1,
|
|
10575
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
10576
10574
|
evaluatorRawRequest
|
|
10577
10575
|
};
|
|
10578
10576
|
}
|
|
@@ -10602,14 +10600,12 @@ ${context.fileChanges}`;
|
|
|
10602
10600
|
userPrompt: prompt,
|
|
10603
10601
|
schema: rubricEvaluationSchema
|
|
10604
10602
|
});
|
|
10605
|
-
const { score, verdict,
|
|
10603
|
+
const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
|
|
10606
10604
|
return {
|
|
10607
10605
|
score,
|
|
10608
10606
|
verdict,
|
|
10609
|
-
|
|
10610
|
-
misses,
|
|
10607
|
+
assertions,
|
|
10611
10608
|
expectedAspectCount: rubrics.length,
|
|
10612
|
-
reasoning: data.overall_reasoning,
|
|
10613
10609
|
evaluatorRawRequest,
|
|
10614
10610
|
tokenUsage
|
|
10615
10611
|
};
|
|
@@ -10620,10 +10616,8 @@ ${context.fileChanges}`;
|
|
|
10620
10616
|
return {
|
|
10621
10617
|
score: 0,
|
|
10622
10618
|
verdict: "skip",
|
|
10623
|
-
|
|
10624
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
10619
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10625
10620
|
expectedAspectCount: rubrics.length,
|
|
10626
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
10627
10621
|
evaluatorRawRequest
|
|
10628
10622
|
};
|
|
10629
10623
|
}
|
|
@@ -10648,14 +10642,12 @@ ${context.fileChanges}`;
|
|
|
10648
10642
|
userPrompt: prompt,
|
|
10649
10643
|
schema: scoreRangeEvaluationSchema
|
|
10650
10644
|
});
|
|
10651
|
-
const { score, verdict,
|
|
10645
|
+
const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
|
|
10652
10646
|
return {
|
|
10653
10647
|
score,
|
|
10654
10648
|
verdict,
|
|
10655
|
-
|
|
10656
|
-
misses,
|
|
10649
|
+
assertions,
|
|
10657
10650
|
expectedAspectCount: rubrics.length,
|
|
10658
|
-
reasoning: data.overall_reasoning,
|
|
10659
10651
|
evaluatorRawRequest,
|
|
10660
10652
|
details,
|
|
10661
10653
|
tokenUsage
|
|
@@ -10667,10 +10659,8 @@ ${context.fileChanges}`;
|
|
|
10667
10659
|
return {
|
|
10668
10660
|
score: 0,
|
|
10669
10661
|
verdict: "skip",
|
|
10670
|
-
|
|
10671
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
10662
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10672
10663
|
expectedAspectCount: rubrics.length,
|
|
10673
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
10674
10664
|
evaluatorRawRequest
|
|
10675
10665
|
};
|
|
10676
10666
|
}
|
|
@@ -10727,8 +10717,7 @@ ${context.fileChanges}`;
|
|
|
10727
10717
|
return {
|
|
10728
10718
|
score: 0,
|
|
10729
10719
|
verdict: "fail",
|
|
10730
|
-
|
|
10731
|
-
misses: [`llm-grader built-in evaluation failed: ${message}`],
|
|
10720
|
+
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
10732
10721
|
expectedAspectCount: 1,
|
|
10733
10722
|
evaluatorRawRequest,
|
|
10734
10723
|
details: { mode: "built-in", error: message }
|
|
@@ -10778,8 +10767,9 @@ ${context.fileChanges}`;
|
|
|
10778
10767
|
return {
|
|
10779
10768
|
score: 0,
|
|
10780
10769
|
verdict: "fail",
|
|
10781
|
-
|
|
10782
|
-
|
|
10770
|
+
assertions: [
|
|
10771
|
+
{ text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
|
|
10772
|
+
],
|
|
10783
10773
|
expectedAspectCount: 1,
|
|
10784
10774
|
evaluatorRawRequest,
|
|
10785
10775
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
@@ -10797,8 +10787,9 @@ ${context.fileChanges}`;
|
|
|
10797
10787
|
return {
|
|
10798
10788
|
score: 0,
|
|
10799
10789
|
verdict: "fail",
|
|
10800
|
-
|
|
10801
|
-
|
|
10790
|
+
assertions: [
|
|
10791
|
+
{ text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
|
|
10792
|
+
],
|
|
10802
10793
|
expectedAspectCount: 1,
|
|
10803
10794
|
evaluatorRawRequest,
|
|
10804
10795
|
details: {
|
|
@@ -10950,29 +10941,24 @@ ${outputSchema}`;
|
|
|
10950
10941
|
const parsed = parseJsonFromText(text);
|
|
10951
10942
|
if (rubrics && rubrics.length > 0) {
|
|
10952
10943
|
const data2 = rubricEvaluationSchema.parse(parsed);
|
|
10953
|
-
const { score: score2, verdict,
|
|
10944
|
+
const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
|
|
10954
10945
|
return {
|
|
10955
10946
|
score: score2,
|
|
10956
10947
|
verdict,
|
|
10957
|
-
|
|
10958
|
-
misses: misses2,
|
|
10948
|
+
assertions: assertions2,
|
|
10959
10949
|
expectedAspectCount: rubrics.length,
|
|
10960
|
-
reasoning: data2.overall_reasoning,
|
|
10961
10950
|
evaluatorRawRequest,
|
|
10962
10951
|
details
|
|
10963
10952
|
};
|
|
10964
10953
|
}
|
|
10965
10954
|
const data = freeformEvaluationSchema.parse(parsed);
|
|
10966
10955
|
const score = clampScore(data.score);
|
|
10967
|
-
const
|
|
10968
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
10956
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
10969
10957
|
return {
|
|
10970
10958
|
score,
|
|
10971
10959
|
verdict: scoreToVerdict(score),
|
|
10972
|
-
|
|
10973
|
-
|
|
10974
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
10975
|
-
reasoning: data.reasoning,
|
|
10960
|
+
assertions,
|
|
10961
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10976
10962
|
evaluatorRawRequest,
|
|
10977
10963
|
details
|
|
10978
10964
|
};
|
|
@@ -10980,8 +10966,12 @@ ${outputSchema}`;
|
|
|
10980
10966
|
return {
|
|
10981
10967
|
score: 0,
|
|
10982
10968
|
verdict: "fail",
|
|
10983
|
-
|
|
10984
|
-
|
|
10969
|
+
assertions: [
|
|
10970
|
+
{
|
|
10971
|
+
text: "Failed to parse llm-grader agent response as valid evaluation JSON",
|
|
10972
|
+
passed: false
|
|
10973
|
+
}
|
|
10974
|
+
],
|
|
10985
10975
|
expectedAspectCount: 1,
|
|
10986
10976
|
evaluatorRawRequest,
|
|
10987
10977
|
details
|
|
@@ -11110,9 +11100,13 @@ function buildOutputSchema() {
|
|
|
11110
11100
|
"",
|
|
11111
11101
|
"{",
|
|
11112
11102
|
' "score": <number between 0.0 and 1.0>,',
|
|
11113
|
-
' "
|
|
11114
|
-
|
|
11115
|
-
'
|
|
11103
|
+
' "assertions": [',
|
|
11104
|
+
" {",
|
|
11105
|
+
' "text": "<brief description of what was checked>",',
|
|
11106
|
+
' "passed": <boolean>,',
|
|
11107
|
+
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
|
|
11108
|
+
" }",
|
|
11109
|
+
" ]",
|
|
11116
11110
|
"}"
|
|
11117
11111
|
].join("\n");
|
|
11118
11112
|
}
|
|
@@ -11137,8 +11131,7 @@ function substituteVariables(template, variables) {
|
|
|
11137
11131
|
}
|
|
11138
11132
|
function calculateRubricScore(result, rubrics) {
|
|
11139
11133
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
11140
|
-
const
|
|
11141
|
-
const misses = [];
|
|
11134
|
+
const assertions = [];
|
|
11142
11135
|
let totalWeight = 0;
|
|
11143
11136
|
let earnedWeight = 0;
|
|
11144
11137
|
let failedRequired = false;
|
|
@@ -11148,19 +11141,20 @@ function calculateRubricScore(result, rubrics) {
|
|
|
11148
11141
|
continue;
|
|
11149
11142
|
}
|
|
11150
11143
|
totalWeight += rubric.weight;
|
|
11144
|
+
assertions.push({
|
|
11145
|
+
text: `[${rubric.id}] ${rubric.outcome}`,
|
|
11146
|
+
passed: check.satisfied,
|
|
11147
|
+
evidence: check.reasoning
|
|
11148
|
+
});
|
|
11151
11149
|
if (check.satisfied) {
|
|
11152
11150
|
earnedWeight += rubric.weight;
|
|
11153
|
-
|
|
11154
|
-
|
|
11155
|
-
misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
|
|
11156
|
-
if (rubric.required) {
|
|
11157
|
-
failedRequired = true;
|
|
11158
|
-
}
|
|
11151
|
+
} else if (rubric.required) {
|
|
11152
|
+
failedRequired = true;
|
|
11159
11153
|
}
|
|
11160
11154
|
}
|
|
11161
11155
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
11162
11156
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
11163
|
-
return { score, verdict,
|
|
11157
|
+
return { score, verdict, assertions };
|
|
11164
11158
|
}
|
|
11165
11159
|
function buildScoreRangeOutputSchema() {
|
|
11166
11160
|
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
@@ -11180,8 +11174,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
|
|
|
11180
11174
|
}
|
|
11181
11175
|
function calculateScoreRangeResult(result, rubrics) {
|
|
11182
11176
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
11183
|
-
const hits = [];
|
|
11184
|
-
const misses = [];
|
|
11177
|
+
const assertions = [];
|
|
11185
11178
|
const rawScores = {};
|
|
11186
11179
|
let totalWeight = 0;
|
|
11187
11180
|
let weightedScoreSum = 0;
|
|
@@ -11207,24 +11200,22 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
11207
11200
|
);
|
|
11208
11201
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
11209
11202
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
11210
|
-
const
|
|
11211
|
-
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
11203
|
+
const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
|
|
11212
11204
|
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
11213
11205
|
failedRequired = true;
|
|
11214
|
-
misses.push(scoreInfo);
|
|
11215
|
-
} else if (rawScore >= 7) {
|
|
11216
|
-
hits.push(scoreInfo);
|
|
11217
|
-
} else {
|
|
11218
|
-
misses.push(scoreInfo);
|
|
11219
11206
|
}
|
|
11207
|
+
assertions.push({
|
|
11208
|
+
text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
|
|
11209
|
+
passed,
|
|
11210
|
+
evidence: check.reasoning
|
|
11211
|
+
});
|
|
11220
11212
|
}
|
|
11221
11213
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
11222
11214
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
11223
11215
|
return {
|
|
11224
11216
|
score,
|
|
11225
11217
|
verdict,
|
|
11226
|
-
|
|
11227
|
-
misses,
|
|
11218
|
+
assertions,
|
|
11228
11219
|
details: {
|
|
11229
11220
|
raw_scores: rawScores,
|
|
11230
11221
|
normalization: "score / 10",
|
|
@@ -11400,9 +11391,7 @@ var CompositeEvaluator = class {
|
|
|
11400
11391
|
let totalWeight = 0;
|
|
11401
11392
|
let weightedSum = 0;
|
|
11402
11393
|
let evaluatedCount = 0;
|
|
11403
|
-
const allHits = [];
|
|
11404
|
-
const allMisses = [];
|
|
11405
|
-
const reasoningParts = [];
|
|
11394
|
+
const allAssertions = [];
|
|
11406
11395
|
const scores = [];
|
|
11407
11396
|
for (const member of results) {
|
|
11408
11397
|
const weight = weights?.[member.id] ?? 1;
|
|
@@ -11412,9 +11401,7 @@ var CompositeEvaluator = class {
|
|
|
11412
11401
|
score: member.result.score,
|
|
11413
11402
|
weight,
|
|
11414
11403
|
verdict: member.result.verdict,
|
|
11415
|
-
|
|
11416
|
-
misses: [...member.result.misses],
|
|
11417
|
-
reasoning: member.result.reasoning,
|
|
11404
|
+
assertions: [...member.result.assertions],
|
|
11418
11405
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
11419
11406
|
scores: member.result.scores,
|
|
11420
11407
|
details: member.result.details,
|
|
@@ -11426,20 +11413,16 @@ var CompositeEvaluator = class {
|
|
|
11426
11413
|
evaluatedCount++;
|
|
11427
11414
|
totalWeight += weight;
|
|
11428
11415
|
weightedSum += member.result.score * weight;
|
|
11429
|
-
|
|
11430
|
-
|
|
11431
|
-
|
|
11432
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
11433
|
-
}
|
|
11416
|
+
allAssertions.push(
|
|
11417
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
11418
|
+
);
|
|
11434
11419
|
}
|
|
11435
11420
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
11436
11421
|
return {
|
|
11437
11422
|
score: 0,
|
|
11438
11423
|
verdict: "skip",
|
|
11439
|
-
|
|
11440
|
-
misses: [],
|
|
11424
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
11441
11425
|
expectedAspectCount: 1,
|
|
11442
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
11443
11426
|
evaluatorRawRequest: {
|
|
11444
11427
|
aggregator: "weighted_average",
|
|
11445
11428
|
...weights ? { weights } : {}
|
|
@@ -11451,10 +11434,8 @@ var CompositeEvaluator = class {
|
|
|
11451
11434
|
return {
|
|
11452
11435
|
score: clampScore(finalScore),
|
|
11453
11436
|
verdict: scoreToVerdict(finalScore),
|
|
11454
|
-
|
|
11455
|
-
|
|
11456
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
11457
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
11437
|
+
assertions: allAssertions,
|
|
11438
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
11458
11439
|
evaluatorRawRequest: {
|
|
11459
11440
|
aggregator: "weighted_average",
|
|
11460
11441
|
...weights ? { weights } : {}
|
|
@@ -11464,11 +11445,8 @@ var CompositeEvaluator = class {
|
|
|
11464
11445
|
}
|
|
11465
11446
|
runThreshold(results, threshold) {
|
|
11466
11447
|
const scores = [];
|
|
11467
|
-
const allHits = [];
|
|
11468
|
-
const allMisses = [];
|
|
11469
|
-
const reasoningParts = [];
|
|
11448
|
+
const allAssertions = [];
|
|
11470
11449
|
let passingCount = 0;
|
|
11471
|
-
let borderlineCount = 0;
|
|
11472
11450
|
let evaluatedCount = 0;
|
|
11473
11451
|
for (const member of results) {
|
|
11474
11452
|
scores.push({
|
|
@@ -11476,9 +11454,7 @@ var CompositeEvaluator = class {
|
|
|
11476
11454
|
type: member.type,
|
|
11477
11455
|
score: member.result.score,
|
|
11478
11456
|
verdict: member.result.verdict,
|
|
11479
|
-
|
|
11480
|
-
misses: [...member.result.misses],
|
|
11481
|
-
reasoning: member.result.reasoning,
|
|
11457
|
+
assertions: [...member.result.assertions],
|
|
11482
11458
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
11483
11459
|
scores: member.result.scores,
|
|
11484
11460
|
details: member.result.details,
|
|
@@ -11491,24 +11467,17 @@ var CompositeEvaluator = class {
|
|
|
11491
11467
|
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
11492
11468
|
if (isPassing) {
|
|
11493
11469
|
passingCount++;
|
|
11494
|
-
if (member.result.verdict === "borderline") {
|
|
11495
|
-
borderlineCount++;
|
|
11496
|
-
}
|
|
11497
|
-
}
|
|
11498
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
11499
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
11500
|
-
if (member.result.reasoning) {
|
|
11501
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
11502
11470
|
}
|
|
11471
|
+
allAssertions.push(
|
|
11472
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
11473
|
+
);
|
|
11503
11474
|
}
|
|
11504
11475
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
11505
11476
|
return {
|
|
11506
11477
|
score: 0,
|
|
11507
11478
|
verdict: "skip",
|
|
11508
|
-
|
|
11509
|
-
misses: [],
|
|
11479
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
11510
11480
|
expectedAspectCount: 1,
|
|
11511
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
11512
11481
|
evaluatorRawRequest: {
|
|
11513
11482
|
aggregator: "threshold",
|
|
11514
11483
|
threshold
|
|
@@ -11519,19 +11488,15 @@ var CompositeEvaluator = class {
|
|
|
11519
11488
|
const totalCount = evaluatedCount;
|
|
11520
11489
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
11521
11490
|
const pass = score >= threshold;
|
|
11522
|
-
|
|
11523
|
-
|
|
11524
|
-
|
|
11525
|
-
|
|
11526
|
-
`${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
|
|
11527
|
-
);
|
|
11491
|
+
allAssertions.unshift({
|
|
11492
|
+
text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
|
|
11493
|
+
passed: pass
|
|
11494
|
+
});
|
|
11528
11495
|
return {
|
|
11529
11496
|
score: clampScore(score),
|
|
11530
11497
|
verdict: pass ? "pass" : "fail",
|
|
11531
|
-
|
|
11532
|
-
|
|
11533
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
11534
|
-
reasoning: reasoningParts.join("; "),
|
|
11498
|
+
assertions: allAssertions,
|
|
11499
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
11535
11500
|
evaluatorRawRequest: {
|
|
11536
11501
|
aggregator: "threshold",
|
|
11537
11502
|
threshold
|
|
@@ -11548,9 +11513,7 @@ var CompositeEvaluator = class {
|
|
|
11548
11513
|
score: member.result.score,
|
|
11549
11514
|
weight: weights?.[member.id] ?? 1,
|
|
11550
11515
|
verdict: member.result.verdict,
|
|
11551
|
-
|
|
11552
|
-
misses: [...member.result.misses],
|
|
11553
|
-
reasoning: member.result.reasoning,
|
|
11516
|
+
assertions: [...member.result.assertions],
|
|
11554
11517
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
11555
11518
|
scores: member.result.scores,
|
|
11556
11519
|
details: member.result.details
|
|
@@ -11559,17 +11522,19 @@ var CompositeEvaluator = class {
|
|
|
11559
11522
|
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
11560
11523
|
const parsed = parseJsonSafe(stdout);
|
|
11561
11524
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
11562
|
-
const
|
|
11563
|
-
|
|
11564
|
-
|
|
11525
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
11526
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
11527
|
+
).map((a) => ({
|
|
11528
|
+
text: String(a.text),
|
|
11529
|
+
passed: Boolean(a.passed),
|
|
11530
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
11531
|
+
})) : [];
|
|
11565
11532
|
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
11566
11533
|
return {
|
|
11567
11534
|
score,
|
|
11568
11535
|
verdict,
|
|
11569
|
-
|
|
11570
|
-
|
|
11571
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
11572
|
-
reasoning,
|
|
11536
|
+
assertions,
|
|
11537
|
+
expectedAspectCount: assertions.length || 1,
|
|
11573
11538
|
evaluatorRawRequest: {
|
|
11574
11539
|
aggregator: "code-grader",
|
|
11575
11540
|
script: scriptPath
|
|
@@ -11581,10 +11546,8 @@ var CompositeEvaluator = class {
|
|
|
11581
11546
|
return {
|
|
11582
11547
|
score: 0,
|
|
11583
11548
|
verdict: "fail",
|
|
11584
|
-
|
|
11585
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
11549
|
+
assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
|
|
11586
11550
|
expectedAspectCount: 1,
|
|
11587
|
-
reasoning: message,
|
|
11588
11551
|
evaluatorRawRequest: {
|
|
11589
11552
|
aggregator: "code-grader",
|
|
11590
11553
|
script: scriptPath,
|
|
@@ -11606,9 +11569,7 @@ var CompositeEvaluator = class {
|
|
|
11606
11569
|
type: member.type,
|
|
11607
11570
|
score: member.result.score,
|
|
11608
11571
|
verdict: member.result.verdict,
|
|
11609
|
-
|
|
11610
|
-
misses: [...member.result.misses],
|
|
11611
|
-
reasoning: member.result.reasoning,
|
|
11572
|
+
assertions: [...member.result.assertions],
|
|
11612
11573
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
11613
11574
|
scores: member.result.scores,
|
|
11614
11575
|
details: member.result.details
|
|
@@ -11632,16 +11593,12 @@ var CompositeEvaluator = class {
|
|
|
11632
11593
|
});
|
|
11633
11594
|
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
11634
11595
|
const score2 = clampScore(data2.score);
|
|
11635
|
-
const
|
|
11636
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
11637
|
-
const reasoning2 = data2.reasoning;
|
|
11596
|
+
const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
|
|
11638
11597
|
return {
|
|
11639
11598
|
score: score2,
|
|
11640
11599
|
verdict: scoreToVerdict(score2),
|
|
11641
|
-
|
|
11642
|
-
|
|
11643
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
11644
|
-
reasoning: reasoning2,
|
|
11600
|
+
assertions: assertions2,
|
|
11601
|
+
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
11645
11602
|
evaluatorRawRequest,
|
|
11646
11603
|
scores
|
|
11647
11604
|
};
|
|
@@ -11656,16 +11613,12 @@ var CompositeEvaluator = class {
|
|
|
11656
11613
|
parseJsonFromText(extractLastAssistantContent(response.output))
|
|
11657
11614
|
);
|
|
11658
11615
|
const score = clampScore(data.score);
|
|
11659
|
-
const
|
|
11660
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
11661
|
-
const reasoning = data.reasoning;
|
|
11616
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
11662
11617
|
return {
|
|
11663
11618
|
score,
|
|
11664
11619
|
verdict: scoreToVerdict(score),
|
|
11665
|
-
|
|
11666
|
-
|
|
11667
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
11668
|
-
reasoning,
|
|
11620
|
+
assertions,
|
|
11621
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
11669
11622
|
evaluatorRawRequest,
|
|
11670
11623
|
scores
|
|
11671
11624
|
};
|
|
@@ -11673,8 +11626,7 @@ var CompositeEvaluator = class {
|
|
|
11673
11626
|
return {
|
|
11674
11627
|
score: 0,
|
|
11675
11628
|
verdict: "fail",
|
|
11676
|
-
|
|
11677
|
-
misses: [],
|
|
11629
|
+
assertions: [{ text: "LLM aggregator failed", passed: false }],
|
|
11678
11630
|
expectedAspectCount: 1,
|
|
11679
11631
|
evaluatorRawRequest,
|
|
11680
11632
|
scores
|
|
@@ -11697,10 +11649,8 @@ var CostEvaluator = class {
|
|
|
11697
11649
|
return {
|
|
11698
11650
|
score: 0,
|
|
11699
11651
|
verdict: "fail",
|
|
11700
|
-
|
|
11701
|
-
misses: ["No cost data available in trace"],
|
|
11652
|
+
assertions: [{ text: "No cost data available in trace", passed: false }],
|
|
11702
11653
|
expectedAspectCount: 1,
|
|
11703
|
-
reasoning: "Execution cost not reported by provider",
|
|
11704
11654
|
evaluatorRawRequest: {
|
|
11705
11655
|
type: "cost",
|
|
11706
11656
|
budget,
|
|
@@ -11714,10 +11664,10 @@ var CostEvaluator = class {
|
|
|
11714
11664
|
return {
|
|
11715
11665
|
score,
|
|
11716
11666
|
verdict: passed ? "pass" : "fail",
|
|
11717
|
-
|
|
11718
|
-
|
|
11667
|
+
assertions: [
|
|
11668
|
+
passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
|
|
11669
|
+
],
|
|
11719
11670
|
expectedAspectCount: 1,
|
|
11720
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
11721
11671
|
evaluatorRawRequest: {
|
|
11722
11672
|
type: "cost",
|
|
11723
11673
|
budget,
|
|
@@ -11750,10 +11700,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
11750
11700
|
return {
|
|
11751
11701
|
score: 0,
|
|
11752
11702
|
verdict: "fail",
|
|
11753
|
-
|
|
11754
|
-
misses: ["No trace summary available"],
|
|
11703
|
+
assertions: [{ text: "No trace summary available", passed: false }],
|
|
11755
11704
|
expectedAspectCount: 1,
|
|
11756
|
-
reasoning: "Execution metrics not available - no trace summary provided",
|
|
11757
11705
|
evaluatorRawRequest: {
|
|
11758
11706
|
type: "execution-metrics",
|
|
11759
11707
|
config: this.extractConfiguredThresholds(),
|
|
@@ -11762,116 +11710,114 @@ var ExecutionMetricsEvaluator = class {
|
|
|
11762
11710
|
};
|
|
11763
11711
|
}
|
|
11764
11712
|
const narrowedTrace = trace;
|
|
11765
|
-
const
|
|
11766
|
-
const misses = [];
|
|
11713
|
+
const assertions = [];
|
|
11767
11714
|
const actualMetrics = {};
|
|
11768
11715
|
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
11769
11716
|
const toolCalls = narrowedTrace.eventCount;
|
|
11770
11717
|
actualMetrics.tool_calls = toolCalls;
|
|
11771
11718
|
if (toolCalls <= max_tool_calls) {
|
|
11772
|
-
|
|
11719
|
+
assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
|
|
11773
11720
|
} else {
|
|
11774
|
-
|
|
11721
|
+
assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
|
|
11775
11722
|
}
|
|
11776
11723
|
}
|
|
11777
11724
|
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
11778
11725
|
const llmCalls = narrowedTrace.llmCallCount;
|
|
11779
11726
|
if (llmCalls === void 0) {
|
|
11780
|
-
|
|
11727
|
+
assertions.push({ text: "LLM call count data not available", passed: false });
|
|
11781
11728
|
} else {
|
|
11782
11729
|
actualMetrics.llm_calls = llmCalls;
|
|
11783
11730
|
if (llmCalls <= max_llm_calls) {
|
|
11784
|
-
|
|
11731
|
+
assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
|
|
11785
11732
|
} else {
|
|
11786
|
-
|
|
11733
|
+
assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
|
|
11787
11734
|
}
|
|
11788
11735
|
}
|
|
11789
11736
|
}
|
|
11790
11737
|
if (max_tokens !== void 0) {
|
|
11791
11738
|
if (!tokenUsage) {
|
|
11792
|
-
|
|
11739
|
+
assertions.push({ text: "Token usage data not available", passed: false });
|
|
11793
11740
|
} else {
|
|
11794
11741
|
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
11795
11742
|
actualMetrics.tokens = totalTokens;
|
|
11796
11743
|
if (totalTokens <= max_tokens) {
|
|
11797
|
-
|
|
11744
|
+
assertions.push({
|
|
11745
|
+
text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
|
|
11746
|
+
passed: true
|
|
11747
|
+
});
|
|
11798
11748
|
} else {
|
|
11799
|
-
|
|
11749
|
+
assertions.push({
|
|
11750
|
+
text: `Total tokens ${totalTokens} > ${max_tokens} max`,
|
|
11751
|
+
passed: false
|
|
11752
|
+
});
|
|
11800
11753
|
}
|
|
11801
11754
|
}
|
|
11802
11755
|
}
|
|
11803
11756
|
if (max_cost_usd !== void 0) {
|
|
11804
11757
|
if (costUsd === void 0) {
|
|
11805
|
-
|
|
11758
|
+
assertions.push({ text: "Cost data not available", passed: false });
|
|
11806
11759
|
} else {
|
|
11807
11760
|
actualMetrics.cost_usd = costUsd;
|
|
11808
11761
|
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
11809
11762
|
if (costUsd <= max_cost_usd) {
|
|
11810
|
-
|
|
11763
|
+
assertions.push({
|
|
11764
|
+
text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
|
|
11765
|
+
passed: true
|
|
11766
|
+
});
|
|
11811
11767
|
} else {
|
|
11812
|
-
|
|
11768
|
+
assertions.push({
|
|
11769
|
+
text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
|
|
11770
|
+
passed: false
|
|
11771
|
+
});
|
|
11813
11772
|
}
|
|
11814
11773
|
}
|
|
11815
11774
|
}
|
|
11816
11775
|
if (max_duration_ms !== void 0) {
|
|
11817
11776
|
if (durationMs === void 0) {
|
|
11818
|
-
|
|
11777
|
+
assertions.push({ text: "Duration data not available", passed: false });
|
|
11819
11778
|
} else {
|
|
11820
11779
|
actualMetrics.duration_ms = durationMs;
|
|
11821
11780
|
if (durationMs <= max_duration_ms) {
|
|
11822
|
-
|
|
11781
|
+
assertions.push({
|
|
11782
|
+
text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
|
|
11783
|
+
passed: true
|
|
11784
|
+
});
|
|
11823
11785
|
} else {
|
|
11824
|
-
|
|
11786
|
+
assertions.push({
|
|
11787
|
+
text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
|
|
11788
|
+
passed: false
|
|
11789
|
+
});
|
|
11825
11790
|
}
|
|
11826
11791
|
}
|
|
11827
11792
|
}
|
|
11828
11793
|
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
11829
11794
|
const ratio = explorationRatio(narrowedTrace);
|
|
11830
11795
|
if (ratio === void 0) {
|
|
11831
|
-
|
|
11796
|
+
assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
|
|
11832
11797
|
} else {
|
|
11833
11798
|
actualMetrics.exploration_ratio = ratio;
|
|
11834
11799
|
const diff = Math.abs(ratio - target_exploration_ratio);
|
|
11835
11800
|
if (diff <= exploration_tolerance) {
|
|
11836
|
-
|
|
11837
|
-
`Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}
|
|
11838
|
-
|
|
11801
|
+
assertions.push({
|
|
11802
|
+
text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
|
|
11803
|
+
passed: true
|
|
11804
|
+
});
|
|
11839
11805
|
} else {
|
|
11840
|
-
|
|
11841
|
-
`Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})
|
|
11842
|
-
|
|
11806
|
+
assertions.push({
|
|
11807
|
+
text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
|
|
11808
|
+
passed: false
|
|
11809
|
+
});
|
|
11843
11810
|
}
|
|
11844
11811
|
}
|
|
11845
11812
|
}
|
|
11846
|
-
const totalChecks =
|
|
11847
|
-
const
|
|
11848
|
-
const
|
|
11849
|
-
if (actualMetrics.tool_calls !== void 0) {
|
|
11850
|
-
reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
|
|
11851
|
-
}
|
|
11852
|
-
if (actualMetrics.llm_calls !== void 0) {
|
|
11853
|
-
reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
|
|
11854
|
-
}
|
|
11855
|
-
if (actualMetrics.tokens !== void 0) {
|
|
11856
|
-
reasoningParts.push(`tokens=${actualMetrics.tokens}`);
|
|
11857
|
-
}
|
|
11858
|
-
if (actualMetrics.cost_usd !== void 0) {
|
|
11859
|
-
reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
|
|
11860
|
-
}
|
|
11861
|
-
if (actualMetrics.duration_ms !== void 0) {
|
|
11862
|
-
reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
|
|
11863
|
-
}
|
|
11864
|
-
if (actualMetrics.exploration_ratio !== void 0) {
|
|
11865
|
-
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
11866
|
-
}
|
|
11867
|
-
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
11813
|
+
const totalChecks = assertions.length;
|
|
11814
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
11815
|
+
const score = totalChecks > 0 ? passedCount / totalChecks : 0;
|
|
11868
11816
|
return {
|
|
11869
11817
|
score,
|
|
11870
11818
|
verdict: scoreToVerdict(score),
|
|
11871
|
-
|
|
11872
|
-
misses,
|
|
11819
|
+
assertions,
|
|
11873
11820
|
expectedAspectCount: totalChecks || 1,
|
|
11874
|
-
reasoning,
|
|
11875
11821
|
evaluatorRawRequest: {
|
|
11876
11822
|
type: "execution-metrics",
|
|
11877
11823
|
config: this.extractConfiguredThresholds(),
|
|
@@ -11975,10 +11921,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
11975
11921
|
return {
|
|
11976
11922
|
score: 0,
|
|
11977
11923
|
verdict: "fail",
|
|
11978
|
-
|
|
11979
|
-
|
|
11980
|
-
expectedAspectCount: this.config.fields.length,
|
|
11981
|
-
reasoning: "Candidate answer is not valid JSON"
|
|
11924
|
+
assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
|
|
11925
|
+
expectedAspectCount: this.config.fields.length
|
|
11982
11926
|
};
|
|
11983
11927
|
}
|
|
11984
11928
|
const expectedData = this.extractExpectedData(evalCase.expected_output);
|
|
@@ -11986,10 +11930,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
11986
11930
|
return {
|
|
11987
11931
|
score: 0,
|
|
11988
11932
|
verdict: "fail",
|
|
11989
|
-
|
|
11990
|
-
|
|
11991
|
-
expectedAspectCount: this.config.fields.length,
|
|
11992
|
-
reasoning: "Could not extract expected data from expected_output"
|
|
11933
|
+
assertions: [{ text: "No expected data found in expected_output", passed: false }],
|
|
11934
|
+
expectedAspectCount: this.config.fields.length
|
|
11993
11935
|
};
|
|
11994
11936
|
}
|
|
11995
11937
|
const fieldResults = [];
|
|
@@ -12207,18 +12149,14 @@ var FieldAccuracyEvaluator = class {
|
|
|
12207
12149
|
*/
|
|
12208
12150
|
aggregateResults(results) {
|
|
12209
12151
|
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
12210
|
-
const
|
|
12211
|
-
const misses = [];
|
|
12152
|
+
const assertions = [];
|
|
12212
12153
|
for (const result of results) {
|
|
12213
|
-
|
|
12214
|
-
hits.push(result.message);
|
|
12215
|
-
} else {
|
|
12216
|
-
misses.push(result.message);
|
|
12217
|
-
}
|
|
12154
|
+
assertions.push({ text: result.message, passed: result.hit });
|
|
12218
12155
|
}
|
|
12219
12156
|
let score;
|
|
12220
12157
|
if (aggregation === "all_or_nothing") {
|
|
12221
|
-
|
|
12158
|
+
const hasFailed = assertions.some((a) => !a.passed);
|
|
12159
|
+
score = hasFailed ? 0 : 1;
|
|
12222
12160
|
} else {
|
|
12223
12161
|
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
12224
12162
|
if (totalWeight === 0) {
|
|
@@ -12228,15 +12166,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
12228
12166
|
score = weightedSum / totalWeight;
|
|
12229
12167
|
}
|
|
12230
12168
|
}
|
|
12231
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
12232
12169
|
return {
|
|
12233
12170
|
score: clampScore(score),
|
|
12234
12171
|
verdict: scoreToVerdict(score),
|
|
12235
|
-
|
|
12236
|
-
|
|
12237
|
-
misses: misses.slice(0, 4),
|
|
12238
|
-
expectedAspectCount: results.length,
|
|
12239
|
-
reasoning
|
|
12172
|
+
assertions,
|
|
12173
|
+
expectedAspectCount: results.length
|
|
12240
12174
|
};
|
|
12241
12175
|
}
|
|
12242
12176
|
};
|
|
@@ -12345,10 +12279,8 @@ var LatencyEvaluator = class {
|
|
|
12345
12279
|
return {
|
|
12346
12280
|
score: 0,
|
|
12347
12281
|
verdict: "fail",
|
|
12348
|
-
|
|
12349
|
-
misses: ["No duration data available in trace"],
|
|
12282
|
+
assertions: [{ text: "No duration data available in trace", passed: false }],
|
|
12350
12283
|
expectedAspectCount: 1,
|
|
12351
|
-
reasoning: "Execution duration not reported by provider",
|
|
12352
12284
|
evaluatorRawRequest: {
|
|
12353
12285
|
type: "latency",
|
|
12354
12286
|
threshold,
|
|
@@ -12361,10 +12293,10 @@ var LatencyEvaluator = class {
|
|
|
12361
12293
|
return {
|
|
12362
12294
|
score,
|
|
12363
12295
|
verdict: passed ? "pass" : "fail",
|
|
12364
|
-
|
|
12365
|
-
|
|
12296
|
+
assertions: [
|
|
12297
|
+
passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
|
|
12298
|
+
],
|
|
12366
12299
|
expectedAspectCount: 1,
|
|
12367
|
-
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
12368
12300
|
evaluatorRawRequest: {
|
|
12369
12301
|
type: "latency",
|
|
12370
12302
|
threshold,
|
|
@@ -12440,23 +12372,25 @@ var SkillTriggerEvaluator = class {
|
|
|
12440
12372
|
return {
|
|
12441
12373
|
score: 1,
|
|
12442
12374
|
verdict: "pass",
|
|
12443
|
-
|
|
12444
|
-
|
|
12375
|
+
assertions: [
|
|
12376
|
+
{
|
|
12377
|
+
text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
|
|
12378
|
+
passed: true
|
|
12379
|
+
}
|
|
12445
12380
|
],
|
|
12446
|
-
|
|
12447
|
-
expectedAspectCount: 1,
|
|
12448
|
-
reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
|
|
12381
|
+
expectedAspectCount: 1
|
|
12449
12382
|
};
|
|
12450
12383
|
}
|
|
12451
12384
|
return {
|
|
12452
12385
|
score: 0,
|
|
12453
12386
|
verdict: "fail",
|
|
12454
|
-
|
|
12455
|
-
|
|
12456
|
-
|
|
12387
|
+
assertions: [
|
|
12388
|
+
{
|
|
12389
|
+
text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
|
|
12390
|
+
passed: false
|
|
12391
|
+
}
|
|
12457
12392
|
],
|
|
12458
|
-
expectedAspectCount: 1
|
|
12459
|
-
reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
|
|
12393
|
+
expectedAspectCount: 1
|
|
12460
12394
|
};
|
|
12461
12395
|
}
|
|
12462
12396
|
};
|
|
@@ -12625,10 +12559,8 @@ var TokenUsageEvaluator = class {
|
|
|
12625
12559
|
return {
|
|
12626
12560
|
score: 0,
|
|
12627
12561
|
verdict: "fail",
|
|
12628
|
-
|
|
12629
|
-
misses: ["No token usage data available in trace"],
|
|
12562
|
+
assertions: [{ text: "No token usage data available in trace", passed: false }],
|
|
12630
12563
|
expectedAspectCount,
|
|
12631
|
-
reasoning: "Token usage not reported by provider",
|
|
12632
12564
|
evaluatorRawRequest: {
|
|
12633
12565
|
type: "token-usage",
|
|
12634
12566
|
max_total: maxTotal ?? null,
|
|
@@ -12642,37 +12574,34 @@ var TokenUsageEvaluator = class {
|
|
|
12642
12574
|
const output = usage.output;
|
|
12643
12575
|
const cached = usage.cached ?? 0;
|
|
12644
12576
|
const total = input + output + cached;
|
|
12645
|
-
const
|
|
12646
|
-
const misses = [];
|
|
12577
|
+
const assertions = [];
|
|
12647
12578
|
if (typeof maxInput === "number") {
|
|
12648
12579
|
if (input <= maxInput) {
|
|
12649
|
-
|
|
12580
|
+
assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
|
|
12650
12581
|
} else {
|
|
12651
|
-
|
|
12582
|
+
assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
|
|
12652
12583
|
}
|
|
12653
12584
|
}
|
|
12654
12585
|
if (typeof maxOutput === "number") {
|
|
12655
12586
|
if (output <= maxOutput) {
|
|
12656
|
-
|
|
12587
|
+
assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
|
|
12657
12588
|
} else {
|
|
12658
|
-
|
|
12589
|
+
assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
|
|
12659
12590
|
}
|
|
12660
12591
|
}
|
|
12661
12592
|
if (typeof maxTotal === "number") {
|
|
12662
12593
|
if (total <= maxTotal) {
|
|
12663
|
-
|
|
12594
|
+
assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
|
|
12664
12595
|
} else {
|
|
12665
|
-
|
|
12596
|
+
assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
|
|
12666
12597
|
}
|
|
12667
12598
|
}
|
|
12668
|
-
const passed =
|
|
12599
|
+
const passed = assertions.every((a) => a.passed);
|
|
12669
12600
|
return {
|
|
12670
12601
|
score: passed ? 1 : 0,
|
|
12671
12602
|
verdict: passed ? "pass" : "fail",
|
|
12672
|
-
|
|
12673
|
-
misses,
|
|
12603
|
+
assertions,
|
|
12674
12604
|
expectedAspectCount,
|
|
12675
|
-
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
12676
12605
|
evaluatorRawRequest: {
|
|
12677
12606
|
type: "token-usage",
|
|
12678
12607
|
max_total: maxTotal ?? null,
|
|
@@ -12772,8 +12701,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12772
12701
|
return {
|
|
12773
12702
|
score: 0,
|
|
12774
12703
|
verdict: "fail",
|
|
12775
|
-
|
|
12776
|
-
misses: ["No trace available for evaluation"],
|
|
12704
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
12777
12705
|
expectedAspectCount: 1
|
|
12778
12706
|
};
|
|
12779
12707
|
}
|
|
@@ -12784,8 +12712,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12784
12712
|
return {
|
|
12785
12713
|
score: 0,
|
|
12786
12714
|
verdict: "fail",
|
|
12787
|
-
|
|
12788
|
-
misses: ["No trace available for evaluation"],
|
|
12715
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
12789
12716
|
expectedAspectCount: 1
|
|
12790
12717
|
};
|
|
12791
12718
|
}
|
|
@@ -12803,8 +12730,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12803
12730
|
return {
|
|
12804
12731
|
score: 0,
|
|
12805
12732
|
verdict: "fail",
|
|
12806
|
-
|
|
12807
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
12733
|
+
assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
|
|
12808
12734
|
expectedAspectCount: 1
|
|
12809
12735
|
};
|
|
12810
12736
|
}
|
|
@@ -12853,28 +12779,32 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12853
12779
|
return {
|
|
12854
12780
|
score: 1,
|
|
12855
12781
|
verdict: "pass",
|
|
12856
|
-
|
|
12857
|
-
misses: [],
|
|
12782
|
+
assertions: [{ text: "No tool requirements specified", passed: true }],
|
|
12858
12783
|
expectedAspectCount: 0
|
|
12859
12784
|
};
|
|
12860
12785
|
}
|
|
12861
|
-
const
|
|
12862
|
-
const misses = [];
|
|
12786
|
+
const assertions = [];
|
|
12863
12787
|
for (const toolName of toolNames) {
|
|
12864
12788
|
const required = minimums[toolName];
|
|
12865
12789
|
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
12866
12790
|
if (actual >= required) {
|
|
12867
|
-
|
|
12791
|
+
assertions.push({
|
|
12792
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
12793
|
+
passed: true
|
|
12794
|
+
});
|
|
12868
12795
|
} else {
|
|
12869
|
-
|
|
12796
|
+
assertions.push({
|
|
12797
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
12798
|
+
passed: false
|
|
12799
|
+
});
|
|
12870
12800
|
}
|
|
12871
12801
|
}
|
|
12872
|
-
const
|
|
12802
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
12803
|
+
const score = passedCount / toolNames.length;
|
|
12873
12804
|
return {
|
|
12874
12805
|
score,
|
|
12875
12806
|
verdict: scoreToVerdict(score),
|
|
12876
|
-
|
|
12877
|
-
misses,
|
|
12807
|
+
assertions,
|
|
12878
12808
|
expectedAspectCount: toolNames.length
|
|
12879
12809
|
};
|
|
12880
12810
|
}
|
|
@@ -12884,13 +12814,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12884
12814
|
return {
|
|
12885
12815
|
score: 1,
|
|
12886
12816
|
verdict: "pass",
|
|
12887
|
-
|
|
12888
|
-
misses: [],
|
|
12817
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
12889
12818
|
expectedAspectCount: 0
|
|
12890
12819
|
};
|
|
12891
12820
|
}
|
|
12892
|
-
const
|
|
12893
|
-
const misses = [];
|
|
12821
|
+
const assertions = [];
|
|
12894
12822
|
const warnings = [];
|
|
12895
12823
|
let actualIndex = 0;
|
|
12896
12824
|
let sequenceHits = 0;
|
|
@@ -12910,16 +12838,20 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12910
12838
|
const actualCall = toolCalls[actualIndex];
|
|
12911
12839
|
if (actualCall.name === expectedTool) {
|
|
12912
12840
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
12913
|
-
|
|
12841
|
+
assertions.push({
|
|
12842
|
+
text: `Found ${expectedTool} at position ${actualIndex}`,
|
|
12843
|
+
passed: true
|
|
12844
|
+
});
|
|
12914
12845
|
sequenceHits++;
|
|
12915
12846
|
matchedCall = actualCall;
|
|
12916
12847
|
actualIndex++;
|
|
12917
12848
|
found = true;
|
|
12918
12849
|
break;
|
|
12919
12850
|
}
|
|
12920
|
-
|
|
12921
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch
|
|
12922
|
-
|
|
12851
|
+
assertions.push({
|
|
12852
|
+
text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
|
|
12853
|
+
passed: false
|
|
12854
|
+
});
|
|
12923
12855
|
actualIndex++;
|
|
12924
12856
|
argsMismatch = true;
|
|
12925
12857
|
break;
|
|
@@ -12927,7 +12859,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12927
12859
|
actualIndex++;
|
|
12928
12860
|
}
|
|
12929
12861
|
if (!found && !argsMismatch) {
|
|
12930
|
-
|
|
12862
|
+
assertions.push({
|
|
12863
|
+
text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
|
|
12864
|
+
passed: false
|
|
12865
|
+
});
|
|
12931
12866
|
}
|
|
12932
12867
|
if (found && matchedCall) {
|
|
12933
12868
|
const latencyResult = checkLatency(
|
|
@@ -12936,10 +12871,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12936
12871
|
matchedCall.durationMs
|
|
12937
12872
|
);
|
|
12938
12873
|
if (latencyResult.status === "pass") {
|
|
12939
|
-
|
|
12874
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
12940
12875
|
latencyHits++;
|
|
12941
12876
|
} else if (latencyResult.status === "fail") {
|
|
12942
|
-
|
|
12877
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
12943
12878
|
} else if (latencyResult.message) {
|
|
12944
12879
|
warnings.push(latencyResult.message);
|
|
12945
12880
|
latencySkips++;
|
|
@@ -12955,8 +12890,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12955
12890
|
return {
|
|
12956
12891
|
score,
|
|
12957
12892
|
verdict: scoreToVerdict(score),
|
|
12958
|
-
|
|
12959
|
-
misses,
|
|
12893
|
+
assertions,
|
|
12960
12894
|
expectedAspectCount: totalAssertions
|
|
12961
12895
|
};
|
|
12962
12896
|
}
|
|
@@ -12966,13 +12900,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12966
12900
|
return {
|
|
12967
12901
|
score: 1,
|
|
12968
12902
|
verdict: "pass",
|
|
12969
|
-
|
|
12970
|
-
misses: [],
|
|
12903
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
12971
12904
|
expectedAspectCount: 0
|
|
12972
12905
|
};
|
|
12973
12906
|
}
|
|
12974
|
-
const
|
|
12975
|
-
const misses = [];
|
|
12907
|
+
const assertions = [];
|
|
12976
12908
|
const warnings = [];
|
|
12977
12909
|
let sequenceHits = 0;
|
|
12978
12910
|
let latencyHits = 0;
|
|
@@ -12981,7 +12913,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12981
12913
|
(item) => item.maxDurationMs !== void 0
|
|
12982
12914
|
).length;
|
|
12983
12915
|
if (toolCalls.length !== expected.length) {
|
|
12984
|
-
|
|
12916
|
+
assertions.push({
|
|
12917
|
+
text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
|
|
12918
|
+
passed: false
|
|
12919
|
+
});
|
|
12985
12920
|
}
|
|
12986
12921
|
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
12987
12922
|
for (let i = 0; i < checkLength; i++) {
|
|
@@ -12993,14 +12928,17 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12993
12928
|
let sequenceMatched = false;
|
|
12994
12929
|
if (actualTool === expectedTool) {
|
|
12995
12930
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
12996
|
-
|
|
12931
|
+
assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
|
|
12997
12932
|
sequenceHits++;
|
|
12998
12933
|
sequenceMatched = true;
|
|
12999
12934
|
} else {
|
|
13000
|
-
|
|
12935
|
+
assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
|
|
13001
12936
|
}
|
|
13002
12937
|
} else {
|
|
13003
|
-
|
|
12938
|
+
assertions.push({
|
|
12939
|
+
text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
|
|
12940
|
+
passed: false
|
|
12941
|
+
});
|
|
13004
12942
|
}
|
|
13005
12943
|
if (sequenceMatched) {
|
|
13006
12944
|
const latencyResult = checkLatency(
|
|
@@ -13009,10 +12947,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13009
12947
|
actualCall.durationMs
|
|
13010
12948
|
);
|
|
13011
12949
|
if (latencyResult.status === "pass") {
|
|
13012
|
-
|
|
12950
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
13013
12951
|
latencyHits++;
|
|
13014
12952
|
} else if (latencyResult.status === "fail") {
|
|
13015
|
-
|
|
12953
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
13016
12954
|
} else if (latencyResult.message) {
|
|
13017
12955
|
warnings.push(latencyResult.message);
|
|
13018
12956
|
latencySkips++;
|
|
@@ -13020,7 +12958,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13020
12958
|
}
|
|
13021
12959
|
}
|
|
13022
12960
|
for (let i = checkLength; i < expected.length; i++) {
|
|
13023
|
-
|
|
12961
|
+
assertions.push({
|
|
12962
|
+
text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
|
|
12963
|
+
passed: false
|
|
12964
|
+
});
|
|
13024
12965
|
}
|
|
13025
12966
|
for (const warning of warnings) {
|
|
13026
12967
|
console.warn(`[tool-trajectory] ${warning}`);
|
|
@@ -13031,8 +12972,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13031
12972
|
return {
|
|
13032
12973
|
score,
|
|
13033
12974
|
verdict: scoreToVerdict(score),
|
|
13034
|
-
|
|
13035
|
-
misses,
|
|
12975
|
+
assertions,
|
|
13036
12976
|
expectedAspectCount: totalAssertions
|
|
13037
12977
|
};
|
|
13038
12978
|
}
|
|
@@ -13047,13 +12987,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13047
12987
|
return {
|
|
13048
12988
|
score: 1,
|
|
13049
12989
|
verdict: "pass",
|
|
13050
|
-
|
|
13051
|
-
misses: [],
|
|
12990
|
+
assertions: [{ text: "No expected tools specified", passed: true }],
|
|
13052
12991
|
expectedAspectCount: 0
|
|
13053
12992
|
};
|
|
13054
12993
|
}
|
|
13055
|
-
const
|
|
13056
|
-
const misses = [];
|
|
12994
|
+
const assertions = [];
|
|
13057
12995
|
const consumed = /* @__PURE__ */ new Set();
|
|
13058
12996
|
for (let i = 0; i < expected.length; i++) {
|
|
13059
12997
|
const expectedItem = expected[i];
|
|
@@ -13064,22 +13002,25 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13064
13002
|
if (consumed.has(j)) continue;
|
|
13065
13003
|
const actualCall = toolCalls[j];
|
|
13066
13004
|
if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
13067
|
-
|
|
13005
|
+
assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
|
|
13068
13006
|
consumed.add(j);
|
|
13069
13007
|
found = true;
|
|
13070
13008
|
break;
|
|
13071
13009
|
}
|
|
13072
13010
|
}
|
|
13073
13011
|
if (!found) {
|
|
13074
|
-
|
|
13012
|
+
assertions.push({
|
|
13013
|
+
text: `Expected ${expectedTool} not found in actual trajectory`,
|
|
13014
|
+
passed: false
|
|
13015
|
+
});
|
|
13075
13016
|
}
|
|
13076
13017
|
}
|
|
13077
|
-
const
|
|
13018
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
13019
|
+
const score = expected.length > 0 ? passedCount / expected.length : 1;
|
|
13078
13020
|
return {
|
|
13079
13021
|
score,
|
|
13080
13022
|
verdict: scoreToVerdict(score),
|
|
13081
|
-
|
|
13082
|
-
misses,
|
|
13023
|
+
assertions,
|
|
13083
13024
|
expectedAspectCount: expected.length
|
|
13084
13025
|
};
|
|
13085
13026
|
}
|
|
@@ -13095,16 +13036,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13095
13036
|
return {
|
|
13096
13037
|
score: 1,
|
|
13097
13038
|
verdict: "pass",
|
|
13098
|
-
|
|
13099
|
-
misses: [],
|
|
13039
|
+
assertions: [{ text: "No tool calls and no expected tools", passed: true }],
|
|
13100
13040
|
expectedAspectCount: 0
|
|
13101
13041
|
};
|
|
13102
13042
|
}
|
|
13103
13043
|
return {
|
|
13104
13044
|
score: 0,
|
|
13105
13045
|
verdict: "fail",
|
|
13106
|
-
|
|
13107
|
-
|
|
13046
|
+
assertions: [
|
|
13047
|
+
{
|
|
13048
|
+
text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
|
|
13049
|
+
passed: false
|
|
13050
|
+
}
|
|
13051
|
+
],
|
|
13108
13052
|
expectedAspectCount: toolCalls.length
|
|
13109
13053
|
};
|
|
13110
13054
|
}
|
|
@@ -13112,13 +13056,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13112
13056
|
return {
|
|
13113
13057
|
score: 1,
|
|
13114
13058
|
verdict: "pass",
|
|
13115
|
-
|
|
13116
|
-
misses: [],
|
|
13059
|
+
assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
|
|
13117
13060
|
expectedAspectCount: 0
|
|
13118
13061
|
};
|
|
13119
13062
|
}
|
|
13120
|
-
const
|
|
13121
|
-
const misses = [];
|
|
13063
|
+
const assertions = [];
|
|
13122
13064
|
for (let i = 0; i < toolCalls.length; i++) {
|
|
13123
13065
|
const actualCall = toolCalls[i];
|
|
13124
13066
|
let allowed = false;
|
|
@@ -13130,17 +13072,23 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13130
13072
|
}
|
|
13131
13073
|
}
|
|
13132
13074
|
if (allowed) {
|
|
13133
|
-
|
|
13075
|
+
assertions.push({
|
|
13076
|
+
text: `Position ${i}: ${actualCall.name} is in allowed set`,
|
|
13077
|
+
passed: true
|
|
13078
|
+
});
|
|
13134
13079
|
} else {
|
|
13135
|
-
|
|
13080
|
+
assertions.push({
|
|
13081
|
+
text: `Position ${i}: ${actualCall.name} is not in allowed set`,
|
|
13082
|
+
passed: false
|
|
13083
|
+
});
|
|
13136
13084
|
}
|
|
13137
13085
|
}
|
|
13138
|
-
const
|
|
13086
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
13087
|
+
const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
|
|
13139
13088
|
return {
|
|
13140
13089
|
score,
|
|
13141
13090
|
verdict: scoreToVerdict(score),
|
|
13142
|
-
|
|
13143
|
-
misses,
|
|
13091
|
+
assertions,
|
|
13144
13092
|
expectedAspectCount: toolCalls.length
|
|
13145
13093
|
};
|
|
13146
13094
|
}
|
|
@@ -13151,8 +13099,12 @@ function runContainsAssertion(output, value) {
|
|
|
13151
13099
|
const passed = output.includes(value);
|
|
13152
13100
|
return {
|
|
13153
13101
|
score: passed ? 1 : 0,
|
|
13154
|
-
|
|
13155
|
-
|
|
13102
|
+
assertions: [
|
|
13103
|
+
{
|
|
13104
|
+
text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
|
|
13105
|
+
passed
|
|
13106
|
+
}
|
|
13107
|
+
]
|
|
13156
13108
|
};
|
|
13157
13109
|
}
|
|
13158
13110
|
function runContainsAnyAssertion(output, values) {
|
|
@@ -13160,8 +13112,12 @@ function runContainsAnyAssertion(output, values) {
|
|
|
13160
13112
|
const passed = matched.length > 0;
|
|
13161
13113
|
return {
|
|
13162
13114
|
score: passed ? 1 : 0,
|
|
13163
|
-
|
|
13164
|
-
|
|
13115
|
+
assertions: [
|
|
13116
|
+
{
|
|
13117
|
+
text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
|
|
13118
|
+
passed
|
|
13119
|
+
}
|
|
13120
|
+
]
|
|
13165
13121
|
};
|
|
13166
13122
|
}
|
|
13167
13123
|
function runContainsAllAssertion(output, values) {
|
|
@@ -13169,16 +13125,24 @@ function runContainsAllAssertion(output, values) {
|
|
|
13169
13125
|
const passed = missing.length === 0;
|
|
13170
13126
|
return {
|
|
13171
13127
|
score: passed ? 1 : 0,
|
|
13172
|
-
|
|
13173
|
-
|
|
13128
|
+
assertions: [
|
|
13129
|
+
{
|
|
13130
|
+
text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
13131
|
+
passed
|
|
13132
|
+
}
|
|
13133
|
+
]
|
|
13174
13134
|
};
|
|
13175
13135
|
}
|
|
13176
13136
|
function runIcontainsAssertion(output, value) {
|
|
13177
13137
|
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
13178
13138
|
return {
|
|
13179
13139
|
score: passed ? 1 : 0,
|
|
13180
|
-
|
|
13181
|
-
|
|
13140
|
+
assertions: [
|
|
13141
|
+
{
|
|
13142
|
+
text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
|
|
13143
|
+
passed
|
|
13144
|
+
}
|
|
13145
|
+
]
|
|
13182
13146
|
};
|
|
13183
13147
|
}
|
|
13184
13148
|
function runIcontainsAnyAssertion(output, values) {
|
|
@@ -13187,9 +13151,11 @@ function runIcontainsAnyAssertion(output, values) {
|
|
|
13187
13151
|
const passed = matched.length > 0;
|
|
13188
13152
|
return {
|
|
13189
13153
|
score: passed ? 1 : 0,
|
|
13190
|
-
|
|
13191
|
-
|
|
13192
|
-
|
|
13154
|
+
assertions: [
|
|
13155
|
+
{
|
|
13156
|
+
text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
|
|
13157
|
+
passed
|
|
13158
|
+
}
|
|
13193
13159
|
]
|
|
13194
13160
|
};
|
|
13195
13161
|
}
|
|
@@ -13199,24 +13165,36 @@ function runIcontainsAllAssertion(output, values) {
|
|
|
13199
13165
|
const passed = missing.length === 0;
|
|
13200
13166
|
return {
|
|
13201
13167
|
score: passed ? 1 : 0,
|
|
13202
|
-
|
|
13203
|
-
|
|
13168
|
+
assertions: [
|
|
13169
|
+
{
|
|
13170
|
+
text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
13171
|
+
passed
|
|
13172
|
+
}
|
|
13173
|
+
]
|
|
13204
13174
|
};
|
|
13205
13175
|
}
|
|
13206
13176
|
function runStartsWithAssertion(output, value) {
|
|
13207
13177
|
const passed = output.trim().startsWith(value.trim());
|
|
13208
13178
|
return {
|
|
13209
13179
|
score: passed ? 1 : 0,
|
|
13210
|
-
|
|
13211
|
-
|
|
13180
|
+
assertions: [
|
|
13181
|
+
{
|
|
13182
|
+
text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
|
|
13183
|
+
passed
|
|
13184
|
+
}
|
|
13185
|
+
]
|
|
13212
13186
|
};
|
|
13213
13187
|
}
|
|
13214
13188
|
function runEndsWithAssertion(output, value) {
|
|
13215
13189
|
const passed = output.trim().endsWith(value.trim());
|
|
13216
13190
|
return {
|
|
13217
13191
|
score: passed ? 1 : 0,
|
|
13218
|
-
|
|
13219
|
-
|
|
13192
|
+
assertions: [
|
|
13193
|
+
{
|
|
13194
|
+
text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
|
|
13195
|
+
passed
|
|
13196
|
+
}
|
|
13197
|
+
]
|
|
13220
13198
|
};
|
|
13221
13199
|
}
|
|
13222
13200
|
function runRegexAssertion(output, pattern, flags) {
|
|
@@ -13225,8 +13203,12 @@ function runRegexAssertion(output, pattern, flags) {
|
|
|
13225
13203
|
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
13226
13204
|
return {
|
|
13227
13205
|
score: passed ? 1 : 0,
|
|
13228
|
-
|
|
13229
|
-
|
|
13206
|
+
assertions: [
|
|
13207
|
+
{
|
|
13208
|
+
text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
|
|
13209
|
+
passed
|
|
13210
|
+
}
|
|
13211
|
+
]
|
|
13230
13212
|
};
|
|
13231
13213
|
}
|
|
13232
13214
|
function runIsJsonAssertion(output) {
|
|
@@ -13238,16 +13220,24 @@ function runIsJsonAssertion(output) {
|
|
|
13238
13220
|
}
|
|
13239
13221
|
return {
|
|
13240
13222
|
score: passed ? 1 : 0,
|
|
13241
|
-
|
|
13242
|
-
|
|
13223
|
+
assertions: [
|
|
13224
|
+
{
|
|
13225
|
+
text: passed ? "Output is valid JSON" : "Output is not valid JSON",
|
|
13226
|
+
passed
|
|
13227
|
+
}
|
|
13228
|
+
]
|
|
13243
13229
|
};
|
|
13244
13230
|
}
|
|
13245
13231
|
function runEqualsAssertion(output, value) {
|
|
13246
13232
|
const passed = output.trim() === value.trim();
|
|
13247
13233
|
return {
|
|
13248
13234
|
score: passed ? 1 : 0,
|
|
13249
|
-
|
|
13250
|
-
|
|
13235
|
+
assertions: [
|
|
13236
|
+
{
|
|
13237
|
+
text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
|
|
13238
|
+
passed
|
|
13239
|
+
}
|
|
13240
|
+
]
|
|
13251
13241
|
};
|
|
13252
13242
|
}
|
|
13253
13243
|
|
|
@@ -13460,10 +13450,8 @@ var InlineAssertEvaluator = class {
|
|
|
13460
13450
|
return {
|
|
13461
13451
|
score,
|
|
13462
13452
|
verdict: scoreToVerdict(score),
|
|
13463
|
-
|
|
13464
|
-
misses: score < 0.5 ? [result.name] : [],
|
|
13453
|
+
assertions: [{ text: result.name, passed: score >= 0.5 }],
|
|
13465
13454
|
expectedAspectCount: 1,
|
|
13466
|
-
reasoning: void 0,
|
|
13467
13455
|
details: result.metadata ? result.metadata : void 0
|
|
13468
13456
|
};
|
|
13469
13457
|
}
|
|
@@ -13656,9 +13644,7 @@ var containsFactory = (config) => {
|
|
|
13656
13644
|
return {
|
|
13657
13645
|
score: result.score,
|
|
13658
13646
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13659
|
-
|
|
13660
|
-
misses: result.misses,
|
|
13661
|
-
reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
|
|
13647
|
+
assertions: result.assertions,
|
|
13662
13648
|
expectedAspectCount: 1
|
|
13663
13649
|
};
|
|
13664
13650
|
});
|
|
@@ -13670,9 +13656,7 @@ var regexFactory = (config) => {
|
|
|
13670
13656
|
return {
|
|
13671
13657
|
score: result.score,
|
|
13672
13658
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13673
|
-
|
|
13674
|
-
misses: result.misses,
|
|
13675
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
13659
|
+
assertions: result.assertions,
|
|
13676
13660
|
expectedAspectCount: 1
|
|
13677
13661
|
};
|
|
13678
13662
|
});
|
|
@@ -13683,9 +13667,7 @@ var isJsonFactory = () => {
|
|
|
13683
13667
|
return {
|
|
13684
13668
|
score: result.score,
|
|
13685
13669
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13686
|
-
|
|
13687
|
-
misses: result.misses,
|
|
13688
|
-
reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
|
|
13670
|
+
assertions: result.assertions,
|
|
13689
13671
|
expectedAspectCount: 1
|
|
13690
13672
|
};
|
|
13691
13673
|
});
|
|
@@ -13697,9 +13679,7 @@ var equalsFactory = (config) => {
|
|
|
13697
13679
|
return {
|
|
13698
13680
|
score: result.score,
|
|
13699
13681
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13700
|
-
|
|
13701
|
-
misses: result.misses,
|
|
13702
|
-
reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
|
|
13682
|
+
assertions: result.assertions,
|
|
13703
13683
|
expectedAspectCount: 1
|
|
13704
13684
|
};
|
|
13705
13685
|
});
|
|
@@ -13711,9 +13691,7 @@ var containsAnyFactory = (config) => {
|
|
|
13711
13691
|
return {
|
|
13712
13692
|
score: result.score,
|
|
13713
13693
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13714
|
-
|
|
13715
|
-
misses: result.misses,
|
|
13716
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13694
|
+
assertions: result.assertions,
|
|
13717
13695
|
expectedAspectCount: 1
|
|
13718
13696
|
};
|
|
13719
13697
|
});
|
|
@@ -13725,9 +13703,7 @@ var containsAllFactory = (config) => {
|
|
|
13725
13703
|
return {
|
|
13726
13704
|
score: result.score,
|
|
13727
13705
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13728
|
-
|
|
13729
|
-
misses: result.misses,
|
|
13730
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13706
|
+
assertions: result.assertions,
|
|
13731
13707
|
expectedAspectCount: 1
|
|
13732
13708
|
};
|
|
13733
13709
|
});
|
|
@@ -13739,9 +13715,7 @@ var icontainsFactory = (config) => {
|
|
|
13739
13715
|
return {
|
|
13740
13716
|
score: result.score,
|
|
13741
13717
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13742
|
-
|
|
13743
|
-
misses: result.misses,
|
|
13744
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13718
|
+
assertions: result.assertions,
|
|
13745
13719
|
expectedAspectCount: 1
|
|
13746
13720
|
};
|
|
13747
13721
|
});
|
|
@@ -13753,9 +13727,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
13753
13727
|
return {
|
|
13754
13728
|
score: result.score,
|
|
13755
13729
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13756
|
-
|
|
13757
|
-
misses: result.misses,
|
|
13758
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13730
|
+
assertions: result.assertions,
|
|
13759
13731
|
expectedAspectCount: 1
|
|
13760
13732
|
};
|
|
13761
13733
|
});
|
|
@@ -13767,9 +13739,7 @@ var icontainsAllFactory = (config) => {
|
|
|
13767
13739
|
return {
|
|
13768
13740
|
score: result.score,
|
|
13769
13741
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13770
|
-
|
|
13771
|
-
misses: result.misses,
|
|
13772
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13742
|
+
assertions: result.assertions,
|
|
13773
13743
|
expectedAspectCount: 1
|
|
13774
13744
|
};
|
|
13775
13745
|
});
|
|
@@ -13781,9 +13751,7 @@ var startsWithFactory = (config) => {
|
|
|
13781
13751
|
return {
|
|
13782
13752
|
score: result.score,
|
|
13783
13753
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13784
|
-
|
|
13785
|
-
misses: result.misses,
|
|
13786
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13754
|
+
assertions: result.assertions,
|
|
13787
13755
|
expectedAspectCount: 1
|
|
13788
13756
|
};
|
|
13789
13757
|
});
|
|
@@ -13795,9 +13763,7 @@ var endsWithFactory = (config) => {
|
|
|
13795
13763
|
return {
|
|
13796
13764
|
score: result.score,
|
|
13797
13765
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13798
|
-
|
|
13799
|
-
misses: result.misses,
|
|
13800
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13766
|
+
assertions: result.assertions,
|
|
13801
13767
|
expectedAspectCount: 1
|
|
13802
13768
|
};
|
|
13803
13769
|
});
|
|
@@ -14868,7 +14834,7 @@ async function runEvaluation(options) {
|
|
|
14868
14834
|
if (!cliModel) {
|
|
14869
14835
|
throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
|
|
14870
14836
|
}
|
|
14871
|
-
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-
|
|
14837
|
+
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-NFFLXG5M.js");
|
|
14872
14838
|
return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
|
|
14873
14839
|
}
|
|
14874
14840
|
const overrideTarget = resolveTargetByName(cliGraderTarget);
|
|
@@ -15203,8 +15169,7 @@ async function runEvaluation(options) {
|
|
|
15203
15169
|
testId: evalCase.id,
|
|
15204
15170
|
dataset: evalCase.dataset,
|
|
15205
15171
|
score: 0,
|
|
15206
|
-
|
|
15207
|
-
misses: [],
|
|
15172
|
+
assertions: [],
|
|
15208
15173
|
answer: "",
|
|
15209
15174
|
target: target.name,
|
|
15210
15175
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
@@ -15240,8 +15205,7 @@ async function runEvaluation(options) {
|
|
|
15240
15205
|
testId: evalCase.id,
|
|
15241
15206
|
dataset: evalCase.dataset,
|
|
15242
15207
|
score: 0,
|
|
15243
|
-
|
|
15244
|
-
misses: [],
|
|
15208
|
+
assertions: [],
|
|
15245
15209
|
answer: "",
|
|
15246
15210
|
target: target.name,
|
|
15247
15211
|
error: errorMsg,
|
|
@@ -16208,11 +16172,9 @@ async function evaluateCandidate(options) {
|
|
|
16208
16172
|
dataset: evalCase.dataset,
|
|
16209
16173
|
conversationId: evalCase.conversation_id,
|
|
16210
16174
|
score: score.score,
|
|
16211
|
-
|
|
16212
|
-
misses: score.misses,
|
|
16175
|
+
assertions: score.assertions,
|
|
16213
16176
|
answer: candidate,
|
|
16214
16177
|
target: target.name,
|
|
16215
|
-
reasoning: score.reasoning,
|
|
16216
16178
|
tokenUsage,
|
|
16217
16179
|
costUsd,
|
|
16218
16180
|
durationMs,
|
|
@@ -16386,9 +16348,7 @@ async function runEvaluatorList(options) {
|
|
|
16386
16348
|
score: score2.score,
|
|
16387
16349
|
weight,
|
|
16388
16350
|
verdict: score2.verdict,
|
|
16389
|
-
|
|
16390
|
-
misses: score2.misses,
|
|
16391
|
-
reasoning: score2.reasoning,
|
|
16351
|
+
assertions: score2.assertions,
|
|
16392
16352
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
16393
16353
|
details: score2.details,
|
|
16394
16354
|
scores: mapChildResults(score2.scores),
|
|
@@ -16403,10 +16363,10 @@ async function runEvaluatorList(options) {
|
|
|
16403
16363
|
const fallbackScore = {
|
|
16404
16364
|
score: 0,
|
|
16405
16365
|
verdict: "fail",
|
|
16406
|
-
|
|
16407
|
-
|
|
16408
|
-
|
|
16409
|
-
|
|
16366
|
+
assertions: [
|
|
16367
|
+
{ text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
|
|
16368
|
+
],
|
|
16369
|
+
expectedAspectCount: 1
|
|
16410
16370
|
};
|
|
16411
16371
|
const weight = evaluatorConfig.weight ?? 1;
|
|
16412
16372
|
scored.push({
|
|
@@ -16422,9 +16382,12 @@ async function runEvaluatorList(options) {
|
|
|
16422
16382
|
score: 0,
|
|
16423
16383
|
weight,
|
|
16424
16384
|
verdict: "fail",
|
|
16425
|
-
|
|
16426
|
-
|
|
16427
|
-
|
|
16385
|
+
assertions: [
|
|
16386
|
+
{
|
|
16387
|
+
text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
|
|
16388
|
+
passed: false
|
|
16389
|
+
}
|
|
16390
|
+
],
|
|
16428
16391
|
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
16429
16392
|
startedAt: startedAt.toISOString(),
|
|
16430
16393
|
endedAt: endedAt.toISOString()
|
|
@@ -16440,9 +16403,7 @@ async function runEvaluatorList(options) {
|
|
|
16440
16403
|
...scores[lastScoresIdx],
|
|
16441
16404
|
score: negated.score,
|
|
16442
16405
|
verdict: negated.verdict,
|
|
16443
|
-
|
|
16444
|
-
misses: [...negated.misses],
|
|
16445
|
-
reasoning: negated.reasoning
|
|
16406
|
+
assertions: [...negated.assertions]
|
|
16446
16407
|
};
|
|
16447
16408
|
}
|
|
16448
16409
|
}
|
|
@@ -16457,21 +16418,13 @@ async function runEvaluatorList(options) {
|
|
|
16457
16418
|
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
16458
16419
|
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
16459
16420
|
) : 0;
|
|
16460
|
-
const
|
|
16461
|
-
const
|
|
16462
|
-
const expectedAspectCount = scored.reduce(
|
|
16463
|
-
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
16464
|
-
0
|
|
16465
|
-
);
|
|
16466
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
16467
|
-
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
16421
|
+
const assertions = scored.flatMap((entry) => entry.score.assertions);
|
|
16422
|
+
const expectedAspectCount = assertions.length || 1;
|
|
16468
16423
|
const score = {
|
|
16469
16424
|
score: aggregateScore,
|
|
16470
16425
|
verdict: scoreToVerdict(aggregateScore),
|
|
16471
|
-
|
|
16472
|
-
|
|
16473
|
-
expectedAspectCount,
|
|
16474
|
-
reasoning
|
|
16426
|
+
assertions,
|
|
16427
|
+
expectedAspectCount
|
|
16475
16428
|
};
|
|
16476
16429
|
return { score, scores };
|
|
16477
16430
|
}
|
|
@@ -16575,8 +16528,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
16575
16528
|
dataset: evalCase.dataset,
|
|
16576
16529
|
conversationId: evalCase.conversation_id,
|
|
16577
16530
|
score: 0,
|
|
16578
|
-
|
|
16579
|
-
misses: [`Error: ${message}`],
|
|
16531
|
+
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
16580
16532
|
answer: `Error occurred: ${message}`,
|
|
16581
16533
|
target: targetName,
|
|
16582
16534
|
requests,
|
|
@@ -16686,9 +16638,7 @@ function mapChildResults(children) {
|
|
|
16686
16638
|
score: child.score,
|
|
16687
16639
|
weight: child.weight,
|
|
16688
16640
|
verdict: child.verdict,
|
|
16689
|
-
|
|
16690
|
-
misses: child.misses,
|
|
16691
|
-
reasoning: child.reasoning,
|
|
16641
|
+
assertions: child.assertions,
|
|
16692
16642
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
16693
16643
|
scores: mapChildResults(child.scores),
|
|
16694
16644
|
details: child.details,
|
|
@@ -17653,7 +17603,6 @@ export {
|
|
|
17653
17603
|
freeformEvaluationSchema,
|
|
17654
17604
|
generateRubrics,
|
|
17655
17605
|
getAgentvHome,
|
|
17656
|
-
getHitCount,
|
|
17657
17606
|
getOutputFilenames,
|
|
17658
17607
|
getSubagentsRoot,
|
|
17659
17608
|
getTraceStateRoot,
|