@agentv/core 3.4.0 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-NFFLXG5M.js +7 -0
- package/dist/{chunk-JO4HIAEF.js → chunk-EFR4JHPL.js} +1 -5
- package/dist/chunk-EFR4JHPL.js.map +1 -0
- package/dist/{chunk-Q52FQPKQ.js → chunk-W5YDZWT4.js} +2 -2
- package/dist/chunk-W5YDZWT4.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +382 -436
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +55 -46
- package/dist/index.d.ts +55 -46
- package/dist/index.js +384 -435
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/agentv-provider-HDSAUUEF.js +0 -7
- package/dist/chunk-JO4HIAEF.js.map +0 -1
- package/dist/chunk-Q52FQPKQ.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF.js.map → agentv-provider-NFFLXG5M.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -55,7 +55,7 @@ function createLanguageModel(modelString) {
|
|
|
55
55
|
case "anthropic":
|
|
56
56
|
return (0, import_anthropic.createAnthropic)()(modelName);
|
|
57
57
|
case "azure":
|
|
58
|
-
return (0, import_azure.createAzure)()(modelName);
|
|
58
|
+
return (0, import_azure.createAzure)().chat(modelName);
|
|
59
59
|
case "google":
|
|
60
60
|
return (0, import_google.createGoogleGenerativeAI)()(modelName);
|
|
61
61
|
default:
|
|
@@ -1580,7 +1580,6 @@ __export(index_exports, {
|
|
|
1580
1580
|
freeformEvaluationSchema: () => freeformEvaluationSchema,
|
|
1581
1581
|
generateRubrics: () => generateRubrics,
|
|
1582
1582
|
getAgentvHome: () => getAgentvHome,
|
|
1583
|
-
getHitCount: () => getHitCount,
|
|
1584
1583
|
getOutputFilenames: () => getOutputFilenames,
|
|
1585
1584
|
getSubagentsRoot: () => getSubagentsRoot,
|
|
1586
1585
|
getTraceStateRoot: () => getTraceStateRoot,
|
|
@@ -1730,9 +1729,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
|
1730
1729
|
function isEvaluatorKind(value) {
|
|
1731
1730
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
1732
1731
|
}
|
|
1733
|
-
function getHitCount(result) {
|
|
1734
|
-
return result.hits.length;
|
|
1735
|
-
}
|
|
1736
1732
|
|
|
1737
1733
|
// src/evaluation/trace.ts
|
|
1738
1734
|
function computeTraceSummary(messages) {
|
|
@@ -5576,7 +5572,7 @@ var AzureProvider = class {
|
|
|
5576
5572
|
};
|
|
5577
5573
|
this.retryConfig = config.retry;
|
|
5578
5574
|
const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
|
|
5579
|
-
this.model = azure(config.deploymentName);
|
|
5575
|
+
this.model = azure.chat(config.deploymentName);
|
|
5580
5576
|
}
|
|
5581
5577
|
id;
|
|
5582
5578
|
kind = "azure";
|
|
@@ -12807,9 +12803,11 @@ function negateScore(score) {
|
|
|
12807
12803
|
...score,
|
|
12808
12804
|
score: negatedScore,
|
|
12809
12805
|
verdict: negatedVerdict,
|
|
12810
|
-
|
|
12811
|
-
|
|
12812
|
-
|
|
12806
|
+
assertions: score.assertions.map((a) => ({
|
|
12807
|
+
...a,
|
|
12808
|
+
passed: !a.passed,
|
|
12809
|
+
evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
|
|
12810
|
+
}))
|
|
12813
12811
|
};
|
|
12814
12812
|
}
|
|
12815
12813
|
|
|
@@ -13324,9 +13322,13 @@ var CodeEvaluator = class {
|
|
|
13324
13322
|
);
|
|
13325
13323
|
const parsed = parseJsonSafe(stdout);
|
|
13326
13324
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
13327
|
-
const
|
|
13328
|
-
|
|
13329
|
-
|
|
13325
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
13326
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
13327
|
+
).map((a) => ({
|
|
13328
|
+
text: String(a.text),
|
|
13329
|
+
passed: Boolean(a.passed),
|
|
13330
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
13331
|
+
})) : [];
|
|
13330
13332
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
13331
13333
|
const proxyUsage = getProxyUsage?.();
|
|
13332
13334
|
const evaluatorRawRequest = {
|
|
@@ -13342,10 +13344,8 @@ var CodeEvaluator = class {
|
|
|
13342
13344
|
return {
|
|
13343
13345
|
score,
|
|
13344
13346
|
verdict: scoreToVerdict(score),
|
|
13345
|
-
|
|
13346
|
-
|
|
13347
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
13348
|
-
reasoning,
|
|
13347
|
+
assertions,
|
|
13348
|
+
expectedAspectCount: assertions.length || 1,
|
|
13349
13349
|
evaluatorRawRequest,
|
|
13350
13350
|
...details ? { details } : {},
|
|
13351
13351
|
tokenUsage: proxyUsage?.tokenUsage
|
|
@@ -13356,10 +13356,8 @@ var CodeEvaluator = class {
|
|
|
13356
13356
|
return {
|
|
13357
13357
|
score: 0,
|
|
13358
13358
|
verdict: "fail",
|
|
13359
|
-
|
|
13360
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
13359
|
+
assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
|
|
13361
13360
|
expectedAspectCount: 1,
|
|
13362
|
-
reasoning: message,
|
|
13363
13361
|
evaluatorRawRequest: {
|
|
13364
13362
|
command: this.command,
|
|
13365
13363
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
@@ -13499,9 +13497,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
13499
13497
|
{{${TEMPLATE_VARIABLES.ANSWER}}}`;
|
|
13500
13498
|
var freeformEvaluationSchema = import_zod4.z.object({
|
|
13501
13499
|
score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
13502
|
-
|
|
13503
|
-
|
|
13504
|
-
|
|
13500
|
+
assertions: import_zod4.z.array(
|
|
13501
|
+
import_zod4.z.object({
|
|
13502
|
+
text: import_zod4.z.string().describe("Brief description of what was checked"),
|
|
13503
|
+
passed: import_zod4.z.boolean().describe("Whether this aspect was satisfied"),
|
|
13504
|
+
evidence: import_zod4.z.string().describe("Concise evidence (1-2 sentences)").optional()
|
|
13505
|
+
})
|
|
13506
|
+
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
|
|
13505
13507
|
});
|
|
13506
13508
|
var rubricCheckResultSchema = import_zod4.z.object({
|
|
13507
13509
|
id: import_zod4.z.string().describe("The ID of the rubric item being checked"),
|
|
@@ -13603,17 +13605,12 @@ ${context2.fileChanges}`;
|
|
|
13603
13605
|
schema: freeformEvaluationSchema
|
|
13604
13606
|
});
|
|
13605
13607
|
const score = clampScore(data.score);
|
|
13606
|
-
const
|
|
13607
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
13608
|
-
const reasoning = data.reasoning;
|
|
13609
|
-
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
13608
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
13610
13609
|
return {
|
|
13611
13610
|
score,
|
|
13612
13611
|
verdict: scoreToVerdict(score),
|
|
13613
|
-
|
|
13614
|
-
|
|
13615
|
-
expectedAspectCount,
|
|
13616
|
-
reasoning,
|
|
13612
|
+
assertions,
|
|
13613
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13617
13614
|
evaluatorRawRequest,
|
|
13618
13615
|
tokenUsage
|
|
13619
13616
|
};
|
|
@@ -13624,10 +13621,8 @@ ${context2.fileChanges}`;
|
|
|
13624
13621
|
return {
|
|
13625
13622
|
score: 0,
|
|
13626
13623
|
verdict: "skip",
|
|
13627
|
-
|
|
13628
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
13624
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13629
13625
|
expectedAspectCount: 1,
|
|
13630
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
13631
13626
|
evaluatorRawRequest
|
|
13632
13627
|
};
|
|
13633
13628
|
}
|
|
@@ -13657,14 +13652,12 @@ ${context2.fileChanges}`;
|
|
|
13657
13652
|
userPrompt: prompt,
|
|
13658
13653
|
schema: rubricEvaluationSchema
|
|
13659
13654
|
});
|
|
13660
|
-
const { score, verdict,
|
|
13655
|
+
const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
|
|
13661
13656
|
return {
|
|
13662
13657
|
score,
|
|
13663
13658
|
verdict,
|
|
13664
|
-
|
|
13665
|
-
misses,
|
|
13659
|
+
assertions,
|
|
13666
13660
|
expectedAspectCount: rubrics.length,
|
|
13667
|
-
reasoning: data.overall_reasoning,
|
|
13668
13661
|
evaluatorRawRequest,
|
|
13669
13662
|
tokenUsage
|
|
13670
13663
|
};
|
|
@@ -13675,10 +13668,8 @@ ${context2.fileChanges}`;
|
|
|
13675
13668
|
return {
|
|
13676
13669
|
score: 0,
|
|
13677
13670
|
verdict: "skip",
|
|
13678
|
-
|
|
13679
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
13671
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13680
13672
|
expectedAspectCount: rubrics.length,
|
|
13681
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
13682
13673
|
evaluatorRawRequest
|
|
13683
13674
|
};
|
|
13684
13675
|
}
|
|
@@ -13703,14 +13694,12 @@ ${context2.fileChanges}`;
|
|
|
13703
13694
|
userPrompt: prompt,
|
|
13704
13695
|
schema: scoreRangeEvaluationSchema
|
|
13705
13696
|
});
|
|
13706
|
-
const { score, verdict,
|
|
13697
|
+
const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
|
|
13707
13698
|
return {
|
|
13708
13699
|
score,
|
|
13709
13700
|
verdict,
|
|
13710
|
-
|
|
13711
|
-
misses,
|
|
13701
|
+
assertions,
|
|
13712
13702
|
expectedAspectCount: rubrics.length,
|
|
13713
|
-
reasoning: data.overall_reasoning,
|
|
13714
13703
|
evaluatorRawRequest,
|
|
13715
13704
|
details,
|
|
13716
13705
|
tokenUsage
|
|
@@ -13722,10 +13711,8 @@ ${context2.fileChanges}`;
|
|
|
13722
13711
|
return {
|
|
13723
13712
|
score: 0,
|
|
13724
13713
|
verdict: "skip",
|
|
13725
|
-
|
|
13726
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
13714
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13727
13715
|
expectedAspectCount: rubrics.length,
|
|
13728
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
13729
13716
|
evaluatorRawRequest
|
|
13730
13717
|
};
|
|
13731
13718
|
}
|
|
@@ -13782,8 +13769,7 @@ ${context2.fileChanges}`;
|
|
|
13782
13769
|
return {
|
|
13783
13770
|
score: 0,
|
|
13784
13771
|
verdict: "fail",
|
|
13785
|
-
|
|
13786
|
-
misses: [`llm-grader built-in evaluation failed: ${message}`],
|
|
13772
|
+
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
13787
13773
|
expectedAspectCount: 1,
|
|
13788
13774
|
evaluatorRawRequest,
|
|
13789
13775
|
details: { mode: "built-in", error: message }
|
|
@@ -13833,8 +13819,9 @@ ${context2.fileChanges}`;
|
|
|
13833
13819
|
return {
|
|
13834
13820
|
score: 0,
|
|
13835
13821
|
verdict: "fail",
|
|
13836
|
-
|
|
13837
|
-
|
|
13822
|
+
assertions: [
|
|
13823
|
+
{ text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
|
|
13824
|
+
],
|
|
13838
13825
|
expectedAspectCount: 1,
|
|
13839
13826
|
evaluatorRawRequest,
|
|
13840
13827
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
@@ -13852,8 +13839,9 @@ ${context2.fileChanges}`;
|
|
|
13852
13839
|
return {
|
|
13853
13840
|
score: 0,
|
|
13854
13841
|
verdict: "fail",
|
|
13855
|
-
|
|
13856
|
-
|
|
13842
|
+
assertions: [
|
|
13843
|
+
{ text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
|
|
13844
|
+
],
|
|
13857
13845
|
expectedAspectCount: 1,
|
|
13858
13846
|
evaluatorRawRequest,
|
|
13859
13847
|
details: {
|
|
@@ -14005,29 +13993,24 @@ ${outputSchema}`;
|
|
|
14005
13993
|
const parsed = parseJsonFromText(text);
|
|
14006
13994
|
if (rubrics && rubrics.length > 0) {
|
|
14007
13995
|
const data2 = rubricEvaluationSchema.parse(parsed);
|
|
14008
|
-
const { score: score2, verdict,
|
|
13996
|
+
const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
|
|
14009
13997
|
return {
|
|
14010
13998
|
score: score2,
|
|
14011
13999
|
verdict,
|
|
14012
|
-
|
|
14013
|
-
misses: misses2,
|
|
14000
|
+
assertions: assertions2,
|
|
14014
14001
|
expectedAspectCount: rubrics.length,
|
|
14015
|
-
reasoning: data2.overall_reasoning,
|
|
14016
14002
|
evaluatorRawRequest,
|
|
14017
14003
|
details
|
|
14018
14004
|
};
|
|
14019
14005
|
}
|
|
14020
14006
|
const data = freeformEvaluationSchema.parse(parsed);
|
|
14021
14007
|
const score = clampScore(data.score);
|
|
14022
|
-
const
|
|
14023
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
14008
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
14024
14009
|
return {
|
|
14025
14010
|
score,
|
|
14026
14011
|
verdict: scoreToVerdict(score),
|
|
14027
|
-
|
|
14028
|
-
|
|
14029
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
14030
|
-
reasoning: data.reasoning,
|
|
14012
|
+
assertions,
|
|
14013
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
14031
14014
|
evaluatorRawRequest,
|
|
14032
14015
|
details
|
|
14033
14016
|
};
|
|
@@ -14035,8 +14018,12 @@ ${outputSchema}`;
|
|
|
14035
14018
|
return {
|
|
14036
14019
|
score: 0,
|
|
14037
14020
|
verdict: "fail",
|
|
14038
|
-
|
|
14039
|
-
|
|
14021
|
+
assertions: [
|
|
14022
|
+
{
|
|
14023
|
+
text: "Failed to parse llm-grader agent response as valid evaluation JSON",
|
|
14024
|
+
passed: false
|
|
14025
|
+
}
|
|
14026
|
+
],
|
|
14040
14027
|
expectedAspectCount: 1,
|
|
14041
14028
|
evaluatorRawRequest,
|
|
14042
14029
|
details
|
|
@@ -14165,9 +14152,13 @@ function buildOutputSchema() {
|
|
|
14165
14152
|
"",
|
|
14166
14153
|
"{",
|
|
14167
14154
|
' "score": <number between 0.0 and 1.0>,',
|
|
14168
|
-
' "
|
|
14169
|
-
|
|
14170
|
-
'
|
|
14155
|
+
' "assertions": [',
|
|
14156
|
+
" {",
|
|
14157
|
+
' "text": "<brief description of what was checked>",',
|
|
14158
|
+
' "passed": <boolean>,',
|
|
14159
|
+
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
|
|
14160
|
+
" }",
|
|
14161
|
+
" ]",
|
|
14171
14162
|
"}"
|
|
14172
14163
|
].join("\n");
|
|
14173
14164
|
}
|
|
@@ -14192,8 +14183,7 @@ function substituteVariables(template, variables) {
|
|
|
14192
14183
|
}
|
|
14193
14184
|
function calculateRubricScore(result, rubrics) {
|
|
14194
14185
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
14195
|
-
const
|
|
14196
|
-
const misses = [];
|
|
14186
|
+
const assertions = [];
|
|
14197
14187
|
let totalWeight = 0;
|
|
14198
14188
|
let earnedWeight = 0;
|
|
14199
14189
|
let failedRequired = false;
|
|
@@ -14203,19 +14193,20 @@ function calculateRubricScore(result, rubrics) {
|
|
|
14203
14193
|
continue;
|
|
14204
14194
|
}
|
|
14205
14195
|
totalWeight += rubric.weight;
|
|
14196
|
+
assertions.push({
|
|
14197
|
+
text: `[${rubric.id}] ${rubric.outcome}`,
|
|
14198
|
+
passed: check.satisfied,
|
|
14199
|
+
evidence: check.reasoning
|
|
14200
|
+
});
|
|
14206
14201
|
if (check.satisfied) {
|
|
14207
14202
|
earnedWeight += rubric.weight;
|
|
14208
|
-
|
|
14209
|
-
|
|
14210
|
-
misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
|
|
14211
|
-
if (rubric.required) {
|
|
14212
|
-
failedRequired = true;
|
|
14213
|
-
}
|
|
14203
|
+
} else if (rubric.required) {
|
|
14204
|
+
failedRequired = true;
|
|
14214
14205
|
}
|
|
14215
14206
|
}
|
|
14216
14207
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
14217
14208
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
14218
|
-
return { score, verdict,
|
|
14209
|
+
return { score, verdict, assertions };
|
|
14219
14210
|
}
|
|
14220
14211
|
function buildScoreRangeOutputSchema() {
|
|
14221
14212
|
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
@@ -14235,8 +14226,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
|
|
|
14235
14226
|
}
|
|
14236
14227
|
function calculateScoreRangeResult(result, rubrics) {
|
|
14237
14228
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
14238
|
-
const
|
|
14239
|
-
const misses = [];
|
|
14229
|
+
const assertions = [];
|
|
14240
14230
|
const rawScores = {};
|
|
14241
14231
|
let totalWeight = 0;
|
|
14242
14232
|
let weightedScoreSum = 0;
|
|
@@ -14262,24 +14252,22 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
14262
14252
|
);
|
|
14263
14253
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
14264
14254
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
14265
|
-
const
|
|
14266
|
-
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
14255
|
+
const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
|
|
14267
14256
|
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
14268
14257
|
failedRequired = true;
|
|
14269
|
-
misses.push(scoreInfo);
|
|
14270
|
-
} else if (rawScore >= 7) {
|
|
14271
|
-
hits.push(scoreInfo);
|
|
14272
|
-
} else {
|
|
14273
|
-
misses.push(scoreInfo);
|
|
14274
14258
|
}
|
|
14259
|
+
assertions.push({
|
|
14260
|
+
text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
|
|
14261
|
+
passed,
|
|
14262
|
+
evidence: check.reasoning
|
|
14263
|
+
});
|
|
14275
14264
|
}
|
|
14276
14265
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
14277
14266
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
14278
14267
|
return {
|
|
14279
14268
|
score,
|
|
14280
14269
|
verdict,
|
|
14281
|
-
|
|
14282
|
-
misses,
|
|
14270
|
+
assertions,
|
|
14283
14271
|
details: {
|
|
14284
14272
|
raw_scores: rawScores,
|
|
14285
14273
|
normalization: "score / 10",
|
|
@@ -14455,9 +14443,7 @@ var CompositeEvaluator = class {
|
|
|
14455
14443
|
let totalWeight = 0;
|
|
14456
14444
|
let weightedSum = 0;
|
|
14457
14445
|
let evaluatedCount = 0;
|
|
14458
|
-
const
|
|
14459
|
-
const allMisses = [];
|
|
14460
|
-
const reasoningParts = [];
|
|
14446
|
+
const allAssertions = [];
|
|
14461
14447
|
const scores = [];
|
|
14462
14448
|
for (const member of results) {
|
|
14463
14449
|
const weight = weights?.[member.id] ?? 1;
|
|
@@ -14467,9 +14453,7 @@ var CompositeEvaluator = class {
|
|
|
14467
14453
|
score: member.result.score,
|
|
14468
14454
|
weight,
|
|
14469
14455
|
verdict: member.result.verdict,
|
|
14470
|
-
|
|
14471
|
-
misses: [...member.result.misses],
|
|
14472
|
-
reasoning: member.result.reasoning,
|
|
14456
|
+
assertions: [...member.result.assertions],
|
|
14473
14457
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
14474
14458
|
scores: member.result.scores,
|
|
14475
14459
|
details: member.result.details,
|
|
@@ -14481,20 +14465,16 @@ var CompositeEvaluator = class {
|
|
|
14481
14465
|
evaluatedCount++;
|
|
14482
14466
|
totalWeight += weight;
|
|
14483
14467
|
weightedSum += member.result.score * weight;
|
|
14484
|
-
|
|
14485
|
-
|
|
14486
|
-
|
|
14487
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
14488
|
-
}
|
|
14468
|
+
allAssertions.push(
|
|
14469
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
14470
|
+
);
|
|
14489
14471
|
}
|
|
14490
14472
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
14491
14473
|
return {
|
|
14492
14474
|
score: 0,
|
|
14493
14475
|
verdict: "skip",
|
|
14494
|
-
|
|
14495
|
-
misses: [],
|
|
14476
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
14496
14477
|
expectedAspectCount: 1,
|
|
14497
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
14498
14478
|
evaluatorRawRequest: {
|
|
14499
14479
|
aggregator: "weighted_average",
|
|
14500
14480
|
...weights ? { weights } : {}
|
|
@@ -14506,10 +14486,8 @@ var CompositeEvaluator = class {
|
|
|
14506
14486
|
return {
|
|
14507
14487
|
score: clampScore(finalScore),
|
|
14508
14488
|
verdict: scoreToVerdict(finalScore),
|
|
14509
|
-
|
|
14510
|
-
|
|
14511
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
14512
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
14489
|
+
assertions: allAssertions,
|
|
14490
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
14513
14491
|
evaluatorRawRequest: {
|
|
14514
14492
|
aggregator: "weighted_average",
|
|
14515
14493
|
...weights ? { weights } : {}
|
|
@@ -14519,11 +14497,8 @@ var CompositeEvaluator = class {
|
|
|
14519
14497
|
}
|
|
14520
14498
|
runThreshold(results, threshold) {
|
|
14521
14499
|
const scores = [];
|
|
14522
|
-
const
|
|
14523
|
-
const allMisses = [];
|
|
14524
|
-
const reasoningParts = [];
|
|
14500
|
+
const allAssertions = [];
|
|
14525
14501
|
let passingCount = 0;
|
|
14526
|
-
let borderlineCount = 0;
|
|
14527
14502
|
let evaluatedCount = 0;
|
|
14528
14503
|
for (const member of results) {
|
|
14529
14504
|
scores.push({
|
|
@@ -14531,9 +14506,7 @@ var CompositeEvaluator = class {
|
|
|
14531
14506
|
type: member.type,
|
|
14532
14507
|
score: member.result.score,
|
|
14533
14508
|
verdict: member.result.verdict,
|
|
14534
|
-
|
|
14535
|
-
misses: [...member.result.misses],
|
|
14536
|
-
reasoning: member.result.reasoning,
|
|
14509
|
+
assertions: [...member.result.assertions],
|
|
14537
14510
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
14538
14511
|
scores: member.result.scores,
|
|
14539
14512
|
details: member.result.details,
|
|
@@ -14546,24 +14519,17 @@ var CompositeEvaluator = class {
|
|
|
14546
14519
|
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
14547
14520
|
if (isPassing) {
|
|
14548
14521
|
passingCount++;
|
|
14549
|
-
if (member.result.verdict === "borderline") {
|
|
14550
|
-
borderlineCount++;
|
|
14551
|
-
}
|
|
14552
|
-
}
|
|
14553
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
14554
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
14555
|
-
if (member.result.reasoning) {
|
|
14556
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
14557
14522
|
}
|
|
14523
|
+
allAssertions.push(
|
|
14524
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
14525
|
+
);
|
|
14558
14526
|
}
|
|
14559
14527
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
14560
14528
|
return {
|
|
14561
14529
|
score: 0,
|
|
14562
14530
|
verdict: "skip",
|
|
14563
|
-
|
|
14564
|
-
misses: [],
|
|
14531
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
14565
14532
|
expectedAspectCount: 1,
|
|
14566
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
14567
14533
|
evaluatorRawRequest: {
|
|
14568
14534
|
aggregator: "threshold",
|
|
14569
14535
|
threshold
|
|
@@ -14574,19 +14540,15 @@ var CompositeEvaluator = class {
|
|
|
14574
14540
|
const totalCount = evaluatedCount;
|
|
14575
14541
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
14576
14542
|
const pass = score >= threshold;
|
|
14577
|
-
|
|
14578
|
-
|
|
14579
|
-
|
|
14580
|
-
|
|
14581
|
-
`${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
|
|
14582
|
-
);
|
|
14543
|
+
allAssertions.unshift({
|
|
14544
|
+
text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
|
|
14545
|
+
passed: pass
|
|
14546
|
+
});
|
|
14583
14547
|
return {
|
|
14584
14548
|
score: clampScore(score),
|
|
14585
14549
|
verdict: pass ? "pass" : "fail",
|
|
14586
|
-
|
|
14587
|
-
|
|
14588
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
14589
|
-
reasoning: reasoningParts.join("; "),
|
|
14550
|
+
assertions: allAssertions,
|
|
14551
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
14590
14552
|
evaluatorRawRequest: {
|
|
14591
14553
|
aggregator: "threshold",
|
|
14592
14554
|
threshold
|
|
@@ -14603,9 +14565,7 @@ var CompositeEvaluator = class {
|
|
|
14603
14565
|
score: member.result.score,
|
|
14604
14566
|
weight: weights?.[member.id] ?? 1,
|
|
14605
14567
|
verdict: member.result.verdict,
|
|
14606
|
-
|
|
14607
|
-
misses: [...member.result.misses],
|
|
14608
|
-
reasoning: member.result.reasoning,
|
|
14568
|
+
assertions: [...member.result.assertions],
|
|
14609
14569
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
14610
14570
|
scores: member.result.scores,
|
|
14611
14571
|
details: member.result.details
|
|
@@ -14614,17 +14574,19 @@ var CompositeEvaluator = class {
|
|
|
14614
14574
|
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
14615
14575
|
const parsed = parseJsonSafe(stdout);
|
|
14616
14576
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
14617
|
-
const
|
|
14618
|
-
|
|
14619
|
-
|
|
14577
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
14578
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
14579
|
+
).map((a) => ({
|
|
14580
|
+
text: String(a.text),
|
|
14581
|
+
passed: Boolean(a.passed),
|
|
14582
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
14583
|
+
})) : [];
|
|
14620
14584
|
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
14621
14585
|
return {
|
|
14622
14586
|
score,
|
|
14623
14587
|
verdict,
|
|
14624
|
-
|
|
14625
|
-
|
|
14626
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
14627
|
-
reasoning,
|
|
14588
|
+
assertions,
|
|
14589
|
+
expectedAspectCount: assertions.length || 1,
|
|
14628
14590
|
evaluatorRawRequest: {
|
|
14629
14591
|
aggregator: "code-grader",
|
|
14630
14592
|
script: scriptPath
|
|
@@ -14636,10 +14598,8 @@ var CompositeEvaluator = class {
|
|
|
14636
14598
|
return {
|
|
14637
14599
|
score: 0,
|
|
14638
14600
|
verdict: "fail",
|
|
14639
|
-
|
|
14640
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
14601
|
+
assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
|
|
14641
14602
|
expectedAspectCount: 1,
|
|
14642
|
-
reasoning: message,
|
|
14643
14603
|
evaluatorRawRequest: {
|
|
14644
14604
|
aggregator: "code-grader",
|
|
14645
14605
|
script: scriptPath,
|
|
@@ -14661,9 +14621,7 @@ var CompositeEvaluator = class {
|
|
|
14661
14621
|
type: member.type,
|
|
14662
14622
|
score: member.result.score,
|
|
14663
14623
|
verdict: member.result.verdict,
|
|
14664
|
-
|
|
14665
|
-
misses: [...member.result.misses],
|
|
14666
|
-
reasoning: member.result.reasoning,
|
|
14624
|
+
assertions: [...member.result.assertions],
|
|
14667
14625
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
14668
14626
|
scores: member.result.scores,
|
|
14669
14627
|
details: member.result.details
|
|
@@ -14687,16 +14645,12 @@ var CompositeEvaluator = class {
|
|
|
14687
14645
|
});
|
|
14688
14646
|
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
14689
14647
|
const score2 = clampScore(data2.score);
|
|
14690
|
-
const
|
|
14691
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
14692
|
-
const reasoning2 = data2.reasoning;
|
|
14648
|
+
const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
|
|
14693
14649
|
return {
|
|
14694
14650
|
score: score2,
|
|
14695
14651
|
verdict: scoreToVerdict(score2),
|
|
14696
|
-
|
|
14697
|
-
|
|
14698
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
14699
|
-
reasoning: reasoning2,
|
|
14652
|
+
assertions: assertions2,
|
|
14653
|
+
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
14700
14654
|
evaluatorRawRequest,
|
|
14701
14655
|
scores
|
|
14702
14656
|
};
|
|
@@ -14711,16 +14665,12 @@ var CompositeEvaluator = class {
|
|
|
14711
14665
|
parseJsonFromText(extractLastAssistantContent2(response.output))
|
|
14712
14666
|
);
|
|
14713
14667
|
const score = clampScore(data.score);
|
|
14714
|
-
const
|
|
14715
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
14716
|
-
const reasoning = data.reasoning;
|
|
14668
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
14717
14669
|
return {
|
|
14718
14670
|
score,
|
|
14719
14671
|
verdict: scoreToVerdict(score),
|
|
14720
|
-
|
|
14721
|
-
|
|
14722
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
14723
|
-
reasoning,
|
|
14672
|
+
assertions,
|
|
14673
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
14724
14674
|
evaluatorRawRequest,
|
|
14725
14675
|
scores
|
|
14726
14676
|
};
|
|
@@ -14728,8 +14678,7 @@ var CompositeEvaluator = class {
|
|
|
14728
14678
|
return {
|
|
14729
14679
|
score: 0,
|
|
14730
14680
|
verdict: "fail",
|
|
14731
|
-
|
|
14732
|
-
misses: [],
|
|
14681
|
+
assertions: [{ text: "LLM aggregator failed", passed: false }],
|
|
14733
14682
|
expectedAspectCount: 1,
|
|
14734
14683
|
evaluatorRawRequest,
|
|
14735
14684
|
scores
|
|
@@ -14752,10 +14701,8 @@ var CostEvaluator = class {
|
|
|
14752
14701
|
return {
|
|
14753
14702
|
score: 0,
|
|
14754
14703
|
verdict: "fail",
|
|
14755
|
-
|
|
14756
|
-
misses: ["No cost data available in trace"],
|
|
14704
|
+
assertions: [{ text: "No cost data available in trace", passed: false }],
|
|
14757
14705
|
expectedAspectCount: 1,
|
|
14758
|
-
reasoning: "Execution cost not reported by provider",
|
|
14759
14706
|
evaluatorRawRequest: {
|
|
14760
14707
|
type: "cost",
|
|
14761
14708
|
budget,
|
|
@@ -14769,10 +14716,10 @@ var CostEvaluator = class {
|
|
|
14769
14716
|
return {
|
|
14770
14717
|
score,
|
|
14771
14718
|
verdict: passed ? "pass" : "fail",
|
|
14772
|
-
|
|
14773
|
-
|
|
14719
|
+
assertions: [
|
|
14720
|
+
passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
|
|
14721
|
+
],
|
|
14774
14722
|
expectedAspectCount: 1,
|
|
14775
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
14776
14723
|
evaluatorRawRequest: {
|
|
14777
14724
|
type: "cost",
|
|
14778
14725
|
budget,
|
|
@@ -14805,10 +14752,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
14805
14752
|
return {
|
|
14806
14753
|
score: 0,
|
|
14807
14754
|
verdict: "fail",
|
|
14808
|
-
|
|
14809
|
-
misses: ["No trace summary available"],
|
|
14755
|
+
assertions: [{ text: "No trace summary available", passed: false }],
|
|
14810
14756
|
expectedAspectCount: 1,
|
|
14811
|
-
reasoning: "Execution metrics not available - no trace summary provided",
|
|
14812
14757
|
evaluatorRawRequest: {
|
|
14813
14758
|
type: "execution-metrics",
|
|
14814
14759
|
config: this.extractConfiguredThresholds(),
|
|
@@ -14817,116 +14762,114 @@ var ExecutionMetricsEvaluator = class {
|
|
|
14817
14762
|
};
|
|
14818
14763
|
}
|
|
14819
14764
|
const narrowedTrace = trace2;
|
|
14820
|
-
const
|
|
14821
|
-
const misses = [];
|
|
14765
|
+
const assertions = [];
|
|
14822
14766
|
const actualMetrics = {};
|
|
14823
14767
|
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
14824
14768
|
const toolCalls = narrowedTrace.eventCount;
|
|
14825
14769
|
actualMetrics.tool_calls = toolCalls;
|
|
14826
14770
|
if (toolCalls <= max_tool_calls) {
|
|
14827
|
-
|
|
14771
|
+
assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
|
|
14828
14772
|
} else {
|
|
14829
|
-
|
|
14773
|
+
assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
|
|
14830
14774
|
}
|
|
14831
14775
|
}
|
|
14832
14776
|
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
14833
14777
|
const llmCalls = narrowedTrace.llmCallCount;
|
|
14834
14778
|
if (llmCalls === void 0) {
|
|
14835
|
-
|
|
14779
|
+
assertions.push({ text: "LLM call count data not available", passed: false });
|
|
14836
14780
|
} else {
|
|
14837
14781
|
actualMetrics.llm_calls = llmCalls;
|
|
14838
14782
|
if (llmCalls <= max_llm_calls) {
|
|
14839
|
-
|
|
14783
|
+
assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
|
|
14840
14784
|
} else {
|
|
14841
|
-
|
|
14785
|
+
assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
|
|
14842
14786
|
}
|
|
14843
14787
|
}
|
|
14844
14788
|
}
|
|
14845
14789
|
if (max_tokens !== void 0) {
|
|
14846
14790
|
if (!tokenUsage) {
|
|
14847
|
-
|
|
14791
|
+
assertions.push({ text: "Token usage data not available", passed: false });
|
|
14848
14792
|
} else {
|
|
14849
14793
|
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
14850
14794
|
actualMetrics.tokens = totalTokens;
|
|
14851
14795
|
if (totalTokens <= max_tokens) {
|
|
14852
|
-
|
|
14796
|
+
assertions.push({
|
|
14797
|
+
text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
|
|
14798
|
+
passed: true
|
|
14799
|
+
});
|
|
14853
14800
|
} else {
|
|
14854
|
-
|
|
14801
|
+
assertions.push({
|
|
14802
|
+
text: `Total tokens ${totalTokens} > ${max_tokens} max`,
|
|
14803
|
+
passed: false
|
|
14804
|
+
});
|
|
14855
14805
|
}
|
|
14856
14806
|
}
|
|
14857
14807
|
}
|
|
14858
14808
|
if (max_cost_usd !== void 0) {
|
|
14859
14809
|
if (costUsd === void 0) {
|
|
14860
|
-
|
|
14810
|
+
assertions.push({ text: "Cost data not available", passed: false });
|
|
14861
14811
|
} else {
|
|
14862
14812
|
actualMetrics.cost_usd = costUsd;
|
|
14863
14813
|
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
14864
14814
|
if (costUsd <= max_cost_usd) {
|
|
14865
|
-
|
|
14815
|
+
assertions.push({
|
|
14816
|
+
text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
|
|
14817
|
+
passed: true
|
|
14818
|
+
});
|
|
14866
14819
|
} else {
|
|
14867
|
-
|
|
14820
|
+
assertions.push({
|
|
14821
|
+
text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
|
|
14822
|
+
passed: false
|
|
14823
|
+
});
|
|
14868
14824
|
}
|
|
14869
14825
|
}
|
|
14870
14826
|
}
|
|
14871
14827
|
if (max_duration_ms !== void 0) {
|
|
14872
14828
|
if (durationMs === void 0) {
|
|
14873
|
-
|
|
14829
|
+
assertions.push({ text: "Duration data not available", passed: false });
|
|
14874
14830
|
} else {
|
|
14875
14831
|
actualMetrics.duration_ms = durationMs;
|
|
14876
14832
|
if (durationMs <= max_duration_ms) {
|
|
14877
|
-
|
|
14833
|
+
assertions.push({
|
|
14834
|
+
text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
|
|
14835
|
+
passed: true
|
|
14836
|
+
});
|
|
14878
14837
|
} else {
|
|
14879
|
-
|
|
14838
|
+
assertions.push({
|
|
14839
|
+
text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
|
|
14840
|
+
passed: false
|
|
14841
|
+
});
|
|
14880
14842
|
}
|
|
14881
14843
|
}
|
|
14882
14844
|
}
|
|
14883
14845
|
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
14884
14846
|
const ratio = explorationRatio(narrowedTrace);
|
|
14885
14847
|
if (ratio === void 0) {
|
|
14886
|
-
|
|
14848
|
+
assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
|
|
14887
14849
|
} else {
|
|
14888
14850
|
actualMetrics.exploration_ratio = ratio;
|
|
14889
14851
|
const diff = Math.abs(ratio - target_exploration_ratio);
|
|
14890
14852
|
if (diff <= exploration_tolerance) {
|
|
14891
|
-
|
|
14892
|
-
`Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}
|
|
14893
|
-
|
|
14853
|
+
assertions.push({
|
|
14854
|
+
text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
|
|
14855
|
+
passed: true
|
|
14856
|
+
});
|
|
14894
14857
|
} else {
|
|
14895
|
-
|
|
14896
|
-
`Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})
|
|
14897
|
-
|
|
14858
|
+
assertions.push({
|
|
14859
|
+
text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
|
|
14860
|
+
passed: false
|
|
14861
|
+
});
|
|
14898
14862
|
}
|
|
14899
14863
|
}
|
|
14900
14864
|
}
|
|
14901
|
-
const totalChecks =
|
|
14902
|
-
const
|
|
14903
|
-
const
|
|
14904
|
-
if (actualMetrics.tool_calls !== void 0) {
|
|
14905
|
-
reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
|
|
14906
|
-
}
|
|
14907
|
-
if (actualMetrics.llm_calls !== void 0) {
|
|
14908
|
-
reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
|
|
14909
|
-
}
|
|
14910
|
-
if (actualMetrics.tokens !== void 0) {
|
|
14911
|
-
reasoningParts.push(`tokens=${actualMetrics.tokens}`);
|
|
14912
|
-
}
|
|
14913
|
-
if (actualMetrics.cost_usd !== void 0) {
|
|
14914
|
-
reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
|
|
14915
|
-
}
|
|
14916
|
-
if (actualMetrics.duration_ms !== void 0) {
|
|
14917
|
-
reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
|
|
14918
|
-
}
|
|
14919
|
-
if (actualMetrics.exploration_ratio !== void 0) {
|
|
14920
|
-
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
14921
|
-
}
|
|
14922
|
-
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
14865
|
+
const totalChecks = assertions.length;
|
|
14866
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
14867
|
+
const score = totalChecks > 0 ? passedCount / totalChecks : 0;
|
|
14923
14868
|
return {
|
|
14924
14869
|
score,
|
|
14925
14870
|
verdict: scoreToVerdict(score),
|
|
14926
|
-
|
|
14927
|
-
misses,
|
|
14871
|
+
assertions,
|
|
14928
14872
|
expectedAspectCount: totalChecks || 1,
|
|
14929
|
-
reasoning,
|
|
14930
14873
|
evaluatorRawRequest: {
|
|
14931
14874
|
type: "execution-metrics",
|
|
14932
14875
|
config: this.extractConfiguredThresholds(),
|
|
@@ -15030,10 +14973,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
15030
14973
|
return {
|
|
15031
14974
|
score: 0,
|
|
15032
14975
|
verdict: "fail",
|
|
15033
|
-
|
|
15034
|
-
|
|
15035
|
-
expectedAspectCount: this.config.fields.length,
|
|
15036
|
-
reasoning: "Candidate answer is not valid JSON"
|
|
14976
|
+
assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
|
|
14977
|
+
expectedAspectCount: this.config.fields.length
|
|
15037
14978
|
};
|
|
15038
14979
|
}
|
|
15039
14980
|
const expectedData = this.extractExpectedData(evalCase.expected_output);
|
|
@@ -15041,10 +14982,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
15041
14982
|
return {
|
|
15042
14983
|
score: 0,
|
|
15043
14984
|
verdict: "fail",
|
|
15044
|
-
|
|
15045
|
-
|
|
15046
|
-
expectedAspectCount: this.config.fields.length,
|
|
15047
|
-
reasoning: "Could not extract expected data from expected_output"
|
|
14985
|
+
assertions: [{ text: "No expected data found in expected_output", passed: false }],
|
|
14986
|
+
expectedAspectCount: this.config.fields.length
|
|
15048
14987
|
};
|
|
15049
14988
|
}
|
|
15050
14989
|
const fieldResults = [];
|
|
@@ -15262,18 +15201,14 @@ var FieldAccuracyEvaluator = class {
|
|
|
15262
15201
|
*/
|
|
15263
15202
|
aggregateResults(results) {
|
|
15264
15203
|
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
15265
|
-
const
|
|
15266
|
-
const misses = [];
|
|
15204
|
+
const assertions = [];
|
|
15267
15205
|
for (const result of results) {
|
|
15268
|
-
|
|
15269
|
-
hits.push(result.message);
|
|
15270
|
-
} else {
|
|
15271
|
-
misses.push(result.message);
|
|
15272
|
-
}
|
|
15206
|
+
assertions.push({ text: result.message, passed: result.hit });
|
|
15273
15207
|
}
|
|
15274
15208
|
let score;
|
|
15275
15209
|
if (aggregation === "all_or_nothing") {
|
|
15276
|
-
|
|
15210
|
+
const hasFailed = assertions.some((a) => !a.passed);
|
|
15211
|
+
score = hasFailed ? 0 : 1;
|
|
15277
15212
|
} else {
|
|
15278
15213
|
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
15279
15214
|
if (totalWeight === 0) {
|
|
@@ -15283,15 +15218,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
15283
15218
|
score = weightedSum / totalWeight;
|
|
15284
15219
|
}
|
|
15285
15220
|
}
|
|
15286
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
15287
15221
|
return {
|
|
15288
15222
|
score: clampScore(score),
|
|
15289
15223
|
verdict: scoreToVerdict(score),
|
|
15290
|
-
|
|
15291
|
-
|
|
15292
|
-
misses: misses.slice(0, 4),
|
|
15293
|
-
expectedAspectCount: results.length,
|
|
15294
|
-
reasoning
|
|
15224
|
+
assertions,
|
|
15225
|
+
expectedAspectCount: results.length
|
|
15295
15226
|
};
|
|
15296
15227
|
}
|
|
15297
15228
|
};
|
|
@@ -15400,10 +15331,8 @@ var LatencyEvaluator = class {
|
|
|
15400
15331
|
return {
|
|
15401
15332
|
score: 0,
|
|
15402
15333
|
verdict: "fail",
|
|
15403
|
-
|
|
15404
|
-
misses: ["No duration data available in trace"],
|
|
15334
|
+
assertions: [{ text: "No duration data available in trace", passed: false }],
|
|
15405
15335
|
expectedAspectCount: 1,
|
|
15406
|
-
reasoning: "Execution duration not reported by provider",
|
|
15407
15336
|
evaluatorRawRequest: {
|
|
15408
15337
|
type: "latency",
|
|
15409
15338
|
threshold,
|
|
@@ -15416,10 +15345,10 @@ var LatencyEvaluator = class {
|
|
|
15416
15345
|
return {
|
|
15417
15346
|
score,
|
|
15418
15347
|
verdict: passed ? "pass" : "fail",
|
|
15419
|
-
|
|
15420
|
-
|
|
15348
|
+
assertions: [
|
|
15349
|
+
passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
|
|
15350
|
+
],
|
|
15421
15351
|
expectedAspectCount: 1,
|
|
15422
|
-
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
15423
15352
|
evaluatorRawRequest: {
|
|
15424
15353
|
type: "latency",
|
|
15425
15354
|
threshold,
|
|
@@ -15495,23 +15424,25 @@ var SkillTriggerEvaluator = class {
|
|
|
15495
15424
|
return {
|
|
15496
15425
|
score: 1,
|
|
15497
15426
|
verdict: "pass",
|
|
15498
|
-
|
|
15499
|
-
|
|
15427
|
+
assertions: [
|
|
15428
|
+
{
|
|
15429
|
+
text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
|
|
15430
|
+
passed: true
|
|
15431
|
+
}
|
|
15500
15432
|
],
|
|
15501
|
-
|
|
15502
|
-
expectedAspectCount: 1,
|
|
15503
|
-
reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
|
|
15433
|
+
expectedAspectCount: 1
|
|
15504
15434
|
};
|
|
15505
15435
|
}
|
|
15506
15436
|
return {
|
|
15507
15437
|
score: 0,
|
|
15508
15438
|
verdict: "fail",
|
|
15509
|
-
|
|
15510
|
-
|
|
15511
|
-
|
|
15439
|
+
assertions: [
|
|
15440
|
+
{
|
|
15441
|
+
text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
|
|
15442
|
+
passed: false
|
|
15443
|
+
}
|
|
15512
15444
|
],
|
|
15513
|
-
expectedAspectCount: 1
|
|
15514
|
-
reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
|
|
15445
|
+
expectedAspectCount: 1
|
|
15515
15446
|
};
|
|
15516
15447
|
}
|
|
15517
15448
|
};
|
|
@@ -15680,10 +15611,8 @@ var TokenUsageEvaluator = class {
|
|
|
15680
15611
|
return {
|
|
15681
15612
|
score: 0,
|
|
15682
15613
|
verdict: "fail",
|
|
15683
|
-
|
|
15684
|
-
misses: ["No token usage data available in trace"],
|
|
15614
|
+
assertions: [{ text: "No token usage data available in trace", passed: false }],
|
|
15685
15615
|
expectedAspectCount,
|
|
15686
|
-
reasoning: "Token usage not reported by provider",
|
|
15687
15616
|
evaluatorRawRequest: {
|
|
15688
15617
|
type: "token-usage",
|
|
15689
15618
|
max_total: maxTotal ?? null,
|
|
@@ -15697,37 +15626,34 @@ var TokenUsageEvaluator = class {
|
|
|
15697
15626
|
const output = usage.output;
|
|
15698
15627
|
const cached = usage.cached ?? 0;
|
|
15699
15628
|
const total = input + output + cached;
|
|
15700
|
-
const
|
|
15701
|
-
const misses = [];
|
|
15629
|
+
const assertions = [];
|
|
15702
15630
|
if (typeof maxInput === "number") {
|
|
15703
15631
|
if (input <= maxInput) {
|
|
15704
|
-
|
|
15632
|
+
assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
|
|
15705
15633
|
} else {
|
|
15706
|
-
|
|
15634
|
+
assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
|
|
15707
15635
|
}
|
|
15708
15636
|
}
|
|
15709
15637
|
if (typeof maxOutput === "number") {
|
|
15710
15638
|
if (output <= maxOutput) {
|
|
15711
|
-
|
|
15639
|
+
assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
|
|
15712
15640
|
} else {
|
|
15713
|
-
|
|
15641
|
+
assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
|
|
15714
15642
|
}
|
|
15715
15643
|
}
|
|
15716
15644
|
if (typeof maxTotal === "number") {
|
|
15717
15645
|
if (total <= maxTotal) {
|
|
15718
|
-
|
|
15646
|
+
assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
|
|
15719
15647
|
} else {
|
|
15720
|
-
|
|
15648
|
+
assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
|
|
15721
15649
|
}
|
|
15722
15650
|
}
|
|
15723
|
-
const passed =
|
|
15651
|
+
const passed = assertions.every((a) => a.passed);
|
|
15724
15652
|
return {
|
|
15725
15653
|
score: passed ? 1 : 0,
|
|
15726
15654
|
verdict: passed ? "pass" : "fail",
|
|
15727
|
-
|
|
15728
|
-
misses,
|
|
15655
|
+
assertions,
|
|
15729
15656
|
expectedAspectCount,
|
|
15730
|
-
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
15731
15657
|
evaluatorRawRequest: {
|
|
15732
15658
|
type: "token-usage",
|
|
15733
15659
|
max_total: maxTotal ?? null,
|
|
@@ -15827,8 +15753,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15827
15753
|
return {
|
|
15828
15754
|
score: 0,
|
|
15829
15755
|
verdict: "fail",
|
|
15830
|
-
|
|
15831
|
-
misses: ["No trace available for evaluation"],
|
|
15756
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
15832
15757
|
expectedAspectCount: 1
|
|
15833
15758
|
};
|
|
15834
15759
|
}
|
|
@@ -15839,8 +15764,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15839
15764
|
return {
|
|
15840
15765
|
score: 0,
|
|
15841
15766
|
verdict: "fail",
|
|
15842
|
-
|
|
15843
|
-
misses: ["No trace available for evaluation"],
|
|
15767
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
15844
15768
|
expectedAspectCount: 1
|
|
15845
15769
|
};
|
|
15846
15770
|
}
|
|
@@ -15858,8 +15782,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15858
15782
|
return {
|
|
15859
15783
|
score: 0,
|
|
15860
15784
|
verdict: "fail",
|
|
15861
|
-
|
|
15862
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
15785
|
+
assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
|
|
15863
15786
|
expectedAspectCount: 1
|
|
15864
15787
|
};
|
|
15865
15788
|
}
|
|
@@ -15908,28 +15831,32 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15908
15831
|
return {
|
|
15909
15832
|
score: 1,
|
|
15910
15833
|
verdict: "pass",
|
|
15911
|
-
|
|
15912
|
-
misses: [],
|
|
15834
|
+
assertions: [{ text: "No tool requirements specified", passed: true }],
|
|
15913
15835
|
expectedAspectCount: 0
|
|
15914
15836
|
};
|
|
15915
15837
|
}
|
|
15916
|
-
const
|
|
15917
|
-
const misses = [];
|
|
15838
|
+
const assertions = [];
|
|
15918
15839
|
for (const toolName of toolNames) {
|
|
15919
15840
|
const required = minimums[toolName];
|
|
15920
15841
|
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
15921
15842
|
if (actual >= required) {
|
|
15922
|
-
|
|
15843
|
+
assertions.push({
|
|
15844
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
15845
|
+
passed: true
|
|
15846
|
+
});
|
|
15923
15847
|
} else {
|
|
15924
|
-
|
|
15848
|
+
assertions.push({
|
|
15849
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
15850
|
+
passed: false
|
|
15851
|
+
});
|
|
15925
15852
|
}
|
|
15926
15853
|
}
|
|
15927
|
-
const
|
|
15854
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
15855
|
+
const score = passedCount / toolNames.length;
|
|
15928
15856
|
return {
|
|
15929
15857
|
score,
|
|
15930
15858
|
verdict: scoreToVerdict(score),
|
|
15931
|
-
|
|
15932
|
-
misses,
|
|
15859
|
+
assertions,
|
|
15933
15860
|
expectedAspectCount: toolNames.length
|
|
15934
15861
|
};
|
|
15935
15862
|
}
|
|
@@ -15939,13 +15866,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15939
15866
|
return {
|
|
15940
15867
|
score: 1,
|
|
15941
15868
|
verdict: "pass",
|
|
15942
|
-
|
|
15943
|
-
misses: [],
|
|
15869
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
15944
15870
|
expectedAspectCount: 0
|
|
15945
15871
|
};
|
|
15946
15872
|
}
|
|
15947
|
-
const
|
|
15948
|
-
const misses = [];
|
|
15873
|
+
const assertions = [];
|
|
15949
15874
|
const warnings = [];
|
|
15950
15875
|
let actualIndex = 0;
|
|
15951
15876
|
let sequenceHits = 0;
|
|
@@ -15965,16 +15890,20 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15965
15890
|
const actualCall = toolCalls[actualIndex];
|
|
15966
15891
|
if (actualCall.name === expectedTool) {
|
|
15967
15892
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
15968
|
-
|
|
15893
|
+
assertions.push({
|
|
15894
|
+
text: `Found ${expectedTool} at position ${actualIndex}`,
|
|
15895
|
+
passed: true
|
|
15896
|
+
});
|
|
15969
15897
|
sequenceHits++;
|
|
15970
15898
|
matchedCall = actualCall;
|
|
15971
15899
|
actualIndex++;
|
|
15972
15900
|
found = true;
|
|
15973
15901
|
break;
|
|
15974
15902
|
}
|
|
15975
|
-
|
|
15976
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch
|
|
15977
|
-
|
|
15903
|
+
assertions.push({
|
|
15904
|
+
text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
|
|
15905
|
+
passed: false
|
|
15906
|
+
});
|
|
15978
15907
|
actualIndex++;
|
|
15979
15908
|
argsMismatch = true;
|
|
15980
15909
|
break;
|
|
@@ -15982,7 +15911,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15982
15911
|
actualIndex++;
|
|
15983
15912
|
}
|
|
15984
15913
|
if (!found && !argsMismatch) {
|
|
15985
|
-
|
|
15914
|
+
assertions.push({
|
|
15915
|
+
text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
|
|
15916
|
+
passed: false
|
|
15917
|
+
});
|
|
15986
15918
|
}
|
|
15987
15919
|
if (found && matchedCall) {
|
|
15988
15920
|
const latencyResult = checkLatency(
|
|
@@ -15991,10 +15923,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15991
15923
|
matchedCall.durationMs
|
|
15992
15924
|
);
|
|
15993
15925
|
if (latencyResult.status === "pass") {
|
|
15994
|
-
|
|
15926
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
15995
15927
|
latencyHits++;
|
|
15996
15928
|
} else if (latencyResult.status === "fail") {
|
|
15997
|
-
|
|
15929
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
15998
15930
|
} else if (latencyResult.message) {
|
|
15999
15931
|
warnings.push(latencyResult.message);
|
|
16000
15932
|
latencySkips++;
|
|
@@ -16010,8 +15942,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16010
15942
|
return {
|
|
16011
15943
|
score,
|
|
16012
15944
|
verdict: scoreToVerdict(score),
|
|
16013
|
-
|
|
16014
|
-
misses,
|
|
15945
|
+
assertions,
|
|
16015
15946
|
expectedAspectCount: totalAssertions
|
|
16016
15947
|
};
|
|
16017
15948
|
}
|
|
@@ -16021,13 +15952,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16021
15952
|
return {
|
|
16022
15953
|
score: 1,
|
|
16023
15954
|
verdict: "pass",
|
|
16024
|
-
|
|
16025
|
-
misses: [],
|
|
15955
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
16026
15956
|
expectedAspectCount: 0
|
|
16027
15957
|
};
|
|
16028
15958
|
}
|
|
16029
|
-
const
|
|
16030
|
-
const misses = [];
|
|
15959
|
+
const assertions = [];
|
|
16031
15960
|
const warnings = [];
|
|
16032
15961
|
let sequenceHits = 0;
|
|
16033
15962
|
let latencyHits = 0;
|
|
@@ -16036,7 +15965,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16036
15965
|
(item) => item.maxDurationMs !== void 0
|
|
16037
15966
|
).length;
|
|
16038
15967
|
if (toolCalls.length !== expected.length) {
|
|
16039
|
-
|
|
15968
|
+
assertions.push({
|
|
15969
|
+
text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
|
|
15970
|
+
passed: false
|
|
15971
|
+
});
|
|
16040
15972
|
}
|
|
16041
15973
|
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
16042
15974
|
for (let i = 0; i < checkLength; i++) {
|
|
@@ -16048,14 +15980,17 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16048
15980
|
let sequenceMatched = false;
|
|
16049
15981
|
if (actualTool === expectedTool) {
|
|
16050
15982
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
16051
|
-
|
|
15983
|
+
assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
|
|
16052
15984
|
sequenceHits++;
|
|
16053
15985
|
sequenceMatched = true;
|
|
16054
15986
|
} else {
|
|
16055
|
-
|
|
15987
|
+
assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
|
|
16056
15988
|
}
|
|
16057
15989
|
} else {
|
|
16058
|
-
|
|
15990
|
+
assertions.push({
|
|
15991
|
+
text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
|
|
15992
|
+
passed: false
|
|
15993
|
+
});
|
|
16059
15994
|
}
|
|
16060
15995
|
if (sequenceMatched) {
|
|
16061
15996
|
const latencyResult = checkLatency(
|
|
@@ -16064,10 +15999,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16064
15999
|
actualCall.durationMs
|
|
16065
16000
|
);
|
|
16066
16001
|
if (latencyResult.status === "pass") {
|
|
16067
|
-
|
|
16002
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
16068
16003
|
latencyHits++;
|
|
16069
16004
|
} else if (latencyResult.status === "fail") {
|
|
16070
|
-
|
|
16005
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
16071
16006
|
} else if (latencyResult.message) {
|
|
16072
16007
|
warnings.push(latencyResult.message);
|
|
16073
16008
|
latencySkips++;
|
|
@@ -16075,7 +16010,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16075
16010
|
}
|
|
16076
16011
|
}
|
|
16077
16012
|
for (let i = checkLength; i < expected.length; i++) {
|
|
16078
|
-
|
|
16013
|
+
assertions.push({
|
|
16014
|
+
text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
|
|
16015
|
+
passed: false
|
|
16016
|
+
});
|
|
16079
16017
|
}
|
|
16080
16018
|
for (const warning of warnings) {
|
|
16081
16019
|
console.warn(`[tool-trajectory] ${warning}`);
|
|
@@ -16086,8 +16024,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16086
16024
|
return {
|
|
16087
16025
|
score,
|
|
16088
16026
|
verdict: scoreToVerdict(score),
|
|
16089
|
-
|
|
16090
|
-
misses,
|
|
16027
|
+
assertions,
|
|
16091
16028
|
expectedAspectCount: totalAssertions
|
|
16092
16029
|
};
|
|
16093
16030
|
}
|
|
@@ -16102,13 +16039,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16102
16039
|
return {
|
|
16103
16040
|
score: 1,
|
|
16104
16041
|
verdict: "pass",
|
|
16105
|
-
|
|
16106
|
-
misses: [],
|
|
16042
|
+
assertions: [{ text: "No expected tools specified", passed: true }],
|
|
16107
16043
|
expectedAspectCount: 0
|
|
16108
16044
|
};
|
|
16109
16045
|
}
|
|
16110
|
-
const
|
|
16111
|
-
const misses = [];
|
|
16046
|
+
const assertions = [];
|
|
16112
16047
|
const consumed = /* @__PURE__ */ new Set();
|
|
16113
16048
|
for (let i = 0; i < expected.length; i++) {
|
|
16114
16049
|
const expectedItem = expected[i];
|
|
@@ -16119,22 +16054,25 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16119
16054
|
if (consumed.has(j)) continue;
|
|
16120
16055
|
const actualCall = toolCalls[j];
|
|
16121
16056
|
if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
16122
|
-
|
|
16057
|
+
assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
|
|
16123
16058
|
consumed.add(j);
|
|
16124
16059
|
found = true;
|
|
16125
16060
|
break;
|
|
16126
16061
|
}
|
|
16127
16062
|
}
|
|
16128
16063
|
if (!found) {
|
|
16129
|
-
|
|
16064
|
+
assertions.push({
|
|
16065
|
+
text: `Expected ${expectedTool} not found in actual trajectory`,
|
|
16066
|
+
passed: false
|
|
16067
|
+
});
|
|
16130
16068
|
}
|
|
16131
16069
|
}
|
|
16132
|
-
const
|
|
16070
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
16071
|
+
const score = expected.length > 0 ? passedCount / expected.length : 1;
|
|
16133
16072
|
return {
|
|
16134
16073
|
score,
|
|
16135
16074
|
verdict: scoreToVerdict(score),
|
|
16136
|
-
|
|
16137
|
-
misses,
|
|
16075
|
+
assertions,
|
|
16138
16076
|
expectedAspectCount: expected.length
|
|
16139
16077
|
};
|
|
16140
16078
|
}
|
|
@@ -16150,16 +16088,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16150
16088
|
return {
|
|
16151
16089
|
score: 1,
|
|
16152
16090
|
verdict: "pass",
|
|
16153
|
-
|
|
16154
|
-
misses: [],
|
|
16091
|
+
assertions: [{ text: "No tool calls and no expected tools", passed: true }],
|
|
16155
16092
|
expectedAspectCount: 0
|
|
16156
16093
|
};
|
|
16157
16094
|
}
|
|
16158
16095
|
return {
|
|
16159
16096
|
score: 0,
|
|
16160
16097
|
verdict: "fail",
|
|
16161
|
-
|
|
16162
|
-
|
|
16098
|
+
assertions: [
|
|
16099
|
+
{
|
|
16100
|
+
text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
|
|
16101
|
+
passed: false
|
|
16102
|
+
}
|
|
16103
|
+
],
|
|
16163
16104
|
expectedAspectCount: toolCalls.length
|
|
16164
16105
|
};
|
|
16165
16106
|
}
|
|
@@ -16167,13 +16108,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16167
16108
|
return {
|
|
16168
16109
|
score: 1,
|
|
16169
16110
|
verdict: "pass",
|
|
16170
|
-
|
|
16171
|
-
misses: [],
|
|
16111
|
+
assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
|
|
16172
16112
|
expectedAspectCount: 0
|
|
16173
16113
|
};
|
|
16174
16114
|
}
|
|
16175
|
-
const
|
|
16176
|
-
const misses = [];
|
|
16115
|
+
const assertions = [];
|
|
16177
16116
|
for (let i = 0; i < toolCalls.length; i++) {
|
|
16178
16117
|
const actualCall = toolCalls[i];
|
|
16179
16118
|
let allowed = false;
|
|
@@ -16185,17 +16124,23 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16185
16124
|
}
|
|
16186
16125
|
}
|
|
16187
16126
|
if (allowed) {
|
|
16188
|
-
|
|
16127
|
+
assertions.push({
|
|
16128
|
+
text: `Position ${i}: ${actualCall.name} is in allowed set`,
|
|
16129
|
+
passed: true
|
|
16130
|
+
});
|
|
16189
16131
|
} else {
|
|
16190
|
-
|
|
16132
|
+
assertions.push({
|
|
16133
|
+
text: `Position ${i}: ${actualCall.name} is not in allowed set`,
|
|
16134
|
+
passed: false
|
|
16135
|
+
});
|
|
16191
16136
|
}
|
|
16192
16137
|
}
|
|
16193
|
-
const
|
|
16138
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
16139
|
+
const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
|
|
16194
16140
|
return {
|
|
16195
16141
|
score,
|
|
16196
16142
|
verdict: scoreToVerdict(score),
|
|
16197
|
-
|
|
16198
|
-
misses,
|
|
16143
|
+
assertions,
|
|
16199
16144
|
expectedAspectCount: toolCalls.length
|
|
16200
16145
|
};
|
|
16201
16146
|
}
|
|
@@ -16206,8 +16151,12 @@ function runContainsAssertion(output, value) {
|
|
|
16206
16151
|
const passed = output.includes(value);
|
|
16207
16152
|
return {
|
|
16208
16153
|
score: passed ? 1 : 0,
|
|
16209
|
-
|
|
16210
|
-
|
|
16154
|
+
assertions: [
|
|
16155
|
+
{
|
|
16156
|
+
text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
|
|
16157
|
+
passed
|
|
16158
|
+
}
|
|
16159
|
+
]
|
|
16211
16160
|
};
|
|
16212
16161
|
}
|
|
16213
16162
|
function runContainsAnyAssertion(output, values) {
|
|
@@ -16215,8 +16164,12 @@ function runContainsAnyAssertion(output, values) {
|
|
|
16215
16164
|
const passed = matched.length > 0;
|
|
16216
16165
|
return {
|
|
16217
16166
|
score: passed ? 1 : 0,
|
|
16218
|
-
|
|
16219
|
-
|
|
16167
|
+
assertions: [
|
|
16168
|
+
{
|
|
16169
|
+
text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
|
|
16170
|
+
passed
|
|
16171
|
+
}
|
|
16172
|
+
]
|
|
16220
16173
|
};
|
|
16221
16174
|
}
|
|
16222
16175
|
function runContainsAllAssertion(output, values) {
|
|
@@ -16224,16 +16177,24 @@ function runContainsAllAssertion(output, values) {
|
|
|
16224
16177
|
const passed = missing.length === 0;
|
|
16225
16178
|
return {
|
|
16226
16179
|
score: passed ? 1 : 0,
|
|
16227
|
-
|
|
16228
|
-
|
|
16180
|
+
assertions: [
|
|
16181
|
+
{
|
|
16182
|
+
text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
16183
|
+
passed
|
|
16184
|
+
}
|
|
16185
|
+
]
|
|
16229
16186
|
};
|
|
16230
16187
|
}
|
|
16231
16188
|
function runIcontainsAssertion(output, value) {
|
|
16232
16189
|
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
16233
16190
|
return {
|
|
16234
16191
|
score: passed ? 1 : 0,
|
|
16235
|
-
|
|
16236
|
-
|
|
16192
|
+
assertions: [
|
|
16193
|
+
{
|
|
16194
|
+
text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
|
|
16195
|
+
passed
|
|
16196
|
+
}
|
|
16197
|
+
]
|
|
16237
16198
|
};
|
|
16238
16199
|
}
|
|
16239
16200
|
function runIcontainsAnyAssertion(output, values) {
|
|
@@ -16242,9 +16203,11 @@ function runIcontainsAnyAssertion(output, values) {
|
|
|
16242
16203
|
const passed = matched.length > 0;
|
|
16243
16204
|
return {
|
|
16244
16205
|
score: passed ? 1 : 0,
|
|
16245
|
-
|
|
16246
|
-
|
|
16247
|
-
|
|
16206
|
+
assertions: [
|
|
16207
|
+
{
|
|
16208
|
+
text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
|
|
16209
|
+
passed
|
|
16210
|
+
}
|
|
16248
16211
|
]
|
|
16249
16212
|
};
|
|
16250
16213
|
}
|
|
@@ -16254,24 +16217,36 @@ function runIcontainsAllAssertion(output, values) {
|
|
|
16254
16217
|
const passed = missing.length === 0;
|
|
16255
16218
|
return {
|
|
16256
16219
|
score: passed ? 1 : 0,
|
|
16257
|
-
|
|
16258
|
-
|
|
16220
|
+
assertions: [
|
|
16221
|
+
{
|
|
16222
|
+
text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
16223
|
+
passed
|
|
16224
|
+
}
|
|
16225
|
+
]
|
|
16259
16226
|
};
|
|
16260
16227
|
}
|
|
16261
16228
|
function runStartsWithAssertion(output, value) {
|
|
16262
16229
|
const passed = output.trim().startsWith(value.trim());
|
|
16263
16230
|
return {
|
|
16264
16231
|
score: passed ? 1 : 0,
|
|
16265
|
-
|
|
16266
|
-
|
|
16232
|
+
assertions: [
|
|
16233
|
+
{
|
|
16234
|
+
text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
|
|
16235
|
+
passed
|
|
16236
|
+
}
|
|
16237
|
+
]
|
|
16267
16238
|
};
|
|
16268
16239
|
}
|
|
16269
16240
|
function runEndsWithAssertion(output, value) {
|
|
16270
16241
|
const passed = output.trim().endsWith(value.trim());
|
|
16271
16242
|
return {
|
|
16272
16243
|
score: passed ? 1 : 0,
|
|
16273
|
-
|
|
16274
|
-
|
|
16244
|
+
assertions: [
|
|
16245
|
+
{
|
|
16246
|
+
text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
|
|
16247
|
+
passed
|
|
16248
|
+
}
|
|
16249
|
+
]
|
|
16275
16250
|
};
|
|
16276
16251
|
}
|
|
16277
16252
|
function runRegexAssertion(output, pattern, flags) {
|
|
@@ -16280,8 +16255,12 @@ function runRegexAssertion(output, pattern, flags) {
|
|
|
16280
16255
|
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
16281
16256
|
return {
|
|
16282
16257
|
score: passed ? 1 : 0,
|
|
16283
|
-
|
|
16284
|
-
|
|
16258
|
+
assertions: [
|
|
16259
|
+
{
|
|
16260
|
+
text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
|
|
16261
|
+
passed
|
|
16262
|
+
}
|
|
16263
|
+
]
|
|
16285
16264
|
};
|
|
16286
16265
|
}
|
|
16287
16266
|
function runIsJsonAssertion(output) {
|
|
@@ -16293,16 +16272,24 @@ function runIsJsonAssertion(output) {
|
|
|
16293
16272
|
}
|
|
16294
16273
|
return {
|
|
16295
16274
|
score: passed ? 1 : 0,
|
|
16296
|
-
|
|
16297
|
-
|
|
16275
|
+
assertions: [
|
|
16276
|
+
{
|
|
16277
|
+
text: passed ? "Output is valid JSON" : "Output is not valid JSON",
|
|
16278
|
+
passed
|
|
16279
|
+
}
|
|
16280
|
+
]
|
|
16298
16281
|
};
|
|
16299
16282
|
}
|
|
16300
16283
|
function runEqualsAssertion(output, value) {
|
|
16301
16284
|
const passed = output.trim() === value.trim();
|
|
16302
16285
|
return {
|
|
16303
16286
|
score: passed ? 1 : 0,
|
|
16304
|
-
|
|
16305
|
-
|
|
16287
|
+
assertions: [
|
|
16288
|
+
{
|
|
16289
|
+
text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
|
|
16290
|
+
passed
|
|
16291
|
+
}
|
|
16292
|
+
]
|
|
16306
16293
|
};
|
|
16307
16294
|
}
|
|
16308
16295
|
|
|
@@ -16515,10 +16502,8 @@ var InlineAssertEvaluator = class {
|
|
|
16515
16502
|
return {
|
|
16516
16503
|
score,
|
|
16517
16504
|
verdict: scoreToVerdict(score),
|
|
16518
|
-
|
|
16519
|
-
misses: score < 0.5 ? [result.name] : [],
|
|
16505
|
+
assertions: [{ text: result.name, passed: score >= 0.5 }],
|
|
16520
16506
|
expectedAspectCount: 1,
|
|
16521
|
-
reasoning: void 0,
|
|
16522
16507
|
details: result.metadata ? result.metadata : void 0
|
|
16523
16508
|
};
|
|
16524
16509
|
}
|
|
@@ -16711,9 +16696,7 @@ var containsFactory = (config) => {
|
|
|
16711
16696
|
return {
|
|
16712
16697
|
score: result.score,
|
|
16713
16698
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16714
|
-
|
|
16715
|
-
misses: result.misses,
|
|
16716
|
-
reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
|
|
16699
|
+
assertions: result.assertions,
|
|
16717
16700
|
expectedAspectCount: 1
|
|
16718
16701
|
};
|
|
16719
16702
|
});
|
|
@@ -16725,9 +16708,7 @@ var regexFactory = (config) => {
|
|
|
16725
16708
|
return {
|
|
16726
16709
|
score: result.score,
|
|
16727
16710
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16728
|
-
|
|
16729
|
-
misses: result.misses,
|
|
16730
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
16711
|
+
assertions: result.assertions,
|
|
16731
16712
|
expectedAspectCount: 1
|
|
16732
16713
|
};
|
|
16733
16714
|
});
|
|
@@ -16738,9 +16719,7 @@ var isJsonFactory = () => {
|
|
|
16738
16719
|
return {
|
|
16739
16720
|
score: result.score,
|
|
16740
16721
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16741
|
-
|
|
16742
|
-
misses: result.misses,
|
|
16743
|
-
reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
|
|
16722
|
+
assertions: result.assertions,
|
|
16744
16723
|
expectedAspectCount: 1
|
|
16745
16724
|
};
|
|
16746
16725
|
});
|
|
@@ -16752,9 +16731,7 @@ var equalsFactory = (config) => {
|
|
|
16752
16731
|
return {
|
|
16753
16732
|
score: result.score,
|
|
16754
16733
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16755
|
-
|
|
16756
|
-
misses: result.misses,
|
|
16757
|
-
reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
|
|
16734
|
+
assertions: result.assertions,
|
|
16758
16735
|
expectedAspectCount: 1
|
|
16759
16736
|
};
|
|
16760
16737
|
});
|
|
@@ -16766,9 +16743,7 @@ var containsAnyFactory = (config) => {
|
|
|
16766
16743
|
return {
|
|
16767
16744
|
score: result.score,
|
|
16768
16745
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16769
|
-
|
|
16770
|
-
misses: result.misses,
|
|
16771
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16746
|
+
assertions: result.assertions,
|
|
16772
16747
|
expectedAspectCount: 1
|
|
16773
16748
|
};
|
|
16774
16749
|
});
|
|
@@ -16780,9 +16755,7 @@ var containsAllFactory = (config) => {
|
|
|
16780
16755
|
return {
|
|
16781
16756
|
score: result.score,
|
|
16782
16757
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16783
|
-
|
|
16784
|
-
misses: result.misses,
|
|
16785
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16758
|
+
assertions: result.assertions,
|
|
16786
16759
|
expectedAspectCount: 1
|
|
16787
16760
|
};
|
|
16788
16761
|
});
|
|
@@ -16794,9 +16767,7 @@ var icontainsFactory = (config) => {
|
|
|
16794
16767
|
return {
|
|
16795
16768
|
score: result.score,
|
|
16796
16769
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16797
|
-
|
|
16798
|
-
misses: result.misses,
|
|
16799
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16770
|
+
assertions: result.assertions,
|
|
16800
16771
|
expectedAspectCount: 1
|
|
16801
16772
|
};
|
|
16802
16773
|
});
|
|
@@ -16808,9 +16779,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
16808
16779
|
return {
|
|
16809
16780
|
score: result.score,
|
|
16810
16781
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16811
|
-
|
|
16812
|
-
misses: result.misses,
|
|
16813
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16782
|
+
assertions: result.assertions,
|
|
16814
16783
|
expectedAspectCount: 1
|
|
16815
16784
|
};
|
|
16816
16785
|
});
|
|
@@ -16822,9 +16791,7 @@ var icontainsAllFactory = (config) => {
|
|
|
16822
16791
|
return {
|
|
16823
16792
|
score: result.score,
|
|
16824
16793
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16825
|
-
|
|
16826
|
-
misses: result.misses,
|
|
16827
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16794
|
+
assertions: result.assertions,
|
|
16828
16795
|
expectedAspectCount: 1
|
|
16829
16796
|
};
|
|
16830
16797
|
});
|
|
@@ -16836,9 +16803,7 @@ var startsWithFactory = (config) => {
|
|
|
16836
16803
|
return {
|
|
16837
16804
|
score: result.score,
|
|
16838
16805
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16839
|
-
|
|
16840
|
-
misses: result.misses,
|
|
16841
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16806
|
+
assertions: result.assertions,
|
|
16842
16807
|
expectedAspectCount: 1
|
|
16843
16808
|
};
|
|
16844
16809
|
});
|
|
@@ -16850,9 +16815,7 @@ var endsWithFactory = (config) => {
|
|
|
16850
16815
|
return {
|
|
16851
16816
|
score: result.score,
|
|
16852
16817
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16853
|
-
|
|
16854
|
-
misses: result.misses,
|
|
16855
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16818
|
+
assertions: result.assertions,
|
|
16856
16819
|
expectedAspectCount: 1
|
|
16857
16820
|
};
|
|
16858
16821
|
});
|
|
@@ -18258,8 +18221,7 @@ async function runEvaluation(options) {
|
|
|
18258
18221
|
testId: evalCase.id,
|
|
18259
18222
|
dataset: evalCase.dataset,
|
|
18260
18223
|
score: 0,
|
|
18261
|
-
|
|
18262
|
-
misses: [],
|
|
18224
|
+
assertions: [],
|
|
18263
18225
|
answer: "",
|
|
18264
18226
|
target: target.name,
|
|
18265
18227
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
@@ -18295,8 +18257,7 @@ async function runEvaluation(options) {
|
|
|
18295
18257
|
testId: evalCase.id,
|
|
18296
18258
|
dataset: evalCase.dataset,
|
|
18297
18259
|
score: 0,
|
|
18298
|
-
|
|
18299
|
-
misses: [],
|
|
18260
|
+
assertions: [],
|
|
18300
18261
|
answer: "",
|
|
18301
18262
|
target: target.name,
|
|
18302
18263
|
error: errorMsg,
|
|
@@ -19263,11 +19224,9 @@ async function evaluateCandidate(options) {
|
|
|
19263
19224
|
dataset: evalCase.dataset,
|
|
19264
19225
|
conversationId: evalCase.conversation_id,
|
|
19265
19226
|
score: score.score,
|
|
19266
|
-
|
|
19267
|
-
misses: score.misses,
|
|
19227
|
+
assertions: score.assertions,
|
|
19268
19228
|
answer: candidate,
|
|
19269
19229
|
target: target.name,
|
|
19270
|
-
reasoning: score.reasoning,
|
|
19271
19230
|
tokenUsage,
|
|
19272
19231
|
costUsd,
|
|
19273
19232
|
durationMs,
|
|
@@ -19441,9 +19400,7 @@ async function runEvaluatorList(options) {
|
|
|
19441
19400
|
score: score2.score,
|
|
19442
19401
|
weight,
|
|
19443
19402
|
verdict: score2.verdict,
|
|
19444
|
-
|
|
19445
|
-
misses: score2.misses,
|
|
19446
|
-
reasoning: score2.reasoning,
|
|
19403
|
+
assertions: score2.assertions,
|
|
19447
19404
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
19448
19405
|
details: score2.details,
|
|
19449
19406
|
scores: mapChildResults(score2.scores),
|
|
@@ -19458,10 +19415,10 @@ async function runEvaluatorList(options) {
|
|
|
19458
19415
|
const fallbackScore = {
|
|
19459
19416
|
score: 0,
|
|
19460
19417
|
verdict: "fail",
|
|
19461
|
-
|
|
19462
|
-
|
|
19463
|
-
|
|
19464
|
-
|
|
19418
|
+
assertions: [
|
|
19419
|
+
{ text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
|
|
19420
|
+
],
|
|
19421
|
+
expectedAspectCount: 1
|
|
19465
19422
|
};
|
|
19466
19423
|
const weight = evaluatorConfig.weight ?? 1;
|
|
19467
19424
|
scored.push({
|
|
@@ -19477,9 +19434,12 @@ async function runEvaluatorList(options) {
|
|
|
19477
19434
|
score: 0,
|
|
19478
19435
|
weight,
|
|
19479
19436
|
verdict: "fail",
|
|
19480
|
-
|
|
19481
|
-
|
|
19482
|
-
|
|
19437
|
+
assertions: [
|
|
19438
|
+
{
|
|
19439
|
+
text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
|
|
19440
|
+
passed: false
|
|
19441
|
+
}
|
|
19442
|
+
],
|
|
19483
19443
|
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
19484
19444
|
startedAt: startedAt.toISOString(),
|
|
19485
19445
|
endedAt: endedAt.toISOString()
|
|
@@ -19495,9 +19455,7 @@ async function runEvaluatorList(options) {
|
|
|
19495
19455
|
...scores[lastScoresIdx],
|
|
19496
19456
|
score: negated.score,
|
|
19497
19457
|
verdict: negated.verdict,
|
|
19498
|
-
|
|
19499
|
-
misses: [...negated.misses],
|
|
19500
|
-
reasoning: negated.reasoning
|
|
19458
|
+
assertions: [...negated.assertions]
|
|
19501
19459
|
};
|
|
19502
19460
|
}
|
|
19503
19461
|
}
|
|
@@ -19512,21 +19470,13 @@ async function runEvaluatorList(options) {
|
|
|
19512
19470
|
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
19513
19471
|
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
19514
19472
|
) : 0;
|
|
19515
|
-
const
|
|
19516
|
-
const
|
|
19517
|
-
const expectedAspectCount = scored.reduce(
|
|
19518
|
-
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
19519
|
-
0
|
|
19520
|
-
);
|
|
19521
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
19522
|
-
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
19473
|
+
const assertions = scored.flatMap((entry) => entry.score.assertions);
|
|
19474
|
+
const expectedAspectCount = assertions.length || 1;
|
|
19523
19475
|
const score = {
|
|
19524
19476
|
score: aggregateScore,
|
|
19525
19477
|
verdict: scoreToVerdict(aggregateScore),
|
|
19526
|
-
|
|
19527
|
-
|
|
19528
|
-
expectedAspectCount,
|
|
19529
|
-
reasoning
|
|
19478
|
+
assertions,
|
|
19479
|
+
expectedAspectCount
|
|
19530
19480
|
};
|
|
19531
19481
|
return { score, scores };
|
|
19532
19482
|
}
|
|
@@ -19630,8 +19580,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
19630
19580
|
dataset: evalCase.dataset,
|
|
19631
19581
|
conversationId: evalCase.conversation_id,
|
|
19632
19582
|
score: 0,
|
|
19633
|
-
|
|
19634
|
-
misses: [`Error: ${message}`],
|
|
19583
|
+
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
19635
19584
|
answer: `Error occurred: ${message}`,
|
|
19636
19585
|
target: targetName,
|
|
19637
19586
|
requests,
|
|
@@ -19741,9 +19690,7 @@ function mapChildResults(children) {
|
|
|
19741
19690
|
score: child.score,
|
|
19742
19691
|
weight: child.weight,
|
|
19743
19692
|
verdict: child.verdict,
|
|
19744
|
-
|
|
19745
|
-
misses: child.misses,
|
|
19746
|
-
reasoning: child.reasoning,
|
|
19693
|
+
assertions: child.assertions,
|
|
19747
19694
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
19748
19695
|
scores: mapChildResults(child.scores),
|
|
19749
19696
|
details: child.details,
|
|
@@ -20713,7 +20660,6 @@ function createAgentKernel() {
|
|
|
20713
20660
|
freeformEvaluationSchema,
|
|
20714
20661
|
generateRubrics,
|
|
20715
20662
|
getAgentvHome,
|
|
20716
|
-
getHitCount,
|
|
20717
20663
|
getOutputFilenames,
|
|
20718
20664
|
getSubagentsRoot,
|
|
20719
20665
|
getTraceStateRoot,
|