agentv 3.4.0 → 3.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -8
- package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js → agentv-provider-NFFLXG5M-TJAWCWCX.js} +2 -2
- package/dist/{chunk-A7ZDUB46.js → chunk-5GG6DDP5.js} +27 -17
- package/dist/chunk-5GG6DDP5.js.map +1 -0
- package/dist/{chunk-AR3QEKXH.js → chunk-BJV6MDBE.js} +3 -3
- package/dist/{chunk-AR3QEKXH.js.map → chunk-BJV6MDBE.js.map} +1 -1
- package/dist/{chunk-GOZV2HN2.js → chunk-D6G4N2H2.js} +386 -439
- package/dist/chunk-D6G4N2H2.js.map +1 -0
- package/dist/{chunk-RE5I3U2S.js → chunk-RLL4QGNL.js} +26 -45
- package/dist/chunk-RLL4QGNL.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/{dist-AFDYFH6Y.js → dist-MZFXE6B5.js} +3 -5
- package/dist/index.js +4 -4
- package/dist/{interactive-WXXTZ7PD.js → interactive-J7SUWZH2.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-A7ZDUB46.js.map +0 -1
- package/dist/chunk-GOZV2HN2.js.map +0 -1
- package/dist/chunk-RE5I3U2S.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js.map → agentv-provider-NFFLXG5M-TJAWCWCX.js.map} +0 -0
- /package/dist/{dist-AFDYFH6Y.js.map → dist-MZFXE6B5.js.map} +0 -0
- /package/dist/{interactive-WXXTZ7PD.js.map → interactive-J7SUWZH2.js.map} +0 -0
|
@@ -149,7 +149,7 @@ import {
|
|
|
149
149
|
withUserAgentSuffix,
|
|
150
150
|
withoutTrailingSlash,
|
|
151
151
|
zodSchema
|
|
152
|
-
} from "./chunk-
|
|
152
|
+
} from "./chunk-BJV6MDBE.js";
|
|
153
153
|
import {
|
|
154
154
|
SpanStatusCode,
|
|
155
155
|
context,
|
|
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
|
|
|
301
301
|
}
|
|
302
302
|
});
|
|
303
303
|
|
|
304
|
-
// ../../packages/core/dist/chunk-
|
|
304
|
+
// ../../packages/core/dist/chunk-EFR4JHPL.js
|
|
305
305
|
import { constants } from "node:fs";
|
|
306
306
|
import { access, readFile } from "node:fs/promises";
|
|
307
307
|
import path from "node:path";
|
|
@@ -419,7 +419,7 @@ __export(external_exports2, {
|
|
|
419
419
|
void: () => voidType
|
|
420
420
|
});
|
|
421
421
|
|
|
422
|
-
// ../../packages/core/dist/chunk-
|
|
422
|
+
// ../../packages/core/dist/chunk-EFR4JHPL.js
|
|
423
423
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
424
424
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
425
425
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
@@ -498,9 +498,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
|
498
498
|
function isEvaluatorKind(value) {
|
|
499
499
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
500
500
|
}
|
|
501
|
-
function getHitCount(result) {
|
|
502
|
-
return result.hits.length;
|
|
503
|
-
}
|
|
504
501
|
async function fileExists(filePath) {
|
|
505
502
|
try {
|
|
506
503
|
await access(filePath, constants.F_OK);
|
|
@@ -17623,7 +17620,7 @@ var AzureProvider = class {
|
|
|
17623
17620
|
};
|
|
17624
17621
|
this.retryConfig = config.retry;
|
|
17625
17622
|
const azure = createAzure(buildAzureOptions(config));
|
|
17626
|
-
this.model = azure(config.deploymentName);
|
|
17623
|
+
this.model = azure.chat(config.deploymentName);
|
|
17627
17624
|
}
|
|
17628
17625
|
id;
|
|
17629
17626
|
kind = "azure";
|
|
@@ -23483,9 +23480,11 @@ function negateScore(score) {
|
|
|
23483
23480
|
...score,
|
|
23484
23481
|
score: negatedScore,
|
|
23485
23482
|
verdict: negatedVerdict,
|
|
23486
|
-
|
|
23487
|
-
|
|
23488
|
-
|
|
23483
|
+
assertions: score.assertions.map((a) => ({
|
|
23484
|
+
...a,
|
|
23485
|
+
passed: !a.passed,
|
|
23486
|
+
evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
|
|
23487
|
+
}))
|
|
23489
23488
|
};
|
|
23490
23489
|
}
|
|
23491
23490
|
function shellEscapePath(value) {
|
|
@@ -23985,9 +23984,13 @@ var CodeEvaluator = class {
|
|
|
23985
23984
|
);
|
|
23986
23985
|
const parsed = parseJsonSafe(stdout);
|
|
23987
23986
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
23988
|
-
const
|
|
23989
|
-
|
|
23990
|
-
|
|
23987
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
23988
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
23989
|
+
).map((a) => ({
|
|
23990
|
+
text: String(a.text),
|
|
23991
|
+
passed: Boolean(a.passed),
|
|
23992
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
23993
|
+
})) : [];
|
|
23991
23994
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
23992
23995
|
const proxyUsage = getProxyUsage?.();
|
|
23993
23996
|
const evaluatorRawRequest = {
|
|
@@ -24003,10 +24006,8 @@ var CodeEvaluator = class {
|
|
|
24003
24006
|
return {
|
|
24004
24007
|
score,
|
|
24005
24008
|
verdict: scoreToVerdict(score),
|
|
24006
|
-
|
|
24007
|
-
|
|
24008
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
24009
|
-
reasoning,
|
|
24009
|
+
assertions,
|
|
24010
|
+
expectedAspectCount: assertions.length || 1,
|
|
24010
24011
|
evaluatorRawRequest,
|
|
24011
24012
|
...details ? { details } : {},
|
|
24012
24013
|
tokenUsage: proxyUsage?.tokenUsage
|
|
@@ -24017,10 +24018,8 @@ var CodeEvaluator = class {
|
|
|
24017
24018
|
return {
|
|
24018
24019
|
score: 0,
|
|
24019
24020
|
verdict: "fail",
|
|
24020
|
-
|
|
24021
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
24021
|
+
assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
|
|
24022
24022
|
expectedAspectCount: 1,
|
|
24023
|
-
reasoning: message,
|
|
24024
24023
|
evaluatorRawRequest: {
|
|
24025
24024
|
command: this.command,
|
|
24026
24025
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
@@ -24119,9 +24118,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
24119
24118
|
{{${TEMPLATE_VARIABLES.ANSWER}}}`;
|
|
24120
24119
|
var freeformEvaluationSchema = external_exports2.object({
|
|
24121
24120
|
score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
24122
|
-
|
|
24123
|
-
|
|
24124
|
-
|
|
24121
|
+
assertions: external_exports2.array(
|
|
24122
|
+
external_exports2.object({
|
|
24123
|
+
text: external_exports2.string().describe("Brief description of what was checked"),
|
|
24124
|
+
passed: external_exports2.boolean().describe("Whether this aspect was satisfied"),
|
|
24125
|
+
evidence: external_exports2.string().describe("Concise evidence (1-2 sentences)").optional()
|
|
24126
|
+
})
|
|
24127
|
+
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
|
|
24125
24128
|
});
|
|
24126
24129
|
var rubricCheckResultSchema = external_exports2.object({
|
|
24127
24130
|
id: external_exports2.string().describe("The ID of the rubric item being checked"),
|
|
@@ -24223,17 +24226,12 @@ ${context2.fileChanges}`;
|
|
|
24223
24226
|
schema: freeformEvaluationSchema
|
|
24224
24227
|
});
|
|
24225
24228
|
const score = clampScore(data.score);
|
|
24226
|
-
const
|
|
24227
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
24228
|
-
const reasoning = data.reasoning;
|
|
24229
|
-
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
24229
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
24230
24230
|
return {
|
|
24231
24231
|
score,
|
|
24232
24232
|
verdict: scoreToVerdict(score),
|
|
24233
|
-
|
|
24234
|
-
|
|
24235
|
-
expectedAspectCount,
|
|
24236
|
-
reasoning,
|
|
24233
|
+
assertions,
|
|
24234
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
24237
24235
|
evaluatorRawRequest,
|
|
24238
24236
|
tokenUsage
|
|
24239
24237
|
};
|
|
@@ -24244,10 +24242,8 @@ ${context2.fileChanges}`;
|
|
|
24244
24242
|
return {
|
|
24245
24243
|
score: 0,
|
|
24246
24244
|
verdict: "skip",
|
|
24247
|
-
|
|
24248
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
24245
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24249
24246
|
expectedAspectCount: 1,
|
|
24250
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
24251
24247
|
evaluatorRawRequest
|
|
24252
24248
|
};
|
|
24253
24249
|
}
|
|
@@ -24277,14 +24273,12 @@ ${context2.fileChanges}`;
|
|
|
24277
24273
|
userPrompt: prompt,
|
|
24278
24274
|
schema: rubricEvaluationSchema
|
|
24279
24275
|
});
|
|
24280
|
-
const { score, verdict,
|
|
24276
|
+
const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
|
|
24281
24277
|
return {
|
|
24282
24278
|
score,
|
|
24283
24279
|
verdict,
|
|
24284
|
-
|
|
24285
|
-
misses,
|
|
24280
|
+
assertions,
|
|
24286
24281
|
expectedAspectCount: rubrics.length,
|
|
24287
|
-
reasoning: data.overall_reasoning,
|
|
24288
24282
|
evaluatorRawRequest,
|
|
24289
24283
|
tokenUsage
|
|
24290
24284
|
};
|
|
@@ -24295,10 +24289,8 @@ ${context2.fileChanges}`;
|
|
|
24295
24289
|
return {
|
|
24296
24290
|
score: 0,
|
|
24297
24291
|
verdict: "skip",
|
|
24298
|
-
|
|
24299
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
24292
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24300
24293
|
expectedAspectCount: rubrics.length,
|
|
24301
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
24302
24294
|
evaluatorRawRequest
|
|
24303
24295
|
};
|
|
24304
24296
|
}
|
|
@@ -24323,14 +24315,12 @@ ${context2.fileChanges}`;
|
|
|
24323
24315
|
userPrompt: prompt,
|
|
24324
24316
|
schema: scoreRangeEvaluationSchema
|
|
24325
24317
|
});
|
|
24326
|
-
const { score, verdict,
|
|
24318
|
+
const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
|
|
24327
24319
|
return {
|
|
24328
24320
|
score,
|
|
24329
24321
|
verdict,
|
|
24330
|
-
|
|
24331
|
-
misses,
|
|
24322
|
+
assertions,
|
|
24332
24323
|
expectedAspectCount: rubrics.length,
|
|
24333
|
-
reasoning: data.overall_reasoning,
|
|
24334
24324
|
evaluatorRawRequest,
|
|
24335
24325
|
details,
|
|
24336
24326
|
tokenUsage
|
|
@@ -24342,10 +24332,8 @@ ${context2.fileChanges}`;
|
|
|
24342
24332
|
return {
|
|
24343
24333
|
score: 0,
|
|
24344
24334
|
verdict: "skip",
|
|
24345
|
-
|
|
24346
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
24335
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24347
24336
|
expectedAspectCount: rubrics.length,
|
|
24348
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
24349
24337
|
evaluatorRawRequest
|
|
24350
24338
|
};
|
|
24351
24339
|
}
|
|
@@ -24402,8 +24390,7 @@ ${context2.fileChanges}`;
|
|
|
24402
24390
|
return {
|
|
24403
24391
|
score: 0,
|
|
24404
24392
|
verdict: "fail",
|
|
24405
|
-
|
|
24406
|
-
misses: [`llm-grader built-in evaluation failed: ${message}`],
|
|
24393
|
+
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
24407
24394
|
expectedAspectCount: 1,
|
|
24408
24395
|
evaluatorRawRequest,
|
|
24409
24396
|
details: { mode: "built-in", error: message }
|
|
@@ -24453,8 +24440,9 @@ ${context2.fileChanges}`;
|
|
|
24453
24440
|
return {
|
|
24454
24441
|
score: 0,
|
|
24455
24442
|
verdict: "fail",
|
|
24456
|
-
|
|
24457
|
-
|
|
24443
|
+
assertions: [
|
|
24444
|
+
{ text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
|
|
24445
|
+
],
|
|
24458
24446
|
expectedAspectCount: 1,
|
|
24459
24447
|
evaluatorRawRequest,
|
|
24460
24448
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
@@ -24472,8 +24460,9 @@ ${context2.fileChanges}`;
|
|
|
24472
24460
|
return {
|
|
24473
24461
|
score: 0,
|
|
24474
24462
|
verdict: "fail",
|
|
24475
|
-
|
|
24476
|
-
|
|
24463
|
+
assertions: [
|
|
24464
|
+
{ text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
|
|
24465
|
+
],
|
|
24477
24466
|
expectedAspectCount: 1,
|
|
24478
24467
|
evaluatorRawRequest,
|
|
24479
24468
|
details: {
|
|
@@ -24625,29 +24614,24 @@ ${outputSchema2}`;
|
|
|
24625
24614
|
const parsed = parseJsonFromText(text2);
|
|
24626
24615
|
if (rubrics && rubrics.length > 0) {
|
|
24627
24616
|
const data2 = rubricEvaluationSchema.parse(parsed);
|
|
24628
|
-
const { score: score2, verdict,
|
|
24617
|
+
const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
|
|
24629
24618
|
return {
|
|
24630
24619
|
score: score2,
|
|
24631
24620
|
verdict,
|
|
24632
|
-
|
|
24633
|
-
misses: misses2,
|
|
24621
|
+
assertions: assertions2,
|
|
24634
24622
|
expectedAspectCount: rubrics.length,
|
|
24635
|
-
reasoning: data2.overall_reasoning,
|
|
24636
24623
|
evaluatorRawRequest,
|
|
24637
24624
|
details
|
|
24638
24625
|
};
|
|
24639
24626
|
}
|
|
24640
24627
|
const data = freeformEvaluationSchema.parse(parsed);
|
|
24641
24628
|
const score = clampScore(data.score);
|
|
24642
|
-
const
|
|
24643
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
24629
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
24644
24630
|
return {
|
|
24645
24631
|
score,
|
|
24646
24632
|
verdict: scoreToVerdict(score),
|
|
24647
|
-
|
|
24648
|
-
|
|
24649
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
24650
|
-
reasoning: data.reasoning,
|
|
24633
|
+
assertions,
|
|
24634
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
24651
24635
|
evaluatorRawRequest,
|
|
24652
24636
|
details
|
|
24653
24637
|
};
|
|
@@ -24655,8 +24639,12 @@ ${outputSchema2}`;
|
|
|
24655
24639
|
return {
|
|
24656
24640
|
score: 0,
|
|
24657
24641
|
verdict: "fail",
|
|
24658
|
-
|
|
24659
|
-
|
|
24642
|
+
assertions: [
|
|
24643
|
+
{
|
|
24644
|
+
text: "Failed to parse llm-grader agent response as valid evaluation JSON",
|
|
24645
|
+
passed: false
|
|
24646
|
+
}
|
|
24647
|
+
],
|
|
24660
24648
|
expectedAspectCount: 1,
|
|
24661
24649
|
evaluatorRawRequest,
|
|
24662
24650
|
details
|
|
@@ -24785,9 +24773,13 @@ function buildOutputSchema() {
|
|
|
24785
24773
|
"",
|
|
24786
24774
|
"{",
|
|
24787
24775
|
' "score": <number between 0.0 and 1.0>,',
|
|
24788
|
-
' "
|
|
24789
|
-
|
|
24790
|
-
'
|
|
24776
|
+
' "assertions": [',
|
|
24777
|
+
" {",
|
|
24778
|
+
' "text": "<brief description of what was checked>",',
|
|
24779
|
+
' "passed": <boolean>,',
|
|
24780
|
+
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
|
|
24781
|
+
" }",
|
|
24782
|
+
" ]",
|
|
24791
24783
|
"}"
|
|
24792
24784
|
].join("\n");
|
|
24793
24785
|
}
|
|
@@ -24812,8 +24804,7 @@ function substituteVariables(template, variables) {
|
|
|
24812
24804
|
}
|
|
24813
24805
|
function calculateRubricScore(result, rubrics) {
|
|
24814
24806
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
24815
|
-
const
|
|
24816
|
-
const misses = [];
|
|
24807
|
+
const assertions = [];
|
|
24817
24808
|
let totalWeight = 0;
|
|
24818
24809
|
let earnedWeight = 0;
|
|
24819
24810
|
let failedRequired = false;
|
|
@@ -24823,19 +24814,20 @@ function calculateRubricScore(result, rubrics) {
|
|
|
24823
24814
|
continue;
|
|
24824
24815
|
}
|
|
24825
24816
|
totalWeight += rubric.weight;
|
|
24817
|
+
assertions.push({
|
|
24818
|
+
text: `[${rubric.id}] ${rubric.outcome}`,
|
|
24819
|
+
passed: check.satisfied,
|
|
24820
|
+
evidence: check.reasoning
|
|
24821
|
+
});
|
|
24826
24822
|
if (check.satisfied) {
|
|
24827
24823
|
earnedWeight += rubric.weight;
|
|
24828
|
-
|
|
24829
|
-
|
|
24830
|
-
misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
|
|
24831
|
-
if (rubric.required) {
|
|
24832
|
-
failedRequired = true;
|
|
24833
|
-
}
|
|
24824
|
+
} else if (rubric.required) {
|
|
24825
|
+
failedRequired = true;
|
|
24834
24826
|
}
|
|
24835
24827
|
}
|
|
24836
24828
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
24837
24829
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
24838
|
-
return { score, verdict,
|
|
24830
|
+
return { score, verdict, assertions };
|
|
24839
24831
|
}
|
|
24840
24832
|
function buildScoreRangeOutputSchema() {
|
|
24841
24833
|
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
@@ -24855,8 +24847,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
|
|
|
24855
24847
|
}
|
|
24856
24848
|
function calculateScoreRangeResult(result, rubrics) {
|
|
24857
24849
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
24858
|
-
const
|
|
24859
|
-
const misses = [];
|
|
24850
|
+
const assertions = [];
|
|
24860
24851
|
const rawScores = {};
|
|
24861
24852
|
let totalWeight = 0;
|
|
24862
24853
|
let weightedScoreSum = 0;
|
|
@@ -24882,24 +24873,22 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
24882
24873
|
);
|
|
24883
24874
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
24884
24875
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
24885
|
-
const
|
|
24886
|
-
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
24876
|
+
const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
|
|
24887
24877
|
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
24888
24878
|
failedRequired = true;
|
|
24889
|
-
misses.push(scoreInfo);
|
|
24890
|
-
} else if (rawScore >= 7) {
|
|
24891
|
-
hits.push(scoreInfo);
|
|
24892
|
-
} else {
|
|
24893
|
-
misses.push(scoreInfo);
|
|
24894
24879
|
}
|
|
24880
|
+
assertions.push({
|
|
24881
|
+
text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
|
|
24882
|
+
passed,
|
|
24883
|
+
evidence: check.reasoning
|
|
24884
|
+
});
|
|
24895
24885
|
}
|
|
24896
24886
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
24897
24887
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
24898
24888
|
return {
|
|
24899
24889
|
score,
|
|
24900
24890
|
verdict,
|
|
24901
|
-
|
|
24902
|
-
misses,
|
|
24891
|
+
assertions,
|
|
24903
24892
|
details: {
|
|
24904
24893
|
raw_scores: rawScores,
|
|
24905
24894
|
normalization: "score / 10",
|
|
@@ -25073,9 +25062,7 @@ var CompositeEvaluator = class {
|
|
|
25073
25062
|
let totalWeight = 0;
|
|
25074
25063
|
let weightedSum = 0;
|
|
25075
25064
|
let evaluatedCount = 0;
|
|
25076
|
-
const
|
|
25077
|
-
const allMisses = [];
|
|
25078
|
-
const reasoningParts = [];
|
|
25065
|
+
const allAssertions = [];
|
|
25079
25066
|
const scores = [];
|
|
25080
25067
|
for (const member of results) {
|
|
25081
25068
|
const weight = weights?.[member.id] ?? 1;
|
|
@@ -25085,9 +25072,7 @@ var CompositeEvaluator = class {
|
|
|
25085
25072
|
score: member.result.score,
|
|
25086
25073
|
weight,
|
|
25087
25074
|
verdict: member.result.verdict,
|
|
25088
|
-
|
|
25089
|
-
misses: [...member.result.misses],
|
|
25090
|
-
reasoning: member.result.reasoning,
|
|
25075
|
+
assertions: [...member.result.assertions],
|
|
25091
25076
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25092
25077
|
scores: member.result.scores,
|
|
25093
25078
|
details: member.result.details,
|
|
@@ -25099,20 +25084,16 @@ var CompositeEvaluator = class {
|
|
|
25099
25084
|
evaluatedCount++;
|
|
25100
25085
|
totalWeight += weight;
|
|
25101
25086
|
weightedSum += member.result.score * weight;
|
|
25102
|
-
|
|
25103
|
-
|
|
25104
|
-
|
|
25105
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
25106
|
-
}
|
|
25087
|
+
allAssertions.push(
|
|
25088
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
25089
|
+
);
|
|
25107
25090
|
}
|
|
25108
25091
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
25109
25092
|
return {
|
|
25110
25093
|
score: 0,
|
|
25111
25094
|
verdict: "skip",
|
|
25112
|
-
|
|
25113
|
-
misses: [],
|
|
25095
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
25114
25096
|
expectedAspectCount: 1,
|
|
25115
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
25116
25097
|
evaluatorRawRequest: {
|
|
25117
25098
|
aggregator: "weighted_average",
|
|
25118
25099
|
...weights ? { weights } : {}
|
|
@@ -25124,10 +25105,8 @@ var CompositeEvaluator = class {
|
|
|
25124
25105
|
return {
|
|
25125
25106
|
score: clampScore(finalScore),
|
|
25126
25107
|
verdict: scoreToVerdict(finalScore),
|
|
25127
|
-
|
|
25128
|
-
|
|
25129
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
25130
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
25108
|
+
assertions: allAssertions,
|
|
25109
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
25131
25110
|
evaluatorRawRequest: {
|
|
25132
25111
|
aggregator: "weighted_average",
|
|
25133
25112
|
...weights ? { weights } : {}
|
|
@@ -25137,11 +25116,8 @@ var CompositeEvaluator = class {
|
|
|
25137
25116
|
}
|
|
25138
25117
|
runThreshold(results, threshold) {
|
|
25139
25118
|
const scores = [];
|
|
25140
|
-
const
|
|
25141
|
-
const allMisses = [];
|
|
25142
|
-
const reasoningParts = [];
|
|
25119
|
+
const allAssertions = [];
|
|
25143
25120
|
let passingCount = 0;
|
|
25144
|
-
let borderlineCount = 0;
|
|
25145
25121
|
let evaluatedCount = 0;
|
|
25146
25122
|
for (const member of results) {
|
|
25147
25123
|
scores.push({
|
|
@@ -25149,9 +25125,7 @@ var CompositeEvaluator = class {
|
|
|
25149
25125
|
type: member.type,
|
|
25150
25126
|
score: member.result.score,
|
|
25151
25127
|
verdict: member.result.verdict,
|
|
25152
|
-
|
|
25153
|
-
misses: [...member.result.misses],
|
|
25154
|
-
reasoning: member.result.reasoning,
|
|
25128
|
+
assertions: [...member.result.assertions],
|
|
25155
25129
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25156
25130
|
scores: member.result.scores,
|
|
25157
25131
|
details: member.result.details,
|
|
@@ -25164,24 +25138,17 @@ var CompositeEvaluator = class {
|
|
|
25164
25138
|
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
25165
25139
|
if (isPassing) {
|
|
25166
25140
|
passingCount++;
|
|
25167
|
-
if (member.result.verdict === "borderline") {
|
|
25168
|
-
borderlineCount++;
|
|
25169
|
-
}
|
|
25170
|
-
}
|
|
25171
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
25172
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
25173
|
-
if (member.result.reasoning) {
|
|
25174
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
25175
25141
|
}
|
|
25142
|
+
allAssertions.push(
|
|
25143
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
25144
|
+
);
|
|
25176
25145
|
}
|
|
25177
25146
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
25178
25147
|
return {
|
|
25179
25148
|
score: 0,
|
|
25180
25149
|
verdict: "skip",
|
|
25181
|
-
|
|
25182
|
-
misses: [],
|
|
25150
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
25183
25151
|
expectedAspectCount: 1,
|
|
25184
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
25185
25152
|
evaluatorRawRequest: {
|
|
25186
25153
|
aggregator: "threshold",
|
|
25187
25154
|
threshold
|
|
@@ -25192,19 +25159,15 @@ var CompositeEvaluator = class {
|
|
|
25192
25159
|
const totalCount = evaluatedCount;
|
|
25193
25160
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
25194
25161
|
const pass = score >= threshold;
|
|
25195
|
-
|
|
25196
|
-
|
|
25197
|
-
|
|
25198
|
-
|
|
25199
|
-
`${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
|
|
25200
|
-
);
|
|
25162
|
+
allAssertions.unshift({
|
|
25163
|
+
text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
|
|
25164
|
+
passed: pass
|
|
25165
|
+
});
|
|
25201
25166
|
return {
|
|
25202
25167
|
score: clampScore(score),
|
|
25203
25168
|
verdict: pass ? "pass" : "fail",
|
|
25204
|
-
|
|
25205
|
-
|
|
25206
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
25207
|
-
reasoning: reasoningParts.join("; "),
|
|
25169
|
+
assertions: allAssertions,
|
|
25170
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
25208
25171
|
evaluatorRawRequest: {
|
|
25209
25172
|
aggregator: "threshold",
|
|
25210
25173
|
threshold
|
|
@@ -25221,9 +25184,7 @@ var CompositeEvaluator = class {
|
|
|
25221
25184
|
score: member.result.score,
|
|
25222
25185
|
weight: weights?.[member.id] ?? 1,
|
|
25223
25186
|
verdict: member.result.verdict,
|
|
25224
|
-
|
|
25225
|
-
misses: [...member.result.misses],
|
|
25226
|
-
reasoning: member.result.reasoning,
|
|
25187
|
+
assertions: [...member.result.assertions],
|
|
25227
25188
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25228
25189
|
scores: member.result.scores,
|
|
25229
25190
|
details: member.result.details
|
|
@@ -25232,17 +25193,19 @@ var CompositeEvaluator = class {
|
|
|
25232
25193
|
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
25233
25194
|
const parsed = parseJsonSafe(stdout);
|
|
25234
25195
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
25235
|
-
const
|
|
25236
|
-
|
|
25237
|
-
|
|
25196
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
25197
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
25198
|
+
).map((a) => ({
|
|
25199
|
+
text: String(a.text),
|
|
25200
|
+
passed: Boolean(a.passed),
|
|
25201
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
25202
|
+
})) : [];
|
|
25238
25203
|
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
25239
25204
|
return {
|
|
25240
25205
|
score,
|
|
25241
25206
|
verdict,
|
|
25242
|
-
|
|
25243
|
-
|
|
25244
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
25245
|
-
reasoning,
|
|
25207
|
+
assertions,
|
|
25208
|
+
expectedAspectCount: assertions.length || 1,
|
|
25246
25209
|
evaluatorRawRequest: {
|
|
25247
25210
|
aggregator: "code-grader",
|
|
25248
25211
|
script: scriptPath
|
|
@@ -25254,10 +25217,8 @@ var CompositeEvaluator = class {
|
|
|
25254
25217
|
return {
|
|
25255
25218
|
score: 0,
|
|
25256
25219
|
verdict: "fail",
|
|
25257
|
-
|
|
25258
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
25220
|
+
assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
|
|
25259
25221
|
expectedAspectCount: 1,
|
|
25260
|
-
reasoning: message,
|
|
25261
25222
|
evaluatorRawRequest: {
|
|
25262
25223
|
aggregator: "code-grader",
|
|
25263
25224
|
script: scriptPath,
|
|
@@ -25279,9 +25240,7 @@ var CompositeEvaluator = class {
|
|
|
25279
25240
|
type: member.type,
|
|
25280
25241
|
score: member.result.score,
|
|
25281
25242
|
verdict: member.result.verdict,
|
|
25282
|
-
|
|
25283
|
-
misses: [...member.result.misses],
|
|
25284
|
-
reasoning: member.result.reasoning,
|
|
25243
|
+
assertions: [...member.result.assertions],
|
|
25285
25244
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25286
25245
|
scores: member.result.scores,
|
|
25287
25246
|
details: member.result.details
|
|
@@ -25305,16 +25264,12 @@ var CompositeEvaluator = class {
|
|
|
25305
25264
|
});
|
|
25306
25265
|
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text2));
|
|
25307
25266
|
const score2 = clampScore(data2.score);
|
|
25308
|
-
const
|
|
25309
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
25310
|
-
const reasoning2 = data2.reasoning;
|
|
25267
|
+
const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
|
|
25311
25268
|
return {
|
|
25312
25269
|
score: score2,
|
|
25313
25270
|
verdict: scoreToVerdict(score2),
|
|
25314
|
-
|
|
25315
|
-
|
|
25316
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
25317
|
-
reasoning: reasoning2,
|
|
25271
|
+
assertions: assertions2,
|
|
25272
|
+
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
25318
25273
|
evaluatorRawRequest,
|
|
25319
25274
|
scores
|
|
25320
25275
|
};
|
|
@@ -25329,16 +25284,12 @@ var CompositeEvaluator = class {
|
|
|
25329
25284
|
parseJsonFromText(extractLastAssistantContent(response.output))
|
|
25330
25285
|
);
|
|
25331
25286
|
const score = clampScore(data.score);
|
|
25332
|
-
const
|
|
25333
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
25334
|
-
const reasoning = data.reasoning;
|
|
25287
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
25335
25288
|
return {
|
|
25336
25289
|
score,
|
|
25337
25290
|
verdict: scoreToVerdict(score),
|
|
25338
|
-
|
|
25339
|
-
|
|
25340
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
25341
|
-
reasoning,
|
|
25291
|
+
assertions,
|
|
25292
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
25342
25293
|
evaluatorRawRequest,
|
|
25343
25294
|
scores
|
|
25344
25295
|
};
|
|
@@ -25346,8 +25297,7 @@ var CompositeEvaluator = class {
|
|
|
25346
25297
|
return {
|
|
25347
25298
|
score: 0,
|
|
25348
25299
|
verdict: "fail",
|
|
25349
|
-
|
|
25350
|
-
misses: [],
|
|
25300
|
+
assertions: [{ text: "LLM aggregator failed", passed: false }],
|
|
25351
25301
|
expectedAspectCount: 1,
|
|
25352
25302
|
evaluatorRawRequest,
|
|
25353
25303
|
scores
|
|
@@ -25368,10 +25318,8 @@ var CostEvaluator = class {
|
|
|
25368
25318
|
return {
|
|
25369
25319
|
score: 0,
|
|
25370
25320
|
verdict: "fail",
|
|
25371
|
-
|
|
25372
|
-
misses: ["No cost data available in trace"],
|
|
25321
|
+
assertions: [{ text: "No cost data available in trace", passed: false }],
|
|
25373
25322
|
expectedAspectCount: 1,
|
|
25374
|
-
reasoning: "Execution cost not reported by provider",
|
|
25375
25323
|
evaluatorRawRequest: {
|
|
25376
25324
|
type: "cost",
|
|
25377
25325
|
budget,
|
|
@@ -25385,10 +25333,10 @@ var CostEvaluator = class {
|
|
|
25385
25333
|
return {
|
|
25386
25334
|
score,
|
|
25387
25335
|
verdict: passed ? "pass" : "fail",
|
|
25388
|
-
|
|
25389
|
-
|
|
25336
|
+
assertions: [
|
|
25337
|
+
passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
|
|
25338
|
+
],
|
|
25390
25339
|
expectedAspectCount: 1,
|
|
25391
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
25392
25340
|
evaluatorRawRequest: {
|
|
25393
25341
|
type: "cost",
|
|
25394
25342
|
budget,
|
|
@@ -25419,10 +25367,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
25419
25367
|
return {
|
|
25420
25368
|
score: 0,
|
|
25421
25369
|
verdict: "fail",
|
|
25422
|
-
|
|
25423
|
-
misses: ["No trace summary available"],
|
|
25370
|
+
assertions: [{ text: "No trace summary available", passed: false }],
|
|
25424
25371
|
expectedAspectCount: 1,
|
|
25425
|
-
reasoning: "Execution metrics not available - no trace summary provided",
|
|
25426
25372
|
evaluatorRawRequest: {
|
|
25427
25373
|
type: "execution-metrics",
|
|
25428
25374
|
config: this.extractConfiguredThresholds(),
|
|
@@ -25431,116 +25377,114 @@ var ExecutionMetricsEvaluator = class {
|
|
|
25431
25377
|
};
|
|
25432
25378
|
}
|
|
25433
25379
|
const narrowedTrace = trace2;
|
|
25434
|
-
const
|
|
25435
|
-
const misses = [];
|
|
25380
|
+
const assertions = [];
|
|
25436
25381
|
const actualMetrics = {};
|
|
25437
25382
|
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
25438
25383
|
const toolCalls = narrowedTrace.eventCount;
|
|
25439
25384
|
actualMetrics.tool_calls = toolCalls;
|
|
25440
25385
|
if (toolCalls <= max_tool_calls) {
|
|
25441
|
-
|
|
25386
|
+
assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
|
|
25442
25387
|
} else {
|
|
25443
|
-
|
|
25388
|
+
assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
|
|
25444
25389
|
}
|
|
25445
25390
|
}
|
|
25446
25391
|
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
25447
25392
|
const llmCalls = narrowedTrace.llmCallCount;
|
|
25448
25393
|
if (llmCalls === void 0) {
|
|
25449
|
-
|
|
25394
|
+
assertions.push({ text: "LLM call count data not available", passed: false });
|
|
25450
25395
|
} else {
|
|
25451
25396
|
actualMetrics.llm_calls = llmCalls;
|
|
25452
25397
|
if (llmCalls <= max_llm_calls) {
|
|
25453
|
-
|
|
25398
|
+
assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
|
|
25454
25399
|
} else {
|
|
25455
|
-
|
|
25400
|
+
assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
|
|
25456
25401
|
}
|
|
25457
25402
|
}
|
|
25458
25403
|
}
|
|
25459
25404
|
if (max_tokens !== void 0) {
|
|
25460
25405
|
if (!tokenUsage) {
|
|
25461
|
-
|
|
25406
|
+
assertions.push({ text: "Token usage data not available", passed: false });
|
|
25462
25407
|
} else {
|
|
25463
25408
|
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
25464
25409
|
actualMetrics.tokens = totalTokens;
|
|
25465
25410
|
if (totalTokens <= max_tokens) {
|
|
25466
|
-
|
|
25411
|
+
assertions.push({
|
|
25412
|
+
text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
|
|
25413
|
+
passed: true
|
|
25414
|
+
});
|
|
25467
25415
|
} else {
|
|
25468
|
-
|
|
25416
|
+
assertions.push({
|
|
25417
|
+
text: `Total tokens ${totalTokens} > ${max_tokens} max`,
|
|
25418
|
+
passed: false
|
|
25419
|
+
});
|
|
25469
25420
|
}
|
|
25470
25421
|
}
|
|
25471
25422
|
}
|
|
25472
25423
|
if (max_cost_usd !== void 0) {
|
|
25473
25424
|
if (costUsd === void 0) {
|
|
25474
|
-
|
|
25425
|
+
assertions.push({ text: "Cost data not available", passed: false });
|
|
25475
25426
|
} else {
|
|
25476
25427
|
actualMetrics.cost_usd = costUsd;
|
|
25477
25428
|
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
25478
25429
|
if (costUsd <= max_cost_usd) {
|
|
25479
|
-
|
|
25430
|
+
assertions.push({
|
|
25431
|
+
text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
|
|
25432
|
+
passed: true
|
|
25433
|
+
});
|
|
25480
25434
|
} else {
|
|
25481
|
-
|
|
25435
|
+
assertions.push({
|
|
25436
|
+
text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
|
|
25437
|
+
passed: false
|
|
25438
|
+
});
|
|
25482
25439
|
}
|
|
25483
25440
|
}
|
|
25484
25441
|
}
|
|
25485
25442
|
if (max_duration_ms !== void 0) {
|
|
25486
25443
|
if (durationMs === void 0) {
|
|
25487
|
-
|
|
25444
|
+
assertions.push({ text: "Duration data not available", passed: false });
|
|
25488
25445
|
} else {
|
|
25489
25446
|
actualMetrics.duration_ms = durationMs;
|
|
25490
25447
|
if (durationMs <= max_duration_ms) {
|
|
25491
|
-
|
|
25448
|
+
assertions.push({
|
|
25449
|
+
text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
|
|
25450
|
+
passed: true
|
|
25451
|
+
});
|
|
25492
25452
|
} else {
|
|
25493
|
-
|
|
25453
|
+
assertions.push({
|
|
25454
|
+
text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
|
|
25455
|
+
passed: false
|
|
25456
|
+
});
|
|
25494
25457
|
}
|
|
25495
25458
|
}
|
|
25496
25459
|
}
|
|
25497
25460
|
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
25498
25461
|
const ratio = explorationRatio(narrowedTrace);
|
|
25499
25462
|
if (ratio === void 0) {
|
|
25500
|
-
|
|
25463
|
+
assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
|
|
25501
25464
|
} else {
|
|
25502
25465
|
actualMetrics.exploration_ratio = ratio;
|
|
25503
25466
|
const diff = Math.abs(ratio - target_exploration_ratio);
|
|
25504
25467
|
if (diff <= exploration_tolerance) {
|
|
25505
|
-
|
|
25506
|
-
`Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}
|
|
25507
|
-
|
|
25468
|
+
assertions.push({
|
|
25469
|
+
text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
|
|
25470
|
+
passed: true
|
|
25471
|
+
});
|
|
25508
25472
|
} else {
|
|
25509
|
-
|
|
25510
|
-
`Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})
|
|
25511
|
-
|
|
25473
|
+
assertions.push({
|
|
25474
|
+
text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
|
|
25475
|
+
passed: false
|
|
25476
|
+
});
|
|
25512
25477
|
}
|
|
25513
25478
|
}
|
|
25514
25479
|
}
|
|
25515
|
-
const totalChecks =
|
|
25516
|
-
const
|
|
25517
|
-
const
|
|
25518
|
-
if (actualMetrics.tool_calls !== void 0) {
|
|
25519
|
-
reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
|
|
25520
|
-
}
|
|
25521
|
-
if (actualMetrics.llm_calls !== void 0) {
|
|
25522
|
-
reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
|
|
25523
|
-
}
|
|
25524
|
-
if (actualMetrics.tokens !== void 0) {
|
|
25525
|
-
reasoningParts.push(`tokens=${actualMetrics.tokens}`);
|
|
25526
|
-
}
|
|
25527
|
-
if (actualMetrics.cost_usd !== void 0) {
|
|
25528
|
-
reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
|
|
25529
|
-
}
|
|
25530
|
-
if (actualMetrics.duration_ms !== void 0) {
|
|
25531
|
-
reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
|
|
25532
|
-
}
|
|
25533
|
-
if (actualMetrics.exploration_ratio !== void 0) {
|
|
25534
|
-
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
25535
|
-
}
|
|
25536
|
-
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
25480
|
+
const totalChecks = assertions.length;
|
|
25481
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
25482
|
+
const score = totalChecks > 0 ? passedCount / totalChecks : 0;
|
|
25537
25483
|
return {
|
|
25538
25484
|
score,
|
|
25539
25485
|
verdict: scoreToVerdict(score),
|
|
25540
|
-
|
|
25541
|
-
misses,
|
|
25486
|
+
assertions,
|
|
25542
25487
|
expectedAspectCount: totalChecks || 1,
|
|
25543
|
-
reasoning,
|
|
25544
25488
|
evaluatorRawRequest: {
|
|
25545
25489
|
type: "execution-metrics",
|
|
25546
25490
|
config: this.extractConfiguredThresholds(),
|
|
@@ -25642,10 +25586,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
25642
25586
|
return {
|
|
25643
25587
|
score: 0,
|
|
25644
25588
|
verdict: "fail",
|
|
25645
|
-
|
|
25646
|
-
|
|
25647
|
-
expectedAspectCount: this.config.fields.length,
|
|
25648
|
-
reasoning: "Candidate answer is not valid JSON"
|
|
25589
|
+
assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
|
|
25590
|
+
expectedAspectCount: this.config.fields.length
|
|
25649
25591
|
};
|
|
25650
25592
|
}
|
|
25651
25593
|
const expectedData = this.extractExpectedData(evalCase.expected_output);
|
|
@@ -25653,10 +25595,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
25653
25595
|
return {
|
|
25654
25596
|
score: 0,
|
|
25655
25597
|
verdict: "fail",
|
|
25656
|
-
|
|
25657
|
-
|
|
25658
|
-
expectedAspectCount: this.config.fields.length,
|
|
25659
|
-
reasoning: "Could not extract expected data from expected_output"
|
|
25598
|
+
assertions: [{ text: "No expected data found in expected_output", passed: false }],
|
|
25599
|
+
expectedAspectCount: this.config.fields.length
|
|
25660
25600
|
};
|
|
25661
25601
|
}
|
|
25662
25602
|
const fieldResults = [];
|
|
@@ -25874,18 +25814,14 @@ var FieldAccuracyEvaluator = class {
|
|
|
25874
25814
|
*/
|
|
25875
25815
|
aggregateResults(results) {
|
|
25876
25816
|
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
25877
|
-
const
|
|
25878
|
-
const misses = [];
|
|
25817
|
+
const assertions = [];
|
|
25879
25818
|
for (const result of results) {
|
|
25880
|
-
|
|
25881
|
-
hits.push(result.message);
|
|
25882
|
-
} else {
|
|
25883
|
-
misses.push(result.message);
|
|
25884
|
-
}
|
|
25819
|
+
assertions.push({ text: result.message, passed: result.hit });
|
|
25885
25820
|
}
|
|
25886
25821
|
let score;
|
|
25887
25822
|
if (aggregation === "all_or_nothing") {
|
|
25888
|
-
|
|
25823
|
+
const hasFailed = assertions.some((a) => !a.passed);
|
|
25824
|
+
score = hasFailed ? 0 : 1;
|
|
25889
25825
|
} else {
|
|
25890
25826
|
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
25891
25827
|
if (totalWeight === 0) {
|
|
@@ -25895,15 +25831,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
25895
25831
|
score = weightedSum / totalWeight;
|
|
25896
25832
|
}
|
|
25897
25833
|
}
|
|
25898
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
25899
25834
|
return {
|
|
25900
25835
|
score: clampScore(score),
|
|
25901
25836
|
verdict: scoreToVerdict(score),
|
|
25902
|
-
|
|
25903
|
-
|
|
25904
|
-
misses: misses.slice(0, 4),
|
|
25905
|
-
expectedAspectCount: results.length,
|
|
25906
|
-
reasoning
|
|
25837
|
+
assertions,
|
|
25838
|
+
expectedAspectCount: results.length
|
|
25907
25839
|
};
|
|
25908
25840
|
}
|
|
25909
25841
|
};
|
|
@@ -26010,10 +25942,8 @@ var LatencyEvaluator = class {
|
|
|
26010
25942
|
return {
|
|
26011
25943
|
score: 0,
|
|
26012
25944
|
verdict: "fail",
|
|
26013
|
-
|
|
26014
|
-
misses: ["No duration data available in trace"],
|
|
25945
|
+
assertions: [{ text: "No duration data available in trace", passed: false }],
|
|
26015
25946
|
expectedAspectCount: 1,
|
|
26016
|
-
reasoning: "Execution duration not reported by provider",
|
|
26017
25947
|
evaluatorRawRequest: {
|
|
26018
25948
|
type: "latency",
|
|
26019
25949
|
threshold,
|
|
@@ -26026,10 +25956,10 @@ var LatencyEvaluator = class {
|
|
|
26026
25956
|
return {
|
|
26027
25957
|
score,
|
|
26028
25958
|
verdict: passed ? "pass" : "fail",
|
|
26029
|
-
|
|
26030
|
-
|
|
25959
|
+
assertions: [
|
|
25960
|
+
passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
|
|
25961
|
+
],
|
|
26031
25962
|
expectedAspectCount: 1,
|
|
26032
|
-
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
26033
25963
|
evaluatorRawRequest: {
|
|
26034
25964
|
type: "latency",
|
|
26035
25965
|
threshold,
|
|
@@ -26103,23 +26033,25 @@ var SkillTriggerEvaluator = class {
|
|
|
26103
26033
|
return {
|
|
26104
26034
|
score: 1,
|
|
26105
26035
|
verdict: "pass",
|
|
26106
|
-
|
|
26107
|
-
|
|
26036
|
+
assertions: [
|
|
26037
|
+
{
|
|
26038
|
+
text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
|
|
26039
|
+
passed: true
|
|
26040
|
+
}
|
|
26108
26041
|
],
|
|
26109
|
-
|
|
26110
|
-
expectedAspectCount: 1,
|
|
26111
|
-
reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
|
|
26042
|
+
expectedAspectCount: 1
|
|
26112
26043
|
};
|
|
26113
26044
|
}
|
|
26114
26045
|
return {
|
|
26115
26046
|
score: 0,
|
|
26116
26047
|
verdict: "fail",
|
|
26117
|
-
|
|
26118
|
-
|
|
26119
|
-
|
|
26048
|
+
assertions: [
|
|
26049
|
+
{
|
|
26050
|
+
text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
|
|
26051
|
+
passed: false
|
|
26052
|
+
}
|
|
26120
26053
|
],
|
|
26121
|
-
expectedAspectCount: 1
|
|
26122
|
-
reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
|
|
26054
|
+
expectedAspectCount: 1
|
|
26123
26055
|
};
|
|
26124
26056
|
}
|
|
26125
26057
|
};
|
|
@@ -26284,10 +26216,8 @@ var TokenUsageEvaluator = class {
|
|
|
26284
26216
|
return {
|
|
26285
26217
|
score: 0,
|
|
26286
26218
|
verdict: "fail",
|
|
26287
|
-
|
|
26288
|
-
misses: ["No token usage data available in trace"],
|
|
26219
|
+
assertions: [{ text: "No token usage data available in trace", passed: false }],
|
|
26289
26220
|
expectedAspectCount,
|
|
26290
|
-
reasoning: "Token usage not reported by provider",
|
|
26291
26221
|
evaluatorRawRequest: {
|
|
26292
26222
|
type: "token-usage",
|
|
26293
26223
|
max_total: maxTotal ?? null,
|
|
@@ -26301,37 +26231,34 @@ var TokenUsageEvaluator = class {
|
|
|
26301
26231
|
const output = usage.output;
|
|
26302
26232
|
const cached = usage.cached ?? 0;
|
|
26303
26233
|
const total = input + output + cached;
|
|
26304
|
-
const
|
|
26305
|
-
const misses = [];
|
|
26234
|
+
const assertions = [];
|
|
26306
26235
|
if (typeof maxInput === "number") {
|
|
26307
26236
|
if (input <= maxInput) {
|
|
26308
|
-
|
|
26237
|
+
assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
|
|
26309
26238
|
} else {
|
|
26310
|
-
|
|
26239
|
+
assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
|
|
26311
26240
|
}
|
|
26312
26241
|
}
|
|
26313
26242
|
if (typeof maxOutput === "number") {
|
|
26314
26243
|
if (output <= maxOutput) {
|
|
26315
|
-
|
|
26244
|
+
assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
|
|
26316
26245
|
} else {
|
|
26317
|
-
|
|
26246
|
+
assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
|
|
26318
26247
|
}
|
|
26319
26248
|
}
|
|
26320
26249
|
if (typeof maxTotal === "number") {
|
|
26321
26250
|
if (total <= maxTotal) {
|
|
26322
|
-
|
|
26251
|
+
assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
|
|
26323
26252
|
} else {
|
|
26324
|
-
|
|
26253
|
+
assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
|
|
26325
26254
|
}
|
|
26326
26255
|
}
|
|
26327
|
-
const passed =
|
|
26256
|
+
const passed = assertions.every((a) => a.passed);
|
|
26328
26257
|
return {
|
|
26329
26258
|
score: passed ? 1 : 0,
|
|
26330
26259
|
verdict: passed ? "pass" : "fail",
|
|
26331
|
-
|
|
26332
|
-
misses,
|
|
26260
|
+
assertions,
|
|
26333
26261
|
expectedAspectCount,
|
|
26334
|
-
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
26335
26262
|
evaluatorRawRequest: {
|
|
26336
26263
|
type: "token-usage",
|
|
26337
26264
|
max_total: maxTotal ?? null,
|
|
@@ -26429,8 +26356,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26429
26356
|
return {
|
|
26430
26357
|
score: 0,
|
|
26431
26358
|
verdict: "fail",
|
|
26432
|
-
|
|
26433
|
-
misses: ["No trace available for evaluation"],
|
|
26359
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
26434
26360
|
expectedAspectCount: 1
|
|
26435
26361
|
};
|
|
26436
26362
|
}
|
|
@@ -26441,8 +26367,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26441
26367
|
return {
|
|
26442
26368
|
score: 0,
|
|
26443
26369
|
verdict: "fail",
|
|
26444
|
-
|
|
26445
|
-
misses: ["No trace available for evaluation"],
|
|
26370
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
26446
26371
|
expectedAspectCount: 1
|
|
26447
26372
|
};
|
|
26448
26373
|
}
|
|
@@ -26460,8 +26385,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26460
26385
|
return {
|
|
26461
26386
|
score: 0,
|
|
26462
26387
|
verdict: "fail",
|
|
26463
|
-
|
|
26464
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
26388
|
+
assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
|
|
26465
26389
|
expectedAspectCount: 1
|
|
26466
26390
|
};
|
|
26467
26391
|
}
|
|
@@ -26510,28 +26434,32 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26510
26434
|
return {
|
|
26511
26435
|
score: 1,
|
|
26512
26436
|
verdict: "pass",
|
|
26513
|
-
|
|
26514
|
-
misses: [],
|
|
26437
|
+
assertions: [{ text: "No tool requirements specified", passed: true }],
|
|
26515
26438
|
expectedAspectCount: 0
|
|
26516
26439
|
};
|
|
26517
26440
|
}
|
|
26518
|
-
const
|
|
26519
|
-
const misses = [];
|
|
26441
|
+
const assertions = [];
|
|
26520
26442
|
for (const toolName of toolNames) {
|
|
26521
26443
|
const required = minimums[toolName];
|
|
26522
26444
|
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
26523
26445
|
if (actual >= required) {
|
|
26524
|
-
|
|
26446
|
+
assertions.push({
|
|
26447
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
26448
|
+
passed: true
|
|
26449
|
+
});
|
|
26525
26450
|
} else {
|
|
26526
|
-
|
|
26451
|
+
assertions.push({
|
|
26452
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
26453
|
+
passed: false
|
|
26454
|
+
});
|
|
26527
26455
|
}
|
|
26528
26456
|
}
|
|
26529
|
-
const
|
|
26457
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
26458
|
+
const score = passedCount / toolNames.length;
|
|
26530
26459
|
return {
|
|
26531
26460
|
score,
|
|
26532
26461
|
verdict: scoreToVerdict(score),
|
|
26533
|
-
|
|
26534
|
-
misses,
|
|
26462
|
+
assertions,
|
|
26535
26463
|
expectedAspectCount: toolNames.length
|
|
26536
26464
|
};
|
|
26537
26465
|
}
|
|
@@ -26541,13 +26469,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26541
26469
|
return {
|
|
26542
26470
|
score: 1,
|
|
26543
26471
|
verdict: "pass",
|
|
26544
|
-
|
|
26545
|
-
misses: [],
|
|
26472
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
26546
26473
|
expectedAspectCount: 0
|
|
26547
26474
|
};
|
|
26548
26475
|
}
|
|
26549
|
-
const
|
|
26550
|
-
const misses = [];
|
|
26476
|
+
const assertions = [];
|
|
26551
26477
|
const warnings = [];
|
|
26552
26478
|
let actualIndex = 0;
|
|
26553
26479
|
let sequenceHits = 0;
|
|
@@ -26567,16 +26493,20 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26567
26493
|
const actualCall = toolCalls[actualIndex];
|
|
26568
26494
|
if (actualCall.name === expectedTool) {
|
|
26569
26495
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
26570
|
-
|
|
26496
|
+
assertions.push({
|
|
26497
|
+
text: `Found ${expectedTool} at position ${actualIndex}`,
|
|
26498
|
+
passed: true
|
|
26499
|
+
});
|
|
26571
26500
|
sequenceHits++;
|
|
26572
26501
|
matchedCall = actualCall;
|
|
26573
26502
|
actualIndex++;
|
|
26574
26503
|
found = true;
|
|
26575
26504
|
break;
|
|
26576
26505
|
}
|
|
26577
|
-
|
|
26578
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch
|
|
26579
|
-
|
|
26506
|
+
assertions.push({
|
|
26507
|
+
text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
|
|
26508
|
+
passed: false
|
|
26509
|
+
});
|
|
26580
26510
|
actualIndex++;
|
|
26581
26511
|
argsMismatch = true;
|
|
26582
26512
|
break;
|
|
@@ -26584,7 +26514,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26584
26514
|
actualIndex++;
|
|
26585
26515
|
}
|
|
26586
26516
|
if (!found && !argsMismatch) {
|
|
26587
|
-
|
|
26517
|
+
assertions.push({
|
|
26518
|
+
text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
|
|
26519
|
+
passed: false
|
|
26520
|
+
});
|
|
26588
26521
|
}
|
|
26589
26522
|
if (found && matchedCall) {
|
|
26590
26523
|
const latencyResult = checkLatency(
|
|
@@ -26593,10 +26526,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26593
26526
|
matchedCall.durationMs
|
|
26594
26527
|
);
|
|
26595
26528
|
if (latencyResult.status === "pass") {
|
|
26596
|
-
|
|
26529
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
26597
26530
|
latencyHits++;
|
|
26598
26531
|
} else if (latencyResult.status === "fail") {
|
|
26599
|
-
|
|
26532
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
26600
26533
|
} else if (latencyResult.message) {
|
|
26601
26534
|
warnings.push(latencyResult.message);
|
|
26602
26535
|
latencySkips++;
|
|
@@ -26612,8 +26545,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26612
26545
|
return {
|
|
26613
26546
|
score,
|
|
26614
26547
|
verdict: scoreToVerdict(score),
|
|
26615
|
-
|
|
26616
|
-
misses,
|
|
26548
|
+
assertions,
|
|
26617
26549
|
expectedAspectCount: totalAssertions
|
|
26618
26550
|
};
|
|
26619
26551
|
}
|
|
@@ -26623,13 +26555,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26623
26555
|
return {
|
|
26624
26556
|
score: 1,
|
|
26625
26557
|
verdict: "pass",
|
|
26626
|
-
|
|
26627
|
-
misses: [],
|
|
26558
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
26628
26559
|
expectedAspectCount: 0
|
|
26629
26560
|
};
|
|
26630
26561
|
}
|
|
26631
|
-
const
|
|
26632
|
-
const misses = [];
|
|
26562
|
+
const assertions = [];
|
|
26633
26563
|
const warnings = [];
|
|
26634
26564
|
let sequenceHits = 0;
|
|
26635
26565
|
let latencyHits = 0;
|
|
@@ -26638,7 +26568,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26638
26568
|
(item) => item.maxDurationMs !== void 0
|
|
26639
26569
|
).length;
|
|
26640
26570
|
if (toolCalls.length !== expected.length) {
|
|
26641
|
-
|
|
26571
|
+
assertions.push({
|
|
26572
|
+
text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
|
|
26573
|
+
passed: false
|
|
26574
|
+
});
|
|
26642
26575
|
}
|
|
26643
26576
|
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
26644
26577
|
for (let i = 0; i < checkLength; i++) {
|
|
@@ -26650,14 +26583,17 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26650
26583
|
let sequenceMatched = false;
|
|
26651
26584
|
if (actualTool === expectedTool) {
|
|
26652
26585
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
26653
|
-
|
|
26586
|
+
assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
|
|
26654
26587
|
sequenceHits++;
|
|
26655
26588
|
sequenceMatched = true;
|
|
26656
26589
|
} else {
|
|
26657
|
-
|
|
26590
|
+
assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
|
|
26658
26591
|
}
|
|
26659
26592
|
} else {
|
|
26660
|
-
|
|
26593
|
+
assertions.push({
|
|
26594
|
+
text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
|
|
26595
|
+
passed: false
|
|
26596
|
+
});
|
|
26661
26597
|
}
|
|
26662
26598
|
if (sequenceMatched) {
|
|
26663
26599
|
const latencyResult = checkLatency(
|
|
@@ -26666,10 +26602,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26666
26602
|
actualCall.durationMs
|
|
26667
26603
|
);
|
|
26668
26604
|
if (latencyResult.status === "pass") {
|
|
26669
|
-
|
|
26605
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
26670
26606
|
latencyHits++;
|
|
26671
26607
|
} else if (latencyResult.status === "fail") {
|
|
26672
|
-
|
|
26608
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
26673
26609
|
} else if (latencyResult.message) {
|
|
26674
26610
|
warnings.push(latencyResult.message);
|
|
26675
26611
|
latencySkips++;
|
|
@@ -26677,7 +26613,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26677
26613
|
}
|
|
26678
26614
|
}
|
|
26679
26615
|
for (let i = checkLength; i < expected.length; i++) {
|
|
26680
|
-
|
|
26616
|
+
assertions.push({
|
|
26617
|
+
text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
|
|
26618
|
+
passed: false
|
|
26619
|
+
});
|
|
26681
26620
|
}
|
|
26682
26621
|
for (const warning of warnings) {
|
|
26683
26622
|
console.warn(`[tool-trajectory] ${warning}`);
|
|
@@ -26688,8 +26627,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26688
26627
|
return {
|
|
26689
26628
|
score,
|
|
26690
26629
|
verdict: scoreToVerdict(score),
|
|
26691
|
-
|
|
26692
|
-
misses,
|
|
26630
|
+
assertions,
|
|
26693
26631
|
expectedAspectCount: totalAssertions
|
|
26694
26632
|
};
|
|
26695
26633
|
}
|
|
@@ -26704,13 +26642,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26704
26642
|
return {
|
|
26705
26643
|
score: 1,
|
|
26706
26644
|
verdict: "pass",
|
|
26707
|
-
|
|
26708
|
-
misses: [],
|
|
26645
|
+
assertions: [{ text: "No expected tools specified", passed: true }],
|
|
26709
26646
|
expectedAspectCount: 0
|
|
26710
26647
|
};
|
|
26711
26648
|
}
|
|
26712
|
-
const
|
|
26713
|
-
const misses = [];
|
|
26649
|
+
const assertions = [];
|
|
26714
26650
|
const consumed = /* @__PURE__ */ new Set();
|
|
26715
26651
|
for (let i = 0; i < expected.length; i++) {
|
|
26716
26652
|
const expectedItem = expected[i];
|
|
@@ -26721,22 +26657,25 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26721
26657
|
if (consumed.has(j)) continue;
|
|
26722
26658
|
const actualCall = toolCalls[j];
|
|
26723
26659
|
if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
26724
|
-
|
|
26660
|
+
assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
|
|
26725
26661
|
consumed.add(j);
|
|
26726
26662
|
found = true;
|
|
26727
26663
|
break;
|
|
26728
26664
|
}
|
|
26729
26665
|
}
|
|
26730
26666
|
if (!found) {
|
|
26731
|
-
|
|
26667
|
+
assertions.push({
|
|
26668
|
+
text: `Expected ${expectedTool} not found in actual trajectory`,
|
|
26669
|
+
passed: false
|
|
26670
|
+
});
|
|
26732
26671
|
}
|
|
26733
26672
|
}
|
|
26734
|
-
const
|
|
26673
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
26674
|
+
const score = expected.length > 0 ? passedCount / expected.length : 1;
|
|
26735
26675
|
return {
|
|
26736
26676
|
score,
|
|
26737
26677
|
verdict: scoreToVerdict(score),
|
|
26738
|
-
|
|
26739
|
-
misses,
|
|
26678
|
+
assertions,
|
|
26740
26679
|
expectedAspectCount: expected.length
|
|
26741
26680
|
};
|
|
26742
26681
|
}
|
|
@@ -26752,16 +26691,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26752
26691
|
return {
|
|
26753
26692
|
score: 1,
|
|
26754
26693
|
verdict: "pass",
|
|
26755
|
-
|
|
26756
|
-
misses: [],
|
|
26694
|
+
assertions: [{ text: "No tool calls and no expected tools", passed: true }],
|
|
26757
26695
|
expectedAspectCount: 0
|
|
26758
26696
|
};
|
|
26759
26697
|
}
|
|
26760
26698
|
return {
|
|
26761
26699
|
score: 0,
|
|
26762
26700
|
verdict: "fail",
|
|
26763
|
-
|
|
26764
|
-
|
|
26701
|
+
assertions: [
|
|
26702
|
+
{
|
|
26703
|
+
text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
|
|
26704
|
+
passed: false
|
|
26705
|
+
}
|
|
26706
|
+
],
|
|
26765
26707
|
expectedAspectCount: toolCalls.length
|
|
26766
26708
|
};
|
|
26767
26709
|
}
|
|
@@ -26769,13 +26711,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26769
26711
|
return {
|
|
26770
26712
|
score: 1,
|
|
26771
26713
|
verdict: "pass",
|
|
26772
|
-
|
|
26773
|
-
misses: [],
|
|
26714
|
+
assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
|
|
26774
26715
|
expectedAspectCount: 0
|
|
26775
26716
|
};
|
|
26776
26717
|
}
|
|
26777
|
-
const
|
|
26778
|
-
const misses = [];
|
|
26718
|
+
const assertions = [];
|
|
26779
26719
|
for (let i = 0; i < toolCalls.length; i++) {
|
|
26780
26720
|
const actualCall = toolCalls[i];
|
|
26781
26721
|
let allowed = false;
|
|
@@ -26787,17 +26727,23 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26787
26727
|
}
|
|
26788
26728
|
}
|
|
26789
26729
|
if (allowed) {
|
|
26790
|
-
|
|
26730
|
+
assertions.push({
|
|
26731
|
+
text: `Position ${i}: ${actualCall.name} is in allowed set`,
|
|
26732
|
+
passed: true
|
|
26733
|
+
});
|
|
26791
26734
|
} else {
|
|
26792
|
-
|
|
26735
|
+
assertions.push({
|
|
26736
|
+
text: `Position ${i}: ${actualCall.name} is not in allowed set`,
|
|
26737
|
+
passed: false
|
|
26738
|
+
});
|
|
26793
26739
|
}
|
|
26794
26740
|
}
|
|
26795
|
-
const
|
|
26741
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
26742
|
+
const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
|
|
26796
26743
|
return {
|
|
26797
26744
|
score,
|
|
26798
26745
|
verdict: scoreToVerdict(score),
|
|
26799
|
-
|
|
26800
|
-
misses,
|
|
26746
|
+
assertions,
|
|
26801
26747
|
expectedAspectCount: toolCalls.length
|
|
26802
26748
|
};
|
|
26803
26749
|
}
|
|
@@ -26806,8 +26752,12 @@ function runContainsAssertion(output, value) {
|
|
|
26806
26752
|
const passed = output.includes(value);
|
|
26807
26753
|
return {
|
|
26808
26754
|
score: passed ? 1 : 0,
|
|
26809
|
-
|
|
26810
|
-
|
|
26755
|
+
assertions: [
|
|
26756
|
+
{
|
|
26757
|
+
text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
|
|
26758
|
+
passed
|
|
26759
|
+
}
|
|
26760
|
+
]
|
|
26811
26761
|
};
|
|
26812
26762
|
}
|
|
26813
26763
|
function runContainsAnyAssertion(output, values) {
|
|
@@ -26815,8 +26765,12 @@ function runContainsAnyAssertion(output, values) {
|
|
|
26815
26765
|
const passed = matched.length > 0;
|
|
26816
26766
|
return {
|
|
26817
26767
|
score: passed ? 1 : 0,
|
|
26818
|
-
|
|
26819
|
-
|
|
26768
|
+
assertions: [
|
|
26769
|
+
{
|
|
26770
|
+
text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
|
|
26771
|
+
passed
|
|
26772
|
+
}
|
|
26773
|
+
]
|
|
26820
26774
|
};
|
|
26821
26775
|
}
|
|
26822
26776
|
function runContainsAllAssertion(output, values) {
|
|
@@ -26824,16 +26778,24 @@ function runContainsAllAssertion(output, values) {
|
|
|
26824
26778
|
const passed = missing.length === 0;
|
|
26825
26779
|
return {
|
|
26826
26780
|
score: passed ? 1 : 0,
|
|
26827
|
-
|
|
26828
|
-
|
|
26781
|
+
assertions: [
|
|
26782
|
+
{
|
|
26783
|
+
text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
26784
|
+
passed
|
|
26785
|
+
}
|
|
26786
|
+
]
|
|
26829
26787
|
};
|
|
26830
26788
|
}
|
|
26831
26789
|
function runIcontainsAssertion(output, value) {
|
|
26832
26790
|
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
26833
26791
|
return {
|
|
26834
26792
|
score: passed ? 1 : 0,
|
|
26835
|
-
|
|
26836
|
-
|
|
26793
|
+
assertions: [
|
|
26794
|
+
{
|
|
26795
|
+
text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
|
|
26796
|
+
passed
|
|
26797
|
+
}
|
|
26798
|
+
]
|
|
26837
26799
|
};
|
|
26838
26800
|
}
|
|
26839
26801
|
function runIcontainsAnyAssertion(output, values) {
|
|
@@ -26842,9 +26804,11 @@ function runIcontainsAnyAssertion(output, values) {
|
|
|
26842
26804
|
const passed = matched.length > 0;
|
|
26843
26805
|
return {
|
|
26844
26806
|
score: passed ? 1 : 0,
|
|
26845
|
-
|
|
26846
|
-
|
|
26847
|
-
|
|
26807
|
+
assertions: [
|
|
26808
|
+
{
|
|
26809
|
+
text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
|
|
26810
|
+
passed
|
|
26811
|
+
}
|
|
26848
26812
|
]
|
|
26849
26813
|
};
|
|
26850
26814
|
}
|
|
@@ -26854,24 +26818,36 @@ function runIcontainsAllAssertion(output, values) {
|
|
|
26854
26818
|
const passed = missing.length === 0;
|
|
26855
26819
|
return {
|
|
26856
26820
|
score: passed ? 1 : 0,
|
|
26857
|
-
|
|
26858
|
-
|
|
26821
|
+
assertions: [
|
|
26822
|
+
{
|
|
26823
|
+
text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
26824
|
+
passed
|
|
26825
|
+
}
|
|
26826
|
+
]
|
|
26859
26827
|
};
|
|
26860
26828
|
}
|
|
26861
26829
|
function runStartsWithAssertion(output, value) {
|
|
26862
26830
|
const passed = output.trim().startsWith(value.trim());
|
|
26863
26831
|
return {
|
|
26864
26832
|
score: passed ? 1 : 0,
|
|
26865
|
-
|
|
26866
|
-
|
|
26833
|
+
assertions: [
|
|
26834
|
+
{
|
|
26835
|
+
text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
|
|
26836
|
+
passed
|
|
26837
|
+
}
|
|
26838
|
+
]
|
|
26867
26839
|
};
|
|
26868
26840
|
}
|
|
26869
26841
|
function runEndsWithAssertion(output, value) {
|
|
26870
26842
|
const passed = output.trim().endsWith(value.trim());
|
|
26871
26843
|
return {
|
|
26872
26844
|
score: passed ? 1 : 0,
|
|
26873
|
-
|
|
26874
|
-
|
|
26845
|
+
assertions: [
|
|
26846
|
+
{
|
|
26847
|
+
text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
|
|
26848
|
+
passed
|
|
26849
|
+
}
|
|
26850
|
+
]
|
|
26875
26851
|
};
|
|
26876
26852
|
}
|
|
26877
26853
|
function runRegexAssertion(output, pattern, flags) {
|
|
@@ -26880,8 +26856,12 @@ function runRegexAssertion(output, pattern, flags) {
|
|
|
26880
26856
|
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
26881
26857
|
return {
|
|
26882
26858
|
score: passed ? 1 : 0,
|
|
26883
|
-
|
|
26884
|
-
|
|
26859
|
+
assertions: [
|
|
26860
|
+
{
|
|
26861
|
+
text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
|
|
26862
|
+
passed
|
|
26863
|
+
}
|
|
26864
|
+
]
|
|
26885
26865
|
};
|
|
26886
26866
|
}
|
|
26887
26867
|
function runIsJsonAssertion(output) {
|
|
@@ -26893,16 +26873,24 @@ function runIsJsonAssertion(output) {
|
|
|
26893
26873
|
}
|
|
26894
26874
|
return {
|
|
26895
26875
|
score: passed ? 1 : 0,
|
|
26896
|
-
|
|
26897
|
-
|
|
26876
|
+
assertions: [
|
|
26877
|
+
{
|
|
26878
|
+
text: passed ? "Output is valid JSON" : "Output is not valid JSON",
|
|
26879
|
+
passed
|
|
26880
|
+
}
|
|
26881
|
+
]
|
|
26898
26882
|
};
|
|
26899
26883
|
}
|
|
26900
26884
|
function runEqualsAssertion(output, value) {
|
|
26901
26885
|
const passed = output.trim() === value.trim();
|
|
26902
26886
|
return {
|
|
26903
26887
|
score: passed ? 1 : 0,
|
|
26904
|
-
|
|
26905
|
-
|
|
26888
|
+
assertions: [
|
|
26889
|
+
{
|
|
26890
|
+
text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
|
|
26891
|
+
passed
|
|
26892
|
+
}
|
|
26893
|
+
]
|
|
26906
26894
|
};
|
|
26907
26895
|
}
|
|
26908
26896
|
var Node = class {
|
|
@@ -27101,10 +27089,8 @@ var InlineAssertEvaluator = class {
|
|
|
27101
27089
|
return {
|
|
27102
27090
|
score,
|
|
27103
27091
|
verdict: scoreToVerdict(score),
|
|
27104
|
-
|
|
27105
|
-
misses: score < 0.5 ? [result.name] : [],
|
|
27092
|
+
assertions: [{ text: result.name, passed: score >= 0.5 }],
|
|
27106
27093
|
expectedAspectCount: 1,
|
|
27107
|
-
reasoning: void 0,
|
|
27108
27094
|
details: result.metadata ? result.metadata : void 0
|
|
27109
27095
|
};
|
|
27110
27096
|
}
|
|
@@ -27292,9 +27278,7 @@ var containsFactory = (config) => {
|
|
|
27292
27278
|
return {
|
|
27293
27279
|
score: result.score,
|
|
27294
27280
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27295
|
-
|
|
27296
|
-
misses: result.misses,
|
|
27297
|
-
reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
|
|
27281
|
+
assertions: result.assertions,
|
|
27298
27282
|
expectedAspectCount: 1
|
|
27299
27283
|
};
|
|
27300
27284
|
});
|
|
@@ -27306,9 +27290,7 @@ var regexFactory = (config) => {
|
|
|
27306
27290
|
return {
|
|
27307
27291
|
score: result.score,
|
|
27308
27292
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27309
|
-
|
|
27310
|
-
misses: result.misses,
|
|
27311
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
27293
|
+
assertions: result.assertions,
|
|
27312
27294
|
expectedAspectCount: 1
|
|
27313
27295
|
};
|
|
27314
27296
|
});
|
|
@@ -27319,9 +27301,7 @@ var isJsonFactory = () => {
|
|
|
27319
27301
|
return {
|
|
27320
27302
|
score: result.score,
|
|
27321
27303
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27322
|
-
|
|
27323
|
-
misses: result.misses,
|
|
27324
|
-
reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
|
|
27304
|
+
assertions: result.assertions,
|
|
27325
27305
|
expectedAspectCount: 1
|
|
27326
27306
|
};
|
|
27327
27307
|
});
|
|
@@ -27333,9 +27313,7 @@ var equalsFactory = (config) => {
|
|
|
27333
27313
|
return {
|
|
27334
27314
|
score: result.score,
|
|
27335
27315
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27336
|
-
|
|
27337
|
-
misses: result.misses,
|
|
27338
|
-
reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
|
|
27316
|
+
assertions: result.assertions,
|
|
27339
27317
|
expectedAspectCount: 1
|
|
27340
27318
|
};
|
|
27341
27319
|
});
|
|
@@ -27347,9 +27325,7 @@ var containsAnyFactory = (config) => {
|
|
|
27347
27325
|
return {
|
|
27348
27326
|
score: result.score,
|
|
27349
27327
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27350
|
-
|
|
27351
|
-
misses: result.misses,
|
|
27352
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27328
|
+
assertions: result.assertions,
|
|
27353
27329
|
expectedAspectCount: 1
|
|
27354
27330
|
};
|
|
27355
27331
|
});
|
|
@@ -27361,9 +27337,7 @@ var containsAllFactory = (config) => {
|
|
|
27361
27337
|
return {
|
|
27362
27338
|
score: result.score,
|
|
27363
27339
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27364
|
-
|
|
27365
|
-
misses: result.misses,
|
|
27366
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27340
|
+
assertions: result.assertions,
|
|
27367
27341
|
expectedAspectCount: 1
|
|
27368
27342
|
};
|
|
27369
27343
|
});
|
|
@@ -27375,9 +27349,7 @@ var icontainsFactory = (config) => {
|
|
|
27375
27349
|
return {
|
|
27376
27350
|
score: result.score,
|
|
27377
27351
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27378
|
-
|
|
27379
|
-
misses: result.misses,
|
|
27380
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27352
|
+
assertions: result.assertions,
|
|
27381
27353
|
expectedAspectCount: 1
|
|
27382
27354
|
};
|
|
27383
27355
|
});
|
|
@@ -27389,9 +27361,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
27389
27361
|
return {
|
|
27390
27362
|
score: result.score,
|
|
27391
27363
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27392
|
-
|
|
27393
|
-
misses: result.misses,
|
|
27394
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27364
|
+
assertions: result.assertions,
|
|
27395
27365
|
expectedAspectCount: 1
|
|
27396
27366
|
};
|
|
27397
27367
|
});
|
|
@@ -27403,9 +27373,7 @@ var icontainsAllFactory = (config) => {
|
|
|
27403
27373
|
return {
|
|
27404
27374
|
score: result.score,
|
|
27405
27375
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27406
|
-
|
|
27407
|
-
misses: result.misses,
|
|
27408
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27376
|
+
assertions: result.assertions,
|
|
27409
27377
|
expectedAspectCount: 1
|
|
27410
27378
|
};
|
|
27411
27379
|
});
|
|
@@ -27417,9 +27385,7 @@ var startsWithFactory = (config) => {
|
|
|
27417
27385
|
return {
|
|
27418
27386
|
score: result.score,
|
|
27419
27387
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27420
|
-
|
|
27421
|
-
misses: result.misses,
|
|
27422
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27388
|
+
assertions: result.assertions,
|
|
27423
27389
|
expectedAspectCount: 1
|
|
27424
27390
|
};
|
|
27425
27391
|
});
|
|
@@ -27431,9 +27397,7 @@ var endsWithFactory = (config) => {
|
|
|
27431
27397
|
return {
|
|
27432
27398
|
score: result.score,
|
|
27433
27399
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27434
|
-
|
|
27435
|
-
misses: result.misses,
|
|
27436
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27400
|
+
assertions: result.assertions,
|
|
27437
27401
|
expectedAspectCount: 1
|
|
27438
27402
|
};
|
|
27439
27403
|
});
|
|
@@ -28462,7 +28426,7 @@ async function runEvaluation(options) {
|
|
|
28462
28426
|
if (!cliModel) {
|
|
28463
28427
|
throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
|
|
28464
28428
|
}
|
|
28465
|
-
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-
|
|
28429
|
+
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-NFFLXG5M-TJAWCWCX.js");
|
|
28466
28430
|
return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
|
|
28467
28431
|
}
|
|
28468
28432
|
const overrideTarget = resolveTargetByName(cliGraderTarget);
|
|
@@ -28797,8 +28761,7 @@ async function runEvaluation(options) {
|
|
|
28797
28761
|
testId: evalCase.id,
|
|
28798
28762
|
dataset: evalCase.dataset,
|
|
28799
28763
|
score: 0,
|
|
28800
|
-
|
|
28801
|
-
misses: [],
|
|
28764
|
+
assertions: [],
|
|
28802
28765
|
answer: "",
|
|
28803
28766
|
target: target.name,
|
|
28804
28767
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
@@ -28834,8 +28797,7 @@ async function runEvaluation(options) {
|
|
|
28834
28797
|
testId: evalCase.id,
|
|
28835
28798
|
dataset: evalCase.dataset,
|
|
28836
28799
|
score: 0,
|
|
28837
|
-
|
|
28838
|
-
misses: [],
|
|
28800
|
+
assertions: [],
|
|
28839
28801
|
answer: "",
|
|
28840
28802
|
target: target.name,
|
|
28841
28803
|
error: errorMsg,
|
|
@@ -29802,11 +29764,9 @@ async function evaluateCandidate(options) {
|
|
|
29802
29764
|
dataset: evalCase.dataset,
|
|
29803
29765
|
conversationId: evalCase.conversation_id,
|
|
29804
29766
|
score: score.score,
|
|
29805
|
-
|
|
29806
|
-
misses: score.misses,
|
|
29767
|
+
assertions: score.assertions,
|
|
29807
29768
|
answer: candidate,
|
|
29808
29769
|
target: target.name,
|
|
29809
|
-
reasoning: score.reasoning,
|
|
29810
29770
|
tokenUsage,
|
|
29811
29771
|
costUsd,
|
|
29812
29772
|
durationMs,
|
|
@@ -29980,9 +29940,7 @@ async function runEvaluatorList(options) {
|
|
|
29980
29940
|
score: score2.score,
|
|
29981
29941
|
weight,
|
|
29982
29942
|
verdict: score2.verdict,
|
|
29983
|
-
|
|
29984
|
-
misses: score2.misses,
|
|
29985
|
-
reasoning: score2.reasoning,
|
|
29943
|
+
assertions: score2.assertions,
|
|
29986
29944
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
29987
29945
|
details: score2.details,
|
|
29988
29946
|
scores: mapChildResults(score2.scores),
|
|
@@ -29997,10 +29955,10 @@ async function runEvaluatorList(options) {
|
|
|
29997
29955
|
const fallbackScore = {
|
|
29998
29956
|
score: 0,
|
|
29999
29957
|
verdict: "fail",
|
|
30000
|
-
|
|
30001
|
-
|
|
30002
|
-
|
|
30003
|
-
|
|
29958
|
+
assertions: [
|
|
29959
|
+
{ text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
|
|
29960
|
+
],
|
|
29961
|
+
expectedAspectCount: 1
|
|
30004
29962
|
};
|
|
30005
29963
|
const weight = evaluatorConfig.weight ?? 1;
|
|
30006
29964
|
scored.push({
|
|
@@ -30016,9 +29974,12 @@ async function runEvaluatorList(options) {
|
|
|
30016
29974
|
score: 0,
|
|
30017
29975
|
weight,
|
|
30018
29976
|
verdict: "fail",
|
|
30019
|
-
|
|
30020
|
-
|
|
30021
|
-
|
|
29977
|
+
assertions: [
|
|
29978
|
+
{
|
|
29979
|
+
text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
|
|
29980
|
+
passed: false
|
|
29981
|
+
}
|
|
29982
|
+
],
|
|
30022
29983
|
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
30023
29984
|
startedAt: startedAt.toISOString(),
|
|
30024
29985
|
endedAt: endedAt.toISOString()
|
|
@@ -30034,9 +29995,7 @@ async function runEvaluatorList(options) {
|
|
|
30034
29995
|
...scores[lastScoresIdx],
|
|
30035
29996
|
score: negated.score,
|
|
30036
29997
|
verdict: negated.verdict,
|
|
30037
|
-
|
|
30038
|
-
misses: [...negated.misses],
|
|
30039
|
-
reasoning: negated.reasoning
|
|
29998
|
+
assertions: [...negated.assertions]
|
|
30040
29999
|
};
|
|
30041
30000
|
}
|
|
30042
30001
|
}
|
|
@@ -30051,21 +30010,13 @@ async function runEvaluatorList(options) {
|
|
|
30051
30010
|
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
30052
30011
|
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
30053
30012
|
) : 0;
|
|
30054
|
-
const
|
|
30055
|
-
const
|
|
30056
|
-
const expectedAspectCount = scored.reduce(
|
|
30057
|
-
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
30058
|
-
0
|
|
30059
|
-
);
|
|
30060
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
30061
|
-
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
30013
|
+
const assertions = scored.flatMap((entry) => entry.score.assertions);
|
|
30014
|
+
const expectedAspectCount = assertions.length || 1;
|
|
30062
30015
|
const score = {
|
|
30063
30016
|
score: aggregateScore,
|
|
30064
30017
|
verdict: scoreToVerdict(aggregateScore),
|
|
30065
|
-
|
|
30066
|
-
|
|
30067
|
-
expectedAspectCount,
|
|
30068
|
-
reasoning
|
|
30018
|
+
assertions,
|
|
30019
|
+
expectedAspectCount
|
|
30069
30020
|
};
|
|
30070
30021
|
return { score, scores };
|
|
30071
30022
|
}
|
|
@@ -30169,8 +30120,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
30169
30120
|
dataset: evalCase.dataset,
|
|
30170
30121
|
conversationId: evalCase.conversation_id,
|
|
30171
30122
|
score: 0,
|
|
30172
|
-
|
|
30173
|
-
misses: [`Error: ${message}`],
|
|
30123
|
+
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
30174
30124
|
answer: `Error occurred: ${message}`,
|
|
30175
30125
|
target: targetName,
|
|
30176
30126
|
requests,
|
|
@@ -30280,9 +30230,7 @@ function mapChildResults(children) {
|
|
|
30280
30230
|
score: child.score,
|
|
30281
30231
|
weight: child.weight,
|
|
30282
30232
|
verdict: child.verdict,
|
|
30283
|
-
|
|
30284
|
-
misses: child.misses,
|
|
30285
|
-
reasoning: child.reasoning,
|
|
30233
|
+
assertions: child.assertions,
|
|
30286
30234
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
30287
30235
|
scores: mapChildResults(child.scores),
|
|
30288
30236
|
details: child.details,
|
|
@@ -31150,7 +31098,6 @@ export {
|
|
|
31150
31098
|
isJsonValue,
|
|
31151
31099
|
isTestMessage,
|
|
31152
31100
|
isEvaluatorKind,
|
|
31153
|
-
getHitCount,
|
|
31154
31101
|
fileExists,
|
|
31155
31102
|
normalizeLineEndings,
|
|
31156
31103
|
readTextFile,
|
|
@@ -31290,4 +31237,4 @@ export {
|
|
|
31290
31237
|
OtelStreamingObserver,
|
|
31291
31238
|
createAgentKernel
|
|
31292
31239
|
};
|
|
31293
|
-
//# sourceMappingURL=chunk-
|
|
31240
|
+
//# sourceMappingURL=chunk-D6G4N2H2.js.map
|