agentv 3.4.0 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -11
- package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js → agentv-provider-NFFLXG5M-TJAWCWCX.js} +2 -2
- package/dist/{chunk-AR3QEKXH.js → chunk-BJV6MDBE.js} +3 -3
- package/dist/{chunk-AR3QEKXH.js.map → chunk-BJV6MDBE.js.map} +1 -1
- package/dist/{chunk-A7ZDUB46.js → chunk-IP5BO54H.js} +35 -26
- package/dist/chunk-IP5BO54H.js.map +1 -0
- package/dist/{chunk-GOZV2HN2.js → chunk-K4RXLQWV.js} +453 -494
- package/dist/chunk-K4RXLQWV.js.map +1 -0
- package/dist/{chunk-RE5I3U2S.js → chunk-UU5N43YS.js} +27 -46
- package/dist/chunk-UU5N43YS.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/{dist-AFDYFH6Y.js → dist-VWEFBDZ5.js} +3 -5
- package/dist/index.js +4 -4
- package/dist/{interactive-WXXTZ7PD.js → interactive-5S4ILY2Y.js} +4 -4
- package/dist/templates/.agentv/.env.example +9 -11
- package/dist/templates/.agentv/config.yaml +0 -5
- package/dist/templates/.agentv/targets.yaml +16 -0
- package/package.json +1 -1
- package/dist/chunk-A7ZDUB46.js.map +0 -1
- package/dist/chunk-GOZV2HN2.js.map +0 -1
- package/dist/chunk-RE5I3U2S.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF-LUBMM7TH.js.map → agentv-provider-NFFLXG5M-TJAWCWCX.js.map} +0 -0
- /package/dist/{dist-AFDYFH6Y.js.map → dist-VWEFBDZ5.js.map} +0 -0
- /package/dist/{interactive-WXXTZ7PD.js.map → interactive-5S4ILY2Y.js.map} +0 -0
|
@@ -149,7 +149,7 @@ import {
|
|
|
149
149
|
withUserAgentSuffix,
|
|
150
150
|
withoutTrailingSlash,
|
|
151
151
|
zodSchema
|
|
152
|
-
} from "./chunk-AR3QEKXH.js";
|
|
152
|
+
} from "./chunk-BJV6MDBE.js";
|
|
153
153
|
import {
|
|
154
154
|
SpanStatusCode,
|
|
155
155
|
context,
|
|
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
|
|
|
301
301
|
}
|
|
302
302
|
});
|
|
303
303
|
|
|
304
|
-
// ../../packages/core/dist/chunk-
|
|
304
|
+
// ../../packages/core/dist/chunk-2IZOTQ25.js
|
|
305
305
|
import { constants } from "node:fs";
|
|
306
306
|
import { access, readFile } from "node:fs/promises";
|
|
307
307
|
import path from "node:path";
|
|
@@ -419,7 +419,7 @@ __export(external_exports2, {
|
|
|
419
419
|
void: () => voidType
|
|
420
420
|
});
|
|
421
421
|
|
|
422
|
-
// ../../packages/core/dist/chunk-
|
|
422
|
+
// ../../packages/core/dist/chunk-2IZOTQ25.js
|
|
423
423
|
var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
|
|
424
424
|
var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
|
|
425
425
|
var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
|
|
@@ -498,9 +498,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
|
498
498
|
function isEvaluatorKind(value) {
|
|
499
499
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
500
500
|
}
|
|
501
|
-
function getHitCount(result) {
|
|
502
|
-
return result.hits.length;
|
|
503
|
-
}
|
|
504
501
|
async function fileExists(filePath) {
|
|
505
502
|
try {
|
|
506
503
|
await access(filePath, constants.F_OK);
|
|
@@ -14658,14 +14655,8 @@ function logWarning(message) {
|
|
|
14658
14655
|
console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET3}`);
|
|
14659
14656
|
}
|
|
14660
14657
|
var TEMPLATE_VARIABLES = {
|
|
14661
|
-
/** @deprecated Use OUTPUT_TEXT instead */
|
|
14662
|
-
ANSWER: "answer",
|
|
14663
14658
|
EXPECTED_OUTPUT: "expected_output",
|
|
14664
|
-
/** @deprecated Use INPUT_TEXT instead */
|
|
14665
|
-
QUESTION: "question",
|
|
14666
14659
|
CRITERIA: "criteria",
|
|
14667
|
-
/** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
|
|
14668
|
-
REFERENCE_ANSWER: "reference_answer",
|
|
14669
14660
|
INPUT: "input",
|
|
14670
14661
|
OUTPUT: "output",
|
|
14671
14662
|
FILE_CHANGES: "file_changes",
|
|
@@ -14675,9 +14666,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
14675
14666
|
};
|
|
14676
14667
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
14677
14668
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
14678
|
-
TEMPLATE_VARIABLES.
|
|
14679
|
-
TEMPLATE_VARIABLES.EXPECTED_OUTPUT,
|
|
14680
|
-
TEMPLATE_VARIABLES.OUTPUT_TEXT
|
|
14669
|
+
TEMPLATE_VARIABLES.OUTPUT_TEXT,
|
|
14670
|
+
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
14681
14671
|
]);
|
|
14682
14672
|
var ANSI_YELLOW3 = "\x1B[33m";
|
|
14683
14673
|
var ANSI_RESET4 = "\x1B[0m";
|
|
@@ -14698,13 +14688,13 @@ function validateTemplateVariables(content, source) {
|
|
|
14698
14688
|
}
|
|
14699
14689
|
match = variablePattern.exec(content);
|
|
14700
14690
|
}
|
|
14701
|
-
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.
|
|
14691
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
14702
14692
|
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
|
|
14703
14693
|
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
|
|
14704
14694
|
if (!hasRequiredFields) {
|
|
14705
14695
|
throw new Error(
|
|
14706
14696
|
`Missing required fields. Must include at least one of:
|
|
14707
|
-
- {{ ${TEMPLATE_VARIABLES.
|
|
14697
|
+
- {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
|
|
14708
14698
|
- {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
|
|
14709
14699
|
);
|
|
14710
14700
|
}
|
|
@@ -17623,7 +17613,7 @@ var AzureProvider = class {
|
|
|
17623
17613
|
};
|
|
17624
17614
|
this.retryConfig = config.retry;
|
|
17625
17615
|
const azure = createAzure(buildAzureOptions(config));
|
|
17626
|
-
this.model = azure(config.deploymentName);
|
|
17616
|
+
this.model = azure.chat(config.deploymentName);
|
|
17627
17617
|
}
|
|
17628
17618
|
id;
|
|
17629
17619
|
kind = "azure";
|
|
@@ -17846,6 +17836,8 @@ async function invokeModel(options) {
|
|
|
17846
17836
|
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
17847
17837
|
const chatPrompt = buildChatPrompt(request);
|
|
17848
17838
|
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
17839
|
+
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
17840
|
+
const startMs = Date.now();
|
|
17849
17841
|
const result = await withRetry(
|
|
17850
17842
|
() => generateText({
|
|
17851
17843
|
model,
|
|
@@ -17859,9 +17851,11 @@ async function invokeModel(options) {
|
|
|
17859
17851
|
retryConfig,
|
|
17860
17852
|
request.signal
|
|
17861
17853
|
);
|
|
17862
|
-
|
|
17854
|
+
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
17855
|
+
const durationMs = Date.now() - startMs;
|
|
17856
|
+
return mapResponse(result, { durationMs, startTime, endTime });
|
|
17863
17857
|
}
|
|
17864
|
-
function mapResponse(result) {
|
|
17858
|
+
function mapResponse(result, timing) {
|
|
17865
17859
|
const content = result.text ?? "";
|
|
17866
17860
|
const rawUsage = result.totalUsage ?? result.usage;
|
|
17867
17861
|
const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
|
|
@@ -17876,7 +17870,10 @@ function mapResponse(result) {
|
|
|
17876
17870
|
raw: result,
|
|
17877
17871
|
usage: toJsonObject(rawUsage),
|
|
17878
17872
|
output: [{ role: "assistant", content }],
|
|
17879
|
-
tokenUsage
|
|
17873
|
+
tokenUsage,
|
|
17874
|
+
durationMs: timing?.durationMs,
|
|
17875
|
+
startTime: timing?.startTime,
|
|
17876
|
+
endTime: timing?.endTime
|
|
17880
17877
|
};
|
|
17881
17878
|
}
|
|
17882
17879
|
function toJsonObject(value) {
|
|
@@ -18734,10 +18731,12 @@ var ClaudeSdkProvider = class {
|
|
|
18734
18731
|
if (usage) {
|
|
18735
18732
|
const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
|
|
18736
18733
|
const outputTokens = usage.output_tokens ?? 0;
|
|
18734
|
+
const reasoningTokens = usage.reasoning_tokens ?? void 0;
|
|
18737
18735
|
tokenUsage = {
|
|
18738
18736
|
input: inputTokens,
|
|
18739
18737
|
output: outputTokens,
|
|
18740
|
-
cached: usage.cache_read_input_tokens ?? void 0
|
|
18738
|
+
cached: usage.cache_read_input_tokens ?? void 0,
|
|
18739
|
+
reasoning: reasoningTokens
|
|
18741
18740
|
};
|
|
18742
18741
|
request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
|
|
18743
18742
|
}
|
|
@@ -19733,7 +19732,8 @@ ${basePrompt}` : basePrompt;
|
|
|
19733
19732
|
onUsage({
|
|
19734
19733
|
input: usage.input_tokens ?? 0,
|
|
19735
19734
|
output: usage.output_tokens ?? 0,
|
|
19736
|
-
cached: usage.cached_input_tokens ?? void 0
|
|
19735
|
+
cached: usage.cached_input_tokens ?? void 0,
|
|
19736
|
+
reasoning: usage.reasoning_tokens ?? void 0
|
|
19737
19737
|
});
|
|
19738
19738
|
}
|
|
19739
19739
|
}
|
|
@@ -21701,10 +21701,12 @@ function extractTokenUsage(events) {
|
|
|
21701
21701
|
output: output ?? 0
|
|
21702
21702
|
};
|
|
21703
21703
|
const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
|
|
21704
|
-
|
|
21705
|
-
|
|
21706
|
-
|
|
21707
|
-
|
|
21704
|
+
const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
|
|
21705
|
+
return {
|
|
21706
|
+
...result,
|
|
21707
|
+
...cached !== void 0 ? { cached } : {},
|
|
21708
|
+
...reasoning !== void 0 ? { reasoning } : {}
|
|
21709
|
+
};
|
|
21708
21710
|
}
|
|
21709
21711
|
}
|
|
21710
21712
|
const messages = record.messages;
|
|
@@ -23483,9 +23485,11 @@ function negateScore(score) {
|
|
|
23483
23485
|
...score,
|
|
23484
23486
|
score: negatedScore,
|
|
23485
23487
|
verdict: negatedVerdict,
|
|
23486
|
-
|
|
23487
|
-
|
|
23488
|
-
|
|
23488
|
+
assertions: score.assertions.map((a) => ({
|
|
23489
|
+
...a,
|
|
23490
|
+
passed: !a.passed,
|
|
23491
|
+
evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
|
|
23492
|
+
}))
|
|
23489
23493
|
};
|
|
23490
23494
|
}
|
|
23491
23495
|
function shellEscapePath(value) {
|
|
@@ -23928,11 +23932,9 @@ var CodeEvaluator = class {
|
|
|
23928
23932
|
}
|
|
23929
23933
|
}
|
|
23930
23934
|
const payload = {
|
|
23931
|
-
question: context2.evalCase.question,
|
|
23932
23935
|
criteria: context2.evalCase.criteria,
|
|
23933
23936
|
expectedOutput: context2.evalCase.expected_output,
|
|
23934
|
-
|
|
23935
|
-
answer: context2.candidate,
|
|
23937
|
+
outputText: context2.candidate,
|
|
23936
23938
|
output: outputForPayload,
|
|
23937
23939
|
outputPath,
|
|
23938
23940
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
@@ -23949,9 +23951,7 @@ var CodeEvaluator = class {
|
|
|
23949
23951
|
fileChanges: context2.fileChanges ?? null,
|
|
23950
23952
|
workspacePath: context2.workspacePath ?? null,
|
|
23951
23953
|
config: this.config ?? null,
|
|
23952
|
-
// Text convenience accessors (new names, always strings)
|
|
23953
23954
|
inputText: context2.evalCase.question,
|
|
23954
|
-
outputText: context2.candidate,
|
|
23955
23955
|
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
23956
23956
|
};
|
|
23957
23957
|
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -23985,9 +23985,13 @@ var CodeEvaluator = class {
|
|
|
23985
23985
|
);
|
|
23986
23986
|
const parsed = parseJsonSafe(stdout);
|
|
23987
23987
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
23988
|
-
const
|
|
23989
|
-
|
|
23990
|
-
|
|
23988
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
23989
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
23990
|
+
).map((a) => ({
|
|
23991
|
+
text: String(a.text),
|
|
23992
|
+
passed: Boolean(a.passed),
|
|
23993
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
23994
|
+
})) : [];
|
|
23991
23995
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
23992
23996
|
const proxyUsage = getProxyUsage?.();
|
|
23993
23997
|
const evaluatorRawRequest = {
|
|
@@ -24003,10 +24007,8 @@ var CodeEvaluator = class {
|
|
|
24003
24007
|
return {
|
|
24004
24008
|
score,
|
|
24005
24009
|
verdict: scoreToVerdict(score),
|
|
24006
|
-
|
|
24007
|
-
|
|
24008
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
24009
|
-
reasoning,
|
|
24010
|
+
assertions,
|
|
24011
|
+
expectedAspectCount: assertions.length || 1,
|
|
24010
24012
|
evaluatorRawRequest,
|
|
24011
24013
|
...details ? { details } : {},
|
|
24012
24014
|
tokenUsage: proxyUsage?.tokenUsage
|
|
@@ -24017,10 +24019,8 @@ var CodeEvaluator = class {
|
|
|
24017
24019
|
return {
|
|
24018
24020
|
score: 0,
|
|
24019
24021
|
verdict: "fail",
|
|
24020
|
-
|
|
24021
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
24022
|
+
assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
|
|
24022
24023
|
expectedAspectCount: 1,
|
|
24023
|
-
reasoning: message,
|
|
24024
24024
|
evaluatorRawRequest: {
|
|
24025
24025
|
command: this.command,
|
|
24026
24026
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
@@ -24110,18 +24110,22 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
24110
24110
|
{{${TEMPLATE_VARIABLES.CRITERIA}}}
|
|
24111
24111
|
|
|
24112
24112
|
[[ ## question ## ]]
|
|
24113
|
-
{{${TEMPLATE_VARIABLES.
|
|
24113
|
+
{{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
|
|
24114
24114
|
|
|
24115
24115
|
[[ ## reference_answer ## ]]
|
|
24116
|
-
{{${TEMPLATE_VARIABLES.
|
|
24116
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
|
|
24117
24117
|
|
|
24118
24118
|
[[ ## answer ## ]]
|
|
24119
|
-
{{${TEMPLATE_VARIABLES.
|
|
24119
|
+
{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
|
|
24120
24120
|
var freeformEvaluationSchema = external_exports2.object({
|
|
24121
24121
|
score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
24122
|
-
|
|
24123
|
-
|
|
24124
|
-
|
|
24122
|
+
assertions: external_exports2.array(
|
|
24123
|
+
external_exports2.object({
|
|
24124
|
+
text: external_exports2.string().describe("Brief description of what was checked"),
|
|
24125
|
+
passed: external_exports2.boolean().describe("Whether this aspect was satisfied"),
|
|
24126
|
+
evidence: external_exports2.string().describe("Concise evidence (1-2 sentences)").optional()
|
|
24127
|
+
})
|
|
24128
|
+
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
|
|
24125
24129
|
});
|
|
24126
24130
|
var rubricCheckResultSchema = external_exports2.object({
|
|
24127
24131
|
id: external_exports2.string().describe("The ID of the rubric item being checked"),
|
|
@@ -24190,12 +24194,8 @@ var LlmGraderEvaluator = class {
|
|
|
24190
24194
|
2
|
|
24191
24195
|
),
|
|
24192
24196
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
|
|
24193
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
24194
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
24195
24197
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
24196
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
24197
24198
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
24198
|
-
// Text convenience accessors (new names, always strings)
|
|
24199
24199
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
24200
24200
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
24201
24201
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
@@ -24223,17 +24223,12 @@ ${context2.fileChanges}`;
|
|
|
24223
24223
|
schema: freeformEvaluationSchema
|
|
24224
24224
|
});
|
|
24225
24225
|
const score = clampScore(data.score);
|
|
24226
|
-
const
|
|
24227
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
24228
|
-
const reasoning = data.reasoning;
|
|
24229
|
-
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
24226
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
24230
24227
|
return {
|
|
24231
24228
|
score,
|
|
24232
24229
|
verdict: scoreToVerdict(score),
|
|
24233
|
-
|
|
24234
|
-
|
|
24235
|
-
expectedAspectCount,
|
|
24236
|
-
reasoning,
|
|
24230
|
+
assertions,
|
|
24231
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
24237
24232
|
evaluatorRawRequest,
|
|
24238
24233
|
tokenUsage
|
|
24239
24234
|
};
|
|
@@ -24244,10 +24239,8 @@ ${context2.fileChanges}`;
|
|
|
24244
24239
|
return {
|
|
24245
24240
|
score: 0,
|
|
24246
24241
|
verdict: "skip",
|
|
24247
|
-
|
|
24248
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
24242
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24249
24243
|
expectedAspectCount: 1,
|
|
24250
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
24251
24244
|
evaluatorRawRequest
|
|
24252
24245
|
};
|
|
24253
24246
|
}
|
|
@@ -24277,14 +24270,12 @@ ${context2.fileChanges}`;
|
|
|
24277
24270
|
userPrompt: prompt,
|
|
24278
24271
|
schema: rubricEvaluationSchema
|
|
24279
24272
|
});
|
|
24280
|
-
const { score, verdict,
|
|
24273
|
+
const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
|
|
24281
24274
|
return {
|
|
24282
24275
|
score,
|
|
24283
24276
|
verdict,
|
|
24284
|
-
|
|
24285
|
-
misses,
|
|
24277
|
+
assertions,
|
|
24286
24278
|
expectedAspectCount: rubrics.length,
|
|
24287
|
-
reasoning: data.overall_reasoning,
|
|
24288
24279
|
evaluatorRawRequest,
|
|
24289
24280
|
tokenUsage
|
|
24290
24281
|
};
|
|
@@ -24295,10 +24286,8 @@ ${context2.fileChanges}`;
|
|
|
24295
24286
|
return {
|
|
24296
24287
|
score: 0,
|
|
24297
24288
|
verdict: "skip",
|
|
24298
|
-
|
|
24299
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
24289
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24300
24290
|
expectedAspectCount: rubrics.length,
|
|
24301
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
24302
24291
|
evaluatorRawRequest
|
|
24303
24292
|
};
|
|
24304
24293
|
}
|
|
@@ -24323,14 +24312,12 @@ ${context2.fileChanges}`;
|
|
|
24323
24312
|
userPrompt: prompt,
|
|
24324
24313
|
schema: scoreRangeEvaluationSchema
|
|
24325
24314
|
});
|
|
24326
|
-
const { score, verdict,
|
|
24315
|
+
const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
|
|
24327
24316
|
return {
|
|
24328
24317
|
score,
|
|
24329
24318
|
verdict,
|
|
24330
|
-
|
|
24331
|
-
misses,
|
|
24319
|
+
assertions,
|
|
24332
24320
|
expectedAspectCount: rubrics.length,
|
|
24333
|
-
reasoning: data.overall_reasoning,
|
|
24334
24321
|
evaluatorRawRequest,
|
|
24335
24322
|
details,
|
|
24336
24323
|
tokenUsage
|
|
@@ -24342,10 +24329,8 @@ ${context2.fileChanges}`;
|
|
|
24342
24329
|
return {
|
|
24343
24330
|
score: 0,
|
|
24344
24331
|
verdict: "skip",
|
|
24345
|
-
|
|
24346
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
24332
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
24347
24333
|
expectedAspectCount: rubrics.length,
|
|
24348
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
24349
24334
|
evaluatorRawRequest
|
|
24350
24335
|
};
|
|
24351
24336
|
}
|
|
@@ -24402,8 +24387,7 @@ ${context2.fileChanges}`;
|
|
|
24402
24387
|
return {
|
|
24403
24388
|
score: 0,
|
|
24404
24389
|
verdict: "fail",
|
|
24405
|
-
|
|
24406
|
-
misses: [`llm-grader built-in evaluation failed: ${message}`],
|
|
24390
|
+
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
24407
24391
|
expectedAspectCount: 1,
|
|
24408
24392
|
evaluatorRawRequest,
|
|
24409
24393
|
details: { mode: "built-in", error: message }
|
|
@@ -24453,8 +24437,9 @@ ${context2.fileChanges}`;
|
|
|
24453
24437
|
return {
|
|
24454
24438
|
score: 0,
|
|
24455
24439
|
verdict: "fail",
|
|
24456
|
-
|
|
24457
|
-
|
|
24440
|
+
assertions: [
|
|
24441
|
+
{ text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
|
|
24442
|
+
],
|
|
24458
24443
|
expectedAspectCount: 1,
|
|
24459
24444
|
evaluatorRawRequest,
|
|
24460
24445
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
@@ -24472,8 +24457,9 @@ ${context2.fileChanges}`;
|
|
|
24472
24457
|
return {
|
|
24473
24458
|
score: 0,
|
|
24474
24459
|
verdict: "fail",
|
|
24475
|
-
|
|
24476
|
-
|
|
24460
|
+
assertions: [
|
|
24461
|
+
{ text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
|
|
24462
|
+
],
|
|
24477
24463
|
expectedAspectCount: 1,
|
|
24478
24464
|
evaluatorRawRequest,
|
|
24479
24465
|
details: {
|
|
@@ -24514,10 +24500,10 @@ ${context2.fileChanges}`;
|
|
|
24514
24500
|
buildAgentUserPrompt(context2) {
|
|
24515
24501
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
24516
24502
|
const variables = {
|
|
24517
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
24518
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
24519
24503
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
24520
|
-
[TEMPLATE_VARIABLES.
|
|
24504
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
24505
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
24506
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
24521
24507
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
24522
24508
|
};
|
|
24523
24509
|
if (this.evaluatorTemplate) {
|
|
@@ -24570,10 +24556,10 @@ ${context2.fileChanges}`;
|
|
|
24570
24556
|
const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
|
|
24571
24557
|
if (this.evaluatorTemplate) {
|
|
24572
24558
|
const variables = {
|
|
24573
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
24574
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
24575
24559
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
24576
|
-
[TEMPLATE_VARIABLES.
|
|
24560
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
24561
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
24562
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
24577
24563
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
24578
24564
|
};
|
|
24579
24565
|
const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
|
|
@@ -24625,29 +24611,24 @@ ${outputSchema2}`;
|
|
|
24625
24611
|
const parsed = parseJsonFromText(text2);
|
|
24626
24612
|
if (rubrics && rubrics.length > 0) {
|
|
24627
24613
|
const data2 = rubricEvaluationSchema.parse(parsed);
|
|
24628
|
-
const { score: score2, verdict,
|
|
24614
|
+
const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
|
|
24629
24615
|
return {
|
|
24630
24616
|
score: score2,
|
|
24631
24617
|
verdict,
|
|
24632
|
-
|
|
24633
|
-
misses: misses2,
|
|
24618
|
+
assertions: assertions2,
|
|
24634
24619
|
expectedAspectCount: rubrics.length,
|
|
24635
|
-
reasoning: data2.overall_reasoning,
|
|
24636
24620
|
evaluatorRawRequest,
|
|
24637
24621
|
details
|
|
24638
24622
|
};
|
|
24639
24623
|
}
|
|
24640
24624
|
const data = freeformEvaluationSchema.parse(parsed);
|
|
24641
24625
|
const score = clampScore(data.score);
|
|
24642
|
-
const
|
|
24643
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
24626
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
24644
24627
|
return {
|
|
24645
24628
|
score,
|
|
24646
24629
|
verdict: scoreToVerdict(score),
|
|
24647
|
-
|
|
24648
|
-
|
|
24649
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
24650
|
-
reasoning: data.reasoning,
|
|
24630
|
+
assertions,
|
|
24631
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
24651
24632
|
evaluatorRawRequest,
|
|
24652
24633
|
details
|
|
24653
24634
|
};
|
|
@@ -24655,8 +24636,12 @@ ${outputSchema2}`;
|
|
|
24655
24636
|
return {
|
|
24656
24637
|
score: 0,
|
|
24657
24638
|
verdict: "fail",
|
|
24658
|
-
|
|
24659
|
-
|
|
24639
|
+
assertions: [
|
|
24640
|
+
{
|
|
24641
|
+
text: "Failed to parse llm-grader agent response as valid evaluation JSON",
|
|
24642
|
+
passed: false
|
|
24643
|
+
}
|
|
24644
|
+
],
|
|
24660
24645
|
expectedAspectCount: 1,
|
|
24661
24646
|
evaluatorRawRequest,
|
|
24662
24647
|
details
|
|
@@ -24785,9 +24770,13 @@ function buildOutputSchema() {
|
|
|
24785
24770
|
"",
|
|
24786
24771
|
"{",
|
|
24787
24772
|
' "score": <number between 0.0 and 1.0>,',
|
|
24788
|
-
' "
|
|
24789
|
-
|
|
24790
|
-
'
|
|
24773
|
+
' "assertions": [',
|
|
24774
|
+
" {",
|
|
24775
|
+
' "text": "<brief description of what was checked>",',
|
|
24776
|
+
' "passed": <boolean>,',
|
|
24777
|
+
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
|
|
24778
|
+
" }",
|
|
24779
|
+
" ]",
|
|
24791
24780
|
"}"
|
|
24792
24781
|
].join("\n");
|
|
24793
24782
|
}
|
|
@@ -24812,8 +24801,7 @@ function substituteVariables(template, variables) {
|
|
|
24812
24801
|
}
|
|
24813
24802
|
function calculateRubricScore(result, rubrics) {
|
|
24814
24803
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
24815
|
-
const
|
|
24816
|
-
const misses = [];
|
|
24804
|
+
const assertions = [];
|
|
24817
24805
|
let totalWeight = 0;
|
|
24818
24806
|
let earnedWeight = 0;
|
|
24819
24807
|
let failedRequired = false;
|
|
@@ -24823,19 +24811,20 @@ function calculateRubricScore(result, rubrics) {
|
|
|
24823
24811
|
continue;
|
|
24824
24812
|
}
|
|
24825
24813
|
totalWeight += rubric.weight;
|
|
24814
|
+
assertions.push({
|
|
24815
|
+
text: `[${rubric.id}] ${rubric.outcome}`,
|
|
24816
|
+
passed: check.satisfied,
|
|
24817
|
+
evidence: check.reasoning
|
|
24818
|
+
});
|
|
24826
24819
|
if (check.satisfied) {
|
|
24827
24820
|
earnedWeight += rubric.weight;
|
|
24828
|
-
|
|
24829
|
-
|
|
24830
|
-
misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
|
|
24831
|
-
if (rubric.required) {
|
|
24832
|
-
failedRequired = true;
|
|
24833
|
-
}
|
|
24821
|
+
} else if (rubric.required) {
|
|
24822
|
+
failedRequired = true;
|
|
24834
24823
|
}
|
|
24835
24824
|
}
|
|
24836
24825
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
24837
24826
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
24838
|
-
return { score, verdict,
|
|
24827
|
+
return { score, verdict, assertions };
|
|
24839
24828
|
}
|
|
24840
24829
|
function buildScoreRangeOutputSchema() {
|
|
24841
24830
|
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
@@ -24855,8 +24844,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
|
|
|
24855
24844
|
}
|
|
24856
24845
|
function calculateScoreRangeResult(result, rubrics) {
|
|
24857
24846
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
24858
|
-
const
|
|
24859
|
-
const misses = [];
|
|
24847
|
+
const assertions = [];
|
|
24860
24848
|
const rawScores = {};
|
|
24861
24849
|
let totalWeight = 0;
|
|
24862
24850
|
let weightedScoreSum = 0;
|
|
@@ -24882,24 +24870,22 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
24882
24870
|
);
|
|
24883
24871
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
24884
24872
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
24885
|
-
const
|
|
24886
|
-
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
24873
|
+
const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
|
|
24887
24874
|
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
24888
24875
|
failedRequired = true;
|
|
24889
|
-
misses.push(scoreInfo);
|
|
24890
|
-
} else if (rawScore >= 7) {
|
|
24891
|
-
hits.push(scoreInfo);
|
|
24892
|
-
} else {
|
|
24893
|
-
misses.push(scoreInfo);
|
|
24894
24876
|
}
|
|
24877
|
+
assertions.push({
|
|
24878
|
+
text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
|
|
24879
|
+
passed,
|
|
24880
|
+
evidence: check.reasoning
|
|
24881
|
+
});
|
|
24895
24882
|
}
|
|
24896
24883
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
24897
24884
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
24898
24885
|
return {
|
|
24899
24886
|
score,
|
|
24900
24887
|
verdict,
|
|
24901
|
-
|
|
24902
|
-
misses,
|
|
24888
|
+
assertions,
|
|
24903
24889
|
details: {
|
|
24904
24890
|
raw_scores: rawScores,
|
|
24905
24891
|
normalization: "score / 10",
|
|
@@ -25073,9 +25059,7 @@ var CompositeEvaluator = class {
|
|
|
25073
25059
|
let totalWeight = 0;
|
|
25074
25060
|
let weightedSum = 0;
|
|
25075
25061
|
let evaluatedCount = 0;
|
|
25076
|
-
const
|
|
25077
|
-
const allMisses = [];
|
|
25078
|
-
const reasoningParts = [];
|
|
25062
|
+
const allAssertions = [];
|
|
25079
25063
|
const scores = [];
|
|
25080
25064
|
for (const member of results) {
|
|
25081
25065
|
const weight = weights?.[member.id] ?? 1;
|
|
@@ -25085,9 +25069,7 @@ var CompositeEvaluator = class {
|
|
|
25085
25069
|
score: member.result.score,
|
|
25086
25070
|
weight,
|
|
25087
25071
|
verdict: member.result.verdict,
|
|
25088
|
-
|
|
25089
|
-
misses: [...member.result.misses],
|
|
25090
|
-
reasoning: member.result.reasoning,
|
|
25072
|
+
assertions: [...member.result.assertions],
|
|
25091
25073
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25092
25074
|
scores: member.result.scores,
|
|
25093
25075
|
details: member.result.details,
|
|
@@ -25099,20 +25081,16 @@ var CompositeEvaluator = class {
|
|
|
25099
25081
|
evaluatedCount++;
|
|
25100
25082
|
totalWeight += weight;
|
|
25101
25083
|
weightedSum += member.result.score * weight;
|
|
25102
|
-
|
|
25103
|
-
|
|
25104
|
-
|
|
25105
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
25106
|
-
}
|
|
25084
|
+
allAssertions.push(
|
|
25085
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
25086
|
+
);
|
|
25107
25087
|
}
|
|
25108
25088
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
25109
25089
|
return {
|
|
25110
25090
|
score: 0,
|
|
25111
25091
|
verdict: "skip",
|
|
25112
|
-
|
|
25113
|
-
misses: [],
|
|
25092
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
25114
25093
|
expectedAspectCount: 1,
|
|
25115
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
25116
25094
|
evaluatorRawRequest: {
|
|
25117
25095
|
aggregator: "weighted_average",
|
|
25118
25096
|
...weights ? { weights } : {}
|
|
@@ -25124,10 +25102,8 @@ var CompositeEvaluator = class {
|
|
|
25124
25102
|
return {
|
|
25125
25103
|
score: clampScore(finalScore),
|
|
25126
25104
|
verdict: scoreToVerdict(finalScore),
|
|
25127
|
-
|
|
25128
|
-
|
|
25129
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
25130
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
25105
|
+
assertions: allAssertions,
|
|
25106
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
25131
25107
|
evaluatorRawRequest: {
|
|
25132
25108
|
aggregator: "weighted_average",
|
|
25133
25109
|
...weights ? { weights } : {}
|
|
@@ -25137,11 +25113,8 @@ var CompositeEvaluator = class {
|
|
|
25137
25113
|
}
|
|
25138
25114
|
runThreshold(results, threshold) {
|
|
25139
25115
|
const scores = [];
|
|
25140
|
-
const
|
|
25141
|
-
const allMisses = [];
|
|
25142
|
-
const reasoningParts = [];
|
|
25116
|
+
const allAssertions = [];
|
|
25143
25117
|
let passingCount = 0;
|
|
25144
|
-
let borderlineCount = 0;
|
|
25145
25118
|
let evaluatedCount = 0;
|
|
25146
25119
|
for (const member of results) {
|
|
25147
25120
|
scores.push({
|
|
@@ -25149,9 +25122,7 @@ var CompositeEvaluator = class {
|
|
|
25149
25122
|
type: member.type,
|
|
25150
25123
|
score: member.result.score,
|
|
25151
25124
|
verdict: member.result.verdict,
|
|
25152
|
-
|
|
25153
|
-
misses: [...member.result.misses],
|
|
25154
|
-
reasoning: member.result.reasoning,
|
|
25125
|
+
assertions: [...member.result.assertions],
|
|
25155
25126
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25156
25127
|
scores: member.result.scores,
|
|
25157
25128
|
details: member.result.details,
|
|
@@ -25164,24 +25135,17 @@ var CompositeEvaluator = class {
|
|
|
25164
25135
|
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
25165
25136
|
if (isPassing) {
|
|
25166
25137
|
passingCount++;
|
|
25167
|
-
if (member.result.verdict === "borderline") {
|
|
25168
|
-
borderlineCount++;
|
|
25169
|
-
}
|
|
25170
|
-
}
|
|
25171
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
25172
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
25173
|
-
if (member.result.reasoning) {
|
|
25174
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
25175
25138
|
}
|
|
25139
|
+
allAssertions.push(
|
|
25140
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
25141
|
+
);
|
|
25176
25142
|
}
|
|
25177
25143
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
25178
25144
|
return {
|
|
25179
25145
|
score: 0,
|
|
25180
25146
|
verdict: "skip",
|
|
25181
|
-
|
|
25182
|
-
misses: [],
|
|
25147
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
25183
25148
|
expectedAspectCount: 1,
|
|
25184
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
25185
25149
|
evaluatorRawRequest: {
|
|
25186
25150
|
aggregator: "threshold",
|
|
25187
25151
|
threshold
|
|
@@ -25192,19 +25156,15 @@ var CompositeEvaluator = class {
|
|
|
25192
25156
|
const totalCount = evaluatedCount;
|
|
25193
25157
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
25194
25158
|
const pass = score >= threshold;
|
|
25195
|
-
|
|
25196
|
-
|
|
25197
|
-
|
|
25198
|
-
|
|
25199
|
-
`${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
|
|
25200
|
-
);
|
|
25159
|
+
allAssertions.unshift({
|
|
25160
|
+
text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
|
|
25161
|
+
passed: pass
|
|
25162
|
+
});
|
|
25201
25163
|
return {
|
|
25202
25164
|
score: clampScore(score),
|
|
25203
25165
|
verdict: pass ? "pass" : "fail",
|
|
25204
|
-
|
|
25205
|
-
|
|
25206
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
25207
|
-
reasoning: reasoningParts.join("; "),
|
|
25166
|
+
assertions: allAssertions,
|
|
25167
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
25208
25168
|
evaluatorRawRequest: {
|
|
25209
25169
|
aggregator: "threshold",
|
|
25210
25170
|
threshold
|
|
@@ -25221,9 +25181,7 @@ var CompositeEvaluator = class {
|
|
|
25221
25181
|
score: member.result.score,
|
|
25222
25182
|
weight: weights?.[member.id] ?? 1,
|
|
25223
25183
|
verdict: member.result.verdict,
|
|
25224
|
-
|
|
25225
|
-
misses: [...member.result.misses],
|
|
25226
|
-
reasoning: member.result.reasoning,
|
|
25184
|
+
assertions: [...member.result.assertions],
|
|
25227
25185
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25228
25186
|
scores: member.result.scores,
|
|
25229
25187
|
details: member.result.details
|
|
@@ -25232,17 +25190,19 @@ var CompositeEvaluator = class {
|
|
|
25232
25190
|
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
25233
25191
|
const parsed = parseJsonSafe(stdout);
|
|
25234
25192
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
25235
|
-
const
|
|
25236
|
-
|
|
25237
|
-
|
|
25193
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
25194
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
25195
|
+
).map((a) => ({
|
|
25196
|
+
text: String(a.text),
|
|
25197
|
+
passed: Boolean(a.passed),
|
|
25198
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
25199
|
+
})) : [];
|
|
25238
25200
|
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
25239
25201
|
return {
|
|
25240
25202
|
score,
|
|
25241
25203
|
verdict,
|
|
25242
|
-
|
|
25243
|
-
|
|
25244
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
25245
|
-
reasoning,
|
|
25204
|
+
assertions,
|
|
25205
|
+
expectedAspectCount: assertions.length || 1,
|
|
25246
25206
|
evaluatorRawRequest: {
|
|
25247
25207
|
aggregator: "code-grader",
|
|
25248
25208
|
script: scriptPath
|
|
@@ -25254,10 +25214,8 @@ var CompositeEvaluator = class {
|
|
|
25254
25214
|
return {
|
|
25255
25215
|
score: 0,
|
|
25256
25216
|
verdict: "fail",
|
|
25257
|
-
|
|
25258
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
25217
|
+
assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
|
|
25259
25218
|
expectedAspectCount: 1,
|
|
25260
|
-
reasoning: message,
|
|
25261
25219
|
evaluatorRawRequest: {
|
|
25262
25220
|
aggregator: "code-grader",
|
|
25263
25221
|
script: scriptPath,
|
|
@@ -25279,9 +25237,7 @@ var CompositeEvaluator = class {
|
|
|
25279
25237
|
type: member.type,
|
|
25280
25238
|
score: member.result.score,
|
|
25281
25239
|
verdict: member.result.verdict,
|
|
25282
|
-
|
|
25283
|
-
misses: [...member.result.misses],
|
|
25284
|
-
reasoning: member.result.reasoning,
|
|
25240
|
+
assertions: [...member.result.assertions],
|
|
25285
25241
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
25286
25242
|
scores: member.result.scores,
|
|
25287
25243
|
details: member.result.details
|
|
@@ -25305,16 +25261,12 @@ var CompositeEvaluator = class {
|
|
|
25305
25261
|
});
|
|
25306
25262
|
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text2));
|
|
25307
25263
|
const score2 = clampScore(data2.score);
|
|
25308
|
-
const
|
|
25309
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
25310
|
-
const reasoning2 = data2.reasoning;
|
|
25264
|
+
const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
|
|
25311
25265
|
return {
|
|
25312
25266
|
score: score2,
|
|
25313
25267
|
verdict: scoreToVerdict(score2),
|
|
25314
|
-
|
|
25315
|
-
|
|
25316
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
25317
|
-
reasoning: reasoning2,
|
|
25268
|
+
assertions: assertions2,
|
|
25269
|
+
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
25318
25270
|
evaluatorRawRequest,
|
|
25319
25271
|
scores
|
|
25320
25272
|
};
|
|
@@ -25329,16 +25281,12 @@ var CompositeEvaluator = class {
|
|
|
25329
25281
|
parseJsonFromText(extractLastAssistantContent(response.output))
|
|
25330
25282
|
);
|
|
25331
25283
|
const score = clampScore(data.score);
|
|
25332
|
-
const
|
|
25333
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
25334
|
-
const reasoning = data.reasoning;
|
|
25284
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
25335
25285
|
return {
|
|
25336
25286
|
score,
|
|
25337
25287
|
verdict: scoreToVerdict(score),
|
|
25338
|
-
|
|
25339
|
-
|
|
25340
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
25341
|
-
reasoning,
|
|
25288
|
+
assertions,
|
|
25289
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
25342
25290
|
evaluatorRawRequest,
|
|
25343
25291
|
scores
|
|
25344
25292
|
};
|
|
@@ -25346,8 +25294,7 @@ var CompositeEvaluator = class {
|
|
|
25346
25294
|
return {
|
|
25347
25295
|
score: 0,
|
|
25348
25296
|
verdict: "fail",
|
|
25349
|
-
|
|
25350
|
-
misses: [],
|
|
25297
|
+
assertions: [{ text: "LLM aggregator failed", passed: false }],
|
|
25351
25298
|
expectedAspectCount: 1,
|
|
25352
25299
|
evaluatorRawRequest,
|
|
25353
25300
|
scores
|
|
@@ -25368,10 +25315,8 @@ var CostEvaluator = class {
|
|
|
25368
25315
|
return {
|
|
25369
25316
|
score: 0,
|
|
25370
25317
|
verdict: "fail",
|
|
25371
|
-
|
|
25372
|
-
misses: ["No cost data available in trace"],
|
|
25318
|
+
assertions: [{ text: "No cost data available in trace", passed: false }],
|
|
25373
25319
|
expectedAspectCount: 1,
|
|
25374
|
-
reasoning: "Execution cost not reported by provider",
|
|
25375
25320
|
evaluatorRawRequest: {
|
|
25376
25321
|
type: "cost",
|
|
25377
25322
|
budget,
|
|
@@ -25385,10 +25330,10 @@ var CostEvaluator = class {
|
|
|
25385
25330
|
return {
|
|
25386
25331
|
score,
|
|
25387
25332
|
verdict: passed ? "pass" : "fail",
|
|
25388
|
-
|
|
25389
|
-
|
|
25333
|
+
assertions: [
|
|
25334
|
+
passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
|
|
25335
|
+
],
|
|
25390
25336
|
expectedAspectCount: 1,
|
|
25391
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
25392
25337
|
evaluatorRawRequest: {
|
|
25393
25338
|
type: "cost",
|
|
25394
25339
|
budget,
|
|
@@ -25419,10 +25364,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
25419
25364
|
return {
|
|
25420
25365
|
score: 0,
|
|
25421
25366
|
verdict: "fail",
|
|
25422
|
-
|
|
25423
|
-
misses: ["No trace summary available"],
|
|
25367
|
+
assertions: [{ text: "No trace summary available", passed: false }],
|
|
25424
25368
|
expectedAspectCount: 1,
|
|
25425
|
-
reasoning: "Execution metrics not available - no trace summary provided",
|
|
25426
25369
|
evaluatorRawRequest: {
|
|
25427
25370
|
type: "execution-metrics",
|
|
25428
25371
|
config: this.extractConfiguredThresholds(),
|
|
@@ -25431,116 +25374,114 @@ var ExecutionMetricsEvaluator = class {
|
|
|
25431
25374
|
};
|
|
25432
25375
|
}
|
|
25433
25376
|
const narrowedTrace = trace2;
|
|
25434
|
-
const
|
|
25435
|
-
const misses = [];
|
|
25377
|
+
const assertions = [];
|
|
25436
25378
|
const actualMetrics = {};
|
|
25437
25379
|
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
25438
25380
|
const toolCalls = narrowedTrace.eventCount;
|
|
25439
25381
|
actualMetrics.tool_calls = toolCalls;
|
|
25440
25382
|
if (toolCalls <= max_tool_calls) {
|
|
25441
|
-
|
|
25383
|
+
assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
|
|
25442
25384
|
} else {
|
|
25443
|
-
|
|
25385
|
+
assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
|
|
25444
25386
|
}
|
|
25445
25387
|
}
|
|
25446
25388
|
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
25447
25389
|
const llmCalls = narrowedTrace.llmCallCount;
|
|
25448
25390
|
if (llmCalls === void 0) {
|
|
25449
|
-
|
|
25391
|
+
assertions.push({ text: "LLM call count data not available", passed: false });
|
|
25450
25392
|
} else {
|
|
25451
25393
|
actualMetrics.llm_calls = llmCalls;
|
|
25452
25394
|
if (llmCalls <= max_llm_calls) {
|
|
25453
|
-
|
|
25395
|
+
assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
|
|
25454
25396
|
} else {
|
|
25455
|
-
|
|
25397
|
+
assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
|
|
25456
25398
|
}
|
|
25457
25399
|
}
|
|
25458
25400
|
}
|
|
25459
25401
|
if (max_tokens !== void 0) {
|
|
25460
25402
|
if (!tokenUsage) {
|
|
25461
|
-
|
|
25403
|
+
assertions.push({ text: "Token usage data not available", passed: false });
|
|
25462
25404
|
} else {
|
|
25463
25405
|
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
25464
25406
|
actualMetrics.tokens = totalTokens;
|
|
25465
25407
|
if (totalTokens <= max_tokens) {
|
|
25466
|
-
|
|
25408
|
+
assertions.push({
|
|
25409
|
+
text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
|
|
25410
|
+
passed: true
|
|
25411
|
+
});
|
|
25467
25412
|
} else {
|
|
25468
|
-
|
|
25413
|
+
assertions.push({
|
|
25414
|
+
text: `Total tokens ${totalTokens} > ${max_tokens} max`,
|
|
25415
|
+
passed: false
|
|
25416
|
+
});
|
|
25469
25417
|
}
|
|
25470
25418
|
}
|
|
25471
25419
|
}
|
|
25472
25420
|
if (max_cost_usd !== void 0) {
|
|
25473
25421
|
if (costUsd === void 0) {
|
|
25474
|
-
|
|
25422
|
+
assertions.push({ text: "Cost data not available", passed: false });
|
|
25475
25423
|
} else {
|
|
25476
25424
|
actualMetrics.cost_usd = costUsd;
|
|
25477
25425
|
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
25478
25426
|
if (costUsd <= max_cost_usd) {
|
|
25479
|
-
|
|
25427
|
+
assertions.push({
|
|
25428
|
+
text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
|
|
25429
|
+
passed: true
|
|
25430
|
+
});
|
|
25480
25431
|
} else {
|
|
25481
|
-
|
|
25432
|
+
assertions.push({
|
|
25433
|
+
text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
|
|
25434
|
+
passed: false
|
|
25435
|
+
});
|
|
25482
25436
|
}
|
|
25483
25437
|
}
|
|
25484
25438
|
}
|
|
25485
25439
|
if (max_duration_ms !== void 0) {
|
|
25486
25440
|
if (durationMs === void 0) {
|
|
25487
|
-
|
|
25441
|
+
assertions.push({ text: "Duration data not available", passed: false });
|
|
25488
25442
|
} else {
|
|
25489
25443
|
actualMetrics.duration_ms = durationMs;
|
|
25490
25444
|
if (durationMs <= max_duration_ms) {
|
|
25491
|
-
|
|
25445
|
+
assertions.push({
|
|
25446
|
+
text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
|
|
25447
|
+
passed: true
|
|
25448
|
+
});
|
|
25492
25449
|
} else {
|
|
25493
|
-
|
|
25450
|
+
assertions.push({
|
|
25451
|
+
text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
|
|
25452
|
+
passed: false
|
|
25453
|
+
});
|
|
25494
25454
|
}
|
|
25495
25455
|
}
|
|
25496
25456
|
}
|
|
25497
25457
|
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
25498
25458
|
const ratio = explorationRatio(narrowedTrace);
|
|
25499
25459
|
if (ratio === void 0) {
|
|
25500
|
-
|
|
25460
|
+
assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
|
|
25501
25461
|
} else {
|
|
25502
25462
|
actualMetrics.exploration_ratio = ratio;
|
|
25503
25463
|
const diff = Math.abs(ratio - target_exploration_ratio);
|
|
25504
25464
|
if (diff <= exploration_tolerance) {
|
|
25505
|
-
|
|
25506
|
-
`Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}
|
|
25507
|
-
|
|
25465
|
+
assertions.push({
|
|
25466
|
+
text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
|
|
25467
|
+
passed: true
|
|
25468
|
+
});
|
|
25508
25469
|
} else {
|
|
25509
|
-
|
|
25510
|
-
`Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})
|
|
25511
|
-
|
|
25470
|
+
assertions.push({
|
|
25471
|
+
text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
|
|
25472
|
+
passed: false
|
|
25473
|
+
});
|
|
25512
25474
|
}
|
|
25513
25475
|
}
|
|
25514
25476
|
}
|
|
25515
|
-
const totalChecks =
|
|
25516
|
-
const
|
|
25517
|
-
const
|
|
25518
|
-
if (actualMetrics.tool_calls !== void 0) {
|
|
25519
|
-
reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
|
|
25520
|
-
}
|
|
25521
|
-
if (actualMetrics.llm_calls !== void 0) {
|
|
25522
|
-
reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
|
|
25523
|
-
}
|
|
25524
|
-
if (actualMetrics.tokens !== void 0) {
|
|
25525
|
-
reasoningParts.push(`tokens=${actualMetrics.tokens}`);
|
|
25526
|
-
}
|
|
25527
|
-
if (actualMetrics.cost_usd !== void 0) {
|
|
25528
|
-
reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
|
|
25529
|
-
}
|
|
25530
|
-
if (actualMetrics.duration_ms !== void 0) {
|
|
25531
|
-
reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
|
|
25532
|
-
}
|
|
25533
|
-
if (actualMetrics.exploration_ratio !== void 0) {
|
|
25534
|
-
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
25535
|
-
}
|
|
25536
|
-
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
25477
|
+
const totalChecks = assertions.length;
|
|
25478
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
25479
|
+
const score = totalChecks > 0 ? passedCount / totalChecks : 0;
|
|
25537
25480
|
return {
|
|
25538
25481
|
score,
|
|
25539
25482
|
verdict: scoreToVerdict(score),
|
|
25540
|
-
|
|
25541
|
-
misses,
|
|
25483
|
+
assertions,
|
|
25542
25484
|
expectedAspectCount: totalChecks || 1,
|
|
25543
|
-
reasoning,
|
|
25544
25485
|
evaluatorRawRequest: {
|
|
25545
25486
|
type: "execution-metrics",
|
|
25546
25487
|
config: this.extractConfiguredThresholds(),
|
|
@@ -25642,10 +25583,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
25642
25583
|
return {
|
|
25643
25584
|
score: 0,
|
|
25644
25585
|
verdict: "fail",
|
|
25645
|
-
|
|
25646
|
-
|
|
25647
|
-
expectedAspectCount: this.config.fields.length,
|
|
25648
|
-
reasoning: "Candidate answer is not valid JSON"
|
|
25586
|
+
assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
|
|
25587
|
+
expectedAspectCount: this.config.fields.length
|
|
25649
25588
|
};
|
|
25650
25589
|
}
|
|
25651
25590
|
const expectedData = this.extractExpectedData(evalCase.expected_output);
|
|
@@ -25653,10 +25592,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
25653
25592
|
return {
|
|
25654
25593
|
score: 0,
|
|
25655
25594
|
verdict: "fail",
|
|
25656
|
-
|
|
25657
|
-
|
|
25658
|
-
expectedAspectCount: this.config.fields.length,
|
|
25659
|
-
reasoning: "Could not extract expected data from expected_output"
|
|
25595
|
+
assertions: [{ text: "No expected data found in expected_output", passed: false }],
|
|
25596
|
+
expectedAspectCount: this.config.fields.length
|
|
25660
25597
|
};
|
|
25661
25598
|
}
|
|
25662
25599
|
const fieldResults = [];
|
|
@@ -25874,18 +25811,14 @@ var FieldAccuracyEvaluator = class {
|
|
|
25874
25811
|
*/
|
|
25875
25812
|
aggregateResults(results) {
|
|
25876
25813
|
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
25877
|
-
const
|
|
25878
|
-
const misses = [];
|
|
25814
|
+
const assertions = [];
|
|
25879
25815
|
for (const result of results) {
|
|
25880
|
-
|
|
25881
|
-
hits.push(result.message);
|
|
25882
|
-
} else {
|
|
25883
|
-
misses.push(result.message);
|
|
25884
|
-
}
|
|
25816
|
+
assertions.push({ text: result.message, passed: result.hit });
|
|
25885
25817
|
}
|
|
25886
25818
|
let score;
|
|
25887
25819
|
if (aggregation === "all_or_nothing") {
|
|
25888
|
-
|
|
25820
|
+
const hasFailed = assertions.some((a) => !a.passed);
|
|
25821
|
+
score = hasFailed ? 0 : 1;
|
|
25889
25822
|
} else {
|
|
25890
25823
|
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
25891
25824
|
if (totalWeight === 0) {
|
|
@@ -25895,15 +25828,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
25895
25828
|
score = weightedSum / totalWeight;
|
|
25896
25829
|
}
|
|
25897
25830
|
}
|
|
25898
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
25899
25831
|
return {
|
|
25900
25832
|
score: clampScore(score),
|
|
25901
25833
|
verdict: scoreToVerdict(score),
|
|
25902
|
-
|
|
25903
|
-
|
|
25904
|
-
misses: misses.slice(0, 4),
|
|
25905
|
-
expectedAspectCount: results.length,
|
|
25906
|
-
reasoning
|
|
25834
|
+
assertions,
|
|
25835
|
+
expectedAspectCount: results.length
|
|
25907
25836
|
};
|
|
25908
25837
|
}
|
|
25909
25838
|
};
|
|
@@ -26010,10 +25939,8 @@ var LatencyEvaluator = class {
|
|
|
26010
25939
|
return {
|
|
26011
25940
|
score: 0,
|
|
26012
25941
|
verdict: "fail",
|
|
26013
|
-
|
|
26014
|
-
misses: ["No duration data available in trace"],
|
|
25942
|
+
assertions: [{ text: "No duration data available in trace", passed: false }],
|
|
26015
25943
|
expectedAspectCount: 1,
|
|
26016
|
-
reasoning: "Execution duration not reported by provider",
|
|
26017
25944
|
evaluatorRawRequest: {
|
|
26018
25945
|
type: "latency",
|
|
26019
25946
|
threshold,
|
|
@@ -26026,10 +25953,10 @@ var LatencyEvaluator = class {
|
|
|
26026
25953
|
return {
|
|
26027
25954
|
score,
|
|
26028
25955
|
verdict: passed ? "pass" : "fail",
|
|
26029
|
-
|
|
26030
|
-
|
|
25956
|
+
assertions: [
|
|
25957
|
+
passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
|
|
25958
|
+
],
|
|
26031
25959
|
expectedAspectCount: 1,
|
|
26032
|
-
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
26033
25960
|
evaluatorRawRequest: {
|
|
26034
25961
|
type: "latency",
|
|
26035
25962
|
threshold,
|
|
@@ -26048,7 +25975,10 @@ var COPILOT_MATCHER = {
|
|
|
26048
25975
|
skillTools: ["Skill", "skill"],
|
|
26049
25976
|
skillInputField: "skill",
|
|
26050
25977
|
readTools: ["Read File", "readFile", "Read", "readTextFile"],
|
|
26051
|
-
readInputField: "file_path"
|
|
25978
|
+
readInputField: "file_path",
|
|
25979
|
+
skillToolPrefixes: ["Using skill: "],
|
|
25980
|
+
readToolPrefixes: ["Viewing "],
|
|
25981
|
+
readInputFields: ["file_path", "path"]
|
|
26052
25982
|
};
|
|
26053
25983
|
var PROVIDER_TOOL_SEMANTICS = {
|
|
26054
25984
|
claude: CLAUDE_MATCHER,
|
|
@@ -26090,12 +26020,22 @@ var SkillTriggerEvaluator = class {
|
|
|
26090
26020
|
triggered = true;
|
|
26091
26021
|
evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
|
|
26092
26022
|
}
|
|
26023
|
+
} else if (matcher.skillToolPrefixes?.some(
|
|
26024
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
26025
|
+
)) {
|
|
26026
|
+
triggered = true;
|
|
26027
|
+
evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
|
|
26093
26028
|
} else if (matcher.readTools.includes(firstTool.tool)) {
|
|
26094
|
-
const filePath =
|
|
26029
|
+
const filePath = this.readPathFromInput(input, matcher);
|
|
26095
26030
|
if (filePath.includes(skillName)) {
|
|
26096
26031
|
triggered = true;
|
|
26097
26032
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
26098
26033
|
}
|
|
26034
|
+
} else if (matcher.readToolPrefixes?.some(
|
|
26035
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
26036
|
+
)) {
|
|
26037
|
+
triggered = true;
|
|
26038
|
+
evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
|
|
26099
26039
|
}
|
|
26100
26040
|
}
|
|
26101
26041
|
const pass = triggered === shouldTrigger;
|
|
@@ -26103,25 +26043,37 @@ var SkillTriggerEvaluator = class {
|
|
|
26103
26043
|
return {
|
|
26104
26044
|
score: 1,
|
|
26105
26045
|
verdict: "pass",
|
|
26106
|
-
|
|
26107
|
-
|
|
26046
|
+
assertions: [
|
|
26047
|
+
{
|
|
26048
|
+
text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
|
|
26049
|
+
passed: true
|
|
26050
|
+
}
|
|
26108
26051
|
],
|
|
26109
|
-
|
|
26110
|
-
expectedAspectCount: 1,
|
|
26111
|
-
reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
|
|
26052
|
+
expectedAspectCount: 1
|
|
26112
26053
|
};
|
|
26113
26054
|
}
|
|
26114
26055
|
return {
|
|
26115
26056
|
score: 0,
|
|
26116
26057
|
verdict: "fail",
|
|
26117
|
-
|
|
26118
|
-
|
|
26119
|
-
|
|
26058
|
+
assertions: [
|
|
26059
|
+
{
|
|
26060
|
+
text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
|
|
26061
|
+
passed: false
|
|
26062
|
+
}
|
|
26120
26063
|
],
|
|
26121
|
-
expectedAspectCount: 1
|
|
26122
|
-
reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
|
|
26064
|
+
expectedAspectCount: 1
|
|
26123
26065
|
};
|
|
26124
26066
|
}
|
|
26067
|
+
readPathFromInput(input, matcher) {
|
|
26068
|
+
const fields = matcher.readInputFields ?? [matcher.readInputField];
|
|
26069
|
+
for (const field of fields) {
|
|
26070
|
+
const value = input[field];
|
|
26071
|
+
if (value !== void 0 && value !== null) {
|
|
26072
|
+
return String(value);
|
|
26073
|
+
}
|
|
26074
|
+
}
|
|
26075
|
+
return "";
|
|
26076
|
+
}
|
|
26125
26077
|
};
|
|
26126
26078
|
function assembleLlmGraderPrompt(input) {
|
|
26127
26079
|
const {
|
|
@@ -26154,12 +26106,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
|
|
|
26154
26106
|
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
|
|
26155
26107
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
|
|
26156
26108
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
|
|
26157
|
-
[TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
|
|
26158
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
|
|
26159
26109
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
26160
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
26161
26110
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
26162
|
-
// Text convenience accessors (new names, always strings)
|
|
26163
26111
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
26164
26112
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
26165
26113
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
@@ -26284,10 +26232,8 @@ var TokenUsageEvaluator = class {
|
|
|
26284
26232
|
return {
|
|
26285
26233
|
score: 0,
|
|
26286
26234
|
verdict: "fail",
|
|
26287
|
-
|
|
26288
|
-
misses: ["No token usage data available in trace"],
|
|
26235
|
+
assertions: [{ text: "No token usage data available in trace", passed: false }],
|
|
26289
26236
|
expectedAspectCount,
|
|
26290
|
-
reasoning: "Token usage not reported by provider",
|
|
26291
26237
|
evaluatorRawRequest: {
|
|
26292
26238
|
type: "token-usage",
|
|
26293
26239
|
max_total: maxTotal ?? null,
|
|
@@ -26301,37 +26247,34 @@ var TokenUsageEvaluator = class {
|
|
|
26301
26247
|
const output = usage.output;
|
|
26302
26248
|
const cached = usage.cached ?? 0;
|
|
26303
26249
|
const total = input + output + cached;
|
|
26304
|
-
const
|
|
26305
|
-
const misses = [];
|
|
26250
|
+
const assertions = [];
|
|
26306
26251
|
if (typeof maxInput === "number") {
|
|
26307
26252
|
if (input <= maxInput) {
|
|
26308
|
-
|
|
26253
|
+
assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
|
|
26309
26254
|
} else {
|
|
26310
|
-
|
|
26255
|
+
assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
|
|
26311
26256
|
}
|
|
26312
26257
|
}
|
|
26313
26258
|
if (typeof maxOutput === "number") {
|
|
26314
26259
|
if (output <= maxOutput) {
|
|
26315
|
-
|
|
26260
|
+
assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
|
|
26316
26261
|
} else {
|
|
26317
|
-
|
|
26262
|
+
assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
|
|
26318
26263
|
}
|
|
26319
26264
|
}
|
|
26320
26265
|
if (typeof maxTotal === "number") {
|
|
26321
26266
|
if (total <= maxTotal) {
|
|
26322
|
-
|
|
26267
|
+
assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
|
|
26323
26268
|
} else {
|
|
26324
|
-
|
|
26269
|
+
assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
|
|
26325
26270
|
}
|
|
26326
26271
|
}
|
|
26327
|
-
const passed =
|
|
26272
|
+
const passed = assertions.every((a) => a.passed);
|
|
26328
26273
|
return {
|
|
26329
26274
|
score: passed ? 1 : 0,
|
|
26330
26275
|
verdict: passed ? "pass" : "fail",
|
|
26331
|
-
|
|
26332
|
-
misses,
|
|
26276
|
+
assertions,
|
|
26333
26277
|
expectedAspectCount,
|
|
26334
|
-
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
26335
26278
|
evaluatorRawRequest: {
|
|
26336
26279
|
type: "token-usage",
|
|
26337
26280
|
max_total: maxTotal ?? null,
|
|
@@ -26429,8 +26372,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26429
26372
|
return {
|
|
26430
26373
|
score: 0,
|
|
26431
26374
|
verdict: "fail",
|
|
26432
|
-
|
|
26433
|
-
misses: ["No trace available for evaluation"],
|
|
26375
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
26434
26376
|
expectedAspectCount: 1
|
|
26435
26377
|
};
|
|
26436
26378
|
}
|
|
@@ -26441,8 +26383,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26441
26383
|
return {
|
|
26442
26384
|
score: 0,
|
|
26443
26385
|
verdict: "fail",
|
|
26444
|
-
|
|
26445
|
-
misses: ["No trace available for evaluation"],
|
|
26386
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
26446
26387
|
expectedAspectCount: 1
|
|
26447
26388
|
};
|
|
26448
26389
|
}
|
|
@@ -26460,8 +26401,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26460
26401
|
return {
|
|
26461
26402
|
score: 0,
|
|
26462
26403
|
verdict: "fail",
|
|
26463
|
-
|
|
26464
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
26404
|
+
assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
|
|
26465
26405
|
expectedAspectCount: 1
|
|
26466
26406
|
};
|
|
26467
26407
|
}
|
|
@@ -26510,28 +26450,32 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26510
26450
|
return {
|
|
26511
26451
|
score: 1,
|
|
26512
26452
|
verdict: "pass",
|
|
26513
|
-
|
|
26514
|
-
misses: [],
|
|
26453
|
+
assertions: [{ text: "No tool requirements specified", passed: true }],
|
|
26515
26454
|
expectedAspectCount: 0
|
|
26516
26455
|
};
|
|
26517
26456
|
}
|
|
26518
|
-
const
|
|
26519
|
-
const misses = [];
|
|
26457
|
+
const assertions = [];
|
|
26520
26458
|
for (const toolName of toolNames) {
|
|
26521
26459
|
const required = minimums[toolName];
|
|
26522
26460
|
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
26523
26461
|
if (actual >= required) {
|
|
26524
|
-
|
|
26462
|
+
assertions.push({
|
|
26463
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
26464
|
+
passed: true
|
|
26465
|
+
});
|
|
26525
26466
|
} else {
|
|
26526
|
-
|
|
26467
|
+
assertions.push({
|
|
26468
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
26469
|
+
passed: false
|
|
26470
|
+
});
|
|
26527
26471
|
}
|
|
26528
26472
|
}
|
|
26529
|
-
const
|
|
26473
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
26474
|
+
const score = passedCount / toolNames.length;
|
|
26530
26475
|
return {
|
|
26531
26476
|
score,
|
|
26532
26477
|
verdict: scoreToVerdict(score),
|
|
26533
|
-
|
|
26534
|
-
misses,
|
|
26478
|
+
assertions,
|
|
26535
26479
|
expectedAspectCount: toolNames.length
|
|
26536
26480
|
};
|
|
26537
26481
|
}
|
|
@@ -26541,13 +26485,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26541
26485
|
return {
|
|
26542
26486
|
score: 1,
|
|
26543
26487
|
verdict: "pass",
|
|
26544
|
-
|
|
26545
|
-
misses: [],
|
|
26488
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
26546
26489
|
expectedAspectCount: 0
|
|
26547
26490
|
};
|
|
26548
26491
|
}
|
|
26549
|
-
const
|
|
26550
|
-
const misses = [];
|
|
26492
|
+
const assertions = [];
|
|
26551
26493
|
const warnings = [];
|
|
26552
26494
|
let actualIndex = 0;
|
|
26553
26495
|
let sequenceHits = 0;
|
|
@@ -26567,16 +26509,20 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26567
26509
|
const actualCall = toolCalls[actualIndex];
|
|
26568
26510
|
if (actualCall.name === expectedTool) {
|
|
26569
26511
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
26570
|
-
|
|
26512
|
+
assertions.push({
|
|
26513
|
+
text: `Found ${expectedTool} at position ${actualIndex}`,
|
|
26514
|
+
passed: true
|
|
26515
|
+
});
|
|
26571
26516
|
sequenceHits++;
|
|
26572
26517
|
matchedCall = actualCall;
|
|
26573
26518
|
actualIndex++;
|
|
26574
26519
|
found = true;
|
|
26575
26520
|
break;
|
|
26576
26521
|
}
|
|
26577
|
-
|
|
26578
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch
|
|
26579
|
-
|
|
26522
|
+
assertions.push({
|
|
26523
|
+
text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
|
|
26524
|
+
passed: false
|
|
26525
|
+
});
|
|
26580
26526
|
actualIndex++;
|
|
26581
26527
|
argsMismatch = true;
|
|
26582
26528
|
break;
|
|
@@ -26584,7 +26530,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26584
26530
|
actualIndex++;
|
|
26585
26531
|
}
|
|
26586
26532
|
if (!found && !argsMismatch) {
|
|
26587
|
-
|
|
26533
|
+
assertions.push({
|
|
26534
|
+
text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
|
|
26535
|
+
passed: false
|
|
26536
|
+
});
|
|
26588
26537
|
}
|
|
26589
26538
|
if (found && matchedCall) {
|
|
26590
26539
|
const latencyResult = checkLatency(
|
|
@@ -26593,10 +26542,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26593
26542
|
matchedCall.durationMs
|
|
26594
26543
|
);
|
|
26595
26544
|
if (latencyResult.status === "pass") {
|
|
26596
|
-
|
|
26545
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
26597
26546
|
latencyHits++;
|
|
26598
26547
|
} else if (latencyResult.status === "fail") {
|
|
26599
|
-
|
|
26548
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
26600
26549
|
} else if (latencyResult.message) {
|
|
26601
26550
|
warnings.push(latencyResult.message);
|
|
26602
26551
|
latencySkips++;
|
|
@@ -26612,8 +26561,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26612
26561
|
return {
|
|
26613
26562
|
score,
|
|
26614
26563
|
verdict: scoreToVerdict(score),
|
|
26615
|
-
|
|
26616
|
-
misses,
|
|
26564
|
+
assertions,
|
|
26617
26565
|
expectedAspectCount: totalAssertions
|
|
26618
26566
|
};
|
|
26619
26567
|
}
|
|
@@ -26623,13 +26571,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26623
26571
|
return {
|
|
26624
26572
|
score: 1,
|
|
26625
26573
|
verdict: "pass",
|
|
26626
|
-
|
|
26627
|
-
misses: [],
|
|
26574
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
26628
26575
|
expectedAspectCount: 0
|
|
26629
26576
|
};
|
|
26630
26577
|
}
|
|
26631
|
-
const
|
|
26632
|
-
const misses = [];
|
|
26578
|
+
const assertions = [];
|
|
26633
26579
|
const warnings = [];
|
|
26634
26580
|
let sequenceHits = 0;
|
|
26635
26581
|
let latencyHits = 0;
|
|
@@ -26638,7 +26584,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26638
26584
|
(item) => item.maxDurationMs !== void 0
|
|
26639
26585
|
).length;
|
|
26640
26586
|
if (toolCalls.length !== expected.length) {
|
|
26641
|
-
|
|
26587
|
+
assertions.push({
|
|
26588
|
+
text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
|
|
26589
|
+
passed: false
|
|
26590
|
+
});
|
|
26642
26591
|
}
|
|
26643
26592
|
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
26644
26593
|
for (let i = 0; i < checkLength; i++) {
|
|
@@ -26650,14 +26599,17 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26650
26599
|
let sequenceMatched = false;
|
|
26651
26600
|
if (actualTool === expectedTool) {
|
|
26652
26601
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
26653
|
-
|
|
26602
|
+
assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
|
|
26654
26603
|
sequenceHits++;
|
|
26655
26604
|
sequenceMatched = true;
|
|
26656
26605
|
} else {
|
|
26657
|
-
|
|
26606
|
+
assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
|
|
26658
26607
|
}
|
|
26659
26608
|
} else {
|
|
26660
|
-
|
|
26609
|
+
assertions.push({
|
|
26610
|
+
text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
|
|
26611
|
+
passed: false
|
|
26612
|
+
});
|
|
26661
26613
|
}
|
|
26662
26614
|
if (sequenceMatched) {
|
|
26663
26615
|
const latencyResult = checkLatency(
|
|
@@ -26666,10 +26618,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26666
26618
|
actualCall.durationMs
|
|
26667
26619
|
);
|
|
26668
26620
|
if (latencyResult.status === "pass") {
|
|
26669
|
-
|
|
26621
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
26670
26622
|
latencyHits++;
|
|
26671
26623
|
} else if (latencyResult.status === "fail") {
|
|
26672
|
-
|
|
26624
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
26673
26625
|
} else if (latencyResult.message) {
|
|
26674
26626
|
warnings.push(latencyResult.message);
|
|
26675
26627
|
latencySkips++;
|
|
@@ -26677,7 +26629,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26677
26629
|
}
|
|
26678
26630
|
}
|
|
26679
26631
|
for (let i = checkLength; i < expected.length; i++) {
|
|
26680
|
-
|
|
26632
|
+
assertions.push({
|
|
26633
|
+
text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
|
|
26634
|
+
passed: false
|
|
26635
|
+
});
|
|
26681
26636
|
}
|
|
26682
26637
|
for (const warning of warnings) {
|
|
26683
26638
|
console.warn(`[tool-trajectory] ${warning}`);
|
|
@@ -26688,8 +26643,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26688
26643
|
return {
|
|
26689
26644
|
score,
|
|
26690
26645
|
verdict: scoreToVerdict(score),
|
|
26691
|
-
|
|
26692
|
-
misses,
|
|
26646
|
+
assertions,
|
|
26693
26647
|
expectedAspectCount: totalAssertions
|
|
26694
26648
|
};
|
|
26695
26649
|
}
|
|
@@ -26704,13 +26658,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26704
26658
|
return {
|
|
26705
26659
|
score: 1,
|
|
26706
26660
|
verdict: "pass",
|
|
26707
|
-
|
|
26708
|
-
misses: [],
|
|
26661
|
+
assertions: [{ text: "No expected tools specified", passed: true }],
|
|
26709
26662
|
expectedAspectCount: 0
|
|
26710
26663
|
};
|
|
26711
26664
|
}
|
|
26712
|
-
const
|
|
26713
|
-
const misses = [];
|
|
26665
|
+
const assertions = [];
|
|
26714
26666
|
const consumed = /* @__PURE__ */ new Set();
|
|
26715
26667
|
for (let i = 0; i < expected.length; i++) {
|
|
26716
26668
|
const expectedItem = expected[i];
|
|
@@ -26721,22 +26673,25 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26721
26673
|
if (consumed.has(j)) continue;
|
|
26722
26674
|
const actualCall = toolCalls[j];
|
|
26723
26675
|
if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
26724
|
-
|
|
26676
|
+
assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
|
|
26725
26677
|
consumed.add(j);
|
|
26726
26678
|
found = true;
|
|
26727
26679
|
break;
|
|
26728
26680
|
}
|
|
26729
26681
|
}
|
|
26730
26682
|
if (!found) {
|
|
26731
|
-
|
|
26683
|
+
assertions.push({
|
|
26684
|
+
text: `Expected ${expectedTool} not found in actual trajectory`,
|
|
26685
|
+
passed: false
|
|
26686
|
+
});
|
|
26732
26687
|
}
|
|
26733
26688
|
}
|
|
26734
|
-
const
|
|
26689
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
26690
|
+
const score = expected.length > 0 ? passedCount / expected.length : 1;
|
|
26735
26691
|
return {
|
|
26736
26692
|
score,
|
|
26737
26693
|
verdict: scoreToVerdict(score),
|
|
26738
|
-
|
|
26739
|
-
misses,
|
|
26694
|
+
assertions,
|
|
26740
26695
|
expectedAspectCount: expected.length
|
|
26741
26696
|
};
|
|
26742
26697
|
}
|
|
@@ -26752,16 +26707,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26752
26707
|
return {
|
|
26753
26708
|
score: 1,
|
|
26754
26709
|
verdict: "pass",
|
|
26755
|
-
|
|
26756
|
-
misses: [],
|
|
26710
|
+
assertions: [{ text: "No tool calls and no expected tools", passed: true }],
|
|
26757
26711
|
expectedAspectCount: 0
|
|
26758
26712
|
};
|
|
26759
26713
|
}
|
|
26760
26714
|
return {
|
|
26761
26715
|
score: 0,
|
|
26762
26716
|
verdict: "fail",
|
|
26763
|
-
|
|
26764
|
-
|
|
26717
|
+
assertions: [
|
|
26718
|
+
{
|
|
26719
|
+
text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
|
|
26720
|
+
passed: false
|
|
26721
|
+
}
|
|
26722
|
+
],
|
|
26765
26723
|
expectedAspectCount: toolCalls.length
|
|
26766
26724
|
};
|
|
26767
26725
|
}
|
|
@@ -26769,13 +26727,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26769
26727
|
return {
|
|
26770
26728
|
score: 1,
|
|
26771
26729
|
verdict: "pass",
|
|
26772
|
-
|
|
26773
|
-
misses: [],
|
|
26730
|
+
assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
|
|
26774
26731
|
expectedAspectCount: 0
|
|
26775
26732
|
};
|
|
26776
26733
|
}
|
|
26777
|
-
const
|
|
26778
|
-
const misses = [];
|
|
26734
|
+
const assertions = [];
|
|
26779
26735
|
for (let i = 0; i < toolCalls.length; i++) {
|
|
26780
26736
|
const actualCall = toolCalls[i];
|
|
26781
26737
|
let allowed = false;
|
|
@@ -26787,17 +26743,23 @@ var ToolTrajectoryEvaluator = class {
|
|
|
26787
26743
|
}
|
|
26788
26744
|
}
|
|
26789
26745
|
if (allowed) {
|
|
26790
|
-
|
|
26746
|
+
assertions.push({
|
|
26747
|
+
text: `Position ${i}: ${actualCall.name} is in allowed set`,
|
|
26748
|
+
passed: true
|
|
26749
|
+
});
|
|
26791
26750
|
} else {
|
|
26792
|
-
|
|
26751
|
+
assertions.push({
|
|
26752
|
+
text: `Position ${i}: ${actualCall.name} is not in allowed set`,
|
|
26753
|
+
passed: false
|
|
26754
|
+
});
|
|
26793
26755
|
}
|
|
26794
26756
|
}
|
|
26795
|
-
const
|
|
26757
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
26758
|
+
const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
|
|
26796
26759
|
return {
|
|
26797
26760
|
score,
|
|
26798
26761
|
verdict: scoreToVerdict(score),
|
|
26799
|
-
|
|
26800
|
-
misses,
|
|
26762
|
+
assertions,
|
|
26801
26763
|
expectedAspectCount: toolCalls.length
|
|
26802
26764
|
};
|
|
26803
26765
|
}
|
|
@@ -26806,8 +26768,12 @@ function runContainsAssertion(output, value) {
|
|
|
26806
26768
|
const passed = output.includes(value);
|
|
26807
26769
|
return {
|
|
26808
26770
|
score: passed ? 1 : 0,
|
|
26809
|
-
|
|
26810
|
-
|
|
26771
|
+
assertions: [
|
|
26772
|
+
{
|
|
26773
|
+
text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
|
|
26774
|
+
passed
|
|
26775
|
+
}
|
|
26776
|
+
]
|
|
26811
26777
|
};
|
|
26812
26778
|
}
|
|
26813
26779
|
function runContainsAnyAssertion(output, values) {
|
|
@@ -26815,8 +26781,12 @@ function runContainsAnyAssertion(output, values) {
|
|
|
26815
26781
|
const passed = matched.length > 0;
|
|
26816
26782
|
return {
|
|
26817
26783
|
score: passed ? 1 : 0,
|
|
26818
|
-
|
|
26819
|
-
|
|
26784
|
+
assertions: [
|
|
26785
|
+
{
|
|
26786
|
+
text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
|
|
26787
|
+
passed
|
|
26788
|
+
}
|
|
26789
|
+
]
|
|
26820
26790
|
};
|
|
26821
26791
|
}
|
|
26822
26792
|
function runContainsAllAssertion(output, values) {
|
|
@@ -26824,16 +26794,24 @@ function runContainsAllAssertion(output, values) {
|
|
|
26824
26794
|
const passed = missing.length === 0;
|
|
26825
26795
|
return {
|
|
26826
26796
|
score: passed ? 1 : 0,
|
|
26827
|
-
|
|
26828
|
-
|
|
26797
|
+
assertions: [
|
|
26798
|
+
{
|
|
26799
|
+
text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
26800
|
+
passed
|
|
26801
|
+
}
|
|
26802
|
+
]
|
|
26829
26803
|
};
|
|
26830
26804
|
}
|
|
26831
26805
|
function runIcontainsAssertion(output, value) {
|
|
26832
26806
|
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
26833
26807
|
return {
|
|
26834
26808
|
score: passed ? 1 : 0,
|
|
26835
|
-
|
|
26836
|
-
|
|
26809
|
+
assertions: [
|
|
26810
|
+
{
|
|
26811
|
+
text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
|
|
26812
|
+
passed
|
|
26813
|
+
}
|
|
26814
|
+
]
|
|
26837
26815
|
};
|
|
26838
26816
|
}
|
|
26839
26817
|
function runIcontainsAnyAssertion(output, values) {
|
|
@@ -26842,9 +26820,11 @@ function runIcontainsAnyAssertion(output, values) {
|
|
|
26842
26820
|
const passed = matched.length > 0;
|
|
26843
26821
|
return {
|
|
26844
26822
|
score: passed ? 1 : 0,
|
|
26845
|
-
|
|
26846
|
-
|
|
26847
|
-
|
|
26823
|
+
assertions: [
|
|
26824
|
+
{
|
|
26825
|
+
text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
|
|
26826
|
+
passed
|
|
26827
|
+
}
|
|
26848
26828
|
]
|
|
26849
26829
|
};
|
|
26850
26830
|
}
|
|
@@ -26854,24 +26834,36 @@ function runIcontainsAllAssertion(output, values) {
|
|
|
26854
26834
|
const passed = missing.length === 0;
|
|
26855
26835
|
return {
|
|
26856
26836
|
score: passed ? 1 : 0,
|
|
26857
|
-
|
|
26858
|
-
|
|
26837
|
+
assertions: [
|
|
26838
|
+
{
|
|
26839
|
+
text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
26840
|
+
passed
|
|
26841
|
+
}
|
|
26842
|
+
]
|
|
26859
26843
|
};
|
|
26860
26844
|
}
|
|
26861
26845
|
function runStartsWithAssertion(output, value) {
|
|
26862
26846
|
const passed = output.trim().startsWith(value.trim());
|
|
26863
26847
|
return {
|
|
26864
26848
|
score: passed ? 1 : 0,
|
|
26865
|
-
|
|
26866
|
-
|
|
26849
|
+
assertions: [
|
|
26850
|
+
{
|
|
26851
|
+
text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
|
|
26852
|
+
passed
|
|
26853
|
+
}
|
|
26854
|
+
]
|
|
26867
26855
|
};
|
|
26868
26856
|
}
|
|
26869
26857
|
function runEndsWithAssertion(output, value) {
|
|
26870
26858
|
const passed = output.trim().endsWith(value.trim());
|
|
26871
26859
|
return {
|
|
26872
26860
|
score: passed ? 1 : 0,
|
|
26873
|
-
|
|
26874
|
-
|
|
26861
|
+
assertions: [
|
|
26862
|
+
{
|
|
26863
|
+
text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
|
|
26864
|
+
passed
|
|
26865
|
+
}
|
|
26866
|
+
]
|
|
26875
26867
|
};
|
|
26876
26868
|
}
|
|
26877
26869
|
function runRegexAssertion(output, pattern, flags) {
|
|
@@ -26880,8 +26872,12 @@ function runRegexAssertion(output, pattern, flags) {
|
|
|
26880
26872
|
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
26881
26873
|
return {
|
|
26882
26874
|
score: passed ? 1 : 0,
|
|
26883
|
-
|
|
26884
|
-
|
|
26875
|
+
assertions: [
|
|
26876
|
+
{
|
|
26877
|
+
text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
|
|
26878
|
+
passed
|
|
26879
|
+
}
|
|
26880
|
+
]
|
|
26885
26881
|
};
|
|
26886
26882
|
}
|
|
26887
26883
|
function runIsJsonAssertion(output) {
|
|
@@ -26893,16 +26889,24 @@ function runIsJsonAssertion(output) {
|
|
|
26893
26889
|
}
|
|
26894
26890
|
return {
|
|
26895
26891
|
score: passed ? 1 : 0,
|
|
26896
|
-
|
|
26897
|
-
|
|
26892
|
+
assertions: [
|
|
26893
|
+
{
|
|
26894
|
+
text: passed ? "Output is valid JSON" : "Output is not valid JSON",
|
|
26895
|
+
passed
|
|
26896
|
+
}
|
|
26897
|
+
]
|
|
26898
26898
|
};
|
|
26899
26899
|
}
|
|
26900
26900
|
function runEqualsAssertion(output, value) {
|
|
26901
26901
|
const passed = output.trim() === value.trim();
|
|
26902
26902
|
return {
|
|
26903
26903
|
score: passed ? 1 : 0,
|
|
26904
|
-
|
|
26905
|
-
|
|
26904
|
+
assertions: [
|
|
26905
|
+
{
|
|
26906
|
+
text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
|
|
26907
|
+
passed
|
|
26908
|
+
}
|
|
26909
|
+
]
|
|
26906
26910
|
};
|
|
26907
26911
|
}
|
|
26908
26912
|
var Node = class {
|
|
@@ -27101,10 +27105,8 @@ var InlineAssertEvaluator = class {
|
|
|
27101
27105
|
return {
|
|
27102
27106
|
score,
|
|
27103
27107
|
verdict: scoreToVerdict(score),
|
|
27104
|
-
|
|
27105
|
-
misses: score < 0.5 ? [result.name] : [],
|
|
27108
|
+
assertions: [{ text: result.name, passed: score >= 0.5 }],
|
|
27106
27109
|
expectedAspectCount: 1,
|
|
27107
|
-
reasoning: void 0,
|
|
27108
27110
|
details: result.metadata ? result.metadata : void 0
|
|
27109
27111
|
};
|
|
27110
27112
|
}
|
|
@@ -27139,11 +27141,9 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
|
|
|
27139
27141
|
}
|
|
27140
27142
|
async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
27141
27143
|
const payload = {
|
|
27142
|
-
question: context2.evalCase.question,
|
|
27143
27144
|
criteria: context2.evalCase.criteria,
|
|
27144
27145
|
expectedOutput: context2.evalCase.expected_output,
|
|
27145
|
-
|
|
27146
|
-
answer: context2.candidate,
|
|
27146
|
+
outputText: context2.candidate,
|
|
27147
27147
|
output: context2.output ?? null,
|
|
27148
27148
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
27149
27149
|
inputFiles: context2.evalCase.file_paths.filter(
|
|
@@ -27154,9 +27154,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
|
27154
27154
|
fileChanges: context2.fileChanges ?? null,
|
|
27155
27155
|
workspacePath: context2.workspacePath ?? null,
|
|
27156
27156
|
config: config ?? context2.config ?? null,
|
|
27157
|
-
// Text convenience accessors (new names, always strings)
|
|
27158
27157
|
inputText: context2.evalCase.question,
|
|
27159
|
-
outputText: context2.candidate,
|
|
27160
27158
|
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
27161
27159
|
};
|
|
27162
27160
|
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -27292,9 +27290,7 @@ var containsFactory = (config) => {
|
|
|
27292
27290
|
return {
|
|
27293
27291
|
score: result.score,
|
|
27294
27292
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27295
|
-
|
|
27296
|
-
misses: result.misses,
|
|
27297
|
-
reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
|
|
27293
|
+
assertions: result.assertions,
|
|
27298
27294
|
expectedAspectCount: 1
|
|
27299
27295
|
};
|
|
27300
27296
|
});
|
|
@@ -27306,9 +27302,7 @@ var regexFactory = (config) => {
|
|
|
27306
27302
|
return {
|
|
27307
27303
|
score: result.score,
|
|
27308
27304
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27309
|
-
|
|
27310
|
-
misses: result.misses,
|
|
27311
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
27305
|
+
assertions: result.assertions,
|
|
27312
27306
|
expectedAspectCount: 1
|
|
27313
27307
|
};
|
|
27314
27308
|
});
|
|
@@ -27319,9 +27313,7 @@ var isJsonFactory = () => {
|
|
|
27319
27313
|
return {
|
|
27320
27314
|
score: result.score,
|
|
27321
27315
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27322
|
-
|
|
27323
|
-
misses: result.misses,
|
|
27324
|
-
reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
|
|
27316
|
+
assertions: result.assertions,
|
|
27325
27317
|
expectedAspectCount: 1
|
|
27326
27318
|
};
|
|
27327
27319
|
});
|
|
@@ -27333,9 +27325,7 @@ var equalsFactory = (config) => {
|
|
|
27333
27325
|
return {
|
|
27334
27326
|
score: result.score,
|
|
27335
27327
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27336
|
-
|
|
27337
|
-
misses: result.misses,
|
|
27338
|
-
reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
|
|
27328
|
+
assertions: result.assertions,
|
|
27339
27329
|
expectedAspectCount: 1
|
|
27340
27330
|
};
|
|
27341
27331
|
});
|
|
@@ -27347,9 +27337,7 @@ var containsAnyFactory = (config) => {
|
|
|
27347
27337
|
return {
|
|
27348
27338
|
score: result.score,
|
|
27349
27339
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27350
|
-
|
|
27351
|
-
misses: result.misses,
|
|
27352
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27340
|
+
assertions: result.assertions,
|
|
27353
27341
|
expectedAspectCount: 1
|
|
27354
27342
|
};
|
|
27355
27343
|
});
|
|
@@ -27361,9 +27349,7 @@ var containsAllFactory = (config) => {
|
|
|
27361
27349
|
return {
|
|
27362
27350
|
score: result.score,
|
|
27363
27351
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27364
|
-
|
|
27365
|
-
misses: result.misses,
|
|
27366
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27352
|
+
assertions: result.assertions,
|
|
27367
27353
|
expectedAspectCount: 1
|
|
27368
27354
|
};
|
|
27369
27355
|
});
|
|
@@ -27375,9 +27361,7 @@ var icontainsFactory = (config) => {
|
|
|
27375
27361
|
return {
|
|
27376
27362
|
score: result.score,
|
|
27377
27363
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27378
|
-
|
|
27379
|
-
misses: result.misses,
|
|
27380
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27364
|
+
assertions: result.assertions,
|
|
27381
27365
|
expectedAspectCount: 1
|
|
27382
27366
|
};
|
|
27383
27367
|
});
|
|
@@ -27389,9 +27373,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
27389
27373
|
return {
|
|
27390
27374
|
score: result.score,
|
|
27391
27375
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27392
|
-
|
|
27393
|
-
misses: result.misses,
|
|
27394
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27376
|
+
assertions: result.assertions,
|
|
27395
27377
|
expectedAspectCount: 1
|
|
27396
27378
|
};
|
|
27397
27379
|
});
|
|
@@ -27403,9 +27385,7 @@ var icontainsAllFactory = (config) => {
|
|
|
27403
27385
|
return {
|
|
27404
27386
|
score: result.score,
|
|
27405
27387
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27406
|
-
|
|
27407
|
-
misses: result.misses,
|
|
27408
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27388
|
+
assertions: result.assertions,
|
|
27409
27389
|
expectedAspectCount: 1
|
|
27410
27390
|
};
|
|
27411
27391
|
});
|
|
@@ -27417,9 +27397,7 @@ var startsWithFactory = (config) => {
|
|
|
27417
27397
|
return {
|
|
27418
27398
|
score: result.score,
|
|
27419
27399
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27420
|
-
|
|
27421
|
-
misses: result.misses,
|
|
27422
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27400
|
+
assertions: result.assertions,
|
|
27423
27401
|
expectedAspectCount: 1
|
|
27424
27402
|
};
|
|
27425
27403
|
});
|
|
@@ -27431,9 +27409,7 @@ var endsWithFactory = (config) => {
|
|
|
27431
27409
|
return {
|
|
27432
27410
|
score: result.score,
|
|
27433
27411
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
27434
|
-
|
|
27435
|
-
misses: result.misses,
|
|
27436
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
27412
|
+
assertions: result.assertions,
|
|
27437
27413
|
expectedAspectCount: 1
|
|
27438
27414
|
};
|
|
27439
27415
|
});
|
|
@@ -28462,7 +28438,7 @@ async function runEvaluation(options) {
|
|
|
28462
28438
|
if (!cliModel) {
|
|
28463
28439
|
throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
|
|
28464
28440
|
}
|
|
28465
|
-
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-
|
|
28441
|
+
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-NFFLXG5M-TJAWCWCX.js");
|
|
28466
28442
|
return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
|
|
28467
28443
|
}
|
|
28468
28444
|
const overrideTarget = resolveTargetByName(cliGraderTarget);
|
|
@@ -28797,9 +28773,8 @@ async function runEvaluation(options) {
|
|
|
28797
28773
|
testId: evalCase.id,
|
|
28798
28774
|
dataset: evalCase.dataset,
|
|
28799
28775
|
score: 0,
|
|
28800
|
-
|
|
28801
|
-
|
|
28802
|
-
answer: "",
|
|
28776
|
+
assertions: [],
|
|
28777
|
+
outputText: "",
|
|
28803
28778
|
target: target.name,
|
|
28804
28779
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
28805
28780
|
budgetExceeded: true,
|
|
@@ -28834,9 +28809,8 @@ async function runEvaluation(options) {
|
|
|
28834
28809
|
testId: evalCase.id,
|
|
28835
28810
|
dataset: evalCase.dataset,
|
|
28836
28811
|
score: 0,
|
|
28837
|
-
|
|
28838
|
-
|
|
28839
|
-
answer: "",
|
|
28812
|
+
assertions: [],
|
|
28813
|
+
outputText: "",
|
|
28840
28814
|
target: target.name,
|
|
28841
28815
|
error: errorMsg,
|
|
28842
28816
|
executionStatus: "execution_error",
|
|
@@ -29802,11 +29776,9 @@ async function evaluateCandidate(options) {
|
|
|
29802
29776
|
dataset: evalCase.dataset,
|
|
29803
29777
|
conversationId: evalCase.conversation_id,
|
|
29804
29778
|
score: score.score,
|
|
29805
|
-
|
|
29806
|
-
|
|
29807
|
-
answer: candidate,
|
|
29779
|
+
assertions: score.assertions,
|
|
29780
|
+
outputText: candidate,
|
|
29808
29781
|
target: target.name,
|
|
29809
|
-
reasoning: score.reasoning,
|
|
29810
29782
|
tokenUsage,
|
|
29811
29783
|
costUsd,
|
|
29812
29784
|
durationMs,
|
|
@@ -29980,9 +29952,7 @@ async function runEvaluatorList(options) {
|
|
|
29980
29952
|
score: score2.score,
|
|
29981
29953
|
weight,
|
|
29982
29954
|
verdict: score2.verdict,
|
|
29983
|
-
|
|
29984
|
-
misses: score2.misses,
|
|
29985
|
-
reasoning: score2.reasoning,
|
|
29955
|
+
assertions: score2.assertions,
|
|
29986
29956
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
29987
29957
|
details: score2.details,
|
|
29988
29958
|
scores: mapChildResults(score2.scores),
|
|
@@ -29997,10 +29967,10 @@ async function runEvaluatorList(options) {
|
|
|
29997
29967
|
const fallbackScore = {
|
|
29998
29968
|
score: 0,
|
|
29999
29969
|
verdict: "fail",
|
|
30000
|
-
|
|
30001
|
-
|
|
30002
|
-
|
|
30003
|
-
|
|
29970
|
+
assertions: [
|
|
29971
|
+
{ text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
|
|
29972
|
+
],
|
|
29973
|
+
expectedAspectCount: 1
|
|
30004
29974
|
};
|
|
30005
29975
|
const weight = evaluatorConfig.weight ?? 1;
|
|
30006
29976
|
scored.push({
|
|
@@ -30016,9 +29986,12 @@ async function runEvaluatorList(options) {
|
|
|
30016
29986
|
score: 0,
|
|
30017
29987
|
weight,
|
|
30018
29988
|
verdict: "fail",
|
|
30019
|
-
|
|
30020
|
-
|
|
30021
|
-
|
|
29989
|
+
assertions: [
|
|
29990
|
+
{
|
|
29991
|
+
text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
|
|
29992
|
+
passed: false
|
|
29993
|
+
}
|
|
29994
|
+
],
|
|
30022
29995
|
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
30023
29996
|
startedAt: startedAt.toISOString(),
|
|
30024
29997
|
endedAt: endedAt.toISOString()
|
|
@@ -30034,9 +30007,7 @@ async function runEvaluatorList(options) {
|
|
|
30034
30007
|
...scores[lastScoresIdx],
|
|
30035
30008
|
score: negated.score,
|
|
30036
30009
|
verdict: negated.verdict,
|
|
30037
|
-
|
|
30038
|
-
misses: [...negated.misses],
|
|
30039
|
-
reasoning: negated.reasoning
|
|
30010
|
+
assertions: [...negated.assertions]
|
|
30040
30011
|
};
|
|
30041
30012
|
}
|
|
30042
30013
|
}
|
|
@@ -30051,21 +30022,13 @@ async function runEvaluatorList(options) {
|
|
|
30051
30022
|
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
30052
30023
|
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
30053
30024
|
) : 0;
|
|
30054
|
-
const
|
|
30055
|
-
const
|
|
30056
|
-
const expectedAspectCount = scored.reduce(
|
|
30057
|
-
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
30058
|
-
0
|
|
30059
|
-
);
|
|
30060
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
30061
|
-
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
30025
|
+
const assertions = scored.flatMap((entry) => entry.score.assertions);
|
|
30026
|
+
const expectedAspectCount = assertions.length || 1;
|
|
30062
30027
|
const score = {
|
|
30063
30028
|
score: aggregateScore,
|
|
30064
30029
|
verdict: scoreToVerdict(aggregateScore),
|
|
30065
|
-
|
|
30066
|
-
|
|
30067
|
-
expectedAspectCount,
|
|
30068
|
-
reasoning
|
|
30030
|
+
assertions,
|
|
30031
|
+
expectedAspectCount
|
|
30069
30032
|
};
|
|
30070
30033
|
return { score, scores };
|
|
30071
30034
|
}
|
|
@@ -30169,9 +30132,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
30169
30132
|
dataset: evalCase.dataset,
|
|
30170
30133
|
conversationId: evalCase.conversation_id,
|
|
30171
30134
|
score: 0,
|
|
30172
|
-
|
|
30173
|
-
|
|
30174
|
-
answer: `Error occurred: ${message}`,
|
|
30135
|
+
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
30136
|
+
outputText: `Error occurred: ${message}`,
|
|
30175
30137
|
target: targetName,
|
|
30176
30138
|
requests,
|
|
30177
30139
|
input,
|
|
@@ -30280,9 +30242,7 @@ function mapChildResults(children) {
|
|
|
30280
30242
|
score: child.score,
|
|
30281
30243
|
weight: child.weight,
|
|
30282
30244
|
verdict: child.verdict,
|
|
30283
|
-
|
|
30284
|
-
misses: child.misses,
|
|
30285
|
-
reasoning: child.reasoning,
|
|
30245
|
+
assertions: child.assertions,
|
|
30286
30246
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
30287
30247
|
scores: mapChildResults(child.scores),
|
|
30288
30248
|
details: child.details,
|
|
@@ -30690,7 +30650,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
30690
30650
|
return false;
|
|
30691
30651
|
}
|
|
30692
30652
|
var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
30693
|
-
"
|
|
30653
|
+
"outputText",
|
|
30694
30654
|
"requests",
|
|
30695
30655
|
"trace",
|
|
30696
30656
|
"workspacePath",
|
|
@@ -30862,7 +30822,7 @@ var OtelTraceExporter = class {
|
|
|
30862
30822
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
30863
30823
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
30864
30824
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
30865
|
-
if (captureContent) rootSpan.setAttribute("agentv.
|
|
30825
|
+
if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
|
|
30866
30826
|
if (result.durationMs != null)
|
|
30867
30827
|
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
30868
30828
|
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
@@ -31150,7 +31110,6 @@ export {
|
|
|
31150
31110
|
isJsonValue,
|
|
31151
31111
|
isTestMessage,
|
|
31152
31112
|
isEvaluatorKind,
|
|
31153
|
-
getHitCount,
|
|
31154
31113
|
fileExists,
|
|
31155
31114
|
normalizeLineEndings,
|
|
31156
31115
|
readTextFile,
|
|
@@ -31290,4 +31249,4 @@ export {
|
|
|
31290
31249
|
OtelStreamingObserver,
|
|
31291
31250
|
createAgentKernel
|
|
31292
31251
|
};
|
|
31293
|
-
//# sourceMappingURL=chunk-
|
|
31252
|
+
//# sourceMappingURL=chunk-K4RXLQWV.js.map
|