@agentv/core 3.4.0 → 3.6.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-NFFLXG5M.js +7 -0
- package/dist/{chunk-JO4HIAEF.js → chunk-2IZOTQ25.js} +1 -5
- package/dist/chunk-2IZOTQ25.js.map +1 -0
- package/dist/{chunk-Q52FQPKQ.js → chunk-W5YDZWT4.js} +2 -2
- package/dist/chunk-W5YDZWT4.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +449 -491
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +57 -47
- package/dist/index.d.ts +57 -47
- package/dist/index.js +451 -490
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/agentv-provider-HDSAUUEF.js +0 -7
- package/dist/chunk-JO4HIAEF.js.map +0 -1
- package/dist/chunk-Q52FQPKQ.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF.js.map → agentv-provider-NFFLXG5M.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -5,7 +5,6 @@ import {
|
|
|
5
5
|
extractLastAssistantContent,
|
|
6
6
|
fileExists,
|
|
7
7
|
findGitRoot,
|
|
8
|
-
getHitCount,
|
|
9
8
|
isAgentProvider,
|
|
10
9
|
isEvaluatorKind,
|
|
11
10
|
isJsonObject,
|
|
@@ -17,10 +16,10 @@ import {
|
|
|
17
16
|
readTextFile,
|
|
18
17
|
resolveFileReference,
|
|
19
18
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-JO4HIAEF.js";
|
|
19
|
+
} from "./chunk-2IZOTQ25.js";
|
|
21
20
|
import {
|
|
22
21
|
AgentvProvider
|
|
23
|
-
} from "./chunk-Q52FQPKQ.js";
|
|
22
|
+
} from "./chunk-W5YDZWT4.js";
|
|
24
23
|
import {
|
|
25
24
|
OtlpJsonFileExporter
|
|
26
25
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -743,14 +742,8 @@ import { readFile as readFile4 } from "node:fs/promises";
|
|
|
743
742
|
|
|
744
743
|
// src/evaluation/template-variables.ts
|
|
745
744
|
var TEMPLATE_VARIABLES = {
|
|
746
|
-
/** @deprecated Use OUTPUT_TEXT instead */
|
|
747
|
-
ANSWER: "answer",
|
|
748
745
|
EXPECTED_OUTPUT: "expected_output",
|
|
749
|
-
/** @deprecated Use INPUT_TEXT instead */
|
|
750
|
-
QUESTION: "question",
|
|
751
746
|
CRITERIA: "criteria",
|
|
752
|
-
/** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
|
|
753
|
-
REFERENCE_ANSWER: "reference_answer",
|
|
754
747
|
INPUT: "input",
|
|
755
748
|
OUTPUT: "output",
|
|
756
749
|
FILE_CHANGES: "file_changes",
|
|
@@ -760,9 +753,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
760
753
|
};
|
|
761
754
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
762
755
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
763
|
-
TEMPLATE_VARIABLES.
|
|
764
|
-
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
765
|
-
TEMPLATE_VARIABLES.OUTPUT_TEXT
|
|
756
|
+
TEMPLATE_VARIABLES.OUTPUT_TEXT,
|
|
757
|
+
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
766
758
|
]);
|
|
767
759
|
|
|
768
760
|
// src/evaluation/validation/prompt-validator.ts
|
|
@@ -785,13 +777,13 @@ function validateTemplateVariables(content, source) {
|
|
|
785
777
|
}
|
|
786
778
|
match = variablePattern.exec(content);
|
|
787
779
|
}
|
|
788
|
-
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.
|
|
780
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
789
781
|
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
|
|
790
782
|
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
|
|
791
783
|
if (!hasRequiredFields) {
|
|
792
784
|
throw new Error(
|
|
793
785
|
`Missing required fields. Must include at least one of:
|
|
794
|
-
- {{ ${TEMPLATE_VARIABLES.
|
|
786
|
+
- {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
|
|
795
787
|
- {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
|
|
796
788
|
);
|
|
797
789
|
}
|
|
@@ -3752,7 +3744,7 @@ var AzureProvider = class {
|
|
|
3752
3744
|
};
|
|
3753
3745
|
this.retryConfig = config.retry;
|
|
3754
3746
|
const azure = createAzure(buildAzureOptions(config));
|
|
3755
|
-
this.model = azure(config.deploymentName);
|
|
3747
|
+
this.model = azure.chat(config.deploymentName);
|
|
3756
3748
|
}
|
|
3757
3749
|
id;
|
|
3758
3750
|
kind = "azure";
|
|
@@ -3975,6 +3967,8 @@ async function invokeModel(options) {
|
|
|
3975
3967
|
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
3976
3968
|
const chatPrompt = buildChatPrompt(request);
|
|
3977
3969
|
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
3970
|
+
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
3971
|
+
const startMs = Date.now();
|
|
3978
3972
|
const result = await withRetry(
|
|
3979
3973
|
() => generateText({
|
|
3980
3974
|
model,
|
|
@@ -3988,9 +3982,11 @@ async function invokeModel(options) {
|
|
|
3988
3982
|
retryConfig,
|
|
3989
3983
|
request.signal
|
|
3990
3984
|
);
|
|
3991
|
-
|
|
3985
|
+
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
3986
|
+
const durationMs = Date.now() - startMs;
|
|
3987
|
+
return mapResponse(result, { durationMs, startTime, endTime });
|
|
3992
3988
|
}
|
|
3993
|
-
function mapResponse(result) {
|
|
3989
|
+
function mapResponse(result, timing) {
|
|
3994
3990
|
const content = result.text ?? "";
|
|
3995
3991
|
const rawUsage = result.totalUsage ?? result.usage;
|
|
3996
3992
|
const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
|
|
@@ -4005,7 +4001,10 @@ function mapResponse(result) {
|
|
|
4005
4001
|
raw: result,
|
|
4006
4002
|
usage: toJsonObject(rawUsage),
|
|
4007
4003
|
output: [{ role: "assistant", content }],
|
|
4008
|
-
tokenUsage
|
|
4004
|
+
tokenUsage,
|
|
4005
|
+
durationMs: timing?.durationMs,
|
|
4006
|
+
startTime: timing?.startTime,
|
|
4007
|
+
endTime: timing?.endTime
|
|
4009
4008
|
};
|
|
4010
4009
|
}
|
|
4011
4010
|
function toJsonObject(value) {
|
|
@@ -4883,10 +4882,12 @@ var ClaudeSdkProvider = class {
|
|
|
4883
4882
|
if (usage) {
|
|
4884
4883
|
const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
|
|
4885
4884
|
const outputTokens = usage.output_tokens ?? 0;
|
|
4885
|
+
const reasoningTokens = usage.reasoning_tokens ?? void 0;
|
|
4886
4886
|
tokenUsage = {
|
|
4887
4887
|
input: inputTokens,
|
|
4888
4888
|
output: outputTokens,
|
|
4889
|
-
cached: usage.cache_read_input_tokens ?? void 0
|
|
4889
|
+
cached: usage.cache_read_input_tokens ?? void 0,
|
|
4890
|
+
reasoning: reasoningTokens
|
|
4890
4891
|
};
|
|
4891
4892
|
request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
|
|
4892
4893
|
}
|
|
@@ -5900,7 +5901,8 @@ ${basePrompt}` : basePrompt;
|
|
|
5900
5901
|
onUsage({
|
|
5901
5902
|
input: usage.input_tokens ?? 0,
|
|
5902
5903
|
output: usage.output_tokens ?? 0,
|
|
5903
|
-
cached: usage.cached_input_tokens ?? void 0
|
|
5904
|
+
cached: usage.cached_input_tokens ?? void 0,
|
|
5905
|
+
reasoning: usage.reasoning_tokens ?? void 0
|
|
5904
5906
|
});
|
|
5905
5907
|
}
|
|
5906
5908
|
}
|
|
@@ -7914,10 +7916,12 @@ function extractTokenUsage(events) {
|
|
|
7914
7916
|
output: output ?? 0
|
|
7915
7917
|
};
|
|
7916
7918
|
const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
|
|
7917
|
-
|
|
7918
|
-
|
|
7919
|
-
|
|
7920
|
-
|
|
7919
|
+
const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
|
|
7920
|
+
return {
|
|
7921
|
+
...result,
|
|
7922
|
+
...cached !== void 0 ? { cached } : {},
|
|
7923
|
+
...reasoning !== void 0 ? { reasoning } : {}
|
|
7924
|
+
};
|
|
7921
7925
|
}
|
|
7922
7926
|
}
|
|
7923
7927
|
const messages = record.messages;
|
|
@@ -9784,9 +9788,11 @@ function negateScore(score) {
|
|
|
9784
9788
|
...score,
|
|
9785
9789
|
score: negatedScore,
|
|
9786
9790
|
verdict: negatedVerdict,
|
|
9787
|
-
|
|
9788
|
-
|
|
9789
|
-
|
|
9791
|
+
assertions: score.assertions.map((a) => ({
|
|
9792
|
+
...a,
|
|
9793
|
+
passed: !a.passed,
|
|
9794
|
+
evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
|
|
9795
|
+
}))
|
|
9790
9796
|
};
|
|
9791
9797
|
}
|
|
9792
9798
|
|
|
@@ -10244,11 +10250,9 @@ var CodeEvaluator = class {
|
|
|
10244
10250
|
}
|
|
10245
10251
|
}
|
|
10246
10252
|
const payload = {
|
|
10247
|
-
question: context.evalCase.question,
|
|
10248
10253
|
criteria: context.evalCase.criteria,
|
|
10249
10254
|
expectedOutput: context.evalCase.expected_output,
|
|
10250
|
-
|
|
10251
|
-
answer: context.candidate,
|
|
10255
|
+
outputText: context.candidate,
|
|
10252
10256
|
output: outputForPayload,
|
|
10253
10257
|
outputPath,
|
|
10254
10258
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
@@ -10265,9 +10269,7 @@ var CodeEvaluator = class {
|
|
|
10265
10269
|
fileChanges: context.fileChanges ?? null,
|
|
10266
10270
|
workspacePath: context.workspacePath ?? null,
|
|
10267
10271
|
config: this.config ?? null,
|
|
10268
|
-
// Text convenience accessors (new names, always strings)
|
|
10269
10272
|
inputText: context.evalCase.question,
|
|
10270
|
-
outputText: context.candidate,
|
|
10271
10273
|
expectedOutputText: context.evalCase.reference_answer ?? ""
|
|
10272
10274
|
};
|
|
10273
10275
|
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -10301,9 +10303,13 @@ var CodeEvaluator = class {
|
|
|
10301
10303
|
);
|
|
10302
10304
|
const parsed = parseJsonSafe(stdout);
|
|
10303
10305
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
10304
|
-
const
|
|
10305
|
-
|
|
10306
|
-
|
|
10306
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
10307
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
10308
|
+
).map((a) => ({
|
|
10309
|
+
text: String(a.text),
|
|
10310
|
+
passed: Boolean(a.passed),
|
|
10311
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
10312
|
+
})) : [];
|
|
10307
10313
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
10308
10314
|
const proxyUsage = getProxyUsage?.();
|
|
10309
10315
|
const evaluatorRawRequest = {
|
|
@@ -10319,10 +10325,8 @@ var CodeEvaluator = class {
|
|
|
10319
10325
|
return {
|
|
10320
10326
|
score,
|
|
10321
10327
|
verdict: scoreToVerdict(score),
|
|
10322
|
-
|
|
10323
|
-
|
|
10324
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
10325
|
-
reasoning,
|
|
10328
|
+
assertions,
|
|
10329
|
+
expectedAspectCount: assertions.length || 1,
|
|
10326
10330
|
evaluatorRawRequest,
|
|
10327
10331
|
...details ? { details } : {},
|
|
10328
10332
|
tokenUsage: proxyUsage?.tokenUsage
|
|
@@ -10333,10 +10337,8 @@ var CodeEvaluator = class {
|
|
|
10333
10337
|
return {
|
|
10334
10338
|
score: 0,
|
|
10335
10339
|
verdict: "fail",
|
|
10336
|
-
|
|
10337
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
10340
|
+
assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
|
|
10338
10341
|
expectedAspectCount: 1,
|
|
10339
|
-
reasoning: message,
|
|
10340
10342
|
evaluatorRawRequest: {
|
|
10341
10343
|
command: this.command,
|
|
10342
10344
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
@@ -10435,18 +10437,22 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
10435
10437
|
{{${TEMPLATE_VARIABLES.CRITERIA}}}
|
|
10436
10438
|
|
|
10437
10439
|
[[ ## question ## ]]
|
|
10438
|
-
{{${TEMPLATE_VARIABLES.QUESTION}}}
|
|
10440
|
+
{{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
|
|
10439
10441
|
|
|
10440
10442
|
[[ ## reference_answer ## ]]
|
|
10441
|
-
{{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
|
|
10443
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
|
|
10442
10444
|
|
|
10443
10445
|
[[ ## answer ## ]]
|
|
10444
|
-
{{${TEMPLATE_VARIABLES.ANSWER}}}`;
|
|
10446
|
+
{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
|
|
10445
10447
|
var freeformEvaluationSchema = z3.object({
|
|
10446
10448
|
score: z3.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
10447
|
-
|
|
10448
|
-
|
|
10449
|
-
|
|
10449
|
+
assertions: z3.array(
|
|
10450
|
+
z3.object({
|
|
10451
|
+
text: z3.string().describe("Brief description of what was checked"),
|
|
10452
|
+
passed: z3.boolean().describe("Whether this aspect was satisfied"),
|
|
10453
|
+
evidence: z3.string().describe("Concise evidence (1-2 sentences)").optional()
|
|
10454
|
+
})
|
|
10455
|
+
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
|
|
10450
10456
|
});
|
|
10451
10457
|
var rubricCheckResultSchema = z3.object({
|
|
10452
10458
|
id: z3.string().describe("The ID of the rubric item being checked"),
|
|
@@ -10515,12 +10521,8 @@ var LlmGraderEvaluator = class {
|
|
|
10515
10521
|
2
|
|
10516
10522
|
),
|
|
10517
10523
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context.output ?? [], null, 2),
|
|
10518
|
-
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
10519
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10520
10524
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
10521
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
10522
10525
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
|
|
10523
|
-
// Text convenience accessors (new names, always strings)
|
|
10524
10526
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
10525
10527
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
10526
10528
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
|
|
@@ -10548,17 +10550,12 @@ ${context.fileChanges}`;
|
|
|
10548
10550
|
schema: freeformEvaluationSchema
|
|
10549
10551
|
});
|
|
10550
10552
|
const score = clampScore(data.score);
|
|
10551
|
-
const
|
|
10552
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
10553
|
-
const reasoning = data.reasoning;
|
|
10554
|
-
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
10553
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
10555
10554
|
return {
|
|
10556
10555
|
score,
|
|
10557
10556
|
verdict: scoreToVerdict(score),
|
|
10558
|
-
|
|
10559
|
-
|
|
10560
|
-
expectedAspectCount,
|
|
10561
|
-
reasoning,
|
|
10557
|
+
assertions,
|
|
10558
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10562
10559
|
evaluatorRawRequest,
|
|
10563
10560
|
tokenUsage
|
|
10564
10561
|
};
|
|
@@ -10569,10 +10566,8 @@ ${context.fileChanges}`;
|
|
|
10569
10566
|
return {
|
|
10570
10567
|
score: 0,
|
|
10571
10568
|
verdict: "skip",
|
|
10572
|
-
|
|
10573
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
10569
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10574
10570
|
expectedAspectCount: 1,
|
|
10575
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
10576
10571
|
evaluatorRawRequest
|
|
10577
10572
|
};
|
|
10578
10573
|
}
|
|
@@ -10602,14 +10597,12 @@ ${context.fileChanges}`;
|
|
|
10602
10597
|
userPrompt: prompt,
|
|
10603
10598
|
schema: rubricEvaluationSchema
|
|
10604
10599
|
});
|
|
10605
|
-
const { score, verdict,
|
|
10600
|
+
const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
|
|
10606
10601
|
return {
|
|
10607
10602
|
score,
|
|
10608
10603
|
verdict,
|
|
10609
|
-
|
|
10610
|
-
misses,
|
|
10604
|
+
assertions,
|
|
10611
10605
|
expectedAspectCount: rubrics.length,
|
|
10612
|
-
reasoning: data.overall_reasoning,
|
|
10613
10606
|
evaluatorRawRequest,
|
|
10614
10607
|
tokenUsage
|
|
10615
10608
|
};
|
|
@@ -10620,10 +10613,8 @@ ${context.fileChanges}`;
|
|
|
10620
10613
|
return {
|
|
10621
10614
|
score: 0,
|
|
10622
10615
|
verdict: "skip",
|
|
10623
|
-
|
|
10624
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
10616
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10625
10617
|
expectedAspectCount: rubrics.length,
|
|
10626
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
10627
10618
|
evaluatorRawRequest
|
|
10628
10619
|
};
|
|
10629
10620
|
}
|
|
@@ -10648,14 +10639,12 @@ ${context.fileChanges}`;
|
|
|
10648
10639
|
userPrompt: prompt,
|
|
10649
10640
|
schema: scoreRangeEvaluationSchema
|
|
10650
10641
|
});
|
|
10651
|
-
const { score, verdict,
|
|
10642
|
+
const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
|
|
10652
10643
|
return {
|
|
10653
10644
|
score,
|
|
10654
10645
|
verdict,
|
|
10655
|
-
|
|
10656
|
-
misses,
|
|
10646
|
+
assertions,
|
|
10657
10647
|
expectedAspectCount: rubrics.length,
|
|
10658
|
-
reasoning: data.overall_reasoning,
|
|
10659
10648
|
evaluatorRawRequest,
|
|
10660
10649
|
details,
|
|
10661
10650
|
tokenUsage
|
|
@@ -10667,10 +10656,8 @@ ${context.fileChanges}`;
|
|
|
10667
10656
|
return {
|
|
10668
10657
|
score: 0,
|
|
10669
10658
|
verdict: "skip",
|
|
10670
|
-
|
|
10671
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
10659
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
10672
10660
|
expectedAspectCount: rubrics.length,
|
|
10673
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
10674
10661
|
evaluatorRawRequest
|
|
10675
10662
|
};
|
|
10676
10663
|
}
|
|
@@ -10727,8 +10714,7 @@ ${context.fileChanges}`;
|
|
|
10727
10714
|
return {
|
|
10728
10715
|
score: 0,
|
|
10729
10716
|
verdict: "fail",
|
|
10730
|
-
|
|
10731
|
-
misses: [`llm-grader built-in evaluation failed: ${message}`],
|
|
10717
|
+
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
10732
10718
|
expectedAspectCount: 1,
|
|
10733
10719
|
evaluatorRawRequest,
|
|
10734
10720
|
details: { mode: "built-in", error: message }
|
|
@@ -10778,8 +10764,9 @@ ${context.fileChanges}`;
|
|
|
10778
10764
|
return {
|
|
10779
10765
|
score: 0,
|
|
10780
10766
|
verdict: "fail",
|
|
10781
|
-
|
|
10782
|
-
|
|
10767
|
+
assertions: [
|
|
10768
|
+
{ text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
|
|
10769
|
+
],
|
|
10783
10770
|
expectedAspectCount: 1,
|
|
10784
10771
|
evaluatorRawRequest,
|
|
10785
10772
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
@@ -10797,8 +10784,9 @@ ${context.fileChanges}`;
|
|
|
10797
10784
|
return {
|
|
10798
10785
|
score: 0,
|
|
10799
10786
|
verdict: "fail",
|
|
10800
|
-
|
|
10801
|
-
|
|
10787
|
+
assertions: [
|
|
10788
|
+
{ text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
|
|
10789
|
+
],
|
|
10802
10790
|
expectedAspectCount: 1,
|
|
10803
10791
|
evaluatorRawRequest,
|
|
10804
10792
|
details: {
|
|
@@ -10839,10 +10827,10 @@ ${context.fileChanges}`;
|
|
|
10839
10827
|
buildAgentUserPrompt(context) {
|
|
10840
10828
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
10841
10829
|
const variables = {
|
|
10842
|
-
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
10843
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10844
10830
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
10845
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
10831
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
10832
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
10833
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10846
10834
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
|
|
10847
10835
|
};
|
|
10848
10836
|
if (this.evaluatorTemplate) {
|
|
@@ -10895,10 +10883,10 @@ ${context.fileChanges}`;
|
|
|
10895
10883
|
const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
|
|
10896
10884
|
if (this.evaluatorTemplate) {
|
|
10897
10885
|
const variables = {
|
|
10898
|
-
[TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(),
|
|
10899
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10900
10886
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
10901
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
10887
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
10888
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
|
|
10889
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
|
|
10902
10890
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
|
|
10903
10891
|
};
|
|
10904
10892
|
const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
|
|
@@ -10950,29 +10938,24 @@ ${outputSchema}`;
|
|
|
10950
10938
|
const parsed = parseJsonFromText(text);
|
|
10951
10939
|
if (rubrics && rubrics.length > 0) {
|
|
10952
10940
|
const data2 = rubricEvaluationSchema.parse(parsed);
|
|
10953
|
-
const { score: score2, verdict,
|
|
10941
|
+
const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
|
|
10954
10942
|
return {
|
|
10955
10943
|
score: score2,
|
|
10956
10944
|
verdict,
|
|
10957
|
-
|
|
10958
|
-
misses: misses2,
|
|
10945
|
+
assertions: assertions2,
|
|
10959
10946
|
expectedAspectCount: rubrics.length,
|
|
10960
|
-
reasoning: data2.overall_reasoning,
|
|
10961
10947
|
evaluatorRawRequest,
|
|
10962
10948
|
details
|
|
10963
10949
|
};
|
|
10964
10950
|
}
|
|
10965
10951
|
const data = freeformEvaluationSchema.parse(parsed);
|
|
10966
10952
|
const score = clampScore(data.score);
|
|
10967
|
-
const
|
|
10968
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
10953
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
10969
10954
|
return {
|
|
10970
10955
|
score,
|
|
10971
10956
|
verdict: scoreToVerdict(score),
|
|
10972
|
-
|
|
10973
|
-
|
|
10974
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
10975
|
-
reasoning: data.reasoning,
|
|
10957
|
+
assertions,
|
|
10958
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
10976
10959
|
evaluatorRawRequest,
|
|
10977
10960
|
details
|
|
10978
10961
|
};
|
|
@@ -10980,8 +10963,12 @@ ${outputSchema}`;
|
|
|
10980
10963
|
return {
|
|
10981
10964
|
score: 0,
|
|
10982
10965
|
verdict: "fail",
|
|
10983
|
-
|
|
10984
|
-
|
|
10966
|
+
assertions: [
|
|
10967
|
+
{
|
|
10968
|
+
text: "Failed to parse llm-grader agent response as valid evaluation JSON",
|
|
10969
|
+
passed: false
|
|
10970
|
+
}
|
|
10971
|
+
],
|
|
10985
10972
|
expectedAspectCount: 1,
|
|
10986
10973
|
evaluatorRawRequest,
|
|
10987
10974
|
details
|
|
@@ -11110,9 +11097,13 @@ function buildOutputSchema() {
|
|
|
11110
11097
|
"",
|
|
11111
11098
|
"{",
|
|
11112
11099
|
' "score": <number between 0.0 and 1.0>,',
|
|
11113
|
-
' "
|
|
11114
|
-
|
|
11115
|
-
'
|
|
11100
|
+
' "assertions": [',
|
|
11101
|
+
" {",
|
|
11102
|
+
' "text": "<brief description of what was checked>",',
|
|
11103
|
+
' "passed": <boolean>,',
|
|
11104
|
+
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
|
|
11105
|
+
" }",
|
|
11106
|
+
" ]",
|
|
11116
11107
|
"}"
|
|
11117
11108
|
].join("\n");
|
|
11118
11109
|
}
|
|
@@ -11137,8 +11128,7 @@ function substituteVariables(template, variables) {
|
|
|
11137
11128
|
}
|
|
11138
11129
|
function calculateRubricScore(result, rubrics) {
|
|
11139
11130
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
11140
|
-
const
|
|
11141
|
-
const misses = [];
|
|
11131
|
+
const assertions = [];
|
|
11142
11132
|
let totalWeight = 0;
|
|
11143
11133
|
let earnedWeight = 0;
|
|
11144
11134
|
let failedRequired = false;
|
|
@@ -11148,19 +11138,20 @@ function calculateRubricScore(result, rubrics) {
|
|
|
11148
11138
|
continue;
|
|
11149
11139
|
}
|
|
11150
11140
|
totalWeight += rubric.weight;
|
|
11141
|
+
assertions.push({
|
|
11142
|
+
text: `[${rubric.id}] ${rubric.outcome}`,
|
|
11143
|
+
passed: check.satisfied,
|
|
11144
|
+
evidence: check.reasoning
|
|
11145
|
+
});
|
|
11151
11146
|
if (check.satisfied) {
|
|
11152
11147
|
earnedWeight += rubric.weight;
|
|
11153
|
-
|
|
11154
|
-
|
|
11155
|
-
misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
|
|
11156
|
-
if (rubric.required) {
|
|
11157
|
-
failedRequired = true;
|
|
11158
|
-
}
|
|
11148
|
+
} else if (rubric.required) {
|
|
11149
|
+
failedRequired = true;
|
|
11159
11150
|
}
|
|
11160
11151
|
}
|
|
11161
11152
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
11162
11153
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
11163
|
-
return { score, verdict,
|
|
11154
|
+
return { score, verdict, assertions };
|
|
11164
11155
|
}
|
|
11165
11156
|
function buildScoreRangeOutputSchema() {
|
|
11166
11157
|
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
@@ -11180,8 +11171,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
|
|
|
11180
11171
|
}
|
|
11181
11172
|
function calculateScoreRangeResult(result, rubrics) {
|
|
11182
11173
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
11183
|
-
const
|
|
11184
|
-
const misses = [];
|
|
11174
|
+
const assertions = [];
|
|
11185
11175
|
const rawScores = {};
|
|
11186
11176
|
let totalWeight = 0;
|
|
11187
11177
|
let weightedScoreSum = 0;
|
|
@@ -11207,24 +11197,22 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
11207
11197
|
);
|
|
11208
11198
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
11209
11199
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
11210
|
-
const
|
|
11211
|
-
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
11200
|
+
const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
|
|
11212
11201
|
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
11213
11202
|
failedRequired = true;
|
|
11214
|
-
misses.push(scoreInfo);
|
|
11215
|
-
} else if (rawScore >= 7) {
|
|
11216
|
-
hits.push(scoreInfo);
|
|
11217
|
-
} else {
|
|
11218
|
-
misses.push(scoreInfo);
|
|
11219
11203
|
}
|
|
11204
|
+
assertions.push({
|
|
11205
|
+
text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
|
|
11206
|
+
passed,
|
|
11207
|
+
evidence: check.reasoning
|
|
11208
|
+
});
|
|
11220
11209
|
}
|
|
11221
11210
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
11222
11211
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
11223
11212
|
return {
|
|
11224
11213
|
score,
|
|
11225
11214
|
verdict,
|
|
11226
|
-
|
|
11227
|
-
misses,
|
|
11215
|
+
assertions,
|
|
11228
11216
|
details: {
|
|
11229
11217
|
raw_scores: rawScores,
|
|
11230
11218
|
normalization: "score / 10",
|
|
@@ -11400,9 +11388,7 @@ var CompositeEvaluator = class {
|
|
|
11400
11388
|
let totalWeight = 0;
|
|
11401
11389
|
let weightedSum = 0;
|
|
11402
11390
|
let evaluatedCount = 0;
|
|
11403
|
-
const
|
|
11404
|
-
const allMisses = [];
|
|
11405
|
-
const reasoningParts = [];
|
|
11391
|
+
const allAssertions = [];
|
|
11406
11392
|
const scores = [];
|
|
11407
11393
|
for (const member of results) {
|
|
11408
11394
|
const weight = weights?.[member.id] ?? 1;
|
|
@@ -11412,9 +11398,7 @@ var CompositeEvaluator = class {
|
|
|
11412
11398
|
score: member.result.score,
|
|
11413
11399
|
weight,
|
|
11414
11400
|
verdict: member.result.verdict,
|
|
11415
|
-
|
|
11416
|
-
misses: [...member.result.misses],
|
|
11417
|
-
reasoning: member.result.reasoning,
|
|
11401
|
+
assertions: [...member.result.assertions],
|
|
11418
11402
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
11419
11403
|
scores: member.result.scores,
|
|
11420
11404
|
details: member.result.details,
|
|
@@ -11426,20 +11410,16 @@ var CompositeEvaluator = class {
|
|
|
11426
11410
|
evaluatedCount++;
|
|
11427
11411
|
totalWeight += weight;
|
|
11428
11412
|
weightedSum += member.result.score * weight;
|
|
11429
|
-
|
|
11430
|
-
|
|
11431
|
-
|
|
11432
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
11433
|
-
}
|
|
11413
|
+
allAssertions.push(
|
|
11414
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
11415
|
+
);
|
|
11434
11416
|
}
|
|
11435
11417
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
11436
11418
|
return {
|
|
11437
11419
|
score: 0,
|
|
11438
11420
|
verdict: "skip",
|
|
11439
|
-
|
|
11440
|
-
misses: [],
|
|
11421
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
11441
11422
|
expectedAspectCount: 1,
|
|
11442
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
11443
11423
|
evaluatorRawRequest: {
|
|
11444
11424
|
aggregator: "weighted_average",
|
|
11445
11425
|
...weights ? { weights } : {}
|
|
@@ -11451,10 +11431,8 @@ var CompositeEvaluator = class {
|
|
|
11451
11431
|
return {
|
|
11452
11432
|
score: clampScore(finalScore),
|
|
11453
11433
|
verdict: scoreToVerdict(finalScore),
|
|
11454
|
-
|
|
11455
|
-
|
|
11456
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
11457
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
11434
|
+
assertions: allAssertions,
|
|
11435
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
11458
11436
|
evaluatorRawRequest: {
|
|
11459
11437
|
aggregator: "weighted_average",
|
|
11460
11438
|
...weights ? { weights } : {}
|
|
@@ -11464,11 +11442,8 @@ var CompositeEvaluator = class {
|
|
|
11464
11442
|
}
|
|
11465
11443
|
runThreshold(results, threshold) {
|
|
11466
11444
|
const scores = [];
|
|
11467
|
-
const
|
|
11468
|
-
const allMisses = [];
|
|
11469
|
-
const reasoningParts = [];
|
|
11445
|
+
const allAssertions = [];
|
|
11470
11446
|
let passingCount = 0;
|
|
11471
|
-
let borderlineCount = 0;
|
|
11472
11447
|
let evaluatedCount = 0;
|
|
11473
11448
|
for (const member of results) {
|
|
11474
11449
|
scores.push({
|
|
@@ -11476,9 +11451,7 @@ var CompositeEvaluator = class {
|
|
|
11476
11451
|
type: member.type,
|
|
11477
11452
|
score: member.result.score,
|
|
11478
11453
|
verdict: member.result.verdict,
|
|
11479
|
-
|
|
11480
|
-
misses: [...member.result.misses],
|
|
11481
|
-
reasoning: member.result.reasoning,
|
|
11454
|
+
assertions: [...member.result.assertions],
|
|
11482
11455
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
11483
11456
|
scores: member.result.scores,
|
|
11484
11457
|
details: member.result.details,
|
|
@@ -11491,24 +11464,17 @@ var CompositeEvaluator = class {
|
|
|
11491
11464
|
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
11492
11465
|
if (isPassing) {
|
|
11493
11466
|
passingCount++;
|
|
11494
|
-
if (member.result.verdict === "borderline") {
|
|
11495
|
-
borderlineCount++;
|
|
11496
|
-
}
|
|
11497
|
-
}
|
|
11498
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
11499
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
11500
|
-
if (member.result.reasoning) {
|
|
11501
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
11502
11467
|
}
|
|
11468
|
+
allAssertions.push(
|
|
11469
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
11470
|
+
);
|
|
11503
11471
|
}
|
|
11504
11472
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
11505
11473
|
return {
|
|
11506
11474
|
score: 0,
|
|
11507
11475
|
verdict: "skip",
|
|
11508
|
-
|
|
11509
|
-
misses: [],
|
|
11476
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
11510
11477
|
expectedAspectCount: 1,
|
|
11511
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
11512
11478
|
evaluatorRawRequest: {
|
|
11513
11479
|
aggregator: "threshold",
|
|
11514
11480
|
threshold
|
|
@@ -11519,19 +11485,15 @@ var CompositeEvaluator = class {
|
|
|
11519
11485
|
const totalCount = evaluatedCount;
|
|
11520
11486
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
11521
11487
|
const pass = score >= threshold;
|
|
11522
|
-
|
|
11523
|
-
|
|
11524
|
-
|
|
11525
|
-
|
|
11526
|
-
`${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
|
|
11527
|
-
);
|
|
11488
|
+
allAssertions.unshift({
|
|
11489
|
+
text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
|
|
11490
|
+
passed: pass
|
|
11491
|
+
});
|
|
11528
11492
|
return {
|
|
11529
11493
|
score: clampScore(score),
|
|
11530
11494
|
verdict: pass ? "pass" : "fail",
|
|
11531
|
-
|
|
11532
|
-
|
|
11533
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
11534
|
-
reasoning: reasoningParts.join("; "),
|
|
11495
|
+
assertions: allAssertions,
|
|
11496
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
11535
11497
|
evaluatorRawRequest: {
|
|
11536
11498
|
aggregator: "threshold",
|
|
11537
11499
|
threshold
|
|
@@ -11548,9 +11510,7 @@ var CompositeEvaluator = class {
|
|
|
11548
11510
|
score: member.result.score,
|
|
11549
11511
|
weight: weights?.[member.id] ?? 1,
|
|
11550
11512
|
verdict: member.result.verdict,
|
|
11551
|
-
|
|
11552
|
-
misses: [...member.result.misses],
|
|
11553
|
-
reasoning: member.result.reasoning,
|
|
11513
|
+
assertions: [...member.result.assertions],
|
|
11554
11514
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
11555
11515
|
scores: member.result.scores,
|
|
11556
11516
|
details: member.result.details
|
|
@@ -11559,17 +11519,19 @@ var CompositeEvaluator = class {
|
|
|
11559
11519
|
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
11560
11520
|
const parsed = parseJsonSafe(stdout);
|
|
11561
11521
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
11562
|
-
const
|
|
11563
|
-
|
|
11564
|
-
|
|
11522
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
11523
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
11524
|
+
).map((a) => ({
|
|
11525
|
+
text: String(a.text),
|
|
11526
|
+
passed: Boolean(a.passed),
|
|
11527
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
11528
|
+
})) : [];
|
|
11565
11529
|
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
11566
11530
|
return {
|
|
11567
11531
|
score,
|
|
11568
11532
|
verdict,
|
|
11569
|
-
|
|
11570
|
-
|
|
11571
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
11572
|
-
reasoning,
|
|
11533
|
+
assertions,
|
|
11534
|
+
expectedAspectCount: assertions.length || 1,
|
|
11573
11535
|
evaluatorRawRequest: {
|
|
11574
11536
|
aggregator: "code-grader",
|
|
11575
11537
|
script: scriptPath
|
|
@@ -11581,10 +11543,8 @@ var CompositeEvaluator = class {
|
|
|
11581
11543
|
return {
|
|
11582
11544
|
score: 0,
|
|
11583
11545
|
verdict: "fail",
|
|
11584
|
-
|
|
11585
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
11546
|
+
assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
|
|
11586
11547
|
expectedAspectCount: 1,
|
|
11587
|
-
reasoning: message,
|
|
11588
11548
|
evaluatorRawRequest: {
|
|
11589
11549
|
aggregator: "code-grader",
|
|
11590
11550
|
script: scriptPath,
|
|
@@ -11606,9 +11566,7 @@ var CompositeEvaluator = class {
|
|
|
11606
11566
|
type: member.type,
|
|
11607
11567
|
score: member.result.score,
|
|
11608
11568
|
verdict: member.result.verdict,
|
|
11609
|
-
|
|
11610
|
-
misses: [...member.result.misses],
|
|
11611
|
-
reasoning: member.result.reasoning,
|
|
11569
|
+
assertions: [...member.result.assertions],
|
|
11612
11570
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
11613
11571
|
scores: member.result.scores,
|
|
11614
11572
|
details: member.result.details
|
|
@@ -11632,16 +11590,12 @@ var CompositeEvaluator = class {
|
|
|
11632
11590
|
});
|
|
11633
11591
|
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
11634
11592
|
const score2 = clampScore(data2.score);
|
|
11635
|
-
const
|
|
11636
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
11637
|
-
const reasoning2 = data2.reasoning;
|
|
11593
|
+
const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
|
|
11638
11594
|
return {
|
|
11639
11595
|
score: score2,
|
|
11640
11596
|
verdict: scoreToVerdict(score2),
|
|
11641
|
-
|
|
11642
|
-
|
|
11643
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
11644
|
-
reasoning: reasoning2,
|
|
11597
|
+
assertions: assertions2,
|
|
11598
|
+
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
11645
11599
|
evaluatorRawRequest,
|
|
11646
11600
|
scores
|
|
11647
11601
|
};
|
|
@@ -11656,16 +11610,12 @@ var CompositeEvaluator = class {
|
|
|
11656
11610
|
parseJsonFromText(extractLastAssistantContent(response.output))
|
|
11657
11611
|
);
|
|
11658
11612
|
const score = clampScore(data.score);
|
|
11659
|
-
const
|
|
11660
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
11661
|
-
const reasoning = data.reasoning;
|
|
11613
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
11662
11614
|
return {
|
|
11663
11615
|
score,
|
|
11664
11616
|
verdict: scoreToVerdict(score),
|
|
11665
|
-
|
|
11666
|
-
|
|
11667
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
11668
|
-
reasoning,
|
|
11617
|
+
assertions,
|
|
11618
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
11669
11619
|
evaluatorRawRequest,
|
|
11670
11620
|
scores
|
|
11671
11621
|
};
|
|
@@ -11673,8 +11623,7 @@ var CompositeEvaluator = class {
|
|
|
11673
11623
|
return {
|
|
11674
11624
|
score: 0,
|
|
11675
11625
|
verdict: "fail",
|
|
11676
|
-
|
|
11677
|
-
misses: [],
|
|
11626
|
+
assertions: [{ text: "LLM aggregator failed", passed: false }],
|
|
11678
11627
|
expectedAspectCount: 1,
|
|
11679
11628
|
evaluatorRawRequest,
|
|
11680
11629
|
scores
|
|
@@ -11697,10 +11646,8 @@ var CostEvaluator = class {
|
|
|
11697
11646
|
return {
|
|
11698
11647
|
score: 0,
|
|
11699
11648
|
verdict: "fail",
|
|
11700
|
-
|
|
11701
|
-
misses: ["No cost data available in trace"],
|
|
11649
|
+
assertions: [{ text: "No cost data available in trace", passed: false }],
|
|
11702
11650
|
expectedAspectCount: 1,
|
|
11703
|
-
reasoning: "Execution cost not reported by provider",
|
|
11704
11651
|
evaluatorRawRequest: {
|
|
11705
11652
|
type: "cost",
|
|
11706
11653
|
budget,
|
|
@@ -11714,10 +11661,10 @@ var CostEvaluator = class {
|
|
|
11714
11661
|
return {
|
|
11715
11662
|
score,
|
|
11716
11663
|
verdict: passed ? "pass" : "fail",
|
|
11717
|
-
|
|
11718
|
-
|
|
11664
|
+
assertions: [
|
|
11665
|
+
passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
|
|
11666
|
+
],
|
|
11719
11667
|
expectedAspectCount: 1,
|
|
11720
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
11721
11668
|
evaluatorRawRequest: {
|
|
11722
11669
|
type: "cost",
|
|
11723
11670
|
budget,
|
|
@@ -11750,10 +11697,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
11750
11697
|
return {
|
|
11751
11698
|
score: 0,
|
|
11752
11699
|
verdict: "fail",
|
|
11753
|
-
|
|
11754
|
-
misses: ["No trace summary available"],
|
|
11700
|
+
assertions: [{ text: "No trace summary available", passed: false }],
|
|
11755
11701
|
expectedAspectCount: 1,
|
|
11756
|
-
reasoning: "Execution metrics not available - no trace summary provided",
|
|
11757
11702
|
evaluatorRawRequest: {
|
|
11758
11703
|
type: "execution-metrics",
|
|
11759
11704
|
config: this.extractConfiguredThresholds(),
|
|
@@ -11762,116 +11707,114 @@ var ExecutionMetricsEvaluator = class {
|
|
|
11762
11707
|
};
|
|
11763
11708
|
}
|
|
11764
11709
|
const narrowedTrace = trace;
|
|
11765
|
-
const
|
|
11766
|
-
const misses = [];
|
|
11710
|
+
const assertions = [];
|
|
11767
11711
|
const actualMetrics = {};
|
|
11768
11712
|
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
11769
11713
|
const toolCalls = narrowedTrace.eventCount;
|
|
11770
11714
|
actualMetrics.tool_calls = toolCalls;
|
|
11771
11715
|
if (toolCalls <= max_tool_calls) {
|
|
11772
|
-
|
|
11716
|
+
assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
|
|
11773
11717
|
} else {
|
|
11774
|
-
|
|
11718
|
+
assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
|
|
11775
11719
|
}
|
|
11776
11720
|
}
|
|
11777
11721
|
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
11778
11722
|
const llmCalls = narrowedTrace.llmCallCount;
|
|
11779
11723
|
if (llmCalls === void 0) {
|
|
11780
|
-
|
|
11724
|
+
assertions.push({ text: "LLM call count data not available", passed: false });
|
|
11781
11725
|
} else {
|
|
11782
11726
|
actualMetrics.llm_calls = llmCalls;
|
|
11783
11727
|
if (llmCalls <= max_llm_calls) {
|
|
11784
|
-
|
|
11728
|
+
assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
|
|
11785
11729
|
} else {
|
|
11786
|
-
|
|
11730
|
+
assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
|
|
11787
11731
|
}
|
|
11788
11732
|
}
|
|
11789
11733
|
}
|
|
11790
11734
|
if (max_tokens !== void 0) {
|
|
11791
11735
|
if (!tokenUsage) {
|
|
11792
|
-
|
|
11736
|
+
assertions.push({ text: "Token usage data not available", passed: false });
|
|
11793
11737
|
} else {
|
|
11794
11738
|
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
11795
11739
|
actualMetrics.tokens = totalTokens;
|
|
11796
11740
|
if (totalTokens <= max_tokens) {
|
|
11797
|
-
|
|
11741
|
+
assertions.push({
|
|
11742
|
+
text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
|
|
11743
|
+
passed: true
|
|
11744
|
+
});
|
|
11798
11745
|
} else {
|
|
11799
|
-
|
|
11746
|
+
assertions.push({
|
|
11747
|
+
text: `Total tokens ${totalTokens} > ${max_tokens} max`,
|
|
11748
|
+
passed: false
|
|
11749
|
+
});
|
|
11800
11750
|
}
|
|
11801
11751
|
}
|
|
11802
11752
|
}
|
|
11803
11753
|
if (max_cost_usd !== void 0) {
|
|
11804
11754
|
if (costUsd === void 0) {
|
|
11805
|
-
|
|
11755
|
+
assertions.push({ text: "Cost data not available", passed: false });
|
|
11806
11756
|
} else {
|
|
11807
11757
|
actualMetrics.cost_usd = costUsd;
|
|
11808
11758
|
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
11809
11759
|
if (costUsd <= max_cost_usd) {
|
|
11810
|
-
|
|
11760
|
+
assertions.push({
|
|
11761
|
+
text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
|
|
11762
|
+
passed: true
|
|
11763
|
+
});
|
|
11811
11764
|
} else {
|
|
11812
|
-
|
|
11765
|
+
assertions.push({
|
|
11766
|
+
text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
|
|
11767
|
+
passed: false
|
|
11768
|
+
});
|
|
11813
11769
|
}
|
|
11814
11770
|
}
|
|
11815
11771
|
}
|
|
11816
11772
|
if (max_duration_ms !== void 0) {
|
|
11817
11773
|
if (durationMs === void 0) {
|
|
11818
|
-
|
|
11774
|
+
assertions.push({ text: "Duration data not available", passed: false });
|
|
11819
11775
|
} else {
|
|
11820
11776
|
actualMetrics.duration_ms = durationMs;
|
|
11821
11777
|
if (durationMs <= max_duration_ms) {
|
|
11822
|
-
|
|
11778
|
+
assertions.push({
|
|
11779
|
+
text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
|
|
11780
|
+
passed: true
|
|
11781
|
+
});
|
|
11823
11782
|
} else {
|
|
11824
|
-
|
|
11783
|
+
assertions.push({
|
|
11784
|
+
text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
|
|
11785
|
+
passed: false
|
|
11786
|
+
});
|
|
11825
11787
|
}
|
|
11826
11788
|
}
|
|
11827
11789
|
}
|
|
11828
11790
|
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
11829
11791
|
const ratio = explorationRatio(narrowedTrace);
|
|
11830
11792
|
if (ratio === void 0) {
|
|
11831
|
-
|
|
11793
|
+
assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
|
|
11832
11794
|
} else {
|
|
11833
11795
|
actualMetrics.exploration_ratio = ratio;
|
|
11834
11796
|
const diff = Math.abs(ratio - target_exploration_ratio);
|
|
11835
11797
|
if (diff <= exploration_tolerance) {
|
|
11836
|
-
|
|
11837
|
-
`Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}
|
|
11838
|
-
|
|
11798
|
+
assertions.push({
|
|
11799
|
+
text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
|
|
11800
|
+
passed: true
|
|
11801
|
+
});
|
|
11839
11802
|
} else {
|
|
11840
|
-
|
|
11841
|
-
`Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})
|
|
11842
|
-
|
|
11803
|
+
assertions.push({
|
|
11804
|
+
text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
|
|
11805
|
+
passed: false
|
|
11806
|
+
});
|
|
11843
11807
|
}
|
|
11844
11808
|
}
|
|
11845
11809
|
}
|
|
11846
|
-
const totalChecks =
|
|
11847
|
-
const
|
|
11848
|
-
const
|
|
11849
|
-
if (actualMetrics.tool_calls !== void 0) {
|
|
11850
|
-
reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
|
|
11851
|
-
}
|
|
11852
|
-
if (actualMetrics.llm_calls !== void 0) {
|
|
11853
|
-
reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
|
|
11854
|
-
}
|
|
11855
|
-
if (actualMetrics.tokens !== void 0) {
|
|
11856
|
-
reasoningParts.push(`tokens=${actualMetrics.tokens}`);
|
|
11857
|
-
}
|
|
11858
|
-
if (actualMetrics.cost_usd !== void 0) {
|
|
11859
|
-
reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
|
|
11860
|
-
}
|
|
11861
|
-
if (actualMetrics.duration_ms !== void 0) {
|
|
11862
|
-
reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
|
|
11863
|
-
}
|
|
11864
|
-
if (actualMetrics.exploration_ratio !== void 0) {
|
|
11865
|
-
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
11866
|
-
}
|
|
11867
|
-
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
11810
|
+
const totalChecks = assertions.length;
|
|
11811
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
11812
|
+
const score = totalChecks > 0 ? passedCount / totalChecks : 0;
|
|
11868
11813
|
return {
|
|
11869
11814
|
score,
|
|
11870
11815
|
verdict: scoreToVerdict(score),
|
|
11871
|
-
|
|
11872
|
-
misses,
|
|
11816
|
+
assertions,
|
|
11873
11817
|
expectedAspectCount: totalChecks || 1,
|
|
11874
|
-
reasoning,
|
|
11875
11818
|
evaluatorRawRequest: {
|
|
11876
11819
|
type: "execution-metrics",
|
|
11877
11820
|
config: this.extractConfiguredThresholds(),
|
|
@@ -11975,10 +11918,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
11975
11918
|
return {
|
|
11976
11919
|
score: 0,
|
|
11977
11920
|
verdict: "fail",
|
|
11978
|
-
|
|
11979
|
-
|
|
11980
|
-
expectedAspectCount: this.config.fields.length,
|
|
11981
|
-
reasoning: "Candidate answer is not valid JSON"
|
|
11921
|
+
assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
|
|
11922
|
+
expectedAspectCount: this.config.fields.length
|
|
11982
11923
|
};
|
|
11983
11924
|
}
|
|
11984
11925
|
const expectedData = this.extractExpectedData(evalCase.expected_output);
|
|
@@ -11986,10 +11927,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
11986
11927
|
return {
|
|
11987
11928
|
score: 0,
|
|
11988
11929
|
verdict: "fail",
|
|
11989
|
-
|
|
11990
|
-
|
|
11991
|
-
expectedAspectCount: this.config.fields.length,
|
|
11992
|
-
reasoning: "Could not extract expected data from expected_output"
|
|
11930
|
+
assertions: [{ text: "No expected data found in expected_output", passed: false }],
|
|
11931
|
+
expectedAspectCount: this.config.fields.length
|
|
11993
11932
|
};
|
|
11994
11933
|
}
|
|
11995
11934
|
const fieldResults = [];
|
|
@@ -12207,18 +12146,14 @@ var FieldAccuracyEvaluator = class {
|
|
|
12207
12146
|
*/
|
|
12208
12147
|
aggregateResults(results) {
|
|
12209
12148
|
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
12210
|
-
const
|
|
12211
|
-
const misses = [];
|
|
12149
|
+
const assertions = [];
|
|
12212
12150
|
for (const result of results) {
|
|
12213
|
-
|
|
12214
|
-
hits.push(result.message);
|
|
12215
|
-
} else {
|
|
12216
|
-
misses.push(result.message);
|
|
12217
|
-
}
|
|
12151
|
+
assertions.push({ text: result.message, passed: result.hit });
|
|
12218
12152
|
}
|
|
12219
12153
|
let score;
|
|
12220
12154
|
if (aggregation === "all_or_nothing") {
|
|
12221
|
-
|
|
12155
|
+
const hasFailed = assertions.some((a) => !a.passed);
|
|
12156
|
+
score = hasFailed ? 0 : 1;
|
|
12222
12157
|
} else {
|
|
12223
12158
|
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
12224
12159
|
if (totalWeight === 0) {
|
|
@@ -12228,15 +12163,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
12228
12163
|
score = weightedSum / totalWeight;
|
|
12229
12164
|
}
|
|
12230
12165
|
}
|
|
12231
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
12232
12166
|
return {
|
|
12233
12167
|
score: clampScore(score),
|
|
12234
12168
|
verdict: scoreToVerdict(score),
|
|
12235
|
-
|
|
12236
|
-
|
|
12237
|
-
misses: misses.slice(0, 4),
|
|
12238
|
-
expectedAspectCount: results.length,
|
|
12239
|
-
reasoning
|
|
12169
|
+
assertions,
|
|
12170
|
+
expectedAspectCount: results.length
|
|
12240
12171
|
};
|
|
12241
12172
|
}
|
|
12242
12173
|
};
|
|
@@ -12345,10 +12276,8 @@ var LatencyEvaluator = class {
|
|
|
12345
12276
|
return {
|
|
12346
12277
|
score: 0,
|
|
12347
12278
|
verdict: "fail",
|
|
12348
|
-
|
|
12349
|
-
misses: ["No duration data available in trace"],
|
|
12279
|
+
assertions: [{ text: "No duration data available in trace", passed: false }],
|
|
12350
12280
|
expectedAspectCount: 1,
|
|
12351
|
-
reasoning: "Execution duration not reported by provider",
|
|
12352
12281
|
evaluatorRawRequest: {
|
|
12353
12282
|
type: "latency",
|
|
12354
12283
|
threshold,
|
|
@@ -12361,10 +12290,10 @@ var LatencyEvaluator = class {
|
|
|
12361
12290
|
return {
|
|
12362
12291
|
score,
|
|
12363
12292
|
verdict: passed ? "pass" : "fail",
|
|
12364
|
-
|
|
12365
|
-
|
|
12293
|
+
assertions: [
|
|
12294
|
+
passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
|
|
12295
|
+
],
|
|
12366
12296
|
expectedAspectCount: 1,
|
|
12367
|
-
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
12368
12297
|
evaluatorRawRequest: {
|
|
12369
12298
|
type: "latency",
|
|
12370
12299
|
threshold,
|
|
@@ -12385,7 +12314,10 @@ var COPILOT_MATCHER = {
|
|
|
12385
12314
|
skillTools: ["Skill", "skill"],
|
|
12386
12315
|
skillInputField: "skill",
|
|
12387
12316
|
readTools: ["Read File", "readFile", "Read", "readTextFile"],
|
|
12388
|
-
readInputField: "file_path"
|
|
12317
|
+
readInputField: "file_path",
|
|
12318
|
+
skillToolPrefixes: ["Using skill: "],
|
|
12319
|
+
readToolPrefixes: ["Viewing "],
|
|
12320
|
+
readInputFields: ["file_path", "path"]
|
|
12389
12321
|
};
|
|
12390
12322
|
var PROVIDER_TOOL_SEMANTICS = {
|
|
12391
12323
|
claude: CLAUDE_MATCHER,
|
|
@@ -12427,12 +12359,22 @@ var SkillTriggerEvaluator = class {
|
|
|
12427
12359
|
triggered = true;
|
|
12428
12360
|
evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
|
|
12429
12361
|
}
|
|
12362
|
+
} else if (matcher.skillToolPrefixes?.some(
|
|
12363
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
12364
|
+
)) {
|
|
12365
|
+
triggered = true;
|
|
12366
|
+
evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
|
|
12430
12367
|
} else if (matcher.readTools.includes(firstTool.tool)) {
|
|
12431
|
-
const filePath =
|
|
12368
|
+
const filePath = this.readPathFromInput(input, matcher);
|
|
12432
12369
|
if (filePath.includes(skillName)) {
|
|
12433
12370
|
triggered = true;
|
|
12434
12371
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
12435
12372
|
}
|
|
12373
|
+
} else if (matcher.readToolPrefixes?.some(
|
|
12374
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
12375
|
+
)) {
|
|
12376
|
+
triggered = true;
|
|
12377
|
+
evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
|
|
12436
12378
|
}
|
|
12437
12379
|
}
|
|
12438
12380
|
const pass = triggered === shouldTrigger;
|
|
@@ -12440,25 +12382,37 @@ var SkillTriggerEvaluator = class {
|
|
|
12440
12382
|
return {
|
|
12441
12383
|
score: 1,
|
|
12442
12384
|
verdict: "pass",
|
|
12443
|
-
|
|
12444
|
-
|
|
12385
|
+
assertions: [
|
|
12386
|
+
{
|
|
12387
|
+
text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
|
|
12388
|
+
passed: true
|
|
12389
|
+
}
|
|
12445
12390
|
],
|
|
12446
|
-
|
|
12447
|
-
expectedAspectCount: 1,
|
|
12448
|
-
reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
|
|
12391
|
+
expectedAspectCount: 1
|
|
12449
12392
|
};
|
|
12450
12393
|
}
|
|
12451
12394
|
return {
|
|
12452
12395
|
score: 0,
|
|
12453
12396
|
verdict: "fail",
|
|
12454
|
-
|
|
12455
|
-
|
|
12456
|
-
|
|
12397
|
+
assertions: [
|
|
12398
|
+
{
|
|
12399
|
+
text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
|
|
12400
|
+
passed: false
|
|
12401
|
+
}
|
|
12457
12402
|
],
|
|
12458
|
-
expectedAspectCount: 1
|
|
12459
|
-
reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
|
|
12403
|
+
expectedAspectCount: 1
|
|
12460
12404
|
};
|
|
12461
12405
|
}
|
|
12406
|
+
readPathFromInput(input, matcher) {
|
|
12407
|
+
const fields = matcher.readInputFields ?? [matcher.readInputField];
|
|
12408
|
+
for (const field of fields) {
|
|
12409
|
+
const value = input[field];
|
|
12410
|
+
if (value !== void 0 && value !== null) {
|
|
12411
|
+
return String(value);
|
|
12412
|
+
}
|
|
12413
|
+
}
|
|
12414
|
+
return "";
|
|
12415
|
+
}
|
|
12462
12416
|
};
|
|
12463
12417
|
|
|
12464
12418
|
// src/evaluation/evaluators/llm-grader-prompt.ts
|
|
@@ -12493,12 +12447,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
|
|
|
12493
12447
|
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
|
|
12494
12448
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
|
|
12495
12449
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
|
|
12496
|
-
[TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
|
|
12497
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
|
|
12498
12450
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
12499
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
12500
12451
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
12501
|
-
// Text convenience accessors (new names, always strings)
|
|
12502
12452
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
12503
12453
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
12504
12454
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
@@ -12625,10 +12575,8 @@ var TokenUsageEvaluator = class {
|
|
|
12625
12575
|
return {
|
|
12626
12576
|
score: 0,
|
|
12627
12577
|
verdict: "fail",
|
|
12628
|
-
|
|
12629
|
-
misses: ["No token usage data available in trace"],
|
|
12578
|
+
assertions: [{ text: "No token usage data available in trace", passed: false }],
|
|
12630
12579
|
expectedAspectCount,
|
|
12631
|
-
reasoning: "Token usage not reported by provider",
|
|
12632
12580
|
evaluatorRawRequest: {
|
|
12633
12581
|
type: "token-usage",
|
|
12634
12582
|
max_total: maxTotal ?? null,
|
|
@@ -12642,37 +12590,34 @@ var TokenUsageEvaluator = class {
|
|
|
12642
12590
|
const output = usage.output;
|
|
12643
12591
|
const cached = usage.cached ?? 0;
|
|
12644
12592
|
const total = input + output + cached;
|
|
12645
|
-
const
|
|
12646
|
-
const misses = [];
|
|
12593
|
+
const assertions = [];
|
|
12647
12594
|
if (typeof maxInput === "number") {
|
|
12648
12595
|
if (input <= maxInput) {
|
|
12649
|
-
|
|
12596
|
+
assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
|
|
12650
12597
|
} else {
|
|
12651
|
-
|
|
12598
|
+
assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
|
|
12652
12599
|
}
|
|
12653
12600
|
}
|
|
12654
12601
|
if (typeof maxOutput === "number") {
|
|
12655
12602
|
if (output <= maxOutput) {
|
|
12656
|
-
|
|
12603
|
+
assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
|
|
12657
12604
|
} else {
|
|
12658
|
-
|
|
12605
|
+
assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
|
|
12659
12606
|
}
|
|
12660
12607
|
}
|
|
12661
12608
|
if (typeof maxTotal === "number") {
|
|
12662
12609
|
if (total <= maxTotal) {
|
|
12663
|
-
|
|
12610
|
+
assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
|
|
12664
12611
|
} else {
|
|
12665
|
-
|
|
12612
|
+
assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
|
|
12666
12613
|
}
|
|
12667
12614
|
}
|
|
12668
|
-
const passed =
|
|
12615
|
+
const passed = assertions.every((a) => a.passed);
|
|
12669
12616
|
return {
|
|
12670
12617
|
score: passed ? 1 : 0,
|
|
12671
12618
|
verdict: passed ? "pass" : "fail",
|
|
12672
|
-
|
|
12673
|
-
misses,
|
|
12619
|
+
assertions,
|
|
12674
12620
|
expectedAspectCount,
|
|
12675
|
-
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
12676
12621
|
evaluatorRawRequest: {
|
|
12677
12622
|
type: "token-usage",
|
|
12678
12623
|
max_total: maxTotal ?? null,
|
|
@@ -12772,8 +12717,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12772
12717
|
return {
|
|
12773
12718
|
score: 0,
|
|
12774
12719
|
verdict: "fail",
|
|
12775
|
-
|
|
12776
|
-
misses: ["No trace available for evaluation"],
|
|
12720
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
12777
12721
|
expectedAspectCount: 1
|
|
12778
12722
|
};
|
|
12779
12723
|
}
|
|
@@ -12784,8 +12728,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12784
12728
|
return {
|
|
12785
12729
|
score: 0,
|
|
12786
12730
|
verdict: "fail",
|
|
12787
|
-
|
|
12788
|
-
misses: ["No trace available for evaluation"],
|
|
12731
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
12789
12732
|
expectedAspectCount: 1
|
|
12790
12733
|
};
|
|
12791
12734
|
}
|
|
@@ -12803,8 +12746,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12803
12746
|
return {
|
|
12804
12747
|
score: 0,
|
|
12805
12748
|
verdict: "fail",
|
|
12806
|
-
|
|
12807
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
12749
|
+
assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
|
|
12808
12750
|
expectedAspectCount: 1
|
|
12809
12751
|
};
|
|
12810
12752
|
}
|
|
@@ -12853,28 +12795,32 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12853
12795
|
return {
|
|
12854
12796
|
score: 1,
|
|
12855
12797
|
verdict: "pass",
|
|
12856
|
-
|
|
12857
|
-
misses: [],
|
|
12798
|
+
assertions: [{ text: "No tool requirements specified", passed: true }],
|
|
12858
12799
|
expectedAspectCount: 0
|
|
12859
12800
|
};
|
|
12860
12801
|
}
|
|
12861
|
-
const
|
|
12862
|
-
const misses = [];
|
|
12802
|
+
const assertions = [];
|
|
12863
12803
|
for (const toolName of toolNames) {
|
|
12864
12804
|
const required = minimums[toolName];
|
|
12865
12805
|
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
12866
12806
|
if (actual >= required) {
|
|
12867
|
-
|
|
12807
|
+
assertions.push({
|
|
12808
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
12809
|
+
passed: true
|
|
12810
|
+
});
|
|
12868
12811
|
} else {
|
|
12869
|
-
|
|
12812
|
+
assertions.push({
|
|
12813
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
12814
|
+
passed: false
|
|
12815
|
+
});
|
|
12870
12816
|
}
|
|
12871
12817
|
}
|
|
12872
|
-
const
|
|
12818
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
12819
|
+
const score = passedCount / toolNames.length;
|
|
12873
12820
|
return {
|
|
12874
12821
|
score,
|
|
12875
12822
|
verdict: scoreToVerdict(score),
|
|
12876
|
-
|
|
12877
|
-
misses,
|
|
12823
|
+
assertions,
|
|
12878
12824
|
expectedAspectCount: toolNames.length
|
|
12879
12825
|
};
|
|
12880
12826
|
}
|
|
@@ -12884,13 +12830,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12884
12830
|
return {
|
|
12885
12831
|
score: 1,
|
|
12886
12832
|
verdict: "pass",
|
|
12887
|
-
|
|
12888
|
-
misses: [],
|
|
12833
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
12889
12834
|
expectedAspectCount: 0
|
|
12890
12835
|
};
|
|
12891
12836
|
}
|
|
12892
|
-
const
|
|
12893
|
-
const misses = [];
|
|
12837
|
+
const assertions = [];
|
|
12894
12838
|
const warnings = [];
|
|
12895
12839
|
let actualIndex = 0;
|
|
12896
12840
|
let sequenceHits = 0;
|
|
@@ -12910,16 +12854,20 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12910
12854
|
const actualCall = toolCalls[actualIndex];
|
|
12911
12855
|
if (actualCall.name === expectedTool) {
|
|
12912
12856
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
12913
|
-
|
|
12857
|
+
assertions.push({
|
|
12858
|
+
text: `Found ${expectedTool} at position ${actualIndex}`,
|
|
12859
|
+
passed: true
|
|
12860
|
+
});
|
|
12914
12861
|
sequenceHits++;
|
|
12915
12862
|
matchedCall = actualCall;
|
|
12916
12863
|
actualIndex++;
|
|
12917
12864
|
found = true;
|
|
12918
12865
|
break;
|
|
12919
12866
|
}
|
|
12920
|
-
|
|
12921
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch
|
|
12922
|
-
|
|
12867
|
+
assertions.push({
|
|
12868
|
+
text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
|
|
12869
|
+
passed: false
|
|
12870
|
+
});
|
|
12923
12871
|
actualIndex++;
|
|
12924
12872
|
argsMismatch = true;
|
|
12925
12873
|
break;
|
|
@@ -12927,7 +12875,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12927
12875
|
actualIndex++;
|
|
12928
12876
|
}
|
|
12929
12877
|
if (!found && !argsMismatch) {
|
|
12930
|
-
|
|
12878
|
+
assertions.push({
|
|
12879
|
+
text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
|
|
12880
|
+
passed: false
|
|
12881
|
+
});
|
|
12931
12882
|
}
|
|
12932
12883
|
if (found && matchedCall) {
|
|
12933
12884
|
const latencyResult = checkLatency(
|
|
@@ -12936,10 +12887,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12936
12887
|
matchedCall.durationMs
|
|
12937
12888
|
);
|
|
12938
12889
|
if (latencyResult.status === "pass") {
|
|
12939
|
-
|
|
12890
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
12940
12891
|
latencyHits++;
|
|
12941
12892
|
} else if (latencyResult.status === "fail") {
|
|
12942
|
-
|
|
12893
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
12943
12894
|
} else if (latencyResult.message) {
|
|
12944
12895
|
warnings.push(latencyResult.message);
|
|
12945
12896
|
latencySkips++;
|
|
@@ -12955,8 +12906,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12955
12906
|
return {
|
|
12956
12907
|
score,
|
|
12957
12908
|
verdict: scoreToVerdict(score),
|
|
12958
|
-
|
|
12959
|
-
misses,
|
|
12909
|
+
assertions,
|
|
12960
12910
|
expectedAspectCount: totalAssertions
|
|
12961
12911
|
};
|
|
12962
12912
|
}
|
|
@@ -12966,13 +12916,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12966
12916
|
return {
|
|
12967
12917
|
score: 1,
|
|
12968
12918
|
verdict: "pass",
|
|
12969
|
-
|
|
12970
|
-
misses: [],
|
|
12919
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
12971
12920
|
expectedAspectCount: 0
|
|
12972
12921
|
};
|
|
12973
12922
|
}
|
|
12974
|
-
const
|
|
12975
|
-
const misses = [];
|
|
12923
|
+
const assertions = [];
|
|
12976
12924
|
const warnings = [];
|
|
12977
12925
|
let sequenceHits = 0;
|
|
12978
12926
|
let latencyHits = 0;
|
|
@@ -12981,7 +12929,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12981
12929
|
(item) => item.maxDurationMs !== void 0
|
|
12982
12930
|
).length;
|
|
12983
12931
|
if (toolCalls.length !== expected.length) {
|
|
12984
|
-
|
|
12932
|
+
assertions.push({
|
|
12933
|
+
text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
|
|
12934
|
+
passed: false
|
|
12935
|
+
});
|
|
12985
12936
|
}
|
|
12986
12937
|
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
12987
12938
|
for (let i = 0; i < checkLength; i++) {
|
|
@@ -12993,14 +12944,17 @@ var ToolTrajectoryEvaluator = class {
|
|
|
12993
12944
|
let sequenceMatched = false;
|
|
12994
12945
|
if (actualTool === expectedTool) {
|
|
12995
12946
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
12996
|
-
|
|
12947
|
+
assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
|
|
12997
12948
|
sequenceHits++;
|
|
12998
12949
|
sequenceMatched = true;
|
|
12999
12950
|
} else {
|
|
13000
|
-
|
|
12951
|
+
assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
|
|
13001
12952
|
}
|
|
13002
12953
|
} else {
|
|
13003
|
-
|
|
12954
|
+
assertions.push({
|
|
12955
|
+
text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
|
|
12956
|
+
passed: false
|
|
12957
|
+
});
|
|
13004
12958
|
}
|
|
13005
12959
|
if (sequenceMatched) {
|
|
13006
12960
|
const latencyResult = checkLatency(
|
|
@@ -13009,10 +12963,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13009
12963
|
actualCall.durationMs
|
|
13010
12964
|
);
|
|
13011
12965
|
if (latencyResult.status === "pass") {
|
|
13012
|
-
|
|
12966
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
13013
12967
|
latencyHits++;
|
|
13014
12968
|
} else if (latencyResult.status === "fail") {
|
|
13015
|
-
|
|
12969
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
13016
12970
|
} else if (latencyResult.message) {
|
|
13017
12971
|
warnings.push(latencyResult.message);
|
|
13018
12972
|
latencySkips++;
|
|
@@ -13020,7 +12974,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13020
12974
|
}
|
|
13021
12975
|
}
|
|
13022
12976
|
for (let i = checkLength; i < expected.length; i++) {
|
|
13023
|
-
|
|
12977
|
+
assertions.push({
|
|
12978
|
+
text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
|
|
12979
|
+
passed: false
|
|
12980
|
+
});
|
|
13024
12981
|
}
|
|
13025
12982
|
for (const warning of warnings) {
|
|
13026
12983
|
console.warn(`[tool-trajectory] ${warning}`);
|
|
@@ -13031,8 +12988,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13031
12988
|
return {
|
|
13032
12989
|
score,
|
|
13033
12990
|
verdict: scoreToVerdict(score),
|
|
13034
|
-
|
|
13035
|
-
misses,
|
|
12991
|
+
assertions,
|
|
13036
12992
|
expectedAspectCount: totalAssertions
|
|
13037
12993
|
};
|
|
13038
12994
|
}
|
|
@@ -13047,13 +13003,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13047
13003
|
return {
|
|
13048
13004
|
score: 1,
|
|
13049
13005
|
verdict: "pass",
|
|
13050
|
-
|
|
13051
|
-
misses: [],
|
|
13006
|
+
assertions: [{ text: "No expected tools specified", passed: true }],
|
|
13052
13007
|
expectedAspectCount: 0
|
|
13053
13008
|
};
|
|
13054
13009
|
}
|
|
13055
|
-
const
|
|
13056
|
-
const misses = [];
|
|
13010
|
+
const assertions = [];
|
|
13057
13011
|
const consumed = /* @__PURE__ */ new Set();
|
|
13058
13012
|
for (let i = 0; i < expected.length; i++) {
|
|
13059
13013
|
const expectedItem = expected[i];
|
|
@@ -13064,22 +13018,25 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13064
13018
|
if (consumed.has(j)) continue;
|
|
13065
13019
|
const actualCall = toolCalls[j];
|
|
13066
13020
|
if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
13067
|
-
|
|
13021
|
+
assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
|
|
13068
13022
|
consumed.add(j);
|
|
13069
13023
|
found = true;
|
|
13070
13024
|
break;
|
|
13071
13025
|
}
|
|
13072
13026
|
}
|
|
13073
13027
|
if (!found) {
|
|
13074
|
-
|
|
13028
|
+
assertions.push({
|
|
13029
|
+
text: `Expected ${expectedTool} not found in actual trajectory`,
|
|
13030
|
+
passed: false
|
|
13031
|
+
});
|
|
13075
13032
|
}
|
|
13076
13033
|
}
|
|
13077
|
-
const
|
|
13034
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
13035
|
+
const score = expected.length > 0 ? passedCount / expected.length : 1;
|
|
13078
13036
|
return {
|
|
13079
13037
|
score,
|
|
13080
13038
|
verdict: scoreToVerdict(score),
|
|
13081
|
-
|
|
13082
|
-
misses,
|
|
13039
|
+
assertions,
|
|
13083
13040
|
expectedAspectCount: expected.length
|
|
13084
13041
|
};
|
|
13085
13042
|
}
|
|
@@ -13095,16 +13052,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13095
13052
|
return {
|
|
13096
13053
|
score: 1,
|
|
13097
13054
|
verdict: "pass",
|
|
13098
|
-
|
|
13099
|
-
misses: [],
|
|
13055
|
+
assertions: [{ text: "No tool calls and no expected tools", passed: true }],
|
|
13100
13056
|
expectedAspectCount: 0
|
|
13101
13057
|
};
|
|
13102
13058
|
}
|
|
13103
13059
|
return {
|
|
13104
13060
|
score: 0,
|
|
13105
13061
|
verdict: "fail",
|
|
13106
|
-
|
|
13107
|
-
|
|
13062
|
+
assertions: [
|
|
13063
|
+
{
|
|
13064
|
+
text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
|
|
13065
|
+
passed: false
|
|
13066
|
+
}
|
|
13067
|
+
],
|
|
13108
13068
|
expectedAspectCount: toolCalls.length
|
|
13109
13069
|
};
|
|
13110
13070
|
}
|
|
@@ -13112,13 +13072,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13112
13072
|
return {
|
|
13113
13073
|
score: 1,
|
|
13114
13074
|
verdict: "pass",
|
|
13115
|
-
|
|
13116
|
-
misses: [],
|
|
13075
|
+
assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
|
|
13117
13076
|
expectedAspectCount: 0
|
|
13118
13077
|
};
|
|
13119
13078
|
}
|
|
13120
|
-
const
|
|
13121
|
-
const misses = [];
|
|
13079
|
+
const assertions = [];
|
|
13122
13080
|
for (let i = 0; i < toolCalls.length; i++) {
|
|
13123
13081
|
const actualCall = toolCalls[i];
|
|
13124
13082
|
let allowed = false;
|
|
@@ -13130,17 +13088,23 @@ var ToolTrajectoryEvaluator = class {
|
|
|
13130
13088
|
}
|
|
13131
13089
|
}
|
|
13132
13090
|
if (allowed) {
|
|
13133
|
-
|
|
13091
|
+
assertions.push({
|
|
13092
|
+
text: `Position ${i}: ${actualCall.name} is in allowed set`,
|
|
13093
|
+
passed: true
|
|
13094
|
+
});
|
|
13134
13095
|
} else {
|
|
13135
|
-
|
|
13096
|
+
assertions.push({
|
|
13097
|
+
text: `Position ${i}: ${actualCall.name} is not in allowed set`,
|
|
13098
|
+
passed: false
|
|
13099
|
+
});
|
|
13136
13100
|
}
|
|
13137
13101
|
}
|
|
13138
|
-
const
|
|
13102
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
13103
|
+
const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
|
|
13139
13104
|
return {
|
|
13140
13105
|
score,
|
|
13141
13106
|
verdict: scoreToVerdict(score),
|
|
13142
|
-
|
|
13143
|
-
misses,
|
|
13107
|
+
assertions,
|
|
13144
13108
|
expectedAspectCount: toolCalls.length
|
|
13145
13109
|
};
|
|
13146
13110
|
}
|
|
@@ -13151,8 +13115,12 @@ function runContainsAssertion(output, value) {
|
|
|
13151
13115
|
const passed = output.includes(value);
|
|
13152
13116
|
return {
|
|
13153
13117
|
score: passed ? 1 : 0,
|
|
13154
|
-
|
|
13155
|
-
|
|
13118
|
+
assertions: [
|
|
13119
|
+
{
|
|
13120
|
+
text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
|
|
13121
|
+
passed
|
|
13122
|
+
}
|
|
13123
|
+
]
|
|
13156
13124
|
};
|
|
13157
13125
|
}
|
|
13158
13126
|
function runContainsAnyAssertion(output, values) {
|
|
@@ -13160,8 +13128,12 @@ function runContainsAnyAssertion(output, values) {
|
|
|
13160
13128
|
const passed = matched.length > 0;
|
|
13161
13129
|
return {
|
|
13162
13130
|
score: passed ? 1 : 0,
|
|
13163
|
-
|
|
13164
|
-
|
|
13131
|
+
assertions: [
|
|
13132
|
+
{
|
|
13133
|
+
text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
|
|
13134
|
+
passed
|
|
13135
|
+
}
|
|
13136
|
+
]
|
|
13165
13137
|
};
|
|
13166
13138
|
}
|
|
13167
13139
|
function runContainsAllAssertion(output, values) {
|
|
@@ -13169,16 +13141,24 @@ function runContainsAllAssertion(output, values) {
|
|
|
13169
13141
|
const passed = missing.length === 0;
|
|
13170
13142
|
return {
|
|
13171
13143
|
score: passed ? 1 : 0,
|
|
13172
|
-
|
|
13173
|
-
|
|
13144
|
+
assertions: [
|
|
13145
|
+
{
|
|
13146
|
+
text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
13147
|
+
passed
|
|
13148
|
+
}
|
|
13149
|
+
]
|
|
13174
13150
|
};
|
|
13175
13151
|
}
|
|
13176
13152
|
function runIcontainsAssertion(output, value) {
|
|
13177
13153
|
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
13178
13154
|
return {
|
|
13179
13155
|
score: passed ? 1 : 0,
|
|
13180
|
-
|
|
13181
|
-
|
|
13156
|
+
assertions: [
|
|
13157
|
+
{
|
|
13158
|
+
text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
|
|
13159
|
+
passed
|
|
13160
|
+
}
|
|
13161
|
+
]
|
|
13182
13162
|
};
|
|
13183
13163
|
}
|
|
13184
13164
|
function runIcontainsAnyAssertion(output, values) {
|
|
@@ -13187,9 +13167,11 @@ function runIcontainsAnyAssertion(output, values) {
|
|
|
13187
13167
|
const passed = matched.length > 0;
|
|
13188
13168
|
return {
|
|
13189
13169
|
score: passed ? 1 : 0,
|
|
13190
|
-
|
|
13191
|
-
|
|
13192
|
-
|
|
13170
|
+
assertions: [
|
|
13171
|
+
{
|
|
13172
|
+
text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
|
|
13173
|
+
passed
|
|
13174
|
+
}
|
|
13193
13175
|
]
|
|
13194
13176
|
};
|
|
13195
13177
|
}
|
|
@@ -13199,24 +13181,36 @@ function runIcontainsAllAssertion(output, values) {
|
|
|
13199
13181
|
const passed = missing.length === 0;
|
|
13200
13182
|
return {
|
|
13201
13183
|
score: passed ? 1 : 0,
|
|
13202
|
-
|
|
13203
|
-
|
|
13184
|
+
assertions: [
|
|
13185
|
+
{
|
|
13186
|
+
text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
13187
|
+
passed
|
|
13188
|
+
}
|
|
13189
|
+
]
|
|
13204
13190
|
};
|
|
13205
13191
|
}
|
|
13206
13192
|
function runStartsWithAssertion(output, value) {
|
|
13207
13193
|
const passed = output.trim().startsWith(value.trim());
|
|
13208
13194
|
return {
|
|
13209
13195
|
score: passed ? 1 : 0,
|
|
13210
|
-
|
|
13211
|
-
|
|
13196
|
+
assertions: [
|
|
13197
|
+
{
|
|
13198
|
+
text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
|
|
13199
|
+
passed
|
|
13200
|
+
}
|
|
13201
|
+
]
|
|
13212
13202
|
};
|
|
13213
13203
|
}
|
|
13214
13204
|
function runEndsWithAssertion(output, value) {
|
|
13215
13205
|
const passed = output.trim().endsWith(value.trim());
|
|
13216
13206
|
return {
|
|
13217
13207
|
score: passed ? 1 : 0,
|
|
13218
|
-
|
|
13219
|
-
|
|
13208
|
+
assertions: [
|
|
13209
|
+
{
|
|
13210
|
+
text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
|
|
13211
|
+
passed
|
|
13212
|
+
}
|
|
13213
|
+
]
|
|
13220
13214
|
};
|
|
13221
13215
|
}
|
|
13222
13216
|
function runRegexAssertion(output, pattern, flags) {
|
|
@@ -13225,8 +13219,12 @@ function runRegexAssertion(output, pattern, flags) {
|
|
|
13225
13219
|
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
13226
13220
|
return {
|
|
13227
13221
|
score: passed ? 1 : 0,
|
|
13228
|
-
|
|
13229
|
-
|
|
13222
|
+
assertions: [
|
|
13223
|
+
{
|
|
13224
|
+
text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
|
|
13225
|
+
passed
|
|
13226
|
+
}
|
|
13227
|
+
]
|
|
13230
13228
|
};
|
|
13231
13229
|
}
|
|
13232
13230
|
function runIsJsonAssertion(output) {
|
|
@@ -13238,16 +13236,24 @@ function runIsJsonAssertion(output) {
|
|
|
13238
13236
|
}
|
|
13239
13237
|
return {
|
|
13240
13238
|
score: passed ? 1 : 0,
|
|
13241
|
-
|
|
13242
|
-
|
|
13239
|
+
assertions: [
|
|
13240
|
+
{
|
|
13241
|
+
text: passed ? "Output is valid JSON" : "Output is not valid JSON",
|
|
13242
|
+
passed
|
|
13243
|
+
}
|
|
13244
|
+
]
|
|
13243
13245
|
};
|
|
13244
13246
|
}
|
|
13245
13247
|
function runEqualsAssertion(output, value) {
|
|
13246
13248
|
const passed = output.trim() === value.trim();
|
|
13247
13249
|
return {
|
|
13248
13250
|
score: passed ? 1 : 0,
|
|
13249
|
-
|
|
13250
|
-
|
|
13251
|
+
assertions: [
|
|
13252
|
+
{
|
|
13253
|
+
text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
|
|
13254
|
+
passed
|
|
13255
|
+
}
|
|
13256
|
+
]
|
|
13251
13257
|
};
|
|
13252
13258
|
}
|
|
13253
13259
|
|
|
@@ -13460,10 +13466,8 @@ var InlineAssertEvaluator = class {
|
|
|
13460
13466
|
return {
|
|
13461
13467
|
score,
|
|
13462
13468
|
verdict: scoreToVerdict(score),
|
|
13463
|
-
|
|
13464
|
-
misses: score < 0.5 ? [result.name] : [],
|
|
13469
|
+
assertions: [{ text: result.name, passed: score >= 0.5 }],
|
|
13465
13470
|
expectedAspectCount: 1,
|
|
13466
|
-
reasoning: void 0,
|
|
13467
13471
|
details: result.metadata ? result.metadata : void 0
|
|
13468
13472
|
};
|
|
13469
13473
|
}
|
|
@@ -13501,11 +13505,9 @@ async function resolveCustomPrompt(promptConfig, context, timeoutMs) {
|
|
|
13501
13505
|
}
|
|
13502
13506
|
async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
13503
13507
|
const payload = {
|
|
13504
|
-
question: context.evalCase.question,
|
|
13505
13508
|
criteria: context.evalCase.criteria,
|
|
13506
13509
|
expectedOutput: context.evalCase.expected_output,
|
|
13507
|
-
|
|
13508
|
-
answer: context.candidate,
|
|
13510
|
+
outputText: context.candidate,
|
|
13509
13511
|
output: context.output ?? null,
|
|
13510
13512
|
guidelineFiles: context.evalCase.guideline_paths,
|
|
13511
13513
|
inputFiles: context.evalCase.file_paths.filter(
|
|
@@ -13516,9 +13518,7 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
|
|
|
13516
13518
|
fileChanges: context.fileChanges ?? null,
|
|
13517
13519
|
workspacePath: context.workspacePath ?? null,
|
|
13518
13520
|
config: config ?? context.config ?? null,
|
|
13519
|
-
// Text convenience accessors (new names, always strings)
|
|
13520
13521
|
inputText: context.evalCase.question,
|
|
13521
|
-
outputText: context.candidate,
|
|
13522
13522
|
expectedOutputText: context.evalCase.reference_answer ?? ""
|
|
13523
13523
|
};
|
|
13524
13524
|
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -13656,9 +13656,7 @@ var containsFactory = (config) => {
|
|
|
13656
13656
|
return {
|
|
13657
13657
|
score: result.score,
|
|
13658
13658
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13659
|
-
|
|
13660
|
-
misses: result.misses,
|
|
13661
|
-
reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
|
|
13659
|
+
assertions: result.assertions,
|
|
13662
13660
|
expectedAspectCount: 1
|
|
13663
13661
|
};
|
|
13664
13662
|
});
|
|
@@ -13670,9 +13668,7 @@ var regexFactory = (config) => {
|
|
|
13670
13668
|
return {
|
|
13671
13669
|
score: result.score,
|
|
13672
13670
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13673
|
-
|
|
13674
|
-
misses: result.misses,
|
|
13675
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
13671
|
+
assertions: result.assertions,
|
|
13676
13672
|
expectedAspectCount: 1
|
|
13677
13673
|
};
|
|
13678
13674
|
});
|
|
@@ -13683,9 +13679,7 @@ var isJsonFactory = () => {
|
|
|
13683
13679
|
return {
|
|
13684
13680
|
score: result.score,
|
|
13685
13681
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13686
|
-
|
|
13687
|
-
misses: result.misses,
|
|
13688
|
-
reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
|
|
13682
|
+
assertions: result.assertions,
|
|
13689
13683
|
expectedAspectCount: 1
|
|
13690
13684
|
};
|
|
13691
13685
|
});
|
|
@@ -13697,9 +13691,7 @@ var equalsFactory = (config) => {
|
|
|
13697
13691
|
return {
|
|
13698
13692
|
score: result.score,
|
|
13699
13693
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13700
|
-
|
|
13701
|
-
misses: result.misses,
|
|
13702
|
-
reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
|
|
13694
|
+
assertions: result.assertions,
|
|
13703
13695
|
expectedAspectCount: 1
|
|
13704
13696
|
};
|
|
13705
13697
|
});
|
|
@@ -13711,9 +13703,7 @@ var containsAnyFactory = (config) => {
|
|
|
13711
13703
|
return {
|
|
13712
13704
|
score: result.score,
|
|
13713
13705
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13714
|
-
|
|
13715
|
-
misses: result.misses,
|
|
13716
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13706
|
+
assertions: result.assertions,
|
|
13717
13707
|
expectedAspectCount: 1
|
|
13718
13708
|
};
|
|
13719
13709
|
});
|
|
@@ -13725,9 +13715,7 @@ var containsAllFactory = (config) => {
|
|
|
13725
13715
|
return {
|
|
13726
13716
|
score: result.score,
|
|
13727
13717
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13728
|
-
|
|
13729
|
-
misses: result.misses,
|
|
13730
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13718
|
+
assertions: result.assertions,
|
|
13731
13719
|
expectedAspectCount: 1
|
|
13732
13720
|
};
|
|
13733
13721
|
});
|
|
@@ -13739,9 +13727,7 @@ var icontainsFactory = (config) => {
|
|
|
13739
13727
|
return {
|
|
13740
13728
|
score: result.score,
|
|
13741
13729
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13742
|
-
|
|
13743
|
-
misses: result.misses,
|
|
13744
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13730
|
+
assertions: result.assertions,
|
|
13745
13731
|
expectedAspectCount: 1
|
|
13746
13732
|
};
|
|
13747
13733
|
});
|
|
@@ -13753,9 +13739,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
13753
13739
|
return {
|
|
13754
13740
|
score: result.score,
|
|
13755
13741
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13756
|
-
|
|
13757
|
-
misses: result.misses,
|
|
13758
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13742
|
+
assertions: result.assertions,
|
|
13759
13743
|
expectedAspectCount: 1
|
|
13760
13744
|
};
|
|
13761
13745
|
});
|
|
@@ -13767,9 +13751,7 @@ var icontainsAllFactory = (config) => {
|
|
|
13767
13751
|
return {
|
|
13768
13752
|
score: result.score,
|
|
13769
13753
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13770
|
-
|
|
13771
|
-
misses: result.misses,
|
|
13772
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13754
|
+
assertions: result.assertions,
|
|
13773
13755
|
expectedAspectCount: 1
|
|
13774
13756
|
};
|
|
13775
13757
|
});
|
|
@@ -13781,9 +13763,7 @@ var startsWithFactory = (config) => {
|
|
|
13781
13763
|
return {
|
|
13782
13764
|
score: result.score,
|
|
13783
13765
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13784
|
-
|
|
13785
|
-
misses: result.misses,
|
|
13786
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13766
|
+
assertions: result.assertions,
|
|
13787
13767
|
expectedAspectCount: 1
|
|
13788
13768
|
};
|
|
13789
13769
|
});
|
|
@@ -13795,9 +13775,7 @@ var endsWithFactory = (config) => {
|
|
|
13795
13775
|
return {
|
|
13796
13776
|
score: result.score,
|
|
13797
13777
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
13798
|
-
|
|
13799
|
-
misses: result.misses,
|
|
13800
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
13778
|
+
assertions: result.assertions,
|
|
13801
13779
|
expectedAspectCount: 1
|
|
13802
13780
|
};
|
|
13803
13781
|
});
|
|
@@ -14868,7 +14846,7 @@ async function runEvaluation(options) {
|
|
|
14868
14846
|
if (!cliModel) {
|
|
14869
14847
|
throw new Error('--grader-target "agentv" requires --model (e.g., "openai:gpt-5-mini")');
|
|
14870
14848
|
}
|
|
14871
|
-
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-
|
|
14849
|
+
const { AgentvProvider: AgentvProvider2 } = await import("./agentv-provider-NFFLXG5M.js");
|
|
14872
14850
|
return new AgentvProvider2("agentv", { model: cliModel, temperature: 0 });
|
|
14873
14851
|
}
|
|
14874
14852
|
const overrideTarget = resolveTargetByName(cliGraderTarget);
|
|
@@ -15203,9 +15181,8 @@ async function runEvaluation(options) {
|
|
|
15203
15181
|
testId: evalCase.id,
|
|
15204
15182
|
dataset: evalCase.dataset,
|
|
15205
15183
|
score: 0,
|
|
15206
|
-
|
|
15207
|
-
|
|
15208
|
-
answer: "",
|
|
15184
|
+
assertions: [],
|
|
15185
|
+
outputText: "",
|
|
15209
15186
|
target: target.name,
|
|
15210
15187
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
15211
15188
|
budgetExceeded: true,
|
|
@@ -15240,9 +15217,8 @@ async function runEvaluation(options) {
|
|
|
15240
15217
|
testId: evalCase.id,
|
|
15241
15218
|
dataset: evalCase.dataset,
|
|
15242
15219
|
score: 0,
|
|
15243
|
-
|
|
15244
|
-
|
|
15245
|
-
answer: "",
|
|
15220
|
+
assertions: [],
|
|
15221
|
+
outputText: "",
|
|
15246
15222
|
target: target.name,
|
|
15247
15223
|
error: errorMsg,
|
|
15248
15224
|
executionStatus: "execution_error",
|
|
@@ -16208,11 +16184,9 @@ async function evaluateCandidate(options) {
|
|
|
16208
16184
|
dataset: evalCase.dataset,
|
|
16209
16185
|
conversationId: evalCase.conversation_id,
|
|
16210
16186
|
score: score.score,
|
|
16211
|
-
|
|
16212
|
-
|
|
16213
|
-
answer: candidate,
|
|
16187
|
+
assertions: score.assertions,
|
|
16188
|
+
outputText: candidate,
|
|
16214
16189
|
target: target.name,
|
|
16215
|
-
reasoning: score.reasoning,
|
|
16216
16190
|
tokenUsage,
|
|
16217
16191
|
costUsd,
|
|
16218
16192
|
durationMs,
|
|
@@ -16386,9 +16360,7 @@ async function runEvaluatorList(options) {
|
|
|
16386
16360
|
score: score2.score,
|
|
16387
16361
|
weight,
|
|
16388
16362
|
verdict: score2.verdict,
|
|
16389
|
-
|
|
16390
|
-
misses: score2.misses,
|
|
16391
|
-
reasoning: score2.reasoning,
|
|
16363
|
+
assertions: score2.assertions,
|
|
16392
16364
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
16393
16365
|
details: score2.details,
|
|
16394
16366
|
scores: mapChildResults(score2.scores),
|
|
@@ -16403,10 +16375,10 @@ async function runEvaluatorList(options) {
|
|
|
16403
16375
|
const fallbackScore = {
|
|
16404
16376
|
score: 0,
|
|
16405
16377
|
verdict: "fail",
|
|
16406
|
-
|
|
16407
|
-
|
|
16408
|
-
|
|
16409
|
-
|
|
16378
|
+
assertions: [
|
|
16379
|
+
{ text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
|
|
16380
|
+
],
|
|
16381
|
+
expectedAspectCount: 1
|
|
16410
16382
|
};
|
|
16411
16383
|
const weight = evaluatorConfig.weight ?? 1;
|
|
16412
16384
|
scored.push({
|
|
@@ -16422,9 +16394,12 @@ async function runEvaluatorList(options) {
|
|
|
16422
16394
|
score: 0,
|
|
16423
16395
|
weight,
|
|
16424
16396
|
verdict: "fail",
|
|
16425
|
-
|
|
16426
|
-
|
|
16427
|
-
|
|
16397
|
+
assertions: [
|
|
16398
|
+
{
|
|
16399
|
+
text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
|
|
16400
|
+
passed: false
|
|
16401
|
+
}
|
|
16402
|
+
],
|
|
16428
16403
|
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
16429
16404
|
startedAt: startedAt.toISOString(),
|
|
16430
16405
|
endedAt: endedAt.toISOString()
|
|
@@ -16440,9 +16415,7 @@ async function runEvaluatorList(options) {
|
|
|
16440
16415
|
...scores[lastScoresIdx],
|
|
16441
16416
|
score: negated.score,
|
|
16442
16417
|
verdict: negated.verdict,
|
|
16443
|
-
|
|
16444
|
-
misses: [...negated.misses],
|
|
16445
|
-
reasoning: negated.reasoning
|
|
16418
|
+
assertions: [...negated.assertions]
|
|
16446
16419
|
};
|
|
16447
16420
|
}
|
|
16448
16421
|
}
|
|
@@ -16457,21 +16430,13 @@ async function runEvaluatorList(options) {
|
|
|
16457
16430
|
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
16458
16431
|
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
16459
16432
|
) : 0;
|
|
16460
|
-
const
|
|
16461
|
-
const
|
|
16462
|
-
const expectedAspectCount = scored.reduce(
|
|
16463
|
-
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
16464
|
-
0
|
|
16465
|
-
);
|
|
16466
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
16467
|
-
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
16433
|
+
const assertions = scored.flatMap((entry) => entry.score.assertions);
|
|
16434
|
+
const expectedAspectCount = assertions.length || 1;
|
|
16468
16435
|
const score = {
|
|
16469
16436
|
score: aggregateScore,
|
|
16470
16437
|
verdict: scoreToVerdict(aggregateScore),
|
|
16471
|
-
|
|
16472
|
-
|
|
16473
|
-
expectedAspectCount,
|
|
16474
|
-
reasoning
|
|
16438
|
+
assertions,
|
|
16439
|
+
expectedAspectCount
|
|
16475
16440
|
};
|
|
16476
16441
|
return { score, scores };
|
|
16477
16442
|
}
|
|
@@ -16575,9 +16540,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
16575
16540
|
dataset: evalCase.dataset,
|
|
16576
16541
|
conversationId: evalCase.conversation_id,
|
|
16577
16542
|
score: 0,
|
|
16578
|
-
|
|
16579
|
-
|
|
16580
|
-
answer: `Error occurred: ${message}`,
|
|
16543
|
+
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
16544
|
+
outputText: `Error occurred: ${message}`,
|
|
16581
16545
|
target: targetName,
|
|
16582
16546
|
requests,
|
|
16583
16547
|
input,
|
|
@@ -16686,9 +16650,7 @@ function mapChildResults(children) {
|
|
|
16686
16650
|
score: child.score,
|
|
16687
16651
|
weight: child.weight,
|
|
16688
16652
|
verdict: child.verdict,
|
|
16689
|
-
|
|
16690
|
-
misses: child.misses,
|
|
16691
|
-
reasoning: child.reasoning,
|
|
16653
|
+
assertions: child.assertions,
|
|
16692
16654
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
16693
16655
|
scores: mapChildResults(child.scores),
|
|
16694
16656
|
details: child.details,
|
|
@@ -17117,7 +17079,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
17117
17079
|
|
|
17118
17080
|
// src/evaluation/baseline.ts
|
|
17119
17081
|
var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
17120
|
-
"
|
|
17082
|
+
"outputText",
|
|
17121
17083
|
"requests",
|
|
17122
17084
|
"trace",
|
|
17123
17085
|
"workspacePath",
|
|
@@ -17291,7 +17253,7 @@ var OtelTraceExporter = class {
|
|
|
17291
17253
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
17292
17254
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
17293
17255
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
17294
|
-
if (captureContent) rootSpan.setAttribute("agentv.
|
|
17256
|
+
if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
|
|
17295
17257
|
if (result.durationMs != null)
|
|
17296
17258
|
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
17297
17259
|
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
@@ -17653,7 +17615,6 @@ export {
|
|
|
17653
17615
|
freeformEvaluationSchema,
|
|
17654
17616
|
generateRubrics,
|
|
17655
17617
|
getAgentvHome,
|
|
17656
|
-
getHitCount,
|
|
17657
17618
|
getOutputFilenames,
|
|
17658
17619
|
getSubagentsRoot,
|
|
17659
17620
|
getTraceStateRoot,
|