@agentv/core 3.4.0 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-NFFLXG5M.js +7 -0
- package/dist/{chunk-JO4HIAEF.js → chunk-2IZOTQ25.js} +1 -5
- package/dist/chunk-2IZOTQ25.js.map +1 -0
- package/dist/{chunk-Q52FQPKQ.js → chunk-W5YDZWT4.js} +2 -2
- package/dist/chunk-W5YDZWT4.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +449 -491
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +57 -47
- package/dist/index.d.ts +57 -47
- package/dist/index.js +451 -490
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/agentv-provider-HDSAUUEF.js +0 -7
- package/dist/chunk-JO4HIAEF.js.map +0 -1
- package/dist/chunk-Q52FQPKQ.js.map +0 -1
- /package/dist/{agentv-provider-HDSAUUEF.js.map → agentv-provider-NFFLXG5M.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -55,7 +55,7 @@ function createLanguageModel(modelString) {
|
|
|
55
55
|
case "anthropic":
|
|
56
56
|
return (0, import_anthropic.createAnthropic)()(modelName);
|
|
57
57
|
case "azure":
|
|
58
|
-
return (0, import_azure.createAzure)()(modelName);
|
|
58
|
+
return (0, import_azure.createAzure)().chat(modelName);
|
|
59
59
|
case "google":
|
|
60
60
|
return (0, import_google.createGoogleGenerativeAI)()(modelName);
|
|
61
61
|
default:
|
|
@@ -1580,7 +1580,6 @@ __export(index_exports, {
|
|
|
1580
1580
|
freeformEvaluationSchema: () => freeformEvaluationSchema,
|
|
1581
1581
|
generateRubrics: () => generateRubrics,
|
|
1582
1582
|
getAgentvHome: () => getAgentvHome,
|
|
1583
|
-
getHitCount: () => getHitCount,
|
|
1584
1583
|
getOutputFilenames: () => getOutputFilenames,
|
|
1585
1584
|
getSubagentsRoot: () => getSubagentsRoot,
|
|
1586
1585
|
getTraceStateRoot: () => getTraceStateRoot,
|
|
@@ -1730,9 +1729,6 @@ var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
|
1730
1729
|
function isEvaluatorKind(value) {
|
|
1731
1730
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
1732
1731
|
}
|
|
1733
|
-
function getHitCount(result) {
|
|
1734
|
-
return result.hits.length;
|
|
1735
|
-
}
|
|
1736
1732
|
|
|
1737
1733
|
// src/evaluation/trace.ts
|
|
1738
1734
|
function computeTraceSummary(messages) {
|
|
@@ -2449,14 +2445,8 @@ var import_promises5 = require("fs/promises");
|
|
|
2449
2445
|
|
|
2450
2446
|
// src/evaluation/template-variables.ts
|
|
2451
2447
|
var TEMPLATE_VARIABLES = {
|
|
2452
|
-
/** @deprecated Use OUTPUT_TEXT instead */
|
|
2453
|
-
ANSWER: "answer",
|
|
2454
2448
|
EXPECTED_OUTPUT: "expected_output",
|
|
2455
|
-
/** @deprecated Use INPUT_TEXT instead */
|
|
2456
|
-
QUESTION: "question",
|
|
2457
2449
|
CRITERIA: "criteria",
|
|
2458
|
-
/** @deprecated Use EXPECTED_OUTPUT_TEXT instead */
|
|
2459
|
-
REFERENCE_ANSWER: "reference_answer",
|
|
2460
2450
|
INPUT: "input",
|
|
2461
2451
|
OUTPUT: "output",
|
|
2462
2452
|
FILE_CHANGES: "file_changes",
|
|
@@ -2466,9 +2456,8 @@ var TEMPLATE_VARIABLES = {
|
|
|
2466
2456
|
};
|
|
2467
2457
|
var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
|
|
2468
2458
|
var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
|
|
2469
|
-
TEMPLATE_VARIABLES.
|
|
2470
|
-
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
2471
|
-
TEMPLATE_VARIABLES.OUTPUT_TEXT
|
|
2459
|
+
TEMPLATE_VARIABLES.OUTPUT_TEXT,
|
|
2460
|
+
TEMPLATE_VARIABLES.EXPECTED_OUTPUT
|
|
2472
2461
|
]);
|
|
2473
2462
|
|
|
2474
2463
|
// src/evaluation/validation/prompt-validator.ts
|
|
@@ -2491,13 +2480,13 @@ function validateTemplateVariables(content, source) {
|
|
|
2491
2480
|
}
|
|
2492
2481
|
match = variablePattern.exec(content);
|
|
2493
2482
|
}
|
|
2494
|
-
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.
|
|
2483
|
+
const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
|
|
2495
2484
|
const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
|
|
2496
2485
|
const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
|
|
2497
2486
|
if (!hasRequiredFields) {
|
|
2498
2487
|
throw new Error(
|
|
2499
2488
|
`Missing required fields. Must include at least one of:
|
|
2500
|
-
- {{ ${TEMPLATE_VARIABLES.
|
|
2489
|
+
- {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
|
|
2501
2490
|
- {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
|
|
2502
2491
|
);
|
|
2503
2492
|
}
|
|
@@ -5576,7 +5565,7 @@ var AzureProvider = class {
|
|
|
5576
5565
|
};
|
|
5577
5566
|
this.retryConfig = config.retry;
|
|
5578
5567
|
const azure = (0, import_azure2.createAzure)(buildAzureOptions(config));
|
|
5579
|
-
this.model = azure(config.deploymentName);
|
|
5568
|
+
this.model = azure.chat(config.deploymentName);
|
|
5580
5569
|
}
|
|
5581
5570
|
id;
|
|
5582
5571
|
kind = "azure";
|
|
@@ -5799,6 +5788,8 @@ async function invokeModel(options) {
|
|
|
5799
5788
|
const { model, request, defaults, retryConfig, providerOptions } = options;
|
|
5800
5789
|
const chatPrompt = buildChatPrompt(request);
|
|
5801
5790
|
const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
|
|
5791
|
+
const startTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
5792
|
+
const startMs = Date.now();
|
|
5802
5793
|
const result = await withRetry(
|
|
5803
5794
|
() => (0, import_ai.generateText)({
|
|
5804
5795
|
model,
|
|
@@ -5812,9 +5803,11 @@ async function invokeModel(options) {
|
|
|
5812
5803
|
retryConfig,
|
|
5813
5804
|
request.signal
|
|
5814
5805
|
);
|
|
5815
|
-
|
|
5806
|
+
const endTime = (/* @__PURE__ */ new Date()).toISOString();
|
|
5807
|
+
const durationMs = Date.now() - startMs;
|
|
5808
|
+
return mapResponse(result, { durationMs, startTime, endTime });
|
|
5816
5809
|
}
|
|
5817
|
-
function mapResponse(result) {
|
|
5810
|
+
function mapResponse(result, timing) {
|
|
5818
5811
|
const content = result.text ?? "";
|
|
5819
5812
|
const rawUsage = result.totalUsage ?? result.usage;
|
|
5820
5813
|
const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? void 0;
|
|
@@ -5829,7 +5822,10 @@ function mapResponse(result) {
|
|
|
5829
5822
|
raw: result,
|
|
5830
5823
|
usage: toJsonObject(rawUsage),
|
|
5831
5824
|
output: [{ role: "assistant", content }],
|
|
5832
|
-
tokenUsage
|
|
5825
|
+
tokenUsage,
|
|
5826
|
+
durationMs: timing?.durationMs,
|
|
5827
|
+
startTime: timing?.startTime,
|
|
5828
|
+
endTime: timing?.endTime
|
|
5833
5829
|
};
|
|
5834
5830
|
}
|
|
5835
5831
|
function toJsonObject(value) {
|
|
@@ -6707,10 +6703,12 @@ var ClaudeSdkProvider = class {
|
|
|
6707
6703
|
if (usage) {
|
|
6708
6704
|
const inputTokens = (usage.input_tokens ?? 0) + (usage.cache_read_input_tokens ?? 0) + (usage.cache_creation_input_tokens ?? 0);
|
|
6709
6705
|
const outputTokens = usage.output_tokens ?? 0;
|
|
6706
|
+
const reasoningTokens = usage.reasoning_tokens ?? void 0;
|
|
6710
6707
|
tokenUsage = {
|
|
6711
6708
|
input: inputTokens,
|
|
6712
6709
|
output: outputTokens,
|
|
6713
|
-
cached: usage.cache_read_input_tokens ?? void 0
|
|
6710
|
+
cached: usage.cache_read_input_tokens ?? void 0,
|
|
6711
|
+
reasoning: reasoningTokens
|
|
6714
6712
|
};
|
|
6715
6713
|
request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? "claude", tokenUsage);
|
|
6716
6714
|
}
|
|
@@ -7724,7 +7722,8 @@ ${basePrompt}` : basePrompt;
|
|
|
7724
7722
|
onUsage({
|
|
7725
7723
|
input: usage.input_tokens ?? 0,
|
|
7726
7724
|
output: usage.output_tokens ?? 0,
|
|
7727
|
-
cached: usage.cached_input_tokens ?? void 0
|
|
7725
|
+
cached: usage.cached_input_tokens ?? void 0,
|
|
7726
|
+
reasoning: usage.reasoning_tokens ?? void 0
|
|
7728
7727
|
});
|
|
7729
7728
|
}
|
|
7730
7729
|
}
|
|
@@ -9739,10 +9738,12 @@ function extractTokenUsage(events) {
|
|
|
9739
9738
|
output: output ?? 0
|
|
9740
9739
|
};
|
|
9741
9740
|
const cached = toFiniteNumber(u.cache_read_input_tokens ?? u.cached ?? u.cachedTokens);
|
|
9742
|
-
|
|
9743
|
-
|
|
9744
|
-
|
|
9745
|
-
|
|
9741
|
+
const reasoning = toFiniteNumber(u.reasoning_tokens ?? u.reasoningTokens ?? u.reasoning);
|
|
9742
|
+
return {
|
|
9743
|
+
...result,
|
|
9744
|
+
...cached !== void 0 ? { cached } : {},
|
|
9745
|
+
...reasoning !== void 0 ? { reasoning } : {}
|
|
9746
|
+
};
|
|
9746
9747
|
}
|
|
9747
9748
|
}
|
|
9748
9749
|
const messages = record.messages;
|
|
@@ -12807,9 +12808,11 @@ function negateScore(score) {
|
|
|
12807
12808
|
...score,
|
|
12808
12809
|
score: negatedScore,
|
|
12809
12810
|
verdict: negatedVerdict,
|
|
12810
|
-
|
|
12811
|
-
|
|
12812
|
-
|
|
12811
|
+
assertions: score.assertions.map((a) => ({
|
|
12812
|
+
...a,
|
|
12813
|
+
passed: !a.passed,
|
|
12814
|
+
evidence: a.evidence ? `[Negated] ${a.evidence}` : void 0
|
|
12815
|
+
}))
|
|
12813
12816
|
};
|
|
12814
12817
|
}
|
|
12815
12818
|
|
|
@@ -13267,11 +13270,9 @@ var CodeEvaluator = class {
|
|
|
13267
13270
|
}
|
|
13268
13271
|
}
|
|
13269
13272
|
const payload = {
|
|
13270
|
-
question: context2.evalCase.question,
|
|
13271
13273
|
criteria: context2.evalCase.criteria,
|
|
13272
13274
|
expectedOutput: context2.evalCase.expected_output,
|
|
13273
|
-
|
|
13274
|
-
answer: context2.candidate,
|
|
13275
|
+
outputText: context2.candidate,
|
|
13275
13276
|
output: outputForPayload,
|
|
13276
13277
|
outputPath,
|
|
13277
13278
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
@@ -13288,9 +13289,7 @@ var CodeEvaluator = class {
|
|
|
13288
13289
|
fileChanges: context2.fileChanges ?? null,
|
|
13289
13290
|
workspacePath: context2.workspacePath ?? null,
|
|
13290
13291
|
config: this.config ?? null,
|
|
13291
|
-
// Text convenience accessors (new names, always strings)
|
|
13292
13292
|
inputText: context2.evalCase.question,
|
|
13293
|
-
outputText: context2.candidate,
|
|
13294
13293
|
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
13295
13294
|
};
|
|
13296
13295
|
const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -13324,9 +13323,13 @@ var CodeEvaluator = class {
|
|
|
13324
13323
|
);
|
|
13325
13324
|
const parsed = parseJsonSafe(stdout);
|
|
13326
13325
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
13327
|
-
const
|
|
13328
|
-
|
|
13329
|
-
|
|
13326
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
13327
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
13328
|
+
).map((a) => ({
|
|
13329
|
+
text: String(a.text),
|
|
13330
|
+
passed: Boolean(a.passed),
|
|
13331
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
13332
|
+
})) : [];
|
|
13330
13333
|
const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
|
|
13331
13334
|
const proxyUsage = getProxyUsage?.();
|
|
13332
13335
|
const evaluatorRawRequest = {
|
|
@@ -13342,10 +13345,8 @@ var CodeEvaluator = class {
|
|
|
13342
13345
|
return {
|
|
13343
13346
|
score,
|
|
13344
13347
|
verdict: scoreToVerdict(score),
|
|
13345
|
-
|
|
13346
|
-
|
|
13347
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
13348
|
-
reasoning,
|
|
13348
|
+
assertions,
|
|
13349
|
+
expectedAspectCount: assertions.length || 1,
|
|
13349
13350
|
evaluatorRawRequest,
|
|
13350
13351
|
...details ? { details } : {},
|
|
13351
13352
|
tokenUsage: proxyUsage?.tokenUsage
|
|
@@ -13356,10 +13357,8 @@ var CodeEvaluator = class {
|
|
|
13356
13357
|
return {
|
|
13357
13358
|
score: 0,
|
|
13358
13359
|
verdict: "fail",
|
|
13359
|
-
|
|
13360
|
-
misses: [`Code evaluator failed: ${message}`],
|
|
13360
|
+
assertions: [{ text: `Code evaluator failed: ${message}`, passed: false }],
|
|
13361
13361
|
expectedAspectCount: 1,
|
|
13362
|
-
reasoning: message,
|
|
13363
13362
|
evaluatorRawRequest: {
|
|
13364
13363
|
command: this.command,
|
|
13365
13364
|
...this.cwd ? { cwd: this.cwd } : {},
|
|
@@ -13490,18 +13489,22 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
13490
13489
|
{{${TEMPLATE_VARIABLES.CRITERIA}}}
|
|
13491
13490
|
|
|
13492
13491
|
[[ ## question ## ]]
|
|
13493
|
-
{{${TEMPLATE_VARIABLES.
|
|
13492
|
+
{{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
|
|
13494
13493
|
|
|
13495
13494
|
[[ ## reference_answer ## ]]
|
|
13496
|
-
{{${TEMPLATE_VARIABLES.
|
|
13495
|
+
{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
|
|
13497
13496
|
|
|
13498
13497
|
[[ ## answer ## ]]
|
|
13499
|
-
{{${TEMPLATE_VARIABLES.
|
|
13498
|
+
{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
|
|
13500
13499
|
var freeformEvaluationSchema = import_zod4.z.object({
|
|
13501
13500
|
score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
13502
|
-
|
|
13503
|
-
|
|
13504
|
-
|
|
13501
|
+
assertions: import_zod4.z.array(
|
|
13502
|
+
import_zod4.z.object({
|
|
13503
|
+
text: import_zod4.z.string().describe("Brief description of what was checked"),
|
|
13504
|
+
passed: import_zod4.z.boolean().describe("Whether this aspect was satisfied"),
|
|
13505
|
+
evidence: import_zod4.z.string().describe("Concise evidence (1-2 sentences)").optional()
|
|
13506
|
+
})
|
|
13507
|
+
).describe("Per-aspect evaluation results \u2014 one entry per aspect checked").optional()
|
|
13505
13508
|
});
|
|
13506
13509
|
var rubricCheckResultSchema = import_zod4.z.object({
|
|
13507
13510
|
id: import_zod4.z.string().describe("The ID of the rubric item being checked"),
|
|
@@ -13570,12 +13573,8 @@ var LlmGraderEvaluator = class {
|
|
|
13570
13573
|
2
|
|
13571
13574
|
),
|
|
13572
13575
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
|
|
13573
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
13574
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13575
13576
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
13576
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
13577
13577
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
|
|
13578
|
-
// Text convenience accessors (new names, always strings)
|
|
13579
13578
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
13580
13579
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
13581
13580
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
|
|
@@ -13603,17 +13602,12 @@ ${context2.fileChanges}`;
|
|
|
13603
13602
|
schema: freeformEvaluationSchema
|
|
13604
13603
|
});
|
|
13605
13604
|
const score = clampScore(data.score);
|
|
13606
|
-
const
|
|
13607
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
13608
|
-
const reasoning = data.reasoning;
|
|
13609
|
-
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
13605
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
13610
13606
|
return {
|
|
13611
13607
|
score,
|
|
13612
13608
|
verdict: scoreToVerdict(score),
|
|
13613
|
-
|
|
13614
|
-
|
|
13615
|
-
expectedAspectCount,
|
|
13616
|
-
reasoning,
|
|
13609
|
+
assertions,
|
|
13610
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
13617
13611
|
evaluatorRawRequest,
|
|
13618
13612
|
tokenUsage
|
|
13619
13613
|
};
|
|
@@ -13624,10 +13618,8 @@ ${context2.fileChanges}`;
|
|
|
13624
13618
|
return {
|
|
13625
13619
|
score: 0,
|
|
13626
13620
|
verdict: "skip",
|
|
13627
|
-
|
|
13628
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
13621
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13629
13622
|
expectedAspectCount: 1,
|
|
13630
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
13631
13623
|
evaluatorRawRequest
|
|
13632
13624
|
};
|
|
13633
13625
|
}
|
|
@@ -13657,14 +13649,12 @@ ${context2.fileChanges}`;
|
|
|
13657
13649
|
userPrompt: prompt,
|
|
13658
13650
|
schema: rubricEvaluationSchema
|
|
13659
13651
|
});
|
|
13660
|
-
const { score, verdict,
|
|
13652
|
+
const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
|
|
13661
13653
|
return {
|
|
13662
13654
|
score,
|
|
13663
13655
|
verdict,
|
|
13664
|
-
|
|
13665
|
-
misses,
|
|
13656
|
+
assertions,
|
|
13666
13657
|
expectedAspectCount: rubrics.length,
|
|
13667
|
-
reasoning: data.overall_reasoning,
|
|
13668
13658
|
evaluatorRawRequest,
|
|
13669
13659
|
tokenUsage
|
|
13670
13660
|
};
|
|
@@ -13675,10 +13665,8 @@ ${context2.fileChanges}`;
|
|
|
13675
13665
|
return {
|
|
13676
13666
|
score: 0,
|
|
13677
13667
|
verdict: "skip",
|
|
13678
|
-
|
|
13679
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
13668
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13680
13669
|
expectedAspectCount: rubrics.length,
|
|
13681
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
13682
13670
|
evaluatorRawRequest
|
|
13683
13671
|
};
|
|
13684
13672
|
}
|
|
@@ -13703,14 +13691,12 @@ ${context2.fileChanges}`;
|
|
|
13703
13691
|
userPrompt: prompt,
|
|
13704
13692
|
schema: scoreRangeEvaluationSchema
|
|
13705
13693
|
});
|
|
13706
|
-
const { score, verdict,
|
|
13694
|
+
const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
|
|
13707
13695
|
return {
|
|
13708
13696
|
score,
|
|
13709
13697
|
verdict,
|
|
13710
|
-
|
|
13711
|
-
misses,
|
|
13698
|
+
assertions,
|
|
13712
13699
|
expectedAspectCount: rubrics.length,
|
|
13713
|
-
reasoning: data.overall_reasoning,
|
|
13714
13700
|
evaluatorRawRequest,
|
|
13715
13701
|
details,
|
|
13716
13702
|
tokenUsage
|
|
@@ -13722,10 +13708,8 @@ ${context2.fileChanges}`;
|
|
|
13722
13708
|
return {
|
|
13723
13709
|
score: 0,
|
|
13724
13710
|
verdict: "skip",
|
|
13725
|
-
|
|
13726
|
-
misses: [`Grader parse failure after 3 attempts: ${message}`],
|
|
13711
|
+
assertions: [{ text: `Grader parse failure after 3 attempts: ${message}`, passed: false }],
|
|
13727
13712
|
expectedAspectCount: rubrics.length,
|
|
13728
|
-
reasoning: `Grader parse failure after 3 attempts: ${message}`,
|
|
13729
13713
|
evaluatorRawRequest
|
|
13730
13714
|
};
|
|
13731
13715
|
}
|
|
@@ -13782,8 +13766,7 @@ ${context2.fileChanges}`;
|
|
|
13782
13766
|
return {
|
|
13783
13767
|
score: 0,
|
|
13784
13768
|
verdict: "fail",
|
|
13785
|
-
|
|
13786
|
-
misses: [`llm-grader built-in evaluation failed: ${message}`],
|
|
13769
|
+
assertions: [{ text: `llm-grader built-in evaluation failed: ${message}`, passed: false }],
|
|
13787
13770
|
expectedAspectCount: 1,
|
|
13788
13771
|
evaluatorRawRequest,
|
|
13789
13772
|
details: { mode: "built-in", error: message }
|
|
@@ -13833,8 +13816,9 @@ ${context2.fileChanges}`;
|
|
|
13833
13816
|
return {
|
|
13834
13817
|
score: 0,
|
|
13835
13818
|
verdict: "fail",
|
|
13836
|
-
|
|
13837
|
-
|
|
13819
|
+
assertions: [
|
|
13820
|
+
{ text: `llm-grader ${modeLabel} returned no assistant response`, passed: false }
|
|
13821
|
+
],
|
|
13838
13822
|
expectedAspectCount: 1,
|
|
13839
13823
|
evaluatorRawRequest,
|
|
13840
13824
|
details: { mode: modeLabel, grader_target: provider.targetName }
|
|
@@ -13852,8 +13836,9 @@ ${context2.fileChanges}`;
|
|
|
13852
13836
|
return {
|
|
13853
13837
|
score: 0,
|
|
13854
13838
|
verdict: "fail",
|
|
13855
|
-
|
|
13856
|
-
|
|
13839
|
+
assertions: [
|
|
13840
|
+
{ text: `llm-grader ${modeLabel} evaluation failed: ${message}`, passed: false }
|
|
13841
|
+
],
|
|
13857
13842
|
expectedAspectCount: 1,
|
|
13858
13843
|
evaluatorRawRequest,
|
|
13859
13844
|
details: {
|
|
@@ -13894,10 +13879,10 @@ ${context2.fileChanges}`;
|
|
|
13894
13879
|
buildAgentUserPrompt(context2) {
|
|
13895
13880
|
const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
|
|
13896
13881
|
const variables = {
|
|
13897
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
13898
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13899
13882
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
13900
|
-
[TEMPLATE_VARIABLES.
|
|
13883
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
13884
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
13885
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13901
13886
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
13902
13887
|
};
|
|
13903
13888
|
if (this.evaluatorTemplate) {
|
|
@@ -13950,10 +13935,10 @@ ${context2.fileChanges}`;
|
|
|
13950
13935
|
const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
|
|
13951
13936
|
if (this.evaluatorTemplate) {
|
|
13952
13937
|
const variables = {
|
|
13953
|
-
[TEMPLATE_VARIABLES.ANSWER]: context2.candidate.trim(),
|
|
13954
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13955
13938
|
[TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
|
|
13956
|
-
[TEMPLATE_VARIABLES.
|
|
13939
|
+
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
13940
|
+
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
|
|
13941
|
+
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
|
|
13957
13942
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
|
|
13958
13943
|
};
|
|
13959
13944
|
const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
|
|
@@ -14005,29 +13990,24 @@ ${outputSchema}`;
|
|
|
14005
13990
|
const parsed = parseJsonFromText(text);
|
|
14006
13991
|
if (rubrics && rubrics.length > 0) {
|
|
14007
13992
|
const data2 = rubricEvaluationSchema.parse(parsed);
|
|
14008
|
-
const { score: score2, verdict,
|
|
13993
|
+
const { score: score2, verdict, assertions: assertions2 } = calculateRubricScore(data2, rubrics);
|
|
14009
13994
|
return {
|
|
14010
13995
|
score: score2,
|
|
14011
13996
|
verdict,
|
|
14012
|
-
|
|
14013
|
-
misses: misses2,
|
|
13997
|
+
assertions: assertions2,
|
|
14014
13998
|
expectedAspectCount: rubrics.length,
|
|
14015
|
-
reasoning: data2.overall_reasoning,
|
|
14016
13999
|
evaluatorRawRequest,
|
|
14017
14000
|
details
|
|
14018
14001
|
};
|
|
14019
14002
|
}
|
|
14020
14003
|
const data = freeformEvaluationSchema.parse(parsed);
|
|
14021
14004
|
const score = clampScore(data.score);
|
|
14022
|
-
const
|
|
14023
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
14005
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
14024
14006
|
return {
|
|
14025
14007
|
score,
|
|
14026
14008
|
verdict: scoreToVerdict(score),
|
|
14027
|
-
|
|
14028
|
-
|
|
14029
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
14030
|
-
reasoning: data.reasoning,
|
|
14009
|
+
assertions,
|
|
14010
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
14031
14011
|
evaluatorRawRequest,
|
|
14032
14012
|
details
|
|
14033
14013
|
};
|
|
@@ -14035,8 +14015,12 @@ ${outputSchema}`;
|
|
|
14035
14015
|
return {
|
|
14036
14016
|
score: 0,
|
|
14037
14017
|
verdict: "fail",
|
|
14038
|
-
|
|
14039
|
-
|
|
14018
|
+
assertions: [
|
|
14019
|
+
{
|
|
14020
|
+
text: "Failed to parse llm-grader agent response as valid evaluation JSON",
|
|
14021
|
+
passed: false
|
|
14022
|
+
}
|
|
14023
|
+
],
|
|
14040
14024
|
expectedAspectCount: 1,
|
|
14041
14025
|
evaluatorRawRequest,
|
|
14042
14026
|
details
|
|
@@ -14165,9 +14149,13 @@ function buildOutputSchema() {
|
|
|
14165
14149
|
"",
|
|
14166
14150
|
"{",
|
|
14167
14151
|
' "score": <number between 0.0 and 1.0>,',
|
|
14168
|
-
' "
|
|
14169
|
-
|
|
14170
|
-
'
|
|
14152
|
+
' "assertions": [',
|
|
14153
|
+
" {",
|
|
14154
|
+
' "text": "<brief description of what was checked>",',
|
|
14155
|
+
' "passed": <boolean>,',
|
|
14156
|
+
' "evidence": "<concise evidence, 1-2 sentences, optional>"',
|
|
14157
|
+
" }",
|
|
14158
|
+
" ]",
|
|
14171
14159
|
"}"
|
|
14172
14160
|
].join("\n");
|
|
14173
14161
|
}
|
|
@@ -14192,8 +14180,7 @@ function substituteVariables(template, variables) {
|
|
|
14192
14180
|
}
|
|
14193
14181
|
function calculateRubricScore(result, rubrics) {
|
|
14194
14182
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
14195
|
-
const
|
|
14196
|
-
const misses = [];
|
|
14183
|
+
const assertions = [];
|
|
14197
14184
|
let totalWeight = 0;
|
|
14198
14185
|
let earnedWeight = 0;
|
|
14199
14186
|
let failedRequired = false;
|
|
@@ -14203,19 +14190,20 @@ function calculateRubricScore(result, rubrics) {
|
|
|
14203
14190
|
continue;
|
|
14204
14191
|
}
|
|
14205
14192
|
totalWeight += rubric.weight;
|
|
14193
|
+
assertions.push({
|
|
14194
|
+
text: `[${rubric.id}] ${rubric.outcome}`,
|
|
14195
|
+
passed: check.satisfied,
|
|
14196
|
+
evidence: check.reasoning
|
|
14197
|
+
});
|
|
14206
14198
|
if (check.satisfied) {
|
|
14207
14199
|
earnedWeight += rubric.weight;
|
|
14208
|
-
|
|
14209
|
-
|
|
14210
|
-
misses.push(`[${rubric.id}] ${rubric.outcome}: ${check.reasoning}`);
|
|
14211
|
-
if (rubric.required) {
|
|
14212
|
-
failedRequired = true;
|
|
14213
|
-
}
|
|
14200
|
+
} else if (rubric.required) {
|
|
14201
|
+
failedRequired = true;
|
|
14214
14202
|
}
|
|
14215
14203
|
}
|
|
14216
14204
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
14217
14205
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
14218
|
-
return { score, verdict,
|
|
14206
|
+
return { score, verdict, assertions };
|
|
14219
14207
|
}
|
|
14220
14208
|
function buildScoreRangeOutputSchema() {
|
|
14221
14209
|
return `You are an expert evaluator. Score the candidate answer on each criterion.
|
|
@@ -14235,8 +14223,7 @@ Important: The "score" must be an integer from 0 to 10 that falls within one of
|
|
|
14235
14223
|
}
|
|
14236
14224
|
function calculateScoreRangeResult(result, rubrics) {
|
|
14237
14225
|
const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
|
|
14238
|
-
const
|
|
14239
|
-
const misses = [];
|
|
14226
|
+
const assertions = [];
|
|
14240
14227
|
const rawScores = {};
|
|
14241
14228
|
let totalWeight = 0;
|
|
14242
14229
|
let weightedScoreSum = 0;
|
|
@@ -14262,24 +14249,22 @@ function calculateScoreRangeResult(result, rubrics) {
|
|
|
14262
14249
|
);
|
|
14263
14250
|
const rangeDescription = matchingRange?.outcome ?? "";
|
|
14264
14251
|
const criterionLabel = rubric.outcome ?? rubric.id;
|
|
14265
|
-
const
|
|
14266
|
-
const scoreInfo = `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})${reasoningText}`;
|
|
14252
|
+
const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
|
|
14267
14253
|
if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
|
|
14268
14254
|
failedRequired = true;
|
|
14269
|
-
misses.push(scoreInfo);
|
|
14270
|
-
} else if (rawScore >= 7) {
|
|
14271
|
-
hits.push(scoreInfo);
|
|
14272
|
-
} else {
|
|
14273
|
-
misses.push(scoreInfo);
|
|
14274
14255
|
}
|
|
14256
|
+
assertions.push({
|
|
14257
|
+
text: `[${rubric.id}] ${criterionLabel} - Score: ${rawScore}/10 (${rangeDescription})`,
|
|
14258
|
+
passed,
|
|
14259
|
+
evidence: check.reasoning
|
|
14260
|
+
});
|
|
14275
14261
|
}
|
|
14276
14262
|
const score = totalWeight > 0 ? Math.min(1, Math.max(0, weightedScoreSum / totalWeight)) : 0;
|
|
14277
14263
|
const verdict = failedRequired ? "fail" : scoreToVerdict(score);
|
|
14278
14264
|
return {
|
|
14279
14265
|
score,
|
|
14280
14266
|
verdict,
|
|
14281
|
-
|
|
14282
|
-
misses,
|
|
14267
|
+
assertions,
|
|
14283
14268
|
details: {
|
|
14284
14269
|
raw_scores: rawScores,
|
|
14285
14270
|
normalization: "score / 10",
|
|
@@ -14455,9 +14440,7 @@ var CompositeEvaluator = class {
|
|
|
14455
14440
|
let totalWeight = 0;
|
|
14456
14441
|
let weightedSum = 0;
|
|
14457
14442
|
let evaluatedCount = 0;
|
|
14458
|
-
const
|
|
14459
|
-
const allMisses = [];
|
|
14460
|
-
const reasoningParts = [];
|
|
14443
|
+
const allAssertions = [];
|
|
14461
14444
|
const scores = [];
|
|
14462
14445
|
for (const member of results) {
|
|
14463
14446
|
const weight = weights?.[member.id] ?? 1;
|
|
@@ -14467,9 +14450,7 @@ var CompositeEvaluator = class {
|
|
|
14467
14450
|
score: member.result.score,
|
|
14468
14451
|
weight,
|
|
14469
14452
|
verdict: member.result.verdict,
|
|
14470
|
-
|
|
14471
|
-
misses: [...member.result.misses],
|
|
14472
|
-
reasoning: member.result.reasoning,
|
|
14453
|
+
assertions: [...member.result.assertions],
|
|
14473
14454
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
14474
14455
|
scores: member.result.scores,
|
|
14475
14456
|
details: member.result.details,
|
|
@@ -14481,20 +14462,16 @@ var CompositeEvaluator = class {
|
|
|
14481
14462
|
evaluatedCount++;
|
|
14482
14463
|
totalWeight += weight;
|
|
14483
14464
|
weightedSum += member.result.score * weight;
|
|
14484
|
-
|
|
14485
|
-
|
|
14486
|
-
|
|
14487
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
14488
|
-
}
|
|
14465
|
+
allAssertions.push(
|
|
14466
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
14467
|
+
);
|
|
14489
14468
|
}
|
|
14490
14469
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
14491
14470
|
return {
|
|
14492
14471
|
score: 0,
|
|
14493
14472
|
verdict: "skip",
|
|
14494
|
-
|
|
14495
|
-
misses: [],
|
|
14473
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
14496
14474
|
expectedAspectCount: 1,
|
|
14497
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
14498
14475
|
evaluatorRawRequest: {
|
|
14499
14476
|
aggregator: "weighted_average",
|
|
14500
14477
|
...weights ? { weights } : {}
|
|
@@ -14506,10 +14483,8 @@ var CompositeEvaluator = class {
|
|
|
14506
14483
|
return {
|
|
14507
14484
|
score: clampScore(finalScore),
|
|
14508
14485
|
verdict: scoreToVerdict(finalScore),
|
|
14509
|
-
|
|
14510
|
-
|
|
14511
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
14512
|
-
reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
|
|
14486
|
+
assertions: allAssertions,
|
|
14487
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
14513
14488
|
evaluatorRawRequest: {
|
|
14514
14489
|
aggregator: "weighted_average",
|
|
14515
14490
|
...weights ? { weights } : {}
|
|
@@ -14519,11 +14494,8 @@ var CompositeEvaluator = class {
|
|
|
14519
14494
|
}
|
|
14520
14495
|
runThreshold(results, threshold) {
|
|
14521
14496
|
const scores = [];
|
|
14522
|
-
const
|
|
14523
|
-
const allMisses = [];
|
|
14524
|
-
const reasoningParts = [];
|
|
14497
|
+
const allAssertions = [];
|
|
14525
14498
|
let passingCount = 0;
|
|
14526
|
-
let borderlineCount = 0;
|
|
14527
14499
|
let evaluatedCount = 0;
|
|
14528
14500
|
for (const member of results) {
|
|
14529
14501
|
scores.push({
|
|
@@ -14531,9 +14503,7 @@ var CompositeEvaluator = class {
|
|
|
14531
14503
|
type: member.type,
|
|
14532
14504
|
score: member.result.score,
|
|
14533
14505
|
verdict: member.result.verdict,
|
|
14534
|
-
|
|
14535
|
-
misses: [...member.result.misses],
|
|
14536
|
-
reasoning: member.result.reasoning,
|
|
14506
|
+
assertions: [...member.result.assertions],
|
|
14537
14507
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
14538
14508
|
scores: member.result.scores,
|
|
14539
14509
|
details: member.result.details,
|
|
@@ -14546,24 +14516,17 @@ var CompositeEvaluator = class {
|
|
|
14546
14516
|
const isPassing = member.result.verdict === "pass" || member.result.verdict === "borderline";
|
|
14547
14517
|
if (isPassing) {
|
|
14548
14518
|
passingCount++;
|
|
14549
|
-
if (member.result.verdict === "borderline") {
|
|
14550
|
-
borderlineCount++;
|
|
14551
|
-
}
|
|
14552
|
-
}
|
|
14553
|
-
allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
|
|
14554
|
-
allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
|
|
14555
|
-
if (member.result.reasoning) {
|
|
14556
|
-
reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
|
|
14557
14519
|
}
|
|
14520
|
+
allAssertions.push(
|
|
14521
|
+
...member.result.assertions.map((a) => ({ ...a, text: `[${member.id}] ${a.text}` }))
|
|
14522
|
+
);
|
|
14558
14523
|
}
|
|
14559
14524
|
if (evaluatedCount === 0 && results.length > 0) {
|
|
14560
14525
|
return {
|
|
14561
14526
|
score: 0,
|
|
14562
14527
|
verdict: "skip",
|
|
14563
|
-
|
|
14564
|
-
misses: [],
|
|
14528
|
+
assertions: [{ text: "All evaluators skipped (infrastructure failure)", passed: false }],
|
|
14565
14529
|
expectedAspectCount: 1,
|
|
14566
|
-
reasoning: "All evaluators skipped (infrastructure failure)",
|
|
14567
14530
|
evaluatorRawRequest: {
|
|
14568
14531
|
aggregator: "threshold",
|
|
14569
14532
|
threshold
|
|
@@ -14574,19 +14537,15 @@ var CompositeEvaluator = class {
|
|
|
14574
14537
|
const totalCount = evaluatedCount;
|
|
14575
14538
|
const score = totalCount > 0 ? passingCount / totalCount : 0;
|
|
14576
14539
|
const pass = score >= threshold;
|
|
14577
|
-
|
|
14578
|
-
|
|
14579
|
-
|
|
14580
|
-
|
|
14581
|
-
`${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`
|
|
14582
|
-
);
|
|
14540
|
+
allAssertions.unshift({
|
|
14541
|
+
text: `${passingCount}/${totalCount} evaluators passed (threshold: ${threshold})`,
|
|
14542
|
+
passed: pass
|
|
14543
|
+
});
|
|
14583
14544
|
return {
|
|
14584
14545
|
score: clampScore(score),
|
|
14585
14546
|
verdict: pass ? "pass" : "fail",
|
|
14586
|
-
|
|
14587
|
-
|
|
14588
|
-
expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
|
|
14589
|
-
reasoning: reasoningParts.join("; "),
|
|
14547
|
+
assertions: allAssertions,
|
|
14548
|
+
expectedAspectCount: allAssertions.length || 1,
|
|
14590
14549
|
evaluatorRawRequest: {
|
|
14591
14550
|
aggregator: "threshold",
|
|
14592
14551
|
threshold
|
|
@@ -14603,9 +14562,7 @@ var CompositeEvaluator = class {
|
|
|
14603
14562
|
score: member.result.score,
|
|
14604
14563
|
weight: weights?.[member.id] ?? 1,
|
|
14605
14564
|
verdict: member.result.verdict,
|
|
14606
|
-
|
|
14607
|
-
misses: [...member.result.misses],
|
|
14608
|
-
reasoning: member.result.reasoning,
|
|
14565
|
+
assertions: [...member.result.assertions],
|
|
14609
14566
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
14610
14567
|
scores: member.result.scores,
|
|
14611
14568
|
details: member.result.details
|
|
@@ -14614,17 +14571,19 @@ var CompositeEvaluator = class {
|
|
|
14614
14571
|
const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
|
|
14615
14572
|
const parsed = parseJsonSafe(stdout);
|
|
14616
14573
|
const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
|
|
14617
|
-
const
|
|
14618
|
-
|
|
14619
|
-
|
|
14574
|
+
const assertions = Array.isArray(parsed?.assertions) ? parsed.assertions.filter(
|
|
14575
|
+
(a) => typeof a === "object" && a !== null && typeof a.text === "string"
|
|
14576
|
+
).map((a) => ({
|
|
14577
|
+
text: String(a.text),
|
|
14578
|
+
passed: Boolean(a.passed),
|
|
14579
|
+
...typeof a.evidence === "string" ? { evidence: a.evidence } : {}
|
|
14580
|
+
})) : [];
|
|
14620
14581
|
const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
|
|
14621
14582
|
return {
|
|
14622
14583
|
score,
|
|
14623
14584
|
verdict,
|
|
14624
|
-
|
|
14625
|
-
|
|
14626
|
-
expectedAspectCount: hits.length + misses.length || 1,
|
|
14627
|
-
reasoning,
|
|
14585
|
+
assertions,
|
|
14586
|
+
expectedAspectCount: assertions.length || 1,
|
|
14628
14587
|
evaluatorRawRequest: {
|
|
14629
14588
|
aggregator: "code-grader",
|
|
14630
14589
|
script: scriptPath
|
|
@@ -14636,10 +14595,8 @@ var CompositeEvaluator = class {
|
|
|
14636
14595
|
return {
|
|
14637
14596
|
score: 0,
|
|
14638
14597
|
verdict: "fail",
|
|
14639
|
-
|
|
14640
|
-
misses: [`Code aggregator failed: ${message}`],
|
|
14598
|
+
assertions: [{ text: `Code aggregator failed: ${message}`, passed: false }],
|
|
14641
14599
|
expectedAspectCount: 1,
|
|
14642
|
-
reasoning: message,
|
|
14643
14600
|
evaluatorRawRequest: {
|
|
14644
14601
|
aggregator: "code-grader",
|
|
14645
14602
|
script: scriptPath,
|
|
@@ -14661,9 +14618,7 @@ var CompositeEvaluator = class {
|
|
|
14661
14618
|
type: member.type,
|
|
14662
14619
|
score: member.result.score,
|
|
14663
14620
|
verdict: member.result.verdict,
|
|
14664
|
-
|
|
14665
|
-
misses: [...member.result.misses],
|
|
14666
|
-
reasoning: member.result.reasoning,
|
|
14621
|
+
assertions: [...member.result.assertions],
|
|
14667
14622
|
evaluatorRawRequest: member.result.evaluatorRawRequest,
|
|
14668
14623
|
scores: member.result.scores,
|
|
14669
14624
|
details: member.result.details
|
|
@@ -14687,16 +14642,12 @@ var CompositeEvaluator = class {
|
|
|
14687
14642
|
});
|
|
14688
14643
|
const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
|
|
14689
14644
|
const score2 = clampScore(data2.score);
|
|
14690
|
-
const
|
|
14691
|
-
const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
14692
|
-
const reasoning2 = data2.reasoning;
|
|
14645
|
+
const assertions2 = Array.isArray(data2.assertions) ? data2.assertions.slice(0, 8) : [];
|
|
14693
14646
|
return {
|
|
14694
14647
|
score: score2,
|
|
14695
14648
|
verdict: scoreToVerdict(score2),
|
|
14696
|
-
|
|
14697
|
-
|
|
14698
|
-
expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
|
|
14699
|
-
reasoning: reasoning2,
|
|
14649
|
+
assertions: assertions2,
|
|
14650
|
+
expectedAspectCount: Math.max(assertions2.length, 1),
|
|
14700
14651
|
evaluatorRawRequest,
|
|
14701
14652
|
scores
|
|
14702
14653
|
};
|
|
@@ -14711,16 +14662,12 @@ var CompositeEvaluator = class {
|
|
|
14711
14662
|
parseJsonFromText(extractLastAssistantContent2(response.output))
|
|
14712
14663
|
);
|
|
14713
14664
|
const score = clampScore(data.score);
|
|
14714
|
-
const
|
|
14715
|
-
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
14716
|
-
const reasoning = data.reasoning;
|
|
14665
|
+
const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
|
|
14717
14666
|
return {
|
|
14718
14667
|
score,
|
|
14719
14668
|
verdict: scoreToVerdict(score),
|
|
14720
|
-
|
|
14721
|
-
|
|
14722
|
-
expectedAspectCount: Math.max(hits.length + misses.length, 1),
|
|
14723
|
-
reasoning,
|
|
14669
|
+
assertions,
|
|
14670
|
+
expectedAspectCount: Math.max(assertions.length, 1),
|
|
14724
14671
|
evaluatorRawRequest,
|
|
14725
14672
|
scores
|
|
14726
14673
|
};
|
|
@@ -14728,8 +14675,7 @@ var CompositeEvaluator = class {
|
|
|
14728
14675
|
return {
|
|
14729
14676
|
score: 0,
|
|
14730
14677
|
verdict: "fail",
|
|
14731
|
-
|
|
14732
|
-
misses: [],
|
|
14678
|
+
assertions: [{ text: "LLM aggregator failed", passed: false }],
|
|
14733
14679
|
expectedAspectCount: 1,
|
|
14734
14680
|
evaluatorRawRequest,
|
|
14735
14681
|
scores
|
|
@@ -14752,10 +14698,8 @@ var CostEvaluator = class {
|
|
|
14752
14698
|
return {
|
|
14753
14699
|
score: 0,
|
|
14754
14700
|
verdict: "fail",
|
|
14755
|
-
|
|
14756
|
-
misses: ["No cost data available in trace"],
|
|
14701
|
+
assertions: [{ text: "No cost data available in trace", passed: false }],
|
|
14757
14702
|
expectedAspectCount: 1,
|
|
14758
|
-
reasoning: "Execution cost not reported by provider",
|
|
14759
14703
|
evaluatorRawRequest: {
|
|
14760
14704
|
type: "cost",
|
|
14761
14705
|
budget,
|
|
@@ -14769,10 +14713,10 @@ var CostEvaluator = class {
|
|
|
14769
14713
|
return {
|
|
14770
14714
|
score,
|
|
14771
14715
|
verdict: passed ? "pass" : "fail",
|
|
14772
|
-
|
|
14773
|
-
|
|
14716
|
+
assertions: [
|
|
14717
|
+
passed ? { text: `Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`, passed: true } : { text: `Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`, passed: false }
|
|
14718
|
+
],
|
|
14774
14719
|
expectedAspectCount: 1,
|
|
14775
|
-
reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
|
|
14776
14720
|
evaluatorRawRequest: {
|
|
14777
14721
|
type: "cost",
|
|
14778
14722
|
budget,
|
|
@@ -14805,10 +14749,8 @@ var ExecutionMetricsEvaluator = class {
|
|
|
14805
14749
|
return {
|
|
14806
14750
|
score: 0,
|
|
14807
14751
|
verdict: "fail",
|
|
14808
|
-
|
|
14809
|
-
misses: ["No trace summary available"],
|
|
14752
|
+
assertions: [{ text: "No trace summary available", passed: false }],
|
|
14810
14753
|
expectedAspectCount: 1,
|
|
14811
|
-
reasoning: "Execution metrics not available - no trace summary provided",
|
|
14812
14754
|
evaluatorRawRequest: {
|
|
14813
14755
|
type: "execution-metrics",
|
|
14814
14756
|
config: this.extractConfiguredThresholds(),
|
|
@@ -14817,116 +14759,114 @@ var ExecutionMetricsEvaluator = class {
|
|
|
14817
14759
|
};
|
|
14818
14760
|
}
|
|
14819
14761
|
const narrowedTrace = trace2;
|
|
14820
|
-
const
|
|
14821
|
-
const misses = [];
|
|
14762
|
+
const assertions = [];
|
|
14822
14763
|
const actualMetrics = {};
|
|
14823
14764
|
if (max_tool_calls !== void 0 && narrowedTrace) {
|
|
14824
14765
|
const toolCalls = narrowedTrace.eventCount;
|
|
14825
14766
|
actualMetrics.tool_calls = toolCalls;
|
|
14826
14767
|
if (toolCalls <= max_tool_calls) {
|
|
14827
|
-
|
|
14768
|
+
assertions.push({ text: `Tool calls ${toolCalls} <= ${max_tool_calls} max`, passed: true });
|
|
14828
14769
|
} else {
|
|
14829
|
-
|
|
14770
|
+
assertions.push({ text: `Tool calls ${toolCalls} > ${max_tool_calls} max`, passed: false });
|
|
14830
14771
|
}
|
|
14831
14772
|
}
|
|
14832
14773
|
if (max_llm_calls !== void 0 && narrowedTrace) {
|
|
14833
14774
|
const llmCalls = narrowedTrace.llmCallCount;
|
|
14834
14775
|
if (llmCalls === void 0) {
|
|
14835
|
-
|
|
14776
|
+
assertions.push({ text: "LLM call count data not available", passed: false });
|
|
14836
14777
|
} else {
|
|
14837
14778
|
actualMetrics.llm_calls = llmCalls;
|
|
14838
14779
|
if (llmCalls <= max_llm_calls) {
|
|
14839
|
-
|
|
14780
|
+
assertions.push({ text: `LLM calls ${llmCalls} <= ${max_llm_calls} max`, passed: true });
|
|
14840
14781
|
} else {
|
|
14841
|
-
|
|
14782
|
+
assertions.push({ text: `LLM calls ${llmCalls} > ${max_llm_calls} max`, passed: false });
|
|
14842
14783
|
}
|
|
14843
14784
|
}
|
|
14844
14785
|
}
|
|
14845
14786
|
if (max_tokens !== void 0) {
|
|
14846
14787
|
if (!tokenUsage) {
|
|
14847
|
-
|
|
14788
|
+
assertions.push({ text: "Token usage data not available", passed: false });
|
|
14848
14789
|
} else {
|
|
14849
14790
|
const totalTokens = tokenUsage.input + tokenUsage.output;
|
|
14850
14791
|
actualMetrics.tokens = totalTokens;
|
|
14851
14792
|
if (totalTokens <= max_tokens) {
|
|
14852
|
-
|
|
14793
|
+
assertions.push({
|
|
14794
|
+
text: `Total tokens ${totalTokens} <= ${max_tokens} max`,
|
|
14795
|
+
passed: true
|
|
14796
|
+
});
|
|
14853
14797
|
} else {
|
|
14854
|
-
|
|
14798
|
+
assertions.push({
|
|
14799
|
+
text: `Total tokens ${totalTokens} > ${max_tokens} max`,
|
|
14800
|
+
passed: false
|
|
14801
|
+
});
|
|
14855
14802
|
}
|
|
14856
14803
|
}
|
|
14857
14804
|
}
|
|
14858
14805
|
if (max_cost_usd !== void 0) {
|
|
14859
14806
|
if (costUsd === void 0) {
|
|
14860
|
-
|
|
14807
|
+
assertions.push({ text: "Cost data not available", passed: false });
|
|
14861
14808
|
} else {
|
|
14862
14809
|
actualMetrics.cost_usd = costUsd;
|
|
14863
14810
|
const formatCost = (n) => `$${n.toFixed(4)}`;
|
|
14864
14811
|
if (costUsd <= max_cost_usd) {
|
|
14865
|
-
|
|
14812
|
+
assertions.push({
|
|
14813
|
+
text: `Cost ${formatCost(costUsd)} <= ${formatCost(max_cost_usd)} max`,
|
|
14814
|
+
passed: true
|
|
14815
|
+
});
|
|
14866
14816
|
} else {
|
|
14867
|
-
|
|
14817
|
+
assertions.push({
|
|
14818
|
+
text: `Cost ${formatCost(costUsd)} > ${formatCost(max_cost_usd)} max`,
|
|
14819
|
+
passed: false
|
|
14820
|
+
});
|
|
14868
14821
|
}
|
|
14869
14822
|
}
|
|
14870
14823
|
}
|
|
14871
14824
|
if (max_duration_ms !== void 0) {
|
|
14872
14825
|
if (durationMs === void 0) {
|
|
14873
|
-
|
|
14826
|
+
assertions.push({ text: "Duration data not available", passed: false });
|
|
14874
14827
|
} else {
|
|
14875
14828
|
actualMetrics.duration_ms = durationMs;
|
|
14876
14829
|
if (durationMs <= max_duration_ms) {
|
|
14877
|
-
|
|
14830
|
+
assertions.push({
|
|
14831
|
+
text: `Duration ${durationMs}ms <= ${max_duration_ms}ms max`,
|
|
14832
|
+
passed: true
|
|
14833
|
+
});
|
|
14878
14834
|
} else {
|
|
14879
|
-
|
|
14835
|
+
assertions.push({
|
|
14836
|
+
text: `Duration ${durationMs}ms > ${max_duration_ms}ms max`,
|
|
14837
|
+
passed: false
|
|
14838
|
+
});
|
|
14880
14839
|
}
|
|
14881
14840
|
}
|
|
14882
14841
|
}
|
|
14883
14842
|
if (target_exploration_ratio !== void 0 && narrowedTrace) {
|
|
14884
14843
|
const ratio = explorationRatio(narrowedTrace);
|
|
14885
14844
|
if (ratio === void 0) {
|
|
14886
|
-
|
|
14845
|
+
assertions.push({ text: "Exploration ratio not available (no tool calls)", passed: false });
|
|
14887
14846
|
} else {
|
|
14888
14847
|
actualMetrics.exploration_ratio = ratio;
|
|
14889
14848
|
const diff = Math.abs(ratio - target_exploration_ratio);
|
|
14890
14849
|
if (diff <= exploration_tolerance) {
|
|
14891
|
-
|
|
14892
|
-
`Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}
|
|
14893
|
-
|
|
14850
|
+
assertions.push({
|
|
14851
|
+
text: `Exploration ratio ${ratio.toFixed(2)} within tolerance of target ${target_exploration_ratio}`,
|
|
14852
|
+
passed: true
|
|
14853
|
+
});
|
|
14894
14854
|
} else {
|
|
14895
|
-
|
|
14896
|
-
`Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})
|
|
14897
|
-
|
|
14855
|
+
assertions.push({
|
|
14856
|
+
text: `Exploration ratio ${ratio.toFixed(2)} outside tolerance of target ${target_exploration_ratio} (diff: ${diff.toFixed(2)}, tolerance: ${exploration_tolerance})`,
|
|
14857
|
+
passed: false
|
|
14858
|
+
});
|
|
14898
14859
|
}
|
|
14899
14860
|
}
|
|
14900
14861
|
}
|
|
14901
|
-
const totalChecks =
|
|
14902
|
-
const
|
|
14903
|
-
const
|
|
14904
|
-
if (actualMetrics.tool_calls !== void 0) {
|
|
14905
|
-
reasoningParts.push(`tool_calls=${actualMetrics.tool_calls}`);
|
|
14906
|
-
}
|
|
14907
|
-
if (actualMetrics.llm_calls !== void 0) {
|
|
14908
|
-
reasoningParts.push(`llm_calls=${actualMetrics.llm_calls}`);
|
|
14909
|
-
}
|
|
14910
|
-
if (actualMetrics.tokens !== void 0) {
|
|
14911
|
-
reasoningParts.push(`tokens=${actualMetrics.tokens}`);
|
|
14912
|
-
}
|
|
14913
|
-
if (actualMetrics.cost_usd !== void 0) {
|
|
14914
|
-
reasoningParts.push(`cost=$${actualMetrics.cost_usd.toFixed(4)}`);
|
|
14915
|
-
}
|
|
14916
|
-
if (actualMetrics.duration_ms !== void 0) {
|
|
14917
|
-
reasoningParts.push(`duration=${actualMetrics.duration_ms}ms`);
|
|
14918
|
-
}
|
|
14919
|
-
if (actualMetrics.exploration_ratio !== void 0) {
|
|
14920
|
-
reasoningParts.push(`exploration_ratio=${actualMetrics.exploration_ratio.toFixed(2)}`);
|
|
14921
|
-
}
|
|
14922
|
-
const reasoning = reasoningParts.length > 0 ? `execution-metrics ${reasoningParts.join(", ")}` : "No metrics evaluated";
|
|
14862
|
+
const totalChecks = assertions.length;
|
|
14863
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
14864
|
+
const score = totalChecks > 0 ? passedCount / totalChecks : 0;
|
|
14923
14865
|
return {
|
|
14924
14866
|
score,
|
|
14925
14867
|
verdict: scoreToVerdict(score),
|
|
14926
|
-
|
|
14927
|
-
misses,
|
|
14868
|
+
assertions,
|
|
14928
14869
|
expectedAspectCount: totalChecks || 1,
|
|
14929
|
-
reasoning,
|
|
14930
14870
|
evaluatorRawRequest: {
|
|
14931
14871
|
type: "execution-metrics",
|
|
14932
14872
|
config: this.extractConfiguredThresholds(),
|
|
@@ -15030,10 +14970,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
15030
14970
|
return {
|
|
15031
14971
|
score: 0,
|
|
15032
14972
|
verdict: "fail",
|
|
15033
|
-
|
|
15034
|
-
|
|
15035
|
-
expectedAspectCount: this.config.fields.length,
|
|
15036
|
-
reasoning: "Candidate answer is not valid JSON"
|
|
14973
|
+
assertions: [{ text: "Failed to parse candidate answer as JSON", passed: false }],
|
|
14974
|
+
expectedAspectCount: this.config.fields.length
|
|
15037
14975
|
};
|
|
15038
14976
|
}
|
|
15039
14977
|
const expectedData = this.extractExpectedData(evalCase.expected_output);
|
|
@@ -15041,10 +14979,8 @@ var FieldAccuracyEvaluator = class {
|
|
|
15041
14979
|
return {
|
|
15042
14980
|
score: 0,
|
|
15043
14981
|
verdict: "fail",
|
|
15044
|
-
|
|
15045
|
-
|
|
15046
|
-
expectedAspectCount: this.config.fields.length,
|
|
15047
|
-
reasoning: "Could not extract expected data from expected_output"
|
|
14982
|
+
assertions: [{ text: "No expected data found in expected_output", passed: false }],
|
|
14983
|
+
expectedAspectCount: this.config.fields.length
|
|
15048
14984
|
};
|
|
15049
14985
|
}
|
|
15050
14986
|
const fieldResults = [];
|
|
@@ -15262,18 +15198,14 @@ var FieldAccuracyEvaluator = class {
|
|
|
15262
15198
|
*/
|
|
15263
15199
|
aggregateResults(results) {
|
|
15264
15200
|
const aggregation = this.config.aggregation ?? "weighted_average";
|
|
15265
|
-
const
|
|
15266
|
-
const misses = [];
|
|
15201
|
+
const assertions = [];
|
|
15267
15202
|
for (const result of results) {
|
|
15268
|
-
|
|
15269
|
-
hits.push(result.message);
|
|
15270
|
-
} else {
|
|
15271
|
-
misses.push(result.message);
|
|
15272
|
-
}
|
|
15203
|
+
assertions.push({ text: result.message, passed: result.hit });
|
|
15273
15204
|
}
|
|
15274
15205
|
let score;
|
|
15275
15206
|
if (aggregation === "all_or_nothing") {
|
|
15276
|
-
|
|
15207
|
+
const hasFailed = assertions.some((a) => !a.passed);
|
|
15208
|
+
score = hasFailed ? 0 : 1;
|
|
15277
15209
|
} else {
|
|
15278
15210
|
const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
|
|
15279
15211
|
if (totalWeight === 0) {
|
|
@@ -15283,15 +15215,11 @@ var FieldAccuracyEvaluator = class {
|
|
|
15283
15215
|
score = weightedSum / totalWeight;
|
|
15284
15216
|
}
|
|
15285
15217
|
}
|
|
15286
|
-
const reasoning = `${hits.length}/${results.length} fields matched`;
|
|
15287
15218
|
return {
|
|
15288
15219
|
score: clampScore(score),
|
|
15289
15220
|
verdict: scoreToVerdict(score),
|
|
15290
|
-
|
|
15291
|
-
|
|
15292
|
-
misses: misses.slice(0, 4),
|
|
15293
|
-
expectedAspectCount: results.length,
|
|
15294
|
-
reasoning
|
|
15221
|
+
assertions,
|
|
15222
|
+
expectedAspectCount: results.length
|
|
15295
15223
|
};
|
|
15296
15224
|
}
|
|
15297
15225
|
};
|
|
@@ -15400,10 +15328,8 @@ var LatencyEvaluator = class {
|
|
|
15400
15328
|
return {
|
|
15401
15329
|
score: 0,
|
|
15402
15330
|
verdict: "fail",
|
|
15403
|
-
|
|
15404
|
-
misses: ["No duration data available in trace"],
|
|
15331
|
+
assertions: [{ text: "No duration data available in trace", passed: false }],
|
|
15405
15332
|
expectedAspectCount: 1,
|
|
15406
|
-
reasoning: "Execution duration not reported by provider",
|
|
15407
15333
|
evaluatorRawRequest: {
|
|
15408
15334
|
type: "latency",
|
|
15409
15335
|
threshold,
|
|
@@ -15416,10 +15342,10 @@ var LatencyEvaluator = class {
|
|
|
15416
15342
|
return {
|
|
15417
15343
|
score,
|
|
15418
15344
|
verdict: passed ? "pass" : "fail",
|
|
15419
|
-
|
|
15420
|
-
|
|
15345
|
+
assertions: [
|
|
15346
|
+
passed ? { text: `Duration ${durationMs}ms <= ${threshold}ms threshold`, passed: true } : { text: `Duration ${durationMs}ms > ${threshold}ms threshold`, passed: false }
|
|
15347
|
+
],
|
|
15421
15348
|
expectedAspectCount: 1,
|
|
15422
|
-
reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
|
|
15423
15349
|
evaluatorRawRequest: {
|
|
15424
15350
|
type: "latency",
|
|
15425
15351
|
threshold,
|
|
@@ -15440,7 +15366,10 @@ var COPILOT_MATCHER = {
|
|
|
15440
15366
|
skillTools: ["Skill", "skill"],
|
|
15441
15367
|
skillInputField: "skill",
|
|
15442
15368
|
readTools: ["Read File", "readFile", "Read", "readTextFile"],
|
|
15443
|
-
readInputField: "file_path"
|
|
15369
|
+
readInputField: "file_path",
|
|
15370
|
+
skillToolPrefixes: ["Using skill: "],
|
|
15371
|
+
readToolPrefixes: ["Viewing "],
|
|
15372
|
+
readInputFields: ["file_path", "path"]
|
|
15444
15373
|
};
|
|
15445
15374
|
var PROVIDER_TOOL_SEMANTICS = {
|
|
15446
15375
|
claude: CLAUDE_MATCHER,
|
|
@@ -15482,12 +15411,22 @@ var SkillTriggerEvaluator = class {
|
|
|
15482
15411
|
triggered = true;
|
|
15483
15412
|
evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
|
|
15484
15413
|
}
|
|
15414
|
+
} else if (matcher.skillToolPrefixes?.some(
|
|
15415
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
15416
|
+
)) {
|
|
15417
|
+
triggered = true;
|
|
15418
|
+
evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
|
|
15485
15419
|
} else if (matcher.readTools.includes(firstTool.tool)) {
|
|
15486
|
-
const filePath =
|
|
15420
|
+
const filePath = this.readPathFromInput(input, matcher);
|
|
15487
15421
|
if (filePath.includes(skillName)) {
|
|
15488
15422
|
triggered = true;
|
|
15489
15423
|
evidence = `Read tool loaded skill file: ${filePath}`;
|
|
15490
15424
|
}
|
|
15425
|
+
} else if (matcher.readToolPrefixes?.some(
|
|
15426
|
+
(prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName)
|
|
15427
|
+
)) {
|
|
15428
|
+
triggered = true;
|
|
15429
|
+
evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
|
|
15491
15430
|
}
|
|
15492
15431
|
}
|
|
15493
15432
|
const pass = triggered === shouldTrigger;
|
|
@@ -15495,25 +15434,37 @@ var SkillTriggerEvaluator = class {
|
|
|
15495
15434
|
return {
|
|
15496
15435
|
score: 1,
|
|
15497
15436
|
verdict: "pass",
|
|
15498
|
-
|
|
15499
|
-
|
|
15437
|
+
assertions: [
|
|
15438
|
+
{
|
|
15439
|
+
text: shouldTrigger ? evidence || `Skill "${skillName}" triggered as expected` : `Skill "${skillName}" correctly did not trigger`,
|
|
15440
|
+
passed: true
|
|
15441
|
+
}
|
|
15500
15442
|
],
|
|
15501
|
-
|
|
15502
|
-
expectedAspectCount: 1,
|
|
15503
|
-
reasoning: shouldTrigger ? "Skill triggered correctly" : "No false trigger"
|
|
15443
|
+
expectedAspectCount: 1
|
|
15504
15444
|
};
|
|
15505
15445
|
}
|
|
15506
15446
|
return {
|
|
15507
15447
|
score: 0,
|
|
15508
15448
|
verdict: "fail",
|
|
15509
|
-
|
|
15510
|
-
|
|
15511
|
-
|
|
15449
|
+
assertions: [
|
|
15450
|
+
{
|
|
15451
|
+
text: shouldTrigger ? firstTool ? `First tool was "${firstTool.tool}" \u2014 not a skill/read tool for "${skillName}"` : "No tool calls recorded" : evidence || `Skill "${skillName}" triggered unexpectedly`,
|
|
15452
|
+
passed: false
|
|
15453
|
+
}
|
|
15512
15454
|
],
|
|
15513
|
-
expectedAspectCount: 1
|
|
15514
|
-
reasoning: shouldTrigger ? `Skill "${skillName}" was not triggered` : "False trigger: skill fired when it should not have"
|
|
15455
|
+
expectedAspectCount: 1
|
|
15515
15456
|
};
|
|
15516
15457
|
}
|
|
15458
|
+
readPathFromInput(input, matcher) {
|
|
15459
|
+
const fields = matcher.readInputFields ?? [matcher.readInputField];
|
|
15460
|
+
for (const field of fields) {
|
|
15461
|
+
const value = input[field];
|
|
15462
|
+
if (value !== void 0 && value !== null) {
|
|
15463
|
+
return String(value);
|
|
15464
|
+
}
|
|
15465
|
+
}
|
|
15466
|
+
return "";
|
|
15467
|
+
}
|
|
15517
15468
|
};
|
|
15518
15469
|
|
|
15519
15470
|
// src/evaluation/evaluators/llm-grader-prompt.ts
|
|
@@ -15548,12 +15499,8 @@ function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evalua
|
|
|
15548
15499
|
[TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input_segments, null, 2),
|
|
15549
15500
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
|
|
15550
15501
|
[TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
|
|
15551
|
-
[TEMPLATE_VARIABLES.ANSWER]: candidate.trim(),
|
|
15552
|
-
[TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (evalCase.reference_answer ?? "").trim(),
|
|
15553
15502
|
[TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
|
|
15554
|
-
[TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(),
|
|
15555
15503
|
[TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
|
|
15556
|
-
// Text convenience accessors (new names, always strings)
|
|
15557
15504
|
[TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
|
|
15558
15505
|
[TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
|
|
15559
15506
|
[TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
|
|
@@ -15680,10 +15627,8 @@ var TokenUsageEvaluator = class {
|
|
|
15680
15627
|
return {
|
|
15681
15628
|
score: 0,
|
|
15682
15629
|
verdict: "fail",
|
|
15683
|
-
|
|
15684
|
-
misses: ["No token usage data available in trace"],
|
|
15630
|
+
assertions: [{ text: "No token usage data available in trace", passed: false }],
|
|
15685
15631
|
expectedAspectCount,
|
|
15686
|
-
reasoning: "Token usage not reported by provider",
|
|
15687
15632
|
evaluatorRawRequest: {
|
|
15688
15633
|
type: "token-usage",
|
|
15689
15634
|
max_total: maxTotal ?? null,
|
|
@@ -15697,37 +15642,34 @@ var TokenUsageEvaluator = class {
|
|
|
15697
15642
|
const output = usage.output;
|
|
15698
15643
|
const cached = usage.cached ?? 0;
|
|
15699
15644
|
const total = input + output + cached;
|
|
15700
|
-
const
|
|
15701
|
-
const misses = [];
|
|
15645
|
+
const assertions = [];
|
|
15702
15646
|
if (typeof maxInput === "number") {
|
|
15703
15647
|
if (input <= maxInput) {
|
|
15704
|
-
|
|
15648
|
+
assertions.push({ text: `Input tokens ${input} <= ${maxInput}`, passed: true });
|
|
15705
15649
|
} else {
|
|
15706
|
-
|
|
15650
|
+
assertions.push({ text: `Input tokens ${input} > ${maxInput}`, passed: false });
|
|
15707
15651
|
}
|
|
15708
15652
|
}
|
|
15709
15653
|
if (typeof maxOutput === "number") {
|
|
15710
15654
|
if (output <= maxOutput) {
|
|
15711
|
-
|
|
15655
|
+
assertions.push({ text: `Output tokens ${output} <= ${maxOutput}`, passed: true });
|
|
15712
15656
|
} else {
|
|
15713
|
-
|
|
15657
|
+
assertions.push({ text: `Output tokens ${output} > ${maxOutput}`, passed: false });
|
|
15714
15658
|
}
|
|
15715
15659
|
}
|
|
15716
15660
|
if (typeof maxTotal === "number") {
|
|
15717
15661
|
if (total <= maxTotal) {
|
|
15718
|
-
|
|
15662
|
+
assertions.push({ text: `Total tokens ${total} <= ${maxTotal}`, passed: true });
|
|
15719
15663
|
} else {
|
|
15720
|
-
|
|
15664
|
+
assertions.push({ text: `Total tokens ${total} > ${maxTotal}`, passed: false });
|
|
15721
15665
|
}
|
|
15722
15666
|
}
|
|
15723
|
-
const passed =
|
|
15667
|
+
const passed = assertions.every((a) => a.passed);
|
|
15724
15668
|
return {
|
|
15725
15669
|
score: passed ? 1 : 0,
|
|
15726
15670
|
verdict: passed ? "pass" : "fail",
|
|
15727
|
-
|
|
15728
|
-
misses,
|
|
15671
|
+
assertions,
|
|
15729
15672
|
expectedAspectCount,
|
|
15730
|
-
reasoning: `token-usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
|
|
15731
15673
|
evaluatorRawRequest: {
|
|
15732
15674
|
type: "token-usage",
|
|
15733
15675
|
max_total: maxTotal ?? null,
|
|
@@ -15827,8 +15769,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15827
15769
|
return {
|
|
15828
15770
|
score: 0,
|
|
15829
15771
|
verdict: "fail",
|
|
15830
|
-
|
|
15831
|
-
misses: ["No trace available for evaluation"],
|
|
15772
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
15832
15773
|
expectedAspectCount: 1
|
|
15833
15774
|
};
|
|
15834
15775
|
}
|
|
@@ -15839,8 +15780,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15839
15780
|
return {
|
|
15840
15781
|
score: 0,
|
|
15841
15782
|
verdict: "fail",
|
|
15842
|
-
|
|
15843
|
-
misses: ["No trace available for evaluation"],
|
|
15783
|
+
assertions: [{ text: "No trace available for evaluation", passed: false }],
|
|
15844
15784
|
expectedAspectCount: 1
|
|
15845
15785
|
};
|
|
15846
15786
|
}
|
|
@@ -15858,8 +15798,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15858
15798
|
return {
|
|
15859
15799
|
score: 0,
|
|
15860
15800
|
verdict: "fail",
|
|
15861
|
-
|
|
15862
|
-
misses: [`Unknown mode: ${this.config.mode}`],
|
|
15801
|
+
assertions: [{ text: `Unknown mode: ${this.config.mode}`, passed: false }],
|
|
15863
15802
|
expectedAspectCount: 1
|
|
15864
15803
|
};
|
|
15865
15804
|
}
|
|
@@ -15908,28 +15847,32 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15908
15847
|
return {
|
|
15909
15848
|
score: 1,
|
|
15910
15849
|
verdict: "pass",
|
|
15911
|
-
|
|
15912
|
-
misses: [],
|
|
15850
|
+
assertions: [{ text: "No tool requirements specified", passed: true }],
|
|
15913
15851
|
expectedAspectCount: 0
|
|
15914
15852
|
};
|
|
15915
15853
|
}
|
|
15916
|
-
const
|
|
15917
|
-
const misses = [];
|
|
15854
|
+
const assertions = [];
|
|
15918
15855
|
for (const toolName of toolNames) {
|
|
15919
15856
|
const required = minimums[toolName];
|
|
15920
15857
|
const actual = summary.toolCallsByName[toolName] ?? 0;
|
|
15921
15858
|
if (actual >= required) {
|
|
15922
|
-
|
|
15859
|
+
assertions.push({
|
|
15860
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
15861
|
+
passed: true
|
|
15862
|
+
});
|
|
15923
15863
|
} else {
|
|
15924
|
-
|
|
15864
|
+
assertions.push({
|
|
15865
|
+
text: `${toolName}: called ${actual} times (required >=${required})`,
|
|
15866
|
+
passed: false
|
|
15867
|
+
});
|
|
15925
15868
|
}
|
|
15926
15869
|
}
|
|
15927
|
-
const
|
|
15870
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
15871
|
+
const score = passedCount / toolNames.length;
|
|
15928
15872
|
return {
|
|
15929
15873
|
score,
|
|
15930
15874
|
verdict: scoreToVerdict(score),
|
|
15931
|
-
|
|
15932
|
-
misses,
|
|
15875
|
+
assertions,
|
|
15933
15876
|
expectedAspectCount: toolNames.length
|
|
15934
15877
|
};
|
|
15935
15878
|
}
|
|
@@ -15939,13 +15882,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15939
15882
|
return {
|
|
15940
15883
|
score: 1,
|
|
15941
15884
|
verdict: "pass",
|
|
15942
|
-
|
|
15943
|
-
misses: [],
|
|
15885
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
15944
15886
|
expectedAspectCount: 0
|
|
15945
15887
|
};
|
|
15946
15888
|
}
|
|
15947
|
-
const
|
|
15948
|
-
const misses = [];
|
|
15889
|
+
const assertions = [];
|
|
15949
15890
|
const warnings = [];
|
|
15950
15891
|
let actualIndex = 0;
|
|
15951
15892
|
let sequenceHits = 0;
|
|
@@ -15965,16 +15906,20 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15965
15906
|
const actualCall = toolCalls[actualIndex];
|
|
15966
15907
|
if (actualCall.name === expectedTool) {
|
|
15967
15908
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
15968
|
-
|
|
15909
|
+
assertions.push({
|
|
15910
|
+
text: `Found ${expectedTool} at position ${actualIndex}`,
|
|
15911
|
+
passed: true
|
|
15912
|
+
});
|
|
15969
15913
|
sequenceHits++;
|
|
15970
15914
|
matchedCall = actualCall;
|
|
15971
15915
|
actualIndex++;
|
|
15972
15916
|
found = true;
|
|
15973
15917
|
break;
|
|
15974
15918
|
}
|
|
15975
|
-
|
|
15976
|
-
`Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch
|
|
15977
|
-
|
|
15919
|
+
assertions.push({
|
|
15920
|
+
text: `Expected ${expectedTool} at position ${i}: tool found at ${actualIndex} but args mismatch`,
|
|
15921
|
+
passed: false
|
|
15922
|
+
});
|
|
15978
15923
|
actualIndex++;
|
|
15979
15924
|
argsMismatch = true;
|
|
15980
15925
|
break;
|
|
@@ -15982,7 +15927,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15982
15927
|
actualIndex++;
|
|
15983
15928
|
}
|
|
15984
15929
|
if (!found && !argsMismatch) {
|
|
15985
|
-
|
|
15930
|
+
assertions.push({
|
|
15931
|
+
text: `Expected ${expectedTool} at position ${i}, not found in remaining trace`,
|
|
15932
|
+
passed: false
|
|
15933
|
+
});
|
|
15986
15934
|
}
|
|
15987
15935
|
if (found && matchedCall) {
|
|
15988
15936
|
const latencyResult = checkLatency(
|
|
@@ -15991,10 +15939,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
15991
15939
|
matchedCall.durationMs
|
|
15992
15940
|
);
|
|
15993
15941
|
if (latencyResult.status === "pass") {
|
|
15994
|
-
|
|
15942
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
15995
15943
|
latencyHits++;
|
|
15996
15944
|
} else if (latencyResult.status === "fail") {
|
|
15997
|
-
|
|
15945
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
15998
15946
|
} else if (latencyResult.message) {
|
|
15999
15947
|
warnings.push(latencyResult.message);
|
|
16000
15948
|
latencySkips++;
|
|
@@ -16010,8 +15958,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16010
15958
|
return {
|
|
16011
15959
|
score,
|
|
16012
15960
|
verdict: scoreToVerdict(score),
|
|
16013
|
-
|
|
16014
|
-
misses,
|
|
15961
|
+
assertions,
|
|
16015
15962
|
expectedAspectCount: totalAssertions
|
|
16016
15963
|
};
|
|
16017
15964
|
}
|
|
@@ -16021,13 +15968,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16021
15968
|
return {
|
|
16022
15969
|
score: 1,
|
|
16023
15970
|
verdict: "pass",
|
|
16024
|
-
|
|
16025
|
-
misses: [],
|
|
15971
|
+
assertions: [{ text: "No tool sequence specified", passed: true }],
|
|
16026
15972
|
expectedAspectCount: 0
|
|
16027
15973
|
};
|
|
16028
15974
|
}
|
|
16029
|
-
const
|
|
16030
|
-
const misses = [];
|
|
15975
|
+
const assertions = [];
|
|
16031
15976
|
const warnings = [];
|
|
16032
15977
|
let sequenceHits = 0;
|
|
16033
15978
|
let latencyHits = 0;
|
|
@@ -16036,7 +15981,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16036
15981
|
(item) => item.maxDurationMs !== void 0
|
|
16037
15982
|
).length;
|
|
16038
15983
|
if (toolCalls.length !== expected.length) {
|
|
16039
|
-
|
|
15984
|
+
assertions.push({
|
|
15985
|
+
text: `Expected ${expected.length} tool calls, got ${toolCalls.length}`,
|
|
15986
|
+
passed: false
|
|
15987
|
+
});
|
|
16040
15988
|
}
|
|
16041
15989
|
const checkLength = Math.min(expected.length, toolCalls.length);
|
|
16042
15990
|
for (let i = 0; i < checkLength; i++) {
|
|
@@ -16048,14 +15996,17 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16048
15996
|
let sequenceMatched = false;
|
|
16049
15997
|
if (actualTool === expectedTool) {
|
|
16050
15998
|
if (argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
16051
|
-
|
|
15999
|
+
assertions.push({ text: `Position ${i}: ${expectedTool}`, passed: true });
|
|
16052
16000
|
sequenceHits++;
|
|
16053
16001
|
sequenceMatched = true;
|
|
16054
16002
|
} else {
|
|
16055
|
-
|
|
16003
|
+
assertions.push({ text: `Position ${i}: ${expectedTool} args mismatch`, passed: false });
|
|
16056
16004
|
}
|
|
16057
16005
|
} else {
|
|
16058
|
-
|
|
16006
|
+
assertions.push({
|
|
16007
|
+
text: `Position ${i}: expected ${expectedTool}, got ${actualTool}`,
|
|
16008
|
+
passed: false
|
|
16009
|
+
});
|
|
16059
16010
|
}
|
|
16060
16011
|
if (sequenceMatched) {
|
|
16061
16012
|
const latencyResult = checkLatency(
|
|
@@ -16064,10 +16015,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16064
16015
|
actualCall.durationMs
|
|
16065
16016
|
);
|
|
16066
16017
|
if (latencyResult.status === "pass") {
|
|
16067
|
-
|
|
16018
|
+
assertions.push({ text: latencyResult.message, passed: true });
|
|
16068
16019
|
latencyHits++;
|
|
16069
16020
|
} else if (latencyResult.status === "fail") {
|
|
16070
|
-
|
|
16021
|
+
assertions.push({ text: latencyResult.message, passed: false });
|
|
16071
16022
|
} else if (latencyResult.message) {
|
|
16072
16023
|
warnings.push(latencyResult.message);
|
|
16073
16024
|
latencySkips++;
|
|
@@ -16075,7 +16026,10 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16075
16026
|
}
|
|
16076
16027
|
}
|
|
16077
16028
|
for (let i = checkLength; i < expected.length; i++) {
|
|
16078
|
-
|
|
16029
|
+
assertions.push({
|
|
16030
|
+
text: `Position ${i}: expected ${expected[i].tool}, got nothing`,
|
|
16031
|
+
passed: false
|
|
16032
|
+
});
|
|
16079
16033
|
}
|
|
16080
16034
|
for (const warning of warnings) {
|
|
16081
16035
|
console.warn(`[tool-trajectory] ${warning}`);
|
|
@@ -16086,8 +16040,7 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16086
16040
|
return {
|
|
16087
16041
|
score,
|
|
16088
16042
|
verdict: scoreToVerdict(score),
|
|
16089
|
-
|
|
16090
|
-
misses,
|
|
16043
|
+
assertions,
|
|
16091
16044
|
expectedAspectCount: totalAssertions
|
|
16092
16045
|
};
|
|
16093
16046
|
}
|
|
@@ -16102,13 +16055,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16102
16055
|
return {
|
|
16103
16056
|
score: 1,
|
|
16104
16057
|
verdict: "pass",
|
|
16105
|
-
|
|
16106
|
-
misses: [],
|
|
16058
|
+
assertions: [{ text: "No expected tools specified", passed: true }],
|
|
16107
16059
|
expectedAspectCount: 0
|
|
16108
16060
|
};
|
|
16109
16061
|
}
|
|
16110
|
-
const
|
|
16111
|
-
const misses = [];
|
|
16062
|
+
const assertions = [];
|
|
16112
16063
|
const consumed = /* @__PURE__ */ new Set();
|
|
16113
16064
|
for (let i = 0; i < expected.length; i++) {
|
|
16114
16065
|
const expectedItem = expected[i];
|
|
@@ -16119,22 +16070,25 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16119
16070
|
if (consumed.has(j)) continue;
|
|
16120
16071
|
const actualCall = toolCalls[j];
|
|
16121
16072
|
if (actualCall.name === expectedTool && argsMatch(expectedItem.args, actualCall.args, mode)) {
|
|
16122
|
-
|
|
16073
|
+
assertions.push({ text: `Found ${expectedTool} at position ${j}`, passed: true });
|
|
16123
16074
|
consumed.add(j);
|
|
16124
16075
|
found = true;
|
|
16125
16076
|
break;
|
|
16126
16077
|
}
|
|
16127
16078
|
}
|
|
16128
16079
|
if (!found) {
|
|
16129
|
-
|
|
16080
|
+
assertions.push({
|
|
16081
|
+
text: `Expected ${expectedTool} not found in actual trajectory`,
|
|
16082
|
+
passed: false
|
|
16083
|
+
});
|
|
16130
16084
|
}
|
|
16131
16085
|
}
|
|
16132
|
-
const
|
|
16086
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
16087
|
+
const score = expected.length > 0 ? passedCount / expected.length : 1;
|
|
16133
16088
|
return {
|
|
16134
16089
|
score,
|
|
16135
16090
|
verdict: scoreToVerdict(score),
|
|
16136
|
-
|
|
16137
|
-
misses,
|
|
16091
|
+
assertions,
|
|
16138
16092
|
expectedAspectCount: expected.length
|
|
16139
16093
|
};
|
|
16140
16094
|
}
|
|
@@ -16150,16 +16104,19 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16150
16104
|
return {
|
|
16151
16105
|
score: 1,
|
|
16152
16106
|
verdict: "pass",
|
|
16153
|
-
|
|
16154
|
-
misses: [],
|
|
16107
|
+
assertions: [{ text: "No tool calls and no expected tools", passed: true }],
|
|
16155
16108
|
expectedAspectCount: 0
|
|
16156
16109
|
};
|
|
16157
16110
|
}
|
|
16158
16111
|
return {
|
|
16159
16112
|
score: 0,
|
|
16160
16113
|
verdict: "fail",
|
|
16161
|
-
|
|
16162
|
-
|
|
16114
|
+
assertions: [
|
|
16115
|
+
{
|
|
16116
|
+
text: `${toolCalls.length} unexpected tool call(s) with empty allowed list`,
|
|
16117
|
+
passed: false
|
|
16118
|
+
}
|
|
16119
|
+
],
|
|
16163
16120
|
expectedAspectCount: toolCalls.length
|
|
16164
16121
|
};
|
|
16165
16122
|
}
|
|
@@ -16167,13 +16124,11 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16167
16124
|
return {
|
|
16168
16125
|
score: 1,
|
|
16169
16126
|
verdict: "pass",
|
|
16170
|
-
|
|
16171
|
-
misses: [],
|
|
16127
|
+
assertions: [{ text: "No actual tool calls (trivially a subset)", passed: true }],
|
|
16172
16128
|
expectedAspectCount: 0
|
|
16173
16129
|
};
|
|
16174
16130
|
}
|
|
16175
|
-
const
|
|
16176
|
-
const misses = [];
|
|
16131
|
+
const assertions = [];
|
|
16177
16132
|
for (let i = 0; i < toolCalls.length; i++) {
|
|
16178
16133
|
const actualCall = toolCalls[i];
|
|
16179
16134
|
let allowed = false;
|
|
@@ -16185,17 +16140,23 @@ var ToolTrajectoryEvaluator = class {
|
|
|
16185
16140
|
}
|
|
16186
16141
|
}
|
|
16187
16142
|
if (allowed) {
|
|
16188
|
-
|
|
16143
|
+
assertions.push({
|
|
16144
|
+
text: `Position ${i}: ${actualCall.name} is in allowed set`,
|
|
16145
|
+
passed: true
|
|
16146
|
+
});
|
|
16189
16147
|
} else {
|
|
16190
|
-
|
|
16148
|
+
assertions.push({
|
|
16149
|
+
text: `Position ${i}: ${actualCall.name} is not in allowed set`,
|
|
16150
|
+
passed: false
|
|
16151
|
+
});
|
|
16191
16152
|
}
|
|
16192
16153
|
}
|
|
16193
|
-
const
|
|
16154
|
+
const passedCount = assertions.filter((a) => a.passed).length;
|
|
16155
|
+
const score = toolCalls.length > 0 ? passedCount / toolCalls.length : 1;
|
|
16194
16156
|
return {
|
|
16195
16157
|
score,
|
|
16196
16158
|
verdict: scoreToVerdict(score),
|
|
16197
|
-
|
|
16198
|
-
misses,
|
|
16159
|
+
assertions,
|
|
16199
16160
|
expectedAspectCount: toolCalls.length
|
|
16200
16161
|
};
|
|
16201
16162
|
}
|
|
@@ -16206,8 +16167,12 @@ function runContainsAssertion(output, value) {
|
|
|
16206
16167
|
const passed = output.includes(value);
|
|
16207
16168
|
return {
|
|
16208
16169
|
score: passed ? 1 : 0,
|
|
16209
|
-
|
|
16210
|
-
|
|
16170
|
+
assertions: [
|
|
16171
|
+
{
|
|
16172
|
+
text: passed ? `Output contains "${value}"` : `Output does not contain "${value}"`,
|
|
16173
|
+
passed
|
|
16174
|
+
}
|
|
16175
|
+
]
|
|
16211
16176
|
};
|
|
16212
16177
|
}
|
|
16213
16178
|
function runContainsAnyAssertion(output, values) {
|
|
@@ -16215,8 +16180,12 @@ function runContainsAnyAssertion(output, values) {
|
|
|
16215
16180
|
const passed = matched.length > 0;
|
|
16216
16181
|
return {
|
|
16217
16182
|
score: passed ? 1 : 0,
|
|
16218
|
-
|
|
16219
|
-
|
|
16183
|
+
assertions: [
|
|
16184
|
+
{
|
|
16185
|
+
text: passed ? `Output contains "${matched[0]}"` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")}`,
|
|
16186
|
+
passed
|
|
16187
|
+
}
|
|
16188
|
+
]
|
|
16220
16189
|
};
|
|
16221
16190
|
}
|
|
16222
16191
|
function runContainsAllAssertion(output, values) {
|
|
@@ -16224,16 +16193,24 @@ function runContainsAllAssertion(output, values) {
|
|
|
16224
16193
|
const passed = missing.length === 0;
|
|
16225
16194
|
return {
|
|
16226
16195
|
score: passed ? 1 : 0,
|
|
16227
|
-
|
|
16228
|
-
|
|
16196
|
+
assertions: [
|
|
16197
|
+
{
|
|
16198
|
+
text: passed ? `Output contains all ${values.length} expected strings` : `Output missing: ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
16199
|
+
passed
|
|
16200
|
+
}
|
|
16201
|
+
]
|
|
16229
16202
|
};
|
|
16230
16203
|
}
|
|
16231
16204
|
function runIcontainsAssertion(output, value) {
|
|
16232
16205
|
const passed = output.toLowerCase().includes(value.toLowerCase());
|
|
16233
16206
|
return {
|
|
16234
16207
|
score: passed ? 1 : 0,
|
|
16235
|
-
|
|
16236
|
-
|
|
16208
|
+
assertions: [
|
|
16209
|
+
{
|
|
16210
|
+
text: passed ? `Output contains "${value}" (case-insensitive)` : `Output does not contain "${value}" (case-insensitive)`,
|
|
16211
|
+
passed
|
|
16212
|
+
}
|
|
16213
|
+
]
|
|
16237
16214
|
};
|
|
16238
16215
|
}
|
|
16239
16216
|
function runIcontainsAnyAssertion(output, values) {
|
|
@@ -16242,9 +16219,11 @@ function runIcontainsAnyAssertion(output, values) {
|
|
|
16242
16219
|
const passed = matched.length > 0;
|
|
16243
16220
|
return {
|
|
16244
16221
|
score: passed ? 1 : 0,
|
|
16245
|
-
|
|
16246
|
-
|
|
16247
|
-
|
|
16222
|
+
assertions: [
|
|
16223
|
+
{
|
|
16224
|
+
text: passed ? `Output contains "${matched[0]}" (case-insensitive)` : `Output does not contain any of: ${values.map((v) => `"${v}"`).join(", ")} (case-insensitive)`,
|
|
16225
|
+
passed
|
|
16226
|
+
}
|
|
16248
16227
|
]
|
|
16249
16228
|
};
|
|
16250
16229
|
}
|
|
@@ -16254,24 +16233,36 @@ function runIcontainsAllAssertion(output, values) {
|
|
|
16254
16233
|
const passed = missing.length === 0;
|
|
16255
16234
|
return {
|
|
16256
16235
|
score: passed ? 1 : 0,
|
|
16257
|
-
|
|
16258
|
-
|
|
16236
|
+
assertions: [
|
|
16237
|
+
{
|
|
16238
|
+
text: passed ? `Output contains all ${values.length} expected strings (case-insensitive)` : `Output missing (case-insensitive): ${missing.map((v) => `"${v}"`).join(", ")}`,
|
|
16239
|
+
passed
|
|
16240
|
+
}
|
|
16241
|
+
]
|
|
16259
16242
|
};
|
|
16260
16243
|
}
|
|
16261
16244
|
function runStartsWithAssertion(output, value) {
|
|
16262
16245
|
const passed = output.trim().startsWith(value.trim());
|
|
16263
16246
|
return {
|
|
16264
16247
|
score: passed ? 1 : 0,
|
|
16265
|
-
|
|
16266
|
-
|
|
16248
|
+
assertions: [
|
|
16249
|
+
{
|
|
16250
|
+
text: passed ? `Output starts with "${value}"` : `Output does not start with "${value}"`,
|
|
16251
|
+
passed
|
|
16252
|
+
}
|
|
16253
|
+
]
|
|
16267
16254
|
};
|
|
16268
16255
|
}
|
|
16269
16256
|
function runEndsWithAssertion(output, value) {
|
|
16270
16257
|
const passed = output.trim().endsWith(value.trim());
|
|
16271
16258
|
return {
|
|
16272
16259
|
score: passed ? 1 : 0,
|
|
16273
|
-
|
|
16274
|
-
|
|
16260
|
+
assertions: [
|
|
16261
|
+
{
|
|
16262
|
+
text: passed ? `Output ends with "${value}"` : `Output does not end with "${value}"`,
|
|
16263
|
+
passed
|
|
16264
|
+
}
|
|
16265
|
+
]
|
|
16275
16266
|
};
|
|
16276
16267
|
}
|
|
16277
16268
|
function runRegexAssertion(output, pattern, flags) {
|
|
@@ -16280,8 +16271,12 @@ function runRegexAssertion(output, pattern, flags) {
|
|
|
16280
16271
|
const flagsLabel = flags ? ` (flags: ${flags})` : "";
|
|
16281
16272
|
return {
|
|
16282
16273
|
score: passed ? 1 : 0,
|
|
16283
|
-
|
|
16284
|
-
|
|
16274
|
+
assertions: [
|
|
16275
|
+
{
|
|
16276
|
+
text: passed ? `Output matches pattern /${pattern}/${flags ?? ""}${flagsLabel}` : `Output does not match pattern /${pattern}/${flags ?? ""}${flagsLabel}`,
|
|
16277
|
+
passed
|
|
16278
|
+
}
|
|
16279
|
+
]
|
|
16285
16280
|
};
|
|
16286
16281
|
}
|
|
16287
16282
|
function runIsJsonAssertion(output) {
|
|
@@ -16293,16 +16288,24 @@ function runIsJsonAssertion(output) {
|
|
|
16293
16288
|
}
|
|
16294
16289
|
return {
|
|
16295
16290
|
score: passed ? 1 : 0,
|
|
16296
|
-
|
|
16297
|
-
|
|
16291
|
+
assertions: [
|
|
16292
|
+
{
|
|
16293
|
+
text: passed ? "Output is valid JSON" : "Output is not valid JSON",
|
|
16294
|
+
passed
|
|
16295
|
+
}
|
|
16296
|
+
]
|
|
16298
16297
|
};
|
|
16299
16298
|
}
|
|
16300
16299
|
function runEqualsAssertion(output, value) {
|
|
16301
16300
|
const passed = output.trim() === value.trim();
|
|
16302
16301
|
return {
|
|
16303
16302
|
score: passed ? 1 : 0,
|
|
16304
|
-
|
|
16305
|
-
|
|
16303
|
+
assertions: [
|
|
16304
|
+
{
|
|
16305
|
+
text: passed ? `Output equals "${value}"` : `Output does not equal "${value}"`,
|
|
16306
|
+
passed
|
|
16307
|
+
}
|
|
16308
|
+
]
|
|
16306
16309
|
};
|
|
16307
16310
|
}
|
|
16308
16311
|
|
|
@@ -16515,10 +16518,8 @@ var InlineAssertEvaluator = class {
|
|
|
16515
16518
|
return {
|
|
16516
16519
|
score,
|
|
16517
16520
|
verdict: scoreToVerdict(score),
|
|
16518
|
-
|
|
16519
|
-
misses: score < 0.5 ? [result.name] : [],
|
|
16521
|
+
assertions: [{ text: result.name, passed: score >= 0.5 }],
|
|
16520
16522
|
expectedAspectCount: 1,
|
|
16521
|
-
reasoning: void 0,
|
|
16522
16523
|
details: result.metadata ? result.metadata : void 0
|
|
16523
16524
|
};
|
|
16524
16525
|
}
|
|
@@ -16556,11 +16557,9 @@ async function resolveCustomPrompt(promptConfig, context2, timeoutMs) {
|
|
|
16556
16557
|
}
|
|
16557
16558
|
async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
16558
16559
|
const payload = {
|
|
16559
|
-
question: context2.evalCase.question,
|
|
16560
16560
|
criteria: context2.evalCase.criteria,
|
|
16561
16561
|
expectedOutput: context2.evalCase.expected_output,
|
|
16562
|
-
|
|
16563
|
-
answer: context2.candidate,
|
|
16562
|
+
outputText: context2.candidate,
|
|
16564
16563
|
output: context2.output ?? null,
|
|
16565
16564
|
guidelineFiles: context2.evalCase.guideline_paths,
|
|
16566
16565
|
inputFiles: context2.evalCase.file_paths.filter(
|
|
@@ -16571,9 +16570,7 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
|
|
|
16571
16570
|
fileChanges: context2.fileChanges ?? null,
|
|
16572
16571
|
workspacePath: context2.workspacePath ?? null,
|
|
16573
16572
|
config: config ?? context2.config ?? null,
|
|
16574
|
-
// Text convenience accessors (new names, always strings)
|
|
16575
16573
|
inputText: context2.evalCase.question,
|
|
16576
|
-
outputText: context2.candidate,
|
|
16577
16574
|
expectedOutputText: context2.evalCase.reference_answer ?? ""
|
|
16578
16575
|
};
|
|
16579
16576
|
const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
|
|
@@ -16711,9 +16708,7 @@ var containsFactory = (config) => {
|
|
|
16711
16708
|
return {
|
|
16712
16709
|
score: result.score,
|
|
16713
16710
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16714
|
-
|
|
16715
|
-
misses: result.misses,
|
|
16716
|
-
reasoning: result.score === 1 ? `Output contains "${c.value}"` : `Output does not contain "${c.value}"`,
|
|
16711
|
+
assertions: result.assertions,
|
|
16717
16712
|
expectedAspectCount: 1
|
|
16718
16713
|
};
|
|
16719
16714
|
});
|
|
@@ -16725,9 +16720,7 @@ var regexFactory = (config) => {
|
|
|
16725
16720
|
return {
|
|
16726
16721
|
score: result.score,
|
|
16727
16722
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16728
|
-
|
|
16729
|
-
misses: result.misses,
|
|
16730
|
-
reasoning: result.score === 1 ? `Output matches pattern /${c.value}/${c.flags ?? ""}` : `Output does not match pattern /${c.value}/${c.flags ?? ""}`,
|
|
16723
|
+
assertions: result.assertions,
|
|
16731
16724
|
expectedAspectCount: 1
|
|
16732
16725
|
};
|
|
16733
16726
|
});
|
|
@@ -16738,9 +16731,7 @@ var isJsonFactory = () => {
|
|
|
16738
16731
|
return {
|
|
16739
16732
|
score: result.score,
|
|
16740
16733
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16741
|
-
|
|
16742
|
-
misses: result.misses,
|
|
16743
|
-
reasoning: result.score === 1 ? "Output is valid JSON" : "Output is not valid JSON",
|
|
16734
|
+
assertions: result.assertions,
|
|
16744
16735
|
expectedAspectCount: 1
|
|
16745
16736
|
};
|
|
16746
16737
|
});
|
|
@@ -16752,9 +16743,7 @@ var equalsFactory = (config) => {
|
|
|
16752
16743
|
return {
|
|
16753
16744
|
score: result.score,
|
|
16754
16745
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16755
|
-
|
|
16756
|
-
misses: result.misses,
|
|
16757
|
-
reasoning: result.score === 1 ? `Output equals "${c.value}"` : `Output does not equal "${c.value}"`,
|
|
16746
|
+
assertions: result.assertions,
|
|
16758
16747
|
expectedAspectCount: 1
|
|
16759
16748
|
};
|
|
16760
16749
|
});
|
|
@@ -16766,9 +16755,7 @@ var containsAnyFactory = (config) => {
|
|
|
16766
16755
|
return {
|
|
16767
16756
|
score: result.score,
|
|
16768
16757
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16769
|
-
|
|
16770
|
-
misses: result.misses,
|
|
16771
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16758
|
+
assertions: result.assertions,
|
|
16772
16759
|
expectedAspectCount: 1
|
|
16773
16760
|
};
|
|
16774
16761
|
});
|
|
@@ -16780,9 +16767,7 @@ var containsAllFactory = (config) => {
|
|
|
16780
16767
|
return {
|
|
16781
16768
|
score: result.score,
|
|
16782
16769
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16783
|
-
|
|
16784
|
-
misses: result.misses,
|
|
16785
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16770
|
+
assertions: result.assertions,
|
|
16786
16771
|
expectedAspectCount: 1
|
|
16787
16772
|
};
|
|
16788
16773
|
});
|
|
@@ -16794,9 +16779,7 @@ var icontainsFactory = (config) => {
|
|
|
16794
16779
|
return {
|
|
16795
16780
|
score: result.score,
|
|
16796
16781
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16797
|
-
|
|
16798
|
-
misses: result.misses,
|
|
16799
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16782
|
+
assertions: result.assertions,
|
|
16800
16783
|
expectedAspectCount: 1
|
|
16801
16784
|
};
|
|
16802
16785
|
});
|
|
@@ -16808,9 +16791,7 @@ var icontainsAnyFactory = (config) => {
|
|
|
16808
16791
|
return {
|
|
16809
16792
|
score: result.score,
|
|
16810
16793
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16811
|
-
|
|
16812
|
-
misses: result.misses,
|
|
16813
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16794
|
+
assertions: result.assertions,
|
|
16814
16795
|
expectedAspectCount: 1
|
|
16815
16796
|
};
|
|
16816
16797
|
});
|
|
@@ -16822,9 +16803,7 @@ var icontainsAllFactory = (config) => {
|
|
|
16822
16803
|
return {
|
|
16823
16804
|
score: result.score,
|
|
16824
16805
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16825
|
-
|
|
16826
|
-
misses: result.misses,
|
|
16827
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16806
|
+
assertions: result.assertions,
|
|
16828
16807
|
expectedAspectCount: 1
|
|
16829
16808
|
};
|
|
16830
16809
|
});
|
|
@@ -16836,9 +16815,7 @@ var startsWithFactory = (config) => {
|
|
|
16836
16815
|
return {
|
|
16837
16816
|
score: result.score,
|
|
16838
16817
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16839
|
-
|
|
16840
|
-
misses: result.misses,
|
|
16841
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16818
|
+
assertions: result.assertions,
|
|
16842
16819
|
expectedAspectCount: 1
|
|
16843
16820
|
};
|
|
16844
16821
|
});
|
|
@@ -16850,9 +16827,7 @@ var endsWithFactory = (config) => {
|
|
|
16850
16827
|
return {
|
|
16851
16828
|
score: result.score,
|
|
16852
16829
|
verdict: result.score === 1 ? "pass" : "fail",
|
|
16853
|
-
|
|
16854
|
-
misses: result.misses,
|
|
16855
|
-
reasoning: result.score === 1 ? result.hits[0] : result.misses[0],
|
|
16830
|
+
assertions: result.assertions,
|
|
16856
16831
|
expectedAspectCount: 1
|
|
16857
16832
|
};
|
|
16858
16833
|
});
|
|
@@ -18258,9 +18233,8 @@ async function runEvaluation(options) {
|
|
|
18258
18233
|
testId: evalCase.id,
|
|
18259
18234
|
dataset: evalCase.dataset,
|
|
18260
18235
|
score: 0,
|
|
18261
|
-
|
|
18262
|
-
|
|
18263
|
-
answer: "",
|
|
18236
|
+
assertions: [],
|
|
18237
|
+
outputText: "",
|
|
18264
18238
|
target: target.name,
|
|
18265
18239
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
18266
18240
|
budgetExceeded: true,
|
|
@@ -18295,9 +18269,8 @@ async function runEvaluation(options) {
|
|
|
18295
18269
|
testId: evalCase.id,
|
|
18296
18270
|
dataset: evalCase.dataset,
|
|
18297
18271
|
score: 0,
|
|
18298
|
-
|
|
18299
|
-
|
|
18300
|
-
answer: "",
|
|
18272
|
+
assertions: [],
|
|
18273
|
+
outputText: "",
|
|
18301
18274
|
target: target.name,
|
|
18302
18275
|
error: errorMsg,
|
|
18303
18276
|
executionStatus: "execution_error",
|
|
@@ -19263,11 +19236,9 @@ async function evaluateCandidate(options) {
|
|
|
19263
19236
|
dataset: evalCase.dataset,
|
|
19264
19237
|
conversationId: evalCase.conversation_id,
|
|
19265
19238
|
score: score.score,
|
|
19266
|
-
|
|
19267
|
-
|
|
19268
|
-
answer: candidate,
|
|
19239
|
+
assertions: score.assertions,
|
|
19240
|
+
outputText: candidate,
|
|
19269
19241
|
target: target.name,
|
|
19270
|
-
reasoning: score.reasoning,
|
|
19271
19242
|
tokenUsage,
|
|
19272
19243
|
costUsd,
|
|
19273
19244
|
durationMs,
|
|
@@ -19441,9 +19412,7 @@ async function runEvaluatorList(options) {
|
|
|
19441
19412
|
score: score2.score,
|
|
19442
19413
|
weight,
|
|
19443
19414
|
verdict: score2.verdict,
|
|
19444
|
-
|
|
19445
|
-
misses: score2.misses,
|
|
19446
|
-
reasoning: score2.reasoning,
|
|
19415
|
+
assertions: score2.assertions,
|
|
19447
19416
|
evaluatorProviderRequest: score2.evaluatorRawRequest,
|
|
19448
19417
|
details: score2.details,
|
|
19449
19418
|
scores: mapChildResults(score2.scores),
|
|
@@ -19458,10 +19427,10 @@ async function runEvaluatorList(options) {
|
|
|
19458
19427
|
const fallbackScore = {
|
|
19459
19428
|
score: 0,
|
|
19460
19429
|
verdict: "fail",
|
|
19461
|
-
|
|
19462
|
-
|
|
19463
|
-
|
|
19464
|
-
|
|
19430
|
+
assertions: [
|
|
19431
|
+
{ text: `Evaluator '${evaluatorConfig.name}' failed: ${message}`, passed: false }
|
|
19432
|
+
],
|
|
19433
|
+
expectedAspectCount: 1
|
|
19465
19434
|
};
|
|
19466
19435
|
const weight = evaluatorConfig.weight ?? 1;
|
|
19467
19436
|
scored.push({
|
|
@@ -19477,9 +19446,12 @@ async function runEvaluatorList(options) {
|
|
|
19477
19446
|
score: 0,
|
|
19478
19447
|
weight,
|
|
19479
19448
|
verdict: "fail",
|
|
19480
|
-
|
|
19481
|
-
|
|
19482
|
-
|
|
19449
|
+
assertions: [
|
|
19450
|
+
{
|
|
19451
|
+
text: `Evaluator '${evaluatorConfig.name ?? "unknown"}' failed: ${message}`,
|
|
19452
|
+
passed: false
|
|
19453
|
+
}
|
|
19454
|
+
],
|
|
19483
19455
|
durationMs: endedAt.getTime() - startedAt.getTime(),
|
|
19484
19456
|
startedAt: startedAt.toISOString(),
|
|
19485
19457
|
endedAt: endedAt.toISOString()
|
|
@@ -19495,9 +19467,7 @@ async function runEvaluatorList(options) {
|
|
|
19495
19467
|
...scores[lastScoresIdx],
|
|
19496
19468
|
score: negated.score,
|
|
19497
19469
|
verdict: negated.verdict,
|
|
19498
|
-
|
|
19499
|
-
misses: [...negated.misses],
|
|
19500
|
-
reasoning: negated.reasoning
|
|
19470
|
+
assertions: [...negated.assertions]
|
|
19501
19471
|
};
|
|
19502
19472
|
}
|
|
19503
19473
|
}
|
|
@@ -19512,21 +19482,13 @@ async function runEvaluatorList(options) {
|
|
|
19512
19482
|
const aggregateScore = hasRequiredFailure ? 0 : scorable.length > 0 ? computeWeightedMean(
|
|
19513
19483
|
scorable.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
|
|
19514
19484
|
) : 0;
|
|
19515
|
-
const
|
|
19516
|
-
const
|
|
19517
|
-
const expectedAspectCount = scored.reduce(
|
|
19518
|
-
(total, entry) => total + (entry.score.expectedAspectCount ?? 0),
|
|
19519
|
-
0
|
|
19520
|
-
);
|
|
19521
|
-
const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString);
|
|
19522
|
-
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
19485
|
+
const assertions = scored.flatMap((entry) => entry.score.assertions);
|
|
19486
|
+
const expectedAspectCount = assertions.length || 1;
|
|
19523
19487
|
const score = {
|
|
19524
19488
|
score: aggregateScore,
|
|
19525
19489
|
verdict: scoreToVerdict(aggregateScore),
|
|
19526
|
-
|
|
19527
|
-
|
|
19528
|
-
expectedAspectCount,
|
|
19529
|
-
reasoning
|
|
19490
|
+
assertions,
|
|
19491
|
+
expectedAspectCount
|
|
19530
19492
|
};
|
|
19531
19493
|
return { score, scores };
|
|
19532
19494
|
}
|
|
@@ -19630,9 +19592,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
19630
19592
|
dataset: evalCase.dataset,
|
|
19631
19593
|
conversationId: evalCase.conversation_id,
|
|
19632
19594
|
score: 0,
|
|
19633
|
-
|
|
19634
|
-
|
|
19635
|
-
answer: `Error occurred: ${message}`,
|
|
19595
|
+
assertions: [{ text: `Error: ${message}`, passed: false }],
|
|
19596
|
+
outputText: `Error occurred: ${message}`,
|
|
19636
19597
|
target: targetName,
|
|
19637
19598
|
requests,
|
|
19638
19599
|
input,
|
|
@@ -19741,9 +19702,7 @@ function mapChildResults(children) {
|
|
|
19741
19702
|
score: child.score,
|
|
19742
19703
|
weight: child.weight,
|
|
19743
19704
|
verdict: child.verdict,
|
|
19744
|
-
|
|
19745
|
-
misses: child.misses,
|
|
19746
|
-
reasoning: child.reasoning,
|
|
19705
|
+
assertions: child.assertions,
|
|
19747
19706
|
evaluatorProviderRequest: child.evaluatorRawRequest,
|
|
19748
19707
|
scores: mapChildResults(child.scores),
|
|
19749
19708
|
details: child.details,
|
|
@@ -20172,7 +20131,7 @@ function shouldSkipCacheForTemperature(targetConfig) {
|
|
|
20172
20131
|
|
|
20173
20132
|
// src/evaluation/baseline.ts
|
|
20174
20133
|
var STRIPPED_TOP_LEVEL_FIELDS = /* @__PURE__ */ new Set([
|
|
20175
|
-
"
|
|
20134
|
+
"outputText",
|
|
20176
20135
|
"requests",
|
|
20177
20136
|
"trace",
|
|
20178
20137
|
"workspacePath",
|
|
@@ -20346,7 +20305,7 @@ var OtelTraceExporter = class {
|
|
|
20346
20305
|
rootSpan.setAttribute("agentv.target", result.target);
|
|
20347
20306
|
if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
|
|
20348
20307
|
rootSpan.setAttribute("agentv.score", result.score);
|
|
20349
|
-
if (captureContent) rootSpan.setAttribute("agentv.
|
|
20308
|
+
if (captureContent) rootSpan.setAttribute("agentv.output_text", result.outputText);
|
|
20350
20309
|
if (result.durationMs != null)
|
|
20351
20310
|
rootSpan.setAttribute("agentv.trace.duration_ms", result.durationMs);
|
|
20352
20311
|
if (result.costUsd != null) rootSpan.setAttribute("agentv.trace.cost_usd", result.costUsd);
|
|
@@ -20713,7 +20672,6 @@ function createAgentKernel() {
|
|
|
20713
20672
|
freeformEvaluationSchema,
|
|
20714
20673
|
generateRubrics,
|
|
20715
20674
|
getAgentvHome,
|
|
20716
|
-
getHitCount,
|
|
20717
20675
|
getOutputFilenames,
|
|
20718
20676
|
getSubagentsRoot,
|
|
20719
20677
|
getTraceStateRoot,
|