@mastra/evals 0.13.5-alpha.0 → 0.13.6-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/dist/{chunk-4LRZVFXR.js → chunk-KHEXN75Q.js} +72 -3
- package/dist/chunk-KHEXN75Q.js.map +1 -0
- package/dist/{chunk-EKSPLMYP.cjs → chunk-QKR2PMLZ.cjs} +79 -2
- package/dist/chunk-QKR2PMLZ.cjs.map +1 -0
- package/dist/scorers/code/index.cjs +2 -2
- package/dist/scorers/code/index.js +1 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts +34 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/prompts.d.ts +29 -0
- package/dist/scorers/llm/answer-similarity/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/index.cjs +335 -68
- package/dist/scorers/llm/index.cjs.map +1 -1
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/index.js +291 -27
- package/dist/scorers/llm/index.js.map +1 -1
- package/dist/scorers/utils.cjs +60 -0
- package/dist/scorers/utils.cjs.map +1 -0
- package/dist/scorers/utils.d.ts +1 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +3 -0
- package/dist/scorers/utils.js.map +1 -0
- package/package.json +14 -4
- package/dist/chunk-4LRZVFXR.js.map +0 -1
- package/dist/chunk-EKSPLMYP.cjs.map +0 -1
package/dist/scorers/llm/index.cjs

@@ -1,7 +1,7 @@
 'use strict';

 var chunk7QAUEU4L_cjs = require('../../chunk-7QAUEU4L.cjs');
-var
+var chunkQKR2PMLZ_cjs = require('../../chunk-QKR2PMLZ.cjs');
 var scores = require('@mastra/core/scores');
 var zod = require('zod');

@@ -227,14 +227,14 @@ function createAnswerRelevancyScorer({
     description: "Extract relevant statements from the LLM output",
     outputSchema: extractOutputSchema,
     createPrompt: ({ run }) => {
-      const assistantMessage =
+      const assistantMessage = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       return createExtractPrompt(assistantMessage);
     }
   }).analyze({
     description: "Score the relevance of the statements to the input",
     outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
     createPrompt: ({ run, results }) => {
-      const input =
+      const input = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
       return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
     }
   }).generateScore(({ results }) => {
@@ -256,8 +256,8 @@ function createAnswerRelevancyScorer({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
       return createReasonPrompt({
-        input:
-        output:
+        input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         score,
         results: results.analyzeStepResult.results,
         scale: options.scale
@@ -266,6 +266,270 @@ function createAnswerRelevancyScorer({
   });
 }

+// src/scorers/llm/answer-similarity/prompts.ts
+var createExtractPrompt2 = ({ output, groundTruth }) => `
+Extract and normalize the semantic units (facts, claims, concepts) from both the agent output and the ground truth answer.
+
+Break down each text into its core semantic components while preserving meaning and relationships.
+Focus on extracting:
+- Key facts and claims
+- Important concepts and entities
+- Relationships between concepts
+- Quantitative information
+- Qualitative descriptions
+
+Guidelines:
+- Preserve the semantic meaning, not just keywords
+- Group related information together
+- Normalize different phrasings of the same concept
+- Keep numerical values and units together
+- Don't over-split compound concepts that belong together
+
+Return ONLY valid JSON with two arrays of semantic units. Do not include any text before or after the JSON.
+
+Agent Output:
+${output}
+
+Ground Truth:
+${groundTruth}
+
+Required JSON format (return valid JSON only):
+{
+"outputUnits": [],
+"groundTruthUnits": []
+}
+
+Important: Return valid JSON only, no additional text or explanations.
+`;
+var createAnalyzePrompt = ({
+  outputUnits,
+  groundTruthUnits
+}) => `
+Compare the semantic units from the agent output against the ground truth to evaluate answer similarity.
+
+Analyze each ground truth unit and determine:
+1. Whether it has a matching unit in the output (exact or semantic match)
+2. The quality of the match (exact, semantic, partial, missing)
+3. Whether there are contradictions
+
+Also identify:
+- Extra information in the output not present in ground truth
+- Any contradictory statements between output and ground truth
+
+Matching Guidelines:
+- "exact": The same information expressed identically or with minor wording differences
+- "semantic": The same concept or fact expressed differently but with equivalent meaning
+- "partial": Some overlap but missing important details or context
+- "missing": No corresponding information found in the output
+- "contradiction": Information that directly conflicts with the ground truth (wrong facts, incorrect names, false claims)
+
+CRITICAL: If the output contains factually incorrect information (wrong names, wrong facts, opposite claims), you MUST identify contradictions and mark relevant matches as "missing" while adding entries to the contradictions array.
+
+Return ONLY valid JSON with detailed analysis. Do not include any text before or after the JSON.
+
+Output Units:
+${JSON.stringify(outputUnits, null, 2)}
+
+Ground Truth Units:
+${JSON.stringify(groundTruthUnits, null, 2)}
+
+Required JSON format (copy this structure exactly):
+{
+"matches": [
+{
+"groundTruthUnit": "unit from ground truth",
+"outputUnit": "corresponding unit from output or null if missing",
+"matchType": "exact",
+"explanation": "brief explanation of the match quality"
+}
+],
+"extraInOutput": [],
+"contradictions": []
+}
+
+Important:
+- matchType must be exactly one of: "exact", "semantic", "partial", "missing"
+- outputUnit must be a string or null (not undefined)
+- All arrays must be present even if empty
+- Return valid JSON only, no additional text
+`;
+var createReasonPrompt2 = ({
+  output,
+  groundTruth,
+  score,
+  analysis,
+  scale
+}) => `
+Generate a clear, actionable explanation of the answer similarity score.
+
+Context:
+- Agent Output: ${output}
+- Ground Truth: ${groundTruth}
+- Score: ${score}/${scale}
+- Analysis: ${JSON.stringify(analysis, null, 2)}
+
+Provide a concise explanation that:
+1. States the overall similarity level (high/moderate/low)
+2. Highlights what the agent got right
+3. Identifies key missing or incorrect information
+4. Suggests specific improvements if score is not perfect
+
+Keep the explanation under 3 sentences and focus on actionable insights.
+
+Format: "The score is {score}/{scale} because {explanation}. {what matched well}. {what needs improvement or is perfect}."
+
+Example good responses:
+- "The score is 0.9/1 because the answer captures all key concepts with minor phrasing differences. The agent correctly identified the main facts and relationships. Only missing a minor detail about the specific date mentioned in the ground truth."
+- "The score is 0.5/1 because the answer is partially correct but missing crucial information. The agent correctly explained the basic concept. However, it missed the quantitative data and specific examples that were essential to the complete answer."
+- "The score is 1.0/1 because the answer perfectly matches the ground truth semantically. All key facts, relationships, and details are accurately represented. No improvements needed."
+`;
+
+// src/scorers/llm/answer-similarity/index.ts
+var ANSWER_SIMILARITY_DEFAULT_OPTIONS = {
+  requireGroundTruth: true,
+  semanticThreshold: 0.8,
+  exactMatchBonus: 0.2,
+  missingPenalty: 0.15,
+  contradictionPenalty: 1,
+  extraInfoPenalty: 0.05,
+  scale: 1
+};
+var ANSWER_SIMILARITY_INSTRUCTIONS = `
+You are a precise answer similarity evaluator for CI/CD testing. Your role is to compare agent outputs against ground truth answers to ensure consistency and accuracy in automated testing.
+
+Key Principles:
+1. Focus on semantic equivalence, not just string matching
+2. Recognize that different phrasings can convey the same information
+3. Identify missing critical information from the ground truth
+4. Detect contradictions between output and ground truth
+5. Provide actionable feedback for improving answer accuracy
+6. Be strict but fair - partial credit for partial matches
+`;
+var extractOutputSchema2 = zod.z.object({
+  outputUnits: zod.z.array(zod.z.string()),
+  groundTruthUnits: zod.z.array(zod.z.string())
+});
+var analyzeOutputSchema = zod.z.object({
+  matches: zod.z.array(
+    zod.z.object({
+      groundTruthUnit: zod.z.string(),
+      outputUnit: zod.z.string().nullable(),
+      matchType: zod.z.enum(["exact", "semantic", "partial", "missing"]),
+      explanation: zod.z.string()
+    })
+  ),
+  extraInOutput: zod.z.array(zod.z.string()),
+  contradictions: zod.z.array(
+    zod.z.object({
+      outputUnit: zod.z.string(),
+      groundTruthUnit: zod.z.string(),
+      explanation: zod.z.string()
+    })
+  )
+});
+function createAnswerSimilarityScorer({
+  model,
+  options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
+}) {
+  const mergedOptions = { ...ANSWER_SIMILARITY_DEFAULT_OPTIONS, ...options };
+  return scores.createScorer({
+    name: "Answer Similarity Scorer",
+    description: "Evaluates how similar an agent output is to a ground truth answer for CI/CD testing",
+    judge: {
+      model,
+      instructions: ANSWER_SIMILARITY_INSTRUCTIONS
+    }
+  }).preprocess({
+    description: "Extract semantic units from output and ground truth",
+    outputSchema: extractOutputSchema2,
+    createPrompt: ({ run }) => {
+      if (!run.groundTruth) {
+        if (mergedOptions.requireGroundTruth) {
+          throw new Error("Answer Similarity Scorer requires ground truth to be provided");
+        }
+        return createExtractPrompt2({
+          output: "",
+          groundTruth: ""
+        });
+      }
+      const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
+      return createExtractPrompt2({
+        output,
+        groundTruth
+      });
+    }
+  }).analyze({
+    description: "Compare semantic units between output and ground truth",
+    outputSchema: analyzeOutputSchema,
+    createPrompt: ({ results }) => {
+      const outputUnits = results.preprocessStepResult?.outputUnits || [];
+      const groundTruthUnits = results.preprocessStepResult?.groundTruthUnits || [];
+      return createAnalyzePrompt({
+        outputUnits,
+        groundTruthUnits
+      });
+    }
+  }).generateScore(({ run, results }) => {
+    if (!run.groundTruth) {
+      return 0;
+    }
+    const analysis = results.analyzeStepResult;
+    if (!analysis) {
+      return 0;
+    }
+    let score = 0;
+    const totalUnits = analysis.matches.length;
+    if (totalUnits === 0) {
+      return 0;
+    }
+    for (const match of analysis.matches) {
+      switch (match.matchType) {
+        case "exact":
+          score += 1 + mergedOptions.exactMatchBonus;
+          break;
+        case "semantic":
+          score += mergedOptions.semanticThreshold;
+          break;
+        case "partial":
+          score += mergedOptions.semanticThreshold * 0.5;
+          break;
+        case "missing":
+          score -= mergedOptions.missingPenalty;
+          break;
+      }
+    }
+    const maxPossibleScore = totalUnits * (1 + mergedOptions.exactMatchBonus);
+    score = score / maxPossibleScore;
+    const contradictionPenalty = analysis.contradictions.length * mergedOptions.contradictionPenalty;
+    score -= contradictionPenalty;
+    const extraInfoPenalty = Math.min(
+      analysis.extraInOutput.length * mergedOptions.extraInfoPenalty,
+      0.2
+      // Cap extra info penalty at 0.2
+    );
+    score -= extraInfoPenalty;
+    score = Math.max(0, Math.min(1, score));
+    return chunk7QAUEU4L_cjs.roundToTwoDecimals(score * mergedOptions.scale);
+  }).generateReason({
+    description: "Generate explanation of similarity score",
+    createPrompt: ({ run, results, score }) => {
+      if (!run.groundTruth) {
+        return "No ground truth was provided for comparison. Score is 0 by default.";
+      }
+      const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
+      return createReasonPrompt2({
+        output,
+        groundTruth,
+        score,
+        analysis: results.analyzeStepResult,
+        scale: mergedOptions.scale
+      });
+    }
+  });
+}
+
 // src/scorers/llm/faithfulness/prompts.ts
 var FAITHFULNESS_AGENT_INSTRUCTIONS = `You are a precise and thorough faithfulness evaluator. Your job is to determine if LLM outputs are factually consistent with the provided context, focusing on claim verification.

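The generateScore step added in the hunk above rewards exact matches with a bonus, gives partial credit for semantic and partial matches, deducts for missing units, normalizes against the all-exact best case, and then applies contradiction and extra-info penalties before clamping to [0, 1]. A minimal standalone sketch of that arithmetic, assuming the default values from ANSWER_SIMILARITY_DEFAULT_OPTIONS and a simplified flat match list rather than the real analyzeStepResult shape:

```ts
// Standalone sketch of the arithmetic in the generateScore step above.
// Option values mirror ANSWER_SIMILARITY_DEFAULT_OPTIONS from this diff;
// the flat MatchType[] input is a simplification for illustration only.
type MatchType = "exact" | "semantic" | "partial" | "missing";

const defaults = {
  semanticThreshold: 0.8,
  exactMatchBonus: 0.2,
  missingPenalty: 0.15,
  contradictionPenalty: 1,
  extraInfoPenalty: 0.05,
  scale: 1,
};

function similarityScore(matches: MatchType[], contradictions: number, extraInOutput: number): number {
  if (matches.length === 0) return 0;
  let score = 0;
  for (const m of matches) {
    if (m === "exact") score += 1 + defaults.exactMatchBonus;
    else if (m === "semantic") score += defaults.semanticThreshold;
    else if (m === "partial") score += defaults.semanticThreshold * 0.5;
    else score -= defaults.missingPenalty; // "missing"
  }
  // Normalize against the best case in which every unit is an exact match.
  score /= matches.length * (1 + defaults.exactMatchBonus);
  // Contradictions and extra, unrequested information are penalized afterwards;
  // the extra-info penalty is capped at 0.2.
  score -= contradictions * defaults.contradictionPenalty;
  score -= Math.min(extraInOutput * defaults.extraInfoPenalty, 0.2);
  score = Math.max(0, Math.min(1, score));
  return Math.round(score * defaults.scale * 100) / 100;
}

// One exact match, one semantic match, one missing unit, one extra statement:
// raw = (1 + 0.2) + 0.8 - 0.15 = 1.85; max = 3 * 1.2 = 3.6; 1.85 / 3.6 ≈ 0.51;
// minus the 0.05 extra-info penalty ≈ 0.46 after rounding.
console.log(similarityScore(["exact", "semantic", "missing"], 0, 1)); // ≈ 0.46
```

With the default contradictionPenalty of 1, a single contradiction drives the clamped score to 0 regardless of how well the remaining units match.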
@@ -435,7 +699,7 @@ function createFaithfulnessScorer({
     description: "Extract relevant statements from the LLM output",
     outputSchema: zod.z.array(zod.z.string()),
     createPrompt: ({ run }) => {
-      const prompt = createFaithfulnessExtractPrompt({ output:
+      const prompt = createFaithfulnessExtractPrompt({ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
       return prompt;
     }
   }).analyze({
@@ -456,13 +720,13 @@ function createFaithfulnessScorer({
       return 0;
     }
     const score = supportedClaims / totalClaims * (options?.scale || 1);
-    return
+    return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
       const prompt = createFaithfulnessReasonPrompt({
-        input:
-        output:
+        input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         context: run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
         score,
         scale: options?.scale || 1,
@@ -593,13 +857,13 @@ function createBiasScorer({ model, options }) {
     outputSchema: zod.z.object({
       opinions: zod.z.array(zod.z.string())
     }),
-    createPrompt: ({ run }) => createBiasExtractPrompt({ output:
+    createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
   }).analyze({
     description: "Score the relevance of the statements to the input",
     outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
     createPrompt: ({ run, results }) => {
       const prompt = createBiasAnalyzePrompt({
-        output:
+        output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         opinions: results.preprocessStepResult?.opinions || []
       });
       return prompt;
@@ -610,7 +874,7 @@ function createBiasScorer({ model, options }) {
     }
     const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
     const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
-    return
+    return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score * (options?.scale || 1));
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ score, results }) => {
@@ -827,7 +1091,7 @@ function createHallucinationScorer({
       claims: zod.z.array(zod.z.string())
     }),
     createPrompt: ({ run }) => {
-      const prompt = createHallucinationExtractPrompt({ output:
+      const prompt = createHallucinationExtractPrompt({ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
       return prompt;
     }
   }).analyze({
@@ -849,13 +1113,13 @@ function createHallucinationScorer({
       return 0;
     }
     const score = contradictedStatements / totalStatements * (options?.scale || 1);
-    return
+    return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
       const prompt = createHallucinationReasonPrompt({
-        input:
-        output:
+        input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         context: options?.context || [],
         score,
         scale: options?.scale || 1,
@@ -964,8 +1228,8 @@ function createToxicityScorer({ model, options }) {
     outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
     createPrompt: ({ run }) => {
       const prompt = createToxicityAnalyzePrompt({
-        input:
-        output:
+        input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
       });
       return prompt;
     }
@@ -981,7 +1245,7 @@ function createToxicityScorer({ model, options }) {
       }
     }
     const score = toxicityCount / numberOfVerdicts;
-    return
+    return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score * (options?.scale || 1));
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ results, score }) => {
@@ -1018,7 +1282,7 @@ OUTPUT REQUIREMENTS:

 You excel at identifying the difference between tools that directly serve the user's stated need versus tools that might be generally useful but weren't requested.
 `;
-var
+var createAnalyzePrompt2 = ({
   userInput,
   agentResponse,
   toolsCalled,
@@ -1069,7 +1333,7 @@ STRICT EVALUATION CRITERIA:
 Evaluate each tool that was called, or if no tools were called, evaluate whether that was the right decision.
 `;
 };
-var
+var createReasonPrompt3 = ({
   userInput,
   score,
   evaluations,
@@ -1088,7 +1352,7 @@ Provide a single, concise sentence explaining why this score was given.
 };

 // src/scorers/llm/tool-call-accuracy/index.ts
-var
+var analyzeOutputSchema2 = zod.z.object({
   evaluations: zod.z.array(
     zod.z.object({
       toolCalled: zod.z.string(),
@@ -1113,7 +1377,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     if (isInputInvalid || isOutputInvalid) {
       throw new Error("Input and output messages cannot be null or empty");
     }
-    const { tools: actualTools, toolCallInfos } =
+    const { tools: actualTools, toolCallInfos } = chunkQKR2PMLZ_cjs.extractToolCalls(run.output);
     return {
       actualTools,
       hasToolCalls: actualTools.length > 0,
@@ -1121,12 +1385,12 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     };
   }).analyze({
     description: "Analyze the appropriateness of tool selections",
-    outputSchema:
+    outputSchema: analyzeOutputSchema2,
     createPrompt: ({ run, results }) => {
-      const userInput =
-      const agentResponse =
+      const userInput = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const agentResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const toolsCalled = results.preprocessStepResult?.actualTools || [];
-      return
+      return createAnalyzePrompt2({
         userInput,
         agentResponse,
         toolsCalled,
@@ -1141,14 +1405,14 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     }
     const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
     const totalToolCalls = evaluations.length;
-    return
+    return chunkQKR2PMLZ_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
   }).generateReason({
     description: "Generate human-readable explanation of tool selection evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userInput =
+      const userInput = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const evaluations = results.analyzeStepResult?.evaluations || [];
       const missingTools = results.analyzeStepResult?.missingTools || [];
-      return
+      return createReasonPrompt3({
         userInput,
         score,
         evaluations,
@@ -1175,7 +1439,7 @@ Evaluation Guidelines:
 - Consider whether missing context might have led to a better response

 Be thorough and fair in your evaluation, considering both what context was provided and what might have been more useful.`;
-function
+function createAnalyzePrompt3({
   userQuery,
   agentResponse,
   providedContext
@@ -1260,7 +1524,7 @@ Context:
 "overallAssessment": "The context is mostly high-quality with 2 out of 3 pieces being highly relevant and used in the response"
 }`;
 }
-function
+function createReasonPrompt4({
   userQuery,
   score,
   evaluations,
@@ -1306,7 +1570,7 @@ Example responses:
 }

 // src/scorers/llm/context-relevance/index.ts
-var
+var analyzeOutputSchema3 = zod.z.object({
   evaluations: zod.z.array(
     zod.z.object({
       context_index: zod.z.number(),
@@ -1346,19 +1610,19 @@ function createContextRelevanceScorerLLM({
     }
   }).analyze({
     description: "Analyze the relevance and utility of provided context",
-    outputSchema:
+    outputSchema: analyzeOutputSchema3,
     createPrompt: ({ run }) => {
-      const userQuery =
-      const agentResponse =
+      const userQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const agentResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       if (context.length === 0) {
-        return
+        return createAnalyzePrompt3({
           userQuery,
           agentResponse,
           providedContext: ["[No context was provided for evaluation]"]
         });
       }
-      return
+      return createAnalyzePrompt3({
         userQuery,
         agentResponse,
         providedContext: context
@@ -1397,18 +1661,18 @@ function createContextRelevanceScorerLLM({
     const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
     const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
     const scaledScore = finalScore * (options.scale || 1);
-    return
+    return chunkQKR2PMLZ_cjs.roundToTwoDecimals(scaledScore);
   }).generateReason({
     description: "Generate human-readable explanation of context relevance evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userQuery =
+      const userQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       if (context.length === 0) {
         return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
       }
       const evaluations = results.analyzeStepResult?.evaluations || [];
       const missingContext = results.analyzeStepResult?.missingContext || [];
-      return
+      return createReasonPrompt4({
         userQuery,
         score,
         evaluations,
@@ -1570,8 +1834,8 @@ function createContextPrecisionScorer({
     description: "Evaluate the relevance of each context piece for generating the expected output",
     outputSchema: contextRelevanceOutputSchema,
     createPrompt: ({ run }) => {
-      const input =
-      const output =
+      const input = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       if (context.length === 0) {
         throw new Error("No context available for evaluation");
@@ -1604,12 +1868,12 @@ function createContextPrecisionScorer({
     }
     const map = sumPrecision / relevantCount;
     const score = map * (options.scale || 1);
-    return
+    return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the context precision results",
     createPrompt: ({ run, results, score }) => {
-      const input =
-      const output =
+      const input = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       return createContextPrecisionReasonPrompt({
         input,
@@ -1641,7 +1905,7 @@ Noise Impact Assessment:
 - **Severe Impact (0.0-0.1)**: Response is substantially worse, incorrect, or completely derailed

 Be thorough in comparing both responses and identifying specific ways the noise affected the agent's performance.`;
-function
+function createAnalyzePrompt4({
   userQuery,
   baselineResponse,
   noisyQuery,
@@ -1760,7 +2024,7 @@ Noisy Response: "Regular exercise improves cardiovascular health and strengthens
 "robustnessScore": 0.85
 }`;
 }
-function
+function createReasonPrompt5({
   userQuery,
   score,
   dimensions,
@@ -1815,7 +2079,7 @@ Example responses:
 }

 // src/scorers/llm/noise-sensitivity/index.ts
-var
+var analyzeOutputSchema4 = zod.z.object({
   dimensions: zod.z.array(
     zod.z.object({
       dimension: zod.z.string(),
@@ -1859,14 +2123,14 @@ function createNoiseSensitivityScorerLLM({
     }
   }).analyze({
     description: "Analyze the impact of noise on agent response quality",
-    outputSchema:
+    outputSchema: analyzeOutputSchema4,
     createPrompt: ({ run }) => {
-      const originalQuery =
-      const noisyResponse =
+      const originalQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const noisyResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       if (!originalQuery || !noisyResponse) {
         throw new Error("Both original query and noisy response are required for evaluation");
       }
-      return
+      return createAnalyzePrompt4({
         userQuery: originalQuery,
         baselineResponse: options.baselineResponse,
         noisyQuery: options.noisyQuery,
@@ -1905,16 +2169,16 @@ function createNoiseSensitivityScorerLLM({
     const majorIssues = analysisResult.majorIssues || [];
     const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
     finalScore = Math.max(0, finalScore - issuesPenalty);
-    return
+    return chunkQKR2PMLZ_cjs.roundToTwoDecimals(finalScore);
   }).generateReason({
     description: "Generate human-readable explanation of noise sensitivity evaluation",
     createPrompt: ({ run, results, score }) => {
-      const originalQuery =
+      const originalQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const analysisResult = results.analyzeStepResult;
       if (!analysisResult) {
         throw new Error("Analysis step failed to produce results for reason generation");
       }
-      return
+      return createReasonPrompt5({
         userQuery: originalQuery,
         score,
         dimensions: analysisResult.dimensions || [],
@@ -1943,7 +2207,7 @@ Evaluation Guidelines:
 - Be objective and focus on alignment rather than response quality

 Score each dimension from 0.0 (completely misaligned) to 1.0 (perfectly aligned).`;
-function
+function createAnalyzePrompt5({
   userPrompt,
   systemPrompt,
   agentResponse,
@@ -2095,7 +2359,7 @@ Agent Response: "def factorial(n):
 "overallAssessment": "The response perfectly aligns with the prompt, providing a correct Python factorial function with the requested error handling for negative numbers"
 }`;
 }
-function
+function createReasonPrompt6({
   userPrompt,
   systemPrompt,
   score,
@@ -2158,7 +2422,7 @@ Example responses:
 }

 // src/scorers/llm/prompt-alignment/index.ts
-var
+var analyzeOutputSchema5 = zod.z.object({
   intentAlignment: zod.z.object({
     score: zod.z.number().min(0).max(1),
     primaryIntent: zod.z.string(),
@@ -2230,11 +2494,11 @@ function createPromptAlignmentScorerLLM({
|
|
|
2230
2494
|
}
|
|
2231
2495
|
}).analyze({
|
|
2232
2496
|
description: "Analyze prompt-response alignment across multiple dimensions",
|
|
2233
|
-
outputSchema:
|
|
2497
|
+
outputSchema: analyzeOutputSchema5,
|
|
2234
2498
|
createPrompt: ({ run }) => {
|
|
2235
|
-
const userPrompt =
|
|
2236
|
-
const systemPrompt =
|
|
2237
|
-
const agentResponse =
|
|
2499
|
+
const userPrompt = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2500
|
+
const systemPrompt = chunkQKR2PMLZ_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
2501
|
+
const agentResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
2238
2502
|
if (evaluationMode === "user" && !userPrompt) {
|
|
2239
2503
|
throw new Error("User prompt is required for user prompt alignment scoring");
|
|
2240
2504
|
}
|
|
@@ -2247,7 +2511,7 @@ function createPromptAlignmentScorerLLM({
|
|
|
2247
2511
|
if (!agentResponse) {
|
|
2248
2512
|
throw new Error("Agent response is required for prompt alignment scoring");
|
|
2249
2513
|
}
|
|
2250
|
-
return
|
|
2514
|
+
return createAnalyzePrompt5({
|
|
2251
2515
|
userPrompt,
|
|
2252
2516
|
systemPrompt,
|
|
2253
2517
|
agentResponse,
|
|
@@ -2270,17 +2534,17 @@ function createPromptAlignmentScorerLLM({
|
|
|
2270
2534
|
weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
|
|
2271
2535
|
}
|
|
2272
2536
|
const finalScore = weightedScore * scale;
|
|
2273
|
-
return
|
|
2537
|
+
return chunkQKR2PMLZ_cjs.roundToTwoDecimals(finalScore);
|
|
2274
2538
|
}).generateReason({
|
|
2275
2539
|
description: "Generate human-readable explanation of prompt alignment evaluation",
|
|
2276
2540
|
createPrompt: ({ run, results, score }) => {
|
|
2277
|
-
const userPrompt =
|
|
2278
|
-
const systemPrompt =
|
|
2541
|
+
const userPrompt = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
|
|
2542
|
+
const systemPrompt = chunkQKR2PMLZ_cjs.getCombinedSystemPrompt(run.input) ?? "";
|
|
2279
2543
|
const analysis = results.analyzeStepResult;
|
|
2280
2544
|
if (!analysis) {
|
|
2281
2545
|
return `Unable to analyze prompt alignment. Score: ${score}`;
|
|
2282
2546
|
}
|
|
2283
|
-
return
|
|
2547
|
+
return createReasonPrompt6({
|
|
2284
2548
|
userPrompt,
|
|
2285
2549
|
systemPrompt,
|
|
2286
2550
|
score,
|
|
@@ -2293,8 +2557,11 @@ function createPromptAlignmentScorerLLM({
|
|
|
2293
2557
|
}
|
|
2294
2558
|
|
|
2295
2559
|
exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
|
|
2560
|
+
exports.ANSWER_SIMILARITY_DEFAULT_OPTIONS = ANSWER_SIMILARITY_DEFAULT_OPTIONS;
|
|
2561
|
+
exports.ANSWER_SIMILARITY_INSTRUCTIONS = ANSWER_SIMILARITY_INSTRUCTIONS;
|
|
2296
2562
|
exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;
|
|
2297
2563
|
exports.createAnswerRelevancyScorer = createAnswerRelevancyScorer;
|
|
2564
|
+
exports.createAnswerSimilarityScorer = createAnswerSimilarityScorer;
|
|
2298
2565
|
exports.createBiasScorer = createBiasScorer;
|
|
2299
2566
|
exports.createContextPrecisionScorer = createContextPrecisionScorer;
|
|
2300
2567
|
exports.createContextRelevanceScorerLLM = createContextRelevanceScorerLLM;
|