@mastra/evals 0.13.5 → 0.13.6
This diff shows the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
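The headline change in 0.13.6 is a new answer-similarity scorer, createAnswerSimilarityScorer, added to the LLM scorers bundle (see the dist/scorers/llm diffs below). A hypothetical usage sketch follows: the factory signature and option names come from the diffed code, while the import subpath and the model provider wiring are assumptions that may not match the package's actual exports.

```ts
// Hypothetical usage sketch: only createAnswerSimilarityScorer({ model, options })
// and the option names are taken from the diff; the import subpath and the model
// provider are assumptions.
import { createAnswerSimilarityScorer } from "@mastra/evals/scorers/llm";
import { openai } from "@ai-sdk/openai"; // assumed model provider

const scorer = createAnswerSimilarityScorer({
  model: openai("gpt-4o-mini"),
  options: {
    // Partial override; remaining fields fall back to ANSWER_SIMILARITY_DEFAULT_OPTIONS
    // (semanticThreshold 0.8, missingPenalty 0.15, contradictionPenalty 1, ...).
    exactMatchBonus: 0.1,
    extraInfoPenalty: 0.1,
  },
});

// Note: the run being scored must carry a groundTruth value (string or JSON-serializable);
// with requireGroundTruth left at its default of true, the preprocess step throws otherwise.
```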
- package/CHANGELOG.md +22 -0
- package/dist/{chunk-4LRZVFXR.js → chunk-KHEXN75Q.js} +72 -3
- package/dist/chunk-KHEXN75Q.js.map +1 -0
- package/dist/{chunk-EKSPLMYP.cjs → chunk-QKR2PMLZ.cjs} +79 -2
- package/dist/chunk-QKR2PMLZ.cjs.map +1 -0
- package/dist/{dist-QNM75ISG.cjs → dist-ALHZKHK6.cjs} +9 -9
- package/dist/{dist-QNM75ISG.cjs.map → dist-ALHZKHK6.cjs.map} +1 -1
- package/dist/{dist-KXHZV6E4.js → dist-HPW4UI62.js} +9 -9
- package/dist/{dist-KXHZV6E4.js.map → dist-HPW4UI62.js.map} +1 -1
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/dist/scorers/code/index.cjs +2 -2
- package/dist/scorers/code/index.js +1 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts +34 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/prompts.d.ts +29 -0
- package/dist/scorers/llm/answer-similarity/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/index.cjs +335 -68
- package/dist/scorers/llm/index.cjs.map +1 -1
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/index.js +291 -27
- package/dist/scorers/llm/index.js.map +1 -1
- package/dist/scorers/utils.cjs +60 -0
- package/dist/scorers/utils.cjs.map +1 -0
- package/dist/scorers/utils.d.ts +1 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +3 -0
- package/dist/scorers/utils.js.map +1 -0
- package/package.json +14 -4
- package/dist/chunk-4LRZVFXR.js.map +0 -1
- package/dist/chunk-EKSPLMYP.cjs.map +0 -1
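Before the hunks themselves, here is the arithmetic behind the new scorer's generateScore step (added in the scorers/llm/index.js hunk below) as a standalone TypeScript sketch. It mirrors the logic and the ANSWER_SIMILARITY_DEFAULT_OPTIONS values visible in the diff; the match counts in the example are hypothetical, and the rounding helper is an assumed equivalent of roundToTwoDecimals.

```ts
// Standalone sketch of the generateScore arithmetic added in the diff below;
// not the package's export. Defaults mirror ANSWER_SIMILARITY_DEFAULT_OPTIONS.
type MatchType = "exact" | "semantic" | "partial" | "missing";

interface SimilarityAnalysis {
  matches: { matchType: MatchType }[];
  extraInOutput: string[];
  contradictions: unknown[];
}

const DEFAULTS = {
  semanticThreshold: 0.8,
  exactMatchBonus: 0.2,
  missingPenalty: 0.15,
  contradictionPenalty: 1,
  extraInfoPenalty: 0.05,
  scale: 1,
};

function answerSimilarityScore(analysis: SimilarityAnalysis, opts = DEFAULTS): number {
  const totalUnits = analysis.matches.length;
  if (totalUnits === 0) return 0;

  let score = 0;
  for (const match of analysis.matches) {
    if (match.matchType === "exact") score += 1 + opts.exactMatchBonus;
    else if (match.matchType === "semantic") score += opts.semanticThreshold;
    else if (match.matchType === "partial") score += opts.semanticThreshold * 0.5;
    else score -= opts.missingPenalty; // "missing"
  }

  // Normalize against the best case (every unit an exact match), then apply penalties.
  score /= totalUnits * (1 + opts.exactMatchBonus);
  score -= analysis.contradictions.length * opts.contradictionPenalty;
  score -= Math.min(analysis.extraInOutput.length * opts.extraInfoPenalty, 0.2); // extra-info cap

  score = Math.max(0, Math.min(1, score));
  return Math.round(score * opts.scale * 100) / 100; // assumed equivalent of roundToTwoDecimals
}

// Hypothetical analysis: one exact, one semantic, one missing unit, plus one extra statement.
// (1.2 + 0.8 - 0.15) / 3.6 - 0.05 ≈ 0.46
console.log(
  answerSimilarityScore({
    matches: [{ matchType: "exact" }, { matchType: "semantic" }, { matchType: "missing" }],
    extraInOutput: ["unrequested detail"],
    contradictions: [],
  })
);
```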
package/dist/scorers/llm/index.d.ts.map
@@ -1 +1 @@
-{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC"}
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC"}
package/dist/scorers/llm/index.js
@@ -1,5 +1,5 @@
 import { roundToTwoDecimals } from '../../chunk-QTWX6TKR.js';
-import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls, getCombinedSystemPrompt } from '../../chunk-
+import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls, getCombinedSystemPrompt } from '../../chunk-KHEXN75Q.js';
 import { createScorer } from '@mastra/core/scores';
 import { z } from 'zod';
 
@@ -264,6 +264,270 @@ function createAnswerRelevancyScorer({
   });
 }
 
+// src/scorers/llm/answer-similarity/prompts.ts
+var createExtractPrompt2 = ({ output, groundTruth }) => `
+Extract and normalize the semantic units (facts, claims, concepts) from both the agent output and the ground truth answer.
+
+Break down each text into its core semantic components while preserving meaning and relationships.
+Focus on extracting:
+- Key facts and claims
+- Important concepts and entities
+- Relationships between concepts
+- Quantitative information
+- Qualitative descriptions
+
+Guidelines:
+- Preserve the semantic meaning, not just keywords
+- Group related information together
+- Normalize different phrasings of the same concept
+- Keep numerical values and units together
+- Don't over-split compound concepts that belong together
+
+Return ONLY valid JSON with two arrays of semantic units. Do not include any text before or after the JSON.
+
+Agent Output:
+${output}
+
+Ground Truth:
+${groundTruth}
+
+Required JSON format (return valid JSON only):
+{
+"outputUnits": [],
+"groundTruthUnits": []
+}
+
+Important: Return valid JSON only, no additional text or explanations.
+`;
+var createAnalyzePrompt = ({
+  outputUnits,
+  groundTruthUnits
+}) => `
+Compare the semantic units from the agent output against the ground truth to evaluate answer similarity.
+
+Analyze each ground truth unit and determine:
+1. Whether it has a matching unit in the output (exact or semantic match)
+2. The quality of the match (exact, semantic, partial, missing)
+3. Whether there are contradictions
+
+Also identify:
+- Extra information in the output not present in ground truth
+- Any contradictory statements between output and ground truth
+
+Matching Guidelines:
+- "exact": The same information expressed identically or with minor wording differences
+- "semantic": The same concept or fact expressed differently but with equivalent meaning
+- "partial": Some overlap but missing important details or context
+- "missing": No corresponding information found in the output
+- "contradiction": Information that directly conflicts with the ground truth (wrong facts, incorrect names, false claims)
+
+CRITICAL: If the output contains factually incorrect information (wrong names, wrong facts, opposite claims), you MUST identify contradictions and mark relevant matches as "missing" while adding entries to the contradictions array.
+
+Return ONLY valid JSON with detailed analysis. Do not include any text before or after the JSON.
+
+Output Units:
+${JSON.stringify(outputUnits, null, 2)}
+
+Ground Truth Units:
+${JSON.stringify(groundTruthUnits, null, 2)}
+
+Required JSON format (copy this structure exactly):
+{
+"matches": [
+{
+"groundTruthUnit": "unit from ground truth",
+"outputUnit": "corresponding unit from output or null if missing",
+"matchType": "exact",
+"explanation": "brief explanation of the match quality"
+}
+],
+"extraInOutput": [],
+"contradictions": []
+}
+
+Important:
+- matchType must be exactly one of: "exact", "semantic", "partial", "missing"
+- outputUnit must be a string or null (not undefined)
+- All arrays must be present even if empty
+- Return valid JSON only, no additional text
+`;
+var createReasonPrompt2 = ({
+  output,
+  groundTruth,
+  score,
+  analysis,
+  scale
+}) => `
+Generate a clear, actionable explanation of the answer similarity score.
+
+Context:
+- Agent Output: ${output}
+- Ground Truth: ${groundTruth}
+- Score: ${score}/${scale}
+- Analysis: ${JSON.stringify(analysis, null, 2)}
+
+Provide a concise explanation that:
+1. States the overall similarity level (high/moderate/low)
+2. Highlights what the agent got right
+3. Identifies key missing or incorrect information
+4. Suggests specific improvements if score is not perfect
+
+Keep the explanation under 3 sentences and focus on actionable insights.
+
+Format: "The score is {score}/{scale} because {explanation}. {what matched well}. {what needs improvement or is perfect}."
+
+Example good responses:
+- "The score is 0.9/1 because the answer captures all key concepts with minor phrasing differences. The agent correctly identified the main facts and relationships. Only missing a minor detail about the specific date mentioned in the ground truth."
+- "The score is 0.5/1 because the answer is partially correct but missing crucial information. The agent correctly explained the basic concept. However, it missed the quantitative data and specific examples that were essential to the complete answer."
+- "The score is 1.0/1 because the answer perfectly matches the ground truth semantically. All key facts, relationships, and details are accurately represented. No improvements needed."
+`;
+
+// src/scorers/llm/answer-similarity/index.ts
+var ANSWER_SIMILARITY_DEFAULT_OPTIONS = {
+  requireGroundTruth: true,
+  semanticThreshold: 0.8,
+  exactMatchBonus: 0.2,
+  missingPenalty: 0.15,
+  contradictionPenalty: 1,
+  extraInfoPenalty: 0.05,
+  scale: 1
+};
+var ANSWER_SIMILARITY_INSTRUCTIONS = `
+You are a precise answer similarity evaluator for CI/CD testing. Your role is to compare agent outputs against ground truth answers to ensure consistency and accuracy in automated testing.
+
+Key Principles:
+1. Focus on semantic equivalence, not just string matching
+2. Recognize that different phrasings can convey the same information
+3. Identify missing critical information from the ground truth
+4. Detect contradictions between output and ground truth
+5. Provide actionable feedback for improving answer accuracy
+6. Be strict but fair - partial credit for partial matches
+`;
+var extractOutputSchema2 = z.object({
+  outputUnits: z.array(z.string()),
+  groundTruthUnits: z.array(z.string())
+});
+var analyzeOutputSchema = z.object({
+  matches: z.array(
+    z.object({
+      groundTruthUnit: z.string(),
+      outputUnit: z.string().nullable(),
+      matchType: z.enum(["exact", "semantic", "partial", "missing"]),
+      explanation: z.string()
+    })
+  ),
+  extraInOutput: z.array(z.string()),
+  contradictions: z.array(
+    z.object({
+      outputUnit: z.string(),
+      groundTruthUnit: z.string(),
+      explanation: z.string()
+    })
+  )
+});
+function createAnswerSimilarityScorer({
+  model,
+  options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
+}) {
+  const mergedOptions = { ...ANSWER_SIMILARITY_DEFAULT_OPTIONS, ...options };
+  return createScorer({
+    name: "Answer Similarity Scorer",
+    description: "Evaluates how similar an agent output is to a ground truth answer for CI/CD testing",
+    judge: {
+      model,
+      instructions: ANSWER_SIMILARITY_INSTRUCTIONS
+    }
+  }).preprocess({
+    description: "Extract semantic units from output and ground truth",
+    outputSchema: extractOutputSchema2,
+    createPrompt: ({ run }) => {
+      if (!run.groundTruth) {
+        if (mergedOptions.requireGroundTruth) {
+          throw new Error("Answer Similarity Scorer requires ground truth to be provided");
+        }
+        return createExtractPrompt2({
+          output: "",
+          groundTruth: ""
+        });
+      }
+      const output = getAssistantMessageFromRunOutput(run.output) ?? "";
+      const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
+      return createExtractPrompt2({
+        output,
+        groundTruth
+      });
+    }
+  }).analyze({
+    description: "Compare semantic units between output and ground truth",
+    outputSchema: analyzeOutputSchema,
+    createPrompt: ({ results }) => {
+      const outputUnits = results.preprocessStepResult?.outputUnits || [];
+      const groundTruthUnits = results.preprocessStepResult?.groundTruthUnits || [];
+      return createAnalyzePrompt({
+        outputUnits,
+        groundTruthUnits
+      });
+    }
+  }).generateScore(({ run, results }) => {
+    if (!run.groundTruth) {
+      return 0;
+    }
+    const analysis = results.analyzeStepResult;
+    if (!analysis) {
+      return 0;
+    }
+    let score = 0;
+    const totalUnits = analysis.matches.length;
+    if (totalUnits === 0) {
+      return 0;
+    }
+    for (const match of analysis.matches) {
+      switch (match.matchType) {
+        case "exact":
+          score += 1 + mergedOptions.exactMatchBonus;
+          break;
+        case "semantic":
+          score += mergedOptions.semanticThreshold;
+          break;
+        case "partial":
+          score += mergedOptions.semanticThreshold * 0.5;
+          break;
+        case "missing":
+          score -= mergedOptions.missingPenalty;
+          break;
+      }
+    }
+    const maxPossibleScore = totalUnits * (1 + mergedOptions.exactMatchBonus);
+    score = score / maxPossibleScore;
+    const contradictionPenalty = analysis.contradictions.length * mergedOptions.contradictionPenalty;
+    score -= contradictionPenalty;
+    const extraInfoPenalty = Math.min(
+      analysis.extraInOutput.length * mergedOptions.extraInfoPenalty,
+      0.2
+      // Cap extra info penalty at 0.2
+    );
+    score -= extraInfoPenalty;
+    score = Math.max(0, Math.min(1, score));
+    return roundToTwoDecimals(score * mergedOptions.scale);
+  }).generateReason({
+    description: "Generate explanation of similarity score",
+    createPrompt: ({ run, results, score }) => {
+      if (!run.groundTruth) {
+        return "No ground truth was provided for comparison. Score is 0 by default.";
+      }
+      const output = getAssistantMessageFromRunOutput(run.output) ?? "";
+      const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
+      return createReasonPrompt2({
+        output,
+        groundTruth,
+        score,
+        analysis: results.analyzeStepResult,
+        scale: mergedOptions.scale
+      });
+    }
+  });
+}
+
 // src/scorers/llm/faithfulness/prompts.ts
 var FAITHFULNESS_AGENT_INSTRUCTIONS = `You are a precise and thorough faithfulness evaluator. Your job is to determine if LLM outputs are factually consistent with the provided context, focusing on claim verification.
 
@@ -1016,7 +1280,7 @@ OUTPUT REQUIREMENTS:
 
 You excel at identifying the difference between tools that directly serve the user's stated need versus tools that might be generally useful but weren't requested.
 `;
-var
+var createAnalyzePrompt2 = ({
 userInput,
 agentResponse,
 toolsCalled,
@@ -1067,7 +1331,7 @@ STRICT EVALUATION CRITERIA:
 Evaluate each tool that was called, or if no tools were called, evaluate whether that was the right decision.
 `;
 };
-var
+var createReasonPrompt3 = ({
 userInput,
 score,
 evaluations,
@@ -1086,7 +1350,7 @@ Provide a single, concise sentence explaining why this score was given.
 };
 
 // src/scorers/llm/tool-call-accuracy/index.ts
-var
+var analyzeOutputSchema2 = z.object({
 evaluations: z.array(
 z.object({
 toolCalled: z.string(),
@@ -1119,12 +1383,12 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
 };
 }).analyze({
 description: "Analyze the appropriateness of tool selections",
-outputSchema:
+outputSchema: analyzeOutputSchema2,
 createPrompt: ({ run, results }) => {
 const userInput = getUserMessageFromRunInput(run.input) ?? "";
 const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
 const toolsCalled = results.preprocessStepResult?.actualTools || [];
-return
+return createAnalyzePrompt2({
 userInput,
 agentResponse,
 toolsCalled,
@@ -1146,7 +1410,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
 const userInput = getUserMessageFromRunInput(run.input) ?? "";
 const evaluations = results.analyzeStepResult?.evaluations || [];
 const missingTools = results.analyzeStepResult?.missingTools || [];
-return
+return createReasonPrompt3({
 userInput,
 score,
 evaluations,
@@ -1173,7 +1437,7 @@ Evaluation Guidelines:
 - Consider whether missing context might have led to a better response
 
 Be thorough and fair in your evaluation, considering both what context was provided and what might have been more useful.`;
-function
+function createAnalyzePrompt3({
 userQuery,
 agentResponse,
 providedContext
@@ -1258,7 +1522,7 @@ Context:
 "overallAssessment": "The context is mostly high-quality with 2 out of 3 pieces being highly relevant and used in the response"
 }`;
 }
-function
+function createReasonPrompt4({
 userQuery,
 score,
 evaluations,
@@ -1304,7 +1568,7 @@ Example responses:
 }
 
 // src/scorers/llm/context-relevance/index.ts
-var
+var analyzeOutputSchema3 = z.object({
 evaluations: z.array(
 z.object({
 context_index: z.number(),
@@ -1344,19 +1608,19 @@ function createContextRelevanceScorerLLM({
 }
 }).analyze({
 description: "Analyze the relevance and utility of provided context",
-outputSchema:
+outputSchema: analyzeOutputSchema3,
 createPrompt: ({ run }) => {
 const userQuery = getUserMessageFromRunInput(run.input) ?? "";
 const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
 const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
 if (context.length === 0) {
-return
+return createAnalyzePrompt3({
 userQuery,
 agentResponse,
 providedContext: ["[No context was provided for evaluation]"]
 });
 }
-return
+return createAnalyzePrompt3({
 userQuery,
 agentResponse,
 providedContext: context
@@ -1406,7 +1670,7 @@ function createContextRelevanceScorerLLM({
 }
 const evaluations = results.analyzeStepResult?.evaluations || [];
 const missingContext = results.analyzeStepResult?.missingContext || [];
-return
+return createReasonPrompt4({
 userQuery,
 score,
 evaluations,
@@ -1639,7 +1903,7 @@ Noise Impact Assessment:
 - **Severe Impact (0.0-0.1)**: Response is substantially worse, incorrect, or completely derailed
 
 Be thorough in comparing both responses and identifying specific ways the noise affected the agent's performance.`;
-function
+function createAnalyzePrompt4({
 userQuery,
 baselineResponse,
 noisyQuery,
@@ -1758,7 +2022,7 @@ Noisy Response: "Regular exercise improves cardiovascular health and strengthens
 "robustnessScore": 0.85
 }`;
 }
-function
+function createReasonPrompt5({
 userQuery,
 score,
 dimensions,
@@ -1813,7 +2077,7 @@ Example responses:
 }
 
 // src/scorers/llm/noise-sensitivity/index.ts
-var
+var analyzeOutputSchema4 = z.object({
 dimensions: z.array(
 z.object({
 dimension: z.string(),
@@ -1857,14 +2121,14 @@ function createNoiseSensitivityScorerLLM({
 }
 }).analyze({
 description: "Analyze the impact of noise on agent response quality",
-outputSchema:
+outputSchema: analyzeOutputSchema4,
 createPrompt: ({ run }) => {
 const originalQuery = getUserMessageFromRunInput(run.input) ?? "";
 const noisyResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
 if (!originalQuery || !noisyResponse) {
 throw new Error("Both original query and noisy response are required for evaluation");
 }
-return
+return createAnalyzePrompt4({
 userQuery: originalQuery,
 baselineResponse: options.baselineResponse,
 noisyQuery: options.noisyQuery,
@@ -1912,7 +2176,7 @@ function createNoiseSensitivityScorerLLM({
 if (!analysisResult) {
 throw new Error("Analysis step failed to produce results for reason generation");
 }
-return
+return createReasonPrompt5({
 userQuery: originalQuery,
 score,
 dimensions: analysisResult.dimensions || [],
@@ -1941,7 +2205,7 @@ Evaluation Guidelines:
 - Be objective and focus on alignment rather than response quality
 
 Score each dimension from 0.0 (completely misaligned) to 1.0 (perfectly aligned).`;
-function
+function createAnalyzePrompt5({
 userPrompt,
 systemPrompt,
 agentResponse,
@@ -2093,7 +2357,7 @@ Agent Response: "def factorial(n):
 "overallAssessment": "The response perfectly aligns with the prompt, providing a correct Python factorial function with the requested error handling for negative numbers"
 }`;
 }
-function
+function createReasonPrompt6({
 userPrompt,
 systemPrompt,
 score,
@@ -2156,7 +2420,7 @@ Example responses:
 }
 
 // src/scorers/llm/prompt-alignment/index.ts
-var
+var analyzeOutputSchema5 = z.object({
 intentAlignment: z.object({
 score: z.number().min(0).max(1),
 primaryIntent: z.string(),
@@ -2228,7 +2492,7 @@ function createPromptAlignmentScorerLLM({
 }
 }).analyze({
 description: "Analyze prompt-response alignment across multiple dimensions",
-outputSchema:
+outputSchema: analyzeOutputSchema5,
 createPrompt: ({ run }) => {
 const userPrompt = getUserMessageFromRunInput(run.input) ?? "";
 const systemPrompt = getCombinedSystemPrompt(run.input) ?? "";
@@ -2245,7 +2509,7 @@ function createPromptAlignmentScorerLLM({
 if (!agentResponse) {
 throw new Error("Agent response is required for prompt alignment scoring");
 }
-return
+return createAnalyzePrompt5({
 userPrompt,
 systemPrompt,
 agentResponse,
@@ -2278,7 +2542,7 @@ function createPromptAlignmentScorerLLM({
 if (!analysis) {
 return `Unable to analyze prompt alignment. Score: ${score}`;
 }
-return
+return createReasonPrompt6({
 userPrompt,
 systemPrompt,
 score,
@@ -2290,6 +2554,6 @@ function createPromptAlignmentScorerLLM({
 });
 }
 
-export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createBiasScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createToolCallAccuracyScorerLLM, createToxicityScorer };
+export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createToolCallAccuracyScorerLLM, createToxicityScorer };
 //# sourceMappingURL=index.js.map
 //# sourceMappingURL=index.js.map