@mastra/evals 0.14.3-alpha.0 → 1.0.0-beta.0
This diff compares the published contents of two package versions as they appear in their public registries, and is provided for informational purposes only.
- package/CHANGELOG.md +36 -9
- package/README.md +19 -159
- package/dist/{chunk-KHEXN75Q.js → chunk-CCLM7KPF.js} +45 -21
- package/dist/chunk-CCLM7KPF.js.map +1 -0
- package/dist/{chunk-QKR2PMLZ.cjs → chunk-TPQLLHZW.cjs} +46 -21
- package/dist/chunk-TPQLLHZW.cjs.map +1 -0
- package/dist/scorers/code/completeness/index.d.ts +1 -1
- package/dist/scorers/code/completeness/index.d.ts.map +1 -1
- package/dist/scorers/code/content-similarity/index.d.ts +1 -1
- package/dist/scorers/code/content-similarity/index.d.ts.map +1 -1
- package/dist/scorers/code/keyword-coverage/index.d.ts +1 -1
- package/dist/scorers/code/keyword-coverage/index.d.ts.map +1 -1
- package/dist/scorers/code/textual-difference/index.d.ts +1 -1
- package/dist/scorers/code/textual-difference/index.d.ts.map +1 -1
- package/dist/scorers/code/tone/index.d.ts +1 -1
- package/dist/scorers/code/tone/index.d.ts.map +1 -1
- package/dist/scorers/code/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/code/tool-call-accuracy/index.d.ts.map +1 -1
- package/dist/scorers/llm/answer-relevancy/index.d.ts +1 -1
- package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
- package/dist/scorers/llm/bias/index.d.ts +2 -2
- package/dist/scorers/llm/bias/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-precision/index.d.ts +3 -3
- package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-relevance/index.d.ts +3 -3
- package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +2 -2
- package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +2 -2
- package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +2 -2
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +2 -2
- package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
- package/dist/scorers/{llm → prebuilt}/index.cjs +479 -62
- package/dist/scorers/prebuilt/index.cjs.map +1 -0
- package/dist/scorers/prebuilt/index.d.ts +3 -0
- package/dist/scorers/prebuilt/index.d.ts.map +1 -0
- package/dist/scorers/{llm → prebuilt}/index.js +419 -15
- package/dist/scorers/prebuilt/index.js.map +1 -0
- package/dist/scorers/utils.cjs +21 -17
- package/dist/scorers/utils.d.ts +21 -11
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +12 -58
- package/dist/attachListeners.d.ts +0 -4
- package/dist/attachListeners.d.ts.map +0 -1
- package/dist/chunk-7QAUEU4L.cjs +0 -10
- package/dist/chunk-7QAUEU4L.cjs.map +0 -1
- package/dist/chunk-EMMSS5I5.cjs +0 -37
- package/dist/chunk-EMMSS5I5.cjs.map +0 -1
- package/dist/chunk-G3PMV62Z.js +0 -33
- package/dist/chunk-G3PMV62Z.js.map +0 -1
- package/dist/chunk-IUSAD2BW.cjs +0 -19
- package/dist/chunk-IUSAD2BW.cjs.map +0 -1
- package/dist/chunk-KHEXN75Q.js.map +0 -1
- package/dist/chunk-QKR2PMLZ.cjs.map +0 -1
- package/dist/chunk-QTWX6TKR.js +0 -8
- package/dist/chunk-QTWX6TKR.js.map +0 -1
- package/dist/chunk-YGTIO3J5.js +0 -17
- package/dist/chunk-YGTIO3J5.js.map +0 -1
- package/dist/dist-LDTK3TIP.cjs +0 -16759
- package/dist/dist-LDTK3TIP.cjs.map +0 -1
- package/dist/dist-OWYZEOJK.js +0 -16737
- package/dist/dist-OWYZEOJK.js.map +0 -1
- package/dist/evaluation.d.ts +0 -8
- package/dist/evaluation.d.ts.map +0 -1
- package/dist/index.cjs +0 -93
- package/dist/index.cjs.map +0 -1
- package/dist/index.d.ts +0 -3
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js +0 -89
- package/dist/index.js.map +0 -1
- package/dist/magic-string.es-7ORA5OGR.js +0 -1305
- package/dist/magic-string.es-7ORA5OGR.js.map +0 -1
- package/dist/magic-string.es-NZ2XWFKN.cjs +0 -1311
- package/dist/magic-string.es-NZ2XWFKN.cjs.map +0 -1
- package/dist/metrics/index.d.ts +0 -4
- package/dist/metrics/index.d.ts.map +0 -1
- package/dist/metrics/judge/index.cjs +0 -12
- package/dist/metrics/judge/index.cjs.map +0 -1
- package/dist/metrics/judge/index.d.ts +0 -7
- package/dist/metrics/judge/index.d.ts.map +0 -1
- package/dist/metrics/judge/index.js +0 -3
- package/dist/metrics/judge/index.js.map +0 -1
- package/dist/metrics/llm/answer-relevancy/index.d.ts +0 -16
- package/dist/metrics/llm/answer-relevancy/index.d.ts.map +0 -1
- package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/answer-relevancy/prompts.d.ts +0 -19
- package/dist/metrics/llm/answer-relevancy/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/bias/index.d.ts +0 -14
- package/dist/metrics/llm/bias/index.d.ts.map +0 -1
- package/dist/metrics/llm/bias/metricJudge.d.ts +0 -14
- package/dist/metrics/llm/bias/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/bias/prompts.d.ts +0 -14
- package/dist/metrics/llm/bias/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/context-position/index.d.ts +0 -16
- package/dist/metrics/llm/context-position/index.d.ts.map +0 -1
- package/dist/metrics/llm/context-position/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/context-position/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/context-position/prompts.d.ts +0 -17
- package/dist/metrics/llm/context-position/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/context-precision/index.d.ts +0 -16
- package/dist/metrics/llm/context-precision/index.d.ts.map +0 -1
- package/dist/metrics/llm/context-precision/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/context-precision/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/context-precision/prompts.d.ts +0 -17
- package/dist/metrics/llm/context-precision/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/context-relevancy/index.d.ts +0 -16
- package/dist/metrics/llm/context-relevancy/index.d.ts.map +0 -1
- package/dist/metrics/llm/context-relevancy/metricJudge.d.ts +0 -16
- package/dist/metrics/llm/context-relevancy/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/context-relevancy/prompts.d.ts +0 -13
- package/dist/metrics/llm/context-relevancy/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/contextual-recall/index.d.ts +0 -16
- package/dist/metrics/llm/contextual-recall/index.d.ts.map +0 -1
- package/dist/metrics/llm/contextual-recall/metricJudge.d.ts +0 -16
- package/dist/metrics/llm/contextual-recall/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/contextual-recall/prompts.d.ts +0 -13
- package/dist/metrics/llm/contextual-recall/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/faithfulness/index.d.ts +0 -16
- package/dist/metrics/llm/faithfulness/index.d.ts.map +0 -1
- package/dist/metrics/llm/faithfulness/metricJudge.d.ts +0 -22
- package/dist/metrics/llm/faithfulness/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/faithfulness/prompts.d.ts +0 -20
- package/dist/metrics/llm/faithfulness/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/hallucination/index.d.ts +0 -16
- package/dist/metrics/llm/hallucination/index.d.ts.map +0 -1
- package/dist/metrics/llm/hallucination/metricJudge.d.ts +0 -22
- package/dist/metrics/llm/hallucination/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/hallucination/prompts.d.ts +0 -17
- package/dist/metrics/llm/hallucination/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/index.cjs +0 -2481
- package/dist/metrics/llm/index.cjs.map +0 -1
- package/dist/metrics/llm/index.d.ts +0 -12
- package/dist/metrics/llm/index.d.ts.map +0 -1
- package/dist/metrics/llm/index.js +0 -2469
- package/dist/metrics/llm/index.js.map +0 -1
- package/dist/metrics/llm/prompt-alignment/index.d.ts +0 -33
- package/dist/metrics/llm/prompt-alignment/index.d.ts.map +0 -1
- package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/prompt-alignment/prompts.d.ts +0 -17
- package/dist/metrics/llm/prompt-alignment/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/summarization/index.d.ts +0 -19
- package/dist/metrics/llm/summarization/index.d.ts.map +0 -1
- package/dist/metrics/llm/summarization/metricJudge.d.ts +0 -34
- package/dist/metrics/llm/summarization/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/summarization/prompts.d.ts +0 -30
- package/dist/metrics/llm/summarization/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/toxicity/index.d.ts +0 -14
- package/dist/metrics/llm/toxicity/index.d.ts.map +0 -1
- package/dist/metrics/llm/toxicity/metricJudge.d.ts +0 -14
- package/dist/metrics/llm/toxicity/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/toxicity/prompts.d.ts +0 -10
- package/dist/metrics/llm/toxicity/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/types.d.ts +0 -7
- package/dist/metrics/llm/types.d.ts.map +0 -1
- package/dist/metrics/llm/utils.d.ts +0 -14
- package/dist/metrics/llm/utils.d.ts.map +0 -1
- package/dist/metrics/nlp/completeness/index.d.ts +0 -21
- package/dist/metrics/nlp/completeness/index.d.ts.map +0 -1
- package/dist/metrics/nlp/content-similarity/index.d.ts +0 -18
- package/dist/metrics/nlp/content-similarity/index.d.ts.map +0 -1
- package/dist/metrics/nlp/index.cjs +0 -203
- package/dist/metrics/nlp/index.cjs.map +0 -1
- package/dist/metrics/nlp/index.d.ts +0 -6
- package/dist/metrics/nlp/index.d.ts.map +0 -1
- package/dist/metrics/nlp/index.js +0 -190
- package/dist/metrics/nlp/index.js.map +0 -1
- package/dist/metrics/nlp/keyword-coverage/index.d.ts +0 -13
- package/dist/metrics/nlp/keyword-coverage/index.d.ts.map +0 -1
- package/dist/metrics/nlp/textual-difference/index.d.ts +0 -15
- package/dist/metrics/nlp/textual-difference/index.d.ts.map +0 -1
- package/dist/metrics/nlp/tone/index.d.ts +0 -18
- package/dist/metrics/nlp/tone/index.d.ts.map +0 -1
- package/dist/scorers/code/index.cjs +0 -329
- package/dist/scorers/code/index.cjs.map +0 -1
- package/dist/scorers/code/index.js +0 -315
- package/dist/scorers/code/index.js.map +0 -1
- package/dist/scorers/llm/index.cjs.map +0 -1
- package/dist/scorers/llm/index.js.map +0 -1
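
The file list above amounts to one structural change: the separate `metrics/*`, `scorers/llm`, and `scorers/code` entry points collapse into a single `scorers/prebuilt` bundle, and the runtime dependency moves from `@mastra/core/scores` to `@mastra/core/evals`. A minimal import-migration sketch follows; the subpaths are assumed from the dist layout above and are not confirmed by this diff, so check the new `package.json` `exports` map:

```ts
// Before (0.14.x) - LLM and code scorers shipped from separate subpaths:
// import { createAnswerRelevancyScorer } from '@mastra/evals/scorers/llm';
// import { createToneScorer } from '@mastra/evals/scorers/code';

// After (1.0.0-beta.0) - one prebuilt bundle exposes both families
// (subpath assumed from the dist layout):
import { createAnswerRelevancyScorer, createToneScorer } from '@mastra/evals/scorers/prebuilt';
```

The remaining hunks all come from the renamed `dist/scorers/{llm → prebuilt}/index.cjs` bundle.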
package/dist/scorers/{llm → prebuilt}/index.cjs

@@ -1,9 +1,19 @@
 'use strict';
 
-var
-var
-var scores = require('@mastra/core/scores');
+var chunkTPQLLHZW_cjs = require('../../chunk-TPQLLHZW.cjs');
+var evals = require('@mastra/core/evals');
 var zod = require('zod');
+var nlp = require('compromise');
+var keyword_extractor = require('keyword-extractor');
+var stringSimilarity = require('string-similarity');
+var Sentiment = require('sentiment');
+
+function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
+
+var nlp__default = /*#__PURE__*/_interopDefault(nlp);
+var keyword_extractor__default = /*#__PURE__*/_interopDefault(keyword_extractor);
+var stringSimilarity__default = /*#__PURE__*/_interopDefault(stringSimilarity);
+var Sentiment__default = /*#__PURE__*/_interopDefault(Sentiment);
 
 // src/scorers/llm/answer-relevancy/prompts.ts
 var createExtractPrompt = (output) => `
@@ -216,7 +226,8 @@ function createAnswerRelevancyScorer({
   model,
   options = DEFAULT_OPTIONS
 }) {
-  return
+  return evals.createScorer({
+    id: "answer-relevancy-scorer",
     name: "Answer Relevancy Scorer",
     description: "A scorer that evaluates the relevancy of an LLM output to an input",
     judge: {
@@ -228,14 +239,14 @@ function createAnswerRelevancyScorer({
     description: "Extract relevant statements from the LLM output",
     outputSchema: extractOutputSchema,
     createPrompt: ({ run }) => {
-      const assistantMessage =
+      const assistantMessage = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       return createExtractPrompt(assistantMessage);
     }
   }).analyze({
     description: "Score the relevance of the statements to the input",
     outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
     createPrompt: ({ run, results }) => {
-      const input =
+      const input = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
       return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
     }
   }).generateScore(({ results }) => {
@@ -252,13 +263,13 @@ function createAnswerRelevancyScorer({
       }
     }
     const score = relevancyCount / numberOfResults;
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(score * options.scale);
  }).generateReason({
    description: "Reason about the results",
    createPrompt: ({ run, results, score }) => {
      return createReasonPrompt({
-        input:
-        output:
+        input: chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
        score,
        results: results.analyzeStepResult.results,
        scale: options.scale
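
The answer-relevancy hunks above show the pattern every scorer in this bundle now follows: the factory calls `evals.createScorer` with an explicit `id`, and pulls messages through the shared `getUserMessageFromRunInput`/`getAssistantMessageFromRunOutput` helpers. A hedged construction sketch; the judge model and provider import are illustrative, not part of this diff:

```ts
import { openai } from '@ai-sdk/openai'; // illustrative provider
import { createAnswerRelevancyScorer } from '@mastra/evals/scorers/prebuilt'; // subpath assumed

// id ("answer-relevancy-scorer"), name, and judge wiring come from the factory;
// callers supply only the model and optional settings such as `scale`.
const answerRelevancy = createAnswerRelevancyScorer({
  model: openai('gpt-4o-mini'),
  options: { scale: 1 }, // final score = roundToTwoDecimals(score * scale)
});
```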
@@ -433,7 +444,8 @@ function createAnswerSimilarityScorer({
   options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
 }) {
   const mergedOptions = { ...ANSWER_SIMILARITY_DEFAULT_OPTIONS, ...options };
-  return
+  return evals.createScorer({
+    id: "answer-similarity-scorer",
     name: "Answer Similarity Scorer",
     description: "Evaluates how similar an agent output is to a ground truth answer for CI/CD testing",
     judge: {
@@ -454,7 +466,7 @@ function createAnswerSimilarityScorer({
           groundTruth: ""
         });
       }
-      const output =
+      const output = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
       return createExtractPrompt2({
         output,
@@ -512,14 +524,14 @@ function createAnswerSimilarityScorer({
     );
     score -= extraInfoPenalty;
     score = Math.max(0, Math.min(1, score));
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(score * mergedOptions.scale);
   }).generateReason({
     description: "Generate explanation of similarity score",
     createPrompt: ({ run, results, score }) => {
       if (!run.groundTruth) {
         return "No ground truth was provided for comparison. Score is 0 by default.";
       }
-      const output =
+      const output = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
       return createReasonPrompt2({
         output,
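
The similarity score's final arithmetic is visible across the hunks above: subtract an extra-information penalty, clamp to [0, 1], scale, round. A self-contained sketch of that pipeline, assuming `roundToTwoDecimals` (defined in the shared chunk, not shown in this diff) is ordinary two-decimal rounding:

```ts
// Assumed implementation of the shared helper.
const roundToTwoDecimals = (n: number): number => Math.round(n * 100) / 100;

function finalizeSimilarityScore(raw: number, extraInfoPenalty: number, scale = 1): number {
  let score = raw - extraInfoPenalty;
  score = Math.max(0, Math.min(1, score)); // clamp, as in the bundle
  return roundToTwoDecimals(score * scale);
}

finalizeSimilarityScore(0.873, 0.05); // 0.82
```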
@@ -690,7 +702,8 @@ function createFaithfulnessScorer({
   model,
   options
 }) {
-  return
+  return evals.createScorer({
+    id: "faithfulness-scorer",
     name: "Faithfulness Scorer",
     description: "A scorer that evaluates the faithfulness of an LLM output to an input",
     judge: {
@@ -702,14 +715,17 @@ function createFaithfulnessScorer({
     description: "Extract relevant statements from the LLM output",
     outputSchema: zod.z.array(zod.z.string()),
     createPrompt: ({ run }) => {
-      const prompt = createFaithfulnessExtractPrompt({ output:
+      const prompt = createFaithfulnessExtractPrompt({ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
       return prompt;
     }
   }).analyze({
     description: "Score the relevance of the statements to the input",
     outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
     createPrompt: ({ results, run }) => {
-      const
+      const assistantMessage = run.output.find(({ role }) => role === "assistant");
+      const context = options?.context ?? assistantMessage?.content?.toolInvocations?.map(
+        (toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : ""
+      ) ?? [];
       const prompt = createFaithfulnessAnalyzePrompt({
         claims: results.preprocessStepResult || [],
         context
@@ -723,14 +739,15 @@ function createFaithfulnessScorer({
       return 0;
     }
     const score = supportedClaims / totalClaims * (options?.scale || 1);
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
+      const assistantMessage = run.output.find(({ role }) => role === "assistant");
       const prompt = createFaithfulnessReasonPrompt({
-        input:
-        output:
-        context:
+        input: chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+        context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
         score,
         scale: options?.scale || 1,
         verdicts: results.analyzeStepResult?.verdicts || []
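
One behavioral detail worth noting in the faithfulness hunks: when `options.context` is absent, the analyze step now derives context from the assistant message's tool invocations, stringifying only completed (`state === "result"`) calls. A sketch of that fallback in isolation, with the message shape inferred from the bundle:

```ts
type ToolInvocation = { state: string; result?: unknown };
type OutputMessage = { role: string; content?: { toolInvocations?: ToolInvocation[] } };

// Mirrors the fallback above: explicit context wins; otherwise completed
// tool-call results are JSON-stringified and incomplete calls become "".
function deriveContext(output: OutputMessage[], explicit?: string[]): string[] {
  const assistantMessage = output.find(({ role }) => role === 'assistant');
  return (
    explicit ??
    assistantMessage?.content?.toolInvocations?.map((toolCall) =>
      toolCall.state === 'result' ? JSON.stringify(toolCall.result) : '',
    ) ??
    []
  );
}
```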
@@ -848,7 +865,8 @@ ${biases.join("\n")}
 
 // src/scorers/llm/bias/index.ts
 function createBiasScorer({ model, options }) {
-  return
+  return evals.createScorer({
+    id: "bias-scorer",
     name: "Bias Scorer",
     description: "A scorer that evaluates the bias of an LLM output to an input",
     judge: {
@@ -861,13 +879,13 @@ function createBiasScorer({ model, options }) {
     outputSchema: zod.z.object({
       opinions: zod.z.array(zod.z.string())
     }),
-    createPrompt: ({ run }) => createBiasExtractPrompt({ output:
+    createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
   }).analyze({
     description: "Score the relevance of the statements to the input",
     outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
     createPrompt: ({ run, results }) => {
       const prompt = createBiasAnalyzePrompt({
-        output:
+        output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         opinions: results.preprocessStepResult?.opinions || []
       });
       return prompt;
@@ -878,7 +896,7 @@ function createBiasScorer({ model, options }) {
     }
     const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
     const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(score * (options?.scale || 1));
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ score, results }) => {
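
The bias score reduces the analyze step's verdicts to a simple fraction before scaling and rounding. A worked example with illustrative verdicts:

```ts
const results = [
  { result: 'yes', reason: 'gendered assumption' },
  { result: 'no', reason: 'neutral phrasing' },
  { result: 'yes', reason: 'one-sided framing' },
  { result: 'no', reason: 'neutral phrasing' },
  { result: 'no', reason: 'neutral phrasing' },
];
const biasedVerdicts = results.filter((v) => v.result.toLowerCase() === 'yes');
const score = biasedVerdicts.length / results.length; // 0.4 - higher means more bias
```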
@@ -1082,7 +1100,8 @@ function createHallucinationScorer({
   model,
   options
 }) {
-  return
+  return evals.createScorer({
+    id: "hallucination-scorer",
     name: "Hallucination Scorer",
     description: "A scorer that evaluates the hallucination of an LLM output to an input",
     judge: {
@@ -1096,7 +1115,7 @@ function createHallucinationScorer({
       claims: zod.z.array(zod.z.string())
     }),
     createPrompt: ({ run }) => {
-      const prompt = createHallucinationExtractPrompt({ output:
+      const prompt = createHallucinationExtractPrompt({ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
       return prompt;
     }
   }).analyze({
@@ -1118,13 +1137,13 @@ function createHallucinationScorer({
       return 0;
     }
     const score = contradictedStatements / totalStatements * (options?.scale || 1);
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
       const prompt = createHallucinationReasonPrompt({
-        input:
-        output:
+        input: chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         context: options?.context || [],
         score,
         scale: options?.scale || 1,
@@ -1224,7 +1243,8 @@ function createToxicityScorer({
   model,
   options
 }) {
-  return
+  return evals.createScorer({
+    id: "toxicity-scorer",
     name: "Toxicity Scorer",
     description: "A scorer that evaluates the toxicity of an LLM output to an input",
     judge: {
@@ -1237,8 +1257,8 @@ function createToxicityScorer({
     outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
     createPrompt: ({ run }) => {
       const prompt = createToxicityAnalyzePrompt({
-        input:
-        output:
+        input: chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
       });
       return prompt;
     }
@@ -1254,7 +1274,7 @@ function createToxicityScorer({
       }
     }
     const score = toxicityCount / numberOfVerdicts;
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(score * (options?.scale || 1));
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ results, score }) => {
@@ -1373,7 +1393,8 @@ var analyzeOutputSchema2 = zod.z.object({
 });
 function createToolCallAccuracyScorerLLM({ model, availableTools }) {
   const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
-  return
+  return evals.createScorer({
+    id: "llm-tool-call-accuracy-scorer",
     name: "Tool Call Accuracy (LLM)",
     description: "Evaluates whether an agent selected appropriate tools for the given task using LLM analysis",
     judge: {
@@ -1387,7 +1408,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     if (isInputInvalid || isOutputInvalid) {
       throw new Error("Input and output messages cannot be null or empty");
     }
-    const { tools: actualTools, toolCallInfos } =
+    const { tools: actualTools, toolCallInfos } = chunkTPQLLHZW_cjs.extractToolCalls(run.output);
     return {
       actualTools,
       hasToolCalls: actualTools.length > 0,
@@ -1397,8 +1418,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     description: "Analyze the appropriateness of tool selections",
     outputSchema: analyzeOutputSchema2,
     createPrompt: ({ run, results }) => {
-      const userInput =
-      const agentResponse =
+      const userInput = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const agentResponse = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const toolsCalled = results.preprocessStepResult?.actualTools || [];
       return createAnalyzePrompt2({
         userInput,
@@ -1415,11 +1436,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     }
     const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
     const totalToolCalls = evaluations.length;
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
   }).generateReason({
     description: "Generate human-readable explanation of tool selection evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userInput =
+      const userInput = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const evaluations = results.analyzeStepResult?.evaluations || [];
       const missingTools = results.analyzeStepResult?.missingTools || [];
       return createReasonPrompt3({
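
The LLM tool-call variant takes the tool catalogue up front and folds `id: description` pairs into the judge prompt. A hedged construction sketch; the tool objects and model type import are illustrative:

```ts
import { createToolCallAccuracyScorerLLM } from '@mastra/evals/scorers/prebuilt'; // subpath assumed
import type { LanguageModel } from 'ai'; // illustrative type for the judge model

// availableTools only needs id + description pairs; the factory joins them
// into the judge prompt as "id: description" lines.
function buildScorer(model: LanguageModel) {
  return createToolCallAccuracyScorerLLM({
    model,
    availableTools: [
      { id: 'weather-tool', description: 'Fetches current weather for a city' },
      { id: 'calendar-tool', description: 'Reads and writes calendar events' },
    ],
  });
}
```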
@@ -1611,7 +1632,8 @@ function createContextRelevanceScorerLLM({
   if (options.context && options.context.length === 0) {
     throw new Error("Context array cannot be empty if provided");
   }
-  return
+  return evals.createScorer({
+    id: "context-relevance-scorer",
     name: "Context Relevance (LLM)",
     description: "Evaluates how relevant and useful the provided context was for generating the agent response",
     judge: {
@@ -1623,8 +1645,8 @@ function createContextRelevanceScorerLLM({
     description: "Analyze the relevance and utility of provided context",
     outputSchema: analyzeOutputSchema3,
     createPrompt: ({ run }) => {
-      const userQuery =
-      const agentResponse =
+      const userQuery = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const agentResponse = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       if (context.length === 0) {
         return createAnalyzePrompt3({
@@ -1672,11 +1694,11 @@ function createContextRelevanceScorerLLM({
     const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
     const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
     const scaledScore = finalScore * (options.scale || 1);
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(scaledScore);
   }).generateReason({
     description: "Generate human-readable explanation of context relevance evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userQuery =
+      const userQuery = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       if (context.length === 0) {
         return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
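
The context-relevance score subtracts two capped penalties from the judge's relevance score before scaling. A worked example of that arithmetic; the rate and cap values below are illustrative, not the bundle's constants:

```ts
const missingPenaltyRate = 0.1; // illustrative
const maxMissingPenalty = 0.3;  // illustrative
const relevanceScore = 0.9;
const usagePenalty = 0.1;
const missingContext = ['pricing table', 'refund policy'];

// Penalty grows with missing pieces but is capped.
const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty); // 0.2
const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty); // 0.6
```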
@@ -1834,7 +1856,8 @@ function createContextPrecisionScorer({
   if (options.context && options.context.length === 0) {
     throw new Error("Context array cannot be empty if provided");
   }
-  return
+  return evals.createScorer({
+    id: "context-precision-scorer",
     name: "Context Precision Scorer",
     description: "A scorer that evaluates the relevance and precision of retrieved context nodes for generating expected outputs",
     judge: {
@@ -1846,8 +1869,8 @@ function createContextPrecisionScorer({
     description: "Evaluate the relevance of each context piece for generating the expected output",
     outputSchema: contextRelevanceOutputSchema,
     createPrompt: ({ run }) => {
-      const input =
-      const output =
+      const input = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const output = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       if (context.length === 0) {
         throw new Error("No context available for evaluation");
@@ -1880,12 +1903,12 @@ function createContextPrecisionScorer({
     }
     const map = sumPrecision / relevantCount;
     const score = map * (options.scale || 1);
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the context precision results",
     createPrompt: ({ run, results, score }) => {
-      const input =
-      const output =
+      const input = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const output = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       return createContextPrecisionReasonPrompt({
         input,
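
`map` in the precision hunk is an average-precision aggregate: `sumPrecision` accumulated over the relevant context positions, divided by `relevantCount`, then scaled. A hedged worked example, assuming the usual precision-at-position accumulation (the accumulation loop itself is not shown in this diff):

```ts
// Relevance verdicts per retrieved context position (true = relevant).
const verdicts = [true, false, true];

let relevantCount = 0;
let sumPrecision = 0;
verdicts.forEach((isRelevant, position) => {
  if (isRelevant) {
    relevantCount += 1;
    sumPrecision += relevantCount / (position + 1); // precision at this position
  }
});
const map = sumPrecision / relevantCount; // (1/1 + 2/3) / 2 ≈ 0.83
```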
@@ -2126,7 +2149,8 @@ function createNoiseSensitivityScorerLLM({
   if (!options.baselineResponse || !options.noisyQuery) {
     throw new Error("Both baselineResponse and noisyQuery are required for Noise Sensitivity scoring");
   }
-  return
+  return evals.createScorer({
+    id: "noise-sensitivity-scorer",
     name: "Noise Sensitivity (LLM)",
     description: "Evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information",
     judge: {
@@ -2138,8 +2162,8 @@ function createNoiseSensitivityScorerLLM({
     description: "Analyze the impact of noise on agent response quality",
     outputSchema: analyzeOutputSchema4,
     createPrompt: ({ run }) => {
-      const originalQuery =
-      const noisyResponse =
+      const originalQuery = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const noisyResponse = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
      if (!originalQuery || !noisyResponse) {
        throw new Error("Both original query and noisy response are required for evaluation");
      }
@@ -2182,11 +2206,11 @@ function createNoiseSensitivityScorerLLM({
     const majorIssues = analysisResult.majorIssues || [];
     const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
     finalScore = Math.max(0, finalScore - issuesPenalty);
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(finalScore);
   }).generateReason({
     description: "Generate human-readable explanation of noise sensitivity evaluation",
     createPrompt: ({ run, results, score }) => {
-      const originalQuery =
+      const originalQuery = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const analysisResult = results.analyzeStepResult;
       if (!analysisResult) {
         throw new Error("Analysis step failed to produce results for reason generation");
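
The noise-sensitivity factory is the only one here that hard-requires two option fields before it will construct anything. A hedged construction sketch:

```ts
import { createNoiseSensitivityScorerLLM } from '@mastra/evals/scorers/prebuilt'; // subpath assumed

declare const model: any; // judge model, illustrative

// Both options are validated up front; the factory throws before building
// the scorer if either is missing.
const noiseSensitivity = createNoiseSensitivityScorerLLM({
  model,
  options: {
    baselineResponse: 'Paris is the capital of France.',
    noisyQuery: 'What is the capital of France? Unrelated: my cat is orange.',
  },
});
```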
@@ -2498,7 +2522,8 @@ function createPromptAlignmentScorerLLM({
 }) {
   const scale = options?.scale || 1;
   const evaluationMode = options?.evaluationMode || "both";
-  return
+  return evals.createScorer({
+    id: "prompt-alignment-scorer",
     name: "Prompt Alignment (LLM)",
     description: "Evaluates how well the agent response aligns with the intent and requirements of the user prompt",
     judge: {
@@ -2509,9 +2534,9 @@ function createPromptAlignmentScorerLLM({
     description: "Analyze prompt-response alignment across multiple dimensions",
     outputSchema: analyzeOutputSchema5,
     createPrompt: ({ run }) => {
-      const userPrompt =
-      const systemPrompt =
-      const agentResponse =
+      const userPrompt = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const systemPrompt = chunkTPQLLHZW_cjs.getCombinedSystemPrompt(run.input) ?? "";
+      const agentResponse = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       if (evaluationMode === "user" && !userPrompt) {
         throw new Error("User prompt is required for user prompt alignment scoring");
       }
@@ -2547,12 +2572,12 @@ function createPromptAlignmentScorerLLM({
       weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
     }
     const finalScore = weightedScore * scale;
-    return
+    return chunkTPQLLHZW_cjs.roundToTwoDecimals(finalScore);
   }).generateReason({
     description: "Generate human-readable explanation of prompt alignment evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userPrompt =
-      const systemPrompt =
+      const userPrompt = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const systemPrompt = chunkTPQLLHZW_cjs.getCombinedSystemPrompt(run.input) ?? "";
       const analysis = results.analyzeStepResult;
       if (!analysis) {
         return `Unable to analyze prompt alignment. Score: ${score}`;
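
In the default `"both"` evaluation mode, the prompt-alignment score is a weighted blend of user- and system-prompt alignment. The weights are bundle constants not shown in this diff; the values below are placeholders for illustration:

```ts
const SCORING_WEIGHTS = { BOTH: { USER_WEIGHT: 0.7, SYSTEM_WEIGHT: 0.3 } }; // placeholder values
const userScore = 0.9;
const systemScore = 0.6;

const weightedScore =
  userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT +
  systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT; // 0.81, then scaled and rounded
```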
@@ -2568,6 +2593,392 @@ function createPromptAlignmentScorerLLM({
     }
   });
 }
+function normalizeString(str) {
+  return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
+}
+function extractElements(doc) {
+  const nouns = doc.nouns().out("array") || [];
+  const verbs = doc.verbs().toInfinitive().out("array") || [];
+  const topics = doc.topics().out("array") || [];
+  const terms = doc.terms().out("array") || [];
+  const cleanAndSplitTerm = (term) => {
+    const normalized = normalizeString(term);
+    return normalized.replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]+/g, " ").trim().split(/\s+/).filter((word) => word.length > 0);
+  };
+  const processedTerms = [
+    ...nouns.flatMap(cleanAndSplitTerm),
+    ...verbs.flatMap(cleanAndSplitTerm),
+    ...topics.flatMap(cleanAndSplitTerm),
+    ...terms.flatMap(cleanAndSplitTerm)
+  ];
+  return [...new Set(processedTerms)];
+}
+function calculateCoverage({ original, simplified }) {
+  if (original.length === 0) {
+    return simplified.length === 0 ? 1 : 0;
+  }
+  const covered = original.filter(
+    (element) => simplified.some((s) => {
+      const elem = normalizeString(element);
+      const simp = normalizeString(s);
+      if (elem.length <= 3) {
+        return elem === simp;
+      }
+      const longer = elem.length > simp.length ? elem : simp;
+      const shorter = elem.length > simp.length ? simp : elem;
+      if (longer.includes(shorter)) {
+        return shorter.length / longer.length > 0.6;
+      }
+      return false;
+    })
+  );
+  return covered.length / original.length;
+}
+function createCompletenessScorer() {
+  return evals.createScorer({
+    id: "completeness-scorer",
+    name: "Completeness Scorer",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
+      const content = chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i);
+      return content === null || content === void 0;
+    });
+    const isOutputInvalid = !run.output || run.output.some((i) => {
+      const content = chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i);
+      return content === null || content === void 0;
+    });
+    if (isInputInvalid || isOutputInvalid) {
+      throw new Error("Inputs cannot be null or undefined");
+    }
+    const input = run.input?.inputMessages.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const inputToProcess = input;
+    const outputToProcess = output;
+    const inputDoc = nlp__default.default(inputToProcess.trim());
+    const outputDoc = nlp__default.default(outputToProcess.trim());
+    const inputElements = extractElements(inputDoc);
+    const outputElements = extractElements(outputDoc);
+    return {
+      inputElements,
+      outputElements,
+      missingElements: inputElements.filter((e) => !outputElements.includes(e)),
+      elementCounts: {
+        input: inputElements.length,
+        output: outputElements.length
+      }
+    };
+  }).generateScore(({ results }) => {
+    const inputElements = results.preprocessStepResult?.inputElements;
+    const outputElements = results.preprocessStepResult?.outputElements;
+    return calculateCoverage({
+      original: inputElements,
+      simplified: outputElements
+    });
+  });
+}
+function calculateRatio(input, output) {
+  if (input === output) {
+    return 1;
+  }
+  if (input.length === 0 || output.length === 0) {
+    return 0;
+  }
+  const matches = longestCommonSubsequence(input, output);
+  const total = input.length + output.length;
+  return total > 0 ? 2 * matches / total : 0;
+}
+function longestCommonSubsequence(str1, str2) {
+  const m = str1.length;
+  const n = str2.length;
+  const dp = [];
+  for (let i = 0; i <= m; i++) {
+    dp[i] = [];
+    for (let j = 0; j <= n; j++) {
+      dp[i][j] = 0;
+    }
+  }
+  for (let i = 1; i <= m; i++) {
+    for (let j = 1; j <= n; j++) {
+      if (str1[i - 1] === str2[j - 1]) {
+        dp[i][j] = dp[i - 1][j - 1] + 1;
+      } else {
+        dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
+      }
+    }
+  }
+  return dp[m][n];
+}
+function countChanges(input, output) {
+  const inputNormalized = input.replace(/\s+/g, " ").trim();
+  const outputNormalized = output.replace(/\s+/g, " ").trim();
+  if (inputNormalized === outputNormalized) {
+    if (input !== output) {
+      const inputWords2 = input.split(/\s+/).filter((w) => w.length > 0);
+      const outputWords2 = output.split(/\s+/).filter((w) => w.length > 0);
+      return Math.abs(inputWords2.length - outputWords2.length) || 1;
+    }
+    return 0;
+  }
+  const inputWords = inputNormalized.split(/\s+/).filter((w) => w.length > 0);
+  const outputWords = outputNormalized.split(/\s+/).filter((w) => w.length > 0);
+  if (inputWords.length === 0 && outputWords.length === 0) {
+    return 0;
+  }
+  if (inputWords.length === 0) {
+    return outputWords.length;
+  }
+  if (outputWords.length === 0) {
+    return inputWords.length;
+  }
+  const matchingWords = findCommonWords(inputWords, outputWords);
+  const maxLength = Math.max(inputWords.length, outputWords.length);
+  const changes = maxLength - matchingWords;
+  return changes;
+}
+function findCommonWords(arr1, arr2) {
+  let matches = 0;
+  const used = /* @__PURE__ */ new Set();
+  for (let i = 0; i < arr1.length; i++) {
+    for (let j = 0; j < arr2.length; j++) {
+      if (!used.has(j) && arr1[i] === arr2[j]) {
+        matches++;
+        used.add(j);
+        break;
+      }
+    }
+  }
+  return matches;
+}
+function createTextualDifferenceScorer() {
+  return evals.createScorer({
+    id: "textual-difference-scorer",
+    name: "Textual Difference Scorer",
+    description: "Calculate textual difference between input and output using sequence matching algorithms.",
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    const input = run.input?.inputMessages?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const ratio = calculateRatio(input, output);
+    const changes = countChanges(input, output);
+    const maxLength = Math.max(input.length, output.length);
+    const lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
+    const confidence = 1 - lengthDiff;
+    return {
+      ratio,
+      confidence,
+      changes,
+      lengthDiff
+    };
+  }).generateScore(({ results }) => {
+    return results.preprocessStepResult?.ratio;
+  });
+}
+function createKeywordCoverageScorer() {
+  return evals.createScorer({
+    id: "keyword-coverage-scorer",
+    name: "Keyword Coverage Scorer",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    const input = run.input?.inputMessages?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    if (!input && !output) {
+      return {
+        result: {
+          referenceKeywords: /* @__PURE__ */ new Set(),
+          responseKeywords: /* @__PURE__ */ new Set()
+        }
+      };
+    }
+    const extractKeywords = (text) => {
+      return keyword_extractor__default.default.extract(text, {
+        language: "english",
+        remove_digits: true,
+        return_changed_case: true,
+        remove_duplicates: true
+      });
+    };
+    const referenceKeywords = new Set(extractKeywords(input));
+    const responseKeywords = new Set(extractKeywords(output));
+    return {
+      referenceKeywords,
+      responseKeywords
+    };
+  }).analyze(async ({ results }) => {
+    if (!results.preprocessStepResult?.referenceKeywords?.size && !results.preprocessStepResult?.responseKeywords?.size) {
+      return {
+        totalKeywordsLength: 0,
+        matchedKeywordsLength: 0
+      };
+    }
+    const matchedKeywords = [...results.preprocessStepResult?.referenceKeywords].filter(
+      (k) => results.preprocessStepResult?.responseKeywords?.has(k)
+    );
+    return {
+      totalKeywordsLength: Array.from(results.preprocessStepResult?.referenceKeywords).length ?? 0,
+      matchedKeywordsLength: matchedKeywords.length ?? 0
+    };
+  }).generateScore(({ results }) => {
+    if (!results.analyzeStepResult?.totalKeywordsLength) {
+      return 1;
+    }
+    const totalKeywords = results.analyzeStepResult?.totalKeywordsLength;
+    const matchedKeywords = results.analyzeStepResult?.matchedKeywordsLength;
+    return totalKeywords > 0 ? matchedKeywords / totalKeywords : 0;
+  });
+}
+function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { ignoreCase: true, ignoreWhitespace: true }) {
+  return evals.createScorer({
+    id: "content-similarity-scorer",
+    name: "Content Similarity Scorer",
+    description: "Calculates content similarity between input and output messages using string comparison algorithms.",
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    let processedInput = run.input?.inputMessages.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    let processedOutput = run.output.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    if (ignoreCase) {
+      processedInput = processedInput.toLowerCase();
+      processedOutput = processedOutput.toLowerCase();
+    }
+    if (ignoreWhitespace) {
+      processedInput = processedInput.replace(/\s+/g, " ").trim();
+      processedOutput = processedOutput.replace(/\s+/g, " ").trim();
+    }
+    return {
+      processedInput,
+      processedOutput
+    };
+  }).generateScore(({ results }) => {
+    const similarity = stringSimilarity__default.default.compareTwoStrings(
+      results.preprocessStepResult?.processedInput,
+      results.preprocessStepResult?.processedOutput
+    );
+    return similarity;
+  });
+}
+function createToneScorer(config = {}) {
+  const { referenceTone } = config;
+  return evals.createScorer({
+    id: "tone-scorer",
+    name: "Tone Scorer",
+    description: "Analyzes the tone and sentiment of agent responses using sentiment analysis. Can compare against a reference tone or evaluate sentiment stability.",
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    const sentiment = new Sentiment__default.default();
+    const agentMessage = run.output?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const responseSentiment = sentiment.analyze(agentMessage);
+    if (referenceTone) {
+      const referenceSentiment = sentiment.analyze(referenceTone);
+      const sentimentDiff = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);
+      const normalizedScore = Math.max(0, 1 - sentimentDiff);
+      return {
+        score: normalizedScore,
+        responseSentiment: responseSentiment.comparative,
+        referenceSentiment: referenceSentiment.comparative,
+        difference: sentimentDiff
+      };
+    }
+    const sentences = agentMessage.match(/[^.!?]+[.!?]+/g) || [agentMessage];
+    const sentiments = sentences.map((s) => sentiment.analyze(s).comparative);
+    const avgSentiment = sentiments.reduce((a, b) => a + b, 0) / sentiments.length;
+    const variance = sentiments.reduce((sum, s) => sum + Math.pow(s - avgSentiment, 2), 0) / sentiments.length;
+    const stability = Math.max(0, 1 - variance);
+    return {
+      score: stability,
+      avgSentiment,
+      sentimentVariance: variance
+    };
+  }).generateScore(({ results }) => {
+    return results.preprocessStepResult?.score;
+  });
+}
+function checkToolOrder(actualTools, expectedOrder, strictMode = false) {
+  if (strictMode) {
+    return JSON.stringify(actualTools) === JSON.stringify(expectedOrder);
+  }
+  const expectedIndices = [];
+  for (const expectedTool of expectedOrder) {
+    const index = actualTools.indexOf(expectedTool);
+    if (index === -1) {
+      return false;
+    }
+    expectedIndices.push(index);
+  }
+  for (let i = 1; i < expectedIndices.length; i++) {
+    const currentIndex = expectedIndices[i];
+    const prevIndex = expectedIndices[i - 1];
+    if (currentIndex !== void 0 && prevIndex !== void 0 && currentIndex <= prevIndex) {
+      return false;
+    }
+  }
+  return true;
+}
+function calculateAccuracy({
+  expectedTool,
+  actualTools,
+  strictMode = false,
+  expectedToolOrder
+}) {
+  if (actualTools.length === 0) {
+    return 0;
+  }
+  if (expectedToolOrder && expectedToolOrder.length > 0) {
+    return checkToolOrder(actualTools, expectedToolOrder, strictMode) ? 1 : 0;
+  }
+  if (!expectedTool) {
+    return 0;
+  }
+  if (strictMode) {
+    return actualTools.length === 1 && actualTools[0] === expectedTool ? 1 : 0;
+  }
+  return actualTools.includes(expectedTool) ? 1 : 0;
+}
+function createToolCallAccuracyScorerCode(options) {
+  const { expectedTool, strictMode = false, expectedToolOrder } = options;
+  if (!expectedTool && !expectedToolOrder) {
+    throw new Error("Either expectedTool or expectedToolOrder must be provided");
+  }
+  const getDescription = () => {
+    return expectedToolOrder ? `Evaluates whether the LLM called tools in the correct order: [${expectedToolOrder.join(", ")}]` : `Evaluates whether the LLM selected the correct tool (${expectedTool}) from the available tools`;
+  };
+  return evals.createScorer({
+    id: "code-tool-call-accuracy-scorer",
+    name: "Tool Call Accuracy Scorer",
+    description: getDescription(),
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    const isInputInvalid = !run.input || !run.input.inputMessages || run.input.inputMessages.length === 0;
+    const isOutputInvalid = !run.output || run.output.length === 0;
+    if (isInputInvalid || isOutputInvalid) {
+      throw new Error("Input and output messages cannot be null or empty");
+    }
+    const { tools: actualTools, toolCallInfos } = chunkTPQLLHZW_cjs.extractToolCalls(run.output);
+    const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
+    return {
+      expectedTool,
+      actualTools,
+      strictMode,
+      expectedToolOrder,
+      hasToolCalls: actualTools.length > 0,
+      correctToolCalled,
+      toolCallInfos,
+      correctOrderCalled: expectedToolOrder ? checkToolOrder(actualTools, expectedToolOrder, strictMode) : null
+    };
+  }).generateScore(({ results }) => {
+    const preprocessResult = results.preprocessStepResult;
+    if (!preprocessResult) {
+      return 0;
+    }
+    return calculateAccuracy({
+      expectedTool: preprocessResult.expectedTool,
+      actualTools: preprocessResult.actualTools,
+      strictMode: preprocessResult.strictMode,
+      expectedToolOrder: preprocessResult.expectedToolOrder
+    });
+  });
+}
 
 exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
 exports.ANSWER_SIMILARITY_DEFAULT_OPTIONS = ANSWER_SIMILARITY_DEFAULT_OPTIONS;
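
Among the newly inlined code scorers, `checkToolOrder` is the piece whose semantics are easiest to misread: non-strict mode only requires the expected tools to appear at strictly increasing positions, while strict mode demands an exact sequence match. Expected results for the helper defined above, derived by hand:

```ts
// Non-strict: extra calls may interleave, but expected tools must stay in order.
checkToolOrder(['search', 'fetch', 'summarize'], ['search', 'summarize']);        // true
checkToolOrder(['summarize', 'search'], ['search', 'summarize']);                 // false (wrong order)
// Strict: the whole call sequence must match exactly.
checkToolOrder(['search', 'summarize'], ['search', 'summarize'], true);           // true
checkToolOrder(['search', 'fetch', 'summarize'], ['search', 'summarize'], true);  // false (extra call)
```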
@@ -2576,12 +2987,18 @@ exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;
 exports.createAnswerRelevancyScorer = createAnswerRelevancyScorer;
 exports.createAnswerSimilarityScorer = createAnswerSimilarityScorer;
 exports.createBiasScorer = createBiasScorer;
+exports.createCompletenessScorer = createCompletenessScorer;
+exports.createContentSimilarityScorer = createContentSimilarityScorer;
 exports.createContextPrecisionScorer = createContextPrecisionScorer;
 exports.createContextRelevanceScorerLLM = createContextRelevanceScorerLLM;
 exports.createFaithfulnessScorer = createFaithfulnessScorer;
 exports.createHallucinationScorer = createHallucinationScorer;
+exports.createKeywordCoverageScorer = createKeywordCoverageScorer;
 exports.createNoiseSensitivityScorerLLM = createNoiseSensitivityScorerLLM;
 exports.createPromptAlignmentScorerLLM = createPromptAlignmentScorerLLM;
+exports.createTextualDifferenceScorer = createTextualDifferenceScorer;
+exports.createToneScorer = createToneScorer;
+exports.createToolCallAccuracyScorerCode = createToolCallAccuracyScorerCode;
 exports.createToolCallAccuracyScorerLLM = createToolCallAccuracyScorerLLM;
 exports.createToxicityScorer = createToxicityScorer;
 //# sourceMappingURL=index.cjs.map