@mastra/evals 1.2.1 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +38 -0
- package/dist/{chunk-AY4K3J4R.cjs → chunk-33T2SZZ2.cjs} +74 -14
- package/dist/chunk-33T2SZZ2.cjs.map +1 -0
- package/dist/{chunk-X4MKZ735.js → chunk-ZRHCSFKL.js} +73 -15
- package/dist/chunk-ZRHCSFKL.js.map +1 -0
- package/dist/docs/SKILL.md +1 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/reference-evals-context-precision.md +3 -3
- package/dist/docs/references/reference-evals-context-relevance.md +3 -3
- package/dist/docs/references/reference-evals-noise-sensitivity.md +6 -6
- package/dist/docs/references/reference-evals-prompt-alignment.md +12 -12
- package/dist/docs/references/reference-evals-scorer-utils.md +3 -3
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
- package/dist/scorers/llm/answer-relevancy/index.d.ts +2 -1
- package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
- package/dist/scorers/llm/bias/index.d.ts +2 -2
- package/dist/scorers/llm/bias/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-precision/index.d.ts +2 -1
- package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-relevance/index.d.ts +2 -1
- package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +2 -1
- package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +4 -4
- package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +2 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +2 -2
- package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +2 -1
- package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
- package/dist/scorers/prebuilt/index.cjs +105 -85
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +34 -14
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +31 -23
- package/dist/scorers/utils.d.ts +33 -16
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +12 -12
- package/dist/chunk-AY4K3J4R.cjs.map +0 -1
- package/dist/chunk-X4MKZ735.js.map +0 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures } from '../../chunk-X4MKZ735.js';
|
|
1
|
+
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-ZRHCSFKL.js';
|
|
2
2
|
import { createScorer } from '@mastra/core/evals';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import nlp from 'compromise';
|
|
@@ -689,6 +689,10 @@ Example Responses:
|
|
|
689
689
|
}
|
|
690
690
|
|
|
691
691
|
// src/scorers/llm/faithfulness/index.ts
|
|
692
|
+
/**
 * Collects the results of completed tool invocations from a run's output so
 * they can be used as grounding context.
 *
 * Only messages whose `role` is "assistant" are inspected, and only tool
 * invocations whose `state` is "result" contribute an entry. Entries keep
 * message order, then invocation order within each message.
 *
 * @param {unknown} output - Run output; anything that is not an array yields
 *   an empty context.
 * @returns {string[]} One JSON string per completed tool invocation.
 */
var getToolInvocationContext = (output) => {
  if (!Array.isArray(output)) return [];
  return output
    .filter((message) => message?.role === "assistant")
    .flatMap((message) => message?.content?.toolInvocations ?? [])
    // Null-safe like the message checks above: a missing entry in the
    // invocation list must not crash the whole scoring step.
    .filter((toolCall) => toolCall?.state === "result")
    // JSON.stringify returns undefined (not a string) for an undefined
    // result; serialize that case as "null" so every entry is a string.
    .map((toolCall) => JSON.stringify(toolCall.result) ?? "null");
};
|
|
692
696
|
function createFaithfulnessScorer({
|
|
693
697
|
model,
|
|
694
698
|
options
|
|
@@ -715,10 +719,7 @@ function createFaithfulnessScorer({
|
|
|
715
719
|
description: "Score the relevance of the statements to the input",
|
|
716
720
|
outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
|
|
717
721
|
createPrompt: ({ results, run }) => {
|
|
718
|
-
const assistantMessage = run.output.find(({ role }) => role === "assistant");
|
|
719
|
-
const context = options?.context ?? assistantMessage?.content?.toolInvocations?.map(
|
|
720
|
-
(toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : ""
|
|
721
|
-
) ?? [];
|
|
722
|
+
const context = options?.context ?? getToolInvocationContext(run.output);
|
|
722
723
|
const prompt = createFaithfulnessAnalyzePrompt({
|
|
723
724
|
claims: results.preprocessStepResult?.claims || [],
|
|
724
725
|
context
|
|
@@ -736,11 +737,10 @@ function createFaithfulnessScorer({
|
|
|
736
737
|
}).generateReason({
|
|
737
738
|
description: "Reason about the results",
|
|
738
739
|
createPrompt: ({ run, results, score }) => {
|
|
739
|
-
const assistantMessage = run.output.find(({ role }) => role === "assistant");
|
|
740
740
|
const prompt = createFaithfulnessReasonPrompt({
|
|
741
741
|
input: getUserMessageFromRunInput(run.input) ?? "",
|
|
742
742
|
output: getAssistantMessageFromRunOutput(run.output) ?? "",
|
|
743
|
-
context:
|
|
743
|
+
context: options?.context ?? getToolInvocationContext(run.output),
|
|
744
744
|
score,
|
|
745
745
|
scale: options?.scale || 1,
|
|
746
746
|
verdicts: results.analyzeStepResult?.verdicts || []
|
|
@@ -1627,6 +1627,16 @@ var DEFAULT_PENALTIES = {
|
|
|
1627
1627
|
MAX_MISSING_CONTEXT_PENALTY: 0.5
|
|
1628
1628
|
// Maximum 50% penalty for missing context
|
|
1629
1629
|
};
|
|
1630
|
+
/**
 * Resolves the context entries to evaluate for a scorer run.
 *
 * When `options.contextExtractor` is set and both type guards accept the run
 * (`isScorerRunInputForAgent(input)` and `isScorerRunOutputForAgent(output)`;
 * presumably these check for agent-shaped runs — confirm against their
 * definitions), the extractor is called with `(input, output)`. Otherwise the
 * static `options.context` is used.
 *
 * @param {object} args
 * @param {unknown} args.input - The run input, passed through to the extractor.
 * @param {unknown} args.output - The run output, passed through to the extractor.
 * @param {object} args.options - Scorer options; reads `contextExtractor` and `context`.
 * @returns {Array} The resolved context; never null or undefined.
 */
var getContext = ({
  input,
  output,
  options
}) => {
  const canExtract = options.contextExtractor && isScorerRunInputForAgent(input) && isScorerRunOutputForAgent(output);
  if (canExtract) {
    // Callers read `.length` on the result right away, so an extractor that
    // returns null/undefined falls back to the static context instead of
    // surfacing as a TypeError further down.
    return options.contextExtractor(input, output) ?? options.context ?? [];
  }
  return options.context ?? [];
};
|
|
1630
1640
|
function createContextRelevanceScorerLLM({
|
|
1631
1641
|
model,
|
|
1632
1642
|
options
|
|
@@ -1652,7 +1662,7 @@ function createContextRelevanceScorerLLM({
|
|
|
1652
1662
|
createPrompt: ({ run }) => {
|
|
1653
1663
|
const userQuery = getUserMessageFromRunInput(run.input) ?? "";
|
|
1654
1664
|
const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1655
|
-
const context =
|
|
1665
|
+
const context = getContext({ input: run.input, output: run.output, options });
|
|
1656
1666
|
if (context.length === 0) {
|
|
1657
1667
|
return createAnalyzePrompt3({
|
|
1658
1668
|
userQuery,
|
|
@@ -1668,7 +1678,7 @@ function createContextRelevanceScorerLLM({
|
|
|
1668
1678
|
}
|
|
1669
1679
|
}).generateScore(({ results, run }) => {
|
|
1670
1680
|
const evaluations = results.analyzeStepResult?.evaluations || [];
|
|
1671
|
-
const context =
|
|
1681
|
+
const context = getContext({ input: run.input, output: run.output, options });
|
|
1672
1682
|
if (context.length === 0) {
|
|
1673
1683
|
return 1 * (options.scale || 1);
|
|
1674
1684
|
}
|
|
@@ -1704,7 +1714,7 @@ function createContextRelevanceScorerLLM({
|
|
|
1704
1714
|
description: "Generate human-readable explanation of context relevance evaluation",
|
|
1705
1715
|
createPrompt: ({ run, results, score }) => {
|
|
1706
1716
|
const userQuery = getUserMessageFromRunInput(run.input) ?? "";
|
|
1707
|
-
const context =
|
|
1717
|
+
const context = getContext({ input: run.input, output: run.output, options });
|
|
1708
1718
|
if (context.length === 0) {
|
|
1709
1719
|
return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
|
|
1710
1720
|
}
|
|
@@ -1851,6 +1861,16 @@ var contextRelevanceOutputSchema = z.object({
|
|
|
1851
1861
|
})
|
|
1852
1862
|
)
|
|
1853
1863
|
});
|
|
1864
|
+
/**
 * Resolves the context entries to evaluate for a scorer run.
 *
 * When `options.contextExtractor` is set and both type guards accept the run
 * (`isScorerRunInputForAgent(input)` and `isScorerRunOutputForAgent(output)`;
 * presumably these check for agent-shaped runs — confirm against their
 * definitions), the extractor is called with `(input, output)`. Otherwise the
 * static `options.context` is used.
 *
 * @param {object} args
 * @param {unknown} args.input - The run input, passed through to the extractor.
 * @param {unknown} args.output - The run output, passed through to the extractor.
 * @param {object} args.options - Scorer options; reads `contextExtractor` and `context`.
 * @returns {Array} The resolved context; never null or undefined.
 */
var getContext2 = ({
  input,
  output,
  options
}) => {
  const extractorApplies = options.contextExtractor && isScorerRunInputForAgent(input) && isScorerRunOutputForAgent(output);
  if (extractorApplies) {
    // The caller checks `context.length === 0` to raise its own error, so a
    // nullish extractor result is normalized to the static context (or an
    // empty array) rather than failing on the property access.
    return options.contextExtractor(input, output) ?? options.context ?? [];
  }
  return options.context ?? [];
};
|
|
1854
1874
|
function createContextPrecisionScorer({
|
|
1855
1875
|
model,
|
|
1856
1876
|
options
|
|
@@ -1876,7 +1896,7 @@ function createContextPrecisionScorer({
|
|
|
1876
1896
|
createPrompt: ({ run }) => {
|
|
1877
1897
|
const input = getUserMessageFromRunInput(run.input) ?? "";
|
|
1878
1898
|
const output = getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1879
|
-
const context =
|
|
1899
|
+
const context = getContext2({ input: run.input, output: run.output, options });
|
|
1880
1900
|
if (context.length === 0) {
|
|
1881
1901
|
throw new Error("No context available for evaluation");
|
|
1882
1902
|
}
|
|
@@ -1914,7 +1934,7 @@ function createContextPrecisionScorer({
|
|
|
1914
1934
|
createPrompt: ({ run, results, score }) => {
|
|
1915
1935
|
const input = getUserMessageFromRunInput(run.input) ?? "";
|
|
1916
1936
|
const output = getAssistantMessageFromRunOutput(run.output) ?? "";
|
|
1917
|
-
const context =
|
|
1937
|
+
const context = getContext2({ input: run.input, output: run.output, options });
|
|
1918
1938
|
return createContextPrecisionReasonPrompt({
|
|
1919
1939
|
input,
|
|
1920
1940
|
output,
|
|
@@ -2550,8 +2570,8 @@ function createPromptAlignmentScorerLLM({
|
|
|
2550
2570
|
if (evaluationMode === "system" && !systemPrompt) {
|
|
2551
2571
|
throw new Error("System prompt is required for system prompt alignment scoring");
|
|
2552
2572
|
}
|
|
2553
|
-
if (evaluationMode === "both" &&
|
|
2554
|
-
throw new Error("
|
|
2573
|
+
if (evaluationMode === "both" && !userPrompt && !systemPrompt) {
|
|
2574
|
+
throw new Error("A user or system prompt is required for combined alignment scoring");
|
|
2555
2575
|
}
|
|
2556
2576
|
if (!agentResponse) {
|
|
2557
2577
|
throw new Error("Agent response is required for prompt alignment scoring");
|