@mastra/evals 0.1.8-alpha.0 → 0.1.8-alpha.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +10 -10
- package/CHANGELOG.md +88 -0
- package/dist/_tsup-dts-rollup.d.cts +3 -3
- package/dist/_tsup-dts-rollup.d.ts +3 -3
- package/dist/metrics/llm/index.cjs +75 -62
- package/dist/metrics/llm/index.js +75 -62
- package/package.json +2 -2
- package/src/metrics/llm/hallucination/index.test.ts +206 -103
- package/src/metrics/llm/hallucination/index.ts +1 -0
- package/src/metrics/llm/hallucination/metricJudge.ts +13 -2
- package/src/metrics/llm/hallucination/prompts.ts +63 -60
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,37 +1,37 @@
|
|
|
1
1
|
|
|
2
|
-
> @mastra/evals@0.1.8-alpha.0 build /home/runner/work/mastra/mastra/packages/evals
|
|
2
|
+
> @mastra/evals@0.1.8-alpha.10 build /home/runner/work/mastra/mastra/packages/evals
|
|
3
3
|
> pnpm check && tsup src/index.ts src/metrics/judge/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm,cjs --experimental-dts --clean --treeshake
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
> @mastra/evals@0.1.8-alpha.0 check /home/runner/work/mastra/mastra/packages/evals
|
|
6
|
+
> @mastra/evals@0.1.8-alpha.10 check /home/runner/work/mastra/mastra/packages/evals
|
|
7
7
|
> tsc --noEmit
|
|
8
8
|
|
|
9
|
-
[34mCLI[39m Building entry: src/index.ts, src/metrics/judge/index.ts, src/metrics/
|
|
9
|
+
[34mCLI[39m Building entry: src/index.ts, src/metrics/judge/index.ts, src/metrics/nlp/index.ts, src/metrics/llm/index.ts
|
|
10
10
|
[34mCLI[39m Using tsconfig: tsconfig.json
|
|
11
11
|
[34mCLI[39m tsup v8.3.6
|
|
12
12
|
[34mTSC[39m Build start
|
|
13
|
-
[32mTSC[39m ⚡️ Build success in
|
|
13
|
+
[32mTSC[39m ⚡️ Build success in 16273ms
|
|
14
14
|
[34mDTS[39m Build start
|
|
15
15
|
[34mCLI[39m Target: es2022
|
|
16
16
|
Analysis will use the bundled TypeScript version 5.7.3
|
|
17
17
|
[36mWriting package typings: /home/runner/work/mastra/mastra/packages/evals/dist/_tsup-dts-rollup.d.ts[39m
|
|
18
18
|
Analysis will use the bundled TypeScript version 5.7.3
|
|
19
19
|
[36mWriting package typings: /home/runner/work/mastra/mastra/packages/evals/dist/_tsup-dts-rollup.d.cts[39m
|
|
20
|
-
[32mDTS[39m ⚡️ Build success in
|
|
20
|
+
[32mDTS[39m ⚡️ Build success in 10916ms
|
|
21
21
|
[34mCLI[39m Cleaning output folder
|
|
22
22
|
[34mESM[39m Build start
|
|
23
23
|
[34mCJS[39m Build start
|
|
24
24
|
[32mCJS[39m [1mdist/metrics/judge/index.cjs [22m[32m341.00 B[39m
|
|
25
|
-
[32mCJS[39m [1mdist/metrics/llm/index.cjs [22m[32m86.28 KB[39m
|
|
26
25
|
[32mCJS[39m [1mdist/metrics/nlp/index.cjs [22m[32m6.94 KB[39m
|
|
26
|
+
[32mCJS[39m [1mdist/metrics/llm/index.cjs [22m[32m86.80 KB[39m
|
|
27
27
|
[32mCJS[39m [1mdist/index.cjs [22m[32m655.25 KB[39m
|
|
28
|
-
[32mCJS[39m ⚡️ Build success in
|
|
28
|
+
[32mCJS[39m ⚡️ Build success in 10747ms
|
|
29
29
|
[32mESM[39m [1mdist/index.js [22m[32m2.63 KB[39m
|
|
30
30
|
[32mESM[39m [1mdist/metrics/judge/index.js [22m[32m94.00 B[39m
|
|
31
|
-
[32mESM[39m [1mdist/chunk-TXXJUIES.js [22m[32m305.00 B[39m
|
|
32
31
|
[32mESM[39m [1mdist/metrics/nlp/index.js [22m[32m6.30 KB[39m
|
|
32
|
+
[32mESM[39m [1mdist/chunk-TXXJUIES.js [22m[32m305.00 B[39m
|
|
33
33
|
[32mESM[39m [1mdist/chunk-4VNS5WPM.js [22m[32m1.82 KB[39m
|
|
34
|
-
[32mESM[39m [1mdist/metrics/llm/index.js [22m[32m85.
|
|
34
|
+
[32mESM[39m [1mdist/metrics/llm/index.js [22m[32m85.82 KB[39m
|
|
35
35
|
[32mESM[39m [1mdist/magic-string.es-5UDOWOAZ.js [22m[32m40.80 KB[39m
|
|
36
36
|
[32mESM[39m [1mdist/dist-EOJDANYG.js [22m[32m571.17 KB[39m
|
|
37
|
-
[32mESM[39m ⚡️ Build success in
|
|
37
|
+
[32mESM[39m ⚡️ Build success in 10756ms
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,93 @@
|
|
|
1
1
|
# @mastra/evals
|
|
2
2
|
|
|
3
|
+
## 0.1.8-alpha.10
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 9d31a36: Update hallucination eval
|
|
8
|
+
- Updated dependencies [a910463]
|
|
9
|
+
- @mastra/core@0.5.0-alpha.10
|
|
10
|
+
|
|
11
|
+
## 0.1.8-alpha.9
|
|
12
|
+
|
|
13
|
+
### Patch Changes
|
|
14
|
+
|
|
15
|
+
- Updated dependencies [e9fbac5]
|
|
16
|
+
- Updated dependencies [1e8bcbc]
|
|
17
|
+
- Updated dependencies [aeb5e36]
|
|
18
|
+
- Updated dependencies [f2301de]
|
|
19
|
+
- @mastra/core@0.5.0-alpha.9
|
|
20
|
+
|
|
21
|
+
## 0.1.8-alpha.8
|
|
22
|
+
|
|
23
|
+
### Patch Changes
|
|
24
|
+
|
|
25
|
+
- Updated dependencies [506f1d5]
|
|
26
|
+
- @mastra/core@0.5.0-alpha.8
|
|
27
|
+
|
|
28
|
+
## 0.1.8-alpha.7
|
|
29
|
+
|
|
30
|
+
### Patch Changes
|
|
31
|
+
|
|
32
|
+
- Updated dependencies [ee667a2]
|
|
33
|
+
- @mastra/core@0.5.0-alpha.7
|
|
34
|
+
|
|
35
|
+
## 0.1.8-alpha.6
|
|
36
|
+
|
|
37
|
+
### Patch Changes
|
|
38
|
+
|
|
39
|
+
- Updated dependencies [f6678e4]
|
|
40
|
+
- @mastra/core@0.5.0-alpha.6
|
|
41
|
+
|
|
42
|
+
## 0.1.8-alpha.5
|
|
43
|
+
|
|
44
|
+
### Patch Changes
|
|
45
|
+
|
|
46
|
+
- Updated dependencies [22643eb]
|
|
47
|
+
- Updated dependencies [6feb23f]
|
|
48
|
+
- Updated dependencies [f2d6727]
|
|
49
|
+
- Updated dependencies [301e4ee]
|
|
50
|
+
- Updated dependencies [dfbe4e9]
|
|
51
|
+
- Updated dependencies [9e81f35]
|
|
52
|
+
- Updated dependencies [caefaa2]
|
|
53
|
+
- Updated dependencies [c151ae6]
|
|
54
|
+
- Updated dependencies [52e0418]
|
|
55
|
+
- Updated dependencies [03236ec]
|
|
56
|
+
- Updated dependencies [3764e71]
|
|
57
|
+
- Updated dependencies [df982db]
|
|
58
|
+
- Updated dependencies [0461849]
|
|
59
|
+
- Updated dependencies [2259379]
|
|
60
|
+
- Updated dependencies [358f069]
|
|
61
|
+
- @mastra/core@0.5.0-alpha.5
|
|
62
|
+
|
|
63
|
+
## 0.1.8-alpha.4
|
|
64
|
+
|
|
65
|
+
### Patch Changes
|
|
66
|
+
|
|
67
|
+
- Updated dependencies [d79aedf]
|
|
68
|
+
- @mastra/core@0.5.0-alpha.4
|
|
69
|
+
|
|
70
|
+
## 0.1.8-alpha.3
|
|
71
|
+
|
|
72
|
+
### Patch Changes
|
|
73
|
+
|
|
74
|
+
- Updated dependencies [3d0e290]
|
|
75
|
+
- @mastra/core@0.5.0-alpha.3
|
|
76
|
+
|
|
77
|
+
## 0.1.8-alpha.2
|
|
78
|
+
|
|
79
|
+
### Patch Changes
|
|
80
|
+
|
|
81
|
+
- Updated dependencies [02ffb7b]
|
|
82
|
+
- @mastra/core@0.5.0-alpha.2
|
|
83
|
+
|
|
84
|
+
## 0.1.8-alpha.1
|
|
85
|
+
|
|
86
|
+
### Patch Changes
|
|
87
|
+
|
|
88
|
+
- Updated dependencies [dab255b]
|
|
89
|
+
- @mastra/core@0.5.0-alpha.1
|
|
90
|
+
|
|
3
91
|
## 0.1.8-alpha.0
|
|
4
92
|
|
|
5
93
|
### Patch Changes
|
|
package/dist/_tsup-dts-rollup.d.cts
CHANGED
|
@@ -355,9 +355,9 @@ export declare function generateEvaluatePrompt_alias_6({ claims, context }: {
|
|
|
355
355
|
context: string[];
|
|
356
356
|
}): string;
|
|
357
357
|
|
|
358
|
-
export declare function generateEvaluatePrompt_alias_7({ context,
|
|
358
|
+
export declare function generateEvaluatePrompt_alias_7({ context, claims }: {
|
|
359
359
|
context: string[];
|
|
360
|
-
|
|
360
|
+
claims: string[];
|
|
361
361
|
}): string;
|
|
362
362
|
|
|
363
363
|
export declare function generateEvaluatePrompt_alias_8({ instructions, input, output, }: {
|
|
@@ -504,7 +504,7 @@ declare function globalSetup(): Promise<void>;
|
|
|
504
504
|
export { globalSetup }
|
|
505
505
|
export { globalSetup as globalSetup_alias_1 }
|
|
506
506
|
|
|
507
|
-
export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output
|
|
507
|
+
export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.\n\nKey Principles:\n1. First extract all claims from the output (both factual and speculative)\n2. Then verify each extracted claim against the provided context\n3. Consider it a hallucination if a claim contradicts the context\n4. Consider it a hallucination if a claim makes assertions not supported by context\n5. Empty outputs should be handled as having no hallucinations\n6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination\n7. Speculative language about facts NOT in the context IS a hallucination\n8. Never use prior knowledge in judgments - only use what's explicitly stated in context\n9. The following are NOT hallucinations:\n - Using less precise dates (e.g., year when context gives month)\n - Reasonable numerical approximations\n - Omitting additional details while maintaining factual accuracy\n10. Subjective claims (\"made history\", \"pioneering\", \"leading\") are hallucinations unless explicitly stated in context";
|
|
508
508
|
|
|
509
509
|
export declare class HallucinationJudge extends MastraAgentJudge {
|
|
510
510
|
constructor(model: LanguageModel);
|
|
package/dist/_tsup-dts-rollup.d.ts
CHANGED
|
@@ -355,9 +355,9 @@ export declare function generateEvaluatePrompt_alias_6({ claims, context }: {
|
|
|
355
355
|
context: string[];
|
|
356
356
|
}): string;
|
|
357
357
|
|
|
358
|
-
export declare function generateEvaluatePrompt_alias_7({ context,
|
|
358
|
+
export declare function generateEvaluatePrompt_alias_7({ context, claims }: {
|
|
359
359
|
context: string[];
|
|
360
|
-
|
|
360
|
+
claims: string[];
|
|
361
361
|
}): string;
|
|
362
362
|
|
|
363
363
|
export declare function generateEvaluatePrompt_alias_8({ instructions, input, output, }: {
|
|
@@ -504,7 +504,7 @@ declare function globalSetup(): Promise<void>;
|
|
|
504
504
|
export { globalSetup }
|
|
505
505
|
export { globalSetup as globalSetup_alias_1 }
|
|
506
506
|
|
|
507
|
-
export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output
|
|
507
|
+
export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.\n\nKey Principles:\n1. First extract all claims from the output (both factual and speculative)\n2. Then verify each extracted claim against the provided context\n3. Consider it a hallucination if a claim contradicts the context\n4. Consider it a hallucination if a claim makes assertions not supported by context\n5. Empty outputs should be handled as having no hallucinations\n6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination\n7. Speculative language about facts NOT in the context IS a hallucination\n8. Never use prior knowledge in judgments - only use what's explicitly stated in context\n9. The following are NOT hallucinations:\n - Using less precise dates (e.g., year when context gives month)\n - Reasonable numerical approximations\n - Omitting additional details while maintaining factual accuracy\n10. Subjective claims (\"made history\", \"pioneering\", \"leading\") are hallucinations unless explicitly stated in context";
|
|
508
508
|
|
|
509
509
|
export declare class HallucinationJudge extends MastraAgentJudge {
|
|
510
510
|
constructor(model: LanguageModel);
|
|
package/dist/metrics/llm/index.cjs
CHANGED
|
@@ -955,98 +955,101 @@ var FaithfulnessMetric = class extends _eval.Metric {
|
|
|
955
955
|
};
|
|
956
956
|
|
|
957
957
|
// src/metrics/llm/hallucination/prompts.ts
|
|
958
|
-
var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output
|
|
958
|
+
var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.
|
|
959
959
|
|
|
960
960
|
Key Principles:
|
|
961
|
-
1.
|
|
962
|
-
2.
|
|
963
|
-
3. Consider a
|
|
964
|
-
4. Consider
|
|
965
|
-
5. Empty outputs should be handled as having no
|
|
966
|
-
6.
|
|
967
|
-
7.
|
|
968
|
-
8.
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
961
|
+
1. First extract all claims from the output (both factual and speculative)
|
|
962
|
+
2. Then verify each extracted claim against the provided context
|
|
963
|
+
3. Consider it a hallucination if a claim contradicts the context
|
|
964
|
+
4. Consider it a hallucination if a claim makes assertions not supported by context
|
|
965
|
+
5. Empty outputs should be handled as having no hallucinations
|
|
966
|
+
6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination
|
|
967
|
+
7. Speculative language about facts NOT in the context IS a hallucination
|
|
968
|
+
8. Never use prior knowledge in judgments - only use what's explicitly stated in context
|
|
969
|
+
9. The following are NOT hallucinations:
|
|
970
|
+
- Using less precise dates (e.g., year when context gives month)
|
|
971
|
+
- Reasonable numerical approximations
|
|
972
|
+
- Omitting additional details while maintaining factual accuracy
|
|
973
|
+
10. Subjective claims ("made history", "pioneering", "leading") are hallucinations unless explicitly stated in context`;
|
|
974
|
+
function generateEvaluatePrompt5({ context, claims }) {
|
|
975
|
+
return `Verify if the claims contain any information not supported by or contradicting the provided context. A hallucination occurs when a claim either:
|
|
976
|
+
1. Contradicts the context
|
|
977
|
+
2. Makes assertions not supported by the context
|
|
978
|
+
|
|
979
|
+
Claims to verify:
|
|
980
|
+
${claims.join("\n")}
|
|
974
981
|
|
|
975
982
|
Number of context statements: ${context.length}
|
|
976
983
|
|
|
977
|
-
Context statements
|
|
984
|
+
Context statements:
|
|
978
985
|
${context.join("\n")}
|
|
979
986
|
|
|
980
|
-
For each
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
-
|
|
984
|
-
-
|
|
987
|
+
For each claim, determine if it is supported by the context. When evaluating:
|
|
988
|
+
|
|
989
|
+
1. NOT Hallucinations:
|
|
990
|
+
- Using less precise dates (e.g., year when context gives month)
|
|
991
|
+
- Reasonable numerical approximations
|
|
992
|
+
- Omitting additional details while maintaining factual accuracy
|
|
993
|
+
- Speculative language about facts present in context
|
|
994
|
+
|
|
995
|
+
2. ARE Hallucinations:
|
|
996
|
+
- Claims that contradict the context
|
|
997
|
+
- Assertions not supported by context
|
|
998
|
+
- Speculative claims about facts not in context
|
|
999
|
+
- Subjective claims not explicitly supported by context
|
|
985
1000
|
|
|
986
1001
|
Example:
|
|
987
|
-
Context:
|
|
988
|
-
|
|
1002
|
+
Context: [
|
|
1003
|
+
"SpaceX achieved first successful landing in December 2015.",
|
|
1004
|
+
"Their reusable rocket technology reduced launch costs by 30%."
|
|
1005
|
+
]
|
|
1006
|
+
Claims: [
|
|
1007
|
+
"SpaceX made history in 2015",
|
|
1008
|
+
"SpaceX had pioneering reusable rockets",
|
|
1009
|
+
"reusable rockets significantly cut costs",
|
|
1010
|
+
"They might expand operations globally"
|
|
1011
|
+
]
|
|
989
1012
|
{
|
|
990
1013
|
"verdicts": [
|
|
991
1014
|
{
|
|
992
|
-
"statement": "
|
|
1015
|
+
"statement": "SpaceX made history in 2015",
|
|
993
1016
|
"verdict": "yes",
|
|
994
|
-
"reason": "The
|
|
995
|
-
}
|
|
996
|
-
]
|
|
997
|
-
}
|
|
998
|
-
|
|
999
|
-
Context: "The company has exactly 1,234 employees"
|
|
1000
|
-
Output: "The company employs around 1,200 people"
|
|
1001
|
-
{
|
|
1002
|
-
"verdicts": [
|
|
1017
|
+
"reason": "The subjective claim 'made history' and the year are not supported by context"
|
|
1018
|
+
},
|
|
1003
1019
|
{
|
|
1004
|
-
"statement": "
|
|
1005
|
-
"verdict": "
|
|
1006
|
-
"reason": "
|
|
1007
|
-
}
|
|
1008
|
-
]
|
|
1009
|
-
}
|
|
1010
|
-
|
|
1011
|
-
Context: "Revenue reached $50.5 million in 2022"
|
|
1012
|
-
Output: "The company made about $50 million in 2022"
|
|
1013
|
-
{
|
|
1014
|
-
"verdicts": [
|
|
1020
|
+
"statement": "SpaceX had pioneering reusable rockets",
|
|
1021
|
+
"verdict": "yes",
|
|
1022
|
+
"reason": "The subjective claim 'pioneering' is not supported by context"
|
|
1023
|
+
},
|
|
1015
1024
|
{
|
|
1016
|
-
"statement": "
|
|
1025
|
+
"statement": "reusable rockets significantly cut costs",
|
|
1017
1026
|
"verdict": "no",
|
|
1018
|
-
"reason": "
|
|
1019
|
-
}
|
|
1020
|
-
]
|
|
1021
|
-
}
|
|
1022
|
-
|
|
1023
|
-
Context: "The startup raised $2.1 million in seed funding"
|
|
1024
|
-
Output: "The company secured approximately $5 million in their seed round"
|
|
1025
|
-
{
|
|
1026
|
-
"verdicts": [
|
|
1027
|
+
"reason": "Context supports that costs were reduced by 30%, this is a reasonable paraphrase"
|
|
1028
|
+
},
|
|
1027
1029
|
{
|
|
1028
|
-
"statement": "
|
|
1030
|
+
"statement": "They might expand operations globally",
|
|
1029
1031
|
"verdict": "yes",
|
|
1030
|
-
"reason": "
|
|
1032
|
+
"reason": "This speculative claim about facts not in context is a hallucination"
|
|
1031
1033
|
}
|
|
1032
1034
|
]
|
|
1033
1035
|
}
|
|
1034
1036
|
|
|
1035
1037
|
Rules:
|
|
1036
|
-
-
|
|
1037
|
-
-
|
|
1038
|
+
- Mark as hallucination if information contradicts context
|
|
1039
|
+
- Mark as hallucination if assertions aren't supported by context
|
|
1040
|
+
- Allow reasonable approximations and less precise dates
|
|
1041
|
+
- Every factual claim must be verified
|
|
1038
1042
|
- Never use prior knowledge in your judgment
|
|
1039
1043
|
- Provide clear reasoning for each verdict
|
|
1040
|
-
- Be specific about
|
|
1041
|
-
- The number of verdicts MUST MATCH the number of context statements exactly
|
|
1044
|
+
- Be specific about what information is or isn't supported by context
|
|
1042
1045
|
|
|
1043
1046
|
Format:
|
|
1044
1047
|
{
|
|
1045
1048
|
"verdicts": [
|
|
1046
1049
|
{
|
|
1047
|
-
"statement": "
|
|
1050
|
+
"statement": "individual claim",
|
|
1048
1051
|
"verdict": "yes/no",
|
|
1049
|
-
"reason": "explanation of
|
|
1052
|
+
"reason": "explanation of whether the claim is supported by context"
|
|
1050
1053
|
}
|
|
1051
1054
|
]
|
|
1052
1055
|
}`;
|
|
@@ -1096,7 +1099,16 @@ var HallucinationJudge = class extends MastraAgentJudge {
|
|
|
1096
1099
|
super("Hallucination", HALLUCINATION_AGENT_INSTRUCTIONS, model);
|
|
1097
1100
|
}
|
|
1098
1101
|
async evaluate(output, context) {
|
|
1099
|
-
const
|
|
1102
|
+
const claimsPrompt = generateClaimExtractionPrompt({ output });
|
|
1103
|
+
const claims = await this.agent.generate(claimsPrompt, {
|
|
1104
|
+
output: zod.z.object({
|
|
1105
|
+
claims: zod.z.array(zod.z.string())
|
|
1106
|
+
})
|
|
1107
|
+
});
|
|
1108
|
+
if (claims.object.claims.length === 0) {
|
|
1109
|
+
return [];
|
|
1110
|
+
}
|
|
1111
|
+
const evaluatePrompt = generateEvaluatePrompt5({ claims: claims.object.claims, context });
|
|
1100
1112
|
const result = await this.agent.generate(evaluatePrompt, {
|
|
1101
1113
|
output: zod.z.object({
|
|
1102
1114
|
verdicts: zod.z.array(
|
|
@@ -1132,6 +1144,7 @@ var HallucinationMetric = class extends _eval.Metric {
|
|
|
1132
1144
|
}
|
|
1133
1145
|
async measure(input, output) {
|
|
1134
1146
|
const verdicts = await this.judge.evaluate(output, this.context);
|
|
1147
|
+
console.log("verdicts", verdicts);
|
|
1135
1148
|
const score = this.calculateScore(verdicts);
|
|
1136
1149
|
const reason = await this.judge.getReason({
|
|
1137
1150
|
input,
|
|
package/dist/metrics/llm/index.js
CHANGED
|
@@ -942,98 +942,101 @@ var FaithfulnessMetric = class extends Metric {
|
|
|
942
942
|
};
|
|
943
943
|
|
|
944
944
|
// src/metrics/llm/hallucination/prompts.ts
|
|
945
|
-
var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output
|
|
945
|
+
var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.
|
|
946
946
|
|
|
947
947
|
Key Principles:
|
|
948
|
-
1.
|
|
949
|
-
2.
|
|
950
|
-
3. Consider a
|
|
951
|
-
4. Consider
|
|
952
|
-
5. Empty outputs should be handled as having no
|
|
953
|
-
6.
|
|
954
|
-
7.
|
|
955
|
-
8.
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
948
|
+
1. First extract all claims from the output (both factual and speculative)
|
|
949
|
+
2. Then verify each extracted claim against the provided context
|
|
950
|
+
3. Consider it a hallucination if a claim contradicts the context
|
|
951
|
+
4. Consider it a hallucination if a claim makes assertions not supported by context
|
|
952
|
+
5. Empty outputs should be handled as having no hallucinations
|
|
953
|
+
6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination
|
|
954
|
+
7. Speculative language about facts NOT in the context IS a hallucination
|
|
955
|
+
8. Never use prior knowledge in judgments - only use what's explicitly stated in context
|
|
956
|
+
9. The following are NOT hallucinations:
|
|
957
|
+
- Using less precise dates (e.g., year when context gives month)
|
|
958
|
+
- Reasonable numerical approximations
|
|
959
|
+
- Omitting additional details while maintaining factual accuracy
|
|
960
|
+
10. Subjective claims ("made history", "pioneering", "leading") are hallucinations unless explicitly stated in context`;
|
|
961
|
+
function generateEvaluatePrompt5({ context, claims }) {
|
|
962
|
+
return `Verify if the claims contain any information not supported by or contradicting the provided context. A hallucination occurs when a claim either:
|
|
963
|
+
1. Contradicts the context
|
|
964
|
+
2. Makes assertions not supported by the context
|
|
965
|
+
|
|
966
|
+
Claims to verify:
|
|
967
|
+
${claims.join("\n")}
|
|
961
968
|
|
|
962
969
|
Number of context statements: ${context.length}
|
|
963
970
|
|
|
964
|
-
Context statements
|
|
971
|
+
Context statements:
|
|
965
972
|
${context.join("\n")}
|
|
966
973
|
|
|
967
|
-
For each
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
-
|
|
971
|
-
-
|
|
974
|
+
For each claim, determine if it is supported by the context. When evaluating:
|
|
975
|
+
|
|
976
|
+
1. NOT Hallucinations:
|
|
977
|
+
- Using less precise dates (e.g., year when context gives month)
|
|
978
|
+
- Reasonable numerical approximations
|
|
979
|
+
- Omitting additional details while maintaining factual accuracy
|
|
980
|
+
- Speculative language about facts present in context
|
|
981
|
+
|
|
982
|
+
2. ARE Hallucinations:
|
|
983
|
+
- Claims that contradict the context
|
|
984
|
+
- Assertions not supported by context
|
|
985
|
+
- Speculative claims about facts not in context
|
|
986
|
+
- Subjective claims not explicitly supported by context
|
|
972
987
|
|
|
973
988
|
Example:
|
|
974
|
-
Context:
|
|
975
|
-
|
|
989
|
+
Context: [
|
|
990
|
+
"SpaceX achieved first successful landing in December 2015.",
|
|
991
|
+
"Their reusable rocket technology reduced launch costs by 30%."
|
|
992
|
+
]
|
|
993
|
+
Claims: [
|
|
994
|
+
"SpaceX made history in 2015",
|
|
995
|
+
"SpaceX had pioneering reusable rockets",
|
|
996
|
+
"reusable rockets significantly cut costs",
|
|
997
|
+
"They might expand operations globally"
|
|
998
|
+
]
|
|
976
999
|
{
|
|
977
1000
|
"verdicts": [
|
|
978
1001
|
{
|
|
979
|
-
"statement": "
|
|
1002
|
+
"statement": "SpaceX made history in 2015",
|
|
980
1003
|
"verdict": "yes",
|
|
981
|
-
"reason": "The
|
|
982
|
-
}
|
|
983
|
-
]
|
|
984
|
-
}
|
|
985
|
-
|
|
986
|
-
Context: "The company has exactly 1,234 employees"
|
|
987
|
-
Output: "The company employs around 1,200 people"
|
|
988
|
-
{
|
|
989
|
-
"verdicts": [
|
|
1004
|
+
"reason": "The subjective claim 'made history' and the year are not supported by context"
|
|
1005
|
+
},
|
|
990
1006
|
{
|
|
991
|
-
"statement": "
|
|
992
|
-
"verdict": "
|
|
993
|
-
"reason": "
|
|
994
|
-
}
|
|
995
|
-
]
|
|
996
|
-
}
|
|
997
|
-
|
|
998
|
-
Context: "Revenue reached $50.5 million in 2022"
|
|
999
|
-
Output: "The company made about $50 million in 2022"
|
|
1000
|
-
{
|
|
1001
|
-
"verdicts": [
|
|
1007
|
+
"statement": "SpaceX had pioneering reusable rockets",
|
|
1008
|
+
"verdict": "yes",
|
|
1009
|
+
"reason": "The subjective claim 'pioneering' is not supported by context"
|
|
1010
|
+
},
|
|
1002
1011
|
{
|
|
1003
|
-
"statement": "
|
|
1012
|
+
"statement": "reusable rockets significantly cut costs",
|
|
1004
1013
|
"verdict": "no",
|
|
1005
|
-
"reason": "
|
|
1006
|
-
}
|
|
1007
|
-
]
|
|
1008
|
-
}
|
|
1009
|
-
|
|
1010
|
-
Context: "The startup raised $2.1 million in seed funding"
|
|
1011
|
-
Output: "The company secured approximately $5 million in their seed round"
|
|
1012
|
-
{
|
|
1013
|
-
"verdicts": [
|
|
1014
|
+
"reason": "Context supports that costs were reduced by 30%, this is a reasonable paraphrase"
|
|
1015
|
+
},
|
|
1014
1016
|
{
|
|
1015
|
-
"statement": "
|
|
1017
|
+
"statement": "They might expand operations globally",
|
|
1016
1018
|
"verdict": "yes",
|
|
1017
|
-
"reason": "
|
|
1019
|
+
"reason": "This speculative claim about facts not in context is a hallucination"
|
|
1018
1020
|
}
|
|
1019
1021
|
]
|
|
1020
1022
|
}
|
|
1021
1023
|
|
|
1022
1024
|
Rules:
|
|
1023
|
-
-
|
|
1024
|
-
-
|
|
1025
|
+
- Mark as hallucination if information contradicts context
|
|
1026
|
+
- Mark as hallucination if assertions aren't supported by context
|
|
1027
|
+
- Allow reasonable approximations and less precise dates
|
|
1028
|
+
- Every factual claim must be verified
|
|
1025
1029
|
- Never use prior knowledge in your judgment
|
|
1026
1030
|
- Provide clear reasoning for each verdict
|
|
1027
|
-
- Be specific about
|
|
1028
|
-
- The number of verdicts MUST MATCH the number of context statements exactly
|
|
1031
|
+
- Be specific about what information is or isn't supported by context
|
|
1029
1032
|
|
|
1030
1033
|
Format:
|
|
1031
1034
|
{
|
|
1032
1035
|
"verdicts": [
|
|
1033
1036
|
{
|
|
1034
|
-
"statement": "
|
|
1037
|
+
"statement": "individual claim",
|
|
1035
1038
|
"verdict": "yes/no",
|
|
1036
|
-
"reason": "explanation of
|
|
1039
|
+
"reason": "explanation of whether the claim is supported by context"
|
|
1037
1040
|
}
|
|
1038
1041
|
]
|
|
1039
1042
|
}`;
|
|
@@ -1083,7 +1086,16 @@ var HallucinationJudge = class extends MastraAgentJudge {
|
|
|
1083
1086
|
super("Hallucination", HALLUCINATION_AGENT_INSTRUCTIONS, model);
|
|
1084
1087
|
}
|
|
1085
1088
|
async evaluate(output, context) {
|
|
1086
|
-
const
|
|
1089
|
+
const claimsPrompt = generateClaimExtractionPrompt({ output });
|
|
1090
|
+
const claims = await this.agent.generate(claimsPrompt, {
|
|
1091
|
+
output: z.object({
|
|
1092
|
+
claims: z.array(z.string())
|
|
1093
|
+
})
|
|
1094
|
+
});
|
|
1095
|
+
if (claims.object.claims.length === 0) {
|
|
1096
|
+
return [];
|
|
1097
|
+
}
|
|
1098
|
+
const evaluatePrompt = generateEvaluatePrompt5({ claims: claims.object.claims, context });
|
|
1087
1099
|
const result = await this.agent.generate(evaluatePrompt, {
|
|
1088
1100
|
output: z.object({
|
|
1089
1101
|
verdicts: z.array(
|
|
@@ -1119,6 +1131,7 @@ var HallucinationMetric = class extends Metric {
|
|
|
1119
1131
|
}
|
|
1120
1132
|
async measure(input, output) {
|
|
1121
1133
|
const verdicts = await this.judge.evaluate(output, this.context);
|
|
1134
|
+
console.log("verdicts", verdicts);
|
|
1122
1135
|
const score = this.calculateScore(verdicts);
|
|
1123
1136
|
const reason = await this.judge.getReason({
|
|
1124
1137
|
input,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/evals",
|
|
3
|
-
"version": "0.1.8-alpha.0",
|
|
3
|
+
"version": "0.1.8-alpha.10",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -59,7 +59,7 @@
|
|
|
59
59
|
"sentiment": "^5.0.2",
|
|
60
60
|
"string-similarity": "^4.0.4",
|
|
61
61
|
"zod": "^3.24.1",
|
|
62
|
-
"@mastra/core": "^0.5.0-alpha.0"
|
|
62
|
+
"@mastra/core": "^0.5.0-alpha.10"
|
|
63
63
|
},
|
|
64
64
|
"peerDependencies": {
|
|
65
65
|
"ai": "^4.0.0"
|
|
package/src/metrics/llm/hallucination/index.test.ts
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import { openai } from '@ai-sdk/openai';
|
|
2
|
-
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import type { TestCaseWithContext } from '../utils';
|
|
5
5
|
|
|
6
6
|
import { HallucinationMetric } from './index';
|
|
7
7
|
|
|
8
|
+
vi.setConfig({ testTimeout: 30000, hookTimeout: 30000 });
|
|
9
|
+
|
|
8
10
|
const testCases: TestCaseWithContext[] = [
|
|
9
11
|
{
|
|
10
12
|
// No hallucination - output aligns with context
|
|
@@ -17,7 +19,7 @@ const testCases: TestCaseWithContext[] = [
|
|
|
17
19
|
},
|
|
18
20
|
},
|
|
19
21
|
{
|
|
20
|
-
// Complete hallucination - output contradicts
|
|
22
|
+
// Complete hallucination - output contradicts context
|
|
21
23
|
input: 'Tell me about Tesla.',
|
|
22
24
|
context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.', 'The first Tesla car was the Roadster.'],
|
|
23
25
|
output: 'Tesla was established in 2001 by Elon Musk himself. Their first car was the Model S.',
|
|
@@ -38,33 +40,33 @@ const testCases: TestCaseWithContext[] = [
|
|
|
38
40
|
},
|
|
39
41
|
},
|
|
40
42
|
{
|
|
41
|
-
// Empty output
|
|
43
|
+
// Empty output
|
|
42
44
|
input: 'Tell me about Tesla.',
|
|
43
45
|
context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.'],
|
|
44
46
|
output: '',
|
|
45
47
|
expectedResult: {
|
|
46
48
|
score: 0.0,
|
|
47
|
-
reason: 'Empty output cannot
|
|
49
|
+
reason: 'Empty output cannot contain hallucinations.',
|
|
48
50
|
},
|
|
49
51
|
},
|
|
50
52
|
{
|
|
51
|
-
// Speculative language
|
|
53
|
+
// Speculative language with known facts
|
|
52
54
|
input: 'Tell me about Tesla.',
|
|
53
55
|
context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.'],
|
|
54
56
|
output: 'Tesla might have been founded around 2003, and I believe Elon Musk possibly joined a year later.',
|
|
55
57
|
expectedResult: {
|
|
56
58
|
score: 0.0,
|
|
57
|
-
reason: 'Speculative language
|
|
59
|
+
reason: 'Speculative language about facts that match context is not considered hallucination.',
|
|
58
60
|
},
|
|
59
61
|
},
|
|
60
62
|
{
|
|
61
|
-
// Empty context
|
|
63
|
+
// Empty context
|
|
62
64
|
input: 'Tell me about Tesla.',
|
|
63
65
|
context: [],
|
|
64
66
|
output: 'Tesla was founded in 2001 by Elon Musk.',
|
|
65
67
|
expectedResult: {
|
|
66
|
-
score:
|
|
67
|
-
reason: '
|
|
68
|
+
score: 1.0,
|
|
69
|
+
reason: 'With no context provided, any factual claims are considered hallucinations.',
|
|
68
70
|
},
|
|
69
71
|
},
|
|
70
72
|
{
|
|
@@ -73,8 +75,9 @@ const testCases: TestCaseWithContext[] = [
|
|
|
73
75
|
context: ['SpaceX achieved first successful landing in 2015.', 'Their first crewed mission was in 2020.'],
|
|
74
76
|
output: 'Before anyone else, SpaceX pioneered reusable rockets with their first landing in 2014.',
|
|
75
77
|
expectedResult: {
|
|
76
|
-
score: 0
|
|
77
|
-
reason:
|
|
78
|
+
score: 1.0,
|
|
79
|
+
reason:
|
|
80
|
+
'Both the timing claim (2014 vs 2015) and the unsupported "Before anyone else" pioneering claim are hallucinations.',
|
|
78
81
|
},
|
|
79
82
|
},
|
|
80
83
|
{
|
|
@@ -91,16 +94,15 @@ const testCases: TestCaseWithContext[] = [
|
|
|
91
94
|
// Out of scope additions
|
|
92
95
|
input: 'Tell me about the company.',
|
|
93
96
|
context: ['The company was founded in New York.', 'They specialize in software.'],
|
|
94
|
-
output:
|
|
95
|
-
'The company, founded in New York, specializes in software and has offices worldwide with plans to expand into AI.',
|
|
97
|
+
output: 'The company, founded in New York, specializes in software and has offices worldwide.',
|
|
96
98
|
expectedResult: {
|
|
97
|
-
score: 0.
|
|
99
|
+
score: 0.33,
|
|
98
100
|
reason:
|
|
99
|
-
'
|
|
101
|
+
'One out of three claims (worldwide offices) is a hallucination, while founding location and specialization are supported.',
|
|
100
102
|
},
|
|
101
103
|
},
|
|
102
104
|
{
|
|
103
|
-
// Temporal
|
|
105
|
+
// Temporal sequence
|
|
104
106
|
input: 'Describe the project timeline.',
|
|
105
107
|
context: [
|
|
106
108
|
'Project started in January 2023.',
|
|
@@ -111,104 +113,205 @@ const testCases: TestCaseWithContext[] = [
|
|
|
111
113
|
expectedResult: {
|
|
112
114
|
score: 0.67,
|
|
113
115
|
reason:
|
|
114
|
-
'Two
|
|
116
|
+
'Two out of three claims are hallucinations: Phase 2 completion date and the claim about Phase 2 finishing before Phase 1.',
|
|
115
117
|
},
|
|
116
118
|
},
|
|
117
119
|
{
|
|
118
|
-
// Numerical contradiction
|
|
120
|
+
// Numerical contradiction
|
|
119
121
|
input: 'Tell me about the company size.',
|
|
120
122
|
context: ['The company employs 300 people globally.'],
|
|
121
123
|
output: 'The company has approximately 1000 employees worldwide.',
|
|
122
124
|
expectedResult: {
|
|
123
125
|
score: 1.0,
|
|
124
126
|
reason:
|
|
125
|
-
'Despite using "approximately", the claimed number (1000) represents a deviation too large from the actual value (300)
|
|
127
|
+
'Despite using "approximately", the claimed number (1000) represents a deviation too large from the actual value (300).',
|
|
128
|
+
},
|
|
129
|
+
},
|
|
130
|
+
{
|
|
131
|
+
// Additional information
|
|
132
|
+
input: 'Tell me about Tesla.',
|
|
133
|
+
context: ['Tesla was founded in 2003.'],
|
|
134
|
+
output: 'Tesla, founded in 2003, is a leading electric car manufacturer with global operations.',
|
|
135
|
+
expectedResult: {
|
|
136
|
+
score: 0.67,
|
|
137
|
+
reason:
|
|
138
|
+
'Two out of three claims (being a leading manufacturer and having global operations) are not supported by context.',
|
|
139
|
+
},
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
// Speculative claims about unknown facts
|
|
143
|
+
input: 'Tell me about Tesla.',
|
|
144
|
+
context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.'],
|
|
145
|
+
output: 'Tesla was founded in 2003, and might be the most innovative car company in history.',
|
|
146
|
+
expectedResult: {
|
|
147
|
+
score: 0.5,
|
|
148
|
+
reason: 'While founding date is supported, speculative claim about being most innovative is not in context.',
|
|
149
|
+
},
|
|
150
|
+
},
|
|
151
|
+
{
|
|
152
|
+
// Date precision
|
|
153
|
+
input: 'Tell me about SpaceX achievements.',
|
|
154
|
+
context: ['SpaceX achieved first successful landing in December 2015.'],
|
|
155
|
+
output: 'SpaceX made history with their first successful landing in 2015.',
|
|
156
|
+
expectedResult: {
|
|
157
|
+
score: 1.0,
|
|
158
|
+
reason:
|
|
159
|
+
'The statement contains an unsupported subjective claim ("made history") that modifies the factual landing claim.',
|
|
160
|
+
},
|
|
161
|
+
},
|
|
162
|
+
{
|
|
163
|
+
// Numerical precision
|
|
164
|
+
input: 'Tell me about the company size.',
|
|
165
|
+
context: ['The company employs exactly 300 people globally.'],
|
|
166
|
+
output: 'The company has approximately 300 employees worldwide.',
|
|
167
|
+
expectedResult: {
|
|
168
|
+
score: 0.0,
|
|
169
|
+
reason: 'Using "approximately" when context specifies "exactly" is still considered a reasonable approximation.',
|
|
170
|
+
},
|
|
171
|
+
},
|
|
172
|
+
{
|
|
173
|
+
// Mixed precision levels
|
|
174
|
+
input: 'Tell me about revenue growth.',
|
|
175
|
+
context: ['Company revenue grew from exactly $10.5M in Q1 to approximately $20M in Q2.'],
|
|
176
|
+
output: 'Revenue was about $10M in Q1 and exactly $20M in Q2.',
|
|
177
|
+
expectedResult: {
|
|
178
|
+
score: 1.0,
|
|
179
|
+
reason:
|
|
180
|
+
'Mismatched precision levels: uses "about" when context specifies "exactly" for Q1, and uses "exactly" when context specifies "approximately" for Q2.',
|
|
181
|
+
},
|
|
182
|
+
},
|
|
183
|
+
{
|
|
184
|
+
// Relative comparisons
|
|
185
|
+
input: 'Tell me about the market share.',
|
|
186
|
+
context: ['Company A has 30% market share.', 'Company B has 25% market share.'],
|
|
187
|
+
output: 'Company A leads the market with 30% share, ahead of Company B.',
|
|
188
|
+
expectedResult: {
|
|
189
|
+
score: 0.5,
|
|
190
|
+
reason:
|
|
191
|
+
'While the market share numbers are correct, the claim about "leading the market" is not supported as we don\'t know about other companies.',
|
|
126
192
|
},
|
|
127
193
|
},
|
|
128
194
|
];
|
|
129
195
|
|
|
130
196
|
const model = openai('gpt-4o');
|
|
131
|
-
describe(
|
|
132
|
-
'
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
);
|
|
197
|
+
describe('HallucinationMetric', () => {
|
|
198
|
+
it('should handle perfect alignment', async () => {
|
|
199
|
+
const testCase = testCases[0]!;
|
|
200
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
201
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
202
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
203
|
+
});
|
|
204
|
+
|
|
205
|
+
it('should handle complete hallucination', async () => {
|
|
206
|
+
const testCase = testCases[1]!;
|
|
207
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
208
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
209
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
210
|
+
});
|
|
211
|
+
|
|
212
|
+
it('should handle partial hallucination', async () => {
|
|
213
|
+
const testCase = testCases[2]!;
|
|
214
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
215
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
216
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
217
|
+
});
|
|
218
|
+
|
|
219
|
+
it('should handle empty output', async () => {
|
|
220
|
+
const testCase = testCases[3]!;
|
|
221
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
222
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
223
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
224
|
+
});
|
|
225
|
+
|
|
226
|
+
it('should handle speculative language', async () => {
|
|
227
|
+
const testCase = testCases[4]!;
|
|
228
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
229
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
230
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
231
|
+
});
|
|
232
|
+
|
|
233
|
+
it('should handle empty context', async () => {
|
|
234
|
+
const testCase = testCases[5]!;
|
|
235
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
236
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
237
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
238
|
+
});
|
|
239
|
+
|
|
240
|
+
it('should handle implicit contradictions', async () => {
|
|
241
|
+
const testCase = testCases[6]!;
|
|
242
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
243
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
244
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
245
|
+
});
|
|
246
|
+
|
|
247
|
+
it('should handle numerical approximations', async () => {
|
|
248
|
+
const testCase = testCases[7]!;
|
|
249
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
250
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
251
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
252
|
+
});
|
|
253
|
+
|
|
254
|
+
it('should handle out of scope additions', async () => {
|
|
255
|
+
const testCase = testCases[8]!;
|
|
256
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
257
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
258
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
259
|
+
});
|
|
260
|
+
|
|
261
|
+
it('should handle temporal contradictions', async () => {
|
|
262
|
+
const testCase = testCases[9]!;
|
|
263
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
264
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
265
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
266
|
+
});
|
|
267
|
+
|
|
268
|
+
it('should handle numerical contradiction despite approximation', async () => {
|
|
269
|
+
const testCase = testCases[10]!;
|
|
270
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
271
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
272
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
273
|
+
});
|
|
274
|
+
|
|
275
|
+
// New tests for stricter hallucination checking
|
|
276
|
+
it('should detect additional information as hallucination', async () => {
|
|
277
|
+
const testCase = testCases[11]!;
|
|
278
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
279
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
280
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
281
|
+
});
|
|
282
|
+
|
|
283
|
+
it('should detect speculative claims about unknown facts as hallucination', async () => {
|
|
284
|
+
const testCase = testCases[12]!;
|
|
285
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
286
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
287
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
288
|
+
});
|
|
289
|
+
|
|
290
|
+
it('should enforce strict date matching', async () => {
|
|
291
|
+
const testCase = testCases[13]!;
|
|
292
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
293
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
294
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
295
|
+
});
|
|
296
|
+
|
|
297
|
+
it('should enforce strict numerical matching', async () => {
|
|
298
|
+
const testCase = testCases[14]!;
|
|
299
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
300
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
301
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
302
|
+
});
|
|
303
|
+
|
|
304
|
+
it('should handle mixed precision levels', async () => {
|
|
305
|
+
const testCase = testCases[15]!;
|
|
306
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
307
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
308
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
309
|
+
});
|
|
310
|
+
|
|
311
|
+
it('should handle relative comparisons', async () => {
|
|
312
|
+
const testCase = testCases[16]!;
|
|
313
|
+
const metric = new HallucinationMetric(model, { context: testCase.context });
|
|
314
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
315
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
316
|
+
});
|
|
317
|
+
});
|
|
@@ -26,6 +26,7 @@ export class HallucinationMetric extends Metric {
|
|
|
26
26
|
|
|
27
27
|
async measure(input: string, output: string): Promise<MetricResultWithReason> {
|
|
28
28
|
const verdicts = await this.judge.evaluate(output, this.context);
|
|
29
|
+
console.log('verdicts', verdicts);
|
|
29
30
|
const score = this.calculateScore(verdicts);
|
|
30
31
|
const reason = await this.judge.getReason({
|
|
31
32
|
input,
|
|
@@ -2,7 +2,7 @@ import type { LanguageModel } from '@mastra/core/llm';
|
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
5
|
-
|
|
5
|
+
import { generateClaimExtractionPrompt } from '../faithfulness/prompts';
|
|
6
6
|
import { generateEvaluatePrompt, HALLUCINATION_AGENT_INSTRUCTIONS, generateReasonPrompt } from './prompts';
|
|
7
7
|
|
|
8
8
|
export class HallucinationJudge extends MastraAgentJudge {
|
|
@@ -11,7 +11,18 @@ export class HallucinationJudge extends MastraAgentJudge {
|
|
|
11
11
|
}
|
|
12
12
|
|
|
13
13
|
async evaluate(output: string, context: string[]): Promise<{ statement: string; verdict: string; reason: string }[]> {
|
|
14
|
-
const
|
|
14
|
+
const claimsPrompt = generateClaimExtractionPrompt({ output });
|
|
15
|
+
const claims = await this.agent.generate(claimsPrompt, {
|
|
16
|
+
output: z.object({
|
|
17
|
+
claims: z.array(z.string()),
|
|
18
|
+
}),
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
if (claims.object.claims.length === 0) {
|
|
22
|
+
return [];
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
const evaluatePrompt = generateEvaluatePrompt({ claims: claims.object.claims, context });
|
|
15
26
|
const result = await this.agent.generate(evaluatePrompt, {
|
|
16
27
|
output: z.object({
|
|
17
28
|
verdicts: z.array(
|
|
@@ -1,96 +1,99 @@
|
|
|
1
|
-
export const HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output
|
|
1
|
+
export const HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.
|
|
2
2
|
|
|
3
3
|
Key Principles:
|
|
4
|
-
1.
|
|
5
|
-
2.
|
|
6
|
-
3. Consider a
|
|
7
|
-
4. Consider
|
|
8
|
-
5. Empty outputs should be handled as having no
|
|
9
|
-
6.
|
|
10
|
-
7.
|
|
11
|
-
8.
|
|
4
|
+
1. First extract all claims from the output (both factual and speculative)
|
|
5
|
+
2. Then verify each extracted claim against the provided context
|
|
6
|
+
3. Consider it a hallucination if a claim contradicts the context
|
|
7
|
+
4. Consider it a hallucination if a claim makes assertions not supported by context
|
|
8
|
+
5. Empty outputs should be handled as having no hallucinations
|
|
9
|
+
6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination
|
|
10
|
+
7. Speculative language about facts NOT in the context IS a hallucination
|
|
11
|
+
8. Never use prior knowledge in judgments - only use what's explicitly stated in context
|
|
12
|
+
9. The following are NOT hallucinations:
|
|
13
|
+
- Using less precise dates (e.g., year when context gives month)
|
|
14
|
+
- Reasonable numerical approximations
|
|
15
|
+
- Omitting additional details while maintaining factual accuracy
|
|
16
|
+
10. Subjective claims ("made history", "pioneering", "leading") are hallucinations unless explicitly stated in context`;
|
|
12
17
|
|
|
13
|
-
export function generateEvaluatePrompt({ context,
|
|
14
|
-
return `Verify if the
|
|
18
|
+
export function generateEvaluatePrompt({ context, claims }: { context: string[]; claims: string[] }) {
|
|
19
|
+
return `Verify if the claims contain any information not supported by or contradicting the provided context. A hallucination occurs when a claim either:
|
|
20
|
+
1. Contradicts the context
|
|
21
|
+
2. Makes assertions not supported by the context
|
|
15
22
|
|
|
16
|
-
|
|
17
|
-
${
|
|
23
|
+
Claims to verify:
|
|
24
|
+
${claims.join('\n')}
|
|
18
25
|
|
|
19
26
|
Number of context statements: ${context.length}
|
|
20
27
|
|
|
21
|
-
Context statements
|
|
28
|
+
Context statements:
|
|
22
29
|
${context.join('\n')}
|
|
23
30
|
|
|
24
|
-
For each
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
-
|
|
28
|
-
-
|
|
31
|
+
For each claim, determine if it is supported by the context. When evaluating:
|
|
32
|
+
|
|
33
|
+
1. NOT Hallucinations:
|
|
34
|
+
- Using less precise dates (e.g., year when context gives month)
|
|
35
|
+
- Reasonable numerical approximations
|
|
36
|
+
- Omitting additional details while maintaining factual accuracy
|
|
37
|
+
- Speculative language about facts present in context
|
|
38
|
+
|
|
39
|
+
2. ARE Hallucinations:
|
|
40
|
+
- Claims that contradict the context
|
|
41
|
+
- Assertions not supported by context
|
|
42
|
+
- Speculative claims about facts not in context
|
|
43
|
+
- Subjective claims not explicitly supported by context
|
|
29
44
|
|
|
30
45
|
Example:
|
|
31
|
-
Context:
|
|
32
|
-
|
|
46
|
+
Context: [
|
|
47
|
+
"SpaceX achieved first successful landing in December 2015.",
|
|
48
|
+
"Their reusable rocket technology reduced launch costs by 30%."
|
|
49
|
+
]
|
|
50
|
+
Claims: [
|
|
51
|
+
"SpaceX made history in 2015",
|
|
52
|
+
"SpaceX had pioneering reusable rockets",
|
|
53
|
+
"reusable rockets significantly cut costs",
|
|
54
|
+
"They might expand operations globally"
|
|
55
|
+
]
|
|
33
56
|
{
|
|
34
57
|
"verdicts": [
|
|
35
58
|
{
|
|
36
|
-
"statement": "
|
|
59
|
+
"statement": "SpaceX made history in 2015",
|
|
37
60
|
"verdict": "yes",
|
|
38
|
-
"reason": "The
|
|
39
|
-
}
|
|
40
|
-
]
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
Context: "The company has exactly 1,234 employees"
|
|
44
|
-
Output: "The company employs around 1,200 people"
|
|
45
|
-
{
|
|
46
|
-
"verdicts": [
|
|
61
|
+
"reason": "The subjective claim 'made history' and the year are not supported by context"
|
|
62
|
+
},
|
|
47
63
|
{
|
|
48
|
-
"statement": "
|
|
49
|
-
"verdict": "
|
|
50
|
-
"reason": "
|
|
51
|
-
}
|
|
52
|
-
]
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
Context: "Revenue reached $50.5 million in 2022"
|
|
56
|
-
Output: "The company made about $50 million in 2022"
|
|
57
|
-
{
|
|
58
|
-
"verdicts": [
|
|
64
|
+
"statement": "SpaceX had pioneering reusable rockets",
|
|
65
|
+
"verdict": "yes",
|
|
66
|
+
"reason": "The subjective claim 'pioneering' is not supported by context"
|
|
67
|
+
},
|
|
59
68
|
{
|
|
60
|
-
"statement": "
|
|
69
|
+
"statement": "reusable rockets significantly cut costs",
|
|
61
70
|
"verdict": "no",
|
|
62
|
-
"reason": "
|
|
63
|
-
}
|
|
64
|
-
]
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
Context: "The startup raised $2.1 million in seed funding"
|
|
68
|
-
Output: "The company secured approximately $5 million in their seed round"
|
|
69
|
-
{
|
|
70
|
-
"verdicts": [
|
|
71
|
+
"reason": "Context supports that costs were reduced by 30%, this is a reasonable paraphrase"
|
|
72
|
+
},
|
|
71
73
|
{
|
|
72
|
-
"statement": "
|
|
74
|
+
"statement": "They might expand operations globally",
|
|
73
75
|
"verdict": "yes",
|
|
74
|
-
"reason": "
|
|
76
|
+
"reason": "This speculative claim about facts not in context is a hallucination"
|
|
75
77
|
}
|
|
76
78
|
]
|
|
77
79
|
}
|
|
78
80
|
|
|
79
81
|
Rules:
|
|
80
|
-
-
|
|
81
|
-
-
|
|
82
|
+
- Mark as hallucination if information contradicts context
|
|
83
|
+
- Mark as hallucination if assertions aren't supported by context
|
|
84
|
+
- Allow reasonable approximations and less precise dates
|
|
85
|
+
- Every factual claim must be verified
|
|
82
86
|
- Never use prior knowledge in your judgment
|
|
83
87
|
- Provide clear reasoning for each verdict
|
|
84
|
-
- Be specific about
|
|
85
|
-
- The number of verdicts MUST MATCH the number of context statements exactly
|
|
88
|
+
- Be specific about what information is or isn't supported by context
|
|
86
89
|
|
|
87
90
|
Format:
|
|
88
91
|
{
|
|
89
92
|
"verdicts": [
|
|
90
93
|
{
|
|
91
|
-
"statement": "
|
|
94
|
+
"statement": "individual claim",
|
|
92
95
|
"verdict": "yes/no",
|
|
93
|
-
"reason": "explanation of
|
|
96
|
+
"reason": "explanation of whether the claim is supported by context"
|
|
94
97
|
}
|
|
95
98
|
]
|
|
96
99
|
}`;
|