@mastra/evals 0.1.8-alpha.0 → 0.1.8-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,37 +1,37 @@
1
1
 
2
- > @mastra/evals@0.1.8-alpha.0 build /home/runner/work/mastra/mastra/packages/evals
2
+ > @mastra/evals@0.1.8-alpha.10 build /home/runner/work/mastra/mastra/packages/evals
3
3
  > pnpm check && tsup src/index.ts src/metrics/judge/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm,cjs --experimental-dts --clean --treeshake
4
4
 
5
5
 
6
- > @mastra/evals@0.1.8-alpha.0 check /home/runner/work/mastra/mastra/packages/evals
6
+ > @mastra/evals@0.1.8-alpha.10 check /home/runner/work/mastra/mastra/packages/evals
7
7
  > tsc --noEmit
8
8
 
9
- CLI Building entry: src/index.ts, src/metrics/judge/index.ts, src/metrics/llm/index.ts, src/metrics/nlp/index.ts
9
+ CLI Building entry: src/index.ts, src/metrics/judge/index.ts, src/metrics/nlp/index.ts, src/metrics/llm/index.ts
10
10
  CLI Using tsconfig: tsconfig.json
11
11
  CLI tsup v8.3.6
12
12
  TSC Build start
13
- TSC ⚡️ Build success in 9785ms
13
+ TSC ⚡️ Build success in 16273ms
14
14
  DTS Build start
15
15
  CLI Target: es2022
16
16
  Analysis will use the bundled TypeScript version 5.7.3
17
17
  Writing package typings: /home/runner/work/mastra/mastra/packages/evals/dist/_tsup-dts-rollup.d.ts
18
18
  Analysis will use the bundled TypeScript version 5.7.3
19
19
  Writing package typings: /home/runner/work/mastra/mastra/packages/evals/dist/_tsup-dts-rollup.d.cts
20
- DTS ⚡️ Build success in 9198ms
20
+ DTS ⚡️ Build success in 10916ms
21
21
  CLI Cleaning output folder
22
22
  ESM Build start
23
23
  CJS Build start
24
24
  CJS dist/metrics/judge/index.cjs 341.00 B
25
- CJS dist/metrics/llm/index.cjs 86.28 KB
26
25
  CJS dist/metrics/nlp/index.cjs 6.94 KB
26
+ CJS dist/metrics/llm/index.cjs 86.80 KB
27
27
  CJS dist/index.cjs 655.25 KB
28
- CJS ⚡️ Build success in 7816ms
28
+ CJS ⚡️ Build success in 10747ms
29
29
  ESM dist/index.js 2.63 KB
30
30
  ESM dist/metrics/judge/index.js 94.00 B
31
- ESM dist/chunk-TXXJUIES.js 305.00 B
32
31
  ESM dist/metrics/nlp/index.js 6.30 KB
32
+ ESM dist/chunk-TXXJUIES.js 305.00 B
33
33
  ESM dist/chunk-4VNS5WPM.js 1.82 KB
34
- ESM dist/metrics/llm/index.js 85.32 KB
34
+ ESM dist/metrics/llm/index.js 85.82 KB
35
35
  ESM dist/magic-string.es-5UDOWOAZ.js 40.80 KB
36
36
  ESM dist/dist-EOJDANYG.js 571.17 KB
37
- ESM ⚡️ Build success in 7823ms
37
+ ESM ⚡️ Build success in 10756ms
package/CHANGELOG.md CHANGED
@@ -1,5 +1,93 @@
1
1
  # @mastra/evals
2
2
 
3
+ ## 0.1.8-alpha.10
4
+
5
+ ### Patch Changes
6
+
7
+ - 9d31a36: Update hallucination eval
8
+ - Updated dependencies [a910463]
9
+ - @mastra/core@0.5.0-alpha.10
10
+
11
+ ## 0.1.8-alpha.9
12
+
13
+ ### Patch Changes
14
+
15
+ - Updated dependencies [e9fbac5]
16
+ - Updated dependencies [1e8bcbc]
17
+ - Updated dependencies [aeb5e36]
18
+ - Updated dependencies [f2301de]
19
+ - @mastra/core@0.5.0-alpha.9
20
+
21
+ ## 0.1.8-alpha.8
22
+
23
+ ### Patch Changes
24
+
25
+ - Updated dependencies [506f1d5]
26
+ - @mastra/core@0.5.0-alpha.8
27
+
28
+ ## 0.1.8-alpha.7
29
+
30
+ ### Patch Changes
31
+
32
+ - Updated dependencies [ee667a2]
33
+ - @mastra/core@0.5.0-alpha.7
34
+
35
+ ## 0.1.8-alpha.6
36
+
37
+ ### Patch Changes
38
+
39
+ - Updated dependencies [f6678e4]
40
+ - @mastra/core@0.5.0-alpha.6
41
+
42
+ ## 0.1.8-alpha.5
43
+
44
+ ### Patch Changes
45
+
46
+ - Updated dependencies [22643eb]
47
+ - Updated dependencies [6feb23f]
48
+ - Updated dependencies [f2d6727]
49
+ - Updated dependencies [301e4ee]
50
+ - Updated dependencies [dfbe4e9]
51
+ - Updated dependencies [9e81f35]
52
+ - Updated dependencies [caefaa2]
53
+ - Updated dependencies [c151ae6]
54
+ - Updated dependencies [52e0418]
55
+ - Updated dependencies [03236ec]
56
+ - Updated dependencies [3764e71]
57
+ - Updated dependencies [df982db]
58
+ - Updated dependencies [0461849]
59
+ - Updated dependencies [2259379]
60
+ - Updated dependencies [358f069]
61
+ - @mastra/core@0.5.0-alpha.5
62
+
63
+ ## 0.1.8-alpha.4
64
+
65
+ ### Patch Changes
66
+
67
+ - Updated dependencies [d79aedf]
68
+ - @mastra/core@0.5.0-alpha.4
69
+
70
+ ## 0.1.8-alpha.3
71
+
72
+ ### Patch Changes
73
+
74
+ - Updated dependencies [3d0e290]
75
+ - @mastra/core@0.5.0-alpha.3
76
+
77
+ ## 0.1.8-alpha.2
78
+
79
+ ### Patch Changes
80
+
81
+ - Updated dependencies [02ffb7b]
82
+ - @mastra/core@0.5.0-alpha.2
83
+
84
+ ## 0.1.8-alpha.1
85
+
86
+ ### Patch Changes
87
+
88
+ - Updated dependencies [dab255b]
89
+ - @mastra/core@0.5.0-alpha.1
90
+
3
91
  ## 0.1.8-alpha.0
4
92
 
5
93
  ### Patch Changes
@@ -355,9 +355,9 @@ export declare function generateEvaluatePrompt_alias_6({ claims, context }: {
355
355
  context: string[];
356
356
  }): string;
357
357
 
358
- export declare function generateEvaluatePrompt_alias_7({ context, output }: {
358
+ export declare function generateEvaluatePrompt_alias_7({ context, claims }: {
359
359
  context: string[];
360
- output: string;
360
+ claims: string[];
361
361
  }): string;
362
362
 
363
363
  export declare function generateEvaluatePrompt_alias_8({ instructions, input, output, }: {
@@ -504,7 +504,7 @@ declare function globalSetup(): Promise<void>;
504
504
  export { globalSetup }
505
505
  export { globalSetup as globalSetup_alias_1 }
506
506
 
507
- export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contradicts the provided context, focusing on identifying factual inconsistencies.\n\nKey Principles:\n1. Treat each context piece as a statement to verify\n2. Verify if the output contradicts any of these statements\n3. Consider a contradiction when the output directly conflicts with context statements\n4. Consider no contradiction when the output aligns with or doesn't mention context statements\n5. Empty outputs should be handled as having no contradictions\n6. Focus on factual inconsistencies, not omissions\n7. Never use prior knowledge in judgments\n8. Speculative language (may, might, possibly) should not be considered contradictions";
507
+ export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.\n\nKey Principles:\n1. First extract all claims from the output (both factual and speculative)\n2. Then verify each extracted claim against the provided context\n3. Consider it a hallucination if a claim contradicts the context\n4. Consider it a hallucination if a claim makes assertions not supported by context\n5. Empty outputs should be handled as having no hallucinations\n6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination\n7. Speculative language about facts NOT in the context IS a hallucination\n8. Never use prior knowledge in judgments - only use what's explicitly stated in context\n9. The following are NOT hallucinations:\n - Using less precise dates (e.g., year when context gives month)\n - Reasonable numerical approximations\n - Omitting additional details while maintaining factual accuracy\n10. Subjective claims (\"made history\", \"pioneering\", \"leading\") are hallucinations unless explicitly stated in context";
508
508
 
509
509
  export declare class HallucinationJudge extends MastraAgentJudge {
510
510
  constructor(model: LanguageModel);
@@ -355,9 +355,9 @@ export declare function generateEvaluatePrompt_alias_6({ claims, context }: {
355
355
  context: string[];
356
356
  }): string;
357
357
 
358
- export declare function generateEvaluatePrompt_alias_7({ context, output }: {
358
+ export declare function generateEvaluatePrompt_alias_7({ context, claims }: {
359
359
  context: string[];
360
- output: string;
360
+ claims: string[];
361
361
  }): string;
362
362
 
363
363
  export declare function generateEvaluatePrompt_alias_8({ instructions, input, output, }: {
@@ -504,7 +504,7 @@ declare function globalSetup(): Promise<void>;
504
504
  export { globalSetup }
505
505
  export { globalSetup as globalSetup_alias_1 }
506
506
 
507
- export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contradicts the provided context, focusing on identifying factual inconsistencies.\n\nKey Principles:\n1. Treat each context piece as a statement to verify\n2. Verify if the output contradicts any of these statements\n3. Consider a contradiction when the output directly conflicts with context statements\n4. Consider no contradiction when the output aligns with or doesn't mention context statements\n5. Empty outputs should be handled as having no contradictions\n6. Focus on factual inconsistencies, not omissions\n7. Never use prior knowledge in judgments\n8. Speculative language (may, might, possibly) should not be considered contradictions";
507
+ export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.\n\nKey Principles:\n1. First extract all claims from the output (both factual and speculative)\n2. Then verify each extracted claim against the provided context\n3. Consider it a hallucination if a claim contradicts the context\n4. Consider it a hallucination if a claim makes assertions not supported by context\n5. Empty outputs should be handled as having no hallucinations\n6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination\n7. Speculative language about facts NOT in the context IS a hallucination\n8. Never use prior knowledge in judgments - only use what's explicitly stated in context\n9. The following are NOT hallucinations:\n - Using less precise dates (e.g., year when context gives month)\n - Reasonable numerical approximations\n - Omitting additional details while maintaining factual accuracy\n10. Subjective claims (\"made history\", \"pioneering\", \"leading\") are hallucinations unless explicitly stated in context";
508
508
 
509
509
  export declare class HallucinationJudge extends MastraAgentJudge {
510
510
  constructor(model: LanguageModel);
@@ -955,98 +955,101 @@ var FaithfulnessMetric = class extends _eval.Metric {
955
955
  };
956
956
 
957
957
  // src/metrics/llm/hallucination/prompts.ts
958
- var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contradicts the provided context, focusing on identifying factual inconsistencies.
958
+ var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.
959
959
 
960
960
  Key Principles:
961
- 1. Treat each context piece as a statement to verify
962
- 2. Verify if the output contradicts any of these statements
963
- 3. Consider a contradiction when the output directly conflicts with context statements
964
- 4. Consider no contradiction when the output aligns with or doesn't mention context statements
965
- 5. Empty outputs should be handled as having no contradictions
966
- 6. Focus on factual inconsistencies, not omissions
967
- 7. Never use prior knowledge in judgments
968
- 8. Speculative language (may, might, possibly) should not be considered contradictions`;
969
- function generateEvaluatePrompt5({ context, output }) {
970
- return `Verify if the output contradicts any of the provided context statements. A contradiction occurs when the output directly conflicts with a statement.
971
-
972
- Output to verify:
973
- ${output}
961
+ 1. First extract all claims from the output (both factual and speculative)
962
+ 2. Then verify each extracted claim against the provided context
963
+ 3. Consider it a hallucination if a claim contradicts the context
964
+ 4. Consider it a hallucination if a claim makes assertions not supported by context
965
+ 5. Empty outputs should be handled as having no hallucinations
966
+ 6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination
967
+ 7. Speculative language about facts NOT in the context IS a hallucination
968
+ 8. Never use prior knowledge in judgments - only use what's explicitly stated in context
969
+ 9. The following are NOT hallucinations:
970
+ - Using less precise dates (e.g., year when context gives month)
971
+ - Reasonable numerical approximations
972
+ - Omitting additional details while maintaining factual accuracy
973
+ 10. Subjective claims ("made history", "pioneering", "leading") are hallucinations unless explicitly stated in context`;
974
+ function generateEvaluatePrompt5({ context, claims }) {
975
+ return `Verify if the claims contain any information not supported by or contradicting the provided context. A hallucination occurs when a claim either:
976
+ 1. Contradicts the context
977
+ 2. Makes assertions not supported by the context
978
+
979
+ Claims to verify:
980
+ ${claims.join("\n")}
974
981
 
975
982
  Number of context statements: ${context.length}
976
983
 
977
- Context statements to check:
984
+ Context statements:
978
985
  ${context.join("\n")}
979
986
 
980
- For each context statement, determine if the output contradicts it. When evaluating numbers:
981
- - Numbers with qualifiers ("about", "around", "approximately") allow reasonable approximations
982
- - Consider the scale of the number when determining reasonable approximations
983
- - Only mark as contradiction if the difference would be misleading in context
984
- - Respect explicit precision markers ("exactly", "precisely")
987
+ For each claim, determine if it is supported by the context. When evaluating:
988
+
989
+ 1. NOT Hallucinations:
990
+ - Using less precise dates (e.g., year when context gives month)
991
+ - Reasonable numerical approximations
992
+ - Omitting additional details while maintaining factual accuracy
993
+ - Speculative language about facts present in context
994
+
995
+ 2. ARE Hallucinations:
996
+ - Claims that contradict the context
997
+ - Assertions not supported by context
998
+ - Speculative claims about facts not in context
999
+ - Subjective claims not explicitly supported by context
985
1000
 
986
1001
  Example:
987
- Context: "Tesla was founded in 2003"
988
- Output: "Tesla, established in 2004, revolutionized the electric car industry."
1002
+ Context: [
1003
+ "SpaceX achieved first successful landing in December 2015.",
1004
+ "Their reusable rocket technology reduced launch costs by 30%."
1005
+ ]
1006
+ Claims: [
1007
+ "SpaceX made history in 2015",
1008
+ "SpaceX had pioneering reusable rockets",
1009
+ "reusable rockets significantly cut costs",
1010
+ "They might expand operations globally"
1011
+ ]
989
1012
  {
990
1013
  "verdicts": [
991
1014
  {
992
- "statement": "Tesla was founded in 2003",
1015
+ "statement": "SpaceX made history in 2015",
993
1016
  "verdict": "yes",
994
- "reason": "The output claims Tesla was established in 2004, which directly contradicts the statement that it was founded in 2003"
995
- }
996
- ]
997
- }
998
-
999
- Context: "The company has exactly 1,234 employees"
1000
- Output: "The company employs around 1,200 people"
1001
- {
1002
- "verdicts": [
1017
+ "reason": "The subjective claim 'made history' and the year are not supported by context"
1018
+ },
1003
1019
  {
1004
- "statement": "The company has exactly 1,234 employees",
1005
- "verdict": "no",
1006
- "reason": "While the output uses an approximation (around 1,200), this is a reasonable representation of 1,234 employees and maintains the correct order of magnitude"
1007
- }
1008
- ]
1009
- }
1010
-
1011
- Context: "Revenue reached $50.5 million in 2022"
1012
- Output: "The company made about $50 million in 2022"
1013
- {
1014
- "verdicts": [
1020
+ "statement": "SpaceX had pioneering reusable rockets",
1021
+ "verdict": "yes",
1022
+ "reason": "The subjective claim 'pioneering' is not supported by context"
1023
+ },
1015
1024
  {
1016
- "statement": "Revenue reached $50.5 million in 2022",
1025
+ "statement": "reusable rockets significantly cut costs",
1017
1026
  "verdict": "no",
1018
- "reason": "The output's approximation of 'about $50 million' is a reasonable representation of $50.5 million, maintaining accuracy at this scale"
1019
- }
1020
- ]
1021
- }
1022
-
1023
- Context: "The startup raised $2.1 million in seed funding"
1024
- Output: "The company secured approximately $5 million in their seed round"
1025
- {
1026
- "verdicts": [
1027
+ "reason": "Context supports that costs were reduced by 30%, this is a reasonable paraphrase"
1028
+ },
1027
1029
  {
1028
- "statement": "The startup raised $2.1 million in seed funding",
1030
+ "statement": "They might expand operations globally",
1029
1031
  "verdict": "yes",
1030
- "reason": "Despite using 'approximately', the output claims $5 million which is more than double the actual amount ($2.1 million), making it a significant and misleading deviation"
1032
+ "reason": "This speculative claim about facts not in context is a hallucination"
1031
1033
  }
1032
1034
  ]
1033
1035
  }
1034
1036
 
1035
1037
  Rules:
1036
- - Only mark as contradicted if there's a direct conflict
1037
- - Omissions are not contradictions
1038
+ - Mark as hallucination if information contradicts context
1039
+ - Mark as hallucination if assertions aren't supported by context
1040
+ - Allow reasonable approximations and less precise dates
1041
+ - Every factual claim must be verified
1038
1042
  - Never use prior knowledge in your judgment
1039
1043
  - Provide clear reasoning for each verdict
1040
- - Be specific about where in the output the contradiction occurs
1041
- - The number of verdicts MUST MATCH the number of context statements exactly
1044
+ - Be specific about what information is or isn't supported by context
1042
1045
 
1043
1046
  Format:
1044
1047
  {
1045
1048
  "verdicts": [
1046
1049
  {
1047
- "statement": "context statement",
1050
+ "statement": "individual claim",
1048
1051
  "verdict": "yes/no",
1049
- "reason": "explanation of contradiction or lack thereof"
1052
+ "reason": "explanation of whether the claim is supported by context"
1050
1053
  }
1051
1054
  ]
1052
1055
  }`;
@@ -1096,7 +1099,16 @@ var HallucinationJudge = class extends MastraAgentJudge {
1096
1099
  super("Hallucination", HALLUCINATION_AGENT_INSTRUCTIONS, model);
1097
1100
  }
1098
1101
  async evaluate(output, context) {
1099
- const evaluatePrompt = generateEvaluatePrompt5({ context, output });
1102
+ const claimsPrompt = generateClaimExtractionPrompt({ output });
1103
+ const claims = await this.agent.generate(claimsPrompt, {
1104
+ output: zod.z.object({
1105
+ claims: zod.z.array(zod.z.string())
1106
+ })
1107
+ });
1108
+ if (claims.object.claims.length === 0) {
1109
+ return [];
1110
+ }
1111
+ const evaluatePrompt = generateEvaluatePrompt5({ claims: claims.object.claims, context });
1100
1112
  const result = await this.agent.generate(evaluatePrompt, {
1101
1113
  output: zod.z.object({
1102
1114
  verdicts: zod.z.array(
@@ -1132,6 +1144,7 @@ var HallucinationMetric = class extends _eval.Metric {
1132
1144
  }
1133
1145
  async measure(input, output) {
1134
1146
  const verdicts = await this.judge.evaluate(output, this.context);
1147
+ console.log("verdicts", verdicts);
1135
1148
  const score = this.calculateScore(verdicts);
1136
1149
  const reason = await this.judge.getReason({
1137
1150
  input,
@@ -942,98 +942,101 @@ var FaithfulnessMetric = class extends Metric {
942
942
  };
943
943
 
944
944
  // src/metrics/llm/hallucination/prompts.ts
945
- var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contradicts the provided context, focusing on identifying factual inconsistencies.
945
+ var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.
946
946
 
947
947
  Key Principles:
948
- 1. Treat each context piece as a statement to verify
949
- 2. Verify if the output contradicts any of these statements
950
- 3. Consider a contradiction when the output directly conflicts with context statements
951
- 4. Consider no contradiction when the output aligns with or doesn't mention context statements
952
- 5. Empty outputs should be handled as having no contradictions
953
- 6. Focus on factual inconsistencies, not omissions
954
- 7. Never use prior knowledge in judgments
955
- 8. Speculative language (may, might, possibly) should not be considered contradictions`;
956
- function generateEvaluatePrompt5({ context, output }) {
957
- return `Verify if the output contradicts any of the provided context statements. A contradiction occurs when the output directly conflicts with a statement.
958
-
959
- Output to verify:
960
- ${output}
948
+ 1. First extract all claims from the output (both factual and speculative)
949
+ 2. Then verify each extracted claim against the provided context
950
+ 3. Consider it a hallucination if a claim contradicts the context
951
+ 4. Consider it a hallucination if a claim makes assertions not supported by context
952
+ 5. Empty outputs should be handled as having no hallucinations
953
+ 6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination
954
+ 7. Speculative language about facts NOT in the context IS a hallucination
955
+ 8. Never use prior knowledge in judgments - only use what's explicitly stated in context
956
+ 9. The following are NOT hallucinations:
957
+ - Using less precise dates (e.g., year when context gives month)
958
+ - Reasonable numerical approximations
959
+ - Omitting additional details while maintaining factual accuracy
960
+ 10. Subjective claims ("made history", "pioneering", "leading") are hallucinations unless explicitly stated in context`;
961
+ function generateEvaluatePrompt5({ context, claims }) {
962
+ return `Verify if the claims contain any information not supported by or contradicting the provided context. A hallucination occurs when a claim either:
963
+ 1. Contradicts the context
964
+ 2. Makes assertions not supported by the context
965
+
966
+ Claims to verify:
967
+ ${claims.join("\n")}
961
968
 
962
969
  Number of context statements: ${context.length}
963
970
 
964
- Context statements to check:
971
+ Context statements:
965
972
  ${context.join("\n")}
966
973
 
967
- For each context statement, determine if the output contradicts it. When evaluating numbers:
968
- - Numbers with qualifiers ("about", "around", "approximately") allow reasonable approximations
969
- - Consider the scale of the number when determining reasonable approximations
970
- - Only mark as contradiction if the difference would be misleading in context
971
- - Respect explicit precision markers ("exactly", "precisely")
974
+ For each claim, determine if it is supported by the context. When evaluating:
975
+
976
+ 1. NOT Hallucinations:
977
+ - Using less precise dates (e.g., year when context gives month)
978
+ - Reasonable numerical approximations
979
+ - Omitting additional details while maintaining factual accuracy
980
+ - Speculative language about facts present in context
981
+
982
+ 2. ARE Hallucinations:
983
+ - Claims that contradict the context
984
+ - Assertions not supported by context
985
+ - Speculative claims about facts not in context
986
+ - Subjective claims not explicitly supported by context
972
987
 
973
988
  Example:
974
- Context: "Tesla was founded in 2003"
975
- Output: "Tesla, established in 2004, revolutionized the electric car industry."
989
+ Context: [
990
+ "SpaceX achieved first successful landing in December 2015.",
991
+ "Their reusable rocket technology reduced launch costs by 30%."
992
+ ]
993
+ Claims: [
994
+ "SpaceX made history in 2015",
995
+ "SpaceX had pioneering reusable rockets",
996
+ "reusable rockets significantly cut costs",
997
+ "They might expand operations globally"
998
+ ]
976
999
  {
977
1000
  "verdicts": [
978
1001
  {
979
- "statement": "Tesla was founded in 2003",
1002
+ "statement": "SpaceX made history in 2015",
980
1003
  "verdict": "yes",
981
- "reason": "The output claims Tesla was established in 2004, which directly contradicts the statement that it was founded in 2003"
982
- }
983
- ]
984
- }
985
-
986
- Context: "The company has exactly 1,234 employees"
987
- Output: "The company employs around 1,200 people"
988
- {
989
- "verdicts": [
1004
+ "reason": "The subjective claim 'made history' and the year are not supported by context"
1005
+ },
990
1006
  {
991
- "statement": "The company has exactly 1,234 employees",
992
- "verdict": "no",
993
- "reason": "While the output uses an approximation (around 1,200), this is a reasonable representation of 1,234 employees and maintains the correct order of magnitude"
994
- }
995
- ]
996
- }
997
-
998
- Context: "Revenue reached $50.5 million in 2022"
999
- Output: "The company made about $50 million in 2022"
1000
- {
1001
- "verdicts": [
1007
+ "statement": "SpaceX had pioneering reusable rockets",
1008
+ "verdict": "yes",
1009
+ "reason": "The subjective claim 'pioneering' is not supported by context"
1010
+ },
1002
1011
  {
1003
- "statement": "Revenue reached $50.5 million in 2022",
1012
+ "statement": "reusable rockets significantly cut costs",
1004
1013
  "verdict": "no",
1005
- "reason": "The output's approximation of 'about $50 million' is a reasonable representation of $50.5 million, maintaining accuracy at this scale"
1006
- }
1007
- ]
1008
- }
1009
-
1010
- Context: "The startup raised $2.1 million in seed funding"
1011
- Output: "The company secured approximately $5 million in their seed round"
1012
- {
1013
- "verdicts": [
1014
+ "reason": "Context supports that costs were reduced by 30%, this is a reasonable paraphrase"
1015
+ },
1014
1016
  {
1015
- "statement": "The startup raised $2.1 million in seed funding",
1017
+ "statement": "They might expand operations globally",
1016
1018
  "verdict": "yes",
1017
- "reason": "Despite using 'approximately', the output claims $5 million which is more than double the actual amount ($2.1 million), making it a significant and misleading deviation"
1019
+ "reason": "This speculative claim about facts not in context is a hallucination"
1018
1020
  }
1019
1021
  ]
1020
1022
  }
1021
1023
 
1022
1024
  Rules:
1023
- - Only mark as contradicted if there's a direct conflict
1024
- - Omissions are not contradictions
1025
+ - Mark as hallucination if information contradicts context
1026
+ - Mark as hallucination if assertions aren't supported by context
1027
+ - Allow reasonable approximations and less precise dates
1028
+ - Every factual claim must be verified
1025
1029
  - Never use prior knowledge in your judgment
1026
1030
  - Provide clear reasoning for each verdict
1027
- - Be specific about where in the output the contradiction occurs
1028
- - The number of verdicts MUST MATCH the number of context statements exactly
1031
+ - Be specific about what information is or isn't supported by context
1029
1032
 
1030
1033
  Format:
1031
1034
  {
1032
1035
  "verdicts": [
1033
1036
  {
1034
- "statement": "context statement",
1037
+ "statement": "individual claim",
1035
1038
  "verdict": "yes/no",
1036
- "reason": "explanation of contradiction or lack thereof"
1039
+ "reason": "explanation of whether the claim is supported by context"
1037
1040
  }
1038
1041
  ]
1039
1042
  }`;
@@ -1083,7 +1086,16 @@ var HallucinationJudge = class extends MastraAgentJudge {
1083
1086
  super("Hallucination", HALLUCINATION_AGENT_INSTRUCTIONS, model);
1084
1087
  }
1085
1088
  async evaluate(output, context) {
1086
- const evaluatePrompt = generateEvaluatePrompt5({ context, output });
1089
+ const claimsPrompt = generateClaimExtractionPrompt({ output });
1090
+ const claims = await this.agent.generate(claimsPrompt, {
1091
+ output: z.object({
1092
+ claims: z.array(z.string())
1093
+ })
1094
+ });
1095
+ if (claims.object.claims.length === 0) {
1096
+ return [];
1097
+ }
1098
+ const evaluatePrompt = generateEvaluatePrompt5({ claims: claims.object.claims, context });
1087
1099
  const result = await this.agent.generate(evaluatePrompt, {
1088
1100
  output: z.object({
1089
1101
  verdicts: z.array(
@@ -1119,6 +1131,7 @@ var HallucinationMetric = class extends Metric {
1119
1131
  }
1120
1132
  async measure(input, output) {
1121
1133
  const verdicts = await this.judge.evaluate(output, this.context);
1134
+ console.log("verdicts", verdicts);
1122
1135
  const score = this.calculateScore(verdicts);
1123
1136
  const reason = await this.judge.getReason({
1124
1137
  input,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mastra/evals",
3
- "version": "0.1.8-alpha.0",
3
+ "version": "0.1.8-alpha.10",
4
4
  "description": "",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -59,7 +59,7 @@
59
59
  "sentiment": "^5.0.2",
60
60
  "string-similarity": "^4.0.4",
61
61
  "zod": "^3.24.1",
62
- "@mastra/core": "^0.5.0-alpha.0"
62
+ "@mastra/core": "^0.5.0-alpha.10"
63
63
  },
64
64
  "peerDependencies": {
65
65
  "ai": "^4.0.0"
@@ -1,10 +1,12 @@
1
1
  import { openai } from '@ai-sdk/openai';
2
- import { describe, it, expect } from 'vitest';
2
+ import { describe, it, expect, vi } from 'vitest';
3
3
 
4
4
  import type { TestCaseWithContext } from '../utils';
5
5
 
6
6
  import { HallucinationMetric } from './index';
7
7
 
8
+ vi.setConfig({ testTimeout: 30000, hookTimeout: 30000 });
9
+
8
10
  const testCases: TestCaseWithContext[] = [
9
11
  {
10
12
  // No hallucination - output aligns with context
@@ -17,7 +19,7 @@ const testCases: TestCaseWithContext[] = [
17
19
  },
18
20
  },
19
21
  {
20
- // Complete hallucination - output contradicts all context
22
+ // Complete hallucination - output contradicts context
21
23
  input: 'Tell me about Tesla.',
22
24
  context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.', 'The first Tesla car was the Roadster.'],
23
25
  output: 'Tesla was established in 2001 by Elon Musk himself. Their first car was the Model S.',
@@ -38,33 +40,33 @@ const testCases: TestCaseWithContext[] = [
38
40
  },
39
41
  },
40
42
  {
41
- // Empty output - should have no contradictions
43
+ // Empty output
42
44
  input: 'Tell me about Tesla.',
43
45
  context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.'],
44
46
  output: '',
45
47
  expectedResult: {
46
48
  score: 0.0,
47
- reason: 'Empty output cannot contradict any context statements.',
49
+ reason: 'Empty output cannot contain hallucinations.',
48
50
  },
49
51
  },
50
52
  {
51
- // Speculative language - should not count as contradictions
53
+ // Speculative language with known facts
52
54
  input: 'Tell me about Tesla.',
53
55
  context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.'],
54
56
  output: 'Tesla might have been founded around 2003, and I believe Elon Musk possibly joined a year later.',
55
57
  expectedResult: {
56
58
  score: 0.0,
57
- reason: 'Speculative language (might, possibly, believe) does not constitute contradictions with context.',
59
+ reason: 'Speculative language about facts that match context is not considered hallucination.',
58
60
  },
59
61
  },
60
62
  {
61
- // Empty context - should return score of 0
63
+ // Empty context
62
64
  input: 'Tell me about Tesla.',
63
65
  context: [],
64
66
  output: 'Tesla was founded in 2001 by Elon Musk.',
65
67
  expectedResult: {
66
- score: 0.0,
67
- reason: 'No context statements to contradict, resulting in zero hallucination score.',
68
+ score: 1.0,
69
+ reason: 'With no context provided, any factual claims are considered hallucinations.',
68
70
  },
69
71
  },
70
72
  {
@@ -73,8 +75,9 @@ const testCases: TestCaseWithContext[] = [
73
75
  context: ['SpaceX achieved first successful landing in 2015.', 'Their first crewed mission was in 2020.'],
74
76
  output: 'Before anyone else, SpaceX pioneered reusable rockets with their first landing in 2014.',
75
77
  expectedResult: {
76
- score: 0.5,
77
- reason: 'One context statement is contradicted through implicit claim about timing (2014 vs 2015).',
78
+ score: 1.0,
79
+ reason:
80
+ 'Both the timing claim (2014 vs 2015) and the unsupported "Before anyone else" pioneering claim are hallucinations.',
78
81
  },
79
82
  },
80
83
  {
@@ -91,16 +94,15 @@ const testCases: TestCaseWithContext[] = [
91
94
  // Out of scope additions
92
95
  input: 'Tell me about the company.',
93
96
  context: ['The company was founded in New York.', 'They specialize in software.'],
94
- output:
95
- 'The company, founded in New York, specializes in software and has offices worldwide with plans to expand into AI.',
97
+ output: 'The company, founded in New York, specializes in software and has offices worldwide.',
96
98
  expectedResult: {
97
- score: 0.0,
99
+ score: 0.33,
98
100
  reason:
99
- 'Additional information beyond context scope is not counted as contradictions unless it directly conflicts with context.',
101
+ 'One out of three claims (worldwide offices) is a hallucination, while founding location and specialization are supported.',
100
102
  },
101
103
  },
102
104
  {
103
- // Temporal contradictions
105
+ // Temporal sequence
104
106
  input: 'Describe the project timeline.',
105
107
  context: [
106
108
  'Project started in January 2023.',
@@ -111,104 +113,205 @@ const testCases: TestCaseWithContext[] = [
111
113
  expectedResult: {
112
114
  score: 0.67,
113
115
  reason:
114
- 'Two context statements are contradicted through temporal inconsistency in phase completion order and dates.',
116
+ 'Two out of three claims are hallucinations: Phase 2 completion date and the claim about Phase 2 finishing before Phase 1.',
115
117
  },
116
118
  },
117
119
  {
118
- // Numerical contradiction despite approximation
120
+ // Numerical contradiction
119
121
  input: 'Tell me about the company size.',
120
122
  context: ['The company employs 300 people globally.'],
121
123
  output: 'The company has approximately 1000 employees worldwide.',
122
124
  expectedResult: {
123
125
  score: 1.0,
124
126
  reason:
125
- 'Despite using "approximately", the claimed number (1000) represents a deviation too large from the actual value (300) to be considered a reasonable approximation.',
127
+ 'Despite using "approximately", the claimed number (1000) represents a deviation too large from the actual value (300).',
128
+ },
129
+ },
130
+ {
131
+ // Additional information
132
+ input: 'Tell me about Tesla.',
133
+ context: ['Tesla was founded in 2003.'],
134
+ output: 'Tesla, founded in 2003, is a leading electric car manufacturer with global operations.',
135
+ expectedResult: {
136
+ score: 0.67,
137
+ reason:
138
+ 'Two out of three claims (being a leading manufacturer and having global operations) are not supported by context.',
139
+ },
140
+ },
141
+ {
142
+ // Speculative claims about unknown facts
143
+ input: 'Tell me about Tesla.',
144
+ context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.'],
145
+ output: 'Tesla was founded in 2003, and might be the most innovative car company in history.',
146
+ expectedResult: {
147
+ score: 0.5,
148
+ reason: 'While founding date is supported, speculative claim about being most innovative is not in context.',
149
+ },
150
+ },
151
+ {
152
+ // Date precision
153
+ input: 'Tell me about SpaceX achievements.',
154
+ context: ['SpaceX achieved first successful landing in December 2015.'],
155
+ output: 'SpaceX made history with their first successful landing in 2015.',
156
+ expectedResult: {
157
+ score: 1.0,
158
+ reason:
159
+ 'The statement contains an unsupported subjective claim ("made history") that modifies the factual landing claim.',
160
+ },
161
+ },
162
+ {
163
+ // Numerical precision
164
+ input: 'Tell me about the company size.',
165
+ context: ['The company employs exactly 300 people globally.'],
166
+ output: 'The company has approximately 300 employees worldwide.',
167
+ expectedResult: {
168
+ score: 0.0,
169
+ reason: 'Using "approximately" when context specifies "exactly" is still considered a reasonable approximation.',
170
+ },
171
+ },
172
+ {
173
+ // Mixed precision levels
174
+ input: 'Tell me about revenue growth.',
175
+ context: ['Company revenue grew from exactly $10.5M in Q1 to approximately $20M in Q2.'],
176
+ output: 'Revenue was about $10M in Q1 and exactly $20M in Q2.',
177
+ expectedResult: {
178
+ score: 1.0,
179
+ reason:
180
+ 'Mismatched precision levels: uses "about" when context specifies "exactly" for Q1, and uses "exactly" when context specifies "approximately" for Q2.',
181
+ },
182
+ },
183
+ {
184
+ // Relative comparisons
185
+ input: 'Tell me about the market share.',
186
+ context: ['Company A has 30% market share.', 'Company B has 25% market share.'],
187
+ output: 'Company A leads the market with 30% share, ahead of Company B.',
188
+ expectedResult: {
189
+ score: 0.5,
190
+ reason:
191
+ 'While the market share numbers are correct, the claim about "leading the market" is not supported as we don\'t know about other companies.',
126
192
  },
127
193
  },
128
194
  ];
129
195
 
130
196
  const model = openai('gpt-4o');
131
- describe(
132
- 'HallucinationMetric',
133
- () => {
134
- it('should handle perfect alignment', async () => {
135
- const testCase = testCases[0]!;
136
- const metric = new HallucinationMetric(model, { context: testCase.context });
137
- const result = await metric.measure(testCase.input, testCase.output);
138
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
139
- });
140
-
141
- it('should handle complete hallucination', async () => {
142
- const testCase = testCases[1]!;
143
- const metric = new HallucinationMetric(model, { context: testCase.context });
144
- const result = await metric.measure(testCase.input, testCase.output);
145
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
146
- });
147
-
148
- it('should handle partial hallucination', async () => {
149
- const testCase = testCases[2]!;
150
- const metric = new HallucinationMetric(model, { context: testCase.context });
151
- const result = await metric.measure(testCase.input, testCase.output);
152
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
153
- });
154
-
155
- it('should handle empty output', async () => {
156
- const testCase = testCases[3]!;
157
- const metric = new HallucinationMetric(model, { context: testCase.context });
158
- const result = await metric.measure(testCase.input, testCase.output);
159
- expect(result.score).toBe(testCase.expectedResult.score);
160
- });
161
-
162
- it('should handle speculative language', async () => {
163
- const testCase = testCases[4]!;
164
- const metric = new HallucinationMetric(model, { context: testCase.context });
165
- const result = await metric.measure(testCase.input, testCase.output);
166
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
167
- });
168
-
169
- it('should handle empty context', async () => {
170
- const testCase = testCases[5]!;
171
- const metric = new HallucinationMetric(model, { context: testCase.context });
172
- const result = await metric.measure(testCase.input, testCase.output);
173
- expect(result.score).toBe(testCase.expectedResult.score);
174
- });
175
-
176
- it('should handle implicit contradictions', async () => {
177
- const testCase = testCases[6]!;
178
- const metric = new HallucinationMetric(model, { context: testCase.context });
179
- const result = await metric.measure(testCase.input, testCase.output);
180
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
181
- });
182
-
183
- it('should handle numerical approximations', async () => {
184
- const testCase = testCases[7]!;
185
- const metric = new HallucinationMetric(model, { context: testCase.context });
186
- const result = await metric.measure(testCase.input, testCase.output);
187
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
188
- });
189
-
190
- it('should handle out of scope additions', async () => {
191
- const testCase = testCases[8]!;
192
- const metric = new HallucinationMetric(model, { context: testCase.context });
193
- const result = await metric.measure(testCase.input, testCase.output);
194
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
195
- });
196
-
197
- it('should handle temporal contradictions', async () => {
198
- const testCase = testCases[9]!;
199
- const metric = new HallucinationMetric(model, { context: testCase.context });
200
- const result = await metric.measure(testCase.input, testCase.output);
201
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
202
- });
203
-
204
- it('should handle numerical contradiction despite approximation', async () => {
205
- const testCase = testCases[10]!;
206
- const metric = new HallucinationMetric(model, { context: testCase.context });
207
- const result = await metric.measure(testCase.input, testCase.output);
208
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
209
- });
210
- },
211
- {
212
- timeout: 15 * 10000,
213
- },
214
- );
197
+ describe('HallucinationMetric', () => {
198
+ it('should handle perfect alignment', async () => {
199
+ const testCase = testCases[0]!;
200
+ const metric = new HallucinationMetric(model, { context: testCase.context });
201
+ const result = await metric.measure(testCase.input, testCase.output);
202
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
203
+ });
204
+
205
+ it('should handle complete hallucination', async () => {
206
+ const testCase = testCases[1]!;
207
+ const metric = new HallucinationMetric(model, { context: testCase.context });
208
+ const result = await metric.measure(testCase.input, testCase.output);
209
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
210
+ });
211
+
212
+ it('should handle partial hallucination', async () => {
213
+ const testCase = testCases[2]!;
214
+ const metric = new HallucinationMetric(model, { context: testCase.context });
215
+ const result = await metric.measure(testCase.input, testCase.output);
216
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
217
+ });
218
+
219
+ it('should handle empty output', async () => {
220
+ const testCase = testCases[3]!;
221
+ const metric = new HallucinationMetric(model, { context: testCase.context });
222
+ const result = await metric.measure(testCase.input, testCase.output);
223
+ expect(result.score).toBe(testCase.expectedResult.score);
224
+ });
225
+
226
+ it('should handle speculative language', async () => {
227
+ const testCase = testCases[4]!;
228
+ const metric = new HallucinationMetric(model, { context: testCase.context });
229
+ const result = await metric.measure(testCase.input, testCase.output);
230
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
231
+ });
232
+
233
+ it('should handle empty context', async () => {
234
+ const testCase = testCases[5]!;
235
+ const metric = new HallucinationMetric(model, { context: testCase.context });
236
+ const result = await metric.measure(testCase.input, testCase.output);
237
+ expect(result.score).toBe(testCase.expectedResult.score);
238
+ });
239
+
240
+ it('should handle implicit contradictions', async () => {
241
+ const testCase = testCases[6]!;
242
+ const metric = new HallucinationMetric(model, { context: testCase.context });
243
+ const result = await metric.measure(testCase.input, testCase.output);
244
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
245
+ });
246
+
247
+ it('should handle numerical approximations', async () => {
248
+ const testCase = testCases[7]!;
249
+ const metric = new HallucinationMetric(model, { context: testCase.context });
250
+ const result = await metric.measure(testCase.input, testCase.output);
251
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
252
+ });
253
+
254
+ it('should handle out of scope additions', async () => {
255
+ const testCase = testCases[8]!;
256
+ const metric = new HallucinationMetric(model, { context: testCase.context });
257
+ const result = await metric.measure(testCase.input, testCase.output);
258
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
259
+ });
260
+
261
+ it('should handle temporal contradictions', async () => {
262
+ const testCase = testCases[9]!;
263
+ const metric = new HallucinationMetric(model, { context: testCase.context });
264
+ const result = await metric.measure(testCase.input, testCase.output);
265
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
266
+ });
267
+
268
+ it('should handle numerical contradiction despite approximation', async () => {
269
+ const testCase = testCases[10]!;
270
+ const metric = new HallucinationMetric(model, { context: testCase.context });
271
+ const result = await metric.measure(testCase.input, testCase.output);
272
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
273
+ });
274
+
275
+ // New tests for stricter hallucination checking
276
+ it('should detect additional information as hallucination', async () => {
277
+ const testCase = testCases[11]!;
278
+ const metric = new HallucinationMetric(model, { context: testCase.context });
279
+ const result = await metric.measure(testCase.input, testCase.output);
280
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
281
+ });
282
+
283
+ it('should detect speculative claims about unknown facts as hallucination', async () => {
284
+ const testCase = testCases[12]!;
285
+ const metric = new HallucinationMetric(model, { context: testCase.context });
286
+ const result = await metric.measure(testCase.input, testCase.output);
287
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
288
+ });
289
+
290
+ it('should enforce strict date matching', async () => {
291
+ const testCase = testCases[13]!;
292
+ const metric = new HallucinationMetric(model, { context: testCase.context });
293
+ const result = await metric.measure(testCase.input, testCase.output);
294
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
295
+ });
296
+
297
+ it('should enforce strict numerical matching', async () => {
298
+ const testCase = testCases[14]!;
299
+ const metric = new HallucinationMetric(model, { context: testCase.context });
300
+ const result = await metric.measure(testCase.input, testCase.output);
301
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
302
+ });
303
+
304
+ it('should handle mixed precision levels', async () => {
305
+ const testCase = testCases[15]!;
306
+ const metric = new HallucinationMetric(model, { context: testCase.context });
307
+ const result = await metric.measure(testCase.input, testCase.output);
308
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
309
+ });
310
+
311
+ it('should handle relative comparisons', async () => {
312
+ const testCase = testCases[16]!;
313
+ const metric = new HallucinationMetric(model, { context: testCase.context });
314
+ const result = await metric.measure(testCase.input, testCase.output);
315
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
316
+ });
317
+ });
@@ -26,6 +26,7 @@ export class HallucinationMetric extends Metric {
26
26
 
27
27
  async measure(input: string, output: string): Promise<MetricResultWithReason> {
28
28
  const verdicts = await this.judge.evaluate(output, this.context);
29
+ console.log('verdicts', verdicts);
29
30
  const score = this.calculateScore(verdicts);
30
31
  const reason = await this.judge.getReason({
31
32
  input,
@@ -2,7 +2,7 @@ import type { LanguageModel } from '@mastra/core/llm';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
5
-
5
+ import { generateClaimExtractionPrompt } from '../faithfulness/prompts';
6
6
  import { generateEvaluatePrompt, HALLUCINATION_AGENT_INSTRUCTIONS, generateReasonPrompt } from './prompts';
7
7
 
8
8
  export class HallucinationJudge extends MastraAgentJudge {
@@ -11,7 +11,18 @@ export class HallucinationJudge extends MastraAgentJudge {
11
11
  }
12
12
 
13
13
  async evaluate(output: string, context: string[]): Promise<{ statement: string; verdict: string; reason: string }[]> {
14
- const evaluatePrompt = generateEvaluatePrompt({ context, output });
14
+ const claimsPrompt = generateClaimExtractionPrompt({ output });
15
+ const claims = await this.agent.generate(claimsPrompt, {
16
+ output: z.object({
17
+ claims: z.array(z.string()),
18
+ }),
19
+ });
20
+
21
+ if (claims.object.claims.length === 0) {
22
+ return [];
23
+ }
24
+
25
+ const evaluatePrompt = generateEvaluatePrompt({ claims: claims.object.claims, context });
15
26
  const result = await this.agent.generate(evaluatePrompt, {
16
27
  output: z.object({
17
28
  verdicts: z.array(
@@ -1,96 +1,99 @@
1
- export const HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contradicts the provided context, focusing on identifying factual inconsistencies.
1
+ export const HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.
2
2
 
3
3
  Key Principles:
4
- 1. Treat each context piece as a statement to verify
5
- 2. Verify if the output contradicts any of these statements
6
- 3. Consider a contradiction when the output directly conflicts with context statements
7
- 4. Consider no contradiction when the output aligns with or doesn't mention context statements
8
- 5. Empty outputs should be handled as having no contradictions
9
- 6. Focus on factual inconsistencies, not omissions
10
- 7. Never use prior knowledge in judgments
11
- 8. Speculative language (may, might, possibly) should not be considered contradictions`;
4
+ 1. First extract all claims from the output (both factual and speculative)
5
+ 2. Then verify each extracted claim against the provided context
6
+ 3. Consider it a hallucination if a claim contradicts the context
7
+ 4. Consider it a hallucination if a claim makes assertions not supported by context
8
+ 5. Empty outputs should be handled as having no hallucinations
9
+ 6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination
10
+ 7. Speculative language about facts NOT in the context IS a hallucination
11
+ 8. Never use prior knowledge in judgments - only use what's explicitly stated in context
12
+ 9. The following are NOT hallucinations:
13
+ - Using less precise dates (e.g., year when context gives month)
14
+ - Reasonable numerical approximations
15
+ - Omitting additional details while maintaining factual accuracy
16
+ 10. Subjective claims ("made history", "pioneering", "leading") are hallucinations unless explicitly stated in context`;
12
17
 
13
- export function generateEvaluatePrompt({ context, output }: { context: string[]; output: string }) {
14
- return `Verify if the output contradicts any of the provided context statements. A contradiction occurs when the output directly conflicts with a statement.
18
+ export function generateEvaluatePrompt({ context, claims }: { context: string[]; claims: string[] }) {
19
+ return `Verify if the claims contain any information not supported by or contradicting the provided context. A hallucination occurs when a claim either:
20
+ 1. Contradicts the context
21
+ 2. Makes assertions not supported by the context
15
22
 
16
- Output to verify:
17
- ${output}
23
+ Claims to verify:
24
+ ${claims.join('\n')}
18
25
 
19
26
  Number of context statements: ${context.length}
20
27
 
21
- Context statements to check:
28
+ Context statements:
22
29
  ${context.join('\n')}
23
30
 
24
- For each context statement, determine if the output contradicts it. When evaluating numbers:
25
- - Numbers with qualifiers ("about", "around", "approximately") allow reasonable approximations
26
- - Consider the scale of the number when determining reasonable approximations
27
- - Only mark as contradiction if the difference would be misleading in context
28
- - Respect explicit precision markers ("exactly", "precisely")
31
+ For each claim, determine if it is supported by the context. When evaluating:
32
+
33
+ 1. NOT Hallucinations:
34
+ - Using less precise dates (e.g., year when context gives month)
35
+ - Reasonable numerical approximations
36
+ - Omitting additional details while maintaining factual accuracy
37
+ - Speculative language about facts present in context
38
+
39
+ 2. ARE Hallucinations:
40
+ - Claims that contradict the context
41
+ - Assertions not supported by context
42
+ - Speculative claims about facts not in context
43
+ - Subjective claims not explicitly supported by context
29
44
 
30
45
  Example:
31
- Context: "Tesla was founded in 2003"
32
- Output: "Tesla, established in 2004, revolutionized the electric car industry."
46
+ Context: [
47
+ "SpaceX achieved first successful landing in December 2015.",
48
+ "Their reusable rocket technology reduced launch costs by 30%."
49
+ ]
50
+ Claims: [
51
+ "SpaceX made history in 2015",
52
+ "SpaceX had pioneering reusable rockets",
53
+ "reusable rockets significantly cut costs",
54
+ "They might expand operations globally"
55
+ ]
33
56
  {
34
57
  "verdicts": [
35
58
  {
36
- "statement": "Tesla was founded in 2003",
59
+ "statement": "SpaceX made history in 2015",
37
60
  "verdict": "yes",
38
- "reason": "The output claims Tesla was established in 2004, which directly contradicts the statement that it was founded in 2003"
39
- }
40
- ]
41
- }
42
-
43
- Context: "The company has exactly 1,234 employees"
44
- Output: "The company employs around 1,200 people"
45
- {
46
- "verdicts": [
61
+ "reason": "The subjective claim 'made history' and the year are not supported by context"
62
+ },
47
63
  {
48
- "statement": "The company has exactly 1,234 employees",
49
- "verdict": "no",
50
- "reason": "While the output uses an approximation (around 1,200), this is a reasonable representation of 1,234 employees and maintains the correct order of magnitude"
51
- }
52
- ]
53
- }
54
-
55
- Context: "Revenue reached $50.5 million in 2022"
56
- Output: "The company made about $50 million in 2022"
57
- {
58
- "verdicts": [
64
+ "statement": "SpaceX had pioneering reusable rockets",
65
+ "verdict": "yes",
66
+ "reason": "The subjective claim 'pioneering' is not supported by context"
67
+ },
59
68
  {
60
- "statement": "Revenue reached $50.5 million in 2022",
69
+ "statement": "reusable rockets significantly cut costs",
61
70
  "verdict": "no",
62
- "reason": "The output's approximation of 'about $50 million' is a reasonable representation of $50.5 million, maintaining accuracy at this scale"
63
- }
64
- ]
65
- }
66
-
67
- Context: "The startup raised $2.1 million in seed funding"
68
- Output: "The company secured approximately $5 million in their seed round"
69
- {
70
- "verdicts": [
71
+ "reason": "Context supports that costs were reduced by 30%, this is a reasonable paraphrase"
72
+ },
71
73
  {
72
- "statement": "The startup raised $2.1 million in seed funding",
74
+ "statement": "They might expand operations globally",
73
75
  "verdict": "yes",
74
- "reason": "Despite using 'approximately', the output claims $5 million which is more than double the actual amount ($2.1 million), making it a significant and misleading deviation"
76
+ "reason": "This speculative claim about facts not in context is a hallucination"
75
77
  }
76
78
  ]
77
79
  }
78
80
 
79
81
  Rules:
80
- - Only mark as contradicted if there's a direct conflict
81
- - Omissions are not contradictions
82
+ - Mark as hallucination if information contradicts context
83
+ - Mark as hallucination if assertions aren't supported by context
84
+ - Allow reasonable approximations and less precise dates
85
+ - Every factual claim must be verified
82
86
  - Never use prior knowledge in your judgment
83
87
  - Provide clear reasoning for each verdict
84
- - Be specific about where in the output the contradiction occurs
85
- - The number of verdicts MUST MATCH the number of context statements exactly
88
+ - Be specific about what information is or isn't supported by context
86
89
 
87
90
  Format:
88
91
  {
89
92
  "verdicts": [
90
93
  {
91
- "statement": "context statement",
94
+ "statement": "individual claim",
92
95
  "verdict": "yes/no",
93
- "reason": "explanation of contradiction or lack thereof"
96
+ "reason": "explanation of whether the claim is supported by context"
94
97
  }
95
98
  ]
96
99
  }`;