npm - @mastra/evals - Version diff - 0.1.8-alpha.0 → 0.1.8-alpha.10 - Mend

@mastra/evals 0.1.8-alpha.0 → 0.1.8-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/.turbo/turbo-build.log +10 -10
package/CHANGELOG.md +88 -0
package/dist/_tsup-dts-rollup.d.cts +3 -3
package/dist/_tsup-dts-rollup.d.ts +3 -3
package/dist/metrics/llm/index.cjs +75 -62
package/dist/metrics/llm/index.js +75 -62
package/package.json +2 -2
package/src/metrics/llm/hallucination/index.test.ts +206 -103
package/src/metrics/llm/hallucination/index.ts +1 -0
package/src/metrics/llm/hallucination/metricJudge.ts +13 -2
package/src/metrics/llm/hallucination/prompts.ts +63 -60

package/.turbo/turbo-build.log CHANGED Viewed

@@ -1,37 +1,37 @@
-> @mastra/evals@0.1.8-alpha.0 build /home/runner/work/mastra/mastra/packages/evals
+> @mastra/evals@0.1.8-alpha.10 build /home/runner/work/mastra/mastra/packages/evals
 > pnpm check && tsup src/index.ts src/metrics/judge/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm,cjs --experimental-dts --clean --treeshake
-> @mastra/evals@0.1.8-alpha.0 check /home/runner/work/mastra/mastra/packages/evals
+> @mastra/evals@0.1.8-alpha.10 check /home/runner/work/mastra/mastra/packages/evals
 > tsc --noEmit
-[34mCLI[39m Building entry: src/index.ts, src/metrics/judge/index.ts, src/metrics/llm/index.ts, src/metrics/nlp/index.ts
+[34mCLI[39m Building entry: src/index.ts, src/metrics/judge/index.ts, src/metrics/nlp/index.ts, src/metrics/llm/index.ts
 [34mCLI[39m Using tsconfig: tsconfig.json
 [34mCLI[39m tsup v8.3.6
 [34mTSC[39m Build start
-[32mTSC[39m ⚡️ Build success in 9785ms
+[32mTSC[39m ⚡️ Build success in 16273ms
 [34mDTS[39m Build start
 [34mCLI[39m Target: es2022
 Analysis will use the bundled TypeScript version 5.7.3
 [36mWriting package typings: /home/runner/work/mastra/mastra/packages/evals/dist/_tsup-dts-rollup.d.ts[39m
 Analysis will use the bundled TypeScript version 5.7.3
 [36mWriting package typings: /home/runner/work/mastra/mastra/packages/evals/dist/_tsup-dts-rollup.d.cts[39m
-[32mDTS[39m ⚡️ Build success in 9198ms
+[32mDTS[39m ⚡️ Build success in 10916ms
 [34mCLI[39m Cleaning output folder
 [34mESM[39m Build start
 [34mCJS[39m Build start
 [32mCJS[39m [1mdist/metrics/judge/index.cjs [22m[32m341.00 B[39m
-[32mCJS[39m [1mdist/metrics/llm/index.cjs   [22m[32m86.28 KB[39m
 [32mCJS[39m [1mdist/metrics/nlp/index.cjs   [22m[32m6.94 KB[39m
+[32mCJS[39m [1mdist/metrics/llm/index.cjs   [22m[32m86.80 KB[39m
 [32mCJS[39m [1mdist/index.cjs               [22m[32m655.25 KB[39m
-[32mCJS[39m ⚡️ Build success in 7816ms
+[32mCJS[39m ⚡️ Build success in 10747ms
 [32mESM[39m [1mdist/index.js                    [22m[32m2.63 KB[39m
 [32mESM[39m [1mdist/metrics/judge/index.js      [22m[32m94.00 B[39m
-[32mESM[39m [1mdist/chunk-TXXJUIES.js           [22m[32m305.00 B[39m
 [32mESM[39m [1mdist/metrics/nlp/index.js        [22m[32m6.30 KB[39m
+[32mESM[39m [1mdist/chunk-TXXJUIES.js           [22m[32m305.00 B[39m
 [32mESM[39m [1mdist/chunk-4VNS5WPM.js           [22m[32m1.82 KB[39m
-[32mESM[39m [1mdist/metrics/llm/index.js        [22m[32m85.32 KB[39m
+[32mESM[39m [1mdist/metrics/llm/index.js        [22m[32m85.82 KB[39m
 [32mESM[39m [1mdist/magic-string.es-5UDOWOAZ.js [22m[32m40.80 KB[39m
 [32mESM[39m [1mdist/dist-EOJDANYG.js            [22m[32m571.17 KB[39m
-[32mESM[39m ⚡️ Build success in 7823ms
+[32mESM[39m ⚡️ Build success in 10756ms

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,93 @@
 # @mastra/evals
+## 0.1.8-alpha.10
+### Patch Changes
+- 9d31a36: Update hallucination eval
+- Updated dependencies [a910463]
+  - @mastra/core@0.5.0-alpha.10
+## 0.1.8-alpha.9
+### Patch Changes
+- Updated dependencies [e9fbac5]
+- Updated dependencies [1e8bcbc]
+- Updated dependencies [aeb5e36]
+- Updated dependencies [f2301de]
+  - @mastra/core@0.5.0-alpha.9
+## 0.1.8-alpha.8
+### Patch Changes
+- Updated dependencies [506f1d5]
+  - @mastra/core@0.5.0-alpha.8
+## 0.1.8-alpha.7
+### Patch Changes
+- Updated dependencies [ee667a2]
+  - @mastra/core@0.5.0-alpha.7
+## 0.1.8-alpha.6
+### Patch Changes
+- Updated dependencies [f6678e4]
+  - @mastra/core@0.5.0-alpha.6
+## 0.1.8-alpha.5
+### Patch Changes
+- Updated dependencies [22643eb]
+- Updated dependencies [6feb23f]
+- Updated dependencies [f2d6727]
+- Updated dependencies [301e4ee]
+- Updated dependencies [dfbe4e9]
+- Updated dependencies [9e81f35]
+- Updated dependencies [caefaa2]
+- Updated dependencies [c151ae6]
+- Updated dependencies [52e0418]
+- Updated dependencies [03236ec]
+- Updated dependencies [3764e71]
+- Updated dependencies [df982db]
+- Updated dependencies [0461849]
+- Updated dependencies [2259379]
+- Updated dependencies [358f069]
+  - @mastra/core@0.5.0-alpha.5
+## 0.1.8-alpha.4
+### Patch Changes
+- Updated dependencies [d79aedf]
+  - @mastra/core@0.5.0-alpha.4
+## 0.1.8-alpha.3
+### Patch Changes
+- Updated dependencies [3d0e290]
+  - @mastra/core@0.5.0-alpha.3
+## 0.1.8-alpha.2
+### Patch Changes
+- Updated dependencies [02ffb7b]
+  - @mastra/core@0.5.0-alpha.2
+## 0.1.8-alpha.1
+### Patch Changes
+- Updated dependencies [dab255b]
+  - @mastra/core@0.5.0-alpha.1
 ## 0.1.8-alpha.0
 ### Patch Changes

package/dist/_tsup-dts-rollup.d.cts CHANGED Viewed

@@ -355,9 +355,9 @@ export declare function generateEvaluatePrompt_alias_6({ claims, context }: {
     context: string[];
 }): string;
-export declare function generateEvaluatePrompt_alias_7({ context, output }: {
+export declare function generateEvaluatePrompt_alias_7({ context, claims }: {
     context: string[];
-    output: string;
+    claims: string[];
 }): string;
 export declare function generateEvaluatePrompt_alias_8({ instructions, input, output, }: {
@@ -504,7 +504,7 @@ declare function globalSetup(): Promise<void>;
 export { globalSetup }
 export { globalSetup as globalSetup_alias_1 }
-export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contradicts the provided context, focusing on identifying factual inconsistencies.\n\nKey Principles:\n1. Treat each context piece as a statement to verify\n2. Verify if the output contradicts any of these statements\n3. Consider a contradiction when the output directly conflicts with context statements\n4. Consider no contradiction when the output aligns with or doesn't mention context statements\n5. Empty outputs should be handled as having no contradictions\n6. Focus on factual inconsistencies, not omissions\n7. Never use prior knowledge in judgments\n8. Speculative language (may, might, possibly) should not be considered contradictions";
+export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.\n\nKey Principles:\n1. First extract all claims from the output (both factual and speculative)\n2. Then verify each extracted claim against the provided context\n3. Consider it a hallucination if a claim contradicts the context\n4. Consider it a hallucination if a claim makes assertions not supported by context\n5. Empty outputs should be handled as having no hallucinations\n6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination\n7. Speculative language about facts NOT in the context IS a hallucination\n8. Never use prior knowledge in judgments - only use what's explicitly stated in context\n9. The following are NOT hallucinations:\n   - Using less precise dates (e.g., year when context gives month)\n   - Reasonable numerical approximations\n   - Omitting additional details while maintaining factual accuracy\n10. Subjective claims (\"made history\", \"pioneering\", \"leading\") are hallucinations unless explicitly stated in context";
 export declare class HallucinationJudge extends MastraAgentJudge {
     constructor(model: LanguageModel);

package/dist/_tsup-dts-rollup.d.ts CHANGED Viewed

@@ -355,9 +355,9 @@ export declare function generateEvaluatePrompt_alias_6({ claims, context }: {
     context: string[];
 }): string;
-export declare function generateEvaluatePrompt_alias_7({ context, output }: {
+export declare function generateEvaluatePrompt_alias_7({ context, claims }: {
     context: string[];
-    output: string;
+    claims: string[];
 }): string;
 export declare function generateEvaluatePrompt_alias_8({ instructions, input, output, }: {
@@ -504,7 +504,7 @@ declare function globalSetup(): Promise<void>;
 export { globalSetup }
 export { globalSetup as globalSetup_alias_1 }
-export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contradicts the provided context, focusing on identifying factual inconsistencies.\n\nKey Principles:\n1. Treat each context piece as a statement to verify\n2. Verify if the output contradicts any of these statements\n3. Consider a contradiction when the output directly conflicts with context statements\n4. Consider no contradiction when the output aligns with or doesn't mention context statements\n5. Empty outputs should be handled as having no contradictions\n6. Focus on factual inconsistencies, not omissions\n7. Never use prior knowledge in judgments\n8. Speculative language (may, might, possibly) should not be considered contradictions";
+export declare const HALLUCINATION_AGENT_INSTRUCTIONS = "You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.\n\nKey Principles:\n1. First extract all claims from the output (both factual and speculative)\n2. Then verify each extracted claim against the provided context\n3. Consider it a hallucination if a claim contradicts the context\n4. Consider it a hallucination if a claim makes assertions not supported by context\n5. Empty outputs should be handled as having no hallucinations\n6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination\n7. Speculative language about facts NOT in the context IS a hallucination\n8. Never use prior knowledge in judgments - only use what's explicitly stated in context\n9. The following are NOT hallucinations:\n   - Using less precise dates (e.g., year when context gives month)\n   - Reasonable numerical approximations\n   - Omitting additional details while maintaining factual accuracy\n10. Subjective claims (\"made history\", \"pioneering\", \"leading\") are hallucinations unless explicitly stated in context";
 export declare class HallucinationJudge extends MastraAgentJudge {
     constructor(model: LanguageModel);

package/dist/metrics/llm/index.cjs CHANGED Viewed

@@ -955,98 +955,101 @@ var FaithfulnessMetric = class extends _eval.Metric {
 };
 // src/metrics/llm/hallucination/prompts.ts
-var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contradicts the provided context, focusing on identifying factual inconsistencies.
+var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.
 Key Principles:
-1. Treat each context piece as a statement to verify
-2. Verify if the output contradicts any of these statements
-3. Consider a contradiction when the output directly conflicts with context statements
-4. Consider no contradiction when the output aligns with or doesn't mention context statements
-5. Empty outputs should be handled as having no contradictions
-6. Focus on factual inconsistencies, not omissions
-7. Never use prior knowledge in judgments
-8. Speculative language (may, might, possibly) should not be considered contradictions`;
-function generateEvaluatePrompt5({ context, output }) {
-  return `Verify if the output contradicts any of the provided context statements. A contradiction occurs when the output directly conflicts with a statement.
-Output to verify:
-${output}
+1. First extract all claims from the output (both factual and speculative)
+2. Then verify each extracted claim against the provided context
+3. Consider it a hallucination if a claim contradicts the context
+4. Consider it a hallucination if a claim makes assertions not supported by context
+5. Empty outputs should be handled as having no hallucinations
+6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination
+7. Speculative language about facts NOT in the context IS a hallucination
+8. Never use prior knowledge in judgments - only use what's explicitly stated in context
+9. The following are NOT hallucinations:
+   - Using less precise dates (e.g., year when context gives month)
+   - Reasonable numerical approximations
+   - Omitting additional details while maintaining factual accuracy
+10. Subjective claims ("made history", "pioneering", "leading") are hallucinations unless explicitly stated in context`;
+function generateEvaluatePrompt5({ context, claims }) {
+  return `Verify if the claims contain any information not supported by or contradicting the provided context. A hallucination occurs when a claim either:
+1. Contradicts the context
+2. Makes assertions not supported by the context
+Claims to verify:
+${claims.join("\n")}
 Number of context statements: ${context.length}
-Context statements to check:
+Context statements:
 ${context.join("\n")}
-For each context statement, determine if the output contradicts it. When evaluating numbers:
-- Numbers with qualifiers ("about", "around", "approximately") allow reasonable approximations
-- Consider the scale of the number when determining reasonable approximations
-- Only mark as contradiction if the difference would be misleading in context
-- Respect explicit precision markers ("exactly", "precisely")
+For each claim, determine if it is supported by the context. When evaluating:
+1. NOT Hallucinations:
+   - Using less precise dates (e.g., year when context gives month)
+   - Reasonable numerical approximations
+   - Omitting additional details while maintaining factual accuracy
+   - Speculative language about facts present in context
+2. ARE Hallucinations:
+   - Claims that contradict the context
+   - Assertions not supported by context
+   - Speculative claims about facts not in context
+   - Subjective claims not explicitly supported by context
 Example:
-Context: "Tesla was founded in 2003"
-Output: "Tesla, established in 2004, revolutionized the electric car industry."
+Context: [
+  "SpaceX achieved first successful landing in December 2015.",
+  "Their reusable rocket technology reduced launch costs by 30%."
+]
+Claims: [
+  "SpaceX made history in 2015",
+  "SpaceX had pioneering reusable rockets",
+  "reusable rockets significantly cut costs",
+  "They might expand operations globally"
+]
 {
     "verdicts": [
         {
-            "statement": "Tesla was founded in 2003",
+            "statement": "SpaceX made history in 2015",
             "verdict": "yes",
-            "reason": "The output claims Tesla was established in 2004, which directly contradicts the statement that it was founded in 2003"
-        }
-    ]
-}
-Context: "The company has exactly 1,234 employees"
-Output: "The company employs around 1,200 people"
-{
-    "verdicts": [
+            "reason": "The subjective claim 'made history' and the year are not supported by context"
+        },
         {
-            "statement": "The company has exactly 1,234 employees",
-            "verdict": "no",
-            "reason": "While the output uses an approximation (around 1,200), this is a reasonable representation of 1,234 employees and maintains the correct order of magnitude"
-        }
-    ]
-}
-Context: "Revenue reached $50.5 million in 2022"
-Output: "The company made about $50 million in 2022"
-{
-    "verdicts": [
+            "statement": "SpaceX had pioneering reusable rockets",
+            "verdict": "yes",
+            "reason": "The subjective claim 'pioneering' is not supported by context"
+        },
         {
-            "statement": "Revenue reached $50.5 million in 2022",
+            "statement": "reusable rockets significantly cut costs",
             "verdict": "no",
-            "reason": "The output's approximation of 'about $50 million' is a reasonable representation of $50.5 million, maintaining accuracy at this scale"
-        }
-    ]
-}
-Context: "The startup raised $2.1 million in seed funding"
-Output: "The company secured approximately $5 million in their seed round"
-{
-    "verdicts": [
+            "reason": "Context supports that costs were reduced by 30%, this is a reasonable paraphrase"
+        },
         {
-            "statement": "The startup raised $2.1 million in seed funding",
+            "statement": "They might expand operations globally",
             "verdict": "yes",
-            "reason": "Despite using 'approximately', the output claims $5 million which is more than double the actual amount ($2.1 million), making it a significant and misleading deviation"
+            "reason": "This speculative claim about facts not in context is a hallucination"
         }
     ]
 }
 Rules:
-- Only mark as contradicted if there's a direct conflict
-- Omissions are not contradictions
+- Mark as hallucination if information contradicts context
+- Mark as hallucination if assertions aren't supported by context
+- Allow reasonable approximations and less precise dates
+- Every factual claim must be verified
 - Never use prior knowledge in your judgment
 - Provide clear reasoning for each verdict
-- Be specific about where in the output the contradiction occurs
-- The number of verdicts MUST MATCH the number of context statements exactly
+- Be specific about what information is or isn't supported by context
 Format:
 {
     "verdicts": [
         {
-            "statement": "context statement",
+            "statement": "individual claim",
             "verdict": "yes/no",
-            "reason": "explanation of contradiction or lack thereof"
+            "reason": "explanation of whether the claim is supported by context"
         }
     ]
 }`;
@@ -1096,7 +1099,16 @@ var HallucinationJudge = class extends MastraAgentJudge {
     super("Hallucination", HALLUCINATION_AGENT_INSTRUCTIONS, model);
   }
   async evaluate(output, context) {
-    const evaluatePrompt = generateEvaluatePrompt5({ context, output });
+    const claimsPrompt = generateClaimExtractionPrompt({ output });
+    const claims = await this.agent.generate(claimsPrompt, {
+      output: zod.z.object({
+        claims: zod.z.array(zod.z.string())
+      })
+    });
+    if (claims.object.claims.length === 0) {
+      return [];
+    }
+    const evaluatePrompt = generateEvaluatePrompt5({ claims: claims.object.claims, context });
     const result = await this.agent.generate(evaluatePrompt, {
       output: zod.z.object({
         verdicts: zod.z.array(
@@ -1132,6 +1144,7 @@ var HallucinationMetric = class extends _eval.Metric {
   }
   async measure(input, output) {
     const verdicts = await this.judge.evaluate(output, this.context);
+    console.log("verdicts", verdicts);
     const score = this.calculateScore(verdicts);
     const reason = await this.judge.getReason({
       input,

package/dist/metrics/llm/index.js CHANGED Viewed

@@ -942,98 +942,101 @@ var FaithfulnessMetric = class extends Metric {
 };
 // src/metrics/llm/hallucination/prompts.ts
-var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contradicts the provided context, focusing on identifying factual inconsistencies.
+var HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information not supported by or contradicts the provided context.
 Key Principles:
-1. Treat each context piece as a statement to verify
-2. Verify if the output contradicts any of these statements
-3. Consider a contradiction when the output directly conflicts with context statements
-4. Consider no contradiction when the output aligns with or doesn't mention context statements
-5. Empty outputs should be handled as having no contradictions
-6. Focus on factual inconsistencies, not omissions
-7. Never use prior knowledge in judgments
-8. Speculative language (may, might, possibly) should not be considered contradictions`;
-function generateEvaluatePrompt5({ context, output }) {
-  return `Verify if the output contradicts any of the provided context statements. A contradiction occurs when the output directly conflicts with a statement.
-Output to verify:
-${output}
+1. First extract all claims from the output (both factual and speculative)
+2. Then verify each extracted claim against the provided context
+3. Consider it a hallucination if a claim contradicts the context
+4. Consider it a hallucination if a claim makes assertions not supported by context
+5. Empty outputs should be handled as having no hallucinations
+6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination
+7. Speculative language about facts NOT in the context IS a hallucination
+8. Never use prior knowledge in judgments - only use what's explicitly stated in context
+9. The following are NOT hallucinations:
+   - Using less precise dates (e.g., year when context gives month)
+   - Reasonable numerical approximations
+   - Omitting additional details while maintaining factual accuracy
+10. Subjective claims ("made history", "pioneering", "leading") are hallucinations unless explicitly stated in context`;
+function generateEvaluatePrompt5({ context, claims }) {
+  return `Verify if the claims contain any information not supported by or contradicting the provided context. A hallucination occurs when a claim either:
+1. Contradicts the context
+2. Makes assertions not supported by the context
+Claims to verify:
+${claims.join("\n")}
 Number of context statements: ${context.length}
-Context statements to check:
+Context statements:
 ${context.join("\n")}
-For each context statement, determine if the output contradicts it. When evaluating numbers:
-- Numbers with qualifiers ("about", "around", "approximately") allow reasonable approximations
-- Consider the scale of the number when determining reasonable approximations
-- Only mark as contradiction if the difference would be misleading in context
-- Respect explicit precision markers ("exactly", "precisely")
+For each claim, determine if it is supported by the context. When evaluating:
+1. NOT Hallucinations:
+   - Using less precise dates (e.g., year when context gives month)
+   - Reasonable numerical approximations
+   - Omitting additional details while maintaining factual accuracy
+   - Speculative language about facts present in context
+2. ARE Hallucinations:
+   - Claims that contradict the context
+   - Assertions not supported by context
+   - Speculative claims about facts not in context
+   - Subjective claims not explicitly supported by context
 Example:
-Context: "Tesla was founded in 2003"
-Output: "Tesla, established in 2004, revolutionized the electric car industry."
+Context: [
+  "SpaceX achieved first successful landing in December 2015.",
+  "Their reusable rocket technology reduced launch costs by 30%."
+]
+Claims: [
+  "SpaceX made history in 2015",
+  "SpaceX had pioneering reusable rockets",
+  "reusable rockets significantly cut costs",
+  "They might expand operations globally"
+]
 {
     "verdicts": [
         {
-            "statement": "Tesla was founded in 2003",
+            "statement": "SpaceX made history in 2015",
             "verdict": "yes",
-            "reason": "The output claims Tesla was established in 2004, which directly contradicts the statement that it was founded in 2003"
-        }
-    ]
-}
-Context: "The company has exactly 1,234 employees"
-Output: "The company employs around 1,200 people"
-{
-    "verdicts": [
+            "reason": "The subjective claim 'made history' and the year are not supported by context"
+        },
         {
-            "statement": "The company has exactly 1,234 employees",
-            "verdict": "no",
-            "reason": "While the output uses an approximation (around 1,200), this is a reasonable representation of 1,234 employees and maintains the correct order of magnitude"
-        }
-    ]
-}
-Context: "Revenue reached $50.5 million in 2022"
-Output: "The company made about $50 million in 2022"
-{
-    "verdicts": [
+            "statement": "SpaceX had pioneering reusable rockets",
+            "verdict": "yes",
+            "reason": "The subjective claim 'pioneering' is not supported by context"
+        },
         {
-            "statement": "Revenue reached $50.5 million in 2022",
+            "statement": "reusable rockets significantly cut costs",
             "verdict": "no",
-            "reason": "The output's approximation of 'about $50 million' is a reasonable representation of $50.5 million, maintaining accuracy at this scale"
-        }
-    ]
-}
-Context: "The startup raised $2.1 million in seed funding"
-Output: "The company secured approximately $5 million in their seed round"
-{
-    "verdicts": [
+            "reason": "Context supports that costs were reduced by 30%, this is a reasonable paraphrase"
+        },
         {
-            "statement": "The startup raised $2.1 million in seed funding",
+            "statement": "They might expand operations globally",
             "verdict": "yes",
-            "reason": "Despite using 'approximately', the output claims $5 million which is more than double the actual amount ($2.1 million), making it a significant and misleading deviation"
+            "reason": "This speculative claim about facts not in context is a hallucination"
         }
     ]
 }
 Rules:
-- Only mark as contradicted if there's a direct conflict
-- Omissions are not contradictions
+- Mark as hallucination if information contradicts context
+- Mark as hallucination if assertions aren't supported by context
+- Allow reasonable approximations and less precise dates
+- Every factual claim must be verified
 - Never use prior knowledge in your judgment
 - Provide clear reasoning for each verdict
-- Be specific about where in the output the contradiction occurs
-- The number of verdicts MUST MATCH the number of context statements exactly
+- Be specific about what information is or isn't supported by context
 Format:
 {
     "verdicts": [
         {
-            "statement": "context statement",
+            "statement": "individual claim",
             "verdict": "yes/no",
-            "reason": "explanation of contradiction or lack thereof"
+            "reason": "explanation of whether the claim is supported by context"
         }
     ]
 }`;
@@ -1083,7 +1086,16 @@ var HallucinationJudge = class extends MastraAgentJudge {
     super("Hallucination", HALLUCINATION_AGENT_INSTRUCTIONS, model);
   }
   async evaluate(output, context) {
-    const evaluatePrompt = generateEvaluatePrompt5({ context, output });
+    const claimsPrompt = generateClaimExtractionPrompt({ output });
+    const claims = await this.agent.generate(claimsPrompt, {
+      output: z.object({
+        claims: z.array(z.string())
+      })
+    });
+    if (claims.object.claims.length === 0) {
+      return [];
+    }
+    const evaluatePrompt = generateEvaluatePrompt5({ claims: claims.object.claims, context });
     const result = await this.agent.generate(evaluatePrompt, {
       output: z.object({
         verdicts: z.array(
@@ -1119,6 +1131,7 @@ var HallucinationMetric = class extends Metric {
   }
   async measure(input, output) {
     const verdicts = await this.judge.evaluate(output, this.context);
+    console.log("verdicts", verdicts);
     const score = this.calculateScore(verdicts);
     const reason = await this.judge.getReason({
       input,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@mastra/evals",
-  "version": "0.1.8-alpha.0",
+  "version": "0.1.8-alpha.10",
   "description": "",
   "type": "module",
   "main": "dist/index.js",
@@ -59,7 +59,7 @@
     "sentiment": "^5.0.2",
     "string-similarity": "^4.0.4",
     "zod": "^3.24.1",
-    "@mastra/core": "^0.5.0-alpha.0"
+    "@mastra/core": "^0.5.0-alpha.10"
   },
   "peerDependencies": {
     "ai": "^4.0.0"

package/src/metrics/llm/hallucination/index.test.ts CHANGED Viewed

@@ -1,10 +1,12 @@
 import { openai } from '@ai-sdk/openai';
-import { describe, it, expect } from 'vitest';
+import { describe, it, expect, vi } from 'vitest';
 import type { TestCaseWithContext } from '../utils';
 import { HallucinationMetric } from './index';
+vi.setConfig({ testTimeout: 30000, hookTimeout: 30000 });
 const testCases: TestCaseWithContext[] = [
   {
     // No hallucination - output aligns with context
@@ -17,7 +19,7 @@ const testCases: TestCaseWithContext[] = [
     },
   },
   {
-    // Complete hallucination - output contradicts all context
+    // Complete hallucination - output contradicts context
     input: 'Tell me about Tesla.',
     context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.', 'The first Tesla car was the Roadster.'],
     output: 'Tesla was established in 2001 by Elon Musk himself. Their first car was the Model S.',
@@ -38,33 +40,33 @@ const testCases: TestCaseWithContext[] = [
     },
   },
   {
-    // Empty output - should have no contradictions
+    // Empty output
     input: 'Tell me about Tesla.',
     context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.'],
     output: '',
     expectedResult: {
       score: 0.0,
-      reason: 'Empty output cannot contradict any context statements.',
+      reason: 'Empty output cannot contain hallucinations.',
     },
   },
   {
-    // Speculative language - should not count as contradictions
+    // Speculative language with known facts
     input: 'Tell me about Tesla.',
     context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.'],
     output: 'Tesla might have been founded around 2003, and I believe Elon Musk possibly joined a year later.',
     expectedResult: {
       score: 0.0,
-      reason: 'Speculative language (might, possibly, believe) does not constitute contradictions with context.',
+      reason: 'Speculative language about facts that match context is not considered hallucination.',
     },
   },
   {
-    // Empty context - should return score of 0
+    // Empty context
     input: 'Tell me about Tesla.',
     context: [],
     output: 'Tesla was founded in 2001 by Elon Musk.',
     expectedResult: {
-      score: 0.0,
-      reason: 'No context statements to contradict, resulting in zero hallucination score.',
+      score: 1.0,
+      reason: 'With no context provided, any factual claims are considered hallucinations.',
     },
   },
   {
@@ -73,8 +75,9 @@ const testCases: TestCaseWithContext[] = [
     context: ['SpaceX achieved first successful landing in 2015.', 'Their first crewed mission was in 2020.'],
     output: 'Before anyone else, SpaceX pioneered reusable rockets with their first landing in 2014.',
     expectedResult: {
-      score: 0.5,
-      reason: 'One context statement is contradicted through implicit claim about timing (2014 vs 2015).',
+      score: 1.0,
+      reason:
+        'Both the timing claim (2014 vs 2015) and the unsupported "Before anyone else" pioneering claim are hallucinations.',
     },
   },
   {
@@ -91,16 +94,15 @@ const testCases: TestCaseWithContext[] = [
     // Out of scope additions
     input: 'Tell me about the company.',
     context: ['The company was founded in New York.', 'They specialize in software.'],
-    output:
-      'The company, founded in New York, specializes in software and has offices worldwide with plans to expand into AI.',
+    output: 'The company, founded in New York, specializes in software and has offices worldwide.',
     expectedResult: {
-      score: 0.0,
+      score: 0.33,
       reason:
-        'Additional information beyond context scope is not counted as contradictions unless it directly conflicts with context.',
+        'One out of three claims (worldwide offices) is a hallucination, while founding location and specialization are supported.',
     },
   },
   {
-    // Temporal contradictions
+    // Temporal sequence
     input: 'Describe the project timeline.',
     context: [
       'Project started in January 2023.',
@@ -111,104 +113,205 @@ const testCases: TestCaseWithContext[] = [
     expectedResult: {
       score: 0.67,
       reason:
-        'Two context statements are contradicted through temporal inconsistency in phase completion order and dates.',
+        'Two out of three claims are hallucinations: Phase 2 completion date and the claim about Phase 2 finishing before Phase 1.',
     },
   },
   {
-    // Numerical contradiction despite approximation
+    // Numerical contradiction
     input: 'Tell me about the company size.',
     context: ['The company employs 300 people globally.'],
     output: 'The company has approximately 1000 employees worldwide.',
     expectedResult: {
       score: 1.0,
       reason:
-        'Despite using "approximately", the claimed number (1000) represents a deviation too large from the actual value (300) to be considered a reasonable approximation.',
+        'Despite using "approximately", the claimed number (1000) represents a deviation too large from the actual value (300).',
+    },
+  },
+  {
+    // Additional information
+    input: 'Tell me about Tesla.',
+    context: ['Tesla was founded in 2003.'],
+    output: 'Tesla, founded in 2003, is a leading electric car manufacturer with global operations.',
+    expectedResult: {
+      score: 0.67,
+      reason:
+        'Two out of three claims (being a leading manufacturer and having global operations) are not supported by context.',
+    },
+  },
+  {
+    // Speculative claims about unknown facts
+    input: 'Tell me about Tesla.',
+    context: ['Tesla was founded in 2003.', 'Elon Musk joined Tesla in 2004.'],
+    output: 'Tesla was founded in 2003, and might be the most innovative car company in history.',
+    expectedResult: {
+      score: 0.5,
+      reason: 'While founding date is supported, speculative claim about being most innovative is not in context.',
+    },
+  },
+  {
+    // Date precision is acceptable here; the unsupported subjective claim ("made history") is what gets flagged
+    input: 'Tell me about SpaceX achievements.',
+    context: ['SpaceX achieved first successful landing in December 2015.'],
+    output: 'SpaceX made history with their first successful landing in 2015.',
+    expectedResult: {
+      score: 1.0,
+      reason:
+        'The statement contains an unsupported subjective claim ("made history") that modifies the factual landing claim.',
+    },
+  },
+  {
+    // Numerical precision
+    input: 'Tell me about the company size.',
+    context: ['The company employs exactly 300 people globally.'],
+    output: 'The company has approximately 300 employees worldwide.',
+    expectedResult: {
+      score: 0.0,
+      reason: 'Using "approximately" when context specifies "exactly" is still considered a reasonable approximation.',
+    },
+  },
+  {
+    // Mixed precision levels
+    input: 'Tell me about revenue growth.',
+    context: ['Company revenue grew from exactly $10.5M in Q1 to approximately $20M in Q2.'],
+    output: 'Revenue was about $10M in Q1 and exactly $20M in Q2.',
+    expectedResult: {
+      score: 1.0,
+      reason:
+        'Mismatched precision levels: uses "about" when context specifies "exactly" for Q1, and uses "exactly" when context specifies "approximately" for Q2.',
+    },
+  },
+  {
+    // Relative comparisons
+    input: 'Tell me about the market share.',
+    context: ['Company A has 30% market share.', 'Company B has 25% market share.'],
+    output: 'Company A leads the market with 30% share, ahead of Company B.',
+    expectedResult: {
+      score: 0.5,
+      reason:
+        'While the market share numbers are correct, the claim about "leading the market" is not supported as we don\'t know about other companies.',
     },
   },
 ];
 const model = openai('gpt-4o');
-describe(
-  'HallucinationMetric',
-  () => {
-    it('should handle perfect alignment', async () => {
-      const testCase = testCases[0]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-    });
-    it('should handle complete hallucination', async () => {
-      const testCase = testCases[1]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-    });
-    it('should handle partial hallucination', async () => {
-      const testCase = testCases[2]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-    });
-    it('should handle empty output', async () => {
-      const testCase = testCases[3]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBe(testCase.expectedResult.score);
-    });
-    it('should handle speculative language', async () => {
-      const testCase = testCases[4]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-    });
-    it('should handle empty context', async () => {
-      const testCase = testCases[5]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBe(testCase.expectedResult.score);
-    });
-    it('should handle implicit contradictions', async () => {
-      const testCase = testCases[6]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-    });
-    it('should handle numerical approximations', async () => {
-      const testCase = testCases[7]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-    });
-    it('should handle out of scope additions', async () => {
-      const testCase = testCases[8]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-    });
-    it('should handle temporal contradictions', async () => {
-      const testCase = testCases[9]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-    });
-    it('should handle numerical contradiction despite approximation', async () => {
-      const testCase = testCases[10]!;
-      const metric = new HallucinationMetric(model, { context: testCase.context });
-      const result = await metric.measure(testCase.input, testCase.output);
-      expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
-    });
-  },
-  {
-    timeout: 15 * 10000,
-  },
-);
+describe('HallucinationMetric', () => {
+  it('should handle perfect alignment', async () => {
+    const testCase = testCases[0]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should handle complete hallucination', async () => {
+    const testCase = testCases[1]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should handle partial hallucination', async () => {
+    const testCase = testCases[2]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should handle empty output', async () => {
+    const testCase = testCases[3]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBe(testCase.expectedResult.score);
+  });
+  it('should handle speculative language', async () => {
+    const testCase = testCases[4]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should handle empty context', async () => {
+    const testCase = testCases[5]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBe(testCase.expectedResult.score);
+  });
+  it('should handle implicit contradictions', async () => {
+    const testCase = testCases[6]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should handle numerical approximations', async () => {
+    const testCase = testCases[7]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should handle out of scope additions', async () => {
+    const testCase = testCases[8]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should handle temporal contradictions', async () => {
+    const testCase = testCases[9]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should handle numerical contradiction despite approximation', async () => {
+    const testCase = testCases[10]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  // New tests for stricter hallucination checking
+  it('should detect additional information as hallucination', async () => {
+    const testCase = testCases[11]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should detect speculative claims about unknown facts as hallucination', async () => {
+    const testCase = testCases[12]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should flag subjective claims even when the date is acceptably less precise', async () => {
+    const testCase = testCases[13]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should accept "approximately" for a value the context states exactly', async () => {
+    const testCase = testCases[14]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should handle mixed precision levels', async () => {
+    const testCase = testCases[15]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+  it('should handle relative comparisons', async () => {
+    const testCase = testCases[16]!;
+    const metric = new HallucinationMetric(model, { context: testCase.context });
+    const result = await metric.measure(testCase.input, testCase.output);
+    expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
+  });
+});

package/src/metrics/llm/hallucination/index.ts CHANGED Viewed

@@ -26,6 +26,7 @@ export class HallucinationMetric extends Metric {
   async measure(input: string, output: string): Promise<MetricResultWithReason> {
     const verdicts = await this.judge.evaluate(output, this.context);
+    console.log('verdicts', verdicts);
     const score = this.calculateScore(verdicts);
     const reason = await this.judge.getReason({
       input,

package/src/metrics/llm/hallucination/metricJudge.ts CHANGED Viewed

@@ -2,7 +2,7 @@ import type { LanguageModel } from '@mastra/core/llm';
 import { z } from 'zod';
 import { MastraAgentJudge } from '../../judge';
+import { generateClaimExtractionPrompt } from '../faithfulness/prompts';
 import { generateEvaluatePrompt, HALLUCINATION_AGENT_INSTRUCTIONS, generateReasonPrompt } from './prompts';
 export class HallucinationJudge extends MastraAgentJudge {
@@ -11,7 +11,18 @@ export class HallucinationJudge extends MastraAgentJudge {
   }
   async evaluate(output: string, context: string[]): Promise<{ statement: string; verdict: string; reason: string }[]> {
-    const evaluatePrompt = generateEvaluatePrompt({ context, output });
+    const claimsPrompt = generateClaimExtractionPrompt({ output });
+    const claims = await this.agent.generate(claimsPrompt, {
+      output: z.object({
+        claims: z.array(z.string()),
+      }),
+    });
+    if (claims.object.claims.length === 0) {
+      return [];
+    }
+    const evaluatePrompt = generateEvaluatePrompt({ claims: claims.object.claims, context });
     const result = await this.agent.generate(evaluatePrompt, {
       output: z.object({
         verdicts: z.array(

package/src/metrics/llm/hallucination/prompts.ts CHANGED Viewed

@@ -1,96 +1,99 @@
-export const HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contradicts the provided context, focusing on identifying factual inconsistencies.
+export const HALLUCINATION_AGENT_INSTRUCTIONS = `You are a precise and thorough hallucination evaluator. Your job is to determine if an LLM's output contains information that is not supported by, or that contradicts, the provided context.
 Key Principles:
-1. Treat each context piece as a statement to verify
-2. Verify if the output contradicts any of these statements
-3. Consider a contradiction when the output directly conflicts with context statements
-4. Consider no contradiction when the output aligns with or doesn't mention context statements
-5. Empty outputs should be handled as having no contradictions
-6. Focus on factual inconsistencies, not omissions
-7. Never use prior knowledge in judgments
-8. Speculative language (may, might, possibly) should not be considered contradictions`;
+1. First extract all claims from the output (both factual and speculative)
+2. Then verify each extracted claim against the provided context
+3. Consider it a hallucination if a claim contradicts the context
+4. Consider it a hallucination if a claim makes assertions not supported by context
+5. Empty outputs should be handled as having no hallucinations
+6. Speculative language (may, might, possibly) about facts IN the context is NOT a hallucination
+7. Speculative language about facts NOT in the context IS a hallucination
+8. Never use prior knowledge in judgments - only use what's explicitly stated in context
+9. The following are NOT hallucinations:
+   - Using less precise dates (e.g., year when context gives month)
+   - Reasonable numerical approximations
+   - Omitting additional details while maintaining factual accuracy
+10. Subjective claims ("made history", "pioneering", "leading") are hallucinations unless explicitly stated in context`;
-export function generateEvaluatePrompt({ context, output }: { context: string[]; output: string }) {
-  return `Verify if the output contradicts any of the provided context statements. A contradiction occurs when the output directly conflicts with a statement.
+export function generateEvaluatePrompt({ context, claims }: { context: string[]; claims: string[] }) {
+  return `Verify if the claims contain any information not supported by or contradicting the provided context. A hallucination occurs when a claim either:
+1. Contradicts the context
+2. Makes assertions not supported by the context
-Output to verify:
-${output}
+Claims to verify:
+${claims.join('\n')}
 Number of context statements: ${context.length}
-Context statements to check:
+Context statements:
 ${context.join('\n')}
-For each context statement, determine if the output contradicts it. When evaluating numbers:
-- Numbers with qualifiers ("about", "around", "approximately") allow reasonable approximations
-- Consider the scale of the number when determining reasonable approximations
-- Only mark as contradiction if the difference would be misleading in context
-- Respect explicit precision markers ("exactly", "precisely")
+For each claim, determine if it is supported by the context. When evaluating:
+1. NOT Hallucinations:
+   - Using less precise dates (e.g., year when context gives month)
+   - Reasonable numerical approximations
+   - Omitting additional details while maintaining factual accuracy
+   - Speculative language about facts present in context
+2. ARE Hallucinations:
+   - Claims that contradict the context
+   - Assertions not supported by context
+   - Speculative claims about facts not in context
+   - Subjective claims not explicitly supported by context
 Example:
-Context: "Tesla was founded in 2003"
-Output: "Tesla, established in 2004, revolutionized the electric car industry."
+Context: [
+  "SpaceX achieved first successful landing in December 2015.",
+  "Their reusable rocket technology reduced launch costs by 30%."
+]
+Claims: [
+  "SpaceX made history in 2015",
+  "SpaceX had pioneering reusable rockets",
+  "reusable rockets significantly cut costs",
+  "They might expand operations globally"
+]
 {
     "verdicts": [
         {
-            "statement": "Tesla was founded in 2003",
+            "statement": "SpaceX made history in 2015",
             "verdict": "yes",
-            "reason": "The output claims Tesla was established in 2004, which directly contradicts the statement that it was founded in 2003"
-        }
-    ]
-}
-Context: "The company has exactly 1,234 employees"
-Output: "The company employs around 1,200 people"
-{
-    "verdicts": [
+            "reason": "The subjective claim 'made history' is not supported by context; the year 2015 on its own is an acceptable, less precise form of December 2015"
+        },
         {
-            "statement": "The company has exactly 1,234 employees",
-            "verdict": "no",
-            "reason": "While the output uses an approximation (around 1,200), this is a reasonable representation of 1,234 employees and maintains the correct order of magnitude"
-        }
-    ]
-}
-Context: "Revenue reached $50.5 million in 2022"
-Output: "The company made about $50 million in 2022"
-{
-    "verdicts": [
+            "statement": "SpaceX had pioneering reusable rockets",
+            "verdict": "yes",
+            "reason": "The subjective claim 'pioneering' is not supported by context"
+        },
         {
-            "statement": "Revenue reached $50.5 million in 2022",
+            "statement": "reusable rockets significantly cut costs",
             "verdict": "no",
-            "reason": "The output's approximation of 'about $50 million' is a reasonable representation of $50.5 million, maintaining accuracy at this scale"
-        }
-    ]
-}
-Context: "The startup raised $2.1 million in seed funding"
-Output: "The company secured approximately $5 million in their seed round"
-{
-    "verdicts": [
+            "reason": "Context supports that costs were reduced by 30%, this is a reasonable paraphrase"
+        },
         {
-            "statement": "The startup raised $2.1 million in seed funding",
+            "statement": "They might expand operations globally",
             "verdict": "yes",
-            "reason": "Despite using 'approximately', the output claims $5 million which is more than double the actual amount ($2.1 million), making it a significant and misleading deviation"
+            "reason": "This speculative claim about facts not in context is a hallucination"
         }
     ]
 }
 Rules:
-- Only mark as contradicted if there's a direct conflict
-- Omissions are not contradictions
+- Mark as hallucination if information contradicts context
+- Mark as hallucination if assertions aren't supported by context
+- Allow reasonable approximations and less precise dates
+- Every factual claim must be verified
 - Never use prior knowledge in your judgment
 - Provide clear reasoning for each verdict
-- Be specific about where in the output the contradiction occurs
-- The number of verdicts MUST MATCH the number of context statements exactly
+- Be specific about what information is or isn't supported by context
 Format:
 {
     "verdicts": [
         {
-            "statement": "context statement",
+            "statement": "individual claim",
             "verdict": "yes/no",
-            "reason": "explanation of contradiction or lack thereof"
+            "reason": "explanation of whether the claim is supported by context"
         }
     ]
 }`;