hallucinationbench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hallucinationbench-0.1.0/LICENSE +21 -0
- hallucinationbench-0.1.0/PKG-INFO +192 -0
- hallucinationbench-0.1.0/README.md +164 -0
- hallucinationbench-0.1.0/hallucinationbench/__init__.py +5 -0
- hallucinationbench-0.1.0/hallucinationbench/models.py +44 -0
- hallucinationbench-0.1.0/hallucinationbench/scorer.py +142 -0
- hallucinationbench-0.1.0/hallucinationbench.egg-info/PKG-INFO +192 -0
- hallucinationbench-0.1.0/hallucinationbench.egg-info/SOURCES.txt +11 -0
- hallucinationbench-0.1.0/hallucinationbench.egg-info/dependency_links.txt +1 -0
- hallucinationbench-0.1.0/hallucinationbench.egg-info/requires.txt +3 -0
- hallucinationbench-0.1.0/hallucinationbench.egg-info/top_level.txt +1 -0
- hallucinationbench-0.1.0/setup.cfg +4 -0
- hallucinationbench-0.1.0/setup.py +28 -0
@@ -0,0 +1,21 @@ hallucinationbench-0.1.0/LICENSE
MIT License

Copyright (c) 2026 Devasish Banerjee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,192 @@ hallucinationbench-0.1.0/PKG-INFO
Metadata-Version: 2.4
Name: hallucinationbench
Version: 0.1.0
Summary: Detect hallucinations in your RAG pipeline output in two lines of Python.
Home-page: https://github.com/bdeva1975/hallucinationbench
Author: Devasish Banerjee
Author-email: bdeva1975@gmail.com
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: openai>=1.0.0
Requires-Dist: python-dotenv>=1.0.0
Requires-Dist: httpx>=0.27.0
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license-file
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary
(The remainder of PKG-INFO is the long description, identical to README.md below.)
@@ -0,0 +1,164 @@ hallucinationbench-0.1.0/README.md
# 🔬 HallucinationBench

**Detect hallucinations in your RAG pipeline output — in two lines of Python.**

![Python](https://img.shields.io/badge/python-3.10+-blue)
![License](https://img.shields.io/badge/license-MIT-green)
![Status](https://img.shields.io/badge/status-alpha-orange)

---

## The problem

Your RAG pipeline retrieves documents and passes them to an LLM.
The LLM generates a response that *sounds* correct.
But is every claim actually grounded in your context — or did the model fabricate some of it?

**HallucinationBench answers that question instantly.**

---

## Quickstart

Install the package (this pulls in `openai`, `python-dotenv`, and `httpx`):

```bash
pip install hallucinationbench
```

Set your OpenAI API key:

```bash
# .env
OPENAI_API_KEY=your_key_here
```

Run your first evaluation:

```python
from hallucinationbench import score

context = """
The Eiffel Tower is located in Paris, France. It was constructed between
1887 and 1889 as the entrance arch for the 1889 World's Fair.
The tower is 330 metres tall.
"""

response = """
The Eiffel Tower is in Paris. It was built in 1889 and stands 330 metres
tall. It was designed by Leonardo da Vinci and attracts over 7 million
visitors every year.
"""

result = score(context=context, response=response)
print(result)
```
Output:

```
Verdict : FAIL
Faithfulness : 0.40

Grounded claims (2):
 ✓ The Eiffel Tower is in Paris.
 ✓ It stands 330 metres tall.

Hallucinated claims (3):
 ✗ It was built in 1889.
 ✗ It was designed by Leonardo da Vinci.
 ✗ It attracts over 7 million visitors every year.
```

---

## The result object

```python
result.faithfulness_score   # float 0.0 – 1.0
result.grounded_claims      # list of supported statements
result.hallucinated_claims  # list of fabricated statements
result.verdict              # "PASS" | "WARN" | "FAIL"
result.model                # judge model used
```

| Verdict | Faithfulness Score |
|---------|--------------------|
| ✅ PASS | >= 0.8             |
| ⚠️ WARN | >= 0.5 and < 0.8   |
| ❌ FAIL | < 0.5              |
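In a test suite or CI job you can gate on the verdict directly. A minimal sketch — the test name, example strings, and the choice of threshold are illustrative:

```python
from hallucinationbench import score

def test_answer_is_grounded():
    result = score(
        context="The Eiffel Tower is located in Paris, France.",
        response="The Eiffel Tower is in Paris.",
    )
    # Fail the build on any FAIL verdict; tighten to require PASS if needed.
    assert result.verdict != "FAIL", f"Hallucinated: {result.hallucinated_claims}"
```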
---

## Streamlit demo

Run the interactive demo locally:

```bash
streamlit run app.py
```

Paste any context and LLM response — get an instant hallucination report.

---

## How it works

1. Your `context` and `response` are sent to the judge model (`gpt-4o-mini` by default) as a structured judge prompt.
2. The judge breaks the response into individual factual claims.
3. Each claim is classified as grounded (supported by the context) or hallucinated (absent from, or contradicted by, the context).
4. A faithfulness score is calculated: `grounded_claims / total_claims`.
5. A verdict of PASS, WARN, or FAIL is assigned.

The judge is called with `response_format: json_object`, which guarantees syntactically valid JSON (the schema itself is enforced by the prompt).
Temperature is set to 0 to keep results as close to deterministic as possible.
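For the quickstart example above, the raw judge output (before it is parsed into a `ScoreResult`) would look roughly like the Python literal below; exact claim wording varies from run to run:

```python
# Illustrative judge output for the Eiffel Tower example — this is the
# JSON shape the scorer parses, shown here as a Python dict.
judge_output = {
    "grounded_claims": [
        "The Eiffel Tower is in Paris.",
        "It stands 330 metres tall.",
    ],
    "hallucinated_claims": [
        "It was built in 1889.",
        "It was designed by Leonardo da Vinci.",
        "It attracts over 7 million visitors every year.",
    ],
    "faithfulness_score": 0.4,
}
```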
---

## Cost

Each evaluation uses `gpt-4o-mini` by default.
Typical cost: **~$0.001 per evaluation** (roughly a tenth of a cent); the exact figure scales with the combined token length of your context and response.
---

## Roadmap

- [ ] Batch evaluation across multiple context/response pairs (a possible shape is sketched below)
- [ ] CSV upload support in the Streamlit app
- [ ] Custom judge model selection
- [ ] LangChain and LlamaIndex integration hooks
- [ ] CI/CD integration example (GitHub Actions)
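Batch evaluation is not implemented yet; until it lands, a plain loop over the public `score()` API covers the same ground. A minimal sketch, with made-up example data:

```python
from hallucinationbench import score

# Hypothetical dataset: (context, response) pairs from your RAG pipeline.
pairs = [
    ("The Eiffel Tower is located in Paris, France.",
     "The Eiffel Tower is in Paris."),
    ("Mount Everest is 8,849 metres tall.",
     "Mount Everest is 9,000 metres tall."),
]

results = [score(context=c, response=r) for c, r in pairs]
for (context, _), result in zip(pairs, results):
    # One line per pair: verdict, score, and a snippet of the context.
    print(f"{result.verdict}  {result.faithfulness_score:.2f}  {context[:40]}...")
```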
---

## Project structure

```
hallucinationbench/
├── hallucinationbench/
│   ├── __init__.py      # public API
│   ├── scorer.py        # GPT-4o-mini judge
│   └── models.py        # ScoreResult dataclass
├── app.py               # Streamlit demo
├── example.py           # quickstart example
├── requirements.txt
├── .env.example
└── README.md
```

---

## License

MIT — free to use, modify, and distribute.

---

## Contributing

Pull requests are welcome. Please open an issue first to discuss what you would like to change.

---

*Built with OpenAI GPT-4o-mini as the hallucination judge.*
@@ -0,0 +1,44 @@ hallucinationbench-0.1.0/hallucinationbench/models.py
from dataclasses import dataclass, field
from typing import List


@dataclass
class ScoreResult:
    """
    The result returned by hallucinationbench.score().

    Attributes:
        faithfulness_score  : float between 0.0 and 1.0.
                              1.0 means fully grounded, 0.0 means fully hallucinated.
        grounded_claims     : list of statements in the response that are
                              supported by the context.
        hallucinated_claims : list of statements in the response that are
                              NOT supported by, or contradict, the context.
        verdict             : "PASS" → faithfulness_score >= 0.8
                              "WARN" → faithfulness_score >= 0.5
                              "FAIL" → faithfulness_score < 0.5
        model               : the OpenAI model used as the judge.
    """

    faithfulness_score: float
    grounded_claims: List[str] = field(default_factory=list)
    hallucinated_claims: List[str] = field(default_factory=list)
    verdict: str = "FAIL"
    model: str = "gpt-4o-mini"

    def __str__(self) -> str:
        lines = [
            f"Verdict : {self.verdict}",
            f"Faithfulness : {self.faithfulness_score:.2f}",
            "",
            f"Grounded claims ({len(self.grounded_claims)}):",
        ]
        for claim in self.grounded_claims:
            lines.append(f" ✓ {claim}")

        lines.append("")
        lines.append(f"Hallucinated claims ({len(self.hallucinated_claims)}):")
        for claim in self.hallucinated_claims:
            lines.append(f" ✗ {claim}")

        return "\n".join(lines)
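As a quick sanity check of the report format, you can construct a `ScoreResult` by hand and print it; the values below are made up for illustration, since normally `scorer.score()` builds this object for you:

```python
from hallucinationbench.models import ScoreResult

# Illustrative values only — in real use, scorer.score() fills these in.
result = ScoreResult(
    faithfulness_score=0.5,
    grounded_claims=["The Eiffel Tower is in Paris."],
    hallucinated_claims=["It was designed by Leonardo da Vinci."],
    verdict="WARN",
)
print(result)  # renders the Verdict / Faithfulness / claims report shown in the README
```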
@@ -0,0 +1,142 @@ hallucinationbench-0.1.0/hallucinationbench/scorer.py
import os
import json
from openai import OpenAI
from dotenv import load_dotenv
from hallucinationbench.models import ScoreResult

load_dotenv()

_client = None


def _get_client() -> OpenAI:
    """Lazy-initialise the OpenAI client once."""
    global _client
    if _client is None:
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise EnvironmentError(
                "OPENAI_API_KEY not found. "
                "Set it in your .env file or as an environment variable."
            )
        _client = OpenAI(api_key=api_key)
    return _client


_SYSTEM_PROMPT = """
You are a hallucination detection judge.

Your job is to evaluate whether an AI-generated response is faithful to a
given context. A response is faithful if every factual claim it makes is
directly supported by the context. A claim is hallucinated if it introduces
facts, figures, names, or statements that are absent from or contradict
the context.

You must respond ONLY with a valid JSON object. No explanation. No markdown.
No code fences. Raw JSON only.

The JSON must follow this exact schema:
{
  "grounded_claims": ["claim 1", "claim 2"],
  "hallucinated_claims": ["claim A", "claim B"],
  "faithfulness_score": 0.75
}

Rules:
- Break the response into individual factual claims.
- For each claim, decide: grounded (supported by context) or hallucinated.
- faithfulness_score = grounded_claims / total_claims.
  If there are no claims at all, set faithfulness_score to 1.0.
- Keep each claim as a short, self-contained sentence.
- Do not include opinions, hedges, or non-factual statements as claims.
""".strip()


def _build_user_prompt(context: str, response: str) -> str:
    return (
        f"CONTEXT:\n{context.strip()}\n\n"
        f"RESPONSE TO EVALUATE:\n{response.strip()}"
    )


def _parse_result(raw_json: str, model: str) -> ScoreResult:
    """Parse the JSON returned by the judge into a ScoreResult."""
    try:
        data = json.loads(raw_json)
    except json.JSONDecodeError as e:
        raise ValueError(
            f"Judge returned invalid JSON.\n"
            f"Raw output: {raw_json}\n"
            f"Error: {e}"
        ) from e

    grounded = data.get("grounded_claims", [])
    hallucinated = data.get("hallucinated_claims", [])
    score = float(data.get("faithfulness_score", 0.0))

    # Clamp to [0.0, 1.0] defensively
    score = max(0.0, min(1.0, score))

    if score >= 0.8:
        verdict = "PASS"
    elif score >= 0.5:
        verdict = "WARN"
    else:
        verdict = "FAIL"

    return ScoreResult(
        faithfulness_score=score,
        grounded_claims=grounded,
        hallucinated_claims=hallucinated,
        verdict=verdict,
        model=model,
    )


def score(
    context: str,
    response: str,
    model: str = "gpt-4o-mini",
) -> ScoreResult:
    """
    Evaluate whether an LLM response is faithful to the provided context.

    Args:
        context  : The source text the response should be grounded in.
                   Typically your retrieved RAG documents.
        response : The LLM-generated response to evaluate.
        model    : OpenAI model to use as the judge.
                   Default: "gpt-4o-mini" (fast and cheap).

    Returns:
        ScoreResult with faithfulness_score, grounded_claims,
        hallucinated_claims, and verdict.

    Example:
        from hallucinationbench import score

        result = score(
            context="The Eiffel Tower is located in Paris, France.",
            response="The Eiffel Tower is in Paris. It was built in 1889."
        )
        print(result)
    """
    if not context or not context.strip():
        raise ValueError("context cannot be empty.")
    if not response or not response.strip():
        raise ValueError("response cannot be empty.")

    client = _get_client()

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": _build_user_prompt(context, response)},
        ],
        temperature=0,
        response_format={"type": "json_object"},
    )

    raw_json = completion.choices[0].message.content
    return _parse_result(raw_json, model)
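Because `score()` exposes a `model` parameter, you can swap in a different judge. For example, assuming your account has access to `gpt-4o` (any OpenAI chat model that supports `response_format={"type": "json_object"}` should work the same way):

```python
from hallucinationbench import score

# Use a larger judge model for harder evaluations.
result = score(
    context="The Eiffel Tower is located in Paris, France.",
    response="The Eiffel Tower is in Paris.",
    model="gpt-4o",
)
print(result.model)    # "gpt-4o"
print(result.verdict)  # "PASS" for a fully grounded response
```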
@@ -0,0 +1,192 @@ hallucinationbench-0.1.0/hallucinationbench.egg-info/PKG-INFO
(Identical to hallucinationbench-0.1.0/PKG-INFO above.)
@@ -0,0 +1,11 @@ hallucinationbench-0.1.0/hallucinationbench.egg-info/SOURCES.txt
LICENSE
README.md
setup.py
hallucinationbench/__init__.py
hallucinationbench/models.py
hallucinationbench/scorer.py
hallucinationbench.egg-info/PKG-INFO
hallucinationbench.egg-info/SOURCES.txt
hallucinationbench.egg-info/dependency_links.txt
hallucinationbench.egg-info/requires.txt
hallucinationbench.egg-info/top_level.txt
@@ -0,0 +1 @@ hallucinationbench-0.1.0/hallucinationbench.egg-info/dependency_links.txt
(a single blank line)
@@ -0,0 +1 @@ hallucinationbench-0.1.0/hallucinationbench.egg-info/top_level.txt
hallucinationbench
@@ -0,0 +1,28 @@ hallucinationbench-0.1.0/setup.py
from setuptools import setup, find_packages

with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="hallucinationbench",
    version="0.1.0",
    author="Devasish Banerjee",
    author_email="bdeva1975@gmail.com",
    description="Detect hallucinations in your RAG pipeline output in two lines of Python.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/bdeva1975/hallucinationbench",
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    python_requires=">=3.10",
    install_requires=[
        "openai>=1.0.0",
        "python-dotenv>=1.0.0",
        "httpx>=0.27.0",
    ],
)