libhallubench-0.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- libhallubench/__init__.py +15 -0
- libhallubench/cli.py +64 -0
- libhallubench/evaluate.py +247 -0
- libhallubench/libhallubench-control.jsonl +321 -0
- libhallubench/libhallubench-describe.jsonl +1926 -0
- libhallubench/libhallubench-specify.jsonl +1926 -0
- libhallubench/libraries.py +68 -0
- libhallubench/load.py +105 -0
- libhallubench/mitigation.py +27 -0
- libhallubench/pypi.py +101 -0
- libhallubench-0.7.dist-info/METADATA +88 -0
- libhallubench-0.7.dist-info/RECORD +16 -0
- libhallubench-0.7.dist-info/WHEEL +5 -0
- libhallubench-0.7.dist-info/entry_points.txt +2 -0
- libhallubench-0.7.dist-info/licenses/LICENSE +395 -0
- libhallubench-0.7.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""LibHalluBench - Library Hallucinations Adversarial Benchmark."""
|
|
2
|
+
|
|
3
|
+
from libhallubench.evaluate import evaluate_responses
|
|
4
|
+
from libhallubench.load import load_dataset, save_dataset
|
|
5
|
+
from libhallubench.mitigation import MitigationStrategy
|
|
6
|
+
from libhallubench.pypi import download_pypi_data
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# names re-exported as the package's public API (mirrors the imports above)
__all__ = [
    "load_dataset",
    "save_dataset",
    "download_pypi_data",
    "evaluate_responses",
    "MitigationStrategy",
]
|
libhallubench/cli.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""The command line interface entry-point for evaluating LibHalluBench benchmark responses."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
from argparse import ArgumentParser
|
|
5
|
+
|
|
6
|
+
from libhallubench.evaluate import evaluate_responses
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# sentinel used as argparse's default so omitted options can be detected
# and stripped before calling into the library (its own defaults then apply)
_DEFAULT_ARG = object()


# top-level argument parser for the evaluation CLI
parser = ArgumentParser(
    argument_default=_DEFAULT_ARG,
    description="Evaluate LLM responses against the LibHalluBench benchmark.",
)

# declarative argument specs: (flags, options), registered in order below
_ARG_SPECS = [
    (
        ("responses-file",),
        {
            "type": str,
            "help": "Path to the benchmark responses file to evaluate (.jsonl).",
        },
    ),
    (
        ("-r", "--refresh-pypi-data"),
        {
            "action": argparse.BooleanOptionalAction,
            "help": "Whether to refresh the PyPI data before evaluation.",
        },
    ),
    (
        ("-g", "--ground-truth-file"),
        {
            "type": str,
            "help": "Path to the PyPI ground truth data file (.json).",
        },
    ),
    (
        ("-o", "--output-directory"),
        {
            "type": str,
            "help": "Path to directory to save the evaluation results.",
        },
    ),
]

for _flags, _options in _ARG_SPECS:
    parser.add_argument(*_flags, **_options)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def main():
    """Evaluate responses generated from the LibHalluBench benchmark."""

    # collect the parsed command line values as a plain dict
    parsed = vars(parser.parse_args())

    # drop sentinel values so evaluate_responses falls back to its own
    # defaults, and normalise argparse dests ("responses-file") into
    # valid keyword names ("responses_file")
    kwargs = {}
    for key, value in parsed.items():
        if value is not _DEFAULT_ARG:
            kwargs[key.replace("-", "_")] = value

    # delegate to the library's evaluation entry point
    evaluate_responses(**kwargs)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""Code to evaluate results from running the LibHalluBench benchmark."""
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from libhallubench.libraries import check_for_unknown_libraries
|
|
8
|
+
from libhallubench.load import load_dataset
|
|
9
|
+
from libhallubench.pypi import download_pypi_data
|
|
10
|
+
from llm_cgr import load_jsonl, save_json
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _load_benchmark() -> dict[str, dict]:
    """
    Load the LibHalluBench benchmark dataset from the bundled split files.

    Returns a dictionary keyed by task id.
    """
    merged: dict[str, dict] = {}

    # collapse every split's record list into one id-keyed mapping
    for split_records in load_dataset().values():
        for record in split_records:
            merged[record["id"]] = record

    return merged
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _load_responses(file_path: str) -> dict[str, list[str]]:
    """
    Load model responses from a JSONL file.

    Supports two formats per line:
        - {"id": "0001", "responses": ["r1", "r2"]} (list of responses)
        - {"id": "0001", "response": "r1"} (single response, collated by id)

    Returns a dictionary mapping task ids to lists of responses.
    """
    collated: dict[str, list[str]] = {}

    for record in load_jsonl(file_path=file_path):
        task_id = record["id"]
        bucket = collated.setdefault(task_id, [])

        # accept either the list form ("responses") or the single form ("response")
        if "responses" in record:
            bucket.extend(record["responses"])
        elif "response" in record:
            bucket.append(record["response"])
        else:
            raise ValueError(
                f"Record {task_id} has neither 'responses' nor 'response' field."
            )

    return collated
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _calculate_stats(
|
|
54
|
+
task_counts: dict[str, int],
|
|
55
|
+
task_hallus: dict[str, set[str]],
|
|
56
|
+
response_counts: dict[str, int],
|
|
57
|
+
response_hallus: dict[str, set[str]],
|
|
58
|
+
) -> dict:
|
|
59
|
+
"""
|
|
60
|
+
Calculate aggregate hallucination statistics from the provided counts.
|
|
61
|
+
|
|
62
|
+
Returns a dictionary of task and response hallucination counts and rates.
|
|
63
|
+
"""
|
|
64
|
+
total_tasks = sum(task_counts.values())
|
|
65
|
+
total_hallucinated_tasks = sum(len(v) for v in task_hallus.values())
|
|
66
|
+
total_responses = sum(response_counts.values())
|
|
67
|
+
total_hallucinated_responses = sum(len(v) for v in response_hallus.values())
|
|
68
|
+
|
|
69
|
+
return {
|
|
70
|
+
"task_count": total_hallucinated_tasks,
|
|
71
|
+
"task_total": total_tasks,
|
|
72
|
+
"task_rate": total_hallucinated_tasks / total_tasks if total_tasks > 0 else 0,
|
|
73
|
+
"response_count": total_hallucinated_responses,
|
|
74
|
+
"response_total": total_responses,
|
|
75
|
+
"response_rate": (
|
|
76
|
+
total_hallucinated_responses / total_responses if total_responses > 0 else 0
|
|
77
|
+
),
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _calculate_type_stats(
|
|
82
|
+
prompt_type: str,
|
|
83
|
+
task_counts: dict[str, int],
|
|
84
|
+
task_hallus: dict[str, set[str]],
|
|
85
|
+
response_counts: dict[str, int],
|
|
86
|
+
response_hallus: dict[str, set[str]],
|
|
87
|
+
) -> dict:
|
|
88
|
+
"""
|
|
89
|
+
Calculate hallucination statistics for a single prompt type.
|
|
90
|
+
|
|
91
|
+
Returns a dictionary of task and response hallucination counts and rates.
|
|
92
|
+
"""
|
|
93
|
+
t_count = task_counts.get(prompt_type, 0)
|
|
94
|
+
t_hallus = len(task_hallus.get(prompt_type, set()))
|
|
95
|
+
r_count = response_counts.get(prompt_type, 0)
|
|
96
|
+
r_hallus = len(response_hallus.get(prompt_type, set()))
|
|
97
|
+
|
|
98
|
+
return {
|
|
99
|
+
"task_count": t_hallus,
|
|
100
|
+
"task_total": t_count,
|
|
101
|
+
"task_rate": t_hallus / t_count if t_count > 0 else 0,
|
|
102
|
+
"response_count": r_hallus,
|
|
103
|
+
"response_total": r_count,
|
|
104
|
+
"response_rate": r_hallus / r_count if r_count > 0 else 0,
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# mapping from split to its constituent types
# keys mirror the bundled dataset split files (control / describe / specify);
# values are the prompt "type" labels recorded on each benchmark task, used
# to group per-type counts back into per-split statistics during evaluation
_SPLIT_TYPES: dict[str, list[str]] = {
    "control": ["none"],
    "describe": [
        "from 2023",
        "from 2024",
        "from 2025",
        "hidden gem",
        "lesser known",
        "not widely used",
    ],
    "specify": [
        "1 character typo",
        "2-8 character typo",
        "fake library",
    ],
}
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def evaluate_responses(
    responses_file: str,
    refresh_pypi_data: bool = False,
    ground_truth_file: str | None = None,
    output_directory: str = "output",
) -> dict:
    """
    Evaluate LLM responses to the LibHalluBench benchmark dataset, detecting hallucinations
    and saving calculated statistics to file.

    Args:
        responses_file: Path to the responses JSONL file (formats per _load_responses).
        refresh_pypi_data: If True, re-download the PyPI ground truth data first.
        ground_truth_file: Optional path to a PyPI ground truth data file; passed to
            the hallucination check and, when refreshing, used as the download target.
        output_directory: Directory where the evaluation JSON is written (created
            if missing).

    Raises:
        ValueError: If the responses file contains ids not present in the benchmark.

    Returns a dictionary with keys for each split and a hallucinations summary.
    """
    print(f"Evaluating benchmark responses from file {responses_file}")
    print(
        f"Parameters: {refresh_pypi_data=}, {ground_truth_file=}, {output_directory=}"
    )

    # refresh the pypi data if requested
    if refresh_pypi_data:
        if ground_truth_file:
            download_pypi_data(destination=ground_truth_file)
        else:
            # no target given: download_pypi_data uses its own default destination
            download_pypi_data()

    # load benchmark and results data
    benchmark = _load_benchmark()
    results_data = _load_responses(file_path=responses_file)

    # validate the provided responses file
    if any(key not in benchmark for key in results_data.keys()):
        raise ValueError(
            "The results file contains keys that do not match benchmark task ids."
        )

    # initialise per-type tracking; keys are prompt "type" labels
    response_counts: defaultdict[str, int] = defaultdict(int)
    response_hallus: defaultdict[str, set[str]] = defaultdict(set)
    task_counts: defaultdict[str, int] = defaultdict(int)
    task_hallus: defaultdict[str, set[str]] = defaultdict(set)
    # hallus accumulates the actual unknown library names seen, per type
    hallus: defaultdict[str, set[str]] = defaultdict(set)

    # loop through responses for each benchmark record
    for bench_id, responses in results_data.items():
        # tasks with an empty response list are excluded from all counts
        if not responses:
            continue

        # access the benchmark record and its type
        bench_record = benchmark[bench_id]
        prompt_type = bench_record["type"]

        # increment counts
        task_counts[prompt_type] += 1
        response_counts[prompt_type] += len(responses)

        # check each response for hallucinations
        for _idx, _response in enumerate(responses):
            _hallus = check_for_unknown_libraries(
                response=_response,
                ground_truth_file=ground_truth_file,
            )

            # update hallucinations: responses keyed "<task>_<index>", tasks by id
            if _hallus:
                response_hallus[prompt_type].add(f"{bench_id}_{_idx}")
                task_hallus[prompt_type].add(bench_id)
                hallus[prompt_type].update(_hallus)

    # build results structured by split
    results: dict = {}

    for split, types in _SPLIT_TYPES.items():
        # filter counts to only include types in this split
        # (membership tests avoid inserting empty defaults into the defaultdicts)
        split_task_counts = {t: task_counts[t] for t in types if t in task_counts}
        split_task_hallus = {t: task_hallus[t] for t in types if t in task_hallus}
        split_resp_counts = {
            t: response_counts[t] for t in types if t in response_counts
        }
        split_resp_hallus = {
            t: response_hallus[t] for t in types if t in response_hallus
        }

        # aggregate stats for the split
        split_results: dict = _calculate_stats(
            task_counts=split_task_counts,
            task_hallus=split_task_hallus,
            response_counts=split_resp_counts,
            response_hallus=split_resp_hallus,
        )

        # per-type stats (only for splits with multiple types)
        if len(types) > 1:
            split_results["types"] = {
                t: _calculate_type_stats(
                    prompt_type=t,
                    task_counts=task_counts,
                    task_hallus=task_hallus,
                    response_counts=response_counts,
                    response_hallus=response_hallus,
                )
                for t in types
            }

        results[split] = split_results

    # hallucinations summary, keyed by type (sorted for deterministic output)
    results["hallucinations"] = {
        prompt_type: sorted(hallus[prompt_type]) for prompt_type in hallus
    }

    # ensure output directory exists ("" falls back to the current directory)
    output_path = Path(output_directory or "")
    if not output_path.is_dir():
        output_path.mkdir(parents=True)

    # save evaluation to json file
    # NOTE(review): isoformat() timestamps contain ":" which is invalid in
    # Windows filenames — confirm target platforms before relying on this name
    file_name = f"libhallubench_eval_{datetime.now().isoformat()}.json"
    file_path = str(output_path / file_name)
    save_json(data=results, file_path=file_path)

    print(f"Success! Results saved to: {file_path}")
    return results
|