libhallubench-0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
libhallubench/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """LibHalluBench - Library Hallucinations Adversarial Benchmark."""
+
+ from libhallubench.evaluate import evaluate_responses
+ from libhallubench.load import load_dataset, save_dataset
+ from libhallubench.mitigation import MitigationStrategy
+ from libhallubench.pypi import download_pypi_data
+
+
+ __all__ = [
+     "load_dataset",
+     "save_dataset",
+     "download_pypi_data",
+     "evaluate_responses",
+     "MitigationStrategy",
+ ]
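The module above is the package's top-level `__init__`, re-exporting the public API listed in `__all__`. As a rough usage sketch of that surface (a sketch only, not documented usage; it assumes the bundled dataset files and the default PyPI data location):

    from libhallubench import download_pypi_data, load_dataset

    # refresh the local PyPI snapshot used as ground truth for hallucination checks
    download_pypi_data()

    # load the benchmark prompts; as evaluate.py below shows, records are grouped
    # by split and each record carries an "id" and a "type"
    dataset = load_dataset()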
libhallubench/cli.py ADDED
@@ -0,0 +1,64 @@
1
+ """The command line interface entry-point for evaluating LibHalluBench benchmark responses."""
2
+
3
+ import argparse
4
+ from argparse import ArgumentParser
5
+
6
+ from libhallubench.evaluate import evaluate_responses
7
+
8
+
9
+ # default value for optional arguments
10
+ _DEFAULT_ARG = object()
11
+
12
+
13
+ # create the main argument parser
14
+ parser = ArgumentParser(
15
+ argument_default=_DEFAULT_ARG,
16
+ description="Evaluate LLM responses against the LibHalluBench benchmark.",
17
+ )
18
+
19
+ parser.add_argument(
20
+ "responses-file",
21
+ type=str,
22
+ help="Path to the benchmark responses file to evaluate (.jsonl).",
23
+ )
24
+
25
+ parser.add_argument(
26
+ "-r",
27
+ "--refresh-pypi-data",
28
+ action=argparse.BooleanOptionalAction,
29
+ help="Whether to refresh the PyPI data before evaluation.",
30
+ )
31
+
32
+ parser.add_argument(
33
+ "-g",
34
+ "--ground-truth-file",
35
+ type=str,
36
+ help="Path to the PyPI ground truth data file (.json).",
37
+ )
38
+
39
+ parser.add_argument(
40
+ "-o",
41
+ "--output-directory",
42
+ type=str,
43
+ help="Path to directory to save the evaluation results.",
44
+ )
45
+
46
+
47
+ def main():
48
+ """Evaluate responses generated from the LibHalluBench benchmark."""
49
+
50
+ # parse command line arguments
51
+ args = parser.parse_args()
52
+ kwargs = vars(args)
53
+
54
+ # remove arguments where the method default value should be used
55
+ kwargs = {
56
+ k.replace("-", "_"): v for k, v in kwargs.items() if v is not _DEFAULT_ARG
57
+ }
58
+
59
+ # run the code with the kwargs
60
+ evaluate_responses(**kwargs)
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main()
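Because `main()` forwards the filtered arguments straight to `evaluate_responses`, a command along the lines of `python -m libhallubench.cli responses.jsonl -g pypi_data.json -o results` (file names here are hypothetical) should behave roughly like the direct call sketched below; any flag left off the command line is dropped from `kwargs`, so the function's own defaults apply:

    from libhallubench.evaluate import evaluate_responses

    # equivalent direct call to the CLI invocation above (paths are hypothetical)
    evaluate_responses(
        responses_file="responses.jsonl",
        ground_truth_file="pypi_data.json",
        output_directory="results",
    )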
libhallubench/evaluate.py ADDED
@@ -0,0 +1,247 @@
+ """Code to evaluate results from running the LibHalluBench benchmark."""
+
+ from collections import defaultdict
+ from datetime import datetime
+ from pathlib import Path
+
+ from libhallubench.libraries import check_for_unknown_libraries
+ from libhallubench.load import load_dataset
+ from libhallubench.pypi import download_pypi_data
+ from llm_cgr import load_jsonl, save_json
+
+
+ def _load_benchmark() -> dict[str, dict]:
+     """
+     Load the LibHalluBench benchmark dataset from the bundled split files.
+
+     Returns a dictionary keyed by task id.
+     """
+     dataset = load_dataset()
+     # flatten all splits into a single dict keyed by task id
+     return {record["id"]: record for records in dataset.values() for record in records}
+
+
+ def _load_responses(file_path: str) -> dict[str, list[str]]:
+     """
+     Load model responses from a JSONL file.
+
+     Supports two formats per line:
+     - {"id": "0001", "responses": ["r1", "r2"]} (list of responses)
+     - {"id": "0001", "response": "r1"} (single response, collated by id)
+
+     Returns a dictionary mapping task ids to lists of responses.
+     """
+     records = load_jsonl(file_path=file_path)
+     responses: dict[str, list[str]] = defaultdict(list)
+
+     for record in records:
+         task_id = record["id"]
+
+         # support both "responses" (list) and "response" (single) formats
+         if "responses" in record:
+             responses[task_id].extend(record["responses"])
+         elif "response" in record:
+             responses[task_id].append(record["response"])
+         else:
+             raise ValueError(
+                 f"Record {task_id} has neither 'responses' nor 'response' field."
+             )
+
+     return dict(responses)
+
+
+ def _calculate_stats(
+     task_counts: dict[str, int],
+     task_hallus: dict[str, set[str]],
+     response_counts: dict[str, int],
+     response_hallus: dict[str, set[str]],
+ ) -> dict:
+     """
+     Calculate aggregate hallucination statistics from the provided counts.
+
+     Returns a dictionary of task and response hallucination counts and rates.
+     """
+     total_tasks = sum(task_counts.values())
+     total_hallucinated_tasks = sum(len(v) for v in task_hallus.values())
+     total_responses = sum(response_counts.values())
+     total_hallucinated_responses = sum(len(v) for v in response_hallus.values())
+
+     return {
+         "task_count": total_hallucinated_tasks,
+         "task_total": total_tasks,
+         "task_rate": total_hallucinated_tasks / total_tasks if total_tasks > 0 else 0,
+         "response_count": total_hallucinated_responses,
+         "response_total": total_responses,
+         "response_rate": (
+             total_hallucinated_responses / total_responses if total_responses > 0 else 0
+         ),
+     }
+
+
+ def _calculate_type_stats(
+     prompt_type: str,
+     task_counts: dict[str, int],
+     task_hallus: dict[str, set[str]],
+     response_counts: dict[str, int],
+     response_hallus: dict[str, set[str]],
+ ) -> dict:
+     """
+     Calculate hallucination statistics for a single prompt type.
+
+     Returns a dictionary of task and response hallucination counts and rates.
+     """
+     t_count = task_counts.get(prompt_type, 0)
+     t_hallus = len(task_hallus.get(prompt_type, set()))
+     r_count = response_counts.get(prompt_type, 0)
+     r_hallus = len(response_hallus.get(prompt_type, set()))
+
+     return {
+         "task_count": t_hallus,
+         "task_total": t_count,
+         "task_rate": t_hallus / t_count if t_count > 0 else 0,
+         "response_count": r_hallus,
+         "response_total": r_count,
+         "response_rate": r_hallus / r_count if r_count > 0 else 0,
+     }
+
+
+ # mapping from split to its constituent types
+ _SPLIT_TYPES: dict[str, list[str]] = {
+     "control": ["none"],
+     "describe": [
+         "from 2023",
+         "from 2024",
+         "from 2025",
+         "hidden gem",
+         "lesser known",
+         "not widely used",
+     ],
+     "specify": [
+         "1 character typo",
+         "2-8 character typo",
+         "fake library",
+     ],
+ }
+
+
+ def evaluate_responses(
+     responses_file: str,
+     refresh_pypi_data: bool = False,
+     ground_truth_file: str | None = None,
+     output_directory: str = "output",
+ ) -> dict:
+     """
+     Evaluate LLM responses to the LibHalluBench benchmark dataset, detecting hallucinations
+     and saving calculated statistics to file.
+
+     Returns a dictionary with keys for each split and a hallucinations summary.
+     """
+     print(f"Evaluating benchmark responses from file {responses_file}")
+     print(
+         f"Parameters: {refresh_pypi_data=}, {ground_truth_file=}, {output_directory=}"
+     )
+
+     # refresh the pypi data if requested
+     if refresh_pypi_data:
+         if ground_truth_file:
+             download_pypi_data(destination=ground_truth_file)
+         else:
+             download_pypi_data()
+
+     # load benchmark and results data
+     benchmark = _load_benchmark()
+     results_data = _load_responses(file_path=responses_file)
+
+     # validate the provided responses file
+     if any(key not in benchmark for key in results_data.keys()):
+         raise ValueError(
+             "The results file contains keys that do not match benchmark task ids."
+         )
+
+     # initialise per-type tracking
+     response_counts: defaultdict[str, int] = defaultdict(int)
+     response_hallus: defaultdict[str, set[str]] = defaultdict(set)
+     task_counts: defaultdict[str, int] = defaultdict(int)
+     task_hallus: defaultdict[str, set[str]] = defaultdict(set)
+     hallus: defaultdict[str, set[str]] = defaultdict(set)
+
+     # loop through responses for each benchmark record
+     for bench_id, responses in results_data.items():
+         if not responses:
+             continue
+
+         # access the benchmark record and its type
+         bench_record = benchmark[bench_id]
+         prompt_type = bench_record["type"]
+
+         # increment counts
+         task_counts[prompt_type] += 1
+         response_counts[prompt_type] += len(responses)
+
+         # check each response for hallucinations
+         for _idx, _response in enumerate(responses):
+             _hallus = check_for_unknown_libraries(
+                 response=_response,
+                 ground_truth_file=ground_truth_file,
+             )
+
+             # update hallucinations
+             if _hallus:
+                 response_hallus[prompt_type].add(f"{bench_id}_{_idx}")
+                 task_hallus[prompt_type].add(bench_id)
+                 hallus[prompt_type].update(_hallus)
+
+     # build results structured by split
+     results: dict = {}
+
+     for split, types in _SPLIT_TYPES.items():
+         # filter counts to only include types in this split
+         split_task_counts = {t: task_counts[t] for t in types if t in task_counts}
+         split_task_hallus = {t: task_hallus[t] for t in types if t in task_hallus}
+         split_resp_counts = {
+             t: response_counts[t] for t in types if t in response_counts
+         }
+         split_resp_hallus = {
+             t: response_hallus[t] for t in types if t in response_hallus
+         }
+
+         # aggregate stats for the split
+         split_results: dict = _calculate_stats(
+             task_counts=split_task_counts,
+             task_hallus=split_task_hallus,
+             response_counts=split_resp_counts,
+             response_hallus=split_resp_hallus,
+         )
+
+         # per-type stats (only for splits with multiple types)
+         if len(types) > 1:
+             split_results["types"] = {
+                 t: _calculate_type_stats(
+                     prompt_type=t,
+                     task_counts=task_counts,
+                     task_hallus=task_hallus,
+                     response_counts=response_counts,
+                     response_hallus=response_hallus,
+                 )
+                 for t in types
+             }
+
+         results[split] = split_results
+
+     # hallucinations summary, keyed by type
+     results["hallucinations"] = {
+         prompt_type: sorted(hallus[prompt_type]) for prompt_type in hallus
+     }
+
+     # ensure output directory exists
+     output_path = Path(output_directory or "")
+     if not output_path.is_dir():
+         output_path.mkdir(parents=True)
+
+     # save evaluation to json file
+     file_name = f"libhallubench_eval_{datetime.now().isoformat()}.json"
+     file_path = str(output_path / file_name)
+     save_json(data=results, file_path=file_path)
+
+     print(f"Success! Results saved to: {file_path}")
+     return results
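To make the evaluation's input and output concrete, here is a small sketch of the record shapes `_load_responses` accepts and the top-level structure `evaluate_responses` returns; the ids, counts, and rates shown are illustrative values only, not real benchmark data:

    # accepted .jsonl record shapes, one JSON object per line
    {"id": "0001", "responses": ["first answer", "second answer"]}
    {"id": "0002", "response": "single answer"}

    # shape of the returned results dictionary (illustrative numbers)
    {
        "control": {
            "task_count": 1, "task_total": 50, "task_rate": 0.02,
            "response_count": 2, "response_total": 250, "response_rate": 0.008,
        },
        "describe": {...},  # multi-type splits also carry a per-type "types" breakdown
        "specify": {...},
        "hallucinations": {"none": ["somefakepackage"]},  # hallucinated names per prompt type
    }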