valor-lite 0.37.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of valor-lite might be problematic.
- valor_lite/LICENSE +21 -0
- valor_lite/__init__.py +0 -0
- valor_lite/cache/__init__.py +11 -0
- valor_lite/cache/compute.py +154 -0
- valor_lite/cache/ephemeral.py +302 -0
- valor_lite/cache/persistent.py +529 -0
- valor_lite/classification/__init__.py +14 -0
- valor_lite/classification/annotation.py +45 -0
- valor_lite/classification/computation.py +378 -0
- valor_lite/classification/evaluator.py +879 -0
- valor_lite/classification/loader.py +97 -0
- valor_lite/classification/metric.py +535 -0
- valor_lite/classification/numpy_compatibility.py +13 -0
- valor_lite/classification/shared.py +184 -0
- valor_lite/classification/utilities.py +314 -0
- valor_lite/exceptions.py +20 -0
- valor_lite/object_detection/__init__.py +17 -0
- valor_lite/object_detection/annotation.py +238 -0
- valor_lite/object_detection/computation.py +841 -0
- valor_lite/object_detection/evaluator.py +805 -0
- valor_lite/object_detection/loader.py +292 -0
- valor_lite/object_detection/metric.py +850 -0
- valor_lite/object_detection/shared.py +185 -0
- valor_lite/object_detection/utilities.py +396 -0
- valor_lite/schemas.py +11 -0
- valor_lite/semantic_segmentation/__init__.py +15 -0
- valor_lite/semantic_segmentation/annotation.py +123 -0
- valor_lite/semantic_segmentation/computation.py +165 -0
- valor_lite/semantic_segmentation/evaluator.py +414 -0
- valor_lite/semantic_segmentation/loader.py +205 -0
- valor_lite/semantic_segmentation/metric.py +275 -0
- valor_lite/semantic_segmentation/shared.py +149 -0
- valor_lite/semantic_segmentation/utilities.py +88 -0
- valor_lite/text_generation/__init__.py +15 -0
- valor_lite/text_generation/annotation.py +56 -0
- valor_lite/text_generation/computation.py +611 -0
- valor_lite/text_generation/llm/__init__.py +0 -0
- valor_lite/text_generation/llm/exceptions.py +14 -0
- valor_lite/text_generation/llm/generation.py +903 -0
- valor_lite/text_generation/llm/instructions.py +814 -0
- valor_lite/text_generation/llm/integrations.py +226 -0
- valor_lite/text_generation/llm/utilities.py +43 -0
- valor_lite/text_generation/llm/validators.py +68 -0
- valor_lite/text_generation/manager.py +697 -0
- valor_lite/text_generation/metric.py +381 -0
- valor_lite-0.37.1.dist-info/METADATA +174 -0
- valor_lite-0.37.1.dist-info/RECORD +49 -0
- valor_lite-0.37.1.dist-info/WHEEL +5 -0
- valor_lite-0.37.1.dist-info/top_level.txt +1 -0
valor_lite/text_generation/llm/generation.py
@@ -0,0 +1,903 @@
from typing import Any, Callable

from valor_lite.text_generation.llm.exceptions import InvalidLLMResponseError
from valor_lite.text_generation.llm.instructions import (
    format_answer_correctness_verdicts_instruction,
    format_answer_relevance_verdicts_instruction,
    format_bias_verdicts_instruction,
    format_claims_instruction,
    format_context_precision_verdicts_instruction,
    format_context_recall_verdicts_instruction,
    format_context_relevance_verdicts_instruction,
    format_faithfulness_verdicts_instruction,
    format_hallucination_verdicts_instruction,
    format_opinions_instruction,
    format_statements_instruction,
    format_summary_coherence_instruction,
    format_toxicity_verdicts_instruction,
)
from valor_lite.text_generation.llm.integrations import ClientWrapper
from valor_lite.text_generation.llm.utilities import (
    find_first_signed_integer,
    trim_and_load_json,
)
from valor_lite.text_generation.llm.validators import (
    validate_statements,
    validate_verdicts,
)


def _generate(
    client: ClientWrapper,
    messages: list[dict[str, str]],
    keys: set[str],
    validator: Callable,
    allowed_values: set[str] | None = None,
    enforce_length: int | None = None,
) -> dict[str, Any]:
    """
    Query the LLM client.

    Parameters
    ----------
    client : ClientWrapper
        The LLM client.
    messages : list[dict[str, str]]
        A formatted list of messages for the LLM.
    keys : set[str]
        The keys used to extract results from the LLM's response.
    validator : Callable
        Specifies a validator to use on the response.
    allowed_values : set[str], optional
        An optional set of values to restrict the results to.
    enforce_length : int, optional
        An optional integer that enforces the length of the result.
    """
    response = client(messages)
    response = trim_and_load_json(response)
    for key in keys:
        validator(
            response=response,
            key=key,
            allowed_values=allowed_values,
            enforce_length=enforce_length,
        )
    return response


def generate_claims(
    client: ClientWrapper,
    system_prompt: str,
    text: str,
) -> list[str]:
    """
    Generate a list of claims from a piece of text, using a call to the LLM API.

    Example Text: "Einstein won the Nobel Prize in 1921 for his discovery of the photoelectric effect."

    Example JSON Response:
    {
        "claims": [
            "Einstein won the Nobel Prize for his discovery of the photoelectric effect.",
            "Einstein won the Nobel Prize in 1921."
        ]
    }

    Parameters
    ----------
    text: str
        The text to extract claims from.

    Returns
    -------
    list[str]
        The list of claims extracted from the text.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_claims_instruction(text=text),
        },
    ]
    response = _generate(
        client=client,
        messages=messages,
        keys={"claims"},
        validator=validate_statements,
    )
    return response["claims"]


def generate_opinions(
    client: ClientWrapper,
    system_prompt: str,
    text: str,
) -> list[str]:
    """
    Generate a list of opinions from a piece of text, using a call to the LLM API.

    Example Text: "Although most people live in cities, I like living in the countryside. CNN thinks that the government is not doing enough to combat climate change. Earth is the smallest planet in our solar system."

    Example JSON response:
    {
        "opinions": [
            "I like living in the countryside."
        ]
    }

    Parameters
    ----------
    text: str
        The text to extract opinions from.

    Returns
    -------
    list[str]
        The list of opinions extracted from the text.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_opinions_instruction(text=text),
        },
    ]
    response = _generate(
        client=client,
        messages=messages,
        keys={"opinions"},
        validator=validate_statements,
    )
    return response["opinions"]


def generate_statements(
    client: ClientWrapper,
    system_prompt: str,
    text: str,
) -> list[str]:
    """
    Generate a list of statements from a piece of text, using a call to the LLM API.

    Example Text: "These shoes? All of our shoes have a thirty day return policy and can be returned for a full refund!"

    Example JSON Response:
    {
        "statements": [
            "These shoes?",
            "All of our shoes have a thirty day return policy",
            "All of our shoes can be returned for a full refund"
        ]
    }

    Parameters
    ----------
    text: str
        The text to extract statements from.

    Returns
    -------
    list[str]
        The list of statements extracted from the text.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_statements_instruction(text=text),
        },
    ]
    response = _generate(
        client=client,
        messages=messages,
        keys={"statements"},
        validator=validate_statements,
    )
    return response["statements"]


def generate_answer_correctness_verdicts(
    client: ClientWrapper,
    system_prompt: str,
    query: str,
    prediction_statements: list[str],
    groundtruth_statements: list[str],
) -> dict[str, list[str]]:
    """
    Generate lists of true positives, false positives and false negatives, using a call to the LLM API.

    Example Query: What is the boiling point of water?

    Example Prediction Statements: [
        "The boiling point of water is 100 degrees Celsius at sea level",
        "The melting point of water is 0 degrees Celsius!"
    ]

    Example Ground Truth Statements: [
        "The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level.",
        "The boiling point of water can change with altitude."
    ]

    Example JSON response:
    {
        "TP": [
            "The boiling point of water is 100 degrees Celsius at sea level"
        ],
        "FP": [
            "The melting point of water is 0 degrees Celsius!"
        ],
        "FN": [
            "The boiling point of water can change with altitude."
        ]
    }

    Parameters
    ----------
    query: str
        The query that both the prediction and ground truth should be answering.
    prediction_statements: list[str]
        The prediction statements to evaluate.
    groundtruth_statements: list[str]
        The ground truth statements to evaluate.

    Returns
    -------
    dict[str, list[str]]
        A dictionary of true positives, false positives and false negatives.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_answer_correctness_verdicts_instruction(
                query=query,
                prediction_statements=prediction_statements,
                groundtruth_statements=groundtruth_statements,
            ),
        },
    ]
    response = _generate(
        client=client,
        messages=messages,
        keys={"TP", "FP", "FN"},
        validator=validate_statements,
    )

    if len(response["TP"]) + len(response["FP"]) != len(prediction_statements):
        raise InvalidLLMResponseError(
            f"Number of true positives and false positives did not match the number of prediction statements: {response}"
        )

    if len(response["FN"]) > len(groundtruth_statements):
        raise InvalidLLMResponseError(
            f"Number of false negatives exceeded the number of ground truth statements '{len(groundtruth_statements)}': {response}"
        )

    return response


def generate_answer_relevance_verdicts(
    client: ClientWrapper,
    system_prompt: str,
    query: str,
    statements: list[str],
) -> list[dict[str, str]]:
    """
    Generate a list of answer relevance verdicts for a list of statements, using a call to the LLM API.

    Example Query: What should I do if there is an earthquake?

    Example Statements: ["Shoes.", "Thanks for asking the question!", "Earthquake frequency varies by region.", "Duck and hide"]

    Example JSON response:
    {
        "verdicts": [
            {
                "analysis": "The 'Shoes.' statement is completely irrelevant to the query, which asks about what to do in the event of an earthquake.",
                "verdict": "no"
            },
            {
                "analysis": "This statement refers to the query but does not answer the question.",
                "verdict": "idk"
            },
            {
                "analysis": "The statement is about earthquakes, but it does not provide any advice. The statement could be used as a supporting point for some advice, though, so the relevance is unclear.",
                "verdict": "idk"
            },
            {
                "analysis": "This statement is an answer to the question and provides relevant advice.",
                "verdict": "yes"
            }
        ]
    }

    Parameters
    ----------
    query: str
        The query to evaluate the statements against.
    statements: list[str]
        The statements to evaluate the validity of.

    Returns
    -------
    list[dict[str,str]]
        The list of verdicts for each statement. Each verdict is a dictionary with the "verdict" field.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_answer_relevance_verdicts_instruction(
                query=query,
                statements=statements,
            ),
        },
    ]
    response = _generate(
        client=client,
        messages=messages,
        keys={"verdicts"},
        allowed_values={"yes", "no", "idk"},
        enforce_length=len(statements),
        validator=validate_verdicts,
    )
    return response["verdicts"]


def generate_bias_verdicts(
    client: ClientWrapper,
    system_prompt: str,
    opinions: list[str],
) -> list[dict[str, str]]:
    """
    Generate a list of bias verdicts for a list of opinions, using a call to the LLM API.

    Example Opinions: [
        "Government meddling in healthcare bloats costs and quashes innovation.",
        "Different models of government involvement in healthcare aim to balance accessibility, cost, and quality, each with its own merits and challenges.",
        "Men and women should have equal opportunities in the workforce."
    ]

    Example JSON:
    {
        "verdicts": [
            {
                "analysis": "The opinion 'Government meddling in healthcare bloats costs and quashes innovation' reveals a political bias, emphasizing negative views on government involvement.",
                "verdict": "yes"
            },
            {
                "analysis": "This opinion doesn't show any political bias.",
                "verdict": "no"
            },
            {
                "analysis": "This opinion in favor of 'equal opportunities in the workforce' for men and women does not demonstrate any gender bias.",
                "verdict": "no"
            },
        ]
    }

    Parameters
    ----------
    opinions: list[str]
        The opinions to evaluate the bias of.

    Returns
    -------
    list[dict[str,str]]
        The list of verdicts for each opinion. Each verdict is a dictionary with the "verdict" field.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_bias_verdicts_instruction(
                opinions=opinions,
            ),
        },
    ]
    return _generate(
        client=client,
        messages=messages,
        keys={"verdicts"},
        allowed_values={"yes", "no"},
        enforce_length=len(opinions),
        validator=validate_verdicts,
    )["verdicts"]


def generate_context_precision_verdicts(
    client: ClientWrapper,
    system_prompt: str,
    query: str,
    ordered_context_list: list[str],
    groundtruth: str,
) -> list[dict[str, str]]:
    """
    Generate a list of context precision verdicts for an ordered list of contexts,
    using a call to the LLM API.

    The verdict for each context should be 'yes' if the context is relevant to
    produce the ground truth answer to the query. The verdict should be 'no'
    otherwise.

    Example Query: "Who won the Nobel Prize in 1921 and for what?"

    Example Context List: [
        "Einstein won the Nobel Prize for his discovery of the photoelectric effect",
        "Einstein won the Nobel Prize in 1921.",
        "Einstein was born in 1879 in Germany.",
    ]

    Example Ground Truth: "Einstein won the Nobel Prize in 1921 for his discovery of the photoelectric effect."

    Example JSON:
    {
        "verdicts": [
            {
                "analysis": "The reason why Einstein won the Nobel Prize answers the second part of the query.",
                "verdict": "yes"
            },
            {
                "reason": "The context answers who won the prize in 1921.",
                "verdict": "yes"
            },
            {
                "reason": "Einstein's birth year is not mentioned in the ground truth answer, so this context is not useful for producing the ground truth.",
                "verdict": "no"
            }
        ]
    }

    Parameters
    ----------
    query: str
        The query.
    ordered_context_list: list[str]
        The ordered list of contexts. Each context will be evaluated to determine if it is useful for producing the ground truth answer to the query.
    groundtruth: str
        The ground truth answer to the query.

    Returns
    -------
    list[dict[str,str]]
        The list of verdicts for each context. Each verdict is a dictionary with the "verdict" field.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_context_precision_verdicts_instruction(
                query=query,
                ordered_context_list=ordered_context_list,
                groundtruth=groundtruth,
            ),
        },
    ]
    return _generate(
        client=client,
        messages=messages,
        keys={"verdicts"},
        allowed_values={"yes", "no"},
        enforce_length=len(ordered_context_list),
        validator=validate_verdicts,
    )["verdicts"]


def generate_context_recall_verdicts(
    client: ClientWrapper,
    system_prompt: str,
    context_list: list[str],
    groundtruth_statements: list[str],
) -> list[dict[str, str]]:
    """
    Generate a list of context recall verdicts for a list of ground truth statements, using a call to the LLM API.

    The verdict for each ground truth statement should be 'yes' if the ground truth statement is attributable to the context list and 'no' otherwise.

    Example Context List: [
        "Albert Einstein (14 March 1879 - 18 April 1955) was a German-born theoretical
        physicist, widely held to be one of the greatest and most influential scientists
        of all time. Best known for developing the theory of relativity, he also made important
        contributions to quantum mechanics, and was thus a central figure in the revolutionary
        reshaping of the scientific understanding of nature that modern physics accomplished
        in the first decades of the twentieth century.",
        "Albert Einstein's mass-energy equivalence formula E = mc2, which arises from relativity theory,
        has been called 'the world's most famous equation'.", "Albert Einstein received the 1921 Nobel
        Prize in Physics 'for his services to theoretical physics, and especially for his discovery of
        the law of the photoelectric effect', a pivotal step in the development of quantum theory.
        His work is also known for its influence on the philosophy of science. In a 1999 poll of 130
        leading physicists worldwide by the British journal Physics World, Einstein was ranked the
        greatest physicist of all time. His intellectual achievements and originality have made Einstein
        synonymous with genius."
    ]

    Example Ground Truth Statements: [
        "Albert Einstein was born on 14 March 1879.",
        "Albert Einstein received the 1921 Nobel Prize in Physics for his services to theoretical physics.",
        "Einstein published 4 papers in 1905.",
        "Einstein moved to Switzerland in 1895."
    ]

    Example JSON:
    {
        "verdicts": [
            {
                "analysis": "The date of birth of Einstein is mentioned clearly in the context.",
                "verdict": "yes"
            },
            {
                "reason": "The statement matches exactly with part of a sentence present in the given context.",
                "verdict": "yes"
            },
            {
                "reason": "There is no mention about papers he wrote in the given context.",
                "verdict": "no"
            },
            {
                "reason": "There is no supporting evidence for a move to Switzerland in the given context.",
                "verdict": "no"
            }
        ]
    }

    Parameters
    ----------
    context_list: list[str]
        The list of contexts to evaluate against.
    groundtruth_statements: list[str]
        A list of statements extracted from the ground truth answer.

    Returns
    -------
    list[dict[str,str]]
        The list of verdicts for each ground truth statement. Each verdict is a dictionary with the "verdict" field.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_context_recall_verdicts_instruction(
                context_list=context_list,
                groundtruth_statements=groundtruth_statements,
            ),
        },
    ]
    return _generate(
        client=client,
        messages=messages,
        keys={"verdicts"},
        allowed_values={"yes", "no"},
        enforce_length=len(groundtruth_statements),
        validator=validate_verdicts,
    )["verdicts"]


def generate_context_relevance_verdicts(
    client: ClientWrapper,
    system_prompt: str,
    query: str,
    context_list: list[str],
) -> list[dict[str, str]]:
    """
    Generate a list of context relevance verdicts for a list of contexts, using a call to the LLM API.

    Example Query: "What were some of Einstein's achievements?"

    Example Context List: [
        "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1921. He had a cat.",
        "Einstein was born in 1879 in Germany.",
    ]

    Example JSON:
    {
        "verdicts": [
            {
                "analysis": "Einstein's Nobel Prize and discovery of the photoelectric effect are achievements.",
                "verdict": "yes"
            },
            {
                "analysis": "The year and country of Einstein's birth is irrelevant to the question.",
                "verdict": "no"
            },
        ]
    }

    Parameters
    ----------
    query: str
        The query to evaluate each context against.
    context_list: list[str]
        The ordered list of contexts to evaluate the relevance of.

    Returns
    -------
    list[dict[str,str]]
        The list of verdicts for each context. Each verdict is a dictionary with the "verdict" field.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_context_relevance_verdicts_instruction(
                query=query,
                context_list=context_list,
            ),
        },
    ]
    return _generate(
        client=client,
        messages=messages,
        keys={"verdicts"},
        allowed_values={"yes", "no"},
        enforce_length=len(context_list),
        validator=validate_verdicts,
    )["verdicts"]


def generate_faithfulness_verdicts(
    client: ClientWrapper,
    system_prompt: str,
    claims: list[str],
    context_list: list[str],
) -> list[dict[str, str]]:
    """
    Generate a list of faithfulness verdicts for a list of claims, using a call to the LLM API.

    Example Context List: [
        "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1921.",
        "Einstein was a German Scientist.",
    ]

    Example Claims: [
        "Barack Obama was an American president.",
        "Zurich is a city in London",
        "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.",
        "Einstein won the Nobel Prize in 1922 for his discovery of the photoelectric effect.",
        "Einstein was a German chef.",
    ]

    Example JSON response:
    {
        "verdicts": [
            {
                "analysis": "Barack Obama is not mentioned in the context list. Therefore, this claim is not faithful to the context.",
                "verdict": "no"
            },
            {
                "analysis": "Zurich is not mentioned in the context list. Therefore, this claim is not faithful.",
                "verdict": "no"
            },
            {
                "analysis": "Einstein's Nobel Prize is mentioned in the context. The claim and context agree that Einstein won the Nobel Prize for his discovery of the photoelectric effect. Therefore this claim is faithful.",
                "verdict": "yes"
            },
            {
                "analysis": "Einstein's Nobel Prize is mentioned in the context. The context and claim give different years for the Nobel Prize, so the claim contradicts the context. Therefore, this claim is not faithful.",
                "verdict": "no"
            },
            {
                "analysis": "The claim and the context give different occupations for Einstein, so the claim is not faithful to the context.",
                "verdict": "no"
            },
        ]
    }

    Parameters
    ----------
    claims: list[str]
        The claims to evaluate the faithfulness of.
    context_list: list[str]
        The list of contexts to evaluate against.

    Returns
    -------
    list[dict[str,str]]
        The list of verdicts for each claim. Each verdict is a dictionary with one key "verdict".
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_faithfulness_verdicts_instruction(
                claims=claims,
                context_list=context_list,
            ),
        },
    ]
    return _generate(
        client=client,
        messages=messages,
        keys={"verdicts"},
        allowed_values={"yes", "no"},
        enforce_length=len(claims),
        validator=validate_verdicts,
    )["verdicts"]


def generate_hallucination_verdicts(
    client: ClientWrapper,
    system_prompt: str,
    text: str,
    context_list: list[str],
) -> list[dict[str, str]]:
    """
    Generate a list of hallucination verdicts for a list of contexts, using a call to the LLM API.

    The verdict for each context should be 'yes' if the text contradicts that context. The verdict should be 'no' otherwise.

    Example Context List: [
        "Einstein won the Nobel Prize for his discovery of the photoelectric effect.",
        "Einstein won the Nobel Prize in 1921.",
        "Einstein immigrated to the United States in 1933.",
    ]

    Example Text: "Einstein won the Nobel Prize in 1922 for his discovery of the photoelectric effect."

    Example JSON:
    {
        "verdicts": [
            {
                "analysis": "Both the text and the context agree that Einstein won the Nobel Prize for his discovery of the photoelectric effect.",
                "verdict": "no"
            },
            {
                "analysis": "The context states that Einstein won the Nobel Prize in 1921, but the text claims Einstein won the Nobel Prize in 1922.",
                "verdict": "yes"
            },
            {
                "analysis": "The text is unrelated to Einstein immigrating to the U.S., so the text does not contradict this context.",
                "verdict": "no"
            }
        ]
    }

    Parameters
    ----------
    text: str
        The text to evaluate for hallucination.
    context_list: list[str]
        The list of contexts to compare against.

    Returns
    -------
    list[dict[str,str]]
        The list of verdicts for each context. Each verdict is a dictionary with the "verdict" field.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_hallucination_verdicts_instruction(
                text=text,
                context_list=context_list,
            ),
        },
    ]
    return _generate(
        client=client,
        messages=messages,
        keys={"verdicts"},
        allowed_values={"yes", "no"},
        enforce_length=len(context_list),
        validator=validate_verdicts,
    )["verdicts"]


def generate_summary_coherence(
    client: ClientWrapper,
    system_prompt: str,
    text: str,
    summary: str,
) -> int:
    """
    Compute summary coherence, the collective quality of a summary.

    Parameters
    ----------
    text: str
        The text that was summarized.
    summary: str
        The summary to be evaluated.

    Returns
    -------
    int
        The summary coherence score, an integer with 1 indicating the lowest summary coherence and 5 the highest summary coherence.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_summary_coherence_instruction(
                text=text, summary=summary
            ),
        },
    ]

    response = client(messages)

    ret = find_first_signed_integer(response)
    if ret is None:
        raise InvalidLLMResponseError(
            f"LLM response was not a valid summary coherence score: {response}"
        )
    if ret not in {1, 2, 3, 4, 5}:
        raise InvalidLLMResponseError(
            f"Summary coherence score was not an integer between 1 and 5: {ret}"
        )
    return ret


def generate_toxicity_verdicts(
    client: ClientWrapper,
    system_prompt: str,
    opinions: list[str],
) -> list[dict[str, str]]:
    """
    Generate a list of toxicity verdicts for a list of opinions, using a call to the LLM API.

    Example Opinions: [
        "That vegan garbage place is an insult to real food. Only pretentious losers would waste their money on tasteless weeds.",
        "I'm not personally interested in trying the new vegan restaurant, as I prefer dishes with meat.",
        "Sarah always meant well, but you couldn't help but sigh when she volunteered for a project.",
    ]

    Example JSON response:
    {
        "verdicts": [
            {
                "analysis": "The opinion uses words like 'garbage' and 'tasteless weeds', ridicules vegan food and those who choose it, and attacks people who enjoy vegan food, calling them 'pretentious losers'. Furthermore, it dismisses the entire concept of vegan cuisine with a sweeping generalization, making it seem worthless.",
                "verdict": "yes"
            },
            {
                "analysis": "The opinion is respectful and does not attack anyone or anything. It simply states a personal preference.",
                "verdict": "no"
            },
            {
                "analysis": "'Sarah always meant well' sounds positive but is undermined by the surrounding criticism such as 'couldn't help but sigh', which can be considered a personal attack.",
                "verdict": "yes"
            }
        ]
    }

    Parameters
    ----------
    opinions: list[str]
        The opinions to evaluate the toxicity of.

    Returns
    -------
    list[dict[str,str]]
        The list of verdicts for each opinion. Each verdict is a dictionary with the "verdict" field.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": format_toxicity_verdicts_instruction(
                opinions=opinions,
            ),
        },
    ]
    return _generate(
        client=client,
        messages=messages,
        keys={"verdicts"},
        allowed_values={"yes", "no"},
        enforce_length=len(opinions),
        validator=validate_verdicts,
    )["verdicts"]
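
For orientation, every generate_* helper in this file follows the same calling convention: it wraps a system prompt and a formatted user instruction into a message list, hands that list to the client through _generate, and expects the client to return raw text that parses into JSON with the advertised keys. The sketch below is a minimal, hypothetical illustration of that convention, not part of the package. It assumes the hunk above is valor_lite/text_generation/llm/generation.py (as the file listing indicates), that any object callable on the message list and returning response text can stand in for ClientWrapper, and that trim_and_load_json and validate_statements accept the canned JSON shown; the stub client, prompt, and example text are invented for illustration.

# Hypothetical usage sketch; nothing below ships with valor-lite.
from valor_lite.text_generation.llm.generation import generate_claims


class StubClient:
    # Stand-in for a ClientWrapper: called with the message list, returns raw LLM text.
    def __call__(self, messages: list[dict[str, str]]) -> str:
        # A real client would forward `messages` to an LLM API; here we return canned JSON
        # shaped like the "Example JSON Response" in the generate_claims docstring.
        return '{"claims": ["Einstein won the Nobel Prize in 1921."]}'


claims = generate_claims(
    client=StubClient(),  # assumed to satisfy the ClientWrapper calling convention
    system_prompt="You are an evaluator that replies only with JSON.",
    text="Einstein won the Nobel Prize in 1921 for the photoelectric effect.",
)
print(claims)  # expected: ['Einstein won the Nobel Prize in 1921.']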