valor-lite 0.33.14__py3-none-any.whl → 0.33.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,609 @@
+ import evaluate
+ from nltk.tokenize import RegexpTokenizer
+ from nltk.translate import bleu_score
+ from valor_lite.text_generation.llm.generation import (
+     generate_answer_correctness_verdicts,
+     generate_answer_relevance_verdicts,
+     generate_bias_verdicts,
+     generate_claims,
+     generate_context_precision_verdicts,
+     generate_context_recall_verdicts,
+     generate_context_relevance_verdicts,
+     generate_faithfulness_verdicts,
+     generate_hallucination_verdicts,
+     generate_opinions,
+     generate_statements,
+     generate_summary_coherence,
+     generate_toxicity_verdicts,
+ )
+ from valor_lite.text_generation.llm.integrations import ClientWrapper
+
+
+ def calculate_answer_correctness(
+     client: ClientWrapper,
+     system_prompt: str,
+     query: str,
+     response: str,
+     groundtruths: list[str],
+ ) -> float:
+     """
+     Compute answer correctness. Answer correctness is computed as an f1 score obtained
+     by comparing prediction statements to ground truth statements.
+
+     If there are multiple ground truths, then the f1 score is computed for each ground
+     truth and the maximum score is returned.
+
+     This metric was adapted from RAGAS. We follow a similar prompting strategy and
+     computation; however, we do not do a weighted sum with an answer similarity score
+     using embeddings.
+
+     Parameters
+     ----------
+     client : ClientWrapper
+         The LLM client used to perform evaluation.
+     system_prompt : str
+         A system prompt to pass with the evaluation query.
+     query : str
+         The user query.
+     response : str
+         A generated response.
+     groundtruths : list[str]
+         A list of ground truth references.
+
+     Returns
+     -------
+     float
+         The answer correctness score between 0 and 1. Higher values indicate that the
+         answer is more correct. A score of 1 indicates that all statements in the
+         prediction are supported by the ground truth and all statements in the ground
+         truth are present in the prediction.
+     """
+     prediction_statements = generate_statements(
+         client=client,
+         system_prompt=system_prompt,
+         text=response,
+     )
+     f1_scores = [0.0]
+     for groundtruth in groundtruths:
+         groundtruth_statements = generate_statements(
+             client=client,
+             system_prompt=system_prompt,
+             text=groundtruth,
+         )
+         verdicts = generate_answer_correctness_verdicts(
+             client=client,
+             system_prompt=system_prompt,
+             query=query,
+             groundtruth_statements=groundtruth_statements,
+             prediction_statements=prediction_statements,
+         )
+
+         tp = len(verdicts["TP"])
+         fp = len(verdicts["FP"])
+         fn = len(verdicts["FN"])
+
+         f1_scores.append(tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0)
+
+     return max(f1_scores)
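For intuition, the f1 aggregation above can be traced by hand. The following standalone sketch is illustrative only (it is not part of the package) and uses hypothetical verdict counts:

# Hypothetical verdict counts for one ground truth.
tp, fp, fn = 4, 1, 2  # supported prediction statements, unsupported prediction statements, missing ground truth statements
f1 = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0  # 4 / (4 + 1.5) = 0.7272...
# With multiple ground truths, one f1 value is computed per ground truth and the maximum is returned.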
+
+
+ def calculate_answer_relevance(
+     client: ClientWrapper,
+     system_prompt: str,
+     query: str,
+     response: str,
+ ) -> float:
+     """
+     Compute answer relevance, the proportion of the model response that is
+     relevant to the query, for a single piece of text.
+
+     Parameters
+     ----------
+     client : ClientWrapper
+         The LLM client used to perform evaluation.
+     system_prompt : str
+         A system prompt to pass with the evaluation query.
+     query : str
+         The user query.
+     response : str
+         A generated response.
+
+     Returns
+     -------
+     float
+         The answer relevance score between 0 and 1. A score of 1 indicates that all
+         statements are relevant to the query.
+     """
+     statements = generate_statements(
+         client=client,
+         system_prompt=system_prompt,
+         text=response,
+     )
+     verdicts = generate_answer_relevance_verdicts(
+         client=client,
+         system_prompt=system_prompt,
+         query=query,
+         statements=statements,
+     )
+     if len(verdicts) == 0:
+         return 0.0
+
+     return sum(verdict["verdict"] == "yes" for verdict in verdicts) / len(
+         verdicts
+     )
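A minimal usage sketch (illustrative only, not part of the package), assuming an already-constructed ClientWrapper instance named client and a system prompt string named system_prompt, both hypothetical here:

# Hypothetical client and inputs; any ClientWrapper implementation would do.
score = calculate_answer_relevance(
    client=client,
    system_prompt=system_prompt,
    query="What is the capital of France?",
    response="Paris is the capital of France. The Eiffel Tower is a popular attraction.",
)
# score is the fraction of generated statements judged relevant to the query, in [0, 1].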
+
+
+ def calculate_bias(
+     client: ClientWrapper,
+     system_prompt: str,
+     response: str,
+ ) -> float:
+     """
+     Compute bias, the proportion of model opinions that are biased.
+
+     Parameters
+     ----------
+     client : ClientWrapper
+         The LLM client used to perform evaluation.
+     system_prompt : str
+         A system prompt to pass with the evaluation query.
+     response : str
+         A generated response.
+
+     Returns
+     -------
+     float
+         The bias score between 0 and 1. A score of 1 indicates that all opinions in
+         the text are biased.
+     """
+
+     opinions = generate_opinions(
+         client=client,
+         system_prompt=system_prompt,
+         text=response,
+     )
+     if len(opinions) == 0:
+         return 0.0
+
+     verdicts = generate_bias_verdicts(
+         client=client,
+         system_prompt=system_prompt,
+         opinions=opinions,
+     )
+     return sum(verdict["verdict"] == "yes" for verdict in verdicts) / len(
+         verdicts
+     )
+
+
+ def calculate_context_precision(
+     client: ClientWrapper,
+     system_prompt: str,
+     query: str,
+     predicted_context: list[str],
+     groundtruth_context: list[str],
+ ) -> float:
+     """
+     Compute context precision, a score for evaluating the retrieval
+     mechanism of a RAG model.
+
+     First, an LLM is prompted to determine if each context in the context
+     list is useful for producing the ground truth answer to the query.
+
+     If there are multiple ground truths, then the verdict is "yes" for a
+     context if that context is useful for producing any of the ground truth
+     answers, and "no" otherwise.
+
+     Then, using these verdicts, the context precision score is computed as
+     a weighted sum of the precision at k for each k from 1 to the length
+     of the context list.
+
+     Note that the earlier a piece of context appears in the context list,
+     the more important it is in the computation of this score. For example,
+     the first context in the context list will be included in every precision
+     at k computation, so will have a large influence on the final score,
+     whereas the last context will only be used for the last precision at
+     k computation, so will have a small influence on the final score.
+
+     Parameters
+     ----------
+     client : ClientWrapper
+         The LLM client used to perform evaluation.
+     system_prompt : str
+         A system prompt to pass with the evaluation query.
+     query : str
+         The user query.
+     predicted_context : list[str]
+         A list of predicted context.
+     groundtruth_context : list[str]
+         A list of ground truth context.
+
+     Returns
+     -------
+     float
+         The context precision score between 0 and 1. A higher score indicates
+         better context precision.
+     """
+     if len(predicted_context) == 0 and len(groundtruth_context) == 0:
+         return 1.0
+     elif len(predicted_context) == 0 or len(groundtruth_context) == 0:
+         return 0.0
+
+     # Get verdicts for each ground truth, and aggregate by setting the verdict for
+     # a context to "yes" if the verdict is "yes" for any ground truth.
+     aggregate_verdicts = ["no"] * len(predicted_context)
+     for groundtruth in groundtruth_context:
+         verdicts = generate_context_precision_verdicts(
+             client=client,
+             system_prompt=system_prompt,
+             query=query,
+             ordered_context_list=predicted_context,
+             groundtruth=groundtruth,
+         )
+         for i in range(len(verdicts)):
+             if verdicts[i]["verdict"] == "yes":
+                 aggregate_verdicts[i] = "yes"
+
+     # Use the aggregate verdicts to compute the precision at k for each k.
+     precision_at_k_list = []
+     for k in range(1, len(predicted_context) + 1):
+         # Only compute the precision at k if the kth context is relevant.
+         if aggregate_verdicts[k - 1] == "yes":
+             precision_at_k = (
+                 sum(verdict == "yes" for verdict in aggregate_verdicts[:k]) / k
+             )
+             precision_at_k_list.append(precision_at_k)
+
+     # If none of the context are relevant, then the context precision is 0.
+     if len(precision_at_k_list) == 0:
+         return 0.0
+
+     # Average over all the precision at k for which the kth context is relevant.
+     return sum(precision_at_k_list) / len(precision_at_k_list)
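The precision-at-k averaging above is easiest to see with a small worked example (the aggregate verdicts are hypothetical):

# Suppose the aggregated verdicts for a three-item predicted context list are:
aggregate_verdicts = ["yes", "no", "yes"]
# k=1: relevant -> precision@1 = 1/1 = 1.0
# k=2: not relevant -> skipped
# k=3: relevant -> precision@3 = 2/3
# context precision = (1.0 + 2/3) / 2 = 0.8333...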
+
+
+ def calculate_context_recall(
+     client: ClientWrapper,
+     system_prompt: str,
+     predicted_context: list[str],
+     groundtruth_context: list[str],
+ ) -> float:
+     """
+     Compute context recall, a score for evaluating the retrieval mechanism of a RAG model.
+
+     The context recall score is the proportion of statements in the ground truth
+     that are attributable to the context list.
+
+     If multiple ground truths are provided, then the context recall score is
+     computed for each ground truth and the maximum score is returned.
+
+     Parameters
+     ----------
+     client : ClientWrapper
+         The LLM client used to perform evaluation.
+     system_prompt : str
+         A system prompt to pass with the evaluation query.
+     predicted_context : list[str]
+         A list of predicted context.
+     groundtruth_context : list[str]
+         A list of ground truth context.
+
+     Returns
+     -------
+     float
+         The context recall score between 0 and 1. A score of 1 indicates that
+         all ground truth statements are attributable to the contexts in the context list.
+     """
+     if len(predicted_context) == 0 and len(groundtruth_context) == 0:
+         return 1.0
+     elif len(predicted_context) == 0 or len(groundtruth_context) == 0:
+         return 0.0
+
+     scores = []
+     for groundtruth in groundtruth_context:
+         groundtruth_statements = generate_statements(
+             client=client,
+             system_prompt=system_prompt,
+             text=groundtruth,
+         )
+         verdicts = generate_context_recall_verdicts(
+             client=client,
+             system_prompt=system_prompt,
+             context_list=predicted_context,
+             groundtruth_statements=groundtruth_statements,
+         )
+         scores.append(
+             sum(verdict["verdict"] == "yes" for verdict in verdicts)
+             / len(verdicts)
+         )
+
+     return max(scores)
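A small worked example of the aggregation over multiple ground truths (the verdict counts are hypothetical):

# Ground truth 1: 2 of 4 statements attributable to the context list; ground truth 2: 3 of 3.
scores = [2 / 4, 3 / 3]  # 0.5 and 1.0
context_recall = max(scores)  # 1.0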
+
+
+ def calculate_context_relevance(
+     client: ClientWrapper,
+     system_prompt: str,
+     query: str,
+     context: list[str],
+ ) -> float:
+     """
+     Compute context relevance, the proportion of contexts in the context list
+     that are relevant to the query.
+
+     Parameters
+     ----------
+     client : ClientWrapper
+         The LLM client used to perform evaluation.
+     system_prompt : str
+         A system prompt to pass with the evaluation query.
+     query : str
+         The user query.
+     context : list[str]
+         A list of predicted context.
+
+     Returns
+     -------
+     float
+         The context relevance score between 0 and 1. A score of 0 indicates
+         that none of the contexts are relevant and a score of 1 indicates
+         that all of the contexts are relevant.
+     """
+     if len(context) == 0:
+         return 0.0
+     verdicts = generate_context_relevance_verdicts(
+         client=client,
+         system_prompt=system_prompt,
+         query=query,
+         context_list=context,
+     )
+     return sum(verdict["verdict"] == "yes" for verdict in verdicts) / len(
+         verdicts
+     )
+
+
+ def calculate_faithfulness(
+     client: ClientWrapper,
+     system_prompt: str,
+     response: str,
+     context: list[str],
+ ) -> float:
+     """
+     Compute the faithfulness score. The faithfulness score is the proportion
+     of claims in the text that are implied by the list of contexts. Claims
+     that contradict the list of contexts and claims that are unrelated to
+     the list of contexts both count against the score.
+
+     Parameters
+     ----------
+     client : ClientWrapper
+         The LLM client used to perform evaluation.
+     system_prompt : str
+         A system prompt to pass with the evaluation query.
+     response : str
+         A generated response.
+     context : list[str]
+         A list of predicted context.
+
+     Returns
+     -------
+     float
+         The faithfulness score between 0 and 1. A score of 1 indicates that
+         all claims in the text are implied by the list of contexts.
+     """
+     if len(context) == 0:
+         return 0.0
+
+     claims = generate_claims(
+         client=client, system_prompt=system_prompt, text=response
+     )
+
+     # If there aren't any claims, then the text is perfectly faithful, as the text does not contain any non-faithful claims.
+     if len(claims) == 0:
+         return 1.0
+
+     faithfulness_verdicts = generate_faithfulness_verdicts(
+         client=client,
+         system_prompt=system_prompt,
+         claims=claims,
+         context_list=context,
+     )
+     return sum(
+         verdict["verdict"] == "yes" for verdict in faithfulness_verdicts
+     ) / len(faithfulness_verdicts)
+
+
+ def calculate_hallucination(
+     client: ClientWrapper,
+     system_prompt: str,
+     response: str,
+     context: list[str],
+ ) -> float:
+     """
+     Compute the hallucination score, the proportion of contexts in the context
+     list that are contradicted by the text.
+
+     Parameters
+     ----------
+     client : ClientWrapper
+         The LLM client used to perform evaluation.
+     system_prompt : str
+         A system prompt to pass with the evaluation query.
+     response : str
+         A generated response.
+     context : list[str]
+         A list of predicted context.
+
+     Returns
+     -------
+     float
+         The hallucination score between 0 and 1. A score of 1 indicates that
+         all contexts are contradicted by the text.
+     """
+     if len(context) == 0:
+         raise ValueError("Hallucination requires context to be calculated.")
+
+     verdicts = generate_hallucination_verdicts(
+         client=client,
+         system_prompt=system_prompt,
+         text=response,
+         context_list=context,
+     )
+     return sum(verdict["verdict"] == "yes" for verdict in verdicts) / len(
+         verdicts
+     )
+
+
+ def calculate_summary_coherence(
+     client: ClientWrapper,
+     system_prompt: str,
+     text: str,
+     summary: str,
+ ) -> int:
+     """
+     Compute summary coherence, the collective quality of a summary.
+
+     Parameters
+     ----------
+     client : ClientWrapper
+         The LLM client used to perform evaluation.
+     system_prompt : str
+         A system prompt to pass with the evaluation query.
+     text : str
+         The original text.
+     summary : str
+         The generated summary.
+
+     Returns
+     -------
+     int
+         The summary coherence score between 1 and 5. A score of 1 indicates
+         the lowest summary coherence and a score of 5 indicates the highest
+         summary coherence.
+     """
+     return generate_summary_coherence(
+         client=client,
+         system_prompt=system_prompt,
+         text=text,
+         summary=summary,
+     )
+
+
+ def calculate_toxicity(
+     client: ClientWrapper,
+     system_prompt: str,
+     response: str,
+ ) -> float:
+     """
+     Compute toxicity, the proportion of opinions that are toxic.
+
+     Parameters
+     ----------
+     client : ClientWrapper
+         The LLM client used to perform evaluation.
+     system_prompt : str
+         A system prompt to pass with the evaluation query.
+     response : str
+         A generated response.
+
+     Returns
+     -------
+     float
+         The toxicity score between 0 and 1, with 1 indicating that all opinions
+         in the text are toxic.
+     """
+     opinions = generate_opinions(
+         client=client,
+         system_prompt=system_prompt,
+         text=response,
+     )
+     if len(opinions) == 0:
+         return 0.0
+
+     verdicts = generate_toxicity_verdicts(
+         client=client,
+         system_prompt=system_prompt,
+         opinions=opinions,
+     )
+     return sum(verdict["verdict"] == "yes" for verdict in verdicts) / len(
+         verdicts
+     )
+
+
+ def calculate_rouge_scores(
+     prediction: str,
+     references: str | list[str],
+     rouge_types: list[str],
+     use_stemmer: bool = False,
+ ) -> dict[str, float]:
+     """
+     Calculate ROUGE scores for a prediction given some set of references.
+
+     Parameters
+     ----------
+     prediction : str
+         A generated response to score. Each prediction should be a string with tokens separated by spaces.
+     references : str | list[str]
+         A reference, or list of references, for a given response. Each reference should be a string with tokens separated by spaces.
+     rouge_types : list[str]
+         A list of rouge types to calculate.
+     use_stemmer : bool, default=False
+         If True, uses Porter stemmer to strip word suffixes. Defaults to False.
+
+     Returns
+     -------
+     dict[str, float]
+         A mapping from each requested rouge type to its score.
+     """
+     rouge = evaluate.load("rouge")
+
+     metrics = rouge.compute(
+         predictions=[prediction],
+         references=[references],
+         rouge_types=rouge_types,
+         use_stemmer=use_stemmer,
+         use_aggregator=False,  # aggregation gives us an average across all predictions, which isn't what we want
+     )
+
+     # extract the score for the single prediction, flooring at 0.0
+     results = dict()
+     if metrics is not None:
+         for type_ in rouge_types:
+             results[type_] = max(metrics[type_][0], 0.0)
+     return results
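A minimal usage sketch with toy strings; the ROUGE types shown are the standard ones supported by the Hugging Face evaluate implementation loaded above:

scores = calculate_rouge_scores(
    prediction="the quick brown fox jumped over the lazy dog",
    references=["the quick brown fox jumps over the lazy dog"],
    rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
    use_stemmer=True,
)
# scores maps each requested ROUGE type to a float in [0, 1], e.g. scores["rouge1"].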
+
+
+ def calculate_sentence_bleu(
+     prediction: str,
+     references: list[str],
+     weights: tuple[float, ...] | list[float],
+ ) -> float:
+     """
+     Calculate the sentence BLEU score for a prediction-ground truth pair.
+
+     Parameters
+     ----------
+     prediction : str
+         A generated response to score. Each prediction should be a string with tokens separated by spaces.
+     references : list[str]
+         A list of references for a given response. Each reference should be a string with tokens separated by spaces.
+     weights : tuple[float, ...] | list[float]
+         The default BLEU calculates a score for up to 4-grams using uniform
+         weights (this is called BLEU-4). To evaluate your translations with
+         higher/lower order ngrams, use customized weights. Example: when accounting
+         for up to 5-grams with uniform weights (this is called BLEU-5), use [1/5]*5.
+
+     Returns
+     -------
+     float
+         The sentence BLEU score between 0 and 1.
+     """
+     if len(weights) == 0:
+         raise ValueError("At least one weight should be defined.")
+
+     tokenizer = RegexpTokenizer(
+         r"\w+|\$[\d]+|[^\s\.]+"
+     )  # regex tokenizer that ignores periods
+
+     tokenized_prediction = tokenizer.tokenize(prediction)
+     tokenized_references = [tokenizer.tokenize(ref) for ref in references]
+
+     # compute sentence BLEU for the tokenized prediction against the tokenized references
+     result = float(
+         bleu_score.sentence_bleu(
+             references=tokenized_references,
+             hypothesis=tokenized_prediction,
+             weights=weights,
+         ),  # type: ignore
+     )
+     return result if result >= 1e-9 else 0.0
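A minimal usage sketch; uniform 4-gram weights reproduce the standard BLEU-4 described in the docstring:

score = calculate_sentence_bleu(
    prediction="the cat sat on the mat",
    references=["the cat is sitting on the mat", "a cat sat on the mat"],
    weights=(0.25, 0.25, 0.25, 0.25),  # BLEU-4 with uniform weights
)
# score is a float in [0, 1]; values below 1e-9 are floored to 0.0.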
File without changes
@@ -0,0 +1,14 @@
+ class InvalidLLMResponseError(Exception):
+     """
+     Raised when the response from the LLM is invalid for a given metric computation.
+     """
+
+     pass
+
+
+ class MismatchingTextGenerationDatumError(Exception):
+     """
+     Raised when datums with the same uid but different text are added to the ValorTextGenerationStreamingManager.
+     """
+
+     pass
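For illustration only, a hedged sketch of how a caller might guard a metric computation against an invalid judge response (the client, system_prompt, and response variables are hypothetical):

try:
    score = calculate_bias(client=client, system_prompt=system_prompt, response=response)
except InvalidLLMResponseError:
    score = None  # the LLM returned output that could not be parsed for this metric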