judgeval 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
Files changed (68)
  1. judgeval/__init__.py +0 -71
  2. judgeval/clients.py +14 -3
  3. judgeval/common/tracer.py +57 -31
  4. judgeval/constants.py +1 -0
  5. judgeval/data/__init__.py +2 -1
  6. judgeval/data/scorer_data.py +2 -2
  7. judgeval/evaluation_run.py +16 -15
  8. judgeval/judges/__init__.py +2 -2
  9. judgeval/judges/base_judge.py +1 -1
  10. judgeval/judges/litellm_judge.py +2 -2
  11. judgeval/judges/mixture_of_judges.py +2 -2
  12. judgeval/judges/together_judge.py +2 -2
  13. judgeval/judges/utils.py +4 -4
  14. judgeval/judgment_client.py +67 -15
  15. judgeval/run_evaluation.py +79 -14
  16. judgeval/scorers/__init__.py +8 -4
  17. judgeval/scorers/api_scorer.py +64 -0
  18. judgeval/scorers/base_scorer.py +3 -2
  19. judgeval/scorers/exceptions.py +11 -0
  20. judgeval/scorers/{custom_scorer.py → judgeval_scorer.py} +9 -5
  21. judgeval/scorers/judgeval_scorers/__init__.py +132 -9
  22. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
  23. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
  24. judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py} +2 -2
  25. judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py} +2 -2
  26. judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py} +2 -2
  27. judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py} +2 -2
  28. judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py} +2 -2
  29. judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py} +2 -2
  30. judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py} +7 -7
  31. judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py} +2 -2
  32. judgeval/scorers/judgeval_scorers/{tool_correctness.py → api_scorers/tool_correctness.py} +2 -2
  33. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
  34. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
  35. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
  36. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
  37. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
  38. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
  39. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
  40. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
  41. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
  42. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
  43. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
  44. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
  45. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
  46. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
  47. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
  48. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
  49. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
  50. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
  51. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
  52. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
  53. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
  54. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
  55. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
  56. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
  57. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
  58. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
  59. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
  60. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
  61. judgeval/scorers/prompt_scorer.py +4 -4
  62. judgeval/scorers/score.py +14 -14
  63. judgeval/scorers/utils.py +40 -6
  64. {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/METADATA +1 -1
  65. judgeval-0.0.5.dist-info/RECORD +78 -0
  66. judgeval-0.0.3.dist-info/RECORD +0 -46
  67. {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/WHEEL +0 -0
  68. {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/licenses/LICENSE.md +0 -0
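
The headline change in 0.0.5 is the split of judgeval/scorers/judgeval_scorers/ into api_scorers/ (the previously existing scorers, moved alongside the new judgeval/scorers/api_scorer.py base) and a new local_implementations/ tree of prompt-based scorers that run against a judge model directly. The two largest additions, the local summarization and tool-correctness scorers, are reproduced in full below; the sketch that follows shows how they might be imported using only the module paths visible in this file list (the re-exports added to the various __init__.py files are not shown in this diff, so deep imports are used).

    # Import sketch based solely on the module paths listed above; the package-level
    # re-exports may offer shorter paths, but their contents are not shown in this diff.
    from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import (
        SummarizationScorer,
    )
    from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness import (
        ToolCorrectnessScorer,
    )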
judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py
@@ -0,0 +1,541 @@
+ from typing import List, Optional, Union
+ import asyncio
+
+ from judgeval.scorers.utils import (
+     get_or_create_event_loop,
+     scorer_progress_meter,
+     create_verbose_logs,
+     parse_response_json,
+     check_example_params
+ )
+ from judgeval.scorers import JudgevalScorer
+ from judgeval.judges import JudgevalJudge
+ from judgeval.judges.utils import create_judge
+ from judgeval.data import Example, ExampleParams
+ from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.prompts import (
+     FaithfulnessTemplate,
+     Claims
+ )
+ # Wildcard import supplies SummarizationTemplate, ScoreType, InfoCoverageVerdict,
+ # ContradictionVerdict, Verdicts, Questions, Answers, and Reason used below.
+ from judgeval.scorers.judgeval_scorers.local_implementations.summarization.prompts import *
+
+
+ required_params = [
+     ExampleParams.INPUT,
+     ExampleParams.ACTUAL_OUTPUT,
+ ]
+
+
+ class SummarizationScorer(JudgevalScorer):
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         n: int = 5,
+         model: Optional[Union[str, JudgevalJudge]] = None,
+         assessment_questions: Optional[List[str]] = None,
+         include_reason: bool = True,
+         async_mode=True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+     ):
+         self.threshold = 1 if strict_mode else threshold
+         self.model, self.using_native_model = create_judge(model)
+         self.evaluation_model = self.model.get_model_name()
+
+         if assessment_questions is not None and len(assessment_questions) == 0:
+             self.assessment_questions = None
+         else:
+             self.assessment_questions = assessment_questions
+
+         self.include_reason = include_reason
+         self.n = n
+         self.async_mode = async_mode
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+
+     def score_example(
+         self,
+         example: Example,
+         _show_indicator: bool = True,
+     ) -> float:
+         check_example_params(example, required_params, self)
+         try:
+             with scorer_progress_meter(self, display_meter=_show_indicator):
+                 if self.async_mode:
+                     loop = get_or_create_event_loop()
+                     loop.run_until_complete(
+                         self.a_score_example(example, _show_indicator=False)
+                     )
+                 else:
+                     self.claims: List[str] = self._generate_claims(
+                         example.actual_output
+                     )
+
+                     self.info_coverage_verdicts: List[InfoCoverageVerdict] = (
+                         self._generate_info_coverage_verdicts(example)
+                     )
+
+                     self.contradiction_verdicts: List[ContradictionVerdict] = (
+                         self._generate_contradiction_verdicts(example)
+                     )
+
+                     contradiction_score = self._calculate_score(ScoreType.CONTRADICTION)
+                     info_coverage_score = self._calculate_score(ScoreType.INFO_COVERAGE)
+                     self.score_breakdown = {
+                         ScoreType.CONTRADICTION.value: contradiction_score,
+                         ScoreType.INFO_COVERAGE.value: info_coverage_score,
+                     }
+                     self.score = min(contradiction_score, info_coverage_score)
+                     self.reason = self._generate_reason()
+                     self.success = self.score >= self.threshold
+                     self.verbose_logs = create_verbose_logs(
+                         self,
+                         steps=[
+                             f"Claims:\n{self.claims}",
+                             f"Assessment Questions:\n{self.assessment_questions}",
+                             f"Info Coverage Verdicts:\n{[v.model_dump() for v in self.info_coverage_verdicts]}",
+                             f"Contradiction Verdicts:\n{[v.model_dump() for v in self.contradiction_verdicts]}",
+                             f"Score: {self.score}\nReason: {self.reason}",
+                         ],
+                     )
+
+                 return self.score
+         except Exception as e:
+             print(f"Error in SummarizationScorer score_example: {e}")
+             raise
+
+     async def a_score_example(
+         self,
+         example: Example,
+         _show_indicator: bool = True,
+     ) -> float:
+         """
+         To score, we take the following steps:
+         1. Generate claims from the actual output
+             - Extract key factual claims from the summary text
+
+         2. Generate info coverage verdicts:
+             a. Generate assessment questions if not provided
+             b. Generate answers to the assessment questions for both summary and original text
+             c. Compare answers to determine if summary adequately covers key information
+             d. Calculate info coverage score based on matching answers
+
+         3. Generate contradiction verdicts:
+             a. Generate claims from the actual output
+             b. Verify each claim against the original text for factual accuracy
+             c. Calculate contradiction score based on verified claims
+
+         4. Calculate final score:
+             - Take minimum of info coverage and contradiction scores
+             - Generate reason explaining the scoring
+             - Check if score meets threshold for success
+         """
+         check_example_params(example, required_params, self)
+         try:
+             with scorer_progress_meter(
+                 self,
+                 async_mode=True,
+                 display_meter=_show_indicator,
+             ):
+                 self.claims = await self._a_generate_claims(example.actual_output)
+
+                 self.info_coverage_verdicts, self.contradiction_verdicts = await asyncio.gather(
+                     self._a_generate_info_coverage_verdicts(example),
+                     self._a_generate_contradiction_verdicts(example),
+                 )
+
+                 contradiction_score = self._calculate_score(ScoreType.CONTRADICTION)
+                 info_coverage_score = self._calculate_score(ScoreType.INFO_COVERAGE)
+                 self.score_breakdown = {
+                     ScoreType.CONTRADICTION.value: contradiction_score,
+                     ScoreType.INFO_COVERAGE.value: info_coverage_score,
+                 }
+                 self.score = min(contradiction_score, info_coverage_score)
+                 self.reason = await self._a_generate_reason()
+                 self.success = self.score >= self.threshold
+                 self.verbose_logs = create_verbose_logs(
+                     self,
+                     steps=[
+                         f"Claims:\n{self.claims}",
+                         f"Assessment Questions:\n{self.assessment_questions}",
+                         f"Info Coverage Verdicts:\n{[v.model_dump() for v in self.info_coverage_verdicts]}",
+                         f"Contradiction Verdicts:\n{[v.model_dump() for v in self.contradiction_verdicts]}",
+                         f"Score: {self.score}\nReason: {self.reason}",
+                     ],
+                 )
+
+                 return self.score
+         except Exception as e:
+             print(f"Error in SummarizationScorer a_score_example: {e}")
+             raise
+
+     async def _a_generate_reason(self) -> str:
+         if self.include_reason is False:
+             return None
+
+         contradictions = []
+         redundancies = []
+         for verdict in self.contradiction_verdicts:
+             if verdict.verdict.strip().lower() == "no":
+                 contradictions.append(verdict.reason)
+             elif verdict.verdict.strip().lower() == "idk":
+                 redundancies.append(verdict.reason)
+
+         questions = []
+         if self.info_coverage_verdicts:
+             for verdict in self.info_coverage_verdicts:
+                 if (
+                     verdict.original_verdict.strip().lower() == "yes"
+                     and verdict.summary_verdict.strip().lower() == "no"
+                 ):
+                     questions.append(verdict.question)
+
+         prompt: str = SummarizationTemplate.generate_reason(
+             contradictions=contradictions,
+             redundancies=redundancies,
+             questions=questions,
+             score=format(self.score, ".2f"),
+         )
+
+         if len(questions) > 0:
+             prompt += f"""Questions the original text can answer but not the summary:
+ {questions}
+
+ """
+         prompt += """JSON:
+ """
+
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             data = parse_response_json(res, self)
+             return data["reason"]
+         else:
+             try:
+                 res: Reason = await self.model.a_generate(prompt, schema=Reason)
+                 return res.reason
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["reason"]
+
+     def _generate_reason(self) -> str:
+         if self.include_reason is False:
+             return None
+
+         contradictions = []
+         redundancies = []
+         for verdict in self.contradiction_verdicts:
+             if verdict.verdict.strip().lower() == "no":
+                 contradictions.append(verdict.reason)
+             elif verdict.verdict.strip().lower() == "idk":
+                 redundancies.append(verdict.reason)
+
+         questions = []
+         if self.info_coverage_verdicts:
+             for verdict in self.info_coverage_verdicts:
+                 if (
+                     verdict.original_verdict.strip().lower() == "yes"
+                     and verdict.summary_verdict.strip().lower() == "no"
+                 ):
+                     questions.append(verdict.question)
+
+         prompt: str = SummarizationTemplate.generate_reason(
+             contradictions=contradictions,
+             redundancies=redundancies,
+             questions=questions,
+             score=format(self.score, ".2f"),
+         )
+
+         if len(questions) > 0:
+             prompt += f"""Questions the original text can answer but not the summary:
+ {questions}
+
+ """
+         prompt += """JSON:
+ """
+
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             data = parse_response_json(res, self)
+             return data["reason"]
+         else:
+             try:
+                 res: Reason = self.model.generate(prompt, schema=Reason)
+                 return res.reason
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["reason"]
+
+     def _calculate_score(self, score_type: ScoreType) -> float:
+         if score_type == ScoreType.CONTRADICTION:
+             total = len(self.contradiction_verdicts)
+             if total == 0:
+                 return 0
+             faithfulness_count = 0
+             for verdict in self.contradiction_verdicts:
+                 # Different from the faithfulness score, this
+                 # penalizes 'idk' (full of fluff) summaries
+                 if verdict.verdict.strip().lower() == "yes":
+                     faithfulness_count += 1
+
+             score = faithfulness_count / total
+
+         else:
+             if self.assessment_questions is None:
+                 return 1
+             total = 0
+             coverage_count = 0
+             for verdict in self.info_coverage_verdicts:
+                 if verdict.original_verdict.strip().lower() == "yes":
+                     total += 1
+                     if verdict.summary_verdict.strip().lower() == "yes":
+                         coverage_count += 1
+
+             if total == 0:
+                 return 0
+
+             score = coverage_count / total
+
+         return 0 if self.strict_mode and score < self.threshold else score
+
+     async def _a_generate_answers(self, text: str) -> List[str]:
+         prompt = SummarizationTemplate.generate_answers(
+             questions=self.assessment_questions, text=text
+         )
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             data = parse_response_json(res, self)
+             return data["answers"]
+         else:
+             try:
+                 res: Answers = await self.model.a_generate(
+                     prompt, schema=Answers
+                 )
+                 return res.answers
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["answers"]
+
+     def _generate_answers(self, text: str) -> List[str]:
+         prompt = SummarizationTemplate.generate_answers(
+             questions=self.assessment_questions, text=text
+         )
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             data = parse_response_json(res, self)
+             return data["answers"]
+         else:
+             try:
+                 res: Answers = self.model.generate(prompt, schema=Answers)
+                 return res.answers
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["answers"]
+
+     async def _a_generate_assessment_questions(self, text: str):
+         prompt = SummarizationTemplate.generate_questions(text=text, n=self.n)
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             data = parse_response_json(res, self)
+             return data["questions"]
+         else:
+             try:
+                 res: Questions = await self.model.a_generate(
+                     prompt, schema=Questions
+                 )
+                 return res.questions
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["questions"]
+
+     def _generate_assessment_questions(self, text: str):
+         prompt = SummarizationTemplate.generate_questions(text=text, n=self.n)
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             data = parse_response_json(res, self)
+             return data["questions"]
+         else:
+             try:
+                 res: Questions = self.model.generate(prompt, schema=Questions)
+                 return res.questions
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["questions"]
+
+     async def _a_generate_info_coverage_verdicts(
+         self, example: Example
+     ) -> List[InfoCoverageVerdict]:
+         if self.assessment_questions is None:
+             self.assessment_questions = (
+                 await self._a_generate_assessment_questions(example.input)
+             )
+
+         tasks = [
+             self._a_generate_answers(example.input),
+             self._a_generate_answers(example.actual_output),
+         ]
+         results = await asyncio.gather(*tasks)
+         original_answers = results[0]
+         summary_answers = results[1]
+
+         if len(original_answers) != len(summary_answers):
+             raise ValueError(
+                 "Number of answers generated for the original text does not match the number generated for the summary."
+             )
+
+         coverage_verdicts: List[InfoCoverageVerdict] = []
+         for i in range(len(original_answers)):
+             coverage_verdicts.append(
+                 InfoCoverageVerdict(
+                     summary_verdict=summary_answers[i],
+                     original_verdict=original_answers[i],
+                     question=self.assessment_questions[i],
+                 )
+             )
+         return coverage_verdicts
+
+     def _generate_info_coverage_verdicts(
+         self, example: Example
+     ) -> List[InfoCoverageVerdict]:
+         if self.assessment_questions is None:
+             self.assessment_questions = self._generate_assessment_questions(
+                 example.input
+             )
+
+         original_answers = self._generate_answers(example.input)
+         summary_answers = self._generate_answers(example.actual_output)
+
+         if len(original_answers) != len(summary_answers):
+             raise ValueError(
+                 "Number of answers generated for the original text does not match the number generated for the summary."
+             )
+
+         coverage_verdicts: List[InfoCoverageVerdict] = []
+         for i in range(len(original_answers)):
+             coverage_verdicts.append(
+                 InfoCoverageVerdict(
+                     summary_verdict=summary_answers[i],
+                     original_verdict=original_answers[i],
+                     question=self.assessment_questions[i],
+                 )
+             )
+
+         return coverage_verdicts
+
+     async def _a_generate_contradiction_verdicts(
+         self,
+         example: Example,
+     ) -> List[ContradictionVerdict]:
+         if len(self.claims) == 0:
+             return []
+
+         verdicts: List[ContradictionVerdict] = []
+
+         prompt = SummarizationTemplate.generate_contradiction_verdicts(
+             original_text=example.input,
+             summary_claims=self.claims
+         )
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             data = parse_response_json(res, self)
+             verdicts = [
+                 ContradictionVerdict(**item)
+                 for item in data["verdicts"]
+             ]
+             return verdicts
+         else:
+             try:
+                 res: Verdicts = await self.model.a_generate(
+                     prompt, schema=Verdicts
+                 )
+                 verdicts = [item for item in res.verdicts]
+                 return verdicts
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = parse_response_json(res, self)
+                 verdicts = [
+                     ContradictionVerdict(**item)
+                     for item in data["verdicts"]
+                 ]
+                 return verdicts
+
+     def _generate_contradiction_verdicts(
+         self,
+         example: Example,
+     ) -> List[ContradictionVerdict]:
+         if len(self.claims) == 0:
+             return []
+
+         verdicts: List[ContradictionVerdict] = []
+
+         prompt = SummarizationTemplate.generate_contradiction_verdicts(
+             original_text=example.input,
+             summary_claims=self.claims
+         )
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             data = parse_response_json(res, self)
+             verdicts = [
+                 ContradictionVerdict(**item)
+                 for item in data["verdicts"]
+             ]
+             return verdicts
+         else:
+             try:
+                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                 verdicts = [item for item in res.verdicts]
+                 return verdicts
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = parse_response_json(res, self)
+                 verdicts = [
+                     ContradictionVerdict(**item)
+                     for item in data["verdicts"]
+                 ]
+                 return verdicts
+
+     async def _a_generate_claims(self, text: str) -> List[str]:
+         # Borrow faithfulness template since it already works
+         prompt = FaithfulnessTemplate.find_claims(text=text)
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             data = parse_response_json(res, self)
+             return data["claims"]
+         else:
+             try:
+                 res: Claims = await self.model.a_generate(prompt, schema=Claims)
+                 return res.claims
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["claims"]
+
+     def _generate_claims(self, text: str) -> List[str]:
+         # Borrow faithfulness template
+         prompt = FaithfulnessTemplate.find_claims(text=text)
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             data = parse_response_json(res, self)
+             return data["claims"]
+         else:
+             try:
+                 res: Claims = self.model.generate(prompt, schema=Claims)
+                 return res.claims
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["claims"]
+
+     def _success_check(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Summarization"
+
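
A minimal usage sketch for the new local SummarizationScorer follows, assuming judgeval.data.Example accepts input (the original text) and actual_output (the candidate summary) as keyword arguments and that create_judge can resolve a judge from a model-name string; the field names come from required_params and the attribute accesses above, and the model name is purely illustrative.

    # Hypothetical usage; the Example kwargs and the model name are assumptions.
    from judgeval.data import Example
    from judgeval.scorers.judgeval_scorers.local_implementations.summarization.summarization_scorer import (
        SummarizationScorer,
    )

    example = Example(
        input="<original document text>",          # text that was summarized
        actual_output="<model-written summary>",   # summary being scored
    )

    scorer = SummarizationScorer(threshold=0.5, n=5, model="gpt-4o")
    score = scorer.score_example(example)  # min(contradiction, info coverage) in [0, 1]
    print(score, scorer.score_breakdown, scorer.reason)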
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py
@@ -0,0 +1,3 @@
+ from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.tool_correctness_scorer import ToolCorrectnessScorer
+
+ __all__ = ["ToolCorrectnessScorer"]
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py
@@ -0,0 +1,151 @@
+ from typing import List, Union
+
+ from judgeval.scorers.utils import (
+     scorer_progress_meter,
+     create_verbose_logs,
+     parse_response_json,
+     check_example_params
+ )
+ from judgeval.data import Example, ExampleParams
+ from judgeval.scorers import JudgevalScorer
+
+
+ required_params = [
+     ExampleParams.INPUT,
+     ExampleParams.ACTUAL_OUTPUT,
+     ExampleParams.EXPECTED_TOOLS,
+     ExampleParams.TOOLS_CALLED,
+ ]
+
+
+ def get_lcs(seq1, seq2):
+     # Longest common subsequence of the two tool sequences, via dynamic programming.
+     m, n = len(seq1), len(seq2)
+     dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+     for i in range(1, m + 1):
+         for j in range(1, n + 1):
+             if seq1[i - 1] == seq2[j - 1]:
+                 dp[i][j] = dp[i - 1][j - 1] + 1
+             else:
+                 dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+     # Reconstruct the LCS
+     lcs = []
+     i, j = m, n
+     while i > 0 and j > 0:
+         if seq1[i - 1] == seq2[j - 1]:
+             lcs.append(seq1[i - 1])
+             i -= 1
+             j -= 1
+         elif dp[i - 1][j] > dp[i][j - 1]:
+             i -= 1
+         else:
+             j -= 1
+
+     return lcs[::-1]
+
+
+ class ToolCorrectnessScorer(JudgevalScorer):
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         include_reason: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+         should_exact_match: bool = False,
+         should_consider_ordering: bool = False,
+     ):
+         self.threshold = 1 if strict_mode else threshold
+         self.include_reason = include_reason
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+         self.should_exact_match = should_exact_match
+         self.should_consider_ordering = should_consider_ordering
+
+     def measure(
+         self,
+         example: Example,
+         _show_indicator: bool = True,
+     ) -> float:
+         check_example_params(example, required_params, self)
+
+         with scorer_progress_meter(self, display_meter=_show_indicator):
+             self.tools_called: List[str] = example.tools_called
+             self.expected_tools: List[str] = example.expected_tools
+             self.score = self._calculate_score()
+             self.reason = self._generate_reason()
+             self.success = self.score >= self.threshold
+             self.verbose_logs = create_verbose_logs(
+                 self,
+                 steps=[
+                     f"Expected Tools:\n{self.expected_tools}",
+                     f"Tools Called:\n{self.tools_called}",
+                     f"Score: {self.score}\nReason: {self.reason}",
+                 ],
+             )
+             return self.score
+
+     async def a_measure(
+         self, test_case: Example, _show_indicator: bool = True
+     ) -> float:
+         check_example_params(test_case, required_params, self)
+         return self.measure(test_case, _show_indicator=_show_indicator)
+
+     def _generate_reason(self):
+         if self.should_exact_match:
+             return f"{'Exact match' if self.tools_called == self.expected_tools else 'Not an exact match'}: expected {self.expected_tools}, called {self.tools_called}."
+
+         elif self.should_consider_ordering:
+             lcs = get_lcs(self.expected_tools, self.tools_called)
+             missing = set(self.expected_tools) - set(self.tools_called)
+             out_of_order = set(self.expected_tools) - set(lcs)
+
+             if len(lcs) == len(self.expected_tools):
+                 return f"Correct ordering: all expected tools {self.expected_tools} were called in the correct order."
+             else:
+                 issues = []
+                 if missing:
+                     issues.append(f"missing tools {list(missing)}")
+                 if out_of_order:
+                     issues.append(f"out-of-order tools {list(out_of_order)}")
+                 return f"Incorrect tool usage: {' and '.join(issues)}; expected {self.expected_tools}, called {self.tools_called}."
+
+         else:
+             used_expected = set(self.tools_called).intersection(
+                 set(self.expected_tools)
+             )
+             missing = set(self.expected_tools) - used_expected
+
+             if len(used_expected) == len(self.expected_tools):
+                 return f"All expected tools {self.expected_tools} were called (order not considered)."
+             else:
+                 return f"Incomplete tool usage: missing tools {list(missing)}; expected {self.expected_tools}, called {self.tools_called}."
+
+     def _calculate_score(self):
+         if self.should_exact_match:
+             return 1.0 if self.tools_called == self.expected_tools else 0.0
+
+         elif self.should_consider_ordering:
+             longest_common_subsequence = get_lcs(
+                 self.expected_tools, self.tools_called
+             )
+             score = len(longest_common_subsequence) / len(self.expected_tools)
+
+         else:
+             used_expected_tools = set(self.tools_called).intersection(
+                 set(self.expected_tools)
+             )
+             score = len(used_expected_tools) / len(self.expected_tools)
+         return 0 if self.strict_mode and score < self.threshold else score
+
+     def _success_check(self) -> bool:
+         try:
+             self.success = self.score >= self.threshold
+         except:
+             self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Tool Correctness"
+
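
And a matching sketch for ToolCorrectnessScorer, assuming Example accepts expected_tools and tools_called as lists of tool-name strings (the scorer compares the two sequences directly); with should_consider_ordering=True the score is the length of their longest common subsequence divided by the number of expected tools.

    # Hypothetical usage; the Example kwargs are assumptions, the scoring math is from above.
    from judgeval.data import Example
    from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness import (
        ToolCorrectnessScorer,
    )

    example = Example(
        input="Book a table for two and email me the confirmation",
        actual_output="Booked and confirmation sent.",
        expected_tools=["search_restaurants", "book_table", "send_email"],
        tools_called=["search_restaurants", "send_email", "book_table"],
    )

    scorer = ToolCorrectnessScorer(threshold=1.0, should_consider_ordering=True)
    score = scorer.measure(example)  # LCS of length 2 out of 3 expected tools -> ~0.67
    print(score, scorer.reason)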