judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py
@@ -1,550 +0,0 @@
- from typing import List, Optional, Union
- import asyncio
-
- from judgeval.constants import APIScorer
- from judgeval.scorers.utils import (get_or_create_event_loop,
-                                     scorer_progress_meter,
-                                     create_verbose_logs,
-                                     parse_response_json,
-                                     check_example_params
-                                     )
- from judgeval.scorers import JudgevalScorer
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.prompts import (
-     FaithfulnessTemplate,
-     Claims
- )
- from judgeval.scorers.judgeval_scorers.local_implementations.summarization.prompts import *
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
- ]
-
-
- class SummarizationScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         n: int = 5,
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         assessment_questions: Optional[List[str]] = None,
-         include_reason: bool = True,
-         async_mode=True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-     ):
-         super().__init__(
-             score_type=APIScorer.SUMMARIZATION,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.model, self.using_native_model = create_judge(model)
-         self.evaluation_model = self.model.get_model_name()
-
-         if assessment_questions is not None and len(assessment_questions) == 0:
-             self.assessment_questions = None
-         else:
-             self.assessment_questions = assessment_questions
-
-         self.include_reason = include_reason
-         self.n = n
-         self.async_mode = async_mode
-         self.strict_mode = strict_mode
-         self.verbose_mode = verbose_mode
-
-     def score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-         try:
-             with scorer_progress_meter(self, display_meter=_show_indicator):
-                 if self.async_mode:
-                     loop = get_or_create_event_loop()
-                     loop.run_until_complete(
-                         self.a_score_example(example, _show_indicator=False)
-                     )
-                 else:
-                     self.claims: List[str] = self._generate_claims(
-                         example.actual_output
-                     )
-
-                     self.info_coverage_verdicts: List[InfoCoverageVerdict] = (
-                         self._generate_info_coverage_verdicts(example)
-                     )
-
-                     self.contradiction_verdicts: List[ContradictionVerdict] = (
-                         self._generate_contradiction_verdicts(example)
-                     )
-
-                     contradiction_score = self._calculate_score(ScoreType.CONTRADICTION)
-                     info_coverage_score = self._calculate_score(ScoreType.INFO_COVERAGE)
-                     self.score_breakdown = {
-                         ScoreType.CONTRADICTION.value: contradiction_score,
-                         ScoreType.INFO_COVERAGE.value: info_coverage_score,
-                     }
-                     self.score = min(contradiction_score, info_coverage_score)
-                     self.reason = self._generate_reason()
-                     self.success = self.score >= self.threshold
-                     self.verbose_logs = create_verbose_logs(
-                         self,
-                         steps=[
-                             f"Claims:\n{self.claims}",
-                             f"Assessment Questions:\n{self.assessment_questions}",
-                             f"Info Coverage Verdicts:\n{[v.model_dump() for v in self.info_coverage_verdicts]}",
-                             f"Contradiction Verdicts:\n{[v.model_dump() for v in self.contradiction_verdicts]}",
-                             f"Score: {self.score}\nReason: {self.reason}",
-                         ],
-                     )
-
-                 return self.score
-         except Exception as e:
-             print(f"Error in SummarizationScorer score_example: {e}")
-             raise
-
-     async def a_score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         """
-         To score, we take the following steps:
-         1. Generate claims from the actual output
-             Extract key factual claims from the summary text
-
-         2. Generate info coverage verdicts:
-             a. Generate assessment questions if not provided
-             b. Generate answers to the assessment questions for both summary and original text
-             c. Compare answers to determine if summary adequately covers key information
-             d. Calculate info coverage score based on matching answers
-
-         3. Generate contradiction verdicts:
-             a. Generate claims from the actual output
-             b. Verify each claim against the original text for factual accuracy
-             c. Calculate contradiction score based on verified claims
-
-         4. Calculate final score:
-             Take minimum of info coverage and contradiction scores
-             Generate reason explaining the scoring
-             Check if score meets threshold for success
-         """
-         check_example_params(example, required_params, self)
-         try:
-             with scorer_progress_meter(
-                 self,
-                 async_mode=True,
-                 display_meter=_show_indicator,
-             ):
-                 self.claims = await self._a_generate_claims(example.actual_output),
-
-                 self.info_coverage_verdicts, self.contradiction_verdicts = await asyncio.gather(
-                     self._a_generate_info_coverage_verdicts(example),
-                     self._a_generate_contradiction_verdicts(example),
-                 )
-
-                 contradiction_score = self._calculate_score(ScoreType.CONTRADICTION)
-                 info_coverage_score = self._calculate_score(ScoreType.INFO_COVERAGE)
-                 self.score_breakdown = {
-                     ScoreType.CONTRADICTION.value: contradiction_score,
-                     ScoreType.INFO_COVERAGE.value: info_coverage_score,
-                 }
-                 self.score = min(contradiction_score, info_coverage_score)
-                 self.reason = await self._a_generate_reason()
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"Claims:\n{self.claims}",
-                         f"Assessment Questions:\n{self.assessment_questions}",
-                         f"Info Coverage Verdicts:\n{[v.model_dump() for v in self.info_coverage_verdicts]}",
-                         f"Contradiction Verdicts:\n{[v.model_dump() for v in self.contradiction_verdicts]}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-
-                 return self.score
-         except Exception as e:
-             print(f"Error in SummarizationScorer a_score_example: {e}")
-             raise
-
-     async def _a_generate_reason(self) -> str:
-         if self.include_reason is False:
-             return None
-
-         contradictions = []
-         redundancies = []
-         for verdict in self.contradiction_verdicts:
-             if verdict.verdict.strip().lower() == "no":
-                 contradictions.append(verdict.reason)
-             elif verdict.verdict.strip().lower() == "idk":
-                 redundancies.append(verdict.reason)
-
-         questions = []
-         if self.info_coverage_verdicts:
-             for verdict in self.info_coverage_verdicts:
-                 if (
-                     verdict.original_verdict.strip().lower() == "yes"
-                     and verdict.summary_verdict.strip().lower() == "no"
-                 ):
-                     questions.append(verdict.question)
-
-         prompt: dict = SummarizationTemplate.generate_reason(
-             contradictions=contradictions,
-             redundancies=redundancies,
-             questions=questions,
-             score=format(self.score, ".2f"),
-         )
-
-         if len(questions) > 0:
-             prompt += f"""Questions the original text can answer but not the summary:
- {questions}
-
- """
-         prompt += """JSON:
- """
-
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = await self.model.a_generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _generate_reason(self) -> str:
-         if self.include_reason is False:
-             return None
-
-         contradictions = []
-         redundancies = []
-         for verdict in self.contradiction_verdicts:
-             if verdict.verdict.strip().lower() == "no":
-                 contradictions.append(verdict.reason)
-             elif verdict.verdict.strip().lower() == "idk":
-                 redundancies.append(verdict.reason)
-
-         questions = []
-         if self.info_coverage_verdicts:
-             for verdict in self.info_coverage_verdicts:
-                 if (
-                     verdict.original_verdict.strip().lower() == "yes"
-                     and verdict.summary_verdict.strip().lower() == "no"
-                 ):
-                     questions.append(verdict.question)
-
-         prompt: dict = SummarizationTemplate.generate_reason(
-             contradictions=contradictions,
-             redundancies=redundancies,
-             questions=questions,
-             score=format(self.score, ".2f"),
-         )
-
-         if len(questions) > 0:
-             prompt += f"""Questions the original text can answer but not the summary:
- {questions}
-
- """
-         prompt += """JSON:
- """
-
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = self.model.generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _calculate_score(self, score_type: ScoreType) -> float:
-         if score_type == ScoreType.CONTRADICTION:
-             total = len(self.contradiction_verdicts)
-             if total == 0:
-                 return 0
-             faithfulness_count = 0
-             for verdict in self.contradiction_verdicts:
-                 # Different from the faithfulness score, this
-                 # penalizes 'idk' (full of fluff) summaries
-                 if verdict.verdict.strip().lower() == "yes":
-                     faithfulness_count += 1
-
-             score = faithfulness_count / total
-
-         else:
-             if self.assessment_questions is None:
-                 return 1
-             total = 0
-             coverage_count = 0
-             for verdict in self.info_coverage_verdicts:
-                 if verdict.original_verdict.strip().lower() == "yes":
-                     total += 1
-                     if verdict.summary_verdict.strip().lower() == "yes":
-                         coverage_count += 1
-
-             if total == 0:
-                 return 0
-
-             score = coverage_count / total
-
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     async def _a_generate_answers(self, text: str) -> List[str]:
-         prompt = SummarizationTemplate.generate_answers(
-             questions=self.assessment_questions, text=text
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["answers"]
-         else:
-             try:
-                 res: Answers = await self.model.a_generate(
-                     prompt, schema=Answers
-                 )
-                 return res.answers
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["answers"]
-
-     def _generate_answers(self, text: str) -> List[str]:
-         prompt = SummarizationTemplate.generate_answers(
-             questions=self.assessment_questions, text=text
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["answers"]
-         else:
-             try:
-                 res: Answers = self.model.generate(prompt, schema=Answers)
-                 return res.answers
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["answers"]
-
-     async def _a_generate_assessment_questions(self, text: str):
-         prompt = SummarizationTemplate.generate_questions(text=text, n=self.n)
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["questions"]
-         else:
-             try:
-                 res: Questions = await self.model.a_generate(
-                     prompt, schema=Questions
-                 )
-                 return res.questions
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["questions"]
-
-     def _generate_assessment_questions(self, text: str):
-         prompt = SummarizationTemplate.generate_questions(text=text, n=self.n)
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["questions"]
-         else:
-             try:
-                 res: Questions = self.model.generate(prompt, schema=Questions)
-                 return res.questions
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["questions"]
-
-     async def _a_generate_info_coverage_verdicts(
-         self, example: Example
-     ) -> List[InfoCoverageVerdict]:
-         if self.assessment_questions is None:
-             self.assessment_questions = (
-                 await self._a_generate_assessment_questions(example.input)
-             )
-
-         tasks = [
-             self._a_generate_answers(example.input),
-             self._a_generate_answers(example.actual_output),
-         ]
-         results = await asyncio.gather(*tasks)
-         original_answers = results[0]
-         summary_answers = results[1]
-
-         if len(original_answers) != len(summary_answers):
-             raise ValueError("Number of verdicts generated does not equal.")
-
-         coverage_veridcts: List[InfoCoverageVerdict] = []
-         for i in range(len(original_answers)):
-             coverage_veridcts.append(
-                 InfoCoverageVerdict(
-                     summary_verdict=summary_answers[i],
-                     original_verdict=original_answers[i],
-                     question=self.assessment_questions[i],
-                 )
-             )
-         return coverage_veridcts
-
-     def _generate_info_coverage_verdicts(
-         self, example: Example
-     ) -> List[InfoCoverageVerdict]:
-         if self.assessment_questions is None:
-             self.assessment_questions = self._generate_assessment_questions(
-                 example.input
-             )
-
-         original_answers = self._generate_answers(example.input)
-         summary_answers = self._generate_answers(example.actual_output)
-
-         if len(original_answers) != len(summary_answers):
-             raise ValueError("Number of verdicts generated does not equal.")
-
-         coverage_veridcts: List[InfoCoverageVerdict] = []
-         for i in range(len(original_answers)):
-             coverage_veridcts.append(
-                 InfoCoverageVerdict(
-                     summary_verdict=summary_answers[i],
-                     original_verdict=original_answers[i],
-                     question=self.assessment_questions[i],
-                 )
-             )
-
-         return coverage_veridcts
-
-     async def _a_generate_contradiction_verdicts(
-         self,
-         example: Example,
-     ) -> List[ContradictionVerdict]:
-         if len(self.claims) == 0:
-             return []
-
-         verdicts: List[ContradictionVerdict] = []
-
-         prompt = SummarizationTemplate.generate_contradiction_verdicts(
-             original_text=example.input,
-             summary_claims=self.claims
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             verdicts = [
-                 ContradictionVerdict(**item)
-                 for item in data["verdicts"]
-             ]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = await self.model.a_generate(
-                     prompt, schema=Verdicts
-                 )
-                 verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 verdicts = [
-                     ContradictionVerdict(**item)
-                     for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     def _generate_contradiction_verdicts(
-         self,
-         example: Example,
-     ) -> List[ContradictionVerdict]:
-         if len(self.claims) == 0:
-             return []
-
-         verdicts: List[ContradictionVerdict] = []
-
-         prompt = SummarizationTemplate.generate_contradiction_verdicts(
-             original_text=example.input,
-             summary_claims=self.claims
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             verdicts = [
-                 ContradictionVerdict(**item)
-                 for item in data["verdicts"]
-             ]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                 verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 verdicts = [
-                     ContradictionVerdict(**item)
-                     for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     async def _a_generate_claims(self, text: str) -> List[str]:
-         # Borrow faithfulness template since it already works
-         prompt = FaithfulnessTemplate.find_claims(text=text)
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["claims"]
-         else:
-             try:
-                 res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                 return res.claims
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["claims"]
-
-     def _generate_claims(self, text: str) -> List[str]:
-         # Borrow faithfulness template
-         prompt = FaithfulnessTemplate.find_claims(text=text)
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["claims"]
-         else:
-             try:
-                 res: Claims = self.model.generate(prompt, schema=Claims)
-                 return res.claims
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["claims"]
-
-     def _success_check(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Summarization"
-
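For orientation, the hunk above removes the local SummarizationScorer, which scores a summary as the minimum of two sub-scores: a contradiction score (the share of summary claims the judge marks "yes" against the original text) and an info-coverage score (among assessment questions the original text answers "yes", the share the summary also answers "yes"). The following is a minimal, standalone sketch of that arithmetic only; the helper names are illustrative and are not part of judgeval.

# Illustrative sketch, not part of the judgeval package: mirrors the scoring
# arithmetic of the removed SummarizationScorer._calculate_score.
from typing import List


def contradiction_score(claim_verdicts: List[str]) -> float:
    # Share of summary claims judged "yes" (consistent with the original text);
    # "idk" and "no" both count against the summary, as in the removed code.
    if not claim_verdicts:
        return 0.0
    yes = sum(v.strip().lower() == "yes" for v in claim_verdicts)
    return yes / len(claim_verdicts)


def info_coverage_score(original_verdicts: List[str], summary_verdicts: List[str]) -> float:
    # Among questions the original text answers "yes", the share the summary also answers "yes".
    answerable = [
        (o, s) for o, s in zip(original_verdicts, summary_verdicts)
        if o.strip().lower() == "yes"
    ]
    if not answerable:
        return 0.0
    covered = sum(s.strip().lower() == "yes" for _, s in answerable)
    return covered / len(answerable)


if __name__ == "__main__":
    c = contradiction_score(["yes", "yes", "idk", "no"])                 # 0.5
    k = info_coverage_score(["yes", "yes", "no"], ["yes", "no", "yes"])  # 0.5
    print(min(c, k))  # final summarization score, as in score_example: 0.5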
judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py
@@ -1,3 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.tool_correctness.tool_correctness_scorer import ToolCorrectnessScorer
-
- __all__ = ["ToolCorrectnessScorer"]