judgeval 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. judgeval/__init__.py +0 -71
  2. judgeval/common/tracer.py +57 -31
  3. judgeval/constants.py +1 -0
  4. judgeval/data/__init__.py +2 -1
  5. judgeval/data/scorer_data.py +2 -2
  6. judgeval/evaluation_run.py +16 -15
  7. judgeval/judges/__init__.py +2 -2
  8. judgeval/judges/base_judge.py +1 -1
  9. judgeval/judges/litellm_judge.py +2 -2
  10. judgeval/judges/mixture_of_judges.py +2 -2
  11. judgeval/judges/together_judge.py +2 -2
  12. judgeval/judges/utils.py +4 -4
  13. judgeval/judgment_client.py +67 -15
  14. judgeval/run_evaluation.py +79 -14
  15. judgeval/scorers/__init__.py +8 -4
  16. judgeval/scorers/api_scorer.py +64 -0
  17. judgeval/scorers/base_scorer.py +3 -2
  18. judgeval/scorers/exceptions.py +11 -0
  19. judgeval/scorers/{custom_scorer.py → judgeval_scorer.py} +9 -5
  20. judgeval/scorers/judgeval_scorers/__init__.py +132 -9
  21. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
  22. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
  23. judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py} +2 -2
  24. judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py} +2 -2
  25. judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py} +2 -2
  26. judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py} +2 -2
  27. judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py} +2 -2
  28. judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py} +2 -2
  29. judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py} +7 -7
  30. judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py} +2 -2
  31. judgeval/scorers/judgeval_scorers/{tool_correctness.py → api_scorers/tool_correctness.py} +2 -2
  32. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
  33. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
  34. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
  35. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
  36. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
  37. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
  38. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
  39. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
  40. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
  41. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
  42. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
  43. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
  44. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
  45. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
  46. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
  47. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
  48. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
  49. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
  50. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
  51. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
  52. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
  53. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
  54. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
  55. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
  56. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
  57. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
  58. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
  59. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
  60. judgeval/scorers/prompt_scorer.py +4 -4
  61. judgeval/scorers/score.py +14 -14
  62. judgeval/scorers/utils.py +40 -6
  63. {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/METADATA +1 -1
  64. judgeval-0.0.4.dist-info/RECORD +78 -0
  65. judgeval-0.0.3.dist-info/RECORD +0 -46
  66. {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/WHEEL +0 -0
  67. {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py
@@ -0,0 +1,259 @@
+ from typing import Optional, List, Union
+
+ from judgeval.judges import JudgevalJudge
+ from judgeval.judges.utils import create_judge
+ from judgeval.data import Example, ExampleParams
+ from judgeval.scorers import JudgevalScorer
+ from judgeval.scorers.utils import (
+     get_or_create_event_loop,
+     parse_response_json,
+     scorer_progress_meter,
+     create_verbose_logs,
+     check_example_params,
+ )
+ from judgeval.scorers.judgeval_scorers.local_implementations.contextual_precision.prompts import *
+
+ required_params = [
+     ExampleParams.INPUT,
+     ExampleParams.ACTUAL_OUTPUT,
+     ExampleParams.RETRIEVAL_CONTEXT,
+     ExampleParams.EXPECTED_OUTPUT,
+ ]
+
+ class ContextualPrecisionScorer(JudgevalScorer):
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         model: Optional[Union[str, JudgevalJudge]] = None,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+     ):
+         self.threshold = 1 if strict_mode else threshold
+         self.include_reason = include_reason
+         self.model, self.using_native_model = create_judge(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.async_mode = async_mode
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+
+     def score_example(
+         self,
+         example: Example,
+         _show_indicator: bool = True,
+     ) -> float:
+         check_example_params(example, required_params, self)
+
+         with scorer_progress_meter(self, display_meter=_show_indicator):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_score_example(example, _show_indicator=False)
+                 )
+             else:
+                 self.verdicts: List[ContextualPrecisionVerdict] = (
+                     self._generate_verdicts(
+                         example.input,
+                         example.expected_output,
+                         example.retrieval_context,
+                     )
+                 )
+                 self.score = self._calculate_score()
+                 self.reason = self._generate_reason(example.input)
+                 self.success = self.score >= self.threshold
+                 self.verbose_logs = create_verbose_logs(
+                     self,
+                     steps=[
+                         # Convert to dict for serialization purposes
+                         f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
+                         f"Score: {self.score}\nReason: {self.reason}",
+                     ],
+                 )
+             return self.score
+
+     async def a_score_example(
+         self,
+         example: Example,
+         _show_indicator: bool = True,
+     ) -> float:
+         check_example_params(example, required_params, self)
+
+         with scorer_progress_meter(
+             self,
+             async_mode=True,
+             display_meter=_show_indicator,
+         ):
+             self.verdicts: List[ContextualPrecisionVerdict] = (
+                 await self._a_generate_verdicts(
+                     example.input,
+                     example.expected_output,
+                     example.retrieval_context,
+                 )
+             )
+             self.score = self._calculate_score()
+             self.reason = await self._a_generate_reason(example.input)
+             self.success = self.score >= self.threshold
+             self.verbose_logs = create_verbose_logs(
+                 self,
+                 steps=[
+                     # Convert to dict for serialization purposes
+                     f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
+                     f"Score: {self.score}\nReason: {self.reason}",
+                 ],
+             )
+             return self.score
+
+     async def _a_generate_reason(self, input: str):
+         if self.include_reason is False:
+             return None
+
+         retrieval_contexts_verdicts = [
+             {"verdict": verdict.verdict, "reasons": verdict.reason}
+             for verdict in self.verdicts
+         ]
+         prompt = ContextualPrecisionTemplate.generate_reason(
+             input=input,
+             verdicts=retrieval_contexts_verdicts,
+             score=format(self.score, ".2f"),
+         )
+
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             data = parse_response_json(res, self)
+             return data["reason"]
+         else:
+             try:
+                 res: Reason = await self.model.a_generate(prompt, schema=Reason)
+                 return res.reason
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["reason"]
+
+     def _generate_reason(self, input: str):
+         if self.include_reason is False:
+             return None
+
+         retrieval_contexts_verdicts = [
+             {"verdict": verdict.verdict, "reasons": verdict.reason}
+             for verdict in self.verdicts
+         ]
+         prompt = ContextualPrecisionTemplate.generate_reason(
+             input=input,
+             verdicts=retrieval_contexts_verdicts,
+             score=format(self.score, ".2f"),
+         )
+
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             data = parse_response_json(res, self)
+             return data["reason"]
+         else:
+             try:
+                 res: Reason = self.model.generate(prompt, schema=Reason)
+                 return res.reason
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["reason"]
+
+     async def _a_generate_verdicts(
+         self, input: str, expected_output: str, retrieval_context: List[str]
+     ) -> List[ContextualPrecisionVerdict]:
+         prompt = ContextualPrecisionTemplate.generate_verdicts(
+             input=input,
+             expected_output=expected_output,
+             retrieval_context=retrieval_context,
+         )
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             data = parse_response_json(res, self)
+             verdicts = [
+                 ContextualPrecisionVerdict(**item) for item in data["verdicts"]
+             ]
+             return verdicts
+         else:
+             try:
+                 res: Verdicts = await self.model.a_generate(
+                     prompt, schema=Verdicts
+                 )
+                 verdicts = [item for item in res.verdicts]
+                 return verdicts
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = parse_response_json(res, self)
+                 verdicts = [
+                     ContextualPrecisionVerdict(**item)
+                     for item in data["verdicts"]
+                 ]
+                 return verdicts
+
+     def _generate_verdicts(
+         self, input: str, expected_output: str, retrieval_context: List[str]
+     ) -> List[ContextualPrecisionVerdict]:
+         prompt = ContextualPrecisionTemplate.generate_verdicts(
+             input=input,
+             expected_output=expected_output,
+             retrieval_context=retrieval_context,
+         )
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             data = parse_response_json(res, self)
+             verdicts = [
+                 ContextualPrecisionVerdict(**item) for item in data["verdicts"]
+             ]
+             return verdicts
+         else:
+             try:
+                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                 verdicts = [item for item in res.verdicts]
+                 return verdicts
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = parse_response_json(res, self)
+                 verdicts = [
+                     ContextualPrecisionVerdict(**item)
+                     for item in data["verdicts"]
+                 ]
+                 return verdicts
+
+     def _calculate_score(self):
+         number_of_verdicts = len(self.verdicts)
+         if number_of_verdicts == 0:
+             return 0
+
+         # Convert verdicts to a binary list where 'yes' is 1 and others are 0
+         node_verdicts = [
+             1 if v.verdict.strip().lower() == "yes" else 0
+             for v in self.verdicts
+         ]
+
+         sum_weighted_precision_at_k = 0.0
+         relevant_nodes_count = 0
+         for k, is_relevant in enumerate(node_verdicts, start=1):
+             # If the item is relevant, update the counter and add the weighted precision at k to the sum
+             if is_relevant:
+                 relevant_nodes_count += 1
+                 precision_at_k = relevant_nodes_count / k
+                 sum_weighted_precision_at_k += precision_at_k * is_relevant
+
+         if relevant_nodes_count == 0:
+             return 0
+         # Calculate weighted cumulative precision
+         score = sum_weighted_precision_at_k / relevant_nodes_count
+         return 0 if self.strict_mode and score < self.threshold else score
+
+     def _success_check(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Contextual Precision"
judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py
@@ -0,0 +1,106 @@
+ from typing import List, Optional
+ from pydantic import BaseModel, Field
+
+
+ class ContextualPrecisionVerdict(BaseModel):
+     verdict: str
+     reason: str
+
+
+ class Verdicts(BaseModel):
+     verdicts: List[ContextualPrecisionVerdict]
+
+
+ class Reason(BaseModel):
+     reason: str
+
+
+ class ContextualPrecisionTemplate:
+     @staticmethod
+     def generate_verdicts(input, expected_output, retrieval_context):
+         return f"""==== TASK INSTRUCTIONS ====\nGiven the input, expected output, and retrieval context, your task is to determine whether each document in the retrieval context was relevant to arrive at the expected output.
+ You should reason through the documents in the retrieval context thoroughly, and then generate a list of JSON objects representing your decision.
+
+ ==== FORMAT INSTRUCTIONS ====\nIMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON. These JSON only contain the `verdict` key that outputs only 'yes' or 'no', and a `reason` key to justify the verdict. In your reason, aim to quote parts of the context to support your verdict.
+
+ ==== EXAMPLE ====
+ Example Input: "What are the main symptoms of COVID-19?"
+ Example Expected Output: "The main symptoms of COVID-19 include fever, cough, fatigue, and loss of taste or smell."
+ Example Retrieval Context: ["Common COVID-19 symptoms include fever and dry cough", "Loss of taste and smell are distinctive COVID-19 symptoms", "The first COVID-19 case was reported in Wuhan", "My friend's birthday party was fun last weekend"]
+
+ Example output JSON:
+ {{
+ "verdicts": [
+ {{
+ "verdict": "yes",
+ "reason": "The text directly lists key COVID-19 symptoms including 'fever and dry cough' which are part of the main symptoms."
+ }},
+ {{
+ "verdict": "yes",
+ "reason": "The text mentions 'loss of taste and smell' which are distinctive symptoms of COVID-19 that should be included."
+ }},
+ {{
+ "verdict": "no",
+ "reason": "While related to COVID-19, the origin of the first case is not relevant to listing the main symptoms."
+ }},
+ {{
+ "verdict": "no",
+ "reason": "A personal anecdote about a birthday party has no relevance to COVID-19 symptoms."
+ }}
+ ]
+ }}
+
+ Your task is to generate a verdict for each document in the retrieval context, so the number of 'verdicts' SHOULD BE EXACTLY EQUAL to that of the retrievalcontexts.
+
+ ==== YOUR TURN ====
+ Input:
+ {input}
+
+ Expected output:
+ {expected_output}
+
+ Retrieval Context:
+ {retrieval_context}
+
+ JSON:
+ """
+
+     @staticmethod
+     def generate_reason(input, verdicts, score):
+         return f"""==== TASK INSTRUCTIONS ====\nYou will be provided with an input, retrieval contexts, and a contextual precision score. Your task is to provide a CLEAR and CONCISE reason for the score.
+ You should explain why the score is not higher, but also the current score is reasonable. Here's a further breakdown of the task:
+
+ 1. input (str) is a task or question that the model attempted to solve
+ 2. retrieval contexts (list[dict]) is a list of JSON with the following keys:
+ - `verdict` (str): either 'yes' or 'no', which represents whether the corresponding document in the retrieval context is relevant to the input.
+ - `reason` (str): a reason for the verdict.
+ 3. The contextual precision score is a float between 0 and 1 and represents if the relevant documents are ranked higher than irrelevant ones in the retrieval context.
+ The ranking can be inferred by the order of the retrieval documents: retrieval contexts is given IN THE ORDER OF THE DOCUMENT RANKINGS.
+ This implies that the score will be higher if the relevant documents are ranked higher (appears earlier in the list) than irrelevant ones.
+
+ ==== FORMAT INSTRUCTIONS ====\nIMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason for the contextual precision score.
+ Example JSON:
+ {{
+ "reason": "The score is <contextual_precision_score> because <your_reason>."
+ }}
+
+
+ DO NOT mention 'verdict' in your reason, but instead phrase it as irrelevant nodes. The term 'verdict' are just here for you to understand the broader scope of things.
+ Also DO NOT mention there are `reason` fields in the retrieval contexts you are presented with, instead just use the information in the `reason` field.
+ In your reason, you MUST USE the `reason`, QUOTES in the 'reason', and the node RANK (starting from 1, eg. first node) to explain why the 'no' verdicts should be ranked lower than the 'yes' verdicts.
+ When addressing nodes, make it explicit that it is nodes in retrieval context.
+ If the score is 1, keep it short and say something positive with an upbeat tone (but don't overdo it otherwise it gets annoying).
+
+ ==== YOUR TURN ====
+ Contextual Precision Score:
+ {score}
+
+ Input:
+ {input}
+
+ Retrieval Contexts:
+ {verdicts}
+
+ JSON:
+ """
+
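
The `Verdicts` and `Reason` pydantic models in the hunk above define the schema the judge's JSON reply is parsed into (via `schema=` for judges that accept one, or via `parse_response_json` otherwise). A small illustrative sketch of that round trip; the `raw` reply below is hypothetical, shaped like the prompt's own example, and is not part of the package:

from typing import List
from pydantic import BaseModel

class ContextualPrecisionVerdict(BaseModel):
    verdict: str
    reason: str

class Verdicts(BaseModel):
    verdicts: List[ContextualPrecisionVerdict]

# Hypothetical judge reply, already decoded from JSON
raw = {
    "verdicts": [
        {"verdict": "yes", "reason": "Quotes 'fever and dry cough', which are main symptoms."},
        {"verdict": "no", "reason": "The origin of the first case is unrelated to symptoms."},
    ]
}
parsed = Verdicts(**raw)  # pydantic coerces the nested dicts into verdict objects
print([v.verdict for v in parsed.verdicts])  # ['yes', 'no']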
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py
@@ -0,0 +1,3 @@
+ from .contextual_recall_scorer import ContextualRecallScorer
+
+ __all__ = ["ContextualRecallScorer"]
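
This new `__init__.py` re-exports the scorer defined in the following hunk, so it can be imported directly from the `contextual_recall` package. A hedged usage sketch (the model name and the `Example` field values are illustrative; the field names are inferred from `ExampleParams` and the attribute access in the scorer code):

from judgeval.data import Example
from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall import ContextualRecallScorer

example = Example(
    input="What are the main symptoms of COVID-19?",
    actual_output="Fever, cough, fatigue, and loss of taste or smell.",
    expected_output="The main symptoms include fever, cough, fatigue, and loss of taste or smell.",
    retrieval_context=["Common COVID-19 symptoms include fever and dry cough"],
)
scorer = ContextualRecallScorer(threshold=0.7, model="gpt-4o-mini", async_mode=False)  # illustrative model name
score = scorer.score_example(example)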
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py
@@ -0,0 +1,249 @@
+ from typing import Optional, List, Union
+
+ from judgeval.scorers.utils import (
+     get_or_create_event_loop,
+     parse_response_json,
+     scorer_progress_meter,
+     create_verbose_logs,
+     check_example_params
+ )
+ from judgeval.judges.utils import create_judge
+ from judgeval.scorers import JudgevalScorer
+ from judgeval.judges import JudgevalJudge
+ from judgeval.judges.utils import create_judge
+ from judgeval.data import Example, ExampleParams
+ from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.prompts import *
+
+ required_params = [
+     ExampleParams.INPUT,
+     ExampleParams.ACTUAL_OUTPUT,
+     ExampleParams.EXPECTED_OUTPUT,
+     ExampleParams.RETRIEVAL_CONTEXT,
+ ]
+
+ class ContextualRecallScorer(JudgevalScorer):
+     def __init__(
+         self,
+         threshold: float = 0.5,
+         model: Optional[Union[str, JudgevalJudge]] = None,
+         include_reason: bool = True,
+         async_mode: bool = True,
+         strict_mode: bool = False,
+         verbose_mode: bool = False,
+         user: Optional[str] = None
+     ):
+         self.user = user
+         self.threshold = 1 if strict_mode else threshold
+         self.model, self.using_native_model = create_judge(model)
+         self.evaluation_model = self.model.get_model_name()
+         self.include_reason = include_reason
+         self.async_mode = async_mode
+         self.strict_mode = strict_mode
+         self.verbose_mode = verbose_mode
+
+     def score_example(
+         self,
+         example: Example,
+         _show_indicator: bool = True,
+     ) -> float:
+         check_example_params(example, required_params, self)
+
+         with scorer_progress_meter(self, display_meter=_show_indicator):
+             if self.async_mode:
+                 loop = get_or_create_event_loop()
+                 loop.run_until_complete(
+                     self.a_score_example(example, _show_indicator=False)
+                 )
+             else:
+                 self.verdicts: List[ContextualRecallVerdict] = (
+                     self._generate_verdicts(
+                         example.expected_output, example.retrieval_context
+                     )
+                 )
+                 self.score = self._calculate_score()
+                 self.reason = self._generate_reason(example.expected_output)
+                 self.success = self.score >= self.threshold
+                 self.verbose_logs = create_verbose_logs(
+                     self,
+                     steps=[
+                         f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
+                         f"Score: {self.score}\nReason: {self.reason}",
+                     ],
+                 )
+             return self.score
+
+     async def a_score_example(
+         self,
+         example: Example,
+         _show_indicator: bool = True,
+     ) -> float:
+         check_example_params(example, required_params, self)
+
+         with scorer_progress_meter(
+             self,
+             async_mode=True,
+             display_meter=_show_indicator,
+         ):
+             self.verdicts: List[ContextualRecallVerdict] = (
+                 await self._a_generate_verdicts(
+                     example.expected_output, example.retrieval_context
+                 )
+             )
+             self.score = self._calculate_score()
+             self.reason = await self._a_generate_reason(
+                 example.expected_output
+             )
+             self.success = self.score >= self.threshold
+             self.verbose_logs = create_verbose_logs(
+                 self,
+                 steps=[
+                     f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
+                     f"Score: {self.score}\nReason: {self.reason}",
+                 ],
+             )
+             return self.score
+
+     async def _a_generate_reason(self, expected_output: str):
+         if self.include_reason is False:
+             return None
+
+         supportive_reasons = []
+         unsupportive_reasons = []
+         for idx, verdict in enumerate(self.verdicts):
+             if verdict.verdict.lower() == "yes":
+                 supportive_reasons.append(f"Sentence {idx + 1}: {verdict.reason}")
+             else:
+                 unsupportive_reasons.append(f"Sentence {idx + 1}: {verdict.reason}")
+
+         prompt = ContextualRecallTemplate.generate_reason(
+             expected_output=expected_output,
+             supportive_reasons=supportive_reasons,
+             unsupportive_reasons=unsupportive_reasons,
+             score=format(self.score, ".2f"),
+         )
+
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             data = parse_response_json(res, self)
+             return data["reason"]
+         else:
+             try:
+                 res: Reason = await self.model.a_generate(prompt, schema=Reason)
+                 return res.reason
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["reason"]
+
+     def _generate_reason(self, expected_output: str):
+         if self.include_reason is False:
+             return None
+
+         supportive_reasons = []
+         unsupportive_reasons = []
+         for verdict in self.verdicts:
+             if verdict.verdict.lower() == "yes":
+                 supportive_reasons.append(verdict.reason)
+             else:
+                 unsupportive_reasons.append(verdict.reason)
+
+         prompt = ContextualRecallTemplate.generate_reason(
+             expected_output=expected_output,
+             supportive_reasons=supportive_reasons,
+             unsupportive_reasons=unsupportive_reasons,
+             score=format(self.score, ".2f"),
+         )
+
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             data = parse_response_json(res, self)
+             return data["reason"]
+         else:
+             try:
+                 res: Reason = self.model.generate(prompt, schema=Reason)
+                 return res.reason
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = parse_response_json(res, self)
+                 return data["reason"]
+
+     def _calculate_score(self):
+         number_of_verdicts = len(self.verdicts)
+         if number_of_verdicts == 0:
+             return 0
+
+         justified_sentences = 0
+         for verdict in self.verdicts:
+             if verdict.verdict.lower() == "yes":
+                 justified_sentences += 1
+
+         score = justified_sentences / number_of_verdicts
+         return 0 if self.strict_mode and score < self.threshold else score
+
+     async def _a_generate_verdicts(
+         self, expected_output: str, retrieval_context: List[str]
+     ) -> List[ContextualRecallVerdict]:
+         prompt = ContextualRecallTemplate.generate_verdicts(
+             expected_output=expected_output, retrieval_context=retrieval_context
+         )
+         if self.using_native_model:
+             res = await self.model.a_generate(prompt)
+             data = parse_response_json(res, self)
+             verdicts = [
+                 ContextualRecallVerdict(**item) for item in data["verdicts"]
+             ]
+             return verdicts
+         else:
+             try:
+                 res: Verdicts = await self.model.a_generate(
+                     prompt, schema=Verdicts
+                 )
+                 verdicts: Verdicts = [item for item in res.verdicts]
+                 return verdicts
+             except TypeError:
+                 res = await self.model.a_generate(prompt)
+                 data = parse_response_json(res, self)
+                 verdicts = [
+                     ContextualRecallVerdict(**item) for item in data["verdicts"]
+                 ]
+                 return verdicts
+
+     def _generate_verdicts(
+         self, expected_output: str, retrieval_context: List[str]
+     ) -> List[ContextualRecallVerdict]:
+         prompt = ContextualRecallTemplate.generate_verdicts(
+             expected_output=expected_output, retrieval_context=retrieval_context
+         )
+         if self.using_native_model:
+             res = self.model.generate(prompt)
+             data = parse_response_json(res, self)
+             verdicts = [
+                 ContextualRecallVerdict(**item) for item in data["verdicts"]
+             ]
+             return verdicts
+         else:
+             try:
+                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                 verdicts: Verdicts = [item for item in res.verdicts]
+                 return verdicts
+             except TypeError:
+                 res = self.model.generate(prompt)
+                 data = parse_response_json(res, self)
+                 verdicts = [
+                     ContextualRecallVerdict(**item) for item in data["verdicts"]
+                 ]
+                 return verdicts
+
+     def _success_check(self) -> bool:
+         if self.error is not None:
+             self.success = False
+         else:
+             try:
+                 self.success = self.score >= self.threshold
+             except:
+                 self.success = False
+         return self.success
+
+     @property
+     def __name__(self):
+         return "Contextual Recall"