judgeval 0.0.31__py3-none-any.whl → 0.0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. judgeval/__init__.py +3 -1
  2. judgeval/common/s3_storage.py +93 -0
  3. judgeval/common/tracer.py +869 -183
  4. judgeval/constants.py +1 -1
  5. judgeval/data/datasets/dataset.py +5 -1
  6. judgeval/data/datasets/eval_dataset_client.py +2 -2
  7. judgeval/data/sequence.py +16 -26
  8. judgeval/data/sequence_run.py +2 -0
  9. judgeval/judgment_client.py +44 -166
  10. judgeval/rules.py +4 -7
  11. judgeval/run_evaluation.py +2 -2
  12. judgeval/scorers/__init__.py +4 -4
  13. judgeval/scorers/judgeval_scorers/__init__.py +0 -176
  14. judgeval/version_check.py +22 -0
  15. {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
  16. judgeval-0.0.33.dist-info/RECORD +63 -0
  17. judgeval/scorers/base_scorer.py +0 -58
  18. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
  19. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  20. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
  21. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  22. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  23. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  24. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  25. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  26. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
  27. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
  28. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  29. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  30. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  31. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  32. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  33. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  34. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  35. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  36. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  37. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
  38. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
  39. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  40. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
  41. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  42. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  43. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
  44. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  45. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
  46. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
  47. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  48. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  49. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  50. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  51. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
  52. judgeval-0.0.31.dist-info/RECORD +0 -96
  53. {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
  54. {judgeval-0.0.31.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py
@@ -1,264 +0,0 @@
- from typing import Optional, List, Union
-
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers import JudgevalScorer
- from judgeval.constants import APIScorer
- from judgeval.scorers.utils import (
- get_or_create_event_loop,
- parse_response_json,
- scorer_progress_meter,
- create_verbose_logs,
- check_example_params,
- )
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_precision.prompts import *
-
- required_params = [
- ExampleParams.INPUT,
- ExampleParams.ACTUAL_OUTPUT,
- ExampleParams.RETRIEVAL_CONTEXT,
- ExampleParams.EXPECTED_OUTPUT,
- ]
-
- class ContextualPrecisionScorer(JudgevalScorer):
- def __init__(
- self,
- threshold: float = 0.5,
- model: Optional[Union[str, JudgevalJudge]] = None,
- include_reason: bool = True,
- async_mode: bool = True,
- strict_mode: bool = False,
- verbose_mode: bool = False,
- ):
- super().__init__(
- score_type=APIScorer.CONTEXTUAL_PRECISION,
- threshold=1 if strict_mode else threshold,
- evaluation_model=None,
- include_reason=include_reason,
- async_mode=async_mode,
- strict_mode=strict_mode,
- verbose_mode=verbose_mode
- )
- self.model, self.using_native_model = create_judge(model)
- self.evaluation_model = self.model.get_model_name()
-
- def score_example(
- self,
- example: Example,
- _show_indicator: bool = True,
- ) -> float:
- check_example_params(example, required_params, self)
-
- with scorer_progress_meter(self, display_meter=_show_indicator):
- if self.async_mode:
- loop = get_or_create_event_loop()
- loop.run_until_complete(
- self.a_score_example(example, _show_indicator=False)
- )
- else:
- self.verdicts: List[ContextualPrecisionVerdict] = (
- self._generate_verdicts(
- example.input,
- example.expected_output,
- example.retrieval_context,
- )
- )
- self.score = self._calculate_score()
- self.reason = self._generate_reason(example.input)
- self.success = self.score >= self.threshold
- self.verbose_logs = create_verbose_logs(
- self,
- steps=[
- # Convert to dict for serialization purposes
- f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
- f"Score: {self.score}\nReason: {self.reason}",
- ],
- )
- return self.score
-
- async def a_score_example(
- self,
- example: Example,
- _show_indicator: bool = True,
- ) -> float:
- check_example_params(example, required_params, self)
-
- with scorer_progress_meter(
- self,
- async_mode=True,
- display_meter=_show_indicator,
- ):
- self.verdicts: List[ContextualPrecisionVerdict] = (
- await self._a_generate_verdicts(
- example.input,
- example.expected_output,
- example.retrieval_context,
- )
- )
- self.score = self._calculate_score()
- self.reason = await self._a_generate_reason(example.input)
- self.success = self.score >= self.threshold
- self.verbose_logs = create_verbose_logs(
- self,
- steps=[
- # Convert to dict for serialization purposes
- f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
- f"Score: {self.score}\nReason: {self.reason}",
- ],
- )
- return self.score
-
- async def _a_generate_reason(self, input: str):
- if self.include_reason is False:
- return None
-
- retrieval_contexts_verdicts = [
- {"verdict": verdict.verdict, "reasons": verdict.reason}
- for verdict in self.verdicts
- ]
- prompt = ContextualPrecisionTemplate.generate_reason(
- input=input,
- verdicts=retrieval_contexts_verdicts,
- score=format(self.score, ".2f"),
- )
-
- if self.using_native_model:
- res = await self.model.a_generate(prompt)
- data = parse_response_json(res, self)
- return data["reason"]
- else:
- try:
- res: Reason = await self.model.a_generate(prompt, schema=Reason)
- return res.reason
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = parse_response_json(res, self)
- return data["reason"]
-
- def _generate_reason(self, input: str):
- if self.include_reason is False:
- return None
-
- retrieval_contexts_verdicts = [
- {"verdict": verdict.verdict, "reasons": verdict.reason}
- for verdict in self.verdicts
- ]
- prompt = ContextualPrecisionTemplate.generate_reason(
- input=input,
- verdicts=retrieval_contexts_verdicts,
- score=format(self.score, ".2f"),
- )
-
- if self.using_native_model:
- res = self.model.generate(prompt)
- data = parse_response_json(res, self)
- return data["reason"]
- else:
- try:
- res: Reason = self.model.generate(prompt, schema=Reason)
- return res.reason
- except TypeError:
- res = self.model.generate(prompt)
- data = parse_response_json(res, self)
- return data["reason"]
-
- async def _a_generate_verdicts(
- self, input: str, expected_output: str, retrieval_context: List[str]
- ) -> List[ContextualPrecisionVerdict]:
- prompt = ContextualPrecisionTemplate.generate_verdicts(
- input=input,
- expected_output=expected_output,
- retrieval_context=retrieval_context,
- )
- if self.using_native_model:
- res = await self.model.a_generate(prompt)
- data = parse_response_json(res, self)
- verdicts = [
- ContextualPrecisionVerdict(**item) for item in data["verdicts"]
- ]
- return verdicts
- else:
- try:
- res: Verdicts = await self.model.a_generate(
- prompt, schema=Verdicts
- )
- verdicts = [item for item in res.verdicts]
- return verdicts
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = parse_response_json(res, self)
- verdicts = [
- ContextualPrecisionVerdict(**item)
- for item in data["verdicts"]
- ]
- return verdicts
-
- def _generate_verdicts(
- self, input: str, expected_output: str, retrieval_context: List[str]
- ) -> List[ContextualPrecisionVerdict]:
- prompt = ContextualPrecisionTemplate.generate_verdicts(
- input=input,
- expected_output=expected_output,
- retrieval_context=retrieval_context,
- )
- if self.using_native_model:
- res = self.model.generate(prompt)
- data = parse_response_json(res, self)
- verdicts = [
- ContextualPrecisionVerdict(**item) for item in data["verdicts"]
- ]
- return verdicts
- else:
- try:
- res: Verdicts = self.model.generate(prompt, schema=Verdicts)
- verdicts = [item for item in res.verdicts]
- return verdicts
- except TypeError:
- res = self.model.generate(prompt)
- data = parse_response_json(res, self)
- verdicts = [
- ContextualPrecisionVerdict(**item)
- for item in data["verdicts"]
- ]
- return verdicts
-
- def _calculate_score(self):
- number_of_verdicts = len(self.verdicts)
- if number_of_verdicts == 0:
- return 0
-
- # Convert verdicts to a binary list where 'yes' is 1 and others are 0
- node_verdicts = [
- 1 if v.verdict.strip().lower() == "yes" else 0
- for v in self.verdicts
- ]
-
- sum_weighted_precision_at_k = 0.0
- relevant_nodes_count = 0
- for k, is_relevant in enumerate(node_verdicts, start=1):
- # If the item is relevant, update the counter and add the weighted precision at k to the sum
- if is_relevant:
- relevant_nodes_count += 1
- precision_at_k = relevant_nodes_count / k
- sum_weighted_precision_at_k += precision_at_k * is_relevant
-
- if relevant_nodes_count == 0:
- return 0
- # Calculate weighted cumulative precision
- score = sum_weighted_precision_at_k / relevant_nodes_count
- return 0 if self.strict_mode and score < self.threshold else score
-
- def _success_check(self) -> bool:
- if self.error is not None:
- self.success = False
- else:
- try:
- self.success = self.score >= self.threshold
- except:
- self.success = False
- return self.success
-
- @property
- def __name__(self):
- return "Contextual Precision"
judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py
@@ -1,106 +0,0 @@
- from typing import List
- from pydantic import BaseModel
-
-
- class ContextualPrecisionVerdict(BaseModel):
- verdict: str
- reason: str
-
-
- class Verdicts(BaseModel):
- verdicts: List[ContextualPrecisionVerdict]
-
-
- class Reason(BaseModel):
- reason: str
-
-
- class ContextualPrecisionTemplate:
- @staticmethod
- def generate_verdicts(input, expected_output, retrieval_context):
- return f"""==== TASK INSTRUCTIONS ====\nGiven the input, expected output, and retrieval context, your task is to determine whether each document in the retrieval context was relevant to arrive at the expected output.
- You should reason through the documents in the retrieval context thoroughly, and then generate a list of JSON objects representing your decision.
-
- ==== FORMAT INSTRUCTIONS ====\nIMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON. These JSON only contain the `verdict` key that outputs only 'yes' or 'no', and a `reason` key to justify the verdict. In your reason, aim to quote parts of the context to support your verdict.
-
- ==== EXAMPLE ====
- Example Input: "What are the main symptoms of COVID-19?"
- Example Expected Output: "The main symptoms of COVID-19 include fever, cough, fatigue, and loss of taste or smell."
- Example Retrieval Context: ["Common COVID-19 symptoms include fever and dry cough", "Loss of taste and smell are distinctive COVID-19 symptoms", "The first COVID-19 case was reported in Wuhan", "My friend's birthday party was fun last weekend"]
-
- Example output JSON:
- {{
- "verdicts": [
- {{
- "verdict": "yes",
- "reason": "The text directly lists key COVID-19 symptoms including 'fever and dry cough' which are part of the main symptoms."
- }},
- {{
- "verdict": "yes",
- "reason": "The text mentions 'loss of taste and smell' which are distinctive symptoms of COVID-19 that should be included."
- }},
- {{
- "verdict": "no",
- "reason": "While related to COVID-19, the origin of the first case is not relevant to listing the main symptoms."
- }},
- {{
- "verdict": "no",
- "reason": "A personal anecdote about a birthday party has no relevance to COVID-19 symptoms."
- }}
- ]
- }}
-
- Your task is to generate a verdict for each document in the retrieval context, so the number of 'verdicts' SHOULD BE EXACTLY EQUAL to that of the retrievalcontexts.
-
- ==== YOUR TURN ====
- Input:
- {input}
-
- Expected output:
- {expected_output}
-
- Retrieval Context:
- {retrieval_context}
-
- JSON:
- """
-
- @staticmethod
- def generate_reason(input, verdicts, score):
- return f"""==== TASK INSTRUCTIONS ====\nYou will be provided with an input, retrieval contexts, and a contextual precision score. Your task is to provide a CLEAR and CONCISE reason for the score.
- You should explain why the score is not higher, but also the current score is reasonable. Here's a further breakdown of the task:
-
- 1. input (str) is a task or question that the model attempted to solve
- 2. retrieval contexts (list[dict]) is a list of JSON with the following keys:
- - `verdict` (str): either 'yes' or 'no', which represents whether the corresponding document in the retrieval context is relevant to the input.
- - `reason` (str): a reason for the verdict.
- 3. The contextual precision score is a float between 0 and 1 and represents if the relevant documents are ranked higher than irrelevant ones in the retrieval context.
- The ranking can be inferred by the order of the retrieval documents: retrieval contexts is given IN THE ORDER OF THE DOCUMENT RANKINGS.
- This implies that the score will be higher if the relevant documents are ranked higher (appears earlier in the list) than irrelevant ones.
-
- ==== FORMAT INSTRUCTIONS ====\nIMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason for the contextual precision score.
- Example JSON:
- {{
- "reason": "The score is <contextual_precision_score> because <your_reason>."
- }}
-
-
- DO NOT mention 'verdict' in your reason, but instead phrase it as irrelevant nodes. The term 'verdict' are just here for you to understand the broader scope of things.
- Also DO NOT mention there are `reason` fields in the retrieval contexts you are presented with, instead just use the information in the `reason` field.
- In your reason, you MUST USE the `reason`, QUOTES in the 'reason', and the node RANK (starting from 1, eg. first node) to explain why the 'no' verdicts should be ranked lower than the 'yes' verdicts.
- When addressing nodes, make it explicit that it is nodes in retrieval context.
- If the score is 1, keep it short and say something positive with an upbeat tone (but don't overdo it otherwise it gets annoying).
-
- ==== YOUR TURN ====
- Contextual Precision Score:
- {score}
-
- Input:
- {input}
-
- Retrieval Contexts:
- {verdicts}
-
- JSON:
- """
-
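A minimal sketch of how the JSON shape requested by the prompt above maps onto the pydantic models removed in this file (assuming pydantic v2, which the model_dump() calls elsewhere in this diff imply; the raw string below is an illustrative judge response, not actual library output):

from typing import List
from pydantic import BaseModel

class ContextualPrecisionVerdict(BaseModel):
    verdict: str
    reason: str

class Verdicts(BaseModel):
    verdicts: List[ContextualPrecisionVerdict]

raw = '{"verdicts": [{"verdict": "yes", "reason": "Quotes fever and dry cough."}]}'
parsed = Verdicts.model_validate_json(raw)  # raises pydantic.ValidationError on malformed output
print([v.verdict for v in parsed.verdicts])  # ['yes']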
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py
@@ -1,3 +0,0 @@
- from .contextual_recall_scorer import ContextualRecallScorer
-
- __all__ = ["ContextualRecallScorer"]
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py
@@ -1,254 +0,0 @@
- from typing import Optional, List, Union
-
- from judgeval.constants import APIScorer
- from judgeval.scorers.utils import (
- get_or_create_event_loop,
- parse_response_json,
- scorer_progress_meter,
- create_verbose_logs,
- check_example_params
- )
- from judgeval.judges.utils import create_judge
- from judgeval.scorers import JudgevalScorer
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.prompts import *
-
- required_params = [
- ExampleParams.INPUT,
- ExampleParams.ACTUAL_OUTPUT,
- ExampleParams.EXPECTED_OUTPUT,
- ExampleParams.RETRIEVAL_CONTEXT,
- ]
-
- class ContextualRecallScorer(JudgevalScorer):
- def __init__(
- self,
- threshold: float = 0.5,
- model: Optional[Union[str, JudgevalJudge]] = None,
- include_reason: bool = True,
- async_mode: bool = True,
- strict_mode: bool = False,
- verbose_mode: bool = False,
- user: Optional[str] = None
- ):
- super().__init__(
- score_type=APIScorer.CONTEXTUAL_RECALL,
- threshold=1 if strict_mode else threshold,
- evaluation_model=None,
- include_reason=include_reason,
- async_mode=async_mode,
- strict_mode=strict_mode,
- verbose_mode=verbose_mode
- )
- self.user = user
- self.model, self.using_native_model = create_judge(model)
- self.evaluation_model = self.model.get_model_name()
-
- def score_example(
- self,
- example: Example,
- _show_indicator: bool = True,
- ) -> float:
- check_example_params(example, required_params, self)
-
- with scorer_progress_meter(self, display_meter=_show_indicator):
- if self.async_mode:
- loop = get_or_create_event_loop()
- loop.run_until_complete(
- self.a_score_example(example, _show_indicator=False)
- )
- else:
- self.verdicts: List[ContextualRecallVerdict] = (
- self._generate_verdicts(
- example.expected_output, example.retrieval_context
- )
- )
- self.score = self._calculate_score()
- self.reason = self._generate_reason(example.expected_output)
- self.success = self.score >= self.threshold
- self.verbose_logs = create_verbose_logs(
- self,
- steps=[
- f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
- f"Score: {self.score}\nReason: {self.reason}",
- ],
- )
- return self.score
-
- async def a_score_example(
- self,
- example: Example,
- _show_indicator: bool = True,
- ) -> float:
- check_example_params(example, required_params, self)
-
- with scorer_progress_meter(
- self,
- async_mode=True,
- display_meter=_show_indicator,
- ):
- self.verdicts: List[ContextualRecallVerdict] = (
- await self._a_generate_verdicts(
- example.expected_output, example.retrieval_context
- )
- )
- self.score = self._calculate_score()
- self.reason = await self._a_generate_reason(
- example.expected_output
- )
- self.success = self.score >= self.threshold
- self.verbose_logs = create_verbose_logs(
- self,
- steps=[
- f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
- f"Score: {self.score}\nReason: {self.reason}",
- ],
- )
- return self.score
-
- async def _a_generate_reason(self, expected_output: str):
- if self.include_reason is False:
- return None
-
- supportive_reasons = []
- unsupportive_reasons = []
- for idx, verdict in enumerate(self.verdicts):
- if verdict.verdict.lower() == "yes":
- supportive_reasons.append(f"Sentence {idx + 1}: {verdict.reason}")
- else:
- unsupportive_reasons.append(f"Sentence {idx + 1}: {verdict.reason}")
-
- prompt = ContextualRecallTemplate.generate_reason(
- expected_output=expected_output,
- supportive_reasons=supportive_reasons,
- unsupportive_reasons=unsupportive_reasons,
- score=format(self.score, ".2f"),
- )
-
- if self.using_native_model:
- res = await self.model.a_generate(prompt)
- data = parse_response_json(res, self)
- return data["reason"]
- else:
- try:
- res: Reason = await self.model.a_generate(prompt, schema=Reason)
- return res.reason
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = parse_response_json(res, self)
- return data["reason"]
-
- def _generate_reason(self, expected_output: str):
- if self.include_reason is False:
- return None
-
- supportive_reasons = []
- unsupportive_reasons = []
- for verdict in self.verdicts:
- if verdict.verdict.lower() == "yes":
- supportive_reasons.append(verdict.reason)
- else:
- unsupportive_reasons.append(verdict.reason)
-
- prompt = ContextualRecallTemplate.generate_reason(
- expected_output=expected_output,
- supportive_reasons=supportive_reasons,
- unsupportive_reasons=unsupportive_reasons,
- score=format(self.score, ".2f"),
- )
-
- if self.using_native_model:
- res = self.model.generate(prompt)
- data = parse_response_json(res, self)
- return data["reason"]
- else:
- try:
- res: Reason = self.model.generate(prompt, schema=Reason)
- return res.reason
- except TypeError:
- res = self.model.generate(prompt)
- data = parse_response_json(res, self)
- return data["reason"]
-
- def _calculate_score(self):
- number_of_verdicts = len(self.verdicts)
- if number_of_verdicts == 0:
- return 0
-
- justified_sentences = 0
- for verdict in self.verdicts:
- if verdict.verdict.lower() == "yes":
- justified_sentences += 1
-
- score = justified_sentences / number_of_verdicts
- return 0 if self.strict_mode and score < self.threshold else score
-
- async def _a_generate_verdicts(
- self, expected_output: str, retrieval_context: List[str]
- ) -> List[ContextualRecallVerdict]:
- prompt = ContextualRecallTemplate.generate_verdicts(
- expected_output=expected_output, retrieval_context=retrieval_context
- )
- if self.using_native_model:
- res = await self.model.a_generate(prompt)
- data = parse_response_json(res, self)
- verdicts = [
- ContextualRecallVerdict(**item) for item in data["verdicts"]
- ]
- return verdicts
- else:
- try:
- res: Verdicts = await self.model.a_generate(
- prompt, schema=Verdicts
- )
- verdicts: Verdicts = [item for item in res.verdicts]
- return verdicts
- except TypeError:
- res = await self.model.a_generate(prompt)
- data = parse_response_json(res, self)
- verdicts = [
- ContextualRecallVerdict(**item) for item in data["verdicts"]
- ]
- return verdicts
-
- def _generate_verdicts(
- self, expected_output: str, retrieval_context: List[str]
- ) -> List[ContextualRecallVerdict]:
- prompt = ContextualRecallTemplate.generate_verdicts(
- expected_output=expected_output, retrieval_context=retrieval_context
- )
- if self.using_native_model:
- res = self.model.generate(prompt)
- data = parse_response_json(res, self)
- verdicts = [
- ContextualRecallVerdict(**item) for item in data["verdicts"]
- ]
- return verdicts
- else:
- try:
- res: Verdicts = self.model.generate(prompt, schema=Verdicts)
- verdicts: Verdicts = [item for item in res.verdicts]
- return verdicts
- except TypeError:
- res = self.model.generate(prompt)
- data = parse_response_json(res, self)
- verdicts = [
- ContextualRecallVerdict(**item) for item in data["verdicts"]
- ]
- return verdicts
-
- def _success_check(self) -> bool:
- if self.error is not None:
- self.success = False
- else:
- try:
- self.success = self.score >= self.threshold
- except:
- self.success = False
- return self.success
-
- @property
- def __name__(self):
- return "Contextual Recall"