judgeval 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- judgeval/__init__.py +0 -71
- judgeval/common/tracer.py +57 -31
- judgeval/constants.py +1 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/scorer_data.py +2 -2
- judgeval/evaluation_run.py +16 -15
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/base_judge.py +1 -1
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +2 -2
- judgeval/judges/together_judge.py +2 -2
- judgeval/judges/utils.py +4 -4
- judgeval/judgment_client.py +67 -15
- judgeval/run_evaluation.py +79 -14
- judgeval/scorers/__init__.py +8 -4
- judgeval/scorers/api_scorer.py +64 -0
- judgeval/scorers/base_scorer.py +3 -2
- judgeval/scorers/exceptions.py +11 -0
- judgeval/scorers/{custom_scorer.py → judgeval_scorer.py} +9 -5
- judgeval/scorers/judgeval_scorers/__init__.py +132 -9
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
- judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py} +2 -2
- judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py} +2 -2
- judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py} +2 -2
- judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py} +7 -7
- judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py} +2 -2
- judgeval/scorers/judgeval_scorers/{tool_correctness.py → api_scorers/tool_correctness.py} +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
- judgeval/scorers/prompt_scorer.py +4 -4
- judgeval/scorers/score.py +14 -14
- judgeval/scorers/utils.py +40 -6
- {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/METADATA +1 -1
- judgeval-0.0.4.dist-info/RECORD +78 -0
- judgeval-0.0.3.dist-info/RECORD +0 -46
- {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/WHEEL +0 -0
- {judgeval-0.0.3.dist-info → judgeval-0.0.4.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py
@@ -0,0 +1,259 @@
+from typing import Optional, List, Union
+
+from judgeval.judges import JudgevalJudge
+from judgeval.judges.utils import create_judge
+from judgeval.data import Example, ExampleParams
+from judgeval.scorers import JudgevalScorer
+from judgeval.scorers.utils import (
+    get_or_create_event_loop,
+    parse_response_json,
+    scorer_progress_meter,
+    create_verbose_logs,
+    check_example_params,
+)
+from judgeval.scorers.judgeval_scorers.local_implementations.contextual_precision.prompts import *
+
+required_params = [
+    ExampleParams.INPUT,
+    ExampleParams.ACTUAL_OUTPUT,
+    ExampleParams.RETRIEVAL_CONTEXT,
+    ExampleParams.EXPECTED_OUTPUT,
+]
+
+class ContextualPrecisionScorer(JudgevalScorer):
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, JudgevalJudge]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.include_reason = include_reason
+        self.model, self.using_native_model = create_judge(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+
+    def score_example(
+        self,
+        example: Example,
+        _show_indicator: bool = True,
+    ) -> float:
+        check_example_params(example, required_params, self)
+
+        with scorer_progress_meter(self, display_meter=_show_indicator):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_score_example(example, _show_indicator=False)
+                )
+            else:
+                self.verdicts: List[ContextualPrecisionVerdict] = (
+                    self._generate_verdicts(
+                        example.input,
+                        example.expected_output,
+                        example.retrieval_context,
+                    )
+                )
+                self.score = self._calculate_score()
+                self.reason = self._generate_reason(example.input)
+                self.success = self.score >= self.threshold
+                self.verbose_logs = create_verbose_logs(
+                    self,
+                    steps=[
+                        # Convert to dict for serialization purposes
+                        f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+            return self.score
+
+    async def a_score_example(
+        self,
+        example: Example,
+        _show_indicator: bool = True,
+    ) -> float:
+        check_example_params(example, required_params, self)
+
+        with scorer_progress_meter(
+            self,
+            async_mode=True,
+            display_meter=_show_indicator,
+        ):
+            self.verdicts: List[ContextualPrecisionVerdict] = (
+                await self._a_generate_verdicts(
+                    example.input,
+                    example.expected_output,
+                    example.retrieval_context,
+                )
+            )
+            self.score = self._calculate_score()
+            self.reason = await self._a_generate_reason(example.input)
+            self.success = self.score >= self.threshold
+            self.verbose_logs = create_verbose_logs(
+                self,
+                steps=[
+                    # Convert to dict for serialization purposes
+                    f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
+                    f"Score: {self.score}\nReason: {self.reason}",
+                ],
+            )
+            return self.score
+
+    async def _a_generate_reason(self, input: str):
+        if self.include_reason is False:
+            return None
+
+        retrieval_contexts_verdicts = [
+            {"verdict": verdict.verdict, "reasons": verdict.reason}
+            for verdict in self.verdicts
+        ]
+        prompt = ContextualPrecisionTemplate.generate_reason(
+            input=input,
+            verdicts=retrieval_contexts_verdicts,
+            score=format(self.score, ".2f"),
+        )
+
+        if self.using_native_model:
+            res = await self.model.a_generate(prompt)
+            data = parse_response_json(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = await self.model.a_generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = parse_response_json(res, self)
+                return data["reason"]
+
+    def _generate_reason(self, input: str):
+        if self.include_reason is False:
+            return None
+
+        retrieval_contexts_verdicts = [
+            {"verdict": verdict.verdict, "reasons": verdict.reason}
+            for verdict in self.verdicts
+        ]
+        prompt = ContextualPrecisionTemplate.generate_reason(
+            input=input,
+            verdicts=retrieval_contexts_verdicts,
+            score=format(self.score, ".2f"),
+        )
+
+        if self.using_native_model:
+            res = self.model.generate(prompt)
+            data = parse_response_json(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = self.model.generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = parse_response_json(res, self)
+                return data["reason"]
+
+    async def _a_generate_verdicts(
+        self, input: str, expected_output: str, retrieval_context: List[str]
+    ) -> List[ContextualPrecisionVerdict]:
+        prompt = ContextualPrecisionTemplate.generate_verdicts(
+            input=input,
+            expected_output=expected_output,
+            retrieval_context=retrieval_context,
+        )
+        if self.using_native_model:
+            res = await self.model.a_generate(prompt)
+            data = parse_response_json(res, self)
+            verdicts = [
+                ContextualPrecisionVerdict(**item) for item in data["verdicts"]
+            ]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = await self.model.a_generate(
+                    prompt, schema=Verdicts
+                )
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = parse_response_json(res, self)
+                verdicts = [
+                    ContextualPrecisionVerdict(**item)
+                    for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _generate_verdicts(
+        self, input: str, expected_output: str, retrieval_context: List[str]
+    ) -> List[ContextualPrecisionVerdict]:
+        prompt = ContextualPrecisionTemplate.generate_verdicts(
+            input=input,
+            expected_output=expected_output,
+            retrieval_context=retrieval_context,
+        )
+        if self.using_native_model:
+            res = self.model.generate(prompt)
+            data = parse_response_json(res, self)
+            verdicts = [
+                ContextualPrecisionVerdict(**item) for item in data["verdicts"]
+            ]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = parse_response_json(res, self)
+                verdicts = [
+                    ContextualPrecisionVerdict(**item)
+                    for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _calculate_score(self):
+        number_of_verdicts = len(self.verdicts)
+        if number_of_verdicts == 0:
+            return 0
+
+        # Convert verdicts to a binary list where 'yes' is 1 and others are 0
+        node_verdicts = [
+            1 if v.verdict.strip().lower() == "yes" else 0
+            for v in self.verdicts
+        ]
+
+        sum_weighted_precision_at_k = 0.0
+        relevant_nodes_count = 0
+        for k, is_relevant in enumerate(node_verdicts, start=1):
+            # If the item is relevant, update the counter and add the weighted precision at k to the sum
+            if is_relevant:
+                relevant_nodes_count += 1
+                precision_at_k = relevant_nodes_count / k
+                sum_weighted_precision_at_k += precision_at_k * is_relevant
+
+        if relevant_nodes_count == 0:
+            return 0
+        # Calculate weighted cumulative precision
+        score = sum_weighted_precision_at_k / relevant_nodes_count
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    def _success_check(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Contextual Precision"
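Note on `_calculate_score` in the hunk above: it computes a weighted precision@k over the ordered retrieval context, so relevant documents that appear earlier in the ranking contribute more to the score. The following standalone sketch (illustrative only, not part of the package diff, using a made-up verdict list) walks through the same arithmetic:

    # Hypothetical binary verdicts for three retrieved documents, in rank order:
    # the 1st and 3rd were judged relevant ("yes" -> 1), the 2nd was not ("no" -> 0).
    node_verdicts = [1, 0, 1]

    sum_weighted_precision_at_k = 0.0
    relevant_nodes_count = 0
    for k, is_relevant in enumerate(node_verdicts, start=1):
        if is_relevant:
            relevant_nodes_count += 1
            # precision@k = relevant documents seen so far / rank k
            sum_weighted_precision_at_k += relevant_nodes_count / k

    score = sum_weighted_precision_at_k / relevant_nodes_count
    print(round(score, 2))  # (1/1 + 2/3) / 2 ~= 0.83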
judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py
@@ -0,0 +1,106 @@
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+
+class ContextualPrecisionVerdict(BaseModel):
+    verdict: str
+    reason: str
+
+
+class Verdicts(BaseModel):
+    verdicts: List[ContextualPrecisionVerdict]
+
+
+class Reason(BaseModel):
+    reason: str
+
+
+class ContextualPrecisionTemplate:
+    @staticmethod
+    def generate_verdicts(input, expected_output, retrieval_context):
+        return f"""==== TASK INSTRUCTIONS ====\nGiven the input, expected output, and retrieval context, your task is to determine whether each document in the retrieval context was relevant to arrive at the expected output.
+You should reason through the documents in the retrieval context thoroughly, and then generate a list of JSON objects representing your decision.
+
+==== FORMAT INSTRUCTIONS ====\nIMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON. These JSON only contain the `verdict` key that outputs only 'yes' or 'no', and a `reason` key to justify the verdict. In your reason, aim to quote parts of the context to support your verdict.
+
+==== EXAMPLE ====
+Example Input: "What are the main symptoms of COVID-19?"
+Example Expected Output: "The main symptoms of COVID-19 include fever, cough, fatigue, and loss of taste or smell."
+Example Retrieval Context: ["Common COVID-19 symptoms include fever and dry cough", "Loss of taste and smell are distinctive COVID-19 symptoms", "The first COVID-19 case was reported in Wuhan", "My friend's birthday party was fun last weekend"]
+
+Example output JSON:
+{{
+    "verdicts": [
+        {{
+            "verdict": "yes",
+            "reason": "The text directly lists key COVID-19 symptoms including 'fever and dry cough' which are part of the main symptoms."
+        }},
+        {{
+            "verdict": "yes",
+            "reason": "The text mentions 'loss of taste and smell' which are distinctive symptoms of COVID-19 that should be included."
+        }},
+        {{
+            "verdict": "no",
+            "reason": "While related to COVID-19, the origin of the first case is not relevant to listing the main symptoms."
+        }},
+        {{
+            "verdict": "no",
+            "reason": "A personal anecdote about a birthday party has no relevance to COVID-19 symptoms."
+        }}
+    ]
+}}
+
+Your task is to generate a verdict for each document in the retrieval context, so the number of 'verdicts' SHOULD BE EXACTLY EQUAL to that of the retrievalcontexts.
+
+==== YOUR TURN ====
+Input:
+{input}
+
+Expected output:
+{expected_output}
+
+Retrieval Context:
+{retrieval_context}
+
+JSON:
+"""
+
+    @staticmethod
+    def generate_reason(input, verdicts, score):
+        return f"""==== TASK INSTRUCTIONS ====\nYou will be provided with an input, retrieval contexts, and a contextual precision score. Your task is to provide a CLEAR and CONCISE reason for the score.
+You should explain why the score is not higher, but also the current score is reasonable. Here's a further breakdown of the task:
+
+1. input (str) is a task or question that the model attempted to solve
+2. retrieval contexts (list[dict]) is a list of JSON with the following keys:
+- `verdict` (str): either 'yes' or 'no', which represents whether the corresponding document in the retrieval context is relevant to the input.
+- `reason` (str): a reason for the verdict.
+3. The contextual precision score is a float between 0 and 1 and represents if the relevant documents are ranked higher than irrelevant ones in the retrieval context.
+The ranking can be inferred by the order of the retrieval documents: retrieval contexts is given IN THE ORDER OF THE DOCUMENT RANKINGS.
+This implies that the score will be higher if the relevant documents are ranked higher (appears earlier in the list) than irrelevant ones.
+
+==== FORMAT INSTRUCTIONS ====\nIMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason for the contextual precision score.
+Example JSON:
+{{
+    "reason": "The score is <contextual_precision_score> because <your_reason>."
+}}
+
+
+DO NOT mention 'verdict' in your reason, but instead phrase it as irrelevant nodes. The term 'verdict' are just here for you to understand the broader scope of things.
+Also DO NOT mention there are `reason` fields in the retrieval contexts you are presented with, instead just use the information in the `reason` field.
+In your reason, you MUST USE the `reason`, QUOTES in the 'reason', and the node RANK (starting from 1, eg. first node) to explain why the 'no' verdicts should be ranked lower than the 'yes' verdicts.
+When addressing nodes, make it explicit that it is nodes in retrieval context.
+If the score is 1, keep it short and say something positive with an upbeat tone (but don't overdo it otherwise it gets annoying).
+
+==== YOUR TURN ====
+Contextual Precision Score:
+{score}
+
+Input:
+{input}
+
+Retrieval Contexts:
+{verdicts}
+
+JSON:
+"""
+
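The pydantic models at the top of this prompts module describe the JSON that `generate_verdicts` asks the judge model to return, and the scorer rebuilds `ContextualPrecisionVerdict` objects from that JSON as shown in the scorer file above. A minimal sketch of that parsing step (illustrative only, not part of the diff; the judge response below is made up, and the import path is taken from the new module in this release):

    import json

    from judgeval.scorers.judgeval_scorers.local_implementations.contextual_precision.prompts import (
        ContextualPrecisionVerdict,
    )

    # Hypothetical raw output from the judge model, in the requested format
    raw = '{"verdicts": [{"verdict": "yes", "reason": "Quotes the symptom list."}, {"verdict": "no", "reason": "Unrelated anecdote."}]}'

    data = json.loads(raw)
    verdicts = [ContextualPrecisionVerdict(**item) for item in data["verdicts"]]
    print([v.model_dump() for v in verdicts])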
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py
@@ -0,0 +1,249 @@
+from typing import Optional, List, Union
+
+from judgeval.scorers.utils import (
+    get_or_create_event_loop,
+    parse_response_json,
+    scorer_progress_meter,
+    create_verbose_logs,
+    check_example_params
+)
+from judgeval.judges.utils import create_judge
+from judgeval.scorers import JudgevalScorer
+from judgeval.judges import JudgevalJudge
+from judgeval.judges.utils import create_judge
+from judgeval.data import Example, ExampleParams
+from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.prompts import *
+
+required_params = [
+    ExampleParams.INPUT,
+    ExampleParams.ACTUAL_OUTPUT,
+    ExampleParams.EXPECTED_OUTPUT,
+    ExampleParams.RETRIEVAL_CONTEXT,
+]
+
+class ContextualRecallScorer(JudgevalScorer):
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, JudgevalJudge]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        user: Optional[str] = None
+    ):
+        self.user = user
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = create_judge(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+
+    def score_example(
+        self,
+        example: Example,
+        _show_indicator: bool = True,
+    ) -> float:
+        check_example_params(example, required_params, self)
+
+        with scorer_progress_meter(self, display_meter=_show_indicator):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_score_example(example, _show_indicator=False)
+                )
+            else:
+                self.verdicts: List[ContextualRecallVerdict] = (
+                    self._generate_verdicts(
+                        example.expected_output, example.retrieval_context
+                    )
+                )
+                self.score = self._calculate_score()
+                self.reason = self._generate_reason(example.expected_output)
+                self.success = self.score >= self.threshold
+                self.verbose_logs = create_verbose_logs(
+                    self,
+                    steps=[
+                        f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+            return self.score
+
+    async def a_score_example(
+        self,
+        example: Example,
+        _show_indicator: bool = True,
+    ) -> float:
+        check_example_params(example, required_params, self)
+
+        with scorer_progress_meter(
+            self,
+            async_mode=True,
+            display_meter=_show_indicator,
+        ):
+            self.verdicts: List[ContextualRecallVerdict] = (
+                await self._a_generate_verdicts(
+                    example.expected_output, example.retrieval_context
+                )
+            )
+            self.score = self._calculate_score()
+            self.reason = await self._a_generate_reason(
+                example.expected_output
+            )
+            self.success = self.score >= self.threshold
+            self.verbose_logs = create_verbose_logs(
+                self,
+                steps=[
+                    f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
+                    f"Score: {self.score}\nReason: {self.reason}",
+                ],
+            )
+            return self.score
+
+    async def _a_generate_reason(self, expected_output: str):
+        if self.include_reason is False:
+            return None
+
+        supportive_reasons = []
+        unsupportive_reasons = []
+        for idx, verdict in enumerate(self.verdicts):
+            if verdict.verdict.lower() == "yes":
+                supportive_reasons.append(f"Sentence {idx + 1}: {verdict.reason}")
+            else:
+                unsupportive_reasons.append(f"Sentence {idx + 1}: {verdict.reason}")
+
+        prompt = ContextualRecallTemplate.generate_reason(
+            expected_output=expected_output,
+            supportive_reasons=supportive_reasons,
+            unsupportive_reasons=unsupportive_reasons,
+            score=format(self.score, ".2f"),
+        )
+
+        if self.using_native_model:
+            res = await self.model.a_generate(prompt)
+            data = parse_response_json(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = await self.model.a_generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = parse_response_json(res, self)
+                return data["reason"]
+
+    def _generate_reason(self, expected_output: str):
+        if self.include_reason is False:
+            return None
+
+        supportive_reasons = []
+        unsupportive_reasons = []
+        for verdict in self.verdicts:
+            if verdict.verdict.lower() == "yes":
+                supportive_reasons.append(verdict.reason)
+            else:
+                unsupportive_reasons.append(verdict.reason)
+
+        prompt = ContextualRecallTemplate.generate_reason(
+            expected_output=expected_output,
+            supportive_reasons=supportive_reasons,
+            unsupportive_reasons=unsupportive_reasons,
+            score=format(self.score, ".2f"),
+        )
+
+        if self.using_native_model:
+            res = self.model.generate(prompt)
+            data = parse_response_json(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = self.model.generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = parse_response_json(res, self)
+                return data["reason"]
+
+    def _calculate_score(self):
+        number_of_verdicts = len(self.verdicts)
+        if number_of_verdicts == 0:
+            return 0
+
+        justified_sentences = 0
+        for verdict in self.verdicts:
+            if verdict.verdict.lower() == "yes":
+                justified_sentences += 1
+
+        score = justified_sentences / number_of_verdicts
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    async def _a_generate_verdicts(
+        self, expected_output: str, retrieval_context: List[str]
+    ) -> List[ContextualRecallVerdict]:
+        prompt = ContextualRecallTemplate.generate_verdicts(
+            expected_output=expected_output, retrieval_context=retrieval_context
+        )
+        if self.using_native_model:
+            res = await self.model.a_generate(prompt)
+            data = parse_response_json(res, self)
+            verdicts = [
+                ContextualRecallVerdict(**item) for item in data["verdicts"]
+            ]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = await self.model.a_generate(
+                    prompt, schema=Verdicts
+                )
+                verdicts: Verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = parse_response_json(res, self)
+                verdicts = [
+                    ContextualRecallVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _generate_verdicts(
+        self, expected_output: str, retrieval_context: List[str]
+    ) -> List[ContextualRecallVerdict]:
+        prompt = ContextualRecallTemplate.generate_verdicts(
+            expected_output=expected_output, retrieval_context=retrieval_context
+        )
+        if self.using_native_model:
+            res = self.model.generate(prompt)
+            data = parse_response_json(res, self)
+            verdicts = [
+                ContextualRecallVerdict(**item) for item in data["verdicts"]
+            ]
+            return verdicts
+        else:
+            try:
+                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
+                verdicts: Verdicts = [item for item in res.verdicts]
+                return verdicts
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = parse_response_json(res, self)
+                verdicts = [
+                    ContextualRecallVerdict(**item) for item in data["verdicts"]
+                ]
+                return verdicts
+
+    def _success_check(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Contextual Recall"
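For comparison with the precision scorer, `ContextualRecallScorer._calculate_score` above is a plain ratio: the fraction of per-sentence verdicts that come back "yes", i.e. sentences of the expected output that are attributable to the retrieval context. A standalone sketch of the same computation (illustrative only, not part of the diff, using a hypothetical verdict list):

    # Hypothetical per-sentence verdicts: can each sentence of the expected output
    # be attributed to the retrieval context?
    verdicts = ["yes", "no", "yes", "yes"]

    justified_sentences = sum(1 for v in verdicts if v.lower() == "yes")
    score = justified_sentences / len(verdicts)
    print(score)  # 3 supported sentences out of 4 -> 0.75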