judgeval 0.0.32__py3-none-any.whl → 0.0.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/s3_storage.py +93 -0
- judgeval/common/tracer.py +612 -123
- judgeval/data/sequence.py +4 -10
- judgeval/judgment_client.py +25 -86
- judgeval/rules.py +4 -7
- judgeval/run_evaluation.py +1 -1
- judgeval/scorers/__init__.py +4 -4
- judgeval/scorers/judgeval_scorers/__init__.py +0 -176
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
- judgeval-0.0.33.dist-info/RECORD +63 -0
- judgeval/scorers/base_scorer.py +0 -58
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
- judgeval-0.0.32.dist-info/RECORD +0 -97
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py (removed)
@@ -1,254 +0,0 @@
from typing import Optional, List, Union

from judgeval.constants import APIScorer
from judgeval.scorers.utils import (
    get_or_create_event_loop,
    parse_response_json,
    scorer_progress_meter,
    create_verbose_logs,
    check_example_params
)
from judgeval.judges.utils import create_judge
from judgeval.scorers import JudgevalScorer
from judgeval.judges import JudgevalJudge
from judgeval.judges.utils import create_judge
from judgeval.data import Example, ExampleParams
from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.prompts import *

required_params = [
    ExampleParams.INPUT,
    ExampleParams.ACTUAL_OUTPUT,
    ExampleParams.EXPECTED_OUTPUT,
    ExampleParams.RETRIEVAL_CONTEXT,
]

class ContextualRecallScorer(JudgevalScorer):
    def __init__(
        self,
        threshold: float = 0.5,
        model: Optional[Union[str, JudgevalJudge]] = None,
        include_reason: bool = True,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
        user: Optional[str] = None
    ):
        super().__init__(
            score_type=APIScorer.CONTEXTUAL_RECALL,
            threshold=1 if strict_mode else threshold,
            evaluation_model=None,
            include_reason=include_reason,
            async_mode=async_mode,
            strict_mode=strict_mode,
            verbose_mode=verbose_mode
        )
        self.user = user
        self.model, self.using_native_model = create_judge(model)
        self.evaluation_model = self.model.get_model_name()

    def score_example(
        self,
        example: Example,
        _show_indicator: bool = True,
    ) -> float:
        check_example_params(example, required_params, self)

        with scorer_progress_meter(self, display_meter=_show_indicator):
            if self.async_mode:
                loop = get_or_create_event_loop()
                loop.run_until_complete(
                    self.a_score_example(example, _show_indicator=False)
                )
            else:
                self.verdicts: List[ContextualRecallVerdict] = (
                    self._generate_verdicts(
                        example.expected_output, example.retrieval_context
                    )
                )
                self.score = self._calculate_score()
                self.reason = self._generate_reason(example.expected_output)
                self.success = self.score >= self.threshold
                self.verbose_logs = create_verbose_logs(
                    self,
                    steps=[
                        f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
                        f"Score: {self.score}\nReason: {self.reason}",
                    ],
                )
            return self.score

    async def a_score_example(
        self,
        example: Example,
        _show_indicator: bool = True,
    ) -> float:
        check_example_params(example, required_params, self)

        with scorer_progress_meter(
            self,
            async_mode=True,
            display_meter=_show_indicator,
        ):
            self.verdicts: List[ContextualRecallVerdict] = (
                await self._a_generate_verdicts(
                    example.expected_output, example.retrieval_context
                )
            )
            self.score = self._calculate_score()
            self.reason = await self._a_generate_reason(
                example.expected_output
            )
            self.success = self.score >= self.threshold
            self.verbose_logs = create_verbose_logs(
                self,
                steps=[
                    f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )
            return self.score

    async def _a_generate_reason(self, expected_output: str):
        if self.include_reason is False:
            return None

        supportive_reasons = []
        unsupportive_reasons = []
        for idx, verdict in enumerate(self.verdicts):
            if verdict.verdict.lower() == "yes":
                supportive_reasons.append(f"Sentence {idx + 1}: {verdict.reason}")
            else:
                unsupportive_reasons.append(f"Sentence {idx + 1}: {verdict.reason}")

        prompt = ContextualRecallTemplate.generate_reason(
            expected_output=expected_output,
            supportive_reasons=supportive_reasons,
            unsupportive_reasons=unsupportive_reasons,
            score=format(self.score, ".2f"),
        )

        if self.using_native_model:
            res = await self.model.a_generate(prompt)
            data = parse_response_json(res, self)
            return data["reason"]
        else:
            try:
                res: Reason = await self.model.a_generate(prompt, schema=Reason)
                return res.reason
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = parse_response_json(res, self)
                return data["reason"]

    def _generate_reason(self, expected_output: str):
        if self.include_reason is False:
            return None

        supportive_reasons = []
        unsupportive_reasons = []
        for verdict in self.verdicts:
            if verdict.verdict.lower() == "yes":
                supportive_reasons.append(verdict.reason)
            else:
                unsupportive_reasons.append(verdict.reason)

        prompt = ContextualRecallTemplate.generate_reason(
            expected_output=expected_output,
            supportive_reasons=supportive_reasons,
            unsupportive_reasons=unsupportive_reasons,
            score=format(self.score, ".2f"),
        )

        if self.using_native_model:
            res = self.model.generate(prompt)
            data = parse_response_json(res, self)
            return data["reason"]
        else:
            try:
                res: Reason = self.model.generate(prompt, schema=Reason)
                return res.reason
            except TypeError:
                res = self.model.generate(prompt)
                data = parse_response_json(res, self)
                return data["reason"]

    def _calculate_score(self):
        number_of_verdicts = len(self.verdicts)
        if number_of_verdicts == 0:
            return 0

        justified_sentences = 0
        for verdict in self.verdicts:
            if verdict.verdict.lower() == "yes":
                justified_sentences += 1

        score = justified_sentences / number_of_verdicts
        return 0 if self.strict_mode and score < self.threshold else score

    async def _a_generate_verdicts(
        self, expected_output: str, retrieval_context: List[str]
    ) -> List[ContextualRecallVerdict]:
        prompt = ContextualRecallTemplate.generate_verdicts(
            expected_output=expected_output, retrieval_context=retrieval_context
        )
        if self.using_native_model:
            res = await self.model.a_generate(prompt)
            data = parse_response_json(res, self)
            verdicts = [
                ContextualRecallVerdict(**item) for item in data["verdicts"]
            ]
            return verdicts
        else:
            try:
                res: Verdicts = await self.model.a_generate(
                    prompt, schema=Verdicts
                )
                verdicts: Verdicts = [item for item in res.verdicts]
                return verdicts
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = parse_response_json(res, self)
                verdicts = [
                    ContextualRecallVerdict(**item) for item in data["verdicts"]
                ]
                return verdicts

    def _generate_verdicts(
        self, expected_output: str, retrieval_context: List[str]
    ) -> List[ContextualRecallVerdict]:
        prompt = ContextualRecallTemplate.generate_verdicts(
            expected_output=expected_output, retrieval_context=retrieval_context
        )
        if self.using_native_model:
            res = self.model.generate(prompt)
            data = parse_response_json(res, self)
            verdicts = [
                ContextualRecallVerdict(**item) for item in data["verdicts"]
            ]
            return verdicts
        else:
            try:
                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
                verdicts: Verdicts = [item for item in res.verdicts]
                return verdicts
            except TypeError:
                res = self.model.generate(prompt)
                data = parse_response_json(res, self)
                verdicts = [
                    ContextualRecallVerdict(**item) for item in data["verdicts"]
                ]
                return verdicts

    def _success_check(self) -> bool:
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold
            except:
                self.success = False
        return self.success

    @property
    def __name__(self):
        return "Contextual Recall"
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py (removed)
@@ -1,142 +0,0 @@
from typing import List
from pydantic import BaseModel


class ContextualRecallVerdict(BaseModel):
    verdict: str
    reason: str


class Verdicts(BaseModel):
    verdicts: List[ContextualRecallVerdict]


class Reason(BaseModel):
    reason: str


class ContextualRecallTemplate:

    @staticmethod
    def generate_verdicts(expected_output, retrieval_context):
        return f"""
==== TASK INSTRUCTIONS ====
You will be provided with an expected output and a retrieval context (list of retrieved documents). Your task is to take each sentence in the expected output and determine whether the sentence is ATTRIBUTABLE or RELEVANT to ANY PART of the retrieval context.

==== FORMATTING YOUR ANSWER ====
Please format your answer as a list of JSON objects, each with two keys: `verdict` and `reason`.
The `verdict` key should STRICTLY be 'yes' or 'no'. You should answer 'yes' if the sentence can be attributed/is relevant to ANY PART(S) of the retrieval context. If not, you should answer 'no'.
The `reason` key should provide a justification of your verdict. In the justification, you should aim to include references to the document(s) in the retrieval context (eg., 1st document, and 2nd document in the retrieval context) that is attributed/relevant to the expected output sentence.
Please also AIM TO CITE the specific part of the retrieval context to justify your verdict, but **be extremely concise! Cut short the quote with an ellipsis if possible**.

Here's an example of formatting your answer:
{{
    "verdicts": [
        {{
            "verdict": "yes",
            "reason": "..."
        }},
        ...
    ]
}}

==== EXAMPLE ====
Expected Output:
The Earth's climate has warmed significantly over the past century. This warming is primarily caused by human activities like burning fossil fuels. Today's weather was sunny and warm.

Retrieval Context:
["Global temperatures have risen by approximately 1.1°C since pre-industrial times, with most of this increase occurring in the past 100 years.",
"Scientific consensus shows that greenhouse gas emissions from human activities, particularly the burning of coal, oil and gas, are the main driver of observed climate change."]

Example Response:
{{
    "verdicts": [
        {{
            "verdict": "yes",
            "reason": "The 1st document directly confirms this, stating 'temperatures have risen by approximately 1.1°C...in the past 100 years'"
        }},
        {{
            "verdict": "yes",
            "reason": "The 2nd document explicitly states that 'greenhouse gas emissions from human activities, particularly the burning of...fossil fuels' drive climate change"
        }},
        {{
            "verdict": "no",
            "reason": "Neither document contains information about today's specific weather conditions"
        }}
    ]
}}

Since your task is to generate a verdict for each sentence, the number of 'verdicts' SHOULD BE EXACTLY EQUAL to the number of sentences in of `expected output`.
**

==== YOUR TURN ====
Expected Output:
{expected_output}

Retrieval Context:
{retrieval_context}

JSON:
"""
    @staticmethod
    def generate_reason(
        expected_output, supportive_reasons, unsupportive_reasons, score
    ):
        return f"""
==== PROBLEM SETUP ====
You will be provided with an expected output, a list of supportive reasons, a list of unsupportive reasons, and a contextual recall score. Let's break down each input component:
- expected output: A text generated by a language model to answer a question/solve a task.
- supportive reasons: A list of reasons why a specific sentence in the expected output can be attributed/is relevant to any part of the retrieval context (a list of documents retrieved in a RAG pipeline)
- unsupportive reasons: A list of reasons why a specific sentence in the expected output cannot be attributed/is not relevant to any part of the retrieval context
**NOTE**: The reasons are provided in the form of "Sentence <number>: <reason>", where <number> is the sentence number in the expected output.
- contextual recall score: A score between 0 and 1 (closer to 1 the better) representing how much of the expected output can be attributed/is relevant to any part of the retrieval context.
The point of this score is to measure how well the retriever in a RAG pipeline operates, retrieving relevant documents that should back the expected output of the RAG generator.

==== TASK INSTRUCTIONS ====
Given these inputs, summarize a CONCISE and CLEAR reason for the value of the contextual recall score. Remember, the score is a measure of how well the retriever in a RAG pipeline operates, retrieving relevant documents that should back the expected output of the RAG generator.
In your reason, you should reference the supportive/unsupportive reasons by their sentence number to justify the score. Make specific references to the retrieval context in your reason if applicable.

==== FORMATTING YOUR ANSWER ====
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{{
    "reason": "The score is <contextual_recall_score> because <your_reason>."
}}

DO NOT mention 'supportive reasons' and 'unsupportive reasons' in your reason, these terms are just here for you to understand the broader scope of things.
If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).

==== EXAMPLE ====
Expected Output:
The Earth's climate has warmed significantly over the past century. This warming is primarily caused by human activities like burning fossil fuels. Today's weather was sunny and warm.

Supportive Reasons:
Sentence 1: The first document confirms this by stating global temperatures have risen by 1.1°C in the past 100 years
Sentence 2: The second document directly states that human activities and fossil fuel burning drive climate change

Unsupportive Reasons:
Sentence 3: Neither document contains information about today's specific weather conditions

Contextual Recall Score:
0.67

Example Response:
{{
    "reason": "The score is 0.67 because while sentences 1 and 2 are well-supported by the retrieval context with specific temperature data and human activity impacts, sentence 3 about today's weather has no backing in the provided documents."
}}

==== YOUR TURN ====
Contextual Recall Score:
{score}

Expected Output:
{expected_output}

Supportive Reasons:
{supportive_reasons}

Unsupportive Reasons:
{unsupportive_reasons}

JSON:
"""
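
Because the templates above are plain f-strings and the response schema is ordinary pydantic, the verdict round-trip can be exercised in isolation. The sketch below is illustrative only (not part of the diff); the sample judge reply is invented to mirror the example embedded in the template, and the import path is the pre-0.0.33 layout.

```python
# Illustrative sketch: render the verdict prompt and validate a judge reply against the schema.
import json
from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.prompts import (
    ContextualRecallTemplate,
    Verdicts,
)

prompt = ContextualRecallTemplate.generate_verdicts(
    expected_output="The Earth's climate has warmed significantly over the past century.",
    retrieval_context=["Global temperatures have risen by approximately 1.1°C since pre-industrial times."],
)

# A well-formed reply follows the JSON shape the template asks for (this one is made up):
raw = '{"verdicts": [{"verdict": "yes", "reason": "The 1st document states temperatures rose ~1.1°C."}]}'
verdicts = Verdicts(**json.loads(raw))          # equivalently Verdicts.model_validate_json(raw) on pydantic v2
print([v.verdict for v in verdicts.verdicts])   # ['yes']
```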
judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py (removed)
@@ -1,245 +0,0 @@
from typing import Optional, List, Union
import asyncio

from judgeval.constants import APIScorer
from judgeval.scorers.utils import (get_or_create_event_loop,
                                    scorer_progress_meter,
                                    create_verbose_logs,
                                    parse_response_json,
                                    check_example_params
                                    )
from judgeval.scorers import JudgevalScorer
from judgeval.judges import JudgevalJudge
from judgeval.judges.utils import create_judge
from judgeval.data import Example, ExampleParams
from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.prompts import *


required_params = [
    ExampleParams.INPUT,
    ExampleParams.ACTUAL_OUTPUT,
    ExampleParams.RETRIEVAL_CONTEXT,
]


class ContextualRelevancyScorer(JudgevalScorer):
    def __init__(
        self,
        threshold: float = 0.5,
        model: Optional[Union[str, JudgevalJudge]] = None,
        include_reason: bool = True,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
        user: Optional[str] = None
    ):
        super().__init__(
            score_type=APIScorer.CONTEXTUAL_RELEVANCY,
            threshold=1 if strict_mode else threshold,
            evaluation_model=None,
            include_reason=include_reason,
            async_mode=async_mode,
            strict_mode=strict_mode,
            verbose_mode=verbose_mode
        )
        self.user = user
        self.model, self.using_native_model = create_judge(model)
        self.evaluation_model = self.model.get_model_name()

    def score_example(
        self,
        example: Example,
        _show_indicator: bool = True,
    ) -> float:
        check_example_params(example, required_params, self)

        with scorer_progress_meter(self, display_meter=_show_indicator):
            if self.async_mode:
                loop = get_or_create_event_loop()
                loop.run_until_complete(
                    self.a_score_example(example, _show_indicator=False)
                )
            else:
                self.verdicts_list: List[ContextualRelevancyVerdicts] = [
                    (self._generate_verdicts(example.input, context))
                    for context in example.retrieval_context
                ]
                self.score = self._calculate_score()
                self.reason = self._generate_reason(example.input)
                self.success = self.score >= self.threshold
                self.verbose_logs = create_verbose_logs(
                    self,
                    steps=[
                        f"Verdicts:\n{[v.model_dump() for v in self.verdicts_list]}",
                        f"Score: {self.score}\nReason: {self.reason}",
                    ],
                )

            return self.score

    async def a_score_example(
        self,
        example: Example,
        _show_indicator: bool = True,
    ) -> float:
        check_example_params(example, required_params, self)

        with scorer_progress_meter(
            self,
            async_mode=True,
            display_meter=_show_indicator,
        ):
            self.verdicts_list: List[ContextualRelevancyVerdicts] = (
                await asyncio.gather(
                    *[
                        self._a_generate_verdicts(example.input, context)
                        for context in example.retrieval_context
                    ]
                )
            )
            self.score = self._calculate_score()
            self.reason = await self._a_generate_reason(example.input)
            self.success = self.score >= self.threshold
            self.verbose_logs = create_verbose_logs(
                self,
                steps=[
                    f"Verdicts:\n{[v.model_dump() for v in self.verdicts_list]}",
                    f"Score: {self.score}\nReason: {self.reason}",
                ],
            )
            return self.score

    async def _a_generate_reason(self, input: str):
        if self.include_reason is False:
            return None

        irrelevancies = []
        relevant_statements = []
        for verdicts in self.verdicts_list:
            for verdict in verdicts.verdicts:
                if verdict.verdict.lower() == "no":
                    irrelevancies.append(verdict.model_dump())
                else:
                    relevant_statements.append(verdict.model_dump())
        prompt: dict = ContextualRelevancyTemplate.generate_reason(
            input=input,
            irrelevancies=irrelevancies,
            relevant_statements=relevant_statements,
            score=format(self.score, ".2f"),
        )
        if self.using_native_model:
            res = await self.model.a_generate(prompt)
            data = parse_response_json(res, self)
            return data["reason"]
        else:
            try:
                res: Reason = await self.model.a_generate(prompt, schema=Reason)
                return res.reason
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = parse_response_json(res, self)
                return data["reason"]

    def _generate_reason(self, input: str):
        if self.include_reason is False:
            return None

        irrelevancies = []
        relevant_statements = []
        for verdicts in self.verdicts_list:
            for verdict in verdicts.verdicts:
                if verdict.verdict.lower() == "no":
                    irrelevancies.append(verdict.reason)
                else:
                    relevant_statements.append(verdict.statement)

        prompt: dict = ContextualRelevancyTemplate.generate_reason(
            input=input,
            irrelevancies=irrelevancies,
            relevant_statements=relevant_statements,
            score=format(self.score, ".2f"),
        )
        if self.using_native_model:
            res = self.model.generate(prompt)
            data = parse_response_json(res, self)
            return data["reason"]
        else:
            try:
                res: Reason = self.model.generate(prompt, schema=Reason)
                return res.reason
            except TypeError:
                res = self.model.generate(prompt)
                data = parse_response_json(res, self)
                return data["reason"]

    def _calculate_score(self):
        total_verdicts = 0
        relevant_statements = 0
        for verdicts in self.verdicts_list:
            for verdict in verdicts.verdicts:
                total_verdicts += 1
                if verdict.verdict.lower() == "yes":
                    relevant_statements += 1

        if total_verdicts == 0:
            return 0

        score = relevant_statements / total_verdicts
        return 0 if self.strict_mode and score < self.threshold else score

    async def _a_generate_verdicts(
        self, input: str, context: List[str]
    ) -> ContextualRelevancyVerdicts:
        prompt = ContextualRelevancyTemplate.generate_verdicts(
            input=input, context=context
        )
        if self.using_native_model:
            res = await self.model.a_generate(prompt)
            data = parse_response_json(res, self)
            return ContextualRelevancyVerdicts(**data)
        else:
            try:
                res = await self.model.a_generate(
                    prompt, schema=ContextualRelevancyVerdicts
                )
                return res
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = parse_response_json(res, self)
                return ContextualRelevancyVerdicts(**data)

    def _generate_verdicts(
        self, input: str, context: str
    ) -> ContextualRelevancyVerdicts:
        prompt = ContextualRelevancyTemplate.generate_verdicts(
            input=input, context=context
        )
        if self.using_native_model:
            res = self.model.generate(prompt)
            data = parse_response_json(res, self)
            return ContextualRelevancyVerdicts(**data)
        else:
            try:
                res = self.model.generate(
                    prompt, schema=ContextualRelevancyVerdicts
                )
                return res
            except TypeError:
                res = self.model.generate(prompt)
                data = parse_response_json(res, self)
                return ContextualRelevancyVerdicts(**data)

    def _success_check(self) -> bool:
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold
            except:
                self.success = False
        return self.success

    @property
    def __name__(self):
        return "Contextual Relevancy"
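
The relevancy scorer above fans out one judge call per retrieved document via `asyncio.gather`, so the natural entry point is the async path. A hypothetical sketch follows, under the same assumptions as the recall example earlier (pre-0.0.33 import path, keyword-argument `Example`, placeholder model name); it is not part of the diff.

```python
# Hypothetical async usage sketch for the removed local ContextualRelevancyScorer.
import asyncio

from judgeval.data import Example
from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import (
    ContextualRelevancyScorer,
)


async def main() -> None:
    example = Example(
        input="What drives recent climate change?",                  # assumed keyword arguments
        actual_output="Human activity is the main driver.",
        retrieval_context=[
            "Greenhouse gas emissions from human activities are the main driver of observed climate change.",
            "Today's forecast is sunny with light winds.",
        ],
    )
    scorer = ContextualRelevancyScorer(threshold=0.5, model="gpt-4o")
    score = await scorer.a_score_example(example, _show_indicator=False)
    print(score, scorer.reason)  # share of context statements judged relevant to the input


asyncio.run(main())
```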