judgeval 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- judgeval/__init__.py +0 -71
- judgeval/clients.py +14 -3
- judgeval/common/tracer.py +57 -31
- judgeval/constants.py +1 -0
- judgeval/data/__init__.py +2 -1
- judgeval/data/scorer_data.py +2 -2
- judgeval/evaluation_run.py +16 -15
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/base_judge.py +1 -1
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +2 -2
- judgeval/judges/together_judge.py +2 -2
- judgeval/judges/utils.py +4 -4
- judgeval/judgment_client.py +67 -15
- judgeval/run_evaluation.py +79 -14
- judgeval/scorers/__init__.py +8 -4
- judgeval/scorers/api_scorer.py +64 -0
- judgeval/scorers/base_scorer.py +3 -2
- judgeval/scorers/exceptions.py +11 -0
- judgeval/scorers/{custom_scorer.py → judgeval_scorer.py} +9 -5
- judgeval/scorers/judgeval_scorers/__init__.py +132 -9
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +23 -0
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +19 -0
- judgeval/scorers/judgeval_scorers/{answer_relevancy.py → api_scorers/answer_relevancy.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_precision.py → api_scorers/contextual_precision.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_recall.py → api_scorers/contextual_recall.py} +2 -2
- judgeval/scorers/judgeval_scorers/{contextual_relevancy.py → api_scorers/contextual_relevancy.py} +2 -2
- judgeval/scorers/judgeval_scorers/{faithfulness.py → api_scorers/faithfulness.py} +2 -2
- judgeval/scorers/judgeval_scorers/{hallucination.py → api_scorers/hallucination.py} +2 -2
- judgeval/scorers/judgeval_scorers/{json_correctness.py → api_scorers/json_correctness.py} +7 -7
- judgeval/scorers/judgeval_scorers/{summarization.py → api_scorers/summarization.py} +2 -2
- judgeval/scorers/judgeval_scorers/{tool_correctness.py → api_scorers/tool_correctness.py} +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +24 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +272 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +169 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +4 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +292 -0
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +174 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +259 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +106 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +249 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +142 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +240 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +121 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +318 -0
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +265 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +258 -0
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +104 -0
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +127 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +247 -0
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +541 -0
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +3 -0
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +151 -0
- judgeval/scorers/prompt_scorer.py +4 -4
- judgeval/scorers/score.py +14 -14
- judgeval/scorers/utils.py +40 -6
- {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/METADATA +1 -1
- judgeval-0.0.5.dist-info/RECORD +78 -0
- judgeval-0.0.3.dist-info/RECORD +0 -46
- {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/WHEEL +0 -0
- {judgeval-0.0.3.dist-info → judgeval-0.0.5.dist-info}/licenses/LICENSE.md +0 -0
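The listing shows the scorer package being reorganized in 0.0.5: custom_scorer.py becomes judgeval_scorer.py, the existing metric modules move under api_scorers/, and a new local_implementations/ tree adds prompt-driven scorers that run against a judge model. Below is a rough import sketch of the new layout, using only names that appear in this diff (JudgevalScorer, ContextualRelevancyScorer); the exact re-exports from the package __init__ files are not shown here, so treat the import paths as assumptions rather than the library's documented API.

# Sketch of the 0.0.5 layout, inferred from the file paths in this diff.
# The base class, renamed from custom_scorer.py to judgeval_scorer.py:
from judgeval.scorers import JudgevalScorer

# A new local (prompt-driven) implementation added in this release; importing
# via the full module path is an assumption about how the package is wired.
from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import (
    ContextualRelevancyScorer,
)

# The local scorers subclass JudgevalScorer (see the class definition below).
assert issubclass(ContextualRelevancyScorer, JudgevalScorer)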
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py
@@ -0,0 +1,142 @@
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+
+class ContextualRecallVerdict(BaseModel):
+    verdict: str
+    reason: str
+
+
+class Verdicts(BaseModel):
+    verdicts: List[ContextualRecallVerdict]
+
+
+class Reason(BaseModel):
+    reason: str
+
+
+class ContextualRecallTemplate:
+
+    @staticmethod
+    def generate_verdicts(expected_output, retrieval_context):
+        return f"""
+==== TASK INSTRUCTIONS ====
+You will be provided with an expected output and a retrieval context (list of retrieved documents). Your task is to take each sentence in the expected output and determine whether the sentence is ATTRIBUTABLE or RELEVANT to ANY PART of the retrieval context.
+
+==== FORMATTING YOUR ANSWER ====
+Please format your answer as a list of JSON objects, each with two keys: `verdict` and `reason`.
+The `verdict` key should STRICTLY be 'yes' or 'no'. You should answer 'yes' if the sentence can be attributed/is relevant to ANY PART(S) of the retrieval context. If not, you should answer 'no'.
+The `reason` key should provide a justification of your verdict. In the justification, you should aim to include references to the document(s) in the retrieval context (eg., 1st document, and 2nd document in the retrieval context) that is attributed/relevant to the expected output sentence.
+Please also AIM TO CITE the specific part of the retrieval context to justify your verdict, but **be extremely concise! Cut short the quote with an ellipsis if possible**.
+
+Here's an example of formatting your answer:
+{{
+    "verdicts": [
+        {{
+            "verdict": "yes",
+            "reason": "..."
+        }},
+        ...
+    ]
+}}
+
+==== EXAMPLE ====
+Expected Output:
+The Earth's climate has warmed significantly over the past century. This warming is primarily caused by human activities like burning fossil fuels. Today's weather was sunny and warm.
+
+Retrieval Context:
+["Global temperatures have risen by approximately 1.1°C since pre-industrial times, with most of this increase occurring in the past 100 years.",
+"Scientific consensus shows that greenhouse gas emissions from human activities, particularly the burning of coal, oil and gas, are the main driver of observed climate change."]
+
+Example Response:
+{{
+    "verdicts": [
+        {{
+            "verdict": "yes",
+            "reason": "The 1st document directly confirms this, stating 'temperatures have risen by approximately 1.1°C...in the past 100 years'"
+        }},
+        {{
+            "verdict": "yes",
+            "reason": "The 2nd document explicitly states that 'greenhouse gas emissions from human activities, particularly the burning of...fossil fuels' drive climate change"
+        }},
+        {{
+            "verdict": "no",
+            "reason": "Neither document contains information about today's specific weather conditions"
+        }}
+    ]
+}}
+
+Since your task is to generate a verdict for each sentence, the number of 'verdicts' SHOULD BE EXACTLY EQUAL to the number of sentences in of `expected output`.
+**
+
+==== YOUR TURN ====
+Expected Output:
+{expected_output}
+
+Retrieval Context:
+{retrieval_context}
+
+JSON:
+"""
+    @staticmethod
+    def generate_reason(
+        expected_output, supportive_reasons, unsupportive_reasons, score
+    ):
+        return f"""
+==== PROBLEM SETUP ====
+You will be provided with an expected output, a list of supportive reasons, a list of unsupportive reasons, and a contextual recall score. Let's break down each input component:
+- expected output: A text generated by a language model to answer a question/solve a task.
+- supportive reasons: A list of reasons why a specific sentence in the expected output can be attributed/is relevant to any part of the retrieval context (a list of documents retrieved in a RAG pipeline)
+- unsupportive reasons: A list of reasons why a specific sentence in the expected output cannot be attributed/is not relevant to any part of the retrieval context
+**NOTE**: The reasons are provided in the form of "Sentence <number>: <reason>", where <number> is the sentence number in the expected output.
+- contextual recall score: A score between 0 and 1 (closer to 1 the better) representing how much of the expected output can be attributed/is relevant to any part of the retrieval context.
+The point of this score is to measure how well the retriever in a RAG pipeline operates, retrieving relevant documents that should back the expected output of the RAG generator.
+
+==== TASK INSTRUCTIONS ====
+Given these inputs, summarize a CONCISE and CLEAR reason for the value of the contextual recall score. Remember, the score is a measure of how well the retriever in a RAG pipeline operates, retrieving relevant documents that should back the expected output of the RAG generator.
+In your reason, you should reference the supportive/unsupportive reasons by their sentence number to justify the score. Make specific references to the retrieval context in your reason if applicable.
+
+==== FORMATTING YOUR ANSWER ====
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <contextual_recall_score> because <your_reason>."
+}}
+
+DO NOT mention 'supportive reasons' and 'unsupportive reasons' in your reason, these terms are just here for you to understand the broader scope of things.
+If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
+
+==== EXAMPLE ====
+Expected Output:
+The Earth's climate has warmed significantly over the past century. This warming is primarily caused by human activities like burning fossil fuels. Today's weather was sunny and warm.
+
+Supportive Reasons:
+Sentence 1: The first document confirms this by stating global temperatures have risen by 1.1°C in the past 100 years
+Sentence 2: The second document directly states that human activities and fossil fuel burning drive climate change
+
+Unsupportive Reasons:
+Sentence 3: Neither document contains information about today's specific weather conditions
+
+Contextual Recall Score:
+0.67
+
+Example Response:
+{{
+    "reason": "The score is 0.67 because while sentences 1 and 2 are well-supported by the retrieval context with specific temperature data and human activity impacts, sentence 3 about today's weather has no backing in the provided documents."
+}}
+
+==== YOUR TURN ====
+Contextual Recall Score:
+{score}
+
+Expected Output:
+{expected_output}
+
+Supportive Reasons:
+{supportive_reasons}
+
+Unsupportive Reasons:
+{unsupportive_reasons}
+
+JSON:
+"""
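The worked example in generate_reason above (score 0.67 for two attributable sentences out of three) implies that contextual recall is the fraction of expected-output sentences receiving a 'yes' verdict. Below is a minimal sketch of that arithmetic using the pydantic models from this file; the package's real aggregation lives in contextual_recall_scorer.py, which is not shown in this section and may differ.

# Minimal sketch: recall = attributable sentences / total sentences.
# Models mirror the prompts.py hunk above; the real scorer may aggregate differently.
from typing import List
from pydantic import BaseModel


class ContextualRecallVerdict(BaseModel):
    verdict: str
    reason: str


class Verdicts(BaseModel):
    verdicts: List[ContextualRecallVerdict]


def contextual_recall_score(verdicts: Verdicts) -> float:
    # Count 'yes' verdicts and divide by the total number of sentences judged.
    if not verdicts.verdicts:
        return 0.0
    attributable = sum(1 for v in verdicts.verdicts if v.verdict.strip().lower() == "yes")
    return attributable / len(verdicts.verdicts)


example = Verdicts(verdicts=[
    ContextualRecallVerdict(verdict="yes", reason="Backed by the 1st document"),
    ContextualRecallVerdict(verdict="yes", reason="Backed by the 2nd document"),
    ContextualRecallVerdict(verdict="no", reason="No document mentions today's weather"),
])
print(round(contextual_recall_score(example), 2))  # 0.67, matching the prompt's example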
judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py
@@ -0,0 +1,240 @@
+from typing import Optional, List, Union
+import asyncio
+
+from judgeval.scorers.utils import (get_or_create_event_loop,
+                                    scorer_progress_meter,
+                                    create_verbose_logs,
+                                    parse_response_json,
+                                    check_example_params
+                                    )
+from judgeval.scorers import JudgevalScorer
+from judgeval.judges import JudgevalJudge
+from judgeval.judges.utils import create_judge
+from judgeval.data import Example, ExampleParams
+from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.prompts import *
+
+
+required_params = [
+    ExampleParams.INPUT,
+    ExampleParams.ACTUAL_OUTPUT,
+    ExampleParams.RETRIEVAL_CONTEXT,
+]
+
+
+class ContextualRelevancyScorer(JudgevalScorer):
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        model: Optional[Union[str, JudgevalJudge]] = None,
+        include_reason: bool = True,
+        async_mode: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        user: Optional[str] = None
+    ):
+        self.user = user
+        self.threshold = 1 if strict_mode else threshold
+        self.model, self.using_native_model = create_judge(model)
+        self.evaluation_model = self.model.get_model_name()
+        self.include_reason = include_reason
+        self.async_mode = async_mode
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+
+    def score_example(
+        self,
+        example: Example,
+        _show_indicator: bool = True,
+    ) -> float:
+        check_example_params(example, required_params, self)
+
+        with scorer_progress_meter(self, display_meter=_show_indicator):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                loop.run_until_complete(
+                    self.a_score_example(example, _show_indicator=False)
+                )
+            else:
+                self.verdicts_list: List[ContextualRelevancyVerdicts] = [
+                    (self._generate_verdicts(example.input, context))
+                    for context in example.retrieval_context
+                ]
+                self.score = self._calculate_score()
+                self.reason = self._generate_reason(example.input)
+                self.success = self.score >= self.threshold
+                self.verbose_logs = create_verbose_logs(
+                    self,
+                    steps=[
+                        f"Verdicts:\n{[v.model_dump() for v in self.verdicts_list]}",
+                        f"Score: {self.score}\nReason: {self.reason}",
+                    ],
+                )
+
+            return self.score
+
+    async def a_score_example(
+        self,
+        example: Example,
+        _show_indicator: bool = True,
+    ) -> float:
+        check_example_params(example, required_params, self)
+
+        with scorer_progress_meter(
+            self,
+            async_mode=True,
+            display_meter=_show_indicator,
+        ):
+            self.verdicts_list: List[ContextualRelevancyVerdicts] = (
+                await asyncio.gather(
+                    *[
+                        self._a_generate_verdicts(example.input, context)
+                        for context in example.retrieval_context
+                    ]
+                )
+            )
+            self.score = self._calculate_score()
+            self.reason = await self._a_generate_reason(example.input)
+            self.success = self.score >= self.threshold
+            self.verbose_logs = create_verbose_logs(
+                self,
+                steps=[
+                    f"Verdicts:\n{[v.model_dump() for v in self.verdicts_list]}",
+                    f"Score: {self.score}\nReason: {self.reason}",
+                ],
+            )
+            return self.score
+
+    async def _a_generate_reason(self, input: str):
+        if self.include_reason is False:
+            return None
+
+        irrelevancies = []
+        relevant_statements = []
+        for verdicts in self.verdicts_list:
+            for verdict in verdicts.verdicts:
+                if verdict.verdict.lower() == "no":
+                    irrelevancies.append(verdict.model_dump())
+                else:
+                    relevant_statements.append(verdict.model_dump())
+        prompt: dict = ContextualRelevancyTemplate.generate_reason(
+            input=input,
+            irrelevancies=irrelevancies,
+            relevant_statements=relevant_statements,
+            score=format(self.score, ".2f"),
+        )
+        if self.using_native_model:
+            res = await self.model.a_generate(prompt)
+            data = parse_response_json(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = await self.model.a_generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = parse_response_json(res, self)
+                return data["reason"]
+
+    def _generate_reason(self, input: str):
+        if self.include_reason is False:
+            return None
+
+        irrelevancies = []
+        relevant_statements = []
+        for verdicts in self.verdicts_list:
+            for verdict in verdicts.verdicts:
+                if verdict.verdict.lower() == "no":
+                    irrelevancies.append(verdict.reason)
+                else:
+                    relevant_statements.append(verdict.statement)
+
+        prompt: dict = ContextualRelevancyTemplate.generate_reason(
+            input=input,
+            irrelevancies=irrelevancies,
+            relevant_statements=relevant_statements,
+            score=format(self.score, ".2f"),
+        )
+        if self.using_native_model:
+            res = self.model.generate(prompt)
+            data = parse_response_json(res, self)
+            return data["reason"]
+        else:
+            try:
+                res: Reason = self.model.generate(prompt, schema=Reason)
+                return res.reason
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = parse_response_json(res, self)
+                return data["reason"]
+
+    def _calculate_score(self):
+        total_verdicts = 0
+        relevant_statements = 0
+        for verdicts in self.verdicts_list:
+            for verdict in verdicts.verdicts:
+                total_verdicts += 1
+                if verdict.verdict.lower() == "yes":
+                    relevant_statements += 1
+
+        if total_verdicts == 0:
+            return 0
+
+        score = relevant_statements / total_verdicts
+        return 0 if self.strict_mode and score < self.threshold else score
+
+    async def _a_generate_verdicts(
+        self, input: str, context: List[str]
+    ) -> ContextualRelevancyVerdicts:
+        prompt = ContextualRelevancyTemplate.generate_verdicts(
+            input=input, context=context
+        )
+        if self.using_native_model:
+            res = await self.model.a_generate(prompt)
+            data = parse_response_json(res, self)
+            return ContextualRelevancyVerdicts(**data)
+        else:
+            try:
+                res = await self.model.a_generate(
+                    prompt, schema=ContextualRelevancyVerdicts
+                )
+                return res
+            except TypeError:
+                res = await self.model.a_generate(prompt)
+                data = parse_response_json(res, self)
+                return ContextualRelevancyVerdicts(**data)
+
+    def _generate_verdicts(
+        self, input: str, context: str
+    ) -> ContextualRelevancyVerdicts:
+        prompt = ContextualRelevancyTemplate.generate_verdicts(
+            input=input, context=context
+        )
+        if self.using_native_model:
+            res = self.model.generate(prompt)
+            data = parse_response_json(res, self)
+            return ContextualRelevancyVerdicts(**data)
+        else:
+            try:
+                res = self.model.generate(
+                    prompt, schema=ContextualRelevancyVerdicts
+                )
+                return res
+            except TypeError:
+                res = self.model.generate(prompt)
+                data = parse_response_json(res, self)
+                return ContextualRelevancyVerdicts(**data)
+
+    def _success_check(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Contextual Relevancy"
+
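Putting the scorer above to use follows the pattern visible in its methods: build an Example carrying an input, actual output, and retrieval context, then call score_example (or a_score_example). Below is a usage sketch under stated assumptions: the Example keyword arguments, the "gpt-4o" model string, and the presence of judge credentials in the environment are not confirmed by this diff.

# Usage sketch for the new local ContextualRelevancyScorer.
# Assumptions (not confirmed by this diff): Example accepts these keyword
# arguments, create_judge() resolves "gpt-4o" to a judge, and credentials for
# the underlying judge are configured in the environment.
from judgeval.data import Example
from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import (
    ContextualRelevancyScorer,
)

example = Example(
    input="What is the capital of France?",
    actual_output="The capital of France is Paris.",
    retrieval_context=[
        "Paris is the capital of France.",
        "Flights to Paris are available from San Francisco starting at $1000.",
    ],
)

scorer = ContextualRelevancyScorer(threshold=0.5, model="gpt-4o", async_mode=False)
score = scorer.score_example(example, _show_indicator=False)  # fraction of relevant statements
print(score, scorer.reason)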
judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py
@@ -0,0 +1,121 @@
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+
+class ContextualRelevancyVerdict(BaseModel):
+    statement: str
+    verdict: str
+    reason: str
+
+
+class ContextualRelevancyVerdicts(BaseModel):
+    verdicts: List[ContextualRelevancyVerdict]
+
+
+class Reason(BaseModel):
+    reason: str
+
+
+class ContextualRelevancyTemplate:
+
+    @staticmethod
+    def generate_verdicts(input: str, context: str):
+        return f"""==== TASK INSTRUCTIONS ====
+You will be provided with an input (str) and a context (str). The input is a question/task proposed to a language model and the context is a list of documents retrieved in a RAG pipeline.
+Your task is to determine whether each statement found in the context is relevant to the input. To do so, break down the context into statements (high level pieces of information), then determine whether each statement is relevant to the input.
+
+==== FORMATTING YOUR ANSWER ====
+
+You should format your answer as a list of JSON objects, with each JSON object containing the following fields:
+- 'verdict': a string that is EXACTLY EITHER 'yes' or 'no', indicating whether the statement is relevant to the input
+- 'statement': a string that is the statement found in the context
+- 'reason': an string that is the justification for why the statement is relevant to the input. IF your verdict is 'no', you MUST quote the irrelevant parts of the statement to back up your reason.
+
+IMPORTANT: Please make sure to only return in JSON format.
+
+==== EXAMPLE ====
+Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat."
+Example Input: "What were some of Einstein's achievements?"
+
+Example:
+{{
+    "verdicts": [
+        {{
+            "verdict": "yes",
+            "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
+        }},
+        {{
+            "verdict": "no",
+            "statement": "There was a cat.",
+            "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements."
+        }}
+    ]
+}}
+
+==== YOUR TURN ====
+
+Input:
+{input}
+
+Context:
+{context}
+
+JSON:
+"""
+
+    @staticmethod
+    def generate_reason(
+        input: str,
+        irrelevancies: List[str],
+        relevant_statements: List[str],
+        score: float,
+    ):
+        return f"""==== TASK INSTRUCTIONS ====
+You will be provided with the following information:
+- An input to a RAG pipeline which is a question/task. There is an associated retrieval context to this input in the RAG pipeline (the context is not provided but is relevant to your task).
+- A list of irrelevant statements from the retrieval context. These statements are not relevant to the input query.
+- A list of relevant statements from the retrieval context. These statements are relevant to the input query.
+- A contextual relevancy score (the closer to 1 the better). Contextual relevancy is a measurement of how relevant the retrieval context is to the input query.
+
+Your task is to generate a CLEAR and CONCISE reason for the score. You should quote data provided in the reasons for the irrelevant and relevant statements to support your reason.
+
+==== FORMATTING YOUR ANSWER ====
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <contextual_relevancy_score> because <your_reason>."
+}}
+
+If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
+
+==== EXAMPLE ====
+Input: "What is the capital of France?"
+
+Contextual Relevancy Score: 0.67
+
+Irrelevant Statements from the retrieval context:
+[{{"statement": "Flights to Paris are available from San Francisco starting at $1000", "reason": "Flight prices and routes are not relevant to identifying the capital of France"}}]
+
+Relevant Statements from the retrieval context:
+[{{"statement": "Paris is the capital of France"}}, {{"statement": "Paris is a major European city"}}]
+
+Example Response:
+{{
+    "reason": "The score is 0.67 because while the context contains directly relevant information stating that 'Paris is the capital of France', it also includes irrelevant travel information about flight prices from San Francisco."
+}}
+
+==== YOUR TURN ====
+Contextual Relevancy Score:
+{score}
+
+Input:
+{input}
+
+Irrelevant Statements from the retrieval context:
+{irrelevancies}
+
+Relevant Statements from the retrieval context:
+{relevant_statements}
+
+JSON:
+"""
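Since both template methods are plain staticmethods that return formatted strings, they can be rendered without calling a judge, which is handy for inspecting exactly what the scorer sends to the model. A small sketch follows; the import path mirrors the one used by contextual_relevancy_scorer.py above and is otherwise an assumption about the packaged module layout.

# Render the verdict prompt standalone; both template methods just return f-strings.
from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.prompts import (
    ContextualRelevancyTemplate,
)

prompt = ContextualRelevancyTemplate.generate_verdicts(
    input="What were some of Einstein's achievements?",
    context='"Einstein won the Nobel Prize for his discovery of the photoelectric effect. There was a cat."',
)
print(prompt[:200])  # the rendered prompt is what the scorer passes to the judge model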