judgeval 0.0.32__py3-none-any.whl → 0.0.33__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (48)
  1. judgeval/common/s3_storage.py +93 -0
  2. judgeval/common/tracer.py +612 -123
  3. judgeval/data/sequence.py +4 -10
  4. judgeval/judgment_client.py +25 -86
  5. judgeval/rules.py +4 -7
  6. judgeval/run_evaluation.py +1 -1
  7. judgeval/scorers/__init__.py +4 -4
  8. judgeval/scorers/judgeval_scorers/__init__.py +0 -176
  9. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
  10. judgeval-0.0.33.dist-info/RECORD +63 -0
  11. judgeval/scorers/base_scorer.py +0 -58
  12. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
  13. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  14. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
  15. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  16. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  17. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  18. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  19. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  20. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
  21. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
  22. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  23. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  24. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  25. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  26. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  27. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  28. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  29. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  30. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  31. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
  32. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
  33. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  34. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
  35. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  36. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  37. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
  38. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  39. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
  40. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
  41. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  42. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  43. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  44. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  45. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
  46. judgeval-0.0.32.dist-info/RECORD +0 -97
  47. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
  48. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py
@@ -1,254 +0,0 @@
- from typing import Optional, List, Union
-
- from judgeval.constants import APIScorer
- from judgeval.scorers.utils import (
-     get_or_create_event_loop,
-     parse_response_json,
-     scorer_progress_meter,
-     create_verbose_logs,
-     check_example_params
- )
- from judgeval.judges.utils import create_judge
- from judgeval.scorers import JudgevalScorer
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_recall.prompts import *
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
-     ExampleParams.EXPECTED_OUTPUT,
-     ExampleParams.RETRIEVAL_CONTEXT,
- ]
-
- class ContextualRecallScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         include_reason: bool = True,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-         user: Optional[str] = None
-     ):
-         super().__init__(
-             score_type=APIScorer.CONTEXTUAL_RECALL,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.user = user
-         self.model, self.using_native_model = create_judge(model)
-         self.evaluation_model = self.model.get_model_name()
-
-     def score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_score_example(example, _show_indicator=False)
-                 )
-             else:
-                 self.verdicts: List[ContextualRecallVerdict] = (
-                     self._generate_verdicts(
-                         example.expected_output, example.retrieval_context
-                     )
-                 )
-                 self.score = self._calculate_score()
-                 self.reason = self._generate_reason(example.expected_output)
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-             return self.score
-
-     async def a_score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(
-             self,
-             async_mode=True,
-             display_meter=_show_indicator,
-         ):
-             self.verdicts: List[ContextualRecallVerdict] = (
-                 await self._a_generate_verdicts(
-                     example.expected_output, example.retrieval_context
-                 )
-             )
-             self.score = self._calculate_score()
-             self.reason = await self._a_generate_reason(
-                 example.expected_output
-             )
-             self.success = self.score >= self.threshold
-             self.verbose_logs = create_verbose_logs(
-                 self,
-                 steps=[
-                     f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
-                     f"Score: {self.score}\nReason: {self.reason}",
-                 ],
-             )
-             return self.score
-
-     async def _a_generate_reason(self, expected_output: str):
-         if self.include_reason is False:
-             return None
-
-         supportive_reasons = []
-         unsupportive_reasons = []
-         for idx, verdict in enumerate(self.verdicts):
-             if verdict.verdict.lower() == "yes":
-                 supportive_reasons.append(f"Sentence {idx + 1}: {verdict.reason}")
-             else:
-                 unsupportive_reasons.append(f"Sentence {idx + 1}: {verdict.reason}")
-
-         prompt = ContextualRecallTemplate.generate_reason(
-             expected_output=expected_output,
-             supportive_reasons=supportive_reasons,
-             unsupportive_reasons=unsupportive_reasons,
-             score=format(self.score, ".2f"),
-         )
-
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = await self.model.a_generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _generate_reason(self, expected_output: str):
-         if self.include_reason is False:
-             return None
-
-         supportive_reasons = []
-         unsupportive_reasons = []
-         for verdict in self.verdicts:
-             if verdict.verdict.lower() == "yes":
-                 supportive_reasons.append(verdict.reason)
-             else:
-                 unsupportive_reasons.append(verdict.reason)
-
-         prompt = ContextualRecallTemplate.generate_reason(
-             expected_output=expected_output,
-             supportive_reasons=supportive_reasons,
-             unsupportive_reasons=unsupportive_reasons,
-             score=format(self.score, ".2f"),
-         )
-
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = self.model.generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _calculate_score(self):
-         number_of_verdicts = len(self.verdicts)
-         if number_of_verdicts == 0:
-             return 0
-
-         justified_sentences = 0
-         for verdict in self.verdicts:
-             if verdict.verdict.lower() == "yes":
-                 justified_sentences += 1
-
-         score = justified_sentences / number_of_verdicts
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     async def _a_generate_verdicts(
-         self, expected_output: str, retrieval_context: List[str]
-     ) -> List[ContextualRecallVerdict]:
-         prompt = ContextualRecallTemplate.generate_verdicts(
-             expected_output=expected_output, retrieval_context=retrieval_context
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             verdicts = [
-                 ContextualRecallVerdict(**item) for item in data["verdicts"]
-             ]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = await self.model.a_generate(
-                     prompt, schema=Verdicts
-                 )
-                 verdicts: Verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 verdicts = [
-                     ContextualRecallVerdict(**item) for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     def _generate_verdicts(
-         self, expected_output: str, retrieval_context: List[str]
-     ) -> List[ContextualRecallVerdict]:
-         prompt = ContextualRecallTemplate.generate_verdicts(
-             expected_output=expected_output, retrieval_context=retrieval_context
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             verdicts = [
-                 ContextualRecallVerdict(**item) for item in data["verdicts"]
-             ]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                 verdicts: Verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 verdicts = [
-                     ContextualRecallVerdict(**item) for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     def _success_check(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Contextual Recall"
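For reference, the `_calculate_score` method in the removed file above defines contextual recall as the fraction of expected-output sentences whose verdict is "yes" (clamped to 0 in strict mode when below the threshold). A minimal standalone sketch of that arithmetic, using made-up verdict dicts rather than the scorer's pydantic objects:

    # Illustrative verdicts only; the removed scorer builds these from the judge's JSON reply.
    verdicts = [
        {"verdict": "yes", "reason": "backed by the 1st document"},
        {"verdict": "yes", "reason": "backed by the 2nd document"},
        {"verdict": "no", "reason": "not found in the retrieval context"},
    ]
    justified = sum(1 for v in verdicts if v["verdict"].lower() == "yes")
    recall_score = justified / len(verdicts) if verdicts else 0  # 2/3 ≈ 0.67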
judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py
@@ -1,142 +0,0 @@
- from typing import List
- from pydantic import BaseModel
-
-
- class ContextualRecallVerdict(BaseModel):
-     verdict: str
-     reason: str
-
-
- class Verdicts(BaseModel):
-     verdicts: List[ContextualRecallVerdict]
-
-
- class Reason(BaseModel):
-     reason: str
-
-
- class ContextualRecallTemplate:
-
-     @staticmethod
-     def generate_verdicts(expected_output, retrieval_context):
-         return f"""
- ==== TASK INSTRUCTIONS ====
- You will be provided with an expected output and a retrieval context (list of retrieved documents). Your task is to take each sentence in the expected output and determine whether the sentence is ATTRIBUTABLE or RELEVANT to ANY PART of the retrieval context.
-
- ==== FORMATTING YOUR ANSWER ====
- Please format your answer as a list of JSON objects, each with two keys: `verdict` and `reason`.
- The `verdict` key should STRICTLY be 'yes' or 'no'. You should answer 'yes' if the sentence can be attributed/is relevant to ANY PART(S) of the retrieval context. If not, you should answer 'no'.
- The `reason` key should provide a justification of your verdict. In the justification, you should aim to include references to the document(s) in the retrieval context (eg., 1st document, and 2nd document in the retrieval context) that is attributed/relevant to the expected output sentence.
- Please also AIM TO CITE the specific part of the retrieval context to justify your verdict, but **be extremely concise! Cut short the quote with an ellipsis if possible**.
-
- Here's an example of formatting your answer:
- {{
- "verdicts": [
- {{
- "verdict": "yes",
- "reason": "..."
- }},
- ...
- ]
- }}
-
- ==== EXAMPLE ====
- Expected Output:
- The Earth's climate has warmed significantly over the past century. This warming is primarily caused by human activities like burning fossil fuels. Today's weather was sunny and warm.
-
- Retrieval Context:
- ["Global temperatures have risen by approximately 1.1°C since pre-industrial times, with most of this increase occurring in the past 100 years.",
- "Scientific consensus shows that greenhouse gas emissions from human activities, particularly the burning of coal, oil and gas, are the main driver of observed climate change."]
-
- Example Response:
- {{
- "verdicts": [
- {{
- "verdict": "yes",
- "reason": "The 1st document directly confirms this, stating 'temperatures have risen by approximately 1.1°C...in the past 100 years'"
- }},
- {{
- "verdict": "yes",
- "reason": "The 2nd document explicitly states that 'greenhouse gas emissions from human activities, particularly the burning of...fossil fuels' drive climate change"
- }},
- {{
- "verdict": "no",
- "reason": "Neither document contains information about today's specific weather conditions"
- }}
- ]
- }}
-
- Since your task is to generate a verdict for each sentence, the number of 'verdicts' SHOULD BE EXACTLY EQUAL to the number of sentences in of `expected output`.
- **
-
- ==== YOUR TURN ====
- Expected Output:
- {expected_output}
-
- Retrieval Context:
- {retrieval_context}
-
- JSON:
- """
-     @staticmethod
-     def generate_reason(
-         expected_output, supportive_reasons, unsupportive_reasons, score
-     ):
-         return f"""
- ==== PROBLEM SETUP ====
- You will be provided with an expected output, a list of supportive reasons, a list of unsupportive reasons, and a contextual recall score. Let's break down each input component:
- - expected output: A text generated by a language model to answer a question/solve a task.
- - supportive reasons: A list of reasons why a specific sentence in the expected output can be attributed/is relevant to any part of the retrieval context (a list of documents retrieved in a RAG pipeline)
- - unsupportive reasons: A list of reasons why a specific sentence in the expected output cannot be attributed/is not relevant to any part of the retrieval context
- **NOTE**: The reasons are provided in the form of "Sentence <number>: <reason>", where <number> is the sentence number in the expected output.
- - contextual recall score: A score between 0 and 1 (closer to 1 the better) representing how much of the expected output can be attributed/is relevant to any part of the retrieval context.
- The point of this score is to measure how well the retriever in a RAG pipeline operates, retrieving relevant documents that should back the expected output of the RAG generator.
-
- ==== TASK INSTRUCTIONS ====
- Given these inputs, summarize a CONCISE and CLEAR reason for the value of the contextual recall score. Remember, the score is a measure of how well the retriever in a RAG pipeline operates, retrieving relevant documents that should back the expected output of the RAG generator.
- In your reason, you should reference the supportive/unsupportive reasons by their sentence number to justify the score. Make specific references to the retrieval context in your reason if applicable.
-
- ==== FORMATTING YOUR ANSWER ====
- IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
- Example JSON:
- {{
- "reason": "The score is <contextual_recall_score> because <your_reason>."
- }}
-
- DO NOT mention 'supportive reasons' and 'unsupportive reasons' in your reason, these terms are just here for you to understand the broader scope of things.
- If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
-
- ==== EXAMPLE ====
- Expected Output:
- The Earth's climate has warmed significantly over the past century. This warming is primarily caused by human activities like burning fossil fuels. Today's weather was sunny and warm.
-
- Supportive Reasons:
- Sentence 1: The first document confirms this by stating global temperatures have risen by 1.1°C in the past 100 years
- Sentence 2: The second document directly states that human activities and fossil fuel burning drive climate change
-
- Unsupportive Reasons:
- Sentence 3: Neither document contains information about today's specific weather conditions
-
- Contextual Recall Score:
- 0.67
-
- Example Response:
- {{
- "reason": "The score is 0.67 because while sentences 1 and 2 are well-supported by the retrieval context with specific temperature data and human activity impacts, sentence 3 about today's weather has no backing in the provided documents."
- }}
-
- ==== YOUR TURN ====
- Contextual Recall Score:
- {score}
-
- Expected Output:
- {expected_output}
-
- Supportive Reasons:
- {supportive_reasons}
-
- Unsupportive Reasons:
- {unsupportive_reasons}
-
- JSON:
- """
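The pydantic models at the top of the removed prompts file are what the scorer used to validate the judge's JSON reply to the template above. A minimal, self-contained sketch of that parsing step; the `response` dict is hypothetical, shaped like the template's example output:

    from typing import List
    from pydantic import BaseModel

    class ContextualRecallVerdict(BaseModel):
        verdict: str
        reason: str

    class Verdicts(BaseModel):
        verdicts: List[ContextualRecallVerdict]

    # Hypothetical judge reply, shaped like the template's example response.
    response = {
        "verdicts": [
            {"verdict": "yes", "reason": "Backed by the 1st document"},
            {"verdict": "no", "reason": "Not found in the retrieval context"},
        ]
    }
    parsed = Verdicts(**response)
    print([v.verdict for v in parsed.verdicts])  # ['yes', 'no']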
judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py
@@ -1,3 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.contextual_relevancy_scorer import ContextualRelevancyScorer
-
- __all__ = ["ContextualRelevancyScorer"]
judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py
@@ -1,245 +0,0 @@
- from typing import Optional, List, Union
- import asyncio
-
- from judgeval.constants import APIScorer
- from judgeval.scorers.utils import (get_or_create_event_loop,
-     scorer_progress_meter,
-     create_verbose_logs,
-     parse_response_json,
-     check_example_params
- )
- from judgeval.scorers import JudgevalScorer
- from judgeval.judges import JudgevalJudge
- from judgeval.judges.utils import create_judge
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers.judgeval_scorers.local_implementations.contextual_relevancy.prompts import *
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
-     ExampleParams.RETRIEVAL_CONTEXT,
- ]
-
-
- class ContextualRelevancyScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         include_reason: bool = True,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-         user: Optional[str] = None
-     ):
-         super().__init__(
-             score_type=APIScorer.CONTEXTUAL_RELEVANCY,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.user = user
-         self.model, self.using_native_model = create_judge(model)
-         self.evaluation_model = self.model.get_model_name()
-
-     def score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_score_example(example, _show_indicator=False)
-                 )
-             else:
-                 self.verdicts_list: List[ContextualRelevancyVerdicts] = [
-                     (self._generate_verdicts(example.input, context))
-                     for context in example.retrieval_context
-                 ]
-                 self.score = self._calculate_score()
-                 self.reason = self._generate_reason(example.input)
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"Verdicts:\n{[v.model_dump() for v in self.verdicts_list]}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-
-             return self.score
-
-     async def a_score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(
-             self,
-             async_mode=True,
-             display_meter=_show_indicator,
-         ):
-             self.verdicts_list: List[ContextualRelevancyVerdicts] = (
-                 await asyncio.gather(
-                     *[
-                         self._a_generate_verdicts(example.input, context)
-                         for context in example.retrieval_context
-                     ]
-                 )
-             )
-             self.score = self._calculate_score()
-             self.reason = await self._a_generate_reason(example.input)
-             self.success = self.score >= self.threshold
-             self.verbose_logs = create_verbose_logs(
-                 self,
-                 steps=[
-                     f"Verdicts:\n{[v.model_dump() for v in self.verdicts_list]}",
-                     f"Score: {self.score}\nReason: {self.reason}",
-                 ],
-             )
-             return self.score
-
-     async def _a_generate_reason(self, input: str):
-         if self.include_reason is False:
-             return None
-
-         irrelevancies = []
-         relevant_statements = []
-         for verdicts in self.verdicts_list:
-             for verdict in verdicts.verdicts:
-                 if verdict.verdict.lower() == "no":
-                     irrelevancies.append(verdict.model_dump())
-                 else:
-                     relevant_statements.append(verdict.model_dump())
-         prompt: dict = ContextualRelevancyTemplate.generate_reason(
-             input=input,
-             irrelevancies=irrelevancies,
-             relevant_statements=relevant_statements,
-             score=format(self.score, ".2f"),
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = await self.model.a_generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _generate_reason(self, input: str):
-         if self.include_reason is False:
-             return None
-
-         irrelevancies = []
-         relevant_statements = []
-         for verdicts in self.verdicts_list:
-             for verdict in verdicts.verdicts:
-                 if verdict.verdict.lower() == "no":
-                     irrelevancies.append(verdict.reason)
-                 else:
-                     relevant_statements.append(verdict.statement)
-
-         prompt: dict = ContextualRelevancyTemplate.generate_reason(
-             input=input,
-             irrelevancies=irrelevancies,
-             relevant_statements=relevant_statements,
-             score=format(self.score, ".2f"),
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = self.model.generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _calculate_score(self):
-         total_verdicts = 0
-         relevant_statements = 0
-         for verdicts in self.verdicts_list:
-             for verdict in verdicts.verdicts:
-                 total_verdicts += 1
-                 if verdict.verdict.lower() == "yes":
-                     relevant_statements += 1
-
-         if total_verdicts == 0:
-             return 0
-
-         score = relevant_statements / total_verdicts
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     async def _a_generate_verdicts(
-         self, input: str, context: List[str]
-     ) -> ContextualRelevancyVerdicts:
-         prompt = ContextualRelevancyTemplate.generate_verdicts(
-             input=input, context=context
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return ContextualRelevancyVerdicts(**data)
-         else:
-             try:
-                 res = await self.model.a_generate(
-                     prompt, schema=ContextualRelevancyVerdicts
-                 )
-                 return res
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return ContextualRelevancyVerdicts(**data)
-
-     def _generate_verdicts(
-         self, input: str, context: str
-     ) -> ContextualRelevancyVerdicts:
-         prompt = ContextualRelevancyTemplate.generate_verdicts(
-             input=input, context=context
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return ContextualRelevancyVerdicts(**data)
-         else:
-             try:
-                 res = self.model.generate(
-                     prompt, schema=ContextualRelevancyVerdicts
-                 )
-                 return res
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return ContextualRelevancyVerdicts(**data)
-
-     def _success_check(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Contextual Relevancy"
-
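For reference, `_calculate_score` in the removed file above computes contextual relevancy as the share of "yes" verdicts across all statements extracted from all retrieved documents (again clamped to 0 in strict mode when below the threshold). A minimal standalone sketch of that arithmetic with made-up per-document verdict lists:

    # The nested structure mirrors self.verdicts_list in the removed scorer
    # (one verdict list per retrieved document); the values are made up.
    verdicts_per_document = [
        [{"verdict": "yes"}, {"verdict": "no"}],   # statements from the 1st document
        [{"verdict": "yes"}, {"verdict": "yes"}],  # statements from the 2nd document
    ]
    flat = [v for doc in verdicts_per_document for v in doc]
    relevant = sum(1 for v in flat if v["verdict"].lower() == "yes")
    relevancy_score = relevant / len(flat) if flat else 0  # 3/4 = 0.75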