judgeval 0.0.32__py3-none-any.whl → 0.0.33__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- judgeval/common/s3_storage.py +93 -0
- judgeval/common/tracer.py +612 -123
- judgeval/data/sequence.py +4 -10
- judgeval/judgment_client.py +25 -86
- judgeval/rules.py +4 -7
- judgeval/run_evaluation.py +1 -1
- judgeval/scorers/__init__.py +4 -4
- judgeval/scorers/judgeval_scorers/__init__.py +0 -176
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
- judgeval-0.0.33.dist-info/RECORD +63 -0
- judgeval/scorers/base_scorer.py +0 -58
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
- judgeval-0.0.32.dist-info/RECORD +0 -97
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
- {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
--- a/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py
+++ /dev/null
@@ -1,121 +0,0 @@
-from typing import List
-from pydantic import BaseModel
-
-
-class ContextualRelevancyVerdict(BaseModel):
-    statement: str
-    verdict: str
-    reason: str
-
-
-class ContextualRelevancyVerdicts(BaseModel):
-    verdicts: List[ContextualRelevancyVerdict]
-
-
-class Reason(BaseModel):
-    reason: str
-
-
-class ContextualRelevancyTemplate:
-
-    @staticmethod
-    def generate_verdicts(input: str, context: str):
-        return f"""==== TASK INSTRUCTIONS ====
-You will be provided with an input (str) and a context (str). The input is a question/task proposed to a language model and the context is a list of documents retrieved in a RAG pipeline.
-Your task is to determine whether each statement found in the context is relevant to the input. To do so, break down the context into statements (high level pieces of information), then determine whether each statement is relevant to the input.
-
-==== FORMATTING YOUR ANSWER ====
-
-You should format your answer as a list of JSON objects, with each JSON object containing the following fields:
-- 'verdict': a string that is EXACTLY EITHER 'yes' or 'no', indicating whether the statement is relevant to the input
-- 'statement': a string that is the statement found in the context
-- 'reason': an string that is the justification for why the statement is relevant to the input. IF your verdict is 'no', you MUST quote the irrelevant parts of the statement to back up your reason.
-
-IMPORTANT: Please make sure to only return in JSON format.
-
-==== EXAMPLE ====
-Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat."
-Example Input: "What were some of Einstein's achievements?"
-
-Example:
-{{
-    "verdicts": [
-        {{
-            "verdict": "yes",
-            "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
-        }},
-        {{
-            "verdict": "no",
-            "statement": "There was a cat.",
-            "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements."
-        }}
-    ]
-}}
-
-==== YOUR TURN ====
-
-Input:
-{input}
-
-Context:
-{context}
-
-JSON:
-"""
-
-    @staticmethod
-    def generate_reason(
-        input: str,
-        irrelevancies: List[str],
-        relevant_statements: List[str],
-        score: float,
-    ):
-        return f"""==== TASK INSTRUCTIONS ====
-You will be provided with the following information:
-- An input to a RAG pipeline which is a question/task. There is an associated retrieval context to this input in the RAG pipeline (the context is not provided but is relevant to your task).
-- A list of irrelevant statements from the retrieval context. These statements are not relevant to the input query.
-- A list of relevant statements from the retrieval context. These statements are relevant to the input query.
-- A contextual relevancy score (the closer to 1 the better). Contextual relevancy is a measurement of how relevant the retrieval context is to the input query.
-
-Your task is to generate a CLEAR and CONCISE reason for the score. You should quote data provided in the reasons for the irrelevant and relevant statements to support your reason.
-
-==== FORMATTING YOUR ANSWER ====
-IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
-Example JSON:
-{{
-    "reason": "The score is <contextual_relevancy_score> because <your_reason>."
-}}
-
-If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
-
-==== EXAMPLE ====
-Input: "What is the capital of France?"
-
-Contextual Relevancy Score: 0.67
-
-Irrelevant Statements from the retrieval context:
-[{{"statement": "Flights to Paris are available from San Francisco starting at $1000", "reason": "Flight prices and routes are not relevant to identifying the capital of France"}}]
-
-Relevant Statements from the retrieval context:
-[{{"statement": "Paris is the capital of France"}}, {{"statement": "Paris is a major European city"}}]
-
-Example Response:
-{{
-    "reason": "The score is 0.67 because while the context contains directly relevant information stating that 'Paris is the capital of France', it also includes irrelevant travel information about flight prices from San Francisco."
-}}
-
-==== YOUR TURN ====
-Contextual Relevancy Score:
-{score}
-
-Input:
-{input}
-
-Irrelevant Statements from the retrieval context:
-{irrelevancies}
-
-Relevant Statements from the retrieval context:
-{relevant_statements}
-
-JSON:
-"""
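For readers following this removal, here is a minimal usage sketch of the deleted template. Only ContextualRelevancyTemplate, its method signature, and the verdict models come from the code above; the example values are hypothetical and this snippet is not part of the diff.

# Hypothetical usage of the removed prompt template (not part of the package diff).
prompt = ContextualRelevancyTemplate.generate_verdicts(
    input="What were some of Einstein's achievements?",
    context="Einstein won the Nobel Prize for his discovery of the photoelectric effect.",
)
# The judge model is expected to answer with JSON parseable into
# ContextualRelevancyVerdicts, e.g.
# {"verdicts": [{"verdict": "yes", "statement": "...", "reason": "..."}]}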
--- a/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py
+++ /dev/null
@@ -1,156 +0,0 @@
-from typing import List
-
-from judgeval.constants import APIScorer
-from judgeval.scorers.utils import (
-    scorer_progress_meter,
-    create_verbose_logs,
-    check_example_params
-)
-from judgeval.data import Example, ExampleParams
-from judgeval.scorers import JudgevalScorer
-
-
-required_params = [
-    ExampleParams.INPUT,
-    ExampleParams.ACTUAL_OUTPUT,
-    ExampleParams.EXPECTED_TOOLS,
-    ExampleParams.TOOLS_CALLED,
-]
-
-
-def get_lcs(seq1, seq2):
-    m, n = len(seq1), len(seq2)
-    dp = [[0] * (n + 1) for _ in range(m + 1)]
-
-    for i in range(1, m + 1):
-        for j in range(1, n + 1):
-            if seq1[i - 1] == seq2[j - 1]:
-                dp[i][j] = dp[i - 1][j - 1] + 1
-            else:
-                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
-
-    # Reconstruct the LCS
-    lcs = []
-    i, j = m, n
-    while i > 0 and j > 0:
-        if seq1[i - 1] == seq2[j - 1]:
-            lcs.append(seq1[i - 1])
-            i -= 1
-            j -= 1
-        elif dp[i - 1][j] > dp[i][j - 1]:
-            i -= 1
-        else:
-            j -= 1
-
-    return lcs[::-1]
-
-
-class ExecutionOrderScorer(JudgevalScorer):
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        include_reason: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-        should_exact_match: bool = False,
-        should_consider_ordering: bool = False,
-    ):
-        super().__init__(
-            score_type=APIScorer.EXECUTION_ORDER,
-            threshold=1 if strict_mode else threshold,
-            evaluation_model=None,
-            include_reason=include_reason,
-            async_mode=False,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode
-        )
-        self.should_exact_match = should_exact_match
-        self.should_consider_ordering = should_consider_ordering
-
-    def measure(
-        self,
-        example: Example,
-        _show_indicator: bool = True,
-    ) -> float:
-        check_example_params(example, required_params, self)
-
-        with scorer_progress_meter(self, display_meter=_show_indicator):
-            self.tools_called: List[str] = example.tools_called
-            self.expected_tools: List[str] = example.expected_tools
-            self.score = self._calculate_score()
-            self.reason = self._generate_reason()
-            self.success = self.score >= self.threshold
-            self.verbose_logs = create_verbose_logs(
-                self,
-                steps=[
-                    f"Expected Tools:\n{self.expected_tools}",
-                    f"Tools Called:\n{self.tools_called}",
-                    f"Score: {self.score}\nReason: {self.reason}",
-                ],
-            )
-            return self.score
-
-    async def a_measure(
-        self, test_case: Example, _show_indicator: bool = True
-    ) -> float:
-        check_example_params(test_case, required_params, self)
-        return self.measure(test_case, _show_indicator=_show_indicator)
-
-    def _generate_reason(self):
-        if self.should_exact_match:
-            return f"{'Exact match' if self.tools_called == self.expected_tools else 'Not an exact match'}: expected {self.expected_tools}, called {self.tools_called}."
-
-        elif self.should_consider_ordering:
-            lcs = get_lcs(self.expected_tools, self.tools_called)
-            missing = set(self.expected_tools) - set(self.tools_called)
-            out_of_order = set(self.expected_tools) - set(lcs)
-
-            if len(lcs) == len(self.expected_tools):
-                return f"Correct ordering: all expected tools {self.expected_tools} were called in the correct order."
-            else:
-                issues = []
-                if missing:
-                    issues.append(f"missing tools {list(missing)}")
-                if out_of_order:
-                    issues.append(f"out-of-order tools {list(out_of_order)}")
-                return f"Incorrect tool usage: {' and '.join(issues)}; expected {self.expected_tools}, called {self.tools_called}."
-
-        else:
-            used_expected = set(self.tools_called).intersection(
-                set(self.expected_tools)
-            )
-            missing = set(self.expected_tools) - used_expected
-
-            if len(used_expected) == len(self.expected_tools):
-                return f"All expected tools {self.expected_tools} were called (order not considered)."
-            else:
-                return f"Incomplete tool usage: missing tools {list(missing)}; expected {self.expected_tools}, called {self.tools_called}."
-
-    def _calculate_score(self):
-        if self.should_exact_match:
-            return 1.0 if self.tools_called == self.expected_tools else 0.0
-
-        elif self.should_consider_ordering:
-            longest_common_subsequence = get_lcs(
-                self.expected_tools, self.tools_called
-            )
-            score = len(longest_common_subsequence) / len(self.expected_tools)
-
-        else:
-            used_expected_tools = set(self.tools_called).intersection(
-                set(self.expected_tools)
-            )
-            score = len(used_expected_tools) / len(self.expected_tools)
-        return 0 if self.strict_mode and score < self.threshold else score
-
-    def _success_check(self) -> bool:
-        try:
-            self.success = self.score >= self.threshold
-        except:
-            self.success = False
-        return self.success
-
-    @property
-    def __name__(self):
-        return "Execution Order"
-
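To make the removed ordering logic concrete, here is a small worked sketch. The tool names are hypothetical; get_lcs, the score formula, and the default threshold of 0.5 come from the deleted code above.

# Worked example of the removed ordering-aware score (hypothetical tool names).
expected_tools = ["search", "summarize", "cite"]
tools_called = ["search", "cite", "summarize"]

lcs = get_lcs(expected_tools, tools_called)  # a length-2 common subsequence, ["search", "cite"] here
score = len(lcs) / len(expected_tools)       # 2 / 3 ≈ 0.67, which clears the default 0.5 threshold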
--- a/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py
+++ /dev/null
@@ -1,318 +0,0 @@
-"""
-Code for the local implementation of the Faithfulness metric.
-"""
-from typing import List, Optional, Union
-from judgeval.constants import APIScorer
-from judgeval.data import (
-    Example,
-    ExampleParams
-)
-from judgeval.scorers import JudgevalScorer
-from judgeval.scorers.utils import (
-    get_or_create_event_loop,
-    check_example_params
-)
-from judgeval.judges.utils import create_judge
-from judgeval.judges import JudgevalJudge
-from judgeval.scorers.utils import (
-    scorer_progress_meter,
-    create_verbose_logs,
-    parse_response_json
-)
-from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.prompts import *
-
-
-required_params = [
-    ExampleParams.INPUT,
-    ExampleParams.ACTUAL_OUTPUT,
-    ExampleParams.RETRIEVAL_CONTEXT,
-]
-
-
-class FaithfulnessScorer(JudgevalScorer):
-    def __init__(
-        self,
-        threshold: float = 0.5,
-        model: Optional[Union[str, JudgevalJudge]] = None,
-        include_reason: bool = True,
-        async_mode: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-        user: Optional[str] = None
-    ):
-        super().__init__(
-            score_type=APIScorer.FAITHFULNESS,
-            threshold=1 if strict_mode else threshold,
-            evaluation_model=None,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode
-        )
-        self.user = user
-        self.model, self.using_native_model = create_judge(model)
-        self.using_native_model = True  # NOTE: SETTING THIS FOR LITELLM and TOGETHER usage
-        self.evaluation_model = self.model.get_model_name()
-
-    def score_example(
-        self,
-        example: Example,
-        all_claims: bool = False,
-        _show_indicator: bool = True,
-    ) -> float:
-        check_example_params(example, required_params, self)
-
-        with scorer_progress_meter(self, display_meter=_show_indicator):
-            if self.async_mode:
-                loop = get_or_create_event_loop()
-                loop.run_until_complete(
-                    self.a_score_example(
-                        example,
-                        all_claims=all_claims,
-                        _show_indicator=False
-                    )
-                )
-            else:
-                self.claims = self._generate_claims(example.actual_output, all_claims=all_claims)
-                if self.additional_metadata is None:
-                    self.additional_metadata = {}
-                self.additional_metadata["claims"] = self.claims  # Add claims generated to metadata
-
-                self.verdicts = self._generate_verdicts(example.retrieval_context)
-                self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts]  # Add verdicts generated to metadata
-
-                self.score = self._calculate_score()
-                self.reason = self._generate_reason()
-                self.success = self.score >= self.threshold
-                self.verbose_logs = create_verbose_logs(
-                    self,
-                    steps=[
-                        f"Claims:\n{self.claims}",
-                        f"Verdicts:\n{self.verdicts}",
-                        f"Score: {self.score}\nReason: {self.reason}",
-                    ],
-                )
-
-            return self.score
-
-    async def a_score_example(
-        self,
-        example: Example,
-        _show_indicator: bool = True
-    ) -> float:
-        check_example_params(example, required_params, self)
-
-        with scorer_progress_meter(
-            self, async_mode=True, display_meter=_show_indicator
-        ):
-            self.claims = await self._a_generate_claims(example.actual_output)
-
-
-            if self.additional_metadata is None:
-                self.additional_metadata = {}
-            self.additional_metadata["claims"] = self.claims
-
-            self.verdicts = await self._a_generate_verdicts(example.retrieval_context)
-
-            self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts]  # Add verdicts generated to metadata
-
-            self.score = self._calculate_score()
-            self.reason = await self._a_generate_reason()
-            self.success = self.score >= self.threshold
-            self.verbose_logs = create_verbose_logs(
-                self,
-                steps=[
-                    f"Claims:\n{self.claims}",
-                    f"Verdicts:\n{self.verdicts}",
-                    f"Score: {self.score}\nReason: {self.reason}",
-                ],
-            )
-
-            return self.score
-
-    async def _a_generate_reason(self) -> str:
-        if self.include_reason is False:
-            return None
-
-        contradictions = []
-        for verdict in self.verdicts:
-            if verdict.verdict.strip().lower() == "no":
-                contradictions.append(verdict.model_dump())
-
-        prompt: dict = FaithfulnessTemplate.justify_reason(
-            contradictions=contradictions,
-            score=format(self.score, ".2f"),
-        )
-        if self.using_native_model:
-            res = await self.model.a_generate(prompt)
-            data = parse_response_json(res, self)
-            return data["reason"]
-        else:
-            try:
-                res: Reason = await self.model.a_generate(prompt, schema=Reason)
-                return res.reason
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = parse_response_json(res, self)
-                return data["reason"]
-
-    def _generate_reason(self) -> str:
-        if self.include_reason is False:
-            return None
-
-        contradictions = []
-        for verdict in self.verdicts:
-            if verdict.verdict.strip().lower() == "no":
-                contradictions.append(verdict.reason)
-
-        prompt: dict = FaithfulnessTemplate.justify_reason(
-            contradictions=contradictions,
-            score=format(self.score, ".2f"),
-        )
-
-        if self.using_native_model:
-            res = self.model.generate(prompt)
-            data = parse_response_json(res, self)
-            return data["reason"]
-        else:
-            try:
-                res: Reason = self.model.generate(prompt, schema=Reason)
-                return res.reason
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = parse_response_json(res, self)
-                return data["reason"]
-
-    async def _a_generate_verdicts(self, retrieval_context: str) -> List[FaithfulnessVerdict]:
-        if len(self.claims) == 0:
-            return []
-
-        verdicts: List[FaithfulnessVerdict] = []
-
-        claims = [
-            claim["claim"] for claim in self.claims
-        ]  # We only need the claims, not the quotes involved
-
-        prompt = FaithfulnessTemplate.create_verdicts(
-            claims=claims,
-            retrieval_context=retrieval_context,
-        )
-        if self.using_native_model:
-            res = await self.model.a_generate(prompt)
-            data = parse_response_json(res, self)
-            verdicts = [
-                FaithfulnessVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = await self.model.generate(
-                    prompt, schema=Verdicts
-                )
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = parse_response_json(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
-
-    def _generate_verdicts(self, retrieval_context: str) -> List[FaithfulnessVerdict]:
-        if len(self.claims) == 0:
-            return []
-
-        verdicts: List[FaithfulnessVerdict] = []
-
-        claims = [
-            claim["claim"] for claim in self.claims
-        ]  # We only need the claims, not the quotes involved
-
-        prompt = FaithfulnessTemplate.create_verdicts(
-            claims=claims,
-            retrieval_context=retrieval_context,
-        )
-        if self.using_native_model:
-            res = self.model.generate(prompt)
-            data = parse_response_json(res, self)
-            verdicts = [
-                FaithfulnessVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
-        else:
-            try:
-                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                verdicts = [item for item in res.verdicts]
-                return verdicts
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = parse_response_json(res, self)
-                verdicts = [
-                    FaithfulnessVerdict(**item) for item in data["verdicts"]
-                ]
-                return verdicts
-
-    async def _a_generate_claims(self, actual_output: str) -> List[str]:
-        prompt = FaithfulnessTemplate.find_claims(text=actual_output)
-        if self.using_native_model:
-            res = await self.model.a_generate(prompt)
-            data = parse_response_json(res, self)
-            return data["claims"]
-        else:
-            try:
-                res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = await self.model.a_generate(prompt)
-                data = parse_response_json(res, self)
-                return data["claims"]
-
-    def _generate_claims(self, actual_output: str, all_claims: bool = False) -> List[str]:
-        prompt = FaithfulnessTemplate.find_claims(text=actual_output)
-        if self.using_native_model:
-            res = self.model.generate(prompt)
-            data = parse_response_json(res, self)
-            return data["claims"]
-        else:
-            try:
-                res: Claims = self.model.generate(prompt, schema=Claims)
-                return res.claims
-            except TypeError:
-                res = self.model.generate(prompt)
-                data = parse_response_json(res, self)
-                return data["claims"]
-
-    def _calculate_score(self) -> float:
-        number_of_verdicts = len(self.verdicts)
-        if number_of_verdicts == 0:
-            return 1
-
-        faithfulness_count = 0
-        for verdict in self.verdicts:
-            if verdict.verdict.strip().lower() != "no":
-                faithfulness_count += 1
-
-        score = faithfulness_count / number_of_verdicts
-        return 0 if self.strict_mode and score < self.threshold else score
-
-    def _success_check(self) -> bool:
-        if self.error is not None:
-            self.success = False
-        else:
-            try:
-                self.success = self.score >= self.threshold
-            except:
-                self.success = False
-        return self.success
-
-    def get_claims(self):
-        return self.claims
-
-    def get_verdicts(self):
-        return self.verdicts
-
-    @property
-    def __name__(self):
-        return "Faithfulness"
-
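For reference, the removed scorer's final score is simply the share of claim verdicts that are not "no". A minimal sketch with hypothetical verdict strings, mirroring the deleted _calculate_score above (strict_mode handling omitted):

# Hypothetical verdicts returned by the judge model (not part of the diff).
verdict_values = ["yes", "no", "yes", "idk"]
faithfulness_count = sum(1 for v in verdict_values if v.strip().lower() != "no")
score = faithfulness_count / len(verdict_values)  # 3 / 4 = 0.75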