judgeval 0.0.32__py3-none-any.whl → 0.0.33__py3-none-any.whl

This diff shows the changes between publicly released versions of this package as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (48)
  1. judgeval/common/s3_storage.py +93 -0
  2. judgeval/common/tracer.py +612 -123
  3. judgeval/data/sequence.py +4 -10
  4. judgeval/judgment_client.py +25 -86
  5. judgeval/rules.py +4 -7
  6. judgeval/run_evaluation.py +1 -1
  7. judgeval/scorers/__init__.py +4 -4
  8. judgeval/scorers/judgeval_scorers/__init__.py +0 -176
  9. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
  10. judgeval-0.0.33.dist-info/RECORD +63 -0
  11. judgeval/scorers/base_scorer.py +0 -58
  12. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
  13. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  14. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
  15. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  16. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  17. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  18. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  19. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  20. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
  21. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
  22. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  23. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  24. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  25. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  26. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  27. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  28. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  29. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  30. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  31. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
  32. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
  33. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  34. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
  35. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  36. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  37. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
  38. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  39. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
  40. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
  41. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  42. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  43. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  44. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  45. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
  46. judgeval-0.0.32.dist-info/RECORD +0 -97
  47. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
  48. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py
@@ -1,121 +0,0 @@
- from typing import List
- from pydantic import BaseModel
-
-
- class ContextualRelevancyVerdict(BaseModel):
-     statement: str
-     verdict: str
-     reason: str
-
-
- class ContextualRelevancyVerdicts(BaseModel):
-     verdicts: List[ContextualRelevancyVerdict]
-
-
- class Reason(BaseModel):
-     reason: str
-
-
- class ContextualRelevancyTemplate:
-
-     @staticmethod
-     def generate_verdicts(input: str, context: str):
-         return f"""==== TASK INSTRUCTIONS ====
- You will be provided with an input (str) and a context (str). The input is a question/task proposed to a language model and the context is a list of documents retrieved in a RAG pipeline.
- Your task is to determine whether each statement found in the context is relevant to the input. To do so, break down the context into statements (high level pieces of information), then determine whether each statement is relevant to the input.
-
- ==== FORMATTING YOUR ANSWER ====
-
- You should format your answer as a list of JSON objects, with each JSON object containing the following fields:
- - 'verdict': a string that is EXACTLY EITHER 'yes' or 'no', indicating whether the statement is relevant to the input
- - 'statement': a string that is the statement found in the context
- - 'reason': an string that is the justification for why the statement is relevant to the input. IF your verdict is 'no', you MUST quote the irrelevant parts of the statement to back up your reason.
-
- IMPORTANT: Please make sure to only return in JSON format.
-
- ==== EXAMPLE ====
- Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat."
- Example Input: "What were some of Einstein's achievements?"
-
- Example:
- {{
- "verdicts": [
- {{
- "verdict": "yes",
- "statement": "Einstein won the Nobel Prize for his discovery of the photoelectric effect in 1968",
- }},
- {{
- "verdict": "no",
- "statement": "There was a cat.",
- "reason": "The retrieval context contained the information 'There was a cat' when it has nothing to do with Einstein's achievements."
- }}
- ]
- }}
-
- ==== YOUR TURN ====
-
- Input:
- {input}
-
- Context:
- {context}
-
- JSON:
- """
-
-     @staticmethod
-     def generate_reason(
-         input: str,
-         irrelevancies: List[str],
-         relevant_statements: List[str],
-         score: float,
-     ):
-         return f"""==== TASK INSTRUCTIONS ====
- You will be provided with the following information:
- - An input to a RAG pipeline which is a question/task. There is an associated retrieval context to this input in the RAG pipeline (the context is not provided but is relevant to your task).
- - A list of irrelevant statements from the retrieval context. These statements are not relevant to the input query.
- - A list of relevant statements from the retrieval context. These statements are relevant to the input query.
- - A contextual relevancy score (the closer to 1 the better). Contextual relevancy is a measurement of how relevant the retrieval context is to the input query.
-
- Your task is to generate a CLEAR and CONCISE reason for the score. You should quote data provided in the reasons for the irrelevant and relevant statements to support your reason.
-
- ==== FORMATTING YOUR ANSWER ====
- IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
- Example JSON:
- {{
- "reason": "The score is <contextual_relevancy_score> because <your_reason>."
- }}
-
- If the score is 1, keep it short and say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).
-
- ==== EXAMPLE ====
- Input: "What is the capital of France?"
-
- Contextual Relevancy Score: 0.67
-
- Irrelevant Statements from the retrieval context:
- [{{"statement": "Flights to Paris are available from San Francisco starting at $1000", "reason": "Flight prices and routes are not relevant to identifying the capital of France"}}]
-
- Relevant Statements from the retrieval context:
- [{{"statement": "Paris is the capital of France"}}, {{"statement": "Paris is a major European city"}}]
-
- Example Response:
- {{
- "reason": "The score is 0.67 because while the context contains directly relevant information stating that 'Paris is the capital of France', it also includes irrelevant travel information about flight prices from San Francisco."
- }}
-
- ==== YOUR TURN ====
- Contextual Relevancy Score:
- {score}
-
- Input:
- {input}
-
- Irrelevant Statements from the retrieval context:
- {irrelevancies}
-
- Relevant Statements from the retrieval context:
- {relevant_statements}
-
- JSON:
- """
judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py
@@ -1,3 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.execution_order.execution_order import ExecutionOrderScorer
-
- __all__ = ["ExecutionOrderScorer"]
judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py
@@ -1,156 +0,0 @@
- from typing import List
-
- from judgeval.constants import APIScorer
- from judgeval.scorers.utils import (
-     scorer_progress_meter,
-     create_verbose_logs,
-     check_example_params
- )
- from judgeval.data import Example, ExampleParams
- from judgeval.scorers import JudgevalScorer
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
-     ExampleParams.EXPECTED_TOOLS,
-     ExampleParams.TOOLS_CALLED,
- ]
-
-
- def get_lcs(seq1, seq2):
-     m, n = len(seq1), len(seq2)
-     dp = [[0] * (n + 1) for _ in range(m + 1)]
-
-     for i in range(1, m + 1):
-         for j in range(1, n + 1):
-             if seq1[i - 1] == seq2[j - 1]:
-                 dp[i][j] = dp[i - 1][j - 1] + 1
-             else:
-                 dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
-
-     # Reconstruct the LCS
-     lcs = []
-     i, j = m, n
-     while i > 0 and j > 0:
-         if seq1[i - 1] == seq2[j - 1]:
-             lcs.append(seq1[i - 1])
-             i -= 1
-             j -= 1
-         elif dp[i - 1][j] > dp[i][j - 1]:
-             i -= 1
-         else:
-             j -= 1
-
-     return lcs[::-1]
-
-
- class ExecutionOrderScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         include_reason: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-         should_exact_match: bool = False,
-         should_consider_ordering: bool = False,
-     ):
-         super().__init__(
-             score_type=APIScorer.EXECUTION_ORDER,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=False,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.should_exact_match = should_exact_match
-         self.should_consider_ordering = should_consider_ordering
-
-     def measure(
-         self,
-         example: Example,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             self.tools_called: List[str] = example.tools_called
-             self.expected_tools: List[str] = example.expected_tools
-             self.score = self._calculate_score()
-             self.reason = self._generate_reason()
-             self.success = self.score >= self.threshold
-             self.verbose_logs = create_verbose_logs(
-                 self,
-                 steps=[
-                     f"Expected Tools:\n{self.expected_tools}",
-                     f"Tools Called:\n{self.tools_called}",
-                     f"Score: {self.score}\nReason: {self.reason}",
-                 ],
-             )
-             return self.score
-
-     async def a_measure(
-         self, test_case: Example, _show_indicator: bool = True
-     ) -> float:
-         check_example_params(test_case, required_params, self)
-         return self.measure(test_case, _show_indicator=_show_indicator)
-
-     def _generate_reason(self):
-         if self.should_exact_match:
-             return f"{'Exact match' if self.tools_called == self.expected_tools else 'Not an exact match'}: expected {self.expected_tools}, called {self.tools_called}."
-
-         elif self.should_consider_ordering:
-             lcs = get_lcs(self.expected_tools, self.tools_called)
-             missing = set(self.expected_tools) - set(self.tools_called)
-             out_of_order = set(self.expected_tools) - set(lcs)
-
-             if len(lcs) == len(self.expected_tools):
-                 return f"Correct ordering: all expected tools {self.expected_tools} were called in the correct order."
-             else:
-                 issues = []
-                 if missing:
-                     issues.append(f"missing tools {list(missing)}")
-                 if out_of_order:
-                     issues.append(f"out-of-order tools {list(out_of_order)}")
-                 return f"Incorrect tool usage: {' and '.join(issues)}; expected {self.expected_tools}, called {self.tools_called}."
-
-         else:
-             used_expected = set(self.tools_called).intersection(
-                 set(self.expected_tools)
-             )
-             missing = set(self.expected_tools) - used_expected
-
-             if len(used_expected) == len(self.expected_tools):
-                 return f"All expected tools {self.expected_tools} were called (order not considered)."
-             else:
-                 return f"Incomplete tool usage: missing tools {list(missing)}; expected {self.expected_tools}, called {self.tools_called}."
-
-     def _calculate_score(self):
-         if self.should_exact_match:
-             return 1.0 if self.tools_called == self.expected_tools else 0.0
-
-         elif self.should_consider_ordering:
-             longest_common_subsequence = get_lcs(
-                 self.expected_tools, self.tools_called
-             )
-             score = len(longest_common_subsequence) / len(self.expected_tools)
-
-         else:
-             used_expected_tools = set(self.tools_called).intersection(
-                 set(self.expected_tools)
-             )
-             score = len(used_expected_tools) / len(self.expected_tools)
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     def _success_check(self) -> bool:
-         try:
-             self.success = self.score >= self.threshold
-         except:
-             self.success = False
-         return self.success
-
-     @property
-     def __name__(self):
-         return "Execution Order"
-
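For readers who relied on the removed local implementation, here is a minimal usage sketch of the ExecutionOrderScorer deleted above, based only on the 0.0.32 code shown in this hunk. The Example keyword arguments (input, actual_output, tools_called, expected_tools) are assumptions inferred from required_params, not taken from this diff.

# Illustrative sketch only; Example keyword names are assumed, not confirmed by this diff.
from judgeval.data import Example
from judgeval.scorers.judgeval_scorers.local_implementations.execution_order import ExecutionOrderScorer

example = Example(
    input="Book a table for two and email me the confirmation",
    actual_output="Done: table booked and confirmation emailed.",
    tools_called=["search_restaurants", "send_email", "book_table"],
    expected_tools=["search_restaurants", "book_table", "send_email"],
)

# With should_consider_ordering=True the score is len(LCS) / len(expected_tools);
# the longest common subsequence here has length 2, so the score is 2/3.
scorer = ExecutionOrderScorer(threshold=0.5, should_consider_ordering=True)
score = scorer.measure(example, _show_indicator=False)
print(score, scorer.reason)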
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py
@@ -1,3 +0,0 @@
- from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.faithfulness_scorer import FaithfulnessScorer
-
- __all__ = ["FaithfulnessScorer"]
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py
@@ -1,318 +0,0 @@
- """
- Code for the local implementation of the Faithfulness metric.
- """
- from typing import List, Optional, Union
- from judgeval.constants import APIScorer
- from judgeval.data import (
-     Example,
-     ExampleParams
- )
- from judgeval.scorers import JudgevalScorer
- from judgeval.scorers.utils import (
-     get_or_create_event_loop,
-     check_example_params
- )
- from judgeval.judges.utils import create_judge
- from judgeval.judges import JudgevalJudge
- from judgeval.scorers.utils import (
-     scorer_progress_meter,
-     create_verbose_logs,
-     parse_response_json
- )
- from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness.prompts import *
-
-
- required_params = [
-     ExampleParams.INPUT,
-     ExampleParams.ACTUAL_OUTPUT,
-     ExampleParams.RETRIEVAL_CONTEXT,
- ]
-
-
- class FaithfulnessScorer(JudgevalScorer):
-     def __init__(
-         self,
-         threshold: float = 0.5,
-         model: Optional[Union[str, JudgevalJudge]] = None,
-         include_reason: bool = True,
-         async_mode: bool = True,
-         strict_mode: bool = False,
-         verbose_mode: bool = False,
-         user: Optional[str] = None
-     ):
-         super().__init__(
-             score_type=APIScorer.FAITHFULNESS,
-             threshold=1 if strict_mode else threshold,
-             evaluation_model=None,
-             include_reason=include_reason,
-             async_mode=async_mode,
-             strict_mode=strict_mode,
-             verbose_mode=verbose_mode
-         )
-         self.user = user
-         self.model, self.using_native_model = create_judge(model)
-         self.using_native_model = True  # NOTE: SETTING THIS FOR LITELLM and TOGETHER usage
-         self.evaluation_model = self.model.get_model_name()
-
-     def score_example(
-         self,
-         example: Example,
-         all_claims: bool = False,
-         _show_indicator: bool = True,
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(self, display_meter=_show_indicator):
-             if self.async_mode:
-                 loop = get_or_create_event_loop()
-                 loop.run_until_complete(
-                     self.a_score_example(
-                         example,
-                         all_claims=all_claims,
-                         _show_indicator=False
-                     )
-                 )
-             else:
-                 self.claims = self._generate_claims(example.actual_output, all_claims=all_claims)
-                 if self.additional_metadata is None:
-                     self.additional_metadata = {}
-                 self.additional_metadata["claims"] = self.claims  # Add claims generated to metadata
-
-                 self.verdicts = self._generate_verdicts(example.retrieval_context)
-                 self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts]  # Add verdicts generated to metadata
-
-                 self.score = self._calculate_score()
-                 self.reason = self._generate_reason()
-                 self.success = self.score >= self.threshold
-                 self.verbose_logs = create_verbose_logs(
-                     self,
-                     steps=[
-                         f"Claims:\n{self.claims}",
-                         f"Verdicts:\n{self.verdicts}",
-                         f"Score: {self.score}\nReason: {self.reason}",
-                     ],
-                 )
-
-             return self.score
-
-     async def a_score_example(
-         self,
-         example: Example,
-         _show_indicator: bool = True
-     ) -> float:
-         check_example_params(example, required_params, self)
-
-         with scorer_progress_meter(
-             self, async_mode=True, display_meter=_show_indicator
-         ):
-             self.claims = await self._a_generate_claims(example.actual_output)
-
-
-             if self.additional_metadata is None:
-                 self.additional_metadata = {}
-             self.additional_metadata["claims"] = self.claims
-
-             self.verdicts = await self._a_generate_verdicts(example.retrieval_context)
-
-             self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts]  # Add verdicts generated to metadata
-
-             self.score = self._calculate_score()
-             self.reason = await self._a_generate_reason()
-             self.success = self.score >= self.threshold
-             self.verbose_logs = create_verbose_logs(
-                 self,
-                 steps=[
-                     f"Claims:\n{self.claims}",
-                     f"Verdicts:\n{self.verdicts}",
-                     f"Score: {self.score}\nReason: {self.reason}",
-                 ],
-             )
-
-             return self.score
-
-     async def _a_generate_reason(self) -> str:
-         if self.include_reason is False:
-             return None
-
-         contradictions = []
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() == "no":
-                 contradictions.append(verdict.model_dump())
-
-         prompt: dict = FaithfulnessTemplate.justify_reason(
-             contradictions=contradictions,
-             score=format(self.score, ".2f"),
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = await self.model.a_generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     def _generate_reason(self) -> str:
-         if self.include_reason is False:
-             return None
-
-         contradictions = []
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() == "no":
-                 contradictions.append(verdict.reason)
-
-         prompt: dict = FaithfulnessTemplate.justify_reason(
-             contradictions=contradictions,
-             score=format(self.score, ".2f"),
-         )
-
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["reason"]
-         else:
-             try:
-                 res: Reason = self.model.generate(prompt, schema=Reason)
-                 return res.reason
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["reason"]
-
-     async def _a_generate_verdicts(self, retrieval_context: str) -> List[FaithfulnessVerdict]:
-         if len(self.claims) == 0:
-             return []
-
-         verdicts: List[FaithfulnessVerdict] = []
-
-         claims = [
-             claim["claim"] for claim in self.claims
-         ]  # We only need the claims, not the quotes involved
-
-         prompt = FaithfulnessTemplate.create_verdicts(
-             claims=claims,
-             retrieval_context=retrieval_context,
-         )
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             verdicts = [
-                 FaithfulnessVerdict(**item) for item in data["verdicts"]
-             ]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = await self.model.generate(
-                     prompt, schema=Verdicts
-                 )
-                 verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 verdicts = [
-                     FaithfulnessVerdict(**item) for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     def _generate_verdicts(self, retrieval_context: str) -> List[FaithfulnessVerdict]:
-         if len(self.claims) == 0:
-             return []
-
-         verdicts: List[FaithfulnessVerdict] = []
-
-         claims = [
-             claim["claim"] for claim in self.claims
-         ]  # We only need the claims, not the quotes involved
-
-         prompt = FaithfulnessTemplate.create_verdicts(
-             claims=claims,
-             retrieval_context=retrieval_context,
-         )
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             verdicts = [
-                 FaithfulnessVerdict(**item) for item in data["verdicts"]
-             ]
-             return verdicts
-         else:
-             try:
-                 res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-                 verdicts = [item for item in res.verdicts]
-                 return verdicts
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 verdicts = [
-                     FaithfulnessVerdict(**item) for item in data["verdicts"]
-                 ]
-                 return verdicts
-
-     async def _a_generate_claims(self, actual_output: str) -> List[str]:
-         prompt = FaithfulnessTemplate.find_claims(text=actual_output)
-         if self.using_native_model:
-             res = await self.model.a_generate(prompt)
-             data = parse_response_json(res, self)
-             return data["claims"]
-         else:
-             try:
-                 res: Claims = await self.model.a_generate(prompt, schema=Claims)
-                 return res.claims
-             except TypeError:
-                 res = await self.model.a_generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["claims"]
-
-     def _generate_claims(self, actual_output: str, all_claims: bool = False) -> List[str]:
-         prompt = FaithfulnessTemplate.find_claims(text=actual_output)
-         if self.using_native_model:
-             res = self.model.generate(prompt)
-             data = parse_response_json(res, self)
-             return data["claims"]
-         else:
-             try:
-                 res: Claims = self.model.generate(prompt, schema=Claims)
-                 return res.claims
-             except TypeError:
-                 res = self.model.generate(prompt)
-                 data = parse_response_json(res, self)
-                 return data["claims"]
-
-     def _calculate_score(self) -> float:
-         number_of_verdicts = len(self.verdicts)
-         if number_of_verdicts == 0:
-             return 1
-
-         faithfulness_count = 0
-         for verdict in self.verdicts:
-             if verdict.verdict.strip().lower() != "no":
-                 faithfulness_count += 1
-
-         score = faithfulness_count / number_of_verdicts
-         return 0 if self.strict_mode and score < self.threshold else score
-
-     def _success_check(self) -> bool:
-         if self.error is not None:
-             self.success = False
-         else:
-             try:
-                 self.success = self.score >= self.threshold
-             except:
-                 self.success = False
-         return self.success
-
-     def get_claims(self):
-         return self.claims
-
-     def get_verdicts(self):
-         return self.verdicts
-
-     @property
-     def __name__(self):
-         return "Faithfulness"
-
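Likewise, a minimal sketch of how the removed local FaithfulnessScorer above was typically driven in 0.0.32. The Example keyword arguments and the "gpt-4o" model string are assumptions for illustration; only the scorer API itself comes from the deleted file.

# Illustrative sketch only; Example keyword names and the model string are assumed.
from judgeval.data import Example
from judgeval.scorers.judgeval_scorers.local_implementations.faithfulness import FaithfulnessScorer

example = Example(
    input="Where was Einstein born?",
    actual_output="Einstein was born in Ulm, Germany, in 1879.",
    retrieval_context=["Albert Einstein was born in Ulm, in the Kingdom of Wurttemberg, on 14 March 1879."],
)

# async_mode=False keeps the sync path: claims -> verdicts -> score -> optional reason.
scorer = FaithfulnessScorer(threshold=0.7, model="gpt-4o", async_mode=False)
score = scorer.score_example(example, _show_indicator=False)
print(score, scorer.reason)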