judgeval 0.0.32__py3-none-any.whl → 0.0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. judgeval/common/s3_storage.py +93 -0
  2. judgeval/common/tracer.py +612 -123
  3. judgeval/data/sequence.py +4 -10
  4. judgeval/judgment_client.py +25 -86
  5. judgeval/rules.py +4 -7
  6. judgeval/run_evaluation.py +1 -1
  7. judgeval/scorers/__init__.py +4 -4
  8. judgeval/scorers/judgeval_scorers/__init__.py +0 -176
  9. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/METADATA +15 -2
  10. judgeval-0.0.33.dist-info/RECORD +63 -0
  11. judgeval/scorers/base_scorer.py +0 -58
  12. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -27
  13. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  14. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -276
  15. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  16. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  17. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  18. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  19. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  20. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -161
  21. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -222
  22. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  23. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  24. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  25. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  26. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  27. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  28. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  29. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  30. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  31. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -3
  32. judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -156
  33. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  34. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -318
  35. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  36. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  37. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -264
  38. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  39. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -232
  40. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -102
  41. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  42. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  43. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  44. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  45. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -551
  46. judgeval-0.0.32.dist-info/RECORD +0 -97
  47. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/WHEEL +0 -0
  48. {judgeval-0.0.32.dist-info → judgeval-0.0.33.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,298 +0,0 @@
1
- from typing import Optional, List, Union, Tuple
2
-
3
- from judgeval.constants import APIScorer
4
- from judgeval.scorers.utils import (
5
- get_or_create_event_loop,
6
- scorer_progress_meter,
7
- create_verbose_logs,
8
- parse_response_json,
9
- check_example_params
10
- )
11
- from judgeval.scorers import JudgevalScorer
12
- from judgeval.judges import JudgevalJudge
13
- from judgeval.judges.utils import create_judge
14
- from judgeval.data import Example, ExampleParams
15
- from judgeval.scorers.judgeval_scorers.local_implementations.answer_relevancy.prompts import (
16
- Statements,
17
- ARVerdict,
18
- Verdicts,
19
- Reason,
20
- AnswerRelevancyTemplate,
21
- )
22
-
23
# Example fields handed to check_example_params() at the start of both
# score_example and a_score_example of AnswerRelevancyScorer.
required_params = [
    ExampleParams.INPUT,
    ExampleParams.ACTUAL_OUTPUT,
]
27
-
28
-
29
class AnswerRelevancyScorer(JudgevalScorer):
    """Judge-model scorer for how relevant an answer is to its input.

    Scoring makes up to three judge calls: the actual output is broken into
    statements, each statement receives a verdict, and (optionally) a
    natural-language reason is generated. The score is the fraction of
    verdicts that are not "no".
    """

    def __init__(
        self,
        threshold: float = 0.5,
        model: Optional[Union[str, JudgevalJudge]] = None,
        include_reason: bool = True,
        async_mode: bool = True,
        strict_mode: bool = False,
        verbose_mode: bool = False,
    ):
        """Configure the scorer and resolve the judge model.

        Args:
            threshold: Minimum score counted as success. Forced to 1 when
                ``strict_mode`` is set.
            model: Judge instance or model name handed to ``create_judge``.
            include_reason: When False, no reason is generated.
            async_mode: When True, ``score_example`` delegates to
                ``a_score_example`` on an event loop.
            strict_mode: Forces threshold 1 and zeroes any score below it.
            verbose_mode: Forwarded to the base scorer.
        """
        super().__init__(
            score_type=APIScorer.ANSWER_RELEVANCY,
            threshold=1 if strict_mode else threshold,
            evaluation_model=None,
            include_reason=include_reason,
            async_mode=async_mode,
            strict_mode=strict_mode,
            verbose_mode=verbose_mode,
        )
        self.model, self.using_native_model = create_judge(model)
        self.evaluation_model = self.model.get_model_name()

    def score_example(
        self,
        example: Example,
        _show_indicator: bool = True,
    ) -> float:
        """Score ``example`` synchronously and return the score.

        In async mode the work is delegated to ``a_score_example`` and run
        to completion on an event loop; otherwise the synchronous judge
        calls are used. Exceptions from the judge propagate unchanged.
        """
        check_example_params(example, required_params, self)

        with scorer_progress_meter(self, display_meter=_show_indicator):
            if self.async_mode:
                loop = get_or_create_event_loop()
                loop.run_until_complete(
                    self.a_score_example(example, _show_indicator=False)
                )
            else:
                self.statements: List[str] = self._get_statements(
                    example.actual_output
                )
                self.verdicts: List[ARVerdict] = self._get_verdicts(example.input)
                self.score = self._compute_score()
                self.reason = self._get_reason(example.input)
                self.success = self.score >= self.threshold
                self.verbose_logs = self._build_verbose_logs()
            return self.score

    async def a_score_example(
        self,
        example: Example,
        _show_indicator: bool = True,
    ) -> float:
        """Score ``example`` using the judge's async API and return the score.

        Any exception is printed and then re-raised.
        """
        check_example_params(example, required_params, self)
        try:
            with scorer_progress_meter(
                self, async_mode=True, display_meter=_show_indicator
            ):
                self.statements: List[str] = await self._a_get_statements(
                    example.actual_output
                )
                self.verdicts: List[ARVerdict] = await self._a_get_verdicts(
                    example.input
                )
                self.score = self._compute_score()
                self.reason = await self._a_get_reason(example.input)
                self.success = self.score >= self.threshold
                self.verbose_logs = self._build_verbose_logs()
                return self.score
        except Exception as e:
            print(f"Error: {e}")
            raise

    def _build_verbose_logs(self):
        """Build the verbose log from the current statements, verdicts, score and reason."""
        return create_verbose_logs(
            self,
            steps=[
                f"Statements:\n{self.statements}",
                # Convert to dict for serialization purposes
                f"Verdicts:\n{[v.model_dump() for v in self.verdicts]}",
                f"Score: {self.score}\nReason: {self.reason}",
            ],
        )

    def _irrelevant_statements(self) -> List[Tuple[str, str]]:
        """Return ``(statement, reason)`` pairs for every "no" verdict.

        Verdicts are matched to statements by position.
        """
        return [
            (self.statements[idx], verdict.reason)
            for idx, verdict in enumerate(self.verdicts)
            if verdict.verdict.strip().lower() == "no"
        ]

    async def _a_get_reason(self, input: str) -> Optional[str]:
        """Ask the judge (async) to explain the score; None if reasons are disabled."""
        if self.include_reason is False:
            return None

        prompt = AnswerRelevancyTemplate.generate_reason(
            irrelevant_statements=self._irrelevant_statements(),
            input=input,
            score=format(self.score, ".2f"),
        )
        if self.using_native_model:
            res = await self.model.a_generate(prompt)
            data = parse_response_json(res, self)
            return data["reason"]
        else:
            try:
                res: Reason = await self.model.a_generate(
                    prompt=prompt, schema=Reason
                )
                return res.reason
            except TypeError:
                # Judge does not accept ``schema``; fall back to raw JSON parsing.
                res = await self.model.a_generate(prompt)
                data = parse_response_json(res, self)
                return data["reason"]

    def _get_reason(self, input: str) -> Optional[str]:
        """Ask the judge (sync) to explain the score; None if reasons are disabled."""
        if self.include_reason is False:
            return None

        # generate_reason unpacks (statement, reason) pairs, so the pairs must
        # be built here exactly as in the async path (not bare reason strings).
        prompt = AnswerRelevancyTemplate.generate_reason(
            irrelevant_statements=self._irrelevant_statements(),
            input=input,
            score=format(self.score, ".2f"),
        )

        if self.using_native_model:
            res = self.model.generate(prompt)
            data = parse_response_json(res, self)
            return data["reason"]
        else:
            try:
                res: Reason = self.model.generate(prompt, schema=Reason)
                return res.reason
            except TypeError:
                # Judge does not accept ``schema``; fall back to raw JSON parsing.
                res = self.model.generate(prompt)
                data = parse_response_json(res, self)
                return data["reason"]

    async def _a_get_verdicts(self, input: str) -> List[ARVerdict]:
        """Ask the judge (async) for one verdict per statement; [] if there are no statements."""
        if not self.statements:
            return []

        prompt = AnswerRelevancyTemplate.generate_verdicts(
            input=input,
            actual_output=self.statements,
        )
        if self.using_native_model:
            res = await self.model.a_generate(prompt)
            data = parse_response_json(res, self)
            return [ARVerdict(**item) for item in data["verdicts"]]
        else:
            try:
                res: Verdicts = await self.model.a_generate(prompt, schema=Verdicts)
                return list(res.verdicts)
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = parse_response_json(res, self)
                return [ARVerdict(**item) for item in data["verdicts"]]

    def _get_verdicts(self, input: str) -> List[ARVerdict]:
        """Ask the judge (sync) for one verdict per statement; [] if there are no statements."""
        if not self.statements:
            return []

        prompt = AnswerRelevancyTemplate.generate_verdicts(
            input=input,
            actual_output=self.statements,
        )
        if self.using_native_model:
            res = self.model.generate(prompt)
            data = parse_response_json(res, self)
            return [ARVerdict(**item) for item in data["verdicts"]]
        else:
            try:
                res: Verdicts = self.model.generate(prompt, schema=Verdicts)
                return list(res.verdicts)
            except TypeError:
                res = self.model.generate(prompt)
                data = parse_response_json(res, self)
                return [ARVerdict(**item) for item in data["verdicts"]]

    async def _a_get_statements(
        self,
        actual_output: str,
    ) -> List[str]:
        """Ask the judge (async) to split ``actual_output`` into statements."""
        prompt = AnswerRelevancyTemplate.deduce_statements(
            actual_output=actual_output,
        )
        if self.using_native_model:
            res = await self.model.a_generate(prompt)
            data = parse_response_json(res, self)
            return data["statements"]
        else:
            try:
                res: Statements = await self.model.a_generate(
                    prompt, schema=Statements
                )
                return res.statements
            except TypeError:
                res = await self.model.a_generate(prompt)
                data = parse_response_json(res, self)
                return data["statements"]

    def _get_statements(
        self,
        actual_output: str,
    ) -> List[str]:
        """Ask the judge (sync) to split ``actual_output`` into statements."""
        prompt = AnswerRelevancyTemplate.deduce_statements(
            actual_output=actual_output,
        )
        if self.using_native_model:
            res = self.model.generate(prompt)
            data = parse_response_json(res, self)
            return data["statements"]
        else:
            try:
                res: Statements = self.model.generate(prompt, schema=Statements)
                return res.statements
            except TypeError:
                res = self.model.generate(prompt)
                data = parse_response_json(res, self)
                return data["statements"]

    def _compute_score(self):
        """Return the fraction of verdicts that are not "no".

        Returns 1 when there are no verdicts. In strict mode a score below
        the threshold is reported as 0.
        """
        number_of_verdicts = len(self.verdicts)
        if number_of_verdicts == 0:
            return 1

        relevant_count = sum(
            1
            for verdict in self.verdicts
            if verdict.verdict.strip().lower() != "no"
        )

        score = relevant_count / number_of_verdicts
        return 0 if self.strict_mode and score < self.threshold else score

    def _success_check(self) -> bool:
        """Set and return ``self.success``; False on a recorded error or an uncomparable score."""
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold
            except Exception:
                # e.g. score missing or None; treat as failure instead of raising.
                self.success = False
        return self.success

    @property
    def __name__(self):
        return "Answer Relevancy"
298
-
@@ -1,174 +0,0 @@
1
- """
2
- Util prompts for AnswerRelevancyScorer
3
- """
4
-
5
- from typing import List, Tuple
6
- from pydantic import BaseModel
7
-
8
-
9
- # BaseModels to enforce formatting in LLM JSON response
10
# Response schema for AnswerRelevancyTemplate.deduce_statements: the judge
# is asked for a JSON object whose "statements" key maps to a list of strings.
class Statements(BaseModel):
    statements: List[str]
12
-
13
-
14
# One per-statement judgement as requested by the generate_verdicts prompt.
class ARVerdict(BaseModel):
    # The prompt asks for one of "yes", "no" or "idk"; not validated here.
    verdict: str
    # Explanation for the chosen verdict.
    reason: str
17
-
18
-
19
# Response schema for AnswerRelevancyTemplate.generate_verdicts: a JSON
# object whose "verdicts" key maps to a list of ARVerdict objects.
class Verdicts(BaseModel):
    verdicts: List[ARVerdict]
21
-
22
-
23
# Response schema for AnswerRelevancyTemplate.generate_reason: a JSON
# object with a single "reason" string.
class Reason(BaseModel):
    reason: str
25
-
26
-
27
class AnswerRelevancyTemplate:
    """Prompt builders for the three judge calls of the answer relevancy scorer."""

    @staticmethod
    def deduce_statements(actual_output):
        """Build the prompt asking the judge to split ``actual_output`` into statements."""
        prompt = f"""You will be presented with a piece of text. Your task is to break down the text and generate a list of statements contained within the text. Single words and ambiguous phrases should be considered statements.

===== START OF EXAMPLES =====
Example 1:
Example text: The weather is sunny today. Temperature is 75 degrees. Don't forget your sunscreen!

Output:
{{
"statements": ["The weather is sunny today", "Temperature is 75 degrees", "Don't forget your sunscreen!"]
}}

Example 2:
Example text: I love pizza. It has cheese and tomato sauce and the crust is crispy.

Output:
{{
"statements": ["I love pizza", "It has cheese and tomato sauce", "The crust is crispy"]
}}
===== END OF EXAMPLES =====


**
IMPORTANT: Please return your answer in valid JSON format, with the "statements" key mapping to a list of strings. No words or explanation is needed.
**

==== START OF INPUT ====
Text:
{actual_output}
==== END OF INPUT ====

==== YOUR ANSWER ====
JSON:
"""
        return prompt

    @staticmethod
    def generate_verdicts(input, actual_output):
        """Build the prompt asking for a yes/no/idk verdict on each statement.

        ``actual_output`` is interpolated as-is, so a list of statements
        appears in its ``str()`` form.
        """
        prompt = f"""You will be provided with a list of statements from a response; your task is to determine whether each statement is relevant with respect to a provided input.
More specifically, you should generate a JSON object with the key "verdicts". "verdicts" will map to a list of nested JSON objects with two keys: `verdict` and `reason`.
The "verdict" key be ONE OF THE FOLLOWING: ["yes", "no", "idk"]. You should select "yes" if the statement is relevant to addressing the original input, "no" if the statement is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input).
The "reason" key should provide an explanation for your choice, regardless of which verdict you select.

NOTE: the list of statements was generated from an output corresponding to the provided `input`. Account for this relationship during your evaluation of the content relevancy.

==== OUTPUT FORMATTING ====
IMPORTANT: Please make sure to only return in JSON format, with the "verdicts" key mapping to a list of JSON objects. Each JSON object should contain keys "verdict" (one of ["yes", "no", "idk"]) and "reason" (str).

==== START OF EXAMPLES ====
Example input 1: How do I make chocolate chip cookies?
Example statements 1: ["Preheat the oven to 375°F.", "I love baking!", "My grandmother had a cat.", "Mix the butter and sugar until creamy.", "Have a great day!"]
Example JSON 1:
{{
"verdicts": [
{{
"verdict": "yes",
"reason": "Preheating the oven is a crucial first step in baking cookies"
}},
{{
"verdict": "idk",
"reason": "While showing enthusiasm for baking, this statement doesn't directly contribute to the recipe instructions"
}},
{{
"verdict": "no",
"reason": "The statement about the grandmother's cat is completely irrelevant to instructions for making chocolate chip cookies"
}},
{{
"verdict": "yes",
"reason": "Mixing butter and sugar is an essential step in cookie preparation"
}},
{{
"verdict": "no",
"reason": "A farewell message is not relevant to the cookie recipe instructions being requested"
}}
]
}}

Example input 2: What are the main causes of climate change?
Example statements 2: ["Greenhouse gas emissions trap heat in the atmosphere.", "I watched a movie yesterday.", "Industrial processes release large amounts of CO2.", "The weather is nice today."]
Example JSON 2:
{{
"verdicts": [
{{
"verdict": "yes",
"reason": "This directly explains a key mechanism of climate change"
}},
{{
"verdict": "no",
"reason": "Personal entertainment activities are not related to the causes of climate change"
}},
{{
"verdict": "yes",
"reason": "This identifies a major source of greenhouse gas emissions contributing to climate change"
}},
{{
"verdict": "idk",
"reason": "While weather is related to climate, a single day's weather observation doesn't directly address the causes of climate change"
}}
]
}}
==== END OF EXAMPLES ====

** LASTLY **
Since you are tasked to choose a verdict for each statement, the number of "verdicts" SHOULD BE EXACTLY EQUAL to the number of "statements".


==== YOUR TURN =====

Input:
{input}

Statements:
{actual_output}

JSON:
"""
        return prompt

    @staticmethod
    def generate_reason(irrelevant_statements: List[Tuple[str, str]], input: str, score: float):
        """Build the prompt asking the judge to justify the relevancy score.

        ``irrelevant_statements`` must be ``(statement, reason)`` pairs; each
        pair is rendered as a small block followed by a ``------`` separator.
        """
        rendered_pairs = "\n".join(
            f"statement: {statement}\nreason: {reason}\n------"
            for statement, reason in irrelevant_statements
        )
        prompt = f"""==== TASK INSTRUCTIONS ====\nYou will provided with three inputs: an answer relevancy score, a list of irrelevant statements made in a model's output (with the reason why it's irrelevant), and the corresponding input to the output. Your task is to provide a CLEAR and CONCISE reason for the answer relevancy score.
You should explain why the score is not higher, but also include why its current score is fair.
The irrelevant statements represent parts of the model output that are irrelevant to addressing whatever is asked/talked about in the input. The irrelevant statement will be paired with the reason why it's irrelevant.
If there are no irrelevant statements, instead respond with a positive remark with an upbeat encouraging tone (but don't overblow the kind attitude).


==== FORMATTING YOUR ANSWER ====
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
Example JSON:
{{
"reason": "The score is <answer_relevancy_score> because <your_reason>."
}}

==== YOUR TURN ====
---- ANSWER RELEVANCY SCORE ----
{score}

---- IRRELEVANT STATEMENTS ----
{rendered_pairs}

---- INPUT ----
{input}

---- YOUR RESPONSE ----
JSON:
"""
        return prompt
174
-
@@ -1,161 +0,0 @@
1
- from typing import Optional, Union, List
2
- from pydantic import BaseModel
3
-
4
- from judgeval.constants import APIScorer
5
- from judgeval.scorers import JudgevalScorer
6
- from judgeval.judges import JudgevalJudge
7
- from judgeval.judges.utils import create_judge
8
- from judgeval.data import Example, ExampleParams
9
- from judgeval.scorers.utils import (
10
- get_or_create_event_loop,
11
- scorer_progress_meter,
12
- create_verbose_logs,
13
- parse_response_json,
14
- check_example_params
15
- )
16
- from .prompts import ComparisonTemplate
17
-
18
# Example fields handed to check_example_params() at the start of both
# score_example and a_score_example of ComparisonScorer.
required_params = [
    ExampleParams.INPUT,
    ExampleParams.ACTUAL_OUTPUT,
    ExampleParams.EXPECTED_OUTPUT,
]
23
-
24
# One difference reported by the judge between the actual and expected output.
class ComparisonDifference(BaseModel):
    # Sentence taken from the actual output.
    actual_output_sentence: str
    # Sentence taken from the expected output.
    expected_output_sentence: str
    # Judge's explanation of the difference.
    reason: str
28
-
29
# Response schema passed as `schema=` when ComparisonScorer asks the judge
# for differences; wraps the list under the "differences" key.
class ComparisonDifferences(BaseModel):
    differences: List[ComparisonDifference]
31
-
32
class ComparisonScorer(JudgevalScorer):
    """Judge-model scorer that counts differences between actual and expected output.

    The score is the number of differences the judge reports for the given
    criteria, so lower is better: the example succeeds when the score is
    less than or equal to the threshold.
    """

    def __init__(
        self,
        criteria: str,
        description: str,
        threshold: float = 1,
        model: Optional[Union[str, JudgevalJudge]] = None,
        include_reason: bool = True,
        async_mode: bool = True,
        verbose_mode: bool = False,
    ):
        """Configure the scorer and resolve the judge model.

        Args:
            criteria: Name of the criteria to compare on; also used in ``__name__``.
            description: Description of the criteria, passed to the prompt.
            threshold: Maximum number of differences still counted as success.
            model: Judge instance or model name handed to ``create_judge``.
            include_reason: Forwarded to the base scorer.
            async_mode: When True, ``score_example`` delegates to
                ``a_score_example`` on an event loop.
            verbose_mode: Forwarded to the base scorer.
        """
        super().__init__(
            score_type=APIScorer.COMPARISON,
            threshold=threshold,
            evaluation_model=None,
            include_reason=include_reason,
            async_mode=async_mode,
            verbose_mode=verbose_mode,
        )
        self.model, self.using_native_model = create_judge(model)
        self.evaluation_model = self.model.get_model_name()
        self.criteria = criteria
        self.description = description

    def score_example(
        self,
        example: Example,
        _show_indicator: bool = True,
    ) -> float:
        """Score ``example`` synchronously and return the number of differences."""
        check_example_params(example, required_params, self)

        with scorer_progress_meter(self, display_meter=_show_indicator):
            if self.async_mode:
                loop = get_or_create_event_loop()
                loop.run_until_complete(
                    self.a_score_example(example, _show_indicator=False)
                )
            else:
                self._record_result(self._find_differences(example))

        return len(self.differences)

    async def a_score_example(
        self,
        example: Example,
        _show_indicator: bool = True,
    ) -> float:
        """Score ``example`` using the judge's async API and return the number of differences."""
        check_example_params(example, required_params, self)

        with scorer_progress_meter(
            self, async_mode=True, display_meter=_show_indicator
        ):
            # a_find_differences is a coroutine function: it must be awaited,
            # otherwise len() below would be called on a coroutine object.
            self._record_result(await self.a_find_differences(example))

        return self.score

    def _record_result(self, differences: list) -> None:
        """Store the differences and derive score, reason, success and verbose logs."""
        self.differences = differences
        self.score = len(differences)
        self.reason = str(differences)
        self.success = self.score <= self.threshold
        self.verbose_logs = create_verbose_logs(
            self,
            steps=[
                f"Score: {self.score}\nReason: {self.reason}",
            ],
        )

    def _build_prompt(self, example: Example) -> str:
        """Build the find-differences prompt for ``example``."""
        return ComparisonTemplate.find_differences(
            criteria=self.criteria,
            description=self.description,
            actual_output=example.actual_output,
            expected_output=example.expected_output,
        )

    def _find_differences(self, example: Example) -> list:
        """Ask the judge (sync) for the differences.

        Returns the ``differences`` list: schema objects when the judge
        accepted ``schema=``, otherwise the items parsed from the JSON reply.
        """
        prompt = self._build_prompt(example)
        if self.using_native_model:
            res = self.model.generate(prompt)
            data = parse_response_json(res, self)
            return data["differences"]
        else:
            try:
                res: ComparisonDifferences = self.model.generate(
                    prompt, schema=ComparisonDifferences
                )
                return res.differences
            except TypeError:
                # Judge does not accept ``schema``; fall back to raw JSON parsing.
                res = self.model.generate(prompt)
                data = parse_response_json(res, self)
                return data["differences"]

    async def a_find_differences(self, example: Example) -> list:
        """Ask the judge (async) for the differences; same result shape as ``_find_differences``."""
        prompt = self._build_prompt(example)
        if self.using_native_model:
            res = await self.model.a_generate(prompt)
            data = parse_response_json(res, self)
            return data["differences"]
        else:
            try:
                res: ComparisonDifferences = await self.model.a_generate(
                    prompt, schema=ComparisonDifferences
                )
                return res.differences
            except TypeError:
                # Judge does not accept ``schema``; fall back to raw JSON parsing.
                res = await self.model.a_generate(prompt)
                data = parse_response_json(res, self)
                return data["differences"]

    def _success_check(self) -> bool:
        """Set and return ``self.success``; False on a recorded error or an uncomparable score."""
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score <= self.threshold
            except Exception:
                # e.g. score missing or None; treat as failure instead of raising.
                self.success = False
        return self.success

    @property
    def __name__(self):
        return f"Comparison - {self.criteria}"