eval-ai-library 0.1.0 (eval_ai_library-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of eval-ai-library has been flagged as potentially problematic.
- eval_ai_library-0.1.0.dist-info/METADATA +753 -0
- eval_ai_library-0.1.0.dist-info/RECORD +34 -0
- eval_ai_library-0.1.0.dist-info/WHEEL +5 -0
- eval_ai_library-0.1.0.dist-info/licenses/LICENSE +21 -0
- eval_ai_library-0.1.0.dist-info/top_level.txt +1 -0
- eval_lib/__init__.py +122 -0
- eval_lib/agent_metrics/__init__.py +12 -0
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +231 -0
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +251 -0
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +347 -0
- eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +106 -0
- eval_lib/datagenerator/datagenerator.py +230 -0
- eval_lib/datagenerator/document_loader.py +510 -0
- eval_lib/datagenerator/prompts.py +192 -0
- eval_lib/evaluate.py +335 -0
- eval_lib/evaluation_schema.py +63 -0
- eval_lib/llm_client.py +286 -0
- eval_lib/metric_pattern.py +229 -0
- eval_lib/metrics/__init__.py +25 -0
- eval_lib/metrics/answer_precision_metric/answer_precision.py +405 -0
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +195 -0
- eval_lib/metrics/bias_metric/bias.py +114 -0
- eval_lib/metrics/contextual_precision_metric/contextual_precision.py +102 -0
- eval_lib/metrics/contextual_recall_metric/contextual_recall.py +91 -0
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +169 -0
- eval_lib/metrics/custom_metric/custom_eval.py +303 -0
- eval_lib/metrics/faithfulness_metric/faithfulness.py +140 -0
- eval_lib/metrics/geval/geval.py +326 -0
- eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +102 -0
- eval_lib/metrics/toxicity_metric/toxicity.py +113 -0
- eval_lib/price.py +37 -0
- eval_lib/py.typed +1 -0
- eval_lib/testcases_schema.py +27 -0
- eval_lib/utils.py +99 -0
eval_lib/metrics/custom_metric/custom_eval.py
@@ -0,0 +1,303 @@
+# custom_eval.py
+"""
+Custom Evaluation Metric with Chain-of-Thought and Probability-Weighted Scoring
+Uses advanced techniques from G-Eval for improved accuracy
+"""
+import json
+import re
+from typing import Dict, Any, List, Tuple
+from collections import Counter
+from eval_lib.metric_pattern import MetricPattern
+from eval_lib.testcases_schema import EvalTestCase
+from eval_lib.llm_client import chat_complete
+from eval_lib.utils import extract_json_block
+
+
+class CustomEvalMetric(MetricPattern):
+    name = "customEval"
+
+    def __init__(self, model: str, threshold: float, name: str, criteria: str):
+        super().__init__(model=model, threshold=threshold)
+        self.custom_name = name
+        self.criteria = criteria
+
+    # ==================== PROMPTS ====================
+
+    @staticmethod
+    def _prompt_generate_steps(criteria: str) -> str:
+        """Generate evaluation steps from criteria (Chain-of-Thought)"""
+        return f"""Given the evaluation criteria below, generate 3-5 detailed evaluation steps.
+
+Evaluation Criteria:
+{criteria}
+
+Generate steps that are:
+1. Specific and actionable
+2. Logically ordered
+3. Lead to assigning a score from 0.0 to 1.0
+
+**
+Return ONLY JSON:
+{{
+    "steps": ["Step 1: ...", "Step 2: ...", "Step 3: ..."]
+}}
+**
+
+JSON:"""
+
+    @staticmethod
+    def _prompt_evaluate(criteria: str, evaluation_steps: List[str], test_case: EvalTestCase) -> str:
+        """Generate evaluation prompt with CoT steps"""
+        steps_text = "\n".join(
+            [f"{i+1}. {step}" for i, step in enumerate(evaluation_steps)])
+
+        parts = [
+            f"User Input:\n{test_case.input}",
+            f"Model Output:\n{test_case.actual_output}"
+        ]
+
+        if test_case.expected_output:
+            parts.append(f"Expected Output:\n{test_case.expected_output}")
+
+        if test_case.retrieval_context:
+            parts.append(f"Context:\n" +
+                         "\n".join(test_case.retrieval_context))
+
+        input_block = "\n\n".join(parts)
+
+        return f"""You are a strict evaluator. Use the criteria and evaluation steps below.
+
+Evaluation Criteria:
+{criteria}
+
+Evaluation Steps:
+{steps_text}
+
+{input_block}
+
+Based on the evaluation steps, assign a score from 0.0 to 1.0 (where 0.0 is worst and 1.0 is best).
+
+**
+Return ONLY JSON:
+{{
+    "score": <float between 0.0 and 1.0>
+}}
+**
+
+JSON:"""
+
+    @staticmethod
+    def _prompt_reason(
+        criteria: str,
+        evaluation_steps: List[str],
+        test_case: EvalTestCase,
+        score: float
+    ) -> str:
+        """Generate explanation for the score"""
+        steps_text = "\n".join(
+            [f"{i+1}. {step}" for i, step in enumerate(evaluation_steps)])
+
+        parts = [
+            f"User Input:\n{test_case.input}",
+            f"Model Output:\n{test_case.actual_output}"
+        ]
+
+        if test_case.expected_output:
+            parts.append(f"Expected Output:\n{test_case.expected_output}")
+
+        if test_case.retrieval_context:
+            parts.append(f"Context:\n" +
+                         "\n".join(test_case.retrieval_context))
+
+        input_block = "\n\n".join(parts)
+
+        return f"""You assigned a score of {score:.2f} (out of 1.0) for this evaluation.
+
+Evaluation Criteria:
+{criteria}
+
+Evaluation Steps:
+{steps_text}
+
+{input_block}
+
+Final Score: {score:.2f}/1.0
+
+Explain why this score was assigned, referencing specific aspects from the evaluation steps.
+
+**
+Return ONLY JSON:
+{{
+    "reason": "Your explanation..."
+}}
+**
+
+JSON:"""
+
+    # ==================== HELPER METHODS ====================
+
+    def _extract_score_from_response(self, text: str) -> float:
+        """Extract float score from LLM response (0.0-1.0 range)"""
+        text = text.strip()
+
+        # Try JSON parsing first
+        try:
+            data = json.loads(extract_json_block(text))
+            if "score" in data:
+                score = float(data["score"])
+                if 0.0 <= score <= 1.0:
+                    return score
+        except:
+            pass
+
+        # Try regex patterns
+        patterns = [
+            r'"score"\s*:\s*(\d+\.?\d*)',
+            r'score[:\s]+(\d+\.?\d*)',
+            r'^\s*(\d+\.?\d*)\s*$',
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
+            if match:
+                score = float(match.group(1))
+                if 0.0 <= score <= 1.0:
+                    return score
+
+        raise RuntimeError(f"Failed to extract score from response: {text}")
+
+    # ==================== CORE ALGORITHM ====================
+
+    async def _probability_weighted_scoring(
+        self,
+        prompt: str,
+        n_samples: int = 20,
+        temperature: float = 2.0
+    ) -> Tuple[float, List[float], float]:
+        """
+        Probability-weighted scoring: score = Σ p(si) × si
+        Samples multiple times to estimate probability distribution
+
+        Returns:
+            (final_score, sampled_scores, total_cost)
+        """
+        total_cost = 0.0
+        scores = []
+
+        # Sample n times with high temperature
+        for _ in range(n_samples):
+            text, cost = await chat_complete(
+                self.model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=temperature
+            )
+            total_cost += cost or 0.0
+
+            try:
+                score = self._extract_score_from_response(text)
+                scores.append(score)
+            except:
+                continue
+
+        if not scores:
+            raise RuntimeError(
+                f"Failed to extract any valid scores from {n_samples} samples")
+
+        # Calculate probability-weighted score as mean
+        weighted_score = sum(scores) / len(scores)
+
+        return weighted_score, scores, total_cost
+
+    async def evaluate(self, test_case: EvalTestCase) -> Dict[str, Any]:
+        """
+        Evaluate using Chain-of-Thought and Probability-Weighted Scoring.
+
+        Algorithm:
+        1. Auto-generate evaluation steps from criteria (CoT)
+        2. Apply probability-weighted scoring (20 samples, temp=2.0)
+        3. Generate detailed explanation
+        4. Build comprehensive evaluation_log
+        """
+        total_cost = 0.0
+
+        # Step 1: Auto-generate evaluation steps (Chain-of-Thought from G-Eval)
+        steps_prompt = self._prompt_generate_steps(self.criteria)
+        steps_text, step_cost = await chat_complete(
+            self.model,
+            messages=[{"role": "user", "content": steps_prompt}],
+            temperature=0.0
+        )
+        total_cost += step_cost or 0.0
+
+        try:
+            parsed_steps = json.loads(extract_json_block(steps_text))
+            evaluation_steps = parsed_steps["steps"]
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to parse evaluation steps: {e}\n{steps_text}")
+
+        # Step 2: Generate evaluation prompt with CoT
+        eval_prompt = self._prompt_evaluate(
+            self.criteria, evaluation_steps, test_case)
+
+        # Step 3: Probability-weighted scoring (20 samples from G-Eval)
+        final_score, sampled_scores, scoring_cost = await self._probability_weighted_scoring(
+            eval_prompt,
+            n_samples=20,
+            temperature=2.0
+        )
+        total_cost += scoring_cost
+
+        # Step 4: Generate explanation
+        reason_prompt = self._prompt_reason(
+            self.criteria, evaluation_steps, test_case, final_score)
+        reason_text, reason_cost = await chat_complete(
+            self.model,
+            messages=[{"role": "user", "content": reason_prompt}],
+            temperature=0.0
+        )
+        total_cost += reason_cost or 0.0
+
+        # Parse reason
+        try:
+            reason_data = json.loads(extract_json_block(reason_text))
+            reason = reason_data.get("reason", reason_text)
+        except:
+            reason = reason_text.strip()
+
+        success = final_score >= self.threshold
+
+        # Step 5: Build comprehensive evaluation_log
+        evaluation_log = {
+            "input_question": test_case.input,
+            "actual_output": test_case.actual_output,
+            "expected_output": test_case.expected_output,
+            "retrieval_context": test_case.retrieval_context,
+            "criteria": self.criteria,
+            "comment_criteria": "Custom evaluation criteria provided by user.",
+            "evaluation_steps": evaluation_steps,
+            "comment_evaluation_steps": "Auto-generated evaluation steps using Chain-of-Thought (CoT) technique from G-Eval.",
+            "sampled_scores": sampled_scores,
+            "comment_sampled_scores": f"Individual scores from {len(sampled_scores)} samples with temperature=2.0.",
+            "score_distribution": {f"{s:.2f}": sampled_scores.count(s) for s in set(sampled_scores)},
+            "comment_score_distribution": "Frequency distribution of sampled scores for probability-weighted calculation.",
+            "final_score": round(final_score, 4),
+            "comment_final_score": "Probability-weighted score calculated as mean of sampled scores (G-Eval technique).",
+            "threshold": self.threshold,
+            "success": success,
+            "comment_success": "Whether the final score passes the custom threshold.",
+            "final_reason": reason,
+            "comment_reasoning": "LLM-generated explanation based on evaluation steps and criteria."
+        }
+
+        return {
+            "score": round(final_score, 4),
+            "success": success,
+            "reason": reason,
+            "evaluation_cost": round(total_cost, 6),
+            "evaluation_log": evaluation_log,
+        }
+
+    @property
+    def name(self):
+        return f"Custom: {self.custom_name}"
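For orientation, a minimal usage sketch of the CustomEvalMetric class added above. Everything beyond the class itself is an assumption, not something confirmed by this diff: that EvalTestCase can be constructed with the input, actual_output, expected_output, and retrieval_context keyword arguments its attributes suggest, that chat_complete reads provider credentials from the environment, that the import path mirrors the file layout (the public API in eval_lib/__init__.py may differ), and that the model string shown is accepted by eval_lib/llm_client.py.

# Hypothetical usage sketch, not part of the package diff.
# Assumptions: keyword-constructible EvalTestCase, chat_complete configured
# via environment credentials, placeholder model name.
import asyncio

from eval_lib.metrics.custom_metric.custom_eval import CustomEvalMetric
from eval_lib.testcases_schema import EvalTestCase


async def main() -> None:
    metric = CustomEvalMetric(
        model="gpt-4o-mini",  # placeholder model identifier
        threshold=0.7,
        name="Helpfulness",
        criteria="The answer should directly address the question and be factually correct.",
    )
    case = EvalTestCase(
        input="What is the capital of France?",
        actual_output="Paris is the capital of France.",
        expected_output="Paris",
        retrieval_context=["Paris is the capital and most populous city of France."],
    )
    # evaluate() issues 22 LLM calls: 1 for step generation, 20 scoring samples, 1 for the reason.
    result = await metric.evaluate(case)
    print(result["score"], result["success"])
    print(result["reason"])


if __name__ == "__main__":
    asyncio.run(main())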
eval_lib/metrics/faithfulness_metric/faithfulness.py
@@ -0,0 +1,140 @@
+# faithfulness_metric.py
+'''
+Faithfulness Metric: Evaluates the factual consistency of a chatbot's answer
+with respect to the retrieved context.
+Score calculation: Softmax aggregation of verdicts on factual statements
+'''
+from typing import List, Dict, Tuple, Any
+import json
+import re
+import numpy as np
+from math import exp
+from eval_lib.testcases_schema import EvalTestCase
+from eval_lib.metric_pattern import MetricPattern
+from eval_lib.llm_client import chat_complete
+from eval_lib.utils import score_agg, extract_json_block
+
+VERDICT_WEIGHTS = {
+    "fully": 1.0,
+    "mostly": 0.9,
+    "partial": 0.7,
+    "minor": 0.3,
+    "none": 0.0,
+}
+
+
+class FaithfulnessMetric(MetricPattern):
+    name = "faithfulnessMetric"
+
+    def __init__(
+        self,
+        model: str,
+        threshold: float = 0.7,
+        temperature: float = 0.5,
+    ):
+        super().__init__(model=model, threshold=threshold)
+        self.temperature = temperature
+
+    async def _generate_statements(self, answer: str) -> Tuple[List[str], float]:
+        prompt = (
+            "Extract standalone factual claims from the following answer.\n"
+            "Each statement must be a distinct, verifiable fact.\n\n"
+            f"Answer:\n{answer}\n\n"
+            "Return a JSON array of strings."
+        )
+        text, cost = await chat_complete(self.model, [{"role": "user", "content": prompt}], temperature=0.0)
+        raw_json = extract_json_block(text)
+        statements = json.loads(raw_json)
+        assert isinstance(statements, list)
+        return statements, cost or 0.0
+
+    async def _generate_verdicts(self, context: str, statements: List[str]) -> Tuple[List[Dict[str, str]], float, float]:
+        prompt = (
+            "Evaluate how well each statement is supported by the context.\n\n"
+            "Levels:\n"
+            "- fully: directly supported word-for-word\n"
+            "- mostly: strongly supported but wording differs slightly\n"
+            "- partial: partially supported but with some gaps\n"
+            "- minor: tangentially related or ambiguous\n"
+            "- none: clearly unsupported or contradicted\n\n"
+            f"CONTEXT:\n{context}\n\n"
+            f"STATEMENTS (JSON array):\n{json.dumps(statements, ensure_ascii=False)}\n\n"
+            "Return only a JSON array of objects like:\n"
+            '[{"verdict": "fully|mostly|partial|minor|none", '
+            '"reason": "<brief>", '
+            '"support": "<exact context sentence(s)> or \'none\'"}]'
+        )
+        text, cost = await chat_complete(self.model, [{"role": "user", "content": prompt}], temperature=0.0)
+        raw_json = extract_json_block(text)
+        verdicts: List[Dict[str, Any]] = json.loads(raw_json)
+
+        for v in verdicts:
+            supp = v.get("support", "").strip().lower()
+            if supp == "none" and v["verdict"] in ("fully", "mostly"):
+                v["verdict"] = "partial"
+
+        scores = [VERDICT_WEIGHTS.get(v["verdict"], 0.0) for v in verdicts]
+        score = round(score_agg(scores, temperature=self.temperature), 4)
+        return verdicts, score, cost or 0.0
+
+    async def _summarize_reasons_via_llm(self, verdicts: List[Dict[str, str]]) -> Tuple[str, float]:
+        grouped: Dict[str, List[str]] = {}
+        for v in verdicts:
+            grouped.setdefault(v["verdict"], []).append(v["reason"])
+        bullets = []
+        for tag in ("fully", "mostly", "partial", "none"):
+            bullets.extend(f"- {r}" for r in grouped.get(tag, [])[:2])
+        prompt = (
+            "Summarize the following points from a factual consistency evaluation.\n"
+            "Give one short paragraph (1-2 sentences) that explains whether the answer "
+            "was well supported by the context, mentioning both strong and weak parts.\n\n"
+            f"{chr(10).join(bullets)}\n\n"
+            "Summary:"
+        )
+        text, cost = await chat_complete(self.model, [{"role": "user", "content": prompt}], temperature=0.0)
+        return text.strip(), cost or 0.0
+
+    async def evaluate(self, test_case: EvalTestCase) -> Dict[str, any]:
+        llm_cost = 0.0
+        answer = test_case.actual_output
+        context = "\n".join(test_case.retrieval_context or [])
+        question = test_case.input
+
+        # 1. Statements from answer
+        statements, cost = await self._generate_statements(answer)
+        llm_cost += cost
+
+        # 2. Verdicts against context
+        verdicts, verdict_score, cost = await self._generate_verdicts(context, statements)
+        llm_cost += cost
+
+        # 3. Reason summary
+        summary_reason, cost = await self._summarize_reasons_via_llm(verdicts)
+        llm_cost += cost
+
+        success = verdict_score >= self.threshold
+
+        evaluation_log = {
+            "input_question": question,
+            "retrieval_context": test_case.retrieval_context,
+            "answer": answer,
+            "statements": statements,
+            "comment_statements": "Factual assertions extracted from the answer.",
+            "verdicts": verdicts,
+            "comment_verdicts": "Each verdict shows how well a statement is supported by the context.",
+            "final_score": verdict_score,
+            "comment_final_score": "Final score based on faithfulness of statements.",
+            "threshold": self.threshold,
+            "success": success,
+            "comment_success": "Whether the score meets the required threshold.",
+            "final_reason": summary_reason,
+            "comment_reasoning": "Summary explanation based on all verdicts."
+        }
+
+        return {
+            "score": verdict_score,
+            "success": success,
+            "reason": summary_reason,
+            "evaluation_cost": round(llm_cost, 6),
+            "evaluation_log": evaluation_log
+        }
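A comparable, hypothetical sketch for the FaithfulnessMetric added above, under the same assumptions about EvalTestCase construction, import paths, and chat_complete configuration. Note that score_agg, which performs the temperature-controlled softmax-style aggregation over the verdict weights, is imported from eval_lib/utils.py and is not shown in this hunk.

# Hypothetical usage sketch, not part of the package diff; same assumptions
# as the CustomEvalMetric example above. score_agg (softmax-style aggregation
# over VERDICT_WEIGHTS) is defined in eval_lib/utils.py.
import asyncio

from eval_lib.metrics.faithfulness_metric.faithfulness import FaithfulnessMetric
from eval_lib.testcases_schema import EvalTestCase


async def main() -> None:
    metric = FaithfulnessMetric(model="gpt-4o-mini", threshold=0.7, temperature=0.5)
    case = EvalTestCase(
        input="When was the Eiffel Tower completed?",
        actual_output="The Eiffel Tower was completed in 1889 for the World's Fair.",
        retrieval_context=[
            "The Eiffel Tower was completed in 1889.",
            "It served as the entrance arch to the 1889 World's Fair.",
        ],
    )
    # Three LLM calls: statement extraction, verdicts, and the reason summary.
    result = await metric.evaluate(case)
    print(result["score"], result["success"])
    for verdict in result["evaluation_log"]["verdicts"]:
        print(verdict["verdict"], "-", verdict["reason"])


if __name__ == "__main__":
    asyncio.run(main())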