eval-ai-library 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of eval-ai-library might be problematic.
- eval_ai_library-0.3.0.dist-info/METADATA +1042 -0
- eval_ai_library-0.3.0.dist-info/RECORD +34 -0
- eval_lib/__init__.py +19 -6
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +8 -3
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +12 -4
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +23 -23
- eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +8 -2
- eval_lib/datagenerator/datagenerator.py +208 -12
- eval_lib/datagenerator/document_loader.py +29 -29
- eval_lib/evaluate.py +0 -22
- eval_lib/llm_client.py +223 -78
- eval_lib/metric_pattern.py +208 -152
- eval_lib/metrics/answer_precision_metric/answer_precision.py +8 -3
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +7 -2
- eval_lib/metrics/bias_metric/bias.py +12 -2
- eval_lib/metrics/contextual_precision_metric/contextual_precision.py +9 -4
- eval_lib/metrics/contextual_recall_metric/contextual_recall.py +7 -3
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +8 -2
- eval_lib/metrics/custom_metric/custom_eval.py +237 -204
- eval_lib/metrics/faithfulness_metric/faithfulness.py +7 -2
- eval_lib/metrics/geval/geval.py +8 -2
- eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +7 -3
- eval_lib/metrics/toxicity_metric/toxicity.py +8 -2
- eval_lib/utils.py +44 -29
- eval_ai_library-0.2.1.dist-info/METADATA +0 -753
- eval_ai_library-0.2.1.dist-info/RECORD +0 -34
- {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/WHEEL +0 -0
- {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/top_level.txt +0 -0
eval_lib/metrics/custom_metric/custom_eval.py
CHANGED

@@ -1,303 +1,336 @@ (effectively a full-file rewrite)

The 0.2.1 scoring pipeline is removed: `_prompt_generate_steps`, the single 0.0-1.0 scoring prompt, `_extract_score_from_response` (JSON/regex score parsing), `_probability_weighted_scoring` (20 samples at temperature 2.0), and the separate reason-generation LLM call, along with the now-unused `import re` and `from collections import Counter`. The 0.3.0 file now reads:

```python
# custom_eval.py
"""
Custom Evaluation Metric with Verdict-based Scoring
Breaks down evaluation into multiple criteria with individual verdicts
"""
import json
from typing import Dict, Any, List, Tuple
from eval_lib.metric_pattern import MetricPattern
from eval_lib.testcases_schema import EvalTestCase
from eval_lib.llm_client import chat_complete
from eval_lib.utils import score_agg, extract_json_block


# Verdict weights for scoring
VERDICT_WEIGHTS = {
    "fully": 1.0,    # Criterion fully satisfied
    "mostly": 0.9,   # Criterion largely satisfied with minor gaps
    "partial": 0.7,  # Criterion partially satisfied
    "minor": 0.3,    # Criterion minimally addressed
    "none": 0.0      # Criterion not satisfied at all
}


class CustomEvalMetric(MetricPattern):
    """
    Custom evaluation metric with verdict-based scoring.
    Allows defining custom criteria and evaluates each one separately.
    """

    name = "customEval"

    def __init__(
        self,
        model: str,
        threshold: float,
        name: str,
        criteria: str,
        evaluation_steps: List[str] = None,
        temperature: float = 0.8,
        verbose: bool = False
    ):
        """
        Initialize Custom Evaluation Metric.

        Args:
            model: LLM model name
            threshold: Success threshold (0.0-1.0)
            name: Custom metric name
            criteria: High-level evaluation criteria description
            evaluation_steps: List of specific criteria to evaluate (auto-generated if None)
            temperature: Score aggregation temperature for softmax
            verbose: Enable detailed logging
        """
        super().__init__(model=model, threshold=threshold, verbose=verbose)
        self.custom_name = name
        self.criteria = criteria
        self.evaluation_steps = evaluation_steps
        self.temperature = temperature

    # ==================== PROMPTS ====================

    @staticmethod
    def _prompt_label_help() -> str:
        """Explanation of verdict levels"""
        return """Rate how well each criterion is satisfied (worst → best):

none – criterion not satisfied at all
minor – criterion minimally addressed
partial – criterion partially satisfied
mostly – criterion largely satisfied with minor gaps
fully – criterion fully satisfied"""

    @staticmethod
    def _prompt_generate_criteria(main_criteria: str) -> str:
        """Generate specific evaluation criteria from high-level description"""
        return f"""Given the high-level evaluation criteria below, generate 3-5 specific, measurable sub-criteria.

High-level Criteria:
{main_criteria}

Generate sub-criteria that are:
1. Specific and observable
2. Can be evaluated independently
3. Together cover all aspects of the main criteria

**
Return ONLY JSON:
{{
  "criteria": ["Criterion 1: ...", "Criterion 2: ...", "Criterion 3: ..."]
}}
**

JSON:"""

    @classmethod
    def _prompt_evaluate(
        cls,
        main_criteria: str,
        evaluation_steps: List[str],
        test_case: EvalTestCase
    ) -> str:
        """Generate evaluation prompt with verdict scoring"""

        # Build input block
        parts = [f"User Input:\n{test_case.input}"]
        parts.append(f"Model Output:\n{test_case.actual_output}")

        if test_case.expected_output:
            parts.append(f"Expected Output:\n{test_case.expected_output}")

        if test_case.retrieval_context:
            context_text = "\n".join(test_case.retrieval_context)
            parts.append(f"Context:\n{context_text}")

        input_block = "\n\n".join(parts)

        # Format criteria
        criteria_text = "\n".join(
            [f"{i+1}. {criterion}" for i,
             criterion in enumerate(evaluation_steps)]
        )

        return f"""{cls._prompt_label_help()}

HIGH-LEVEL CRITERIA:
{main_criteria}

SPECIFIC CRITERIA TO EVALUATE:
{criteria_text}

{input_block}

Task: For EACH criterion, decide how well it is satisfied in the Model Output.
Use exactly one of: fully, mostly, partial, minor, none.

**
Return JSON array with exactly {len(evaluation_steps)} verdicts:
[
  {{"verdict": "fully|mostly|partial|minor|none", "reason": "<one sentence>"}},
  ...
]
**

JSON:"""

    # ==================== CORE EVALUATION ====================

    async def _generate_evaluation_steps(self, main_criteria: str) -> Tuple[List[str], float]:
        """
        Auto-generate specific evaluation criteria from high-level description.

        Args:
            main_criteria: High-level evaluation criteria

        Returns:
            Tuple of (criteria_list, llm_cost)
        """
        prompt = self._prompt_generate_criteria(main_criteria)

        text, cost = await chat_complete(
            self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )

        try:
            raw_json = extract_json_block(text)
            data = json.loads(raw_json)
            criteria = data.get("criteria", [])

            if not isinstance(criteria, list) or len(criteria) == 0:
                raise ValueError("Expected non-empty list of criteria")

            return criteria, cost or 0.0

        except Exception as e:
            raise RuntimeError(
                f"Failed to generate evaluation criteria: {e}\n{text}"
            )

    async def _generate_verdicts(
        self,
        main_criteria: str,
        evaluation_steps: List[str],
        test_case: EvalTestCase
    ) -> Tuple[List[Dict[str, str]], float, float]:
        """
        Generate verdicts for each evaluation criterion.

        Args:
            main_criteria: High-level criteria description
            evaluation_steps: List of specific criteria
            test_case: Test case to evaluate

        Returns:
            Tuple of (verdicts_list, aggregated_score, llm_cost)
        """
        prompt = self._prompt_evaluate(
            main_criteria, evaluation_steps, test_case)

        text, cost = await chat_complete(
            self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )

        try:
            raw_json = extract_json_block(text)
            verdicts = json.loads(raw_json)

            if not isinstance(verdicts, list):
                raise ValueError("Expected JSON array of verdicts")

            # Ensure verdicts match criteria length
            if len(verdicts) != len(evaluation_steps):
                if len(verdicts) < len(evaluation_steps):
                    # Pad with "none" verdicts
                    verdicts.extend([
                        {"verdict": "none", "reason": "Missing evaluation"}
                    ] * (len(evaluation_steps) - len(verdicts)))
                else:
                    # Truncate
                    verdicts = verdicts[:len(evaluation_steps)]

            # Calculate aggregated score
            weights = [
                VERDICT_WEIGHTS.get(v.get("verdict", "none"), 0.0)
                for v in verdicts
            ]
            score = round(score_agg(weights, temperature=self.temperature), 4)

            return verdicts, score, cost or 0.0

        except Exception as e:
            raise RuntimeError(
                f"Failed to parse verdicts: {e}\n{text}"
            )

    async def evaluate(self, test_case: EvalTestCase) -> Dict[str, Any]:
        """
        Evaluate using custom criteria with verdict-based scoring.

        Steps:
        1. Auto-generate specific criteria if not provided (1 LLM call)
        2. Generate verdicts for each criterion (1 LLM call)
        3. Aggregate verdicts into final score using softmax
        4. Build evaluation log

        Args:
            test_case: Test case to evaluate

        Returns:
            Evaluation results with score, success, reason, cost, and detailed log
        """
        total_cost = 0.0

        # Step 1: Generate evaluation steps if not provided
        if not self.evaluation_steps:
            evaluation_steps, cost = await self._generate_evaluation_steps(
                self.criteria
            )
            total_cost += cost
            self.evaluation_steps = evaluation_steps
        else:
            evaluation_steps = self.evaluation_steps

        # Step 2: Generate verdicts for each criterion
        verdicts, final_score, cost = await self._generate_verdicts(
            self.criteria,
            evaluation_steps,
            test_case
        )
        total_cost += cost

        # Step 3: Determine success
        success = final_score >= self.threshold

        # Step 4: Build summary reason from verdicts
        positive_verdicts = [
            v for v in verdicts
            if v.get("verdict") in ["fully", "mostly"]
        ]
        negative_verdicts = [
            v for v in verdicts
            if v.get("verdict") in ["none", "minor", "partial"]
        ]

        if len(positive_verdicts) >= len(verdicts) * 0.7:
            summary = f"Strong performance: {len(positive_verdicts)}/{len(verdicts)} criteria fully or mostly satisfied."
        elif len(negative_verdicts) >= len(verdicts) * 0.7:
            summary = f"Weak performance: {len(negative_verdicts)}/{len(verdicts)} criteria not satisfied or minimally addressed."
        else:
            summary = f"Mixed performance: {len(positive_verdicts)}/{len(verdicts)} criteria satisfied, with room for improvement."

        # Step 5: Build evaluation log
        evaluation_log = {
            "input_question": test_case.input,
            "actual_output": test_case.actual_output,
            "expected_output": test_case.expected_output,
            "retrieval_context": test_case.retrieval_context,
            "main_criteria": self.criteria,
            "comment_main_criteria": "High-level evaluation criteria provided by user.",
            "evaluation_criteria": evaluation_steps,
            "comment_evaluation_criteria": f"Specific sub-criteria ({len(evaluation_steps)} items) used for verdict-based evaluation.",
            "verdicts": verdicts,
            "comment_verdicts": "Individual verdicts for each criterion (fully/mostly/partial/minor/none).",
            "verdict_weights": {
                i: VERDICT_WEIGHTS.get(v["verdict"], 0.0)
                for i, v in enumerate(verdicts)
            },
            "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
            "final_score": final_score,
            "comment_final_score": f"Weighted average of verdict scores calculated using softmax aggregation (temperature={self.temperature}).",
            "threshold": self.threshold,
            "success": success,
            "comment_success": "Whether the final score meets the required threshold.",
            "summary": summary,
            "comment_summary": "High-level summary of evaluation performance."
        }

        result = {
            "name": self.name,
            "score": final_score,
            "success": success,
            "reason": summary,
            "evaluation_cost": round(total_cost, 6),
            "evaluation_log": evaluation_log
        }

        self.print_result(result)

        return result

    @property
    def name(self):
        return f"Custom: {self.custom_name}"
```
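Taken together, the new flow in custom_eval.py is: optionally auto-generate sub-criteria, collect one verdict per criterion, map each verdict to a weight via VERDICT_WEIGHTS, and aggregate with score_agg. A minimal usage sketch of the 0.3.0 API follows; the constructor and evaluate() signature come from the file above, while the model string and the EvalTestCase keyword arguments are assumptions for illustration, not confirmed by this diff.

```python
# Hypothetical usage sketch for CustomEvalMetric (0.3.0). The constructor and
# evaluate() signature come from the diff above; the EvalTestCase keyword
# arguments and the model name are assumptions for illustration.
import asyncio

from eval_lib.metrics.custom_metric.custom_eval import CustomEvalMetric
from eval_lib.testcases_schema import EvalTestCase


async def main():
    metric = CustomEvalMetric(
        model="gpt-4o-mini",              # assumed model identifier
        threshold=0.7,
        name="SupportAnswerQuality",
        criteria="The answer resolves the user's billing question politely and accurately.",
        evaluation_steps=None,            # let the metric auto-generate sub-criteria
        temperature=0.8,                  # softmax aggregation temperature
        verbose=True,                     # new in 0.3.0: enables print_result output
    )

    test_case = EvalTestCase(
        input="Why was I charged twice this month?",
        actual_output="You were billed for two overlapping subscriptions; one charge has been refunded.",
        retrieval_context=["Billing policy: duplicate charges are refunded within 5 days."],
    )

    result = await metric.evaluate(test_case)
    # result carries: name, score, success, reason, evaluation_cost, evaluation_log
    print(result["score"], result["success"], result["reason"])


asyncio.run(main())
```

Compared with 0.2.1, this path costs two LLM calls at temperature 0.0 (criteria generation plus verdicts) instead of the 20-sample probability-weighted scoring and the separate reason-generation call.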
eval_lib/metrics/faithfulness_metric/faithfulness.py
CHANGED

```diff
@@ -31,8 +31,9 @@ class FaithfulnessMetric(MetricPattern):
         model: str,
         threshold: float = 0.7,
         temperature: float = 0.5,
+        verbose: bool = False
     ):
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature
 
     async def _generate_statements(self, answer: str) -> Tuple[List[str], float]:
@@ -131,10 +132,14 @@ class FaithfulnessMetric(MetricPattern):
             "comment_reasoning": "Summary explanation based on all verdicts."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": verdict_score,
             "success": success,
             "reason": summary_reason,
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
```
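CustomEvalMetric passes its temperature straight to score_agg, and FaithfulnessMetric's temperature (default 0.5) presumably serves the same aggregation step, although its scoring body is not shown in these hunks. eval_lib/utils.py changed in this release (+44 -29) but its contents are not part of this diff, so the following is only an illustrative sketch of a softmax-style aggregator consistent with the evaluation_log wording ("weighted average of verdict scores calculated using softmax aggregation"), not the library's actual score_agg.

```python
# Illustrative sketch only: an assumed shape for score_agg(weights, temperature=...),
# not the actual eval_lib.utils implementation, which this diff does not show.
import math
from typing import List


def score_agg(weights: List[float], temperature: float = 1.0) -> float:
    """Softmax-weighted average of per-criterion weights.

    In this sketch, lower temperature pulls the aggregate toward the higher
    weights, while higher temperature approaches a plain arithmetic mean.
    """
    if not weights:
        return 0.0
    exps = [math.exp(w / temperature) for w in weights]
    total = sum(exps)
    return sum(w * e / total for w, e in zip(weights, exps))


# Example: three "fully" verdicts and one "partial" (weights from VERDICT_WEIGHTS)
print(round(score_agg([1.0, 1.0, 1.0, 0.7], temperature=0.8), 4))
```

Whatever the real implementation does, the temperature knob controls how far the aggregate drifts from a plain mean of the verdict weights.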
eval_lib/metrics/geval/geval.py
CHANGED

```diff
@@ -23,13 +23,14 @@ class GEval(MetricPattern):
         self,
         model: str,
         threshold: float,
+        verbose: bool = False,
         name: Optional[str] = None,
         criteria: Optional[str] = None,
         evaluation_steps: Optional[List[str]] = None,
         n_samples: int = 20,
         sampling_temperature: float = 2.0,
     ):
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.criteria = criteria
         self.custom_name = name
         self.evaluation_steps = evaluation_steps
@@ -313,7 +314,8 @@ JSON:"""
             "comment_reasoning": "LLM-generated explanation based on evaluation steps and criteria."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": round(final_score, 4),
             "success": success,
             "reason": reason,
@@ -321,6 +323,10 @@ JSON:"""
             "evaluation_log": evaluation_log,
         }
 
+        self.print_result(result)
+
+        return result
+
     @property
     def name(self):
         return self.custom_name or self.__class__.name
```
eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py
CHANGED

```diff
@@ -66,9 +66,9 @@ class RestrictedRefusalMetric(MetricPattern):
     """
     name = "restrictedRefusalMetric"
 
-    def __init__(self, threshold: float = 0.5):
+    def __init__(self, threshold: float = 0.5, verbose: bool = False):
 
-        super().__init__(model=None, threshold=threshold)
+        super().__init__(model=None, threshold=threshold, verbose=verbose)
 
     async def evaluate(self, tc: EvalTestCase) -> Dict[str, Any]:
         answer = tc.actual_output
@@ -93,10 +93,14 @@ class RestrictedRefusalMetric(MetricPattern):
             "final_reason": reason
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": score,
             "success": success,
             "reason": reason,
             "evaluation_cost": 0.0,
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
```
eval_lib/metrics/toxicity_metric/toxicity.py
CHANGED

```diff
@@ -12,7 +12,9 @@ from eval_lib.llm_client import chat_complete
 
 class ToxicityMetric(MetricPattern):
     name = "toxicityMetric"
-
+
+    def __init__(self, model: str, threshold: float = 0.7, verbose: bool = False):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
 
     # ==================== PROMPTS ====================
 
@@ -104,10 +106,14 @@ JSON:"""
             "comment_reasoning": "Explanation of the toxicity assessment, including specific toxic elements if found."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": score,
             "success": success,
             "reason": reason,
             "evaluation_cost": round(total_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
```
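The same two changes repeat across the faithfulness, G-Eval, restricted refusal, and toxicity metrics: constructors gain a verbose flag that is forwarded to MetricPattern, and evaluate() now assembles a named result dict, passes it to self.print_result(result), and returns it. The snippet below shows the resulting return shape; only the keys are taken from the hunks above, the values are illustrative.

```python
# Return shape shared by the updated metrics in 0.3.0 (keys from the hunks above).
# 0.2.1 returned the same dict without the "name" key and without print_result().
result = {
    "name": "toxicityMetric",                 # new in 0.3.0: metric name included
    "score": 0.92,                            # illustrative value
    "success": True,                          # score >= threshold
    "reason": "No toxic language detected.",  # illustrative value
    "evaluation_cost": 0.000137,              # illustrative value
    "evaluation_log": {},                     # detailed per-metric log (omitted here)
}
```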