eval-ai-library 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of eval-ai-library might be problematic.

Files changed (29)
  1. eval_ai_library-0.3.0.dist-info/METADATA +1042 -0
  2. eval_ai_library-0.3.0.dist-info/RECORD +34 -0
  3. eval_lib/__init__.py +19 -6
  4. eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +8 -3
  5. eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +12 -4
  6. eval_lib/agent_metrics/task_success_metric/task_success_rate.py +23 -23
  7. eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +8 -2
  8. eval_lib/datagenerator/datagenerator.py +208 -12
  9. eval_lib/datagenerator/document_loader.py +29 -29
  10. eval_lib/evaluate.py +0 -22
  11. eval_lib/llm_client.py +223 -78
  12. eval_lib/metric_pattern.py +208 -152
  13. eval_lib/metrics/answer_precision_metric/answer_precision.py +8 -3
  14. eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +7 -2
  15. eval_lib/metrics/bias_metric/bias.py +12 -2
  16. eval_lib/metrics/contextual_precision_metric/contextual_precision.py +9 -4
  17. eval_lib/metrics/contextual_recall_metric/contextual_recall.py +7 -3
  18. eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +8 -2
  19. eval_lib/metrics/custom_metric/custom_eval.py +237 -204
  20. eval_lib/metrics/faithfulness_metric/faithfulness.py +7 -2
  21. eval_lib/metrics/geval/geval.py +8 -2
  22. eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +7 -3
  23. eval_lib/metrics/toxicity_metric/toxicity.py +8 -2
  24. eval_lib/utils.py +44 -29
  25. eval_ai_library-0.2.1.dist-info/METADATA +0 -753
  26. eval_ai_library-0.2.1.dist-info/RECORD +0 -34
  27. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/WHEEL +0 -0
  28. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/licenses/LICENSE +0 -0
  29. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/top_level.txt +0 -0
eval_lib/metrics/custom_metric/custom_eval.py

@@ -1,303 +1,336 @@
 # custom_eval.py
 """
-Custom Evaluation Metric with Chain-of-Thought and Probability-Weighted Scoring
-Uses advanced techniques from G-Eval for improved accuracy
+Custom Evaluation Metric with Verdict-based Scoring
+Breaks down evaluation into multiple criteria with individual verdicts
 """
 import json
-import re
 from typing import Dict, Any, List, Tuple
-from collections import Counter
 from eval_lib.metric_pattern import MetricPattern
 from eval_lib.testcases_schema import EvalTestCase
 from eval_lib.llm_client import chat_complete
-from eval_lib.utils import extract_json_block
+from eval_lib.utils import score_agg, extract_json_block
+
+
+# Verdict weights for scoring
+VERDICT_WEIGHTS = {
+    "fully": 1.0,    # Criterion fully satisfied
+    "mostly": 0.9,   # Criterion largely satisfied with minor gaps
+    "partial": 0.7,  # Criterion partially satisfied
+    "minor": 0.3,    # Criterion minimally addressed
+    "none": 0.0      # Criterion not satisfied at all
+}


 class CustomEvalMetric(MetricPattern):
+    """
+    Custom evaluation metric with verdict-based scoring.
+    Allows defining custom criteria and evaluates each one separately.
+    """
+
     name = "customEval"

-    def __init__(self, model: str, threshold: float, name: str, criteria: str):
-        super().__init__(model=model, threshold=threshold)
+    def __init__(
+        self,
+        model: str,
+        threshold: float,
+        name: str,
+        criteria: str,
+        evaluation_steps: List[str] = None,
+        temperature: float = 0.8,
+        verbose: bool = False
+    ):
+        """
+        Initialize Custom Evaluation Metric.
+
+        Args:
+            model: LLM model name
+            threshold: Success threshold (0.0-1.0)
+            name: Custom metric name
+            criteria: High-level evaluation criteria description
+            evaluation_steps: List of specific criteria to evaluate (auto-generated if None)
+            temperature: Score aggregation temperature for softmax
+            verbose: Enable detailed logging
+        """
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.custom_name = name
         self.criteria = criteria
+        self.evaluation_steps = evaluation_steps
+        self.temperature = temperature

     # ==================== PROMPTS ====================

     @staticmethod
-    def _prompt_generate_steps(criteria: str) -> str:
-        """Generate evaluation steps from criteria (Chain-of-Thought)"""
-        return f"""Given the evaluation criteria below, generate 3-5 detailed evaluation steps.
-
-Evaluation Criteria:
-{criteria}
+    def _prompt_label_help() -> str:
+        """Explanation of verdict levels"""
+        return """Rate how well each criterion is satisfied (worst → best):

-Generate steps that are:
-1. Specific and actionable
-2. Logically ordered
-3. Lead to assigning a score from 0.0 to 1.0
-
-**
-Return ONLY JSON:
-{{
-    "steps": ["Step 1: ...", "Step 2: ...", "Step 3: ..."]
-}}
-**
-
-JSON:"""
+none – criterion not satisfied at all
+minor – criterion minimally addressed
+partial – criterion partially satisfied
+mostly – criterion largely satisfied with minor gaps
+fully – criterion fully satisfied"""

     @staticmethod
-    def _prompt_evaluate(criteria: str, evaluation_steps: List[str], test_case: EvalTestCase) -> str:
-        """Generate evaluation prompt with CoT steps"""
-        steps_text = "\n".join(
-            [f"{i+1}. {step}" for i, step in enumerate(evaluation_steps)])
-
-        parts = [
-            f"User Input:\n{test_case.input}",
-            f"Model Output:\n{test_case.actual_output}"
-        ]
-
-        if test_case.expected_output:
-            parts.append(f"Expected Output:\n{test_case.expected_output}")
-
-        if test_case.retrieval_context:
-            parts.append(f"Context:\n" +
-                         "\n".join(test_case.retrieval_context))
-
-        input_block = "\n\n".join(parts)
+    def _prompt_generate_criteria(main_criteria: str) -> str:
+        """Generate specific evaluation criteria from high-level description"""
+        return f"""Given the high-level evaluation criteria below, generate 3-5 specific, measurable sub-criteria.

-        return f"""You are a strict evaluator. Use the criteria and evaluation steps below.
+High-level Criteria:
+{main_criteria}

-Evaluation Criteria:
-{criteria}
-
-Evaluation Steps:
-{steps_text}
-
-{input_block}
-
-Based on the evaluation steps, assign a score from 0.0 to 1.0 (where 0.0 is worst and 1.0 is best).
+Generate sub-criteria that are:
+1. Specific and observable
+2. Can be evaluated independently
+3. Together cover all aspects of the main criteria

 **
 Return ONLY JSON:
 {{
-    "score": <float between 0.0 and 1.0>
+    "criteria": ["Criterion 1: ...", "Criterion 2: ...", "Criterion 3: ..."]
 }}
 **

 JSON:"""

-    @staticmethod
-    def _prompt_reason(
-        criteria: str,
+    @classmethod
+    def _prompt_evaluate(
+        cls,
+        main_criteria: str,
         evaluation_steps: List[str],
-        test_case: EvalTestCase,
-        score: float
+        test_case: EvalTestCase
     ) -> str:
-        """Generate explanation for the score"""
-        steps_text = "\n".join(
-            [f"{i+1}. {step}" for i, step in enumerate(evaluation_steps)])
+        """Generate evaluation prompt with verdict scoring"""

-        parts = [
-            f"User Input:\n{test_case.input}",
-            f"Model Output:\n{test_case.actual_output}"
-        ]
+        # Build input block
+        parts = [f"User Input:\n{test_case.input}"]
+        parts.append(f"Model Output:\n{test_case.actual_output}")

         if test_case.expected_output:
             parts.append(f"Expected Output:\n{test_case.expected_output}")

         if test_case.retrieval_context:
-            parts.append(f"Context:\n" +
-                         "\n".join(test_case.retrieval_context))
+            context_text = "\n".join(test_case.retrieval_context)
+            parts.append(f"Context:\n{context_text}")

         input_block = "\n\n".join(parts)

-        return f"""You assigned a score of {score:.2f} (out of 1.0) for this evaluation.
+        # Format criteria
+        criteria_text = "\n".join(
+            [f"{i+1}. {criterion}" for i,
+             criterion in enumerate(evaluation_steps)]
+        )

-Evaluation Criteria:
-{criteria}
+        return f"""{cls._prompt_label_help()}

-Evaluation Steps:
-{steps_text}
+HIGH-LEVEL CRITERIA:
+{main_criteria}

-{input_block}
+SPECIFIC CRITERIA TO EVALUATE:
+{criteria_text}

-Final Score: {score:.2f}/1.0
+{input_block}

-Explain why this score was assigned, referencing specific aspects from the evaluation steps.
+Task: For EACH criterion, decide how well it is satisfied in the Model Output.
+Use exactly one of: fully, mostly, partial, minor, none.

 **
-Return ONLY JSON:
-{{
-    "reason": "Your explanation..."
-}}
+Return JSON array with exactly {len(evaluation_steps)} verdicts:
+[
+    {{"verdict": "fully|mostly|partial|minor|none", "reason": "<one sentence>"}},
+    ...
+]
 **

 JSON:"""

-    # ==================== HELPER METHODS ====================
-
-    def _extract_score_from_response(self, text: str) -> float:
-        """Extract float score from LLM response (0.0-1.0 range)"""
-        text = text.strip()
-
-        # Try JSON parsing first
-        try:
-            data = json.loads(extract_json_block(text))
-            if "score" in data:
-                score = float(data["score"])
-                if 0.0 <= score <= 1.0:
-                    return score
-        except:
-            pass
-
-        # Try regex patterns
-        patterns = [
-            r'"score"\s*:\s*(\d+\.?\d*)',
-            r'score[:\s]+(\d+\.?\d*)',
-            r'^\s*(\d+\.?\d*)\s*$',
-        ]
-
-        for pattern in patterns:
-            match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
-            if match:
-                score = float(match.group(1))
-                if 0.0 <= score <= 1.0:
-                    return score
+    # ==================== CORE EVALUATION ====================

-        raise RuntimeError(f"Failed to extract score from response: {text}")
-
-    # ==================== CORE ALGORITHM ====================
-
-    async def _probability_weighted_scoring(
-        self,
-        prompt: str,
-        n_samples: int = 20,
-        temperature: float = 2.0
-    ) -> Tuple[float, List[float], float]:
+    async def _generate_evaluation_steps(self, main_criteria: str) -> Tuple[List[str], float]:
         """
-        Probability-weighted scoring: score = Σ p(si) × si
-        Samples multiple times to estimate probability distribution
+        Auto-generate specific evaluation criteria from high-level description.
+
+        Args:
+            main_criteria: High-level evaluation criteria

         Returns:
-            (final_score, sampled_scores, total_cost)
+            Tuple of (criteria_list, llm_cost)
         """
-        total_cost = 0.0
-        scores = []
-
-        # Sample n times with high temperature
-        for _ in range(n_samples):
-            text, cost = await chat_complete(
-                self.model,
-                messages=[{"role": "user", "content": prompt}],
-                temperature=temperature
-            )
-            total_cost += cost or 0.0
+        prompt = self._prompt_generate_criteria(main_criteria)

-            try:
-                score = self._extract_score_from_response(text)
-                scores.append(score)
-            except:
-                continue
+        text, cost = await chat_complete(
+            self.model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.0
+        )

-        if not scores:
-            raise RuntimeError(
-                f"Failed to extract any valid scores from {n_samples} samples")
+        try:
+            raw_json = extract_json_block(text)
+            data = json.loads(raw_json)
+            criteria = data.get("criteria", [])

-        # Calculate probability-weighted score as mean
-        weighted_score = sum(scores) / len(scores)
+            if not isinstance(criteria, list) or len(criteria) == 0:
+                raise ValueError("Expected non-empty list of criteria")

-        return weighted_score, scores, total_cost
+            return criteria, cost or 0.0

-    async def evaluate(self, test_case: EvalTestCase) -> Dict[str, Any]:
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to generate evaluation criteria: {e}\n{text}"
+            )
+
+    async def _generate_verdicts(
+        self,
+        main_criteria: str,
+        evaluation_steps: List[str],
+        test_case: EvalTestCase
+    ) -> Tuple[List[Dict[str, str]], float, float]:
         """
-        Evaluate using Chain-of-Thought and Probability-Weighted Scoring.
+        Generate verdicts for each evaluation criterion.
+
+        Args:
+            main_criteria: High-level criteria description
+            evaluation_steps: List of specific criteria
+            test_case: Test case to evaluate

-        Algorithm:
-        1. Auto-generate evaluation steps from criteria (CoT)
-        2. Apply probability-weighted scoring (20 samples, temp=2.0)
-        3. Generate detailed explanation
-        4. Build comprehensive evaluation_log
+        Returns:
+            Tuple of (verdicts_list, aggregated_score, llm_cost)
         """
-        total_cost = 0.0
+        prompt = self._prompt_evaluate(
+            main_criteria, evaluation_steps, test_case)

-        # Step 1: Auto-generate evaluation steps (Chain-of-Thought from G-Eval)
-        steps_prompt = self._prompt_generate_steps(self.criteria)
-        steps_text, step_cost = await chat_complete(
+        text, cost = await chat_complete(
             self.model,
-            messages=[{"role": "user", "content": steps_prompt}],
+            messages=[{"role": "user", "content": prompt}],
             temperature=0.0
         )
-        total_cost += step_cost or 0.0

         try:
-            parsed_steps = json.loads(extract_json_block(steps_text))
-            evaluation_steps = parsed_steps["steps"]
+            raw_json = extract_json_block(text)
+            verdicts = json.loads(raw_json)
+
+            if not isinstance(verdicts, list):
+                raise ValueError("Expected JSON array of verdicts")
+
+            # Ensure verdicts match criteria length
+            if len(verdicts) != len(evaluation_steps):
+                if len(verdicts) < len(evaluation_steps):
+                    # Pad with "none" verdicts
+                    verdicts.extend([
+                        {"verdict": "none", "reason": "Missing evaluation"}
+                    ] * (len(evaluation_steps) - len(verdicts)))
+                else:
+                    # Truncate
+                    verdicts = verdicts[:len(evaluation_steps)]
+
+            # Calculate aggregated score
+            weights = [
+                VERDICT_WEIGHTS.get(v.get("verdict", "none"), 0.0)
+                for v in verdicts
+            ]
+            score = round(score_agg(weights, temperature=self.temperature), 4)
+
+            return verdicts, score, cost or 0.0
+
         except Exception as e:
             raise RuntimeError(
-                f"Failed to parse evaluation steps: {e}\n{steps_text}")
+                f"Failed to parse verdicts: {e}\n{text}"
+            )

-        # Step 2: Generate evaluation prompt with CoT
-        eval_prompt = self._prompt_evaluate(
-            self.criteria, evaluation_steps, test_case)
+    async def evaluate(self, test_case: EvalTestCase) -> Dict[str, Any]:
+        """
+        Evaluate using custom criteria with verdict-based scoring.

-        # Step 3: Probability-weighted scoring (20 samples from G-Eval)
-        final_score, sampled_scores, scoring_cost = await self._probability_weighted_scoring(
-            eval_prompt,
-            n_samples=20,
-            temperature=2.0
-        )
-        total_cost += scoring_cost
+        Steps:
+        1. Auto-generate specific criteria if not provided (1 LLM call)
+        2. Generate verdicts for each criterion (1 LLM call)
+        3. Aggregate verdicts into final score using softmax
+        4. Build evaluation log

-        # Step 4: Generate explanation
-        reason_prompt = self._prompt_reason(
-            self.criteria, evaluation_steps, test_case, final_score)
-        reason_text, reason_cost = await chat_complete(
-            self.model,
-            messages=[{"role": "user", "content": reason_prompt}],
-            temperature=0.0
-        )
-        total_cost += reason_cost or 0.0
+        Args:
+            test_case: Test case to evaluate

-        # Parse reason
-        try:
-            reason_data = json.loads(extract_json_block(reason_text))
-            reason = reason_data.get("reason", reason_text)
-        except:
-            reason = reason_text.strip()
+        Returns:
+            Evaluation results with score, success, reason, cost, and detailed log
+        """
+        total_cost = 0.0

+        # Step 1: Generate evaluation steps if not provided
+        if not self.evaluation_steps:
+            evaluation_steps, cost = await self._generate_evaluation_steps(
+                self.criteria
+            )
+            total_cost += cost
+            self.evaluation_steps = evaluation_steps
+        else:
+            evaluation_steps = self.evaluation_steps
+
+        # Step 2: Generate verdicts for each criterion
+        verdicts, final_score, cost = await self._generate_verdicts(
+            self.criteria,
+            evaluation_steps,
+            test_case
+        )
+        total_cost += cost
+
+        # Step 3: Determine success
         success = final_score >= self.threshold

-        # Step 5: Build comprehensive evaluation_log
+        # Step 4: Build summary reason from verdicts
+        positive_verdicts = [
+            v for v in verdicts
+            if v.get("verdict") in ["fully", "mostly"]
+        ]
+        negative_verdicts = [
+            v for v in verdicts
+            if v.get("verdict") in ["none", "minor", "partial"]
+        ]
+
+        if len(positive_verdicts) >= len(verdicts) * 0.7:
+            summary = f"Strong performance: {len(positive_verdicts)}/{len(verdicts)} criteria fully or mostly satisfied."
+        elif len(negative_verdicts) >= len(verdicts) * 0.7:
+            summary = f"Weak performance: {len(negative_verdicts)}/{len(verdicts)} criteria not satisfied or minimally addressed."
+        else:
+            summary = f"Mixed performance: {len(positive_verdicts)}/{len(verdicts)} criteria satisfied, with room for improvement."
+
+        # Step 5: Build evaluation log
         evaluation_log = {
             "input_question": test_case.input,
             "actual_output": test_case.actual_output,
             "expected_output": test_case.expected_output,
             "retrieval_context": test_case.retrieval_context,
-            "criteria": self.criteria,
-            "comment_criteria": "Custom evaluation criteria provided by user.",
-            "evaluation_steps": evaluation_steps,
-            "comment_evaluation_steps": "Auto-generated evaluation steps using Chain-of-Thought (CoT) technique from G-Eval.",
-            "sampled_scores": sampled_scores,
-            "comment_sampled_scores": f"Individual scores from {len(sampled_scores)} samples with temperature=2.0.",
-            "score_distribution": {f"{s:.2f}": sampled_scores.count(s) for s in set(sampled_scores)},
-            "comment_score_distribution": "Frequency distribution of sampled scores for probability-weighted calculation.",
-            "final_score": round(final_score, 4),
-            "comment_final_score": "Probability-weighted score calculated as mean of sampled scores (G-Eval technique).",
+            "main_criteria": self.criteria,
+            "comment_main_criteria": "High-level evaluation criteria provided by user.",
+            "evaluation_criteria": evaluation_steps,
+            "comment_evaluation_criteria": f"Specific sub-criteria ({len(evaluation_steps)} items) used for verdict-based evaluation.",
+            "verdicts": verdicts,
+            "comment_verdicts": "Individual verdicts for each criterion (fully/mostly/partial/minor/none).",
+            "verdict_weights": {
+                i: VERDICT_WEIGHTS.get(v["verdict"], 0.0)
+                for i, v in enumerate(verdicts)
+            },
+            "comment_verdict_weights": "Numeric weights assigned to each verdict for score calculation.",
+            "final_score": final_score,
+            "comment_final_score": f"Weighted average of verdict scores calculated using softmax aggregation (temperature={self.temperature}).",
             "threshold": self.threshold,
             "success": success,
-            "comment_success": "Whether the final score passes the custom threshold.",
-            "final_reason": reason,
-            "comment_reasoning": "LLM-generated explanation based on evaluation steps and criteria."
+            "comment_success": "Whether the final score meets the required threshold.",
+            "summary": summary,
+            "comment_summary": "High-level summary of evaluation performance."
         }

-        return {
-            "score": round(final_score, 4),
+        result = {
+            "name": self.name,
+            "score": final_score,
             "success": success,
-            "reason": reason,
+            "reason": summary,
             "evaluation_cost": round(total_cost, 6),
-            "evaluation_log": evaluation_log,
+            "evaluation_log": evaluation_log
         }

+        self.print_result(result)
+
+        return result
+
     @property
     def name(self):
         return f"Custom: {self.custom_name}"
eval_lib/metrics/faithfulness_metric/faithfulness.py

@@ -31,8 +31,9 @@ class FaithfulnessMetric(MetricPattern):
         model: str,
         threshold: float = 0.7,
         temperature: float = 0.5,
+        verbose: bool = False
     ):
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature

     async def _generate_statements(self, answer: str) -> Tuple[List[str], float]:
@@ -131,10 +132,14 @@ class FaithfulnessMetric(MetricPattern):
             "comment_reasoning": "Summary explanation based on all verdicts."
         }

-        return {
+        result = {
+            "name": self.name,
             "score": verdict_score,
             "success": success,
             "reason": summary_reason,
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
eval_lib/metrics/geval/geval.py

@@ -23,13 +23,14 @@ class GEval(MetricPattern):
         self,
         model: str,
         threshold: float,
+        verbose: bool = False,
         name: Optional[str] = None,
         criteria: Optional[str] = None,
         evaluation_steps: Optional[List[str]] = None,
         n_samples: int = 20,
         sampling_temperature: float = 2.0,
     ):
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.criteria = criteria
         self.custom_name = name
         self.evaluation_steps = evaluation_steps
@@ -313,7 +314,8 @@ JSON:"""
             "comment_reasoning": "LLM-generated explanation based on evaluation steps and criteria."
         }

-        return {
+        result = {
+            "name": self.name,
             "score": round(final_score, 4),
             "success": success,
             "reason": reason,
@@ -321,6 +323,10 @@ JSON:"""
             "evaluation_log": evaluation_log,
         }

+        self.print_result(result)
+
+        return result
+
     @property
     def name(self):
         return self.custom_name or self.__class__.name
eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py

@@ -66,9 +66,9 @@ class RestrictedRefusalMetric(MetricPattern):
     """
     name = "restrictedRefusalMetric"

-    def __init__(self, threshold: float = 0.5):
+    def __init__(self, threshold: float = 0.5, verbose: bool = False):

-        super().__init__(model=None, threshold=threshold)
+        super().__init__(model=None, threshold=threshold, verbose=verbose)

     async def evaluate(self, tc: EvalTestCase) -> Dict[str, Any]:
         answer = tc.actual_output
@@ -93,10 +93,14 @@ class RestrictedRefusalMetric(MetricPattern):
             "final_reason": reason
         }

-        return {
+        result = {
+            "name": self.name,
             "score": score,
             "success": success,
             "reason": reason,
             "evaluation_cost": 0.0,
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
eval_lib/metrics/toxicity_metric/toxicity.py

@@ -12,7 +12,9 @@ from eval_lib.llm_client import chat_complete

 class ToxicityMetric(MetricPattern):
     name = "toxicityMetric"
-    template_cls = None  # all prompts inside the class
+
+    def __init__(self, model: str, threshold: float = 0.7, verbose: bool = False):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)

     # ==================== PROMPTS ====================

@@ -104,10 +106,14 @@ JSON:"""
             "comment_reasoning": "Explanation of the toxicity assessment, including specific toxic elements if found."
         }

-        return {
+        result = {
+            "name": self.name,
             "score": score,
             "success": success,
             "reason": reason,
             "evaluation_cost": round(total_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
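
Across FaithfulnessMetric, GEval, RestrictedRefusalMetric, and ToxicityMetric the 0.3.0 changes follow the same pattern: the constructor gains a verbose flag that is forwarded to MetricPattern, and evaluate() now returns a result dict that includes the metric's "name" and is passed through self.print_result(). A rough usage sketch follows; the import paths, the model string, and the EvalTestCase constructor arguments are illustrative assumptions rather than documented API (the field names are taken from the attributes referenced in the diff).

import asyncio

# Hypothetical import paths, inferred from the wheel's file layout.
from eval_lib.testcases_schema import EvalTestCase
from eval_lib.metrics.custom_metric.custom_eval import CustomEvalMetric


async def main():
    metric = CustomEvalMetric(
        model="gpt-4o-mini",   # placeholder model name
        threshold=0.7,
        name="helpfulness",
        criteria="The answer should be accurate, complete, and polite.",
        verbose=True,          # new in 0.3.0: evaluate() prints the result via print_result()
    )
    tc = EvalTestCase(
        input="How do I reset my password?",  # assumed constructor fields
        actual_output="Open Settings > Security and choose 'Reset password'.",
    )
    result = await metric.evaluate(tc)
    # New in 0.3.0: the result carries the metric name alongside score/success/reason.
    print(result["name"], result["score"], result["success"])


asyncio.run(main())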