eval-ai-library 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of eval-ai-library might be problematic.

Files changed (29)
  1. eval_ai_library-0.3.0.dist-info/METADATA +1042 -0
  2. eval_ai_library-0.3.0.dist-info/RECORD +34 -0
  3. eval_lib/__init__.py +19 -6
  4. eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +8 -3
  5. eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +12 -4
  6. eval_lib/agent_metrics/task_success_metric/task_success_rate.py +23 -23
  7. eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +8 -2
  8. eval_lib/datagenerator/datagenerator.py +208 -12
  9. eval_lib/datagenerator/document_loader.py +29 -29
  10. eval_lib/evaluate.py +0 -22
  11. eval_lib/llm_client.py +223 -78
  12. eval_lib/metric_pattern.py +208 -152
  13. eval_lib/metrics/answer_precision_metric/answer_precision.py +8 -3
  14. eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +7 -2
  15. eval_lib/metrics/bias_metric/bias.py +12 -2
  16. eval_lib/metrics/contextual_precision_metric/contextual_precision.py +9 -4
  17. eval_lib/metrics/contextual_recall_metric/contextual_recall.py +7 -3
  18. eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +8 -2
  19. eval_lib/metrics/custom_metric/custom_eval.py +237 -204
  20. eval_lib/metrics/faithfulness_metric/faithfulness.py +7 -2
  21. eval_lib/metrics/geval/geval.py +8 -2
  22. eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +7 -3
  23. eval_lib/metrics/toxicity_metric/toxicity.py +8 -2
  24. eval_lib/utils.py +44 -29
  25. eval_ai_library-0.2.1.dist-info/METADATA +0 -753
  26. eval_ai_library-0.2.1.dist-info/RECORD +0 -34
  27. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/WHEEL +0 -0
  28. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/licenses/LICENSE +0 -0
  29. {eval_ai_library-0.2.1.dist-info → eval_ai_library-0.3.0.dist-info}/top_level.txt +0 -0
@@ -10,8 +10,8 @@ from eval_lib.testcases_schema import EvalTestCase, ConversationalEvalTestCase
 from eval_lib.llm_client import chat_complete
 
 
-# ANSI color codes for beautiful console output
 class Colors:
+    """ANSI color codes for beautiful console output"""
     HEADER = '\033[95m'
     BLUE = '\033[94m'
     CYAN = '\033[96m'
@@ -31,7 +31,7 @@ class MetricPattern:
     """
     name: str  # name of the metric
 
-    def __init__(self, model: str, threshold: float, verbose: bool = True):
+    def __init__(self, model: str, threshold: float, verbose: bool = False):
         self.model = model
         self.threshold = threshold
         self.verbose = verbose
@@ -47,74 +47,111 @@ class MetricPattern:
         prefix = f"[{step_num}] " if step_num else ""
         print(f"{Colors.DIM} {prefix}{step_name}...{Colors.ENDC}")
 
-    async def evaluate(self, test_case: Union[EvalTestCase]) -> Dict[str, Any]:
+    def print_result(self, result: Dict[str, Any]):
         """
-        Base evaluation method - override in subclasses for custom behavior.
+        Print evaluation result based on verbose setting.
+        If verbose=False: simple dict print
+        If verbose=True: beautiful formatted output with colors
         """
-        start_time = time.time()
-
-        if self.verbose:
-            print(f"\n{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
-            print(f"{Colors.BOLD}{Colors.BLUE}🔍 Evaluating: {self.name}{Colors.ENDC}")
-            print(f"{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
-            print(f"{Colors.DIM}Model: {self.model}{Colors.ENDC}")
-            print(f"{Colors.DIM}Threshold: {self.threshold}{Colors.ENDC}")
-
-        self._log_step("Generating evaluation prompt", 1)
-
-        # 1) Generate prompt
-        prompt = self.template.generate_prompt(
-            test_case=test_case,
-            threshold=self.threshold
-        )
-
-        self._log_step("Calling LLM", 2)
-
-        # 2) Make API call
-        text, cost = await chat_complete(
-            self.model,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=0.0
-        )
-
-        self._log_step("Parsing response", 3)
-
-        # 3) Parse the response
-        try:
-            data = json.loads(text)
-        except Exception as e:
-            self._log(f" Failed to parse JSON: {e}", Colors.RED)
-            raise RuntimeError(
-                f"Cannot parse JSON from model response: {e}\n{text}")
-
-        score = float(data.get("score", 0.0))
-        reason = data.get("reason")
-        success = score >= self.threshold
-
-        # Calculate elapsed time
-        elapsed_time = time.time() - start_time
-
-        # Log results
-        if self.verbose:
-            print(f"\n{Colors.BOLD}📊 Results:{Colors.ENDC}")
-            score_color = Colors.GREEN if success else Colors.RED
-            success_icon = "✅" if success else "❌"
-            print(
-                f" {success_icon} Status: {score_color}{Colors.BOLD}{'PASSED' if success else 'FAILED'}{Colors.ENDC}")
-            print(
-                f" 📈 Score: {score_color}{score:.2f}{Colors.ENDC} (threshold: {self.threshold})")
-            print(f" 💰 Cost: {Colors.YELLOW}${cost:.6f}{Colors.ENDC}")
-            print(f" ⏱️ Time: {Colors.DIM}{elapsed_time:.2f}s{Colors.ENDC}")
-            if reason:
-                print(
-                    f" 💬 Reason: {Colors.DIM}{reason[:100]}{'...' if len(reason) > 100 else ''}{Colors.ENDC}")
-
-        return {
-            "score": score,
-            "success": success,
-            "reason": reason,
-            "evaluation_cost": cost,
-        }
+        if not self.verbose:
+            print(result)
+            return
+
+        import shutil
+        import textwrap
+        import re
+        import json
+
+        # Get the terminal width and take half of it
+        terminal_width = shutil.get_terminal_size().columns
+        WIDTH = terminal_width // 2
+        WIDTH = max(WIDTH, 60)  # At least 60 characters
+
+        # Helper for wrapping long text
+        def wrap_text(text, width, indent=0):
+            """Wraps text onto multiple lines with an indent"""
+            wrapper = textwrap.TextWrapper(
+                width=width - indent,
+                initial_indent=' ' * indent,
+                subsequent_indent=' ' * indent,
+                break_long_words=True,
+                break_on_hyphens=False
+            )
+            return wrapper.fill(text)
+
+        success = result.get('success', False)
+        score = result.get('score', 0.0)
+        reason = result.get('reason', 'N/A')
+        cost = result.get('evaluation_cost', 0.0)
+        evaluation_log = result.get('evaluation_log', None)
+
+        status_icon = "✅" if success else "❌"
+        status_color = Colors.GREEN if success else Colors.RED
+        status_text = "PASSED" if success else "FAILED"
+
+        bar_length = min(30, WIDTH - 30)  # Adaptive progress-bar length
+        filled = int(bar_length * score)
+        bar = '█' * filled + '░' * (bar_length - filled)
+
+        metric_name = result.get('name', self.name)
+        formatted_name = f"📊 {metric_name}"
+
+        # Center the header
+        name_len = len(formatted_name)
+        if name_len > WIDTH:
+            formatted_name = formatted_name[:WIDTH-3] + "..."
+            centered_name = formatted_name
+        else:
+            padding = WIDTH - name_len
+            left_pad = padding // 2
+            right_pad = padding - left_pad
+            centered_name = " " * left_pad + formatted_name + " " * right_pad
+
+        # Header border
+        border = "═" * WIDTH
+
+        print(f"\n{Colors.BOLD}{Colors.CYAN}╔{border}╗{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}{centered_name}{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}╚{border}╝{Colors.ENDC}\n")
+
+        print(f"{Colors.BOLD}Status:{Colors.ENDC} {status_icon} {status_color}{Colors.BOLD}{status_text}{Colors.ENDC}")
+        print(
+            f"{Colors.BOLD}Score:{Colors.ENDC} {Colors.YELLOW}{score:.2f}{Colors.ENDC} [{bar}] {score*100:.0f}%")
+        print(
+            f"{Colors.BOLD}Cost:{Colors.ENDC} {Colors.BLUE}💰 ${cost:.6f}{Colors.ENDC}")
+
+        # Wrap the Reason onto multiple lines if necessary
+        print(f"{Colors.BOLD}Reason:{Colors.ENDC}")
+        wrapped_reason = wrap_text(reason, WIDTH, indent=2)
+        print(f"{Colors.DIM}{wrapped_reason}{Colors.ENDC}\n")
+
+        if evaluation_log:
+            log_json = json.dumps(evaluation_log, indent=2, ensure_ascii=False)
+            log_lines = log_json.split('\n')
+
+            print(f"{Colors.BOLD}Evaluation Log:{Colors.ENDC}")
+            log_border = "─" * WIDTH
+            print(f"{Colors.DIM}╭{log_border}╮{Colors.ENDC}")
+
+            for line in log_lines:
+                # If the line is longer than WIDTH, wrap it
+                if len(line) > WIDTH - 4:
+                    # Split the long line
+                    wrapped_lines = textwrap.wrap(line, width=WIDTH - 4,
+                                                  break_long_words=True,
+                                                  break_on_hyphens=False)
+                    for wrapped_line in wrapped_lines:
+                        spaces_needed = WIDTH - len(wrapped_line) - 2
+                        print(
+                            f"{Colors.DIM}│{Colors.ENDC} {wrapped_line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+                else:
+                    spaces_needed = WIDTH - len(line) - 2
+                    print(
+                        f"{Colors.DIM}│{Colors.ENDC} {line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+
+            print(f"{Colors.DIM}╰{log_border}╯{Colors.ENDC}")
+
+        print(f"{Colors.DIM}{'─' * WIDTH}{Colors.ENDC}\n")
 
 
 class ConversationalMetricPattern:
@@ -123,16 +160,11 @@ class ConversationalMetricPattern:
     Used for metrics like RoleAdherence, DialogueCoherence, etc.
     """
     name: str
-    template_cls: Type
 
-    def __init__(self, model: str, threshold: float, verbose: bool = True):
+    def __init__(self, model: str, threshold: float, verbose: bool = False):
         self.model = model
         self.threshold = threshold
         self.verbose = verbose
-        if self.template_cls:
-            self.template = self.template_cls()
-        else:
-            self.template = None
         self.chatbot_role: Optional[str] = None
 
     def _log(self, message: str, color: str = Colors.CYAN):
@@ -146,84 +178,108 @@ class ConversationalMetricPattern:
         prefix = f"[{step_num}] " if step_num else ""
         print(f"{Colors.DIM} {prefix}{step_name}...{Colors.ENDC}")
 
-    async def evaluate(self, test_case: ConversationalEvalTestCase) -> Dict[str, Any]:
+    def print_result(self, result: Dict[str, Any]):
         """
-        Evaluate conversational test case with logging.
+        Print evaluation result based on verbose setting.
+        If verbose=False: simple dict print
+        If verbose=True: beautiful formatted output with colors
         """
-        start_time = time.time()
-
-        if self.verbose:
-            print(f"\n{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
-            print(
-                f"{Colors.BOLD}{Colors.BLUE}💬 Evaluating Conversation: {self.name}{Colors.ENDC}")
-            print(f"{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
-            print(f"{Colors.DIM}Model: {self.model}{Colors.ENDC}")
-            print(f"{Colors.DIM}Threshold: {self.threshold}{Colors.ENDC}")
-            print(f"{Colors.DIM}Turns: {len(test_case.turns)}{Colors.ENDC}")
-
-        self._log_step("Generating evaluation prompt", 1)
-
-        # 1. Generate prompt
-        if hasattr(self.template, "generate_prompt"):
-            try:
-                prompt = self.template.generate_prompt(
-                    test_case=test_case,
-                    threshold=self.threshold,
-                    chatbot_role=self.chatbot_role
-                )
-            except TypeError:
-                prompt = self.template.generate_prompt(
-                    test_case=test_case,
-                    threshold=self.threshold,
-                    temperature=0.0
-                )
+        if not self.verbose:
+            print(result)
+            return
+
+        import shutil
+        import textwrap
+        import re
+        import json
+
+        # Get the terminal width and take half of it
+        terminal_width = shutil.get_terminal_size().columns
+        WIDTH = terminal_width // 2
+        WIDTH = max(WIDTH, 60)  # At least 60 characters
+
+        # Helper for wrapping long text
+        def wrap_text(text, width, indent=0):
+            """Wraps text onto multiple lines with an indent"""
+            wrapper = textwrap.TextWrapper(
+                width=width - indent,
+                initial_indent=' ' * indent,
+                subsequent_indent=' ' * indent,
+                break_long_words=True,
+                break_on_hyphens=False
+            )
+            return wrapper.fill(text)
+
+        success = result.get('success', False)
+        score = result.get('score', 0.0)
+        reason = result.get('reason', 'N/A')
+        cost = result.get('evaluation_cost', 0.0)
+        evaluation_log = result.get('evaluation_log', None)
+
+        status_icon = "✅" if success else "❌"
+        status_color = Colors.GREEN if success else Colors.RED
+        status_text = "PASSED" if success else "FAILED"
+
+        bar_length = min(30, WIDTH - 30)  # Adaptive progress-bar length
+        filled = int(bar_length * score)
+        bar = '█' * filled + '░' * (bar_length - filled)
+
+        metric_name = result.get('name', self.name)
+        formatted_name = f"📊 {metric_name}"
+
+        # Center the header
+        name_len = len(formatted_name)
+        if name_len > WIDTH:
+            formatted_name = formatted_name[:WIDTH-3] + "..."
+            centered_name = formatted_name
         else:
-            raise RuntimeError("Template is missing method generate_prompt")
-
-        self._log_step("Calling LLM", 2)
-
-        # 2. Call API
-        text, cost = await chat_complete(
-            self.model,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=0.0
-        )
-
-        self._log_step("Parsing response", 3)
-
-        # 3. Parse response
-        try:
-            data = json.loads(text)
-        except Exception as e:
-            self._log(f"❌ Failed to parse JSON: {e}", Colors.RED)
-            raise RuntimeError(
-                f"Cannot parse JSON from model response: {e}\n{text}")
-
-        score = float(data.get("score", 0.0))
-        reason = data.get("reason")
-        success = score >= self.threshold
-
-        # Calculate elapsed time
-        elapsed_time = time.time() - start_time
-
-        # Log results
-        if self.verbose:
-            print(f"\n{Colors.BOLD}📊 Results:{Colors.ENDC}")
-            score_color = Colors.GREEN if success else Colors.RED
-            success_icon = "✅" if success else "❌"
-            print(
-                f" {success_icon} Status: {score_color}{Colors.BOLD}{'PASSED' if success else 'FAILED'}{Colors.ENDC}")
-            print(
-                f" 📈 Score: {score_color}{score:.2f}{Colors.ENDC} (threshold: {self.threshold})")
-            print(f" 💰 Cost: {Colors.YELLOW}${cost:.6f}{Colors.ENDC}")
-            print(f" ⏱️ Time: {Colors.DIM}{elapsed_time:.2f}s{Colors.ENDC}")
-            if reason:
-                print(
-                    f" 💬 Reason: {Colors.DIM}{reason[:100]}{'...' if len(reason) > 100 else ''}{Colors.ENDC}")
-
-        return {
-            "score": score,
-            "success": success,
-            "reason": reason,
-            "evaluation_cost": cost,
-        }
+            padding = WIDTH - name_len
+            left_pad = padding // 2
+            right_pad = padding - left_pad
+            centered_name = " " * left_pad + formatted_name + " " * right_pad
+
+        # Header border
+        border = "═" * WIDTH
+
+        print(f"\n{Colors.BOLD}{Colors.CYAN}╔{border}╗{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}{centered_name}{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}╚{border}╝{Colors.ENDC}\n")
+
+        print(f"{Colors.BOLD}Status:{Colors.ENDC} {status_icon} {status_color}{Colors.BOLD}{status_text}{Colors.ENDC}")
+        print(
+            f"{Colors.BOLD}Score:{Colors.ENDC} {Colors.YELLOW}{score:.2f}{Colors.ENDC} [{bar}] {score*100:.0f}%")
+        print(
+            f"{Colors.BOLD}Cost:{Colors.ENDC} {Colors.BLUE}💰 ${cost:.6f}{Colors.ENDC}")
+
+        # Wrap the Reason onto multiple lines if necessary
+        print(f"{Colors.BOLD}Reason:{Colors.ENDC}")
+        wrapped_reason = wrap_text(reason, WIDTH, indent=2)
+        print(f"{Colors.DIM}{wrapped_reason}{Colors.ENDC}\n")
+
+        if evaluation_log:
+            log_json = json.dumps(evaluation_log, indent=2, ensure_ascii=False)
+            log_lines = log_json.split('\n')
+
+            print(f"{Colors.BOLD}Evaluation Log:{Colors.ENDC}")
+            log_border = "─" * WIDTH
+            print(f"{Colors.DIM}╭{log_border}╮{Colors.ENDC}")
+
+            for line in log_lines:
+                # If the line is longer than WIDTH, wrap it
+                if len(line) > WIDTH - 4:
+                    # Split the long line
+                    wrapped_lines = textwrap.wrap(line, width=WIDTH - 4,
+                                                  break_long_words=True,
+                                                  break_on_hyphens=False)
+                    for wrapped_line in wrapped_lines:
+                        spaces_needed = WIDTH - len(wrapped_line) - 2
+                        print(
+                            f"{Colors.DIM}│{Colors.ENDC} {wrapped_line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+                else:
+                    spaces_needed = WIDTH - len(line) - 2
+                    print(
+                        f"{Colors.DIM}│{Colors.ENDC} {line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+
+            print(f"{Colors.DIM}╰{log_border}╯{Colors.ENDC}")
+
+        print(f"{Colors.DIM}{'─' * WIDTH}{Colors.ENDC}\n")
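Taken together, the metric_pattern.py hunks above drop the base classes' built-in evaluate logging and add a shared print_result helper gated on the new verbose flag. A minimal sketch of how a subclass might hand its result dict to that helper; the DummyMetric class, the model string, and every literal value are illustrative placeholders, not part of the package:

# Illustrative only: a hypothetical subclass wired through the
# print_result helper that 0.3.0 adds to MetricPattern.
import asyncio
from eval_lib.metric_pattern import MetricPattern

class DummyMetric(MetricPattern):
    name = "dummyMetric"

    async def evaluate(self, test_case):
        score = 0.91                          # placeholder score, no LLM call here
        result = {
            "name": self.name,                # the "name" key is new in 0.3.0
            "score": score,
            "success": score >= self.threshold,
            "reason": "Example reason text.",
            "evaluation_cost": 0.0,
            "evaluation_log": {"note": "illustrative"},
        }
        self.print_result(result)             # plain dict, or boxed output when verbose=True
        return result

metric = DummyMetric(model="gpt-4o-mini", threshold=0.8, verbose=False)
asyncio.run(metric.evaluate(test_case=None))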
@@ -178,8 +178,8 @@ class PrecisionConfig:
 class AnswerPrecisionMetric(MetricPattern):
     name = "answerPrecisionMetric"
 
-    def __init__(self, model: str, threshold: float = 0.8, config: Optional[PrecisionConfig] = None):
-        super().__init__(model=model, threshold=threshold)
+    def __init__(self, model: str, threshold: float = 0.8, verbose: bool = False, config: Optional[PrecisionConfig] = None):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.config = config or PrecisionConfig()
 
     # --- core similarity components ---
@@ -395,7 +395,8 @@ class AnswerPrecisionMetric(MetricPattern):
            },
        }
 
-        return {
+        result = {
+            "name": self.name,
             "score": round(final_score, 4),
             "success": success,
             "reason": reason,
@@ -403,3 +404,7 @@ class AnswerPrecisionMetric(MetricPattern):
             "evaluation_cost": 0.0,
             "evaluation_log": evaluation_log,
         }
+
+        self.print_result(result)
+
+        return result
@@ -34,8 +34,9 @@ class AnswerRelevancyMetric(MetricPattern):
         model: str,
         threshold: float = 0.6,
         temperature: float = 0.5,
+        verbose: bool = False
     ):
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature
 
     async def _infer_user_intent(self, question: str) -> str:
@@ -186,10 +187,14 @@ class AnswerRelevancyMetric(MetricPattern):
             "comment_reasoning": "Compressed explanation of the key verdict rationales."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": final_score,
             "success": success,
             "reason": summary_reason,
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
@@ -12,7 +12,14 @@ from eval_lib.llm_client import chat_complete
 
 class BiasMetric(MetricPattern):
     name = "biasMetric"
-    template_cls = None  # all prompts inside the class
+
+    def __init__(
+        self,
+        model: str,
+        threshold: float = 0.8,
+        verbose: bool = False,
+    ):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
 
     # ==================== PROMPTS ====================
 
@@ -105,10 +112,13 @@ JSON:"""
             "comment_reasoning": "Explanation of the bias assessment, including specific biased elements if found."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": score,
             "success": success,
             "reason": reason,
             "evaluation_cost": round(total_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+        return result
@@ -18,9 +18,9 @@ from eval_lib.utils import extract_json_block
 class ContextualPrecisionMetric(MetricPattern):
     name = "contextPrecisionMetric"
 
-    def __init__(self, model: str, threshold: float = 0.7, top_k: int | None = None, ):
-        super().__init__(model=model, threshold=threshold)
-        self.top_k = top_k  # limit of chunks inspected (None = all)
+    def __init__(self, model: str, threshold: float = 0.7, top_k: int | None = None, verbose: bool = False):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
+        self.top_k = top_k
 
     # ------------------------------------------------------------------ #
     async def _is_chunk_relevant(  # judgement = 0 / 1
@@ -93,10 +93,15 @@ class ContextualPrecisionMetric(MetricPattern):
             "comment_success": "Whether precision meets threshold."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": ctx_precision,
             "success": success,
             "reason": f"Average precision across top-{len(chunks)} context chunks.",
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log,
         }
+
+        self.print_result(result)
+
+        return result
@@ -18,8 +18,8 @@ from eval_lib.utils import extract_json_block
 class ContextualRecallMetric(MetricPattern):
     name = "contextualRecallMetric"
 
-    def __init__(self, model: str, threshold: float = 0.7):
-        super().__init__(model=model, threshold=threshold)
+    def __init__(self, model: str, threshold: float = 0.7, verbose: bool = False):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
 
     async def _extract_claims(self, reference: str) -> Tuple[List[str], float]:
         prompt = (
@@ -82,10 +82,14 @@ class ContextualRecallMetric(MetricPattern):
             "comment_success": "Whether the score exceeds the threshold.",
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": recall_score,
             "success": success,
             "reason": f"{supported_count} out of {total_claims} reference claims supported by context.",
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
@@ -34,8 +34,9 @@ class ContextualRelevancyMetric(MetricPattern):
         model: str,
         threshold: float = 0.6,
         temperature: float = 0.5,
+        verbose: bool = False
     ):
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature
 
     async def _infer_user_intent(self, question: str) -> Tuple[str, float]:
@@ -160,10 +161,15 @@ class ContextualRelevancyMetric(MetricPattern):
             "comment_reasoning": "LLM-generated explanation based on verdict rationales."
         }
 
-        return {
+        result = {
+            "name": self.name,
             "score": score,
             "success": success,
             "reason": summary,
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+
+        self.print_result(result)
+
+        return result
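Across the metric files above the pattern is the same: each constructor gains a verbose parameter forwarded to the base class, and each result dict gains a "name" key and is passed through self.print_result() before being returned. A hedged usage sketch of the new surface; the import path is inferred from the file layout shown in this diff, and the model string and hand-built result dict are placeholders, not library output:

# Sketch only: the 0.3.0 constructor signature and result printing.
from eval_lib.metrics.contextual_recall_metric.contextual_recall import ContextualRecallMetric

metric = ContextualRecallMetric(model="gpt-4o-mini", threshold=0.7, verbose=True)

fake_result = {
    "name": metric.name,                  # "contextualRecallMetric"
    "score": 0.75,
    "success": 0.75 >= metric.threshold,
    "reason": "3 out of 4 reference claims supported by context.",
    "evaluation_cost": 0.000123,
    "evaluation_log": {"claims_total": 4, "claims_supported": 3},  # hypothetical log keys
}
metric.print_result(fake_result)          # boxed, colorized output because verbose=True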