eval-ai-library 0.2.2__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of eval-ai-library has been flagged as potentially problematic.
- eval_ai_library-0.3.0.dist-info/METADATA +1042 -0
- eval_ai_library-0.3.0.dist-info/RECORD +34 -0
- eval_lib/__init__.py +19 -6
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +8 -3
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +12 -4
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +23 -23
- eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +8 -2
- eval_lib/datagenerator/datagenerator.py +208 -12
- eval_lib/datagenerator/document_loader.py +29 -29
- eval_lib/evaluate.py +0 -22
- eval_lib/llm_client.py +223 -78
- eval_lib/metric_pattern.py +208 -152
- eval_lib/metrics/answer_precision_metric/answer_precision.py +8 -3
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +7 -2
- eval_lib/metrics/bias_metric/bias.py +12 -2
- eval_lib/metrics/contextual_precision_metric/contextual_precision.py +9 -4
- eval_lib/metrics/contextual_recall_metric/contextual_recall.py +7 -3
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +8 -2
- eval_lib/metrics/custom_metric/custom_eval.py +237 -204
- eval_lib/metrics/faithfulness_metric/faithfulness.py +7 -2
- eval_lib/metrics/geval/geval.py +8 -2
- eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +7 -3
- eval_lib/metrics/toxicity_metric/toxicity.py +8 -2
- eval_lib/utils.py +44 -29
- eval_ai_library-0.2.2.dist-info/METADATA +0 -779
- eval_ai_library-0.2.2.dist-info/RECORD +0 -34
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.0.dist-info}/WHEEL +0 -0
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {eval_ai_library-0.2.2.dist-info → eval_ai_library-0.3.0.dist-info}/top_level.txt +0 -0
eval_lib/metric_pattern.py
CHANGED
@@ -10,8 +10,8 @@ from eval_lib.testcases_schema import EvalTestCase, ConversationalEvalTestCase
 from eval_lib.llm_client import chat_complete
 
 
-# ANSI color codes for beautiful console output
 class Colors:
+    """ANSI color codes for beautiful console output"""
     HEADER = '\033[95m'
     BLUE = '\033[94m'
     CYAN = '\033[96m'
@@ -31,7 +31,7 @@ class MetricPattern:
     """
     name: str  # name of the metric
 
-    def __init__(self, model: str, threshold: float, verbose: bool =
+    def __init__(self, model: str, threshold: float, verbose: bool = False):
         self.model = model
         self.threshold = threshold
         self.verbose = verbose
@@ -47,74 +47,111 @@ class MetricPattern:
         prefix = f"[{step_num}] " if step_num else ""
         print(f"{Colors.DIM}  {prefix}{step_name}...{Colors.ENDC}")
 
-    (removed lines not rendered in this diff view)
+    def print_result(self, result: Dict[str, Any]):
         """
+        Print evaluation result based on verbose setting.
+        If verbose=False: simple dict print
+        If verbose=True: beautiful formatted output with colors
         """
+        if not self.verbose:
+            print(result)
+            return
+
+        import shutil
+        import textwrap
+        import re
+        import json
+
+        # Get the terminal width and use half of it
+        terminal_width = shutil.get_terminal_size().columns
+        WIDTH = terminal_width // 2
+        WIDTH = max(WIDTH, 60)  # At least 60 characters
+
+        # Helper for wrapping long text
+        def wrap_text(text, width, indent=0):
+            """Wrap text onto multiple lines with an indent"""
+            wrapper = textwrap.TextWrapper(
+                width=width - indent,
+                initial_indent=' ' * indent,
+                subsequent_indent=' ' * indent,
+                break_long_words=True,
+                break_on_hyphens=False
+            )
+            return wrapper.fill(text)
+
+        success = result.get('success', False)
+        score = result.get('score', 0.0)
+        reason = result.get('reason', 'N/A')
+        cost = result.get('evaluation_cost', 0.0)
+        evaluation_log = result.get('evaluation_log', None)
+
+        status_icon = "✅" if success else "❌"
+        status_color = Colors.GREEN if success else Colors.RED
+        status_text = "PASSED" if success else "FAILED"
+
+        bar_length = min(30, WIDTH - 30)  # Adaptive progress bar length
+        filled = int(bar_length * score)
+        bar = '█' * filled + '░' * (bar_length - filled)
+
+        metric_name = result.get('name', self.name)
+        formatted_name = f"📊 {metric_name}"
+
+        # Center the title
+        name_len = len(formatted_name)
+        if name_len > WIDTH:
+            formatted_name = formatted_name[:WIDTH-3] + "..."
+            centered_name = formatted_name
+        else:
+            padding = WIDTH - name_len
+            left_pad = padding // 2
+            right_pad = padding - left_pad
+            centered_name = " " * left_pad + formatted_name + " " * right_pad
+
+        # Title frame
+        border = "═" * WIDTH
+
+        print(f"\n{Colors.BOLD}{Colors.CYAN}╔{border}╗{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}{centered_name}{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}╚{border}╝{Colors.ENDC}\n")
+
+        print(f"{Colors.BOLD}Status:{Colors.ENDC} {status_icon} {status_color}{Colors.BOLD}{status_text}{Colors.ENDC}")
+        print(
+            f"{Colors.BOLD}Score:{Colors.ENDC} {Colors.YELLOW}{score:.2f}{Colors.ENDC} [{bar}] {score*100:.0f}%")
+        print(
+            f"{Colors.BOLD}Cost:{Colors.ENDC} {Colors.BLUE}💰 ${cost:.6f}{Colors.ENDC}")
+
+        # Wrap Reason onto multiple lines if needed
+        print(f"{Colors.BOLD}Reason:{Colors.ENDC}")
+        wrapped_reason = wrap_text(reason, WIDTH, indent=2)
+        print(f"{Colors.DIM}{wrapped_reason}{Colors.ENDC}\n")
+
+        if evaluation_log:
+            log_json = json.dumps(evaluation_log, indent=2, ensure_ascii=False)
+            log_lines = log_json.split('\n')
+
+            print(f"{Colors.BOLD}Evaluation Log:{Colors.ENDC}")
+            log_border = "─" * WIDTH
+            print(f"{Colors.DIM}╭{log_border}╮{Colors.ENDC}")
+
+            for line in log_lines:
+                # Wrap lines longer than WIDTH
+                if len(line) > WIDTH - 4:
+                    # Split the long line
+                    wrapped_lines = textwrap.wrap(line, width=WIDTH - 4,
+                                                  break_long_words=True,
+                                                  break_on_hyphens=False)
+                    for wrapped_line in wrapped_lines:
+                        spaces_needed = WIDTH - len(wrapped_line) - 2
+                        print(
+                            f"{Colors.DIM}│{Colors.ENDC} {wrapped_line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+                else:
+                    spaces_needed = WIDTH - len(line) - 2
+                    print(
+                        f"{Colors.DIM}│{Colors.ENDC} {line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+
+            print(f"{Colors.DIM}╰{log_border}╯{Colors.ENDC}")
+
+        print(f"{Colors.DIM}{'─' * WIDTH}{Colors.ENDC}\n")
 
 
 class ConversationalMetricPattern:
@@ -123,16 +160,11 @@ class ConversationalMetricPattern:
     Used for metrics like RoleAdherence, DialogueCoherence, etc.
     """
     name: str
-    template_cls: Type
 
-    def __init__(self, model: str, threshold: float, verbose: bool =
+    def __init__(self, model: str, threshold: float, verbose: bool = False):
         self.model = model
         self.threshold = threshold
         self.verbose = verbose
-        if self.template_cls:
-            self.template = self.template_cls()
-        else:
-            self.template = None
         self.chatbot_role: Optional[str] = None
 
     def _log(self, message: str, color: str = Colors.CYAN):
@@ -146,84 +178,108 @@ class ConversationalMetricPattern:
         prefix = f"[{step_num}] " if step_num else ""
         print(f"{Colors.DIM}  {prefix}{step_name}...{Colors.ENDC}")
 
-    (removed lines not rendered in this diff view)
+    def print_result(self, result: Dict[str, Any]):
         """
+        Print evaluation result based on verbose setting.
+        If verbose=False: simple dict print
+        If verbose=True: beautiful formatted output with colors
         """
+        if not self.verbose:
+            print(result)
+            return
+
+        import shutil
+        import textwrap
+        import re
+        import json
+
+        # Get the terminal width and use half of it
+        terminal_width = shutil.get_terminal_size().columns
+        WIDTH = terminal_width // 2
+        WIDTH = max(WIDTH, 60)  # At least 60 characters
+
+        # Helper for wrapping long text
+        def wrap_text(text, width, indent=0):
+            """Wrap text onto multiple lines with an indent"""
+            wrapper = textwrap.TextWrapper(
+                width=width - indent,
+                initial_indent=' ' * indent,
+                subsequent_indent=' ' * indent,
+                break_long_words=True,
+                break_on_hyphens=False
+            )
+            return wrapper.fill(text)
+
+        success = result.get('success', False)
+        score = result.get('score', 0.0)
+        reason = result.get('reason', 'N/A')
+        cost = result.get('evaluation_cost', 0.0)
+        evaluation_log = result.get('evaluation_log', None)
+
+        status_icon = "✅" if success else "❌"
+        status_color = Colors.GREEN if success else Colors.RED
+        status_text = "PASSED" if success else "FAILED"
+
+        bar_length = min(30, WIDTH - 30)  # Adaptive progress bar length
+        filled = int(bar_length * score)
+        bar = '█' * filled + '░' * (bar_length - filled)
+
+        metric_name = result.get('name', self.name)
+        formatted_name = f"📊 {metric_name}"
+
+        # Center the title
+        name_len = len(formatted_name)
+        if name_len > WIDTH:
+            formatted_name = formatted_name[:WIDTH-3] + "..."
+            centered_name = formatted_name
         else:
+            padding = WIDTH - name_len
+            left_pad = padding // 2
+            right_pad = padding - left_pad
+            centered_name = " " * left_pad + formatted_name + " " * right_pad
+
+        # Title frame
+        border = "═" * WIDTH
+
+        print(f"\n{Colors.BOLD}{Colors.CYAN}╔{border}╗{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}{centered_name}{Colors.BOLD}{Colors.CYAN}║{Colors.ENDC}")
+        print(f"{Colors.BOLD}{Colors.CYAN}╚{border}╝{Colors.ENDC}\n")
+
+        print(f"{Colors.BOLD}Status:{Colors.ENDC} {status_icon} {status_color}{Colors.BOLD}{status_text}{Colors.ENDC}")
+        print(
+            f"{Colors.BOLD}Score:{Colors.ENDC} {Colors.YELLOW}{score:.2f}{Colors.ENDC} [{bar}] {score*100:.0f}%")
+        print(
+            f"{Colors.BOLD}Cost:{Colors.ENDC} {Colors.BLUE}💰 ${cost:.6f}{Colors.ENDC}")
+
+        # Wrap Reason onto multiple lines if needed
+        print(f"{Colors.BOLD}Reason:{Colors.ENDC}")
+        wrapped_reason = wrap_text(reason, WIDTH, indent=2)
+        print(f"{Colors.DIM}{wrapped_reason}{Colors.ENDC}\n")
+
+        if evaluation_log:
+            log_json = json.dumps(evaluation_log, indent=2, ensure_ascii=False)
+            log_lines = log_json.split('\n')
+
+            print(f"{Colors.BOLD}Evaluation Log:{Colors.ENDC}")
+            log_border = "─" * WIDTH
+            print(f"{Colors.DIM}╭{log_border}╮{Colors.ENDC}")
+
+            for line in log_lines:
+                # Wrap lines longer than WIDTH
+                if len(line) > WIDTH - 4:
+                    # Split the long line
+                    wrapped_lines = textwrap.wrap(line, width=WIDTH - 4,
+                                                  break_long_words=True,
+                                                  break_on_hyphens=False)
+                    for wrapped_line in wrapped_lines:
+                        spaces_needed = WIDTH - len(wrapped_line) - 2
+                        print(
+                            f"{Colors.DIM}│{Colors.ENDC} {wrapped_line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+                else:
+                    spaces_needed = WIDTH - len(line) - 2
+                    print(
+                        f"{Colors.DIM}│{Colors.ENDC} {line}{' ' * spaces_needed}{Colors.DIM}│{Colors.ENDC}")
+
+            print(f"{Colors.DIM}╰{log_border}╯{Colors.ENDC}")
+
+        print(f"{Colors.DIM}{'─' * WIDTH}{Colors.ENDC}\n")
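Taken together, these hunks mean both base classes now accept a verbose flag and route their result dicts through print_result(). A minimal sketch of the new behavior (not from the package docs; the model string and metric name are placeholder values, and the import path assumes the module layout in the change list above):

    from eval_lib.metric_pattern import MetricPattern

    metric = MetricPattern(model="gpt-4o-mini", threshold=0.7, verbose=True)
    metric.name = "demoMetric"  # subclasses normally define `name` as a class attribute

    # verbose=True triggers the boxed, colorized summary added in 0.3.0;
    # with verbose=False the same call would just print() the raw dict.
    metric.print_result({
        "name": "demoMetric",
        "score": 0.82,
        "success": True,
        "reason": "Most claims are supported by the retrieved context.",
        "evaluation_cost": 0.000214,
        "evaluation_log": {"verdicts": ["yes", "yes", "no"]},
    })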
eval_lib/metrics/answer_precision_metric/answer_precision.py
CHANGED
@@ -178,8 +178,8 @@ class PrecisionConfig:
 class AnswerPrecisionMetric(MetricPattern):
     name = "answerPrecisionMetric"
 
-    def __init__(self, model: str, threshold: float = 0.8, config: Optional[PrecisionConfig] = None):
-        super().__init__(model=model, threshold=threshold)
+    def __init__(self, model: str, threshold: float = 0.8, verbose: bool = False, config: Optional[PrecisionConfig] = None):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.config = config or PrecisionConfig()
 
     # --- core similarity components ---
@@ -395,7 +395,8 @@ class AnswerPrecisionMetric(MetricPattern):
             },
         }
 
-
+        result = {
+            "name": self.name,
             "score": round(final_score, 4),
             "success": success,
             "reason": reason,
@@ -403,3 +404,7 @@ class AnswerPrecisionMetric(MetricPattern):
             "evaluation_cost": 0.0,
             "evaluation_log": evaluation_log,
         }
+
+        self.print_result(result)
+
+        return result
eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py
CHANGED
@@ -34,8 +34,9 @@ class AnswerRelevancyMetric(MetricPattern):
         model: str,
         threshold: float = 0.6,
         temperature: float = 0.5,
+        verbose: bool = False
     ):
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature
 
     async def _infer_user_intent(self, question: str) -> str:
@@ -186,10 +187,14 @@ class AnswerRelevancyMetric(MetricPattern):
             "comment_reasoning": "Compressed explanation of the key verdict rationales."
         }
 
-
+        result = {
+            "name": self.name,
             "score": final_score,
             "success": success,
             "reason": summary_reason,
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
eval_lib/metrics/bias_metric/bias.py
CHANGED
@@ -12,7 +12,14 @@ from eval_lib.llm_client import chat_complete
 
 class BiasMetric(MetricPattern):
     name = "biasMetric"
-
+
+    def __init__(
+        self,
+        model: str,
+        threshold: float = 0.8,
+        verbose: bool = False,
+    ):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
 
     # ==================== PROMPTS ====================
 
@@ -105,10 +112,13 @@ JSON:"""
|
|
|
105
112
|
"comment_reasoning": "Explanation of the bias assessment, including specific biased elements if found."
|
|
106
113
|
}
|
|
107
114
|
|
|
108
|
-
|
|
115
|
+
result = {
|
|
116
|
+
"name": self.name,
|
|
109
117
|
"score": score,
|
|
110
118
|
"success": success,
|
|
111
119
|
"reason": reason,
|
|
112
120
|
"evaluation_cost": round(total_cost, 6),
|
|
113
121
|
"evaluation_log": evaluation_log
|
|
114
122
|
}
|
|
123
|
+
self.print_result(result)
|
|
124
|
+
return result
|
|
eval_lib/metrics/contextual_precision_metric/contextual_precision.py
CHANGED
@@ -18,9 +18,9 @@ from eval_lib.utils import extract_json_block
 class ContextualPrecisionMetric(MetricPattern):
     name = "contextPrecisionMetric"
 
-    def __init__(self, model: str, threshold: float = 0.7, top_k: int | None = None, ):
-        super().__init__(model=model, threshold=threshold)
-        self.top_k = top_k
+    def __init__(self, model: str, threshold: float = 0.7, top_k: int | None = None, verbose: bool = False):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
+        self.top_k = top_k
 
     # ------------------------------------------------------------------ #
     async def _is_chunk_relevant(  # judgement = 0 / 1
@@ -93,10 +93,15 @@ class ContextualPrecisionMetric(MetricPattern):
             "comment_success": "Whether precision meets threshold."
         }
 
-
+        result = {
+            "name": self.name,
             "score": ctx_precision,
             "success": success,
             "reason": f"Average precision across top-{len(chunks)} context chunks.",
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log,
         }
+
+        self.print_result(result)
+
+        return result
eval_lib/metrics/contextual_recall_metric/contextual_recall.py
CHANGED
@@ -18,8 +18,8 @@ from eval_lib.utils import extract_json_block
 class ContextualRecallMetric(MetricPattern):
     name = "contextualRecallMetric"
 
-    def __init__(self, model: str, threshold: float = 0.7):
-        super().__init__(model=model, threshold=threshold)
+    def __init__(self, model: str, threshold: float = 0.7, verbose: bool = False):
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
 
     async def _extract_claims(self, reference: str) -> Tuple[List[str], float]:
         prompt = (
@@ -82,10 +82,14 @@ class ContextualRecallMetric(MetricPattern):
             "comment_success": "Whether the score exceeds the threshold.",
         }
 
-
+        result = {
+            "name": self.name,
             "score": recall_score,
             "success": success,
             "reason": f"{supported_count} out of {total_claims} reference claims supported by context.",
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+        self.print_result(result)
+
+        return result
eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py
CHANGED
@@ -34,8 +34,9 @@ class ContextualRelevancyMetric(MetricPattern):
         model: str,
         threshold: float = 0.6,
         temperature: float = 0.5,
+        verbose: bool = False
     ):
-        super().__init__(model=model, threshold=threshold)
+        super().__init__(model=model, threshold=threshold, verbose=verbose)
         self.temperature = temperature
 
     async def _infer_user_intent(self, question: str) -> Tuple[str, float]:
@@ -160,10 +161,15 @@ class ContextualRelevancyMetric(MetricPattern):
             "comment_reasoning": "LLM-generated explanation based on verdict rationales."
         }
 
-
+        result = {
+            "name": self.name,
             "score": score,
             "success": success,
             "reason": summary,
             "evaluation_cost": round(llm_cost, 6),
             "evaluation_log": evaluation_log
         }
+
+        self.print_result(result)
+
+        return result
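Every metric constructor in the hunks above gains the same optional verbose parameter and forwards it to the base class, and each evaluation path now builds a result dict that carries the metric's name, passes it to print_result(), and returns it. A hedged construction sketch (model names are placeholders; the import paths assume the package mirrors the file layout in the change list):

    from eval_lib.metrics.answer_precision_metric.answer_precision import AnswerPrecisionMetric
    from eval_lib.metrics.contextual_recall_metric.contextual_recall import ContextualRecallMetric

    # The defaults keep the old quiet behavior (verbose=False just prints the raw dict).
    precision = AnswerPrecisionMetric(model="gpt-4o-mini", threshold=0.8, verbose=True)
    recall = ContextualRecallMetric(model="gpt-4o-mini", threshold=0.7, verbose=True)

    # Whatever evaluation entry point a metric exposes now ends with
    #     self.print_result(result)
    #     return result
    # so callers get the same dict back whether or not it was pretty-printed.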