eval-ai-library 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eval-ai-library might be problematic.
- eval_ai_library-0.1.0.dist-info/METADATA +753 -0
- eval_ai_library-0.1.0.dist-info/RECORD +34 -0
- eval_ai_library-0.1.0.dist-info/WHEEL +5 -0
- eval_ai_library-0.1.0.dist-info/licenses/LICENSE +21 -0
- eval_ai_library-0.1.0.dist-info/top_level.txt +1 -0
- eval_lib/__init__.py +122 -0
- eval_lib/agent_metrics/__init__.py +12 -0
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +231 -0
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +251 -0
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +347 -0
- eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +106 -0
- eval_lib/datagenerator/datagenerator.py +230 -0
- eval_lib/datagenerator/document_loader.py +510 -0
- eval_lib/datagenerator/prompts.py +192 -0
- eval_lib/evaluate.py +335 -0
- eval_lib/evaluation_schema.py +63 -0
- eval_lib/llm_client.py +286 -0
- eval_lib/metric_pattern.py +229 -0
- eval_lib/metrics/__init__.py +25 -0
- eval_lib/metrics/answer_precision_metric/answer_precision.py +405 -0
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +195 -0
- eval_lib/metrics/bias_metric/bias.py +114 -0
- eval_lib/metrics/contextual_precision_metric/contextual_precision.py +102 -0
- eval_lib/metrics/contextual_recall_metric/contextual_recall.py +91 -0
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +169 -0
- eval_lib/metrics/custom_metric/custom_eval.py +303 -0
- eval_lib/metrics/faithfulness_metric/faithfulness.py +140 -0
- eval_lib/metrics/geval/geval.py +326 -0
- eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +102 -0
- eval_lib/metrics/toxicity_metric/toxicity.py +113 -0
- eval_lib/price.py +37 -0
- eval_lib/py.typed +1 -0
- eval_lib/testcases_schema.py +27 -0
- eval_lib/utils.py +99 -0

eval_lib/metric_pattern.py
@@ -0,0 +1,229 @@
# metric_pattern.py
"""
Base classes for evaluation metrics with beautiful console logging.
"""
import json
import time
from typing import Type, Dict, Any, Union, Optional

from eval_lib.testcases_schema import EvalTestCase, ConversationalEvalTestCase
from eval_lib.llm_client import chat_complete


# ANSI color codes for beautiful console output
class Colors:
    HEADER = '\033[95m'
    BLUE = '\033[94m'
    CYAN = '\033[96m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    DIM = '\033[2m'


class MetricPattern:
    """
    Base class for metrics that use a pattern-based approach to evaluation.
    This class is designed to be subclassed for specific metrics.
    """
    name: str  # name of the metric

    def __init__(self, model: str, threshold: float, verbose: bool = True):
        self.model = model
        self.threshold = threshold
        self.verbose = verbose

    def _log(self, message: str, color: str = Colors.CYAN):
        """Log message with color if verbose mode is enabled"""
        if self.verbose:
            print(f"{color}{message}{Colors.ENDC}")

    def _log_step(self, step_name: str, step_num: int = None):
        """Log evaluation step"""
        if self.verbose:
            prefix = f"[{step_num}] " if step_num else ""
            print(f"{Colors.DIM} {prefix}{step_name}...{Colors.ENDC}")

    async def evaluate(self, test_case: Union[EvalTestCase]) -> Dict[str, Any]:
        """
        Base evaluation method - override in subclasses for custom behavior.
        """
        start_time = time.time()

        if self.verbose:
            print(f"\n{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
            print(f"{Colors.BOLD}{Colors.BLUE}🔍 Evaluating: {self.name}{Colors.ENDC}")
            print(f"{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
            print(f"{Colors.DIM}Model: {self.model}{Colors.ENDC}")
            print(f"{Colors.DIM}Threshold: {self.threshold}{Colors.ENDC}")

        self._log_step("Generating evaluation prompt", 1)

        # 1) Generate prompt
        prompt = self.template.generate_prompt(
            test_case=test_case,
            threshold=self.threshold
        )

        self._log_step("Calling LLM", 2)

        # 2) Make API call
        text, cost = await chat_complete(
            self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )

        self._log_step("Parsing response", 3)

        # 3) Parse the response
        try:
            data = json.loads(text)
        except Exception as e:
            self._log(f"❌ Failed to parse JSON: {e}", Colors.RED)
            raise RuntimeError(
                f"Cannot parse JSON from model response: {e}\n{text}")

        score = float(data.get("score", 0.0))
        reason = data.get("reason")
        success = score >= self.threshold

        # Calculate elapsed time
        elapsed_time = time.time() - start_time

        # Log results
        if self.verbose:
            print(f"\n{Colors.BOLD}📊 Results:{Colors.ENDC}")
            score_color = Colors.GREEN if success else Colors.RED
            success_icon = "✅" if success else "❌"
            print(
                f" {success_icon} Status: {score_color}{Colors.BOLD}{'PASSED' if success else 'FAILED'}{Colors.ENDC}")
            print(
                f" 📈 Score: {score_color}{score:.2f}{Colors.ENDC} (threshold: {self.threshold})")
            print(f" 💰 Cost: {Colors.YELLOW}${cost:.6f}{Colors.ENDC}")
            print(f" ⏱️ Time: {Colors.DIM}{elapsed_time:.2f}s{Colors.ENDC}")
            if reason:
                print(
                    f" 💬 Reason: {Colors.DIM}{reason[:100]}{'...' if len(reason) > 100 else ''}{Colors.ENDC}")

        return {
            "score": score,
            "success": success,
            "reason": reason,
            "evaluation_cost": cost,
        }


class ConversationalMetricPattern:
    """
    Base class for conversational metrics (evaluating full dialogues).
    Used for metrics like RoleAdherence, DialogueCoherence, etc.
    """
    name: str
    template_cls: Type

    def __init__(self, model: str, threshold: float, verbose: bool = True):
        self.model = model
        self.threshold = threshold
        self.verbose = verbose
        if self.template_cls:
            self.template = self.template_cls()
        else:
            self.template = None
        self.chatbot_role: Optional[str] = None

    def _log(self, message: str, color: str = Colors.CYAN):
        """Log message with color if verbose mode is enabled"""
        if self.verbose:
            print(f"{color}{message}{Colors.ENDC}")

    def _log_step(self, step_name: str, step_num: int = None):
        """Log evaluation step"""
        if self.verbose:
            prefix = f"[{step_num}] " if step_num else ""
            print(f"{Colors.DIM} {prefix}{step_name}...{Colors.ENDC}")

    async def evaluate(self, test_case: ConversationalEvalTestCase) -> Dict[str, Any]:
        """
        Evaluate conversational test case with logging.
        """
        start_time = time.time()

        if self.verbose:
            print(f"\n{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
            print(
                f"{Colors.BOLD}{Colors.BLUE}💬 Evaluating Conversation: {self.name}{Colors.ENDC}")
            print(f"{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
            print(f"{Colors.DIM}Model: {self.model}{Colors.ENDC}")
            print(f"{Colors.DIM}Threshold: {self.threshold}{Colors.ENDC}")
            print(f"{Colors.DIM}Turns: {len(test_case.turns)}{Colors.ENDC}")

        self._log_step("Generating evaluation prompt", 1)

        # 1. Generate prompt
        if hasattr(self.template, "generate_prompt"):
            try:
                prompt = self.template.generate_prompt(
                    test_case=test_case,
                    threshold=self.threshold,
                    chatbot_role=self.chatbot_role
                )
            except TypeError:
                prompt = self.template.generate_prompt(
                    test_case=test_case,
                    threshold=self.threshold,
                    temperature=0.0
                )
        else:
            raise RuntimeError("Template is missing method generate_prompt")

        self._log_step("Calling LLM", 2)

        # 2. Call API
        text, cost = await chat_complete(
            self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )

        self._log_step("Parsing response", 3)

        # 3. Parse response
        try:
            data = json.loads(text)
        except Exception as e:
            self._log(f"❌ Failed to parse JSON: {e}", Colors.RED)
            raise RuntimeError(
                f"Cannot parse JSON from model response: {e}\n{text}")

        score = float(data.get("score", 0.0))
        reason = data.get("reason")
        success = score >= self.threshold

        # Calculate elapsed time
        elapsed_time = time.time() - start_time

        # Log results
        if self.verbose:
            print(f"\n{Colors.BOLD}📊 Results:{Colors.ENDC}")
            score_color = Colors.GREEN if success else Colors.RED
            success_icon = "✅" if success else "❌"
            print(
                f" {success_icon} Status: {score_color}{Colors.BOLD}{'PASSED' if success else 'FAILED'}{Colors.ENDC}")
            print(
                f" 📈 Score: {score_color}{score:.2f}{Colors.ENDC} (threshold: {self.threshold})")
            print(f" 💰 Cost: {Colors.YELLOW}${cost:.6f}{Colors.ENDC}")
            print(f" ⏱️ Time: {Colors.DIM}{elapsed_time:.2f}s{Colors.ENDC}")
            if reason:
                print(
                    f" 💬 Reason: {Colors.DIM}{reason[:100]}{'...' if len(reason) > 100 else ''}{Colors.ENDC}")

        return {
            "score": score,
            "success": success,
            "reason": reason,
            "evaluation_cost": cost,
        }
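
The evaluate() flow above expects each subclass to expose a template whose generate_prompt(test_case=..., threshold=...) returns a prompt instructing the model to reply with JSON containing "score" and "reason". A minimal sketch of how a subclass might wire this up (the template class below is hypothetical and purely illustrative; the packaged metrics ship their own prompt templates):

# Hypothetical subclass sketch (not part of the package) showing how
# MetricPattern.evaluate() consumes a prompt template.
from eval_lib.metric_pattern import MetricPattern
from eval_lib.testcases_schema import EvalTestCase


class _ClarityPromptTemplate:
    """Illustrative template; the real metrics define their own prompts."""

    def generate_prompt(self, test_case: EvalTestCase, threshold: float) -> str:
        # The base class expects the LLM to answer with JSON of the form
        # {"score": <float between 0 and 1>, "reason": "<short explanation>"}.
        return (
            "Rate how clear the following answer is, from 0.0 to 1.0, "
            f"and explain briefly (pass threshold: {threshold}).\n"
            f"Answer: {test_case.actual_output}\n"
            'Respond with JSON only: {"score": <float>, "reason": "<string>"}'
        )


class ClarityMetric(MetricPattern):
    name = "clarityMetric"
    template = _ClarityPromptTemplate()  # consumed by the inherited evaluate()

Constructing ClarityMetric(model=..., threshold=...) and awaiting evaluate(test_case) then follows the numbered steps logged above: generate the prompt, call chat_complete, parse the JSON, and compare the score against the threshold.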

eval_lib/metrics/__init__.py
@@ -0,0 +1,25 @@
from eval_lib.metrics.answer_relevancy_metric.answer_relevancy import AnswerRelevancyMetric
from eval_lib.metrics.faithfulness_metric.faithfulness import FaithfulnessMetric
from eval_lib.metrics.contextual_relevancy_metric.contextual_relevancy import ContextualRelevancyMetric
from eval_lib.metrics.contextual_precision_metric.contextual_precision import ContextualPrecisionMetric
from eval_lib.metrics.contextual_recall_metric.contextual_recall import ContextualRecallMetric
from eval_lib.metrics.bias_metric.bias import BiasMetric
from eval_lib.metrics.toxicity_metric.toxicity import ToxicityMetric
from eval_lib.metrics.geval.geval import GEval
from eval_lib.metrics.custom_metric.custom_eval import CustomEvalMetric
from eval_lib.metrics.restricted_refusal_metric.restricted_refusal import RestrictedRefusalMetric
from eval_lib.metrics.answer_precision_metric.answer_precision import AnswerPrecisionMetric

__all__ = [
    "AnswerRelevancyMetric",
    "AnswerPrecisionMetric",
    "FaithfulnessMetric",
    "ContextualRelevancyMetric",
    "ContextualPrecisionMetric",
    "ContextualRecallMetric",
    "BiasMetric",
    "ToxicityMetric",
    "GEval",
    "RestrictedRefusalMetric",
    "CustomEvalMetric"
]
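
All of these exports share the async evaluate(test_case) interface defined by the base classes. A minimal usage sketch, assuming (not confirmed by this diff) that EvalTestCase can be constructed directly from the fields referenced in answer_precision.py below, and with an illustrative model name:

# Usage sketch; the EvalTestCase constructor arguments and the model name are
# assumptions for illustration only.
import asyncio

from eval_lib.metrics import AnswerPrecisionMetric
from eval_lib.testcases_schema import EvalTestCase


async def main() -> None:
    metric = AnswerPrecisionMetric(model="gpt-4o-mini", threshold=0.8)
    test_case = EvalTestCase(
        actual_output="The invoice total is 42.50 USD.",
        expected_output="Invoice total: 42.5 USD",
    )
    result = await metric.evaluate(test_case)
    print(result["score"], result["success"], result["reason"])


if __name__ == "__main__":
    asyncio.run(main())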

eval_lib/metrics/answer_precision_metric/answer_precision.py
@@ -0,0 +1,405 @@
'''
Answer Precision Metric: Evaluates how precisely a model's answer matches the expected answer
using multiple text similarity components and weighted aggregation.

Score Components:
- Exact Match: 1.0 if actual == expected else 0.0
- Normalized Match: 1.0 if normalized(actual) == normalized(expected) else 0.0
- Character Similarity: Ratio of matching characters after normalization.
- Token Precision: Proportion of expected tokens present in actual answer.
- Numeric Agreement: Proportion of expected numeric values correctly represented in actual answer.
'''

from __future__ import annotations

import math
import re
import unicodedata
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from typing import Any, Dict, List, Optional, Set, Tuple

from eval_lib.testcases_schema import EvalTestCase
from eval_lib.metric_pattern import MetricPattern


# -------------------------------
# Helpers to normalize and compare text
# -------------------------------

def _normalize_text_basic(text: str) -> str:
    """Lowercase, strip, collapse whitespace; also strip markdown links/URLs."""
    if text is None:
        return ""
    # [label](https://...) -> label
    text = re.sub(r"\[([^\]]+)\]\(\s*https?://[^)]+\s*\)", r"\1", text)
    # bare URLs
    text = re.sub(r"https?://\S+", "", text)
    text = unicodedata.normalize("NFKC", text).lower().strip()
    text = re.sub(r"\s+", " ", text)
    return text


_PUNCT_RE = re.compile(r"[!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]")


def _normalize_for_tokens(text: str) -> str:
    text = _normalize_text_basic(text)
    text = _PUNCT_RE.sub(" ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def _power_mean_score(components: Dict[str, float], weights: Dict[str, float], p: float = 0.3) -> float:
    """
    Weighted power mean:
        M_p = ( sum_i alpha_i * s_i^p )^(1/p)
    alpha_i = w_i / sum(w_i). Recommended 0 < p < 1.
    """
    keys = [k for k in components.keys() if k in weights]
    if not keys:
        return 0.0

    raw = {k: max(0.0, float(weights[k])) for k in keys}
    total = sum(raw.values())
    if total <= 0:
        alpha = {k: 1.0 / len(keys) for k in keys}
    else:
        alpha = {k: raw[k] / total for k in keys}

    acc = 0.0
    for k in keys:
        s = max(0.0, min(1.0, float(components[k])))
        acc += alpha[k] * (s ** p)
    score = acc ** (1.0 / p)
    return max(0.0, min(1.0, score))


STOPWORDS: Set[str] = {
    "the", "a", "an", "and", "or", "but", "if", "then", "else",
    "to", "of", "in", "on", "for", "with", "as", "by", "at",
    "from", "that", "this", "it", "is", "are", "was", "were",
}


def _line_word_diffs(actual: str, expected: str, drop_stopwords: bool = True) -> List[Dict[str, Any]]:
    """
    Per-line diagnostics: words added/removed per line (human-readable).
    """
    a_lines = (actual or "").splitlines()
    e_lines = (expected or "").splitlines()
    n = max(len(a_lines), len(e_lines))
    diffs: List[Dict[str, Any]] = []

    def words(s: str) -> List[str]:
        return _tokenize(s, drop_stopwords=drop_stopwords)

    for i in range(n):
        e_line = e_lines[i] if i < len(e_lines) else ""
        a_line = a_lines[i] if i < len(a_lines) else ""
        if _normalize_text_basic(e_line) == _normalize_text_basic(a_line):
            continue
        e_set = set(words(e_line))
        a_set = set(words(a_line))
        removed = sorted(list(e_set - a_set))
        added = sorted(list(a_set - e_set))
        if removed or added:
            diffs.append({
                "line_no": i + 1,  # 1-based
                "expected": e_line,
                "actual": a_line,
                "removed": removed,
                "added": added,
            })
    return diffs


def _tokenize(text: str, drop_stopwords: bool = True) -> List[str]:
    text = _normalize_for_tokens(text)
    tokens = [t for t in text.split(" ") if t]
    if drop_stopwords:
        tokens = [t for t in tokens if t not in STOPWORDS]
    return tokens


_NUM_RE = re.compile(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?")


def _extract_numbers(text: str) -> List[float]:
    nums: List[float] = []
    for m in _NUM_RE.finditer(text or ""):
        try:
            nums.append(float(m.group(0)))
        except Exception:
            pass
    return nums


def _token_sets(a: str, e: str) -> Tuple[Set[str], Set[str]]:
    a_tokens = set(_tokenize(a))
    e_tokens = set(_tokenize(e))
    return a_tokens, e_tokens


def _token_overlap_coefficient(a: str, e: str) -> float:
    """
    Overlap coefficient: |A ∩ E| / min(|A|, |E|).
    """
    A, E = _token_sets(a, e)
    if not A and not E:
        return 1.0
    if not A or not E:
        return 0.0
    inter = len(A & E)
    return inter / min(len(A), len(E))


@dataclass
class PrecisionConfig:
    token_stopwords: Set[str] = field(default_factory=lambda: set(STOPWORDS))
    numeric_tolerance_abs: float = 0.0
    numeric_tolerance_rel: float = 0.0
    require_expected_present: bool = True
    weights: Optional[Dict[str, float]] = None
    power_p: float = 0.3

    def __post_init__(self):
        if self.weights is None:
            self.weights = {
                "contains": 0.48,
                "char_ratio": 0.32,
                "token_precision": 0.15,
                "numeric": 0.35,
            }


class AnswerPrecisionMetric(MetricPattern):
    name = "answerPrecisionMetric"

    def __init__(self, model: str, threshold: float = 0.8, config: Optional[PrecisionConfig] = None):
        super().__init__(model=model, threshold=threshold)
        self.config = config or PrecisionConfig()

    # --- core similarity components ---
    def _exact_match(self, a: str, e: str) -> float:
        return 1.0 if (a or "") == (e or "") else 0.0

    def _normalized_match(self, a: str, e: str) -> float:
        return 1.0 if _normalize_text_basic(a) == _normalize_text_basic(e) else 0.0

    def _char_similarity(self, a: str, e: str) -> float:
        a_norm = _normalize_text_basic(a)
        e_norm = _normalize_text_basic(e)
        if not a_norm and not e_norm:
            return 1.0
        return SequenceMatcher(a=a_norm, b=e_norm).ratio()

    def _token_precision(self, a: str, e: str) -> Tuple[float, Dict[str, Any]]:
        a_tokens = set(_tokenize(a))
        e_tokens = set(_tokenize(e))
        if not a_tokens:
            return 1.0 if not e_tokens else 0.0, {
                "actual_tokens": [],
                "expected_tokens": sorted(list(e_tokens)),
                "true_positive": [],
                "false_positive": [],
                "false_negative": sorted(list(e_tokens)),
            }
        tp = sorted(list(a_tokens & e_tokens))
        fp = sorted(list(a_tokens - e_tokens))
        fn = sorted(list(e_tokens - a_tokens))
        precision = len(tp) / (len(tp) + len(fp)) if (len(tp) + len(fp)) > 0 else 0.0
        return precision, {
            "actual_tokens": sorted(list(a_tokens)),
            "expected_tokens": sorted(list(e_tokens)),
            "true_positive": tp,
            "false_positive": fp,
            "false_negative": fn,
        }

    def _numeric_agreement(self, a: str, e: str) -> Tuple[float, Dict[str, Any]]:
        a_nums = _extract_numbers(a)
        e_nums = _extract_numbers(e)
        if not e_nums and not a_nums:
            return 1.0, {"actual_numbers": [], "expected_numbers": [], "matches": [], "mismatches": []}
        if not e_nums and a_nums:
            return 0.0, {"actual_numbers": a_nums, "expected_numbers": [], "matches": [], "mismatches": a_nums}
        tol_abs = self.config.numeric_tolerance_abs
        tol_rel = self.config.numeric_tolerance_rel
        used_idx: Set[int] = set()
        matches: List[Tuple[float, float]] = []
        mismatches: List[Tuple[float, float, float]] = []
        for exp_v in e_nums:
            best_i = None
            best_err = math.inf
            for i, act_v in enumerate(a_nums):
                if i in used_idx:
                    continue
                err = abs(act_v - exp_v)
                if err < best_err:
                    best_err = err
                    best_i = i
            if best_i is None:
                mismatches.append((exp_v, float("nan"), float("inf")))
                continue
            act_v = a_nums[best_i]
            used_idx.add(best_i)
            rel_err = abs(act_v - exp_v) / (abs(exp_v) + 1e-12)
            within = (best_err <= tol_abs) or (rel_err <= tol_rel)
            if within:
                matches.append((exp_v, act_v))
            else:
                mismatches.append((exp_v, act_v, rel_err))
        score = len(matches) / len(e_nums) if e_nums else (1.0 if not a_nums else 0.0)
        detail = {
            "actual_numbers": a_nums,
            "expected_numbers": e_nums,
            "matches": matches,
            "mismatches": mismatches,
            "tolerance_abs": tol_abs,
            "tolerance_rel": tol_rel,
        }
        return score, detail

    # --- evaluation entrypoint ---
    async def evaluate(self, test_case: EvalTestCase) -> Dict[str, Any]:
        actual = test_case.actual_output or ""
        expected = test_case.expected_output or ""

        if self.config.require_expected_present and not test_case.expected_output:
            return {
                "score": 0.0,
                "success": False,
                "reason": "No expected output provided.",
                "evaluation_cost": 0.0,
                "evaluation_log": {"note": "expected_output is required for AnswerPrecisionMetric"},
            }

        w = self.config.weights

        exact = self._exact_match(actual, expected)
        normalized = self._normalized_match(actual, expected)
        contains = _token_overlap_coefficient(actual, expected)
        char_ratio = self._char_similarity(actual, expected)
        token_prec, token_detail = self._token_precision(actual, expected)
        numeric_score, numeric_detail = self._numeric_agreement(actual, expected)

        # ------- Diagnostics / Issues -------
        missing_terms = token_detail.get("false_negative", [])
        extra_terms = token_detail.get("false_positive", [])

        numeric_mismatches_raw = numeric_detail.get("mismatches", [])
        line_diffs = _line_word_diffs(actual, expected, drop_stopwords=True)

        # Human-readable diagnostics
        num_msgs: List[str] = []
        normalized_numeric_mismatches: List[Dict[str, Any]] = []
        for m in numeric_mismatches_raw:
            if isinstance(m, (list, tuple)) and len(m) >= 2:
                exp = m[0]
                act = m[1]
                rel = float(m[2]) if (isinstance(m, (list, tuple)) and len(m) > 2) else None
                num_msgs.append(f"expected {exp}, got {act}" + (
                    f" (rel_err={rel:.4g})" if rel is not None and rel != float('inf') else ""))
                normalized_numeric_mismatches.append(
                    {"expected": exp, "actual": act, "rel_err": rel})
            else:
                # unexpected format
                num_msgs.append(f"unexpected number {m}")
                normalized_numeric_mismatches.append(
                    {"expected": None, "actual": m, "rel_err": None})

        if num_msgs:
            numbers_summary = "numeric mismatches: " + "; ".join(num_msgs)
        else:
            numbers_summary = "Numbers: all matched."

        missing_summary = f"Missing terms: {missing_terms}" if missing_terms else "Missing terms: none."
        extra_summary = f"Extra terms: {extra_terms}" if extra_terms else "Extra terms: none."
        diagnostics = f"{missing_summary} {extra_summary} {numbers_summary}"

        # --- Aggregation via power mean ---
        component_scores = {
            "contains": contains,
            "char_ratio": char_ratio,
            "token_precision": token_prec,
            "numeric": numeric_score,
        }
        base_score = _power_mean_score(component_scores, w, p=self.config.power_p)

        # Penalties
        fp = token_detail["false_positive"]
        heavy_penalty = 0.0
        if len(fp) >= 5 and token_prec < 0.7:
            heavy_penalty = 0.05

        final_score = base_score * (1.0 - heavy_penalty)
        final_score = max(0.0, min(1.0, final_score))
        success = final_score >= self.threshold

        # Human-readable reason
        reason_bits: List[str] = []
        if exact == 1.0:
            reason_bits.append("exact match")
        elif normalized == 1.0:
            reason_bits.append("normalized match")
        else:
            reason_bits.append(f"char similarity {char_ratio:.2f}")
            reason_bits.append(f"token precision {token_prec:.2f}")
        if numeric_detail["expected_numbers"] or numeric_detail["actual_numbers"]:
            reason_bits.append(f"numeric agreement {numeric_score:.2f}")
        if contains == 1.0:
            reason_bits.append("full containment")
        elif contains > 0:
            reason_bits.append(f"containment {contains:.2f}")
        if heavy_penalty > 0:
            reason_bits.append("penalized for many extra tokens")
        reason_bits.append(
            f"power-mean aggregation (p={self.config.power_p:.2f}); {missing_summary} {extra_summary}")
        reason = ", ".join(reason_bits)  # fix: build the human-readable reason string

        evaluation_log = {
            "actual": actual,
            "expected": expected,
            "components": {
                "exact": exact,
                "normalized": normalized,
                "contains": contains,
                "char_ratio": char_ratio,
                "token_precision": token_prec,
                "numeric": numeric_score,
                "weights": w,
                "heavy_penalty": heavy_penalty,
            },
            "token_detail": token_detail,
            "numeric_detail": numeric_detail,
            "issues": {
                "missing_terms": missing_terms,
                "extra_terms": extra_terms,
                "numeric_mismatches": normalized_numeric_mismatches,
                "line_diffs": line_diffs,
            },
            "threshold": self.threshold,
            "config": {
                "numeric_tolerance_abs": self.config.numeric_tolerance_abs,
                "numeric_tolerance_rel": self.config.numeric_tolerance_rel,
                "require_expected_present": self.config.require_expected_present,
                "stopwords_count": len(self.config.token_stopwords),
            },
        }

        return {
            "score": round(final_score, 4),
            "success": success,
            "reason": reason,
            "diagnostics": diagnostics,
            "evaluation_cost": 0.0,
            "evaluation_log": evaluation_log,
        }