eval_ai_library-0.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of eval-ai-library might be problematic.

Files changed (34)
  1. eval_ai_library-0.1.0.dist-info/METADATA +753 -0
  2. eval_ai_library-0.1.0.dist-info/RECORD +34 -0
  3. eval_ai_library-0.1.0.dist-info/WHEEL +5 -0
  4. eval_ai_library-0.1.0.dist-info/licenses/LICENSE +21 -0
  5. eval_ai_library-0.1.0.dist-info/top_level.txt +1 -0
  6. eval_lib/__init__.py +122 -0
  7. eval_lib/agent_metrics/__init__.py +12 -0
  8. eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +231 -0
  9. eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +251 -0
  10. eval_lib/agent_metrics/task_success_metric/task_success_rate.py +347 -0
  11. eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +106 -0
  12. eval_lib/datagenerator/datagenerator.py +230 -0
  13. eval_lib/datagenerator/document_loader.py +510 -0
  14. eval_lib/datagenerator/prompts.py +192 -0
  15. eval_lib/evaluate.py +335 -0
  16. eval_lib/evaluation_schema.py +63 -0
  17. eval_lib/llm_client.py +286 -0
  18. eval_lib/metric_pattern.py +229 -0
  19. eval_lib/metrics/__init__.py +25 -0
  20. eval_lib/metrics/answer_precision_metric/answer_precision.py +405 -0
  21. eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +195 -0
  22. eval_lib/metrics/bias_metric/bias.py +114 -0
  23. eval_lib/metrics/contextual_precision_metric/contextual_precision.py +102 -0
  24. eval_lib/metrics/contextual_recall_metric/contextual_recall.py +91 -0
  25. eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +169 -0
  26. eval_lib/metrics/custom_metric/custom_eval.py +303 -0
  27. eval_lib/metrics/faithfulness_metric/faithfulness.py +140 -0
  28. eval_lib/metrics/geval/geval.py +326 -0
  29. eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +102 -0
  30. eval_lib/metrics/toxicity_metric/toxicity.py +113 -0
  31. eval_lib/price.py +37 -0
  32. eval_lib/py.typed +1 -0
  33. eval_lib/testcases_schema.py +27 -0
  34. eval_lib/utils.py +99 -0
eval_lib/metric_pattern.py
@@ -0,0 +1,229 @@
+ # metric_pattern.py
+ """
+ Base classes for evaluation metrics with beautiful console logging.
+ """
+ import json
+ import time
+ from typing import Type, Dict, Any, Optional
+
+ from eval_lib.testcases_schema import EvalTestCase, ConversationalEvalTestCase
+ from eval_lib.llm_client import chat_complete
+
+
+ # ANSI color codes for beautiful console output
+ class Colors:
+     HEADER = '\033[95m'
+     BLUE = '\033[94m'
+     CYAN = '\033[96m'
+     GREEN = '\033[92m'
+     YELLOW = '\033[93m'
+     RED = '\033[91m'
+     ENDC = '\033[0m'
+     BOLD = '\033[1m'
+     UNDERLINE = '\033[4m'
+     DIM = '\033[2m'
+
+
+ class MetricPattern:
+     """
+     Base class for metrics that use a pattern-based approach to evaluation.
+     This class is designed to be subclassed for specific metrics, which set `self.template`.
+     """
+     name: str  # name of the metric
+
+     def __init__(self, model: str, threshold: float, verbose: bool = True):
+         self.model = model
+         self.threshold = threshold
+         self.verbose = verbose
+
+     def _log(self, message: str, color: str = Colors.CYAN):
+         """Log message with color if verbose mode is enabled"""
+         if self.verbose:
+             print(f"{color}{message}{Colors.ENDC}")
+
+     def _log_step(self, step_name: str, step_num: Optional[int] = None):
+         """Log evaluation step"""
+         if self.verbose:
+             prefix = f"[{step_num}] " if step_num else ""
+             print(f"{Colors.DIM} {prefix}{step_name}...{Colors.ENDC}")
+
+     async def evaluate(self, test_case: EvalTestCase) -> Dict[str, Any]:
+         """
+         Base evaluation method - override in subclasses for custom behavior.
+         """
+         start_time = time.time()
+
+         if self.verbose:
+             print(f"\n{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
+             print(f"{Colors.BOLD}{Colors.BLUE}πŸ” Evaluating: {self.name}{Colors.ENDC}")
+             print(f"{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
+             print(f"{Colors.DIM}Model: {self.model}{Colors.ENDC}")
+             print(f"{Colors.DIM}Threshold: {self.threshold}{Colors.ENDC}")
+
+         self._log_step("Generating evaluation prompt", 1)
+
+         # 1) Generate prompt (self.template is provided by the subclass)
+         prompt = self.template.generate_prompt(
+             test_case=test_case,
+             threshold=self.threshold
+         )
+
+         self._log_step("Calling LLM", 2)
+
+         # 2) Make API call
+         text, cost = await chat_complete(
+             self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0.0
+         )
+
+         self._log_step("Parsing response", 3)
+
+         # 3) Parse the response
+         try:
+             data = json.loads(text)
+         except Exception as e:
+             self._log(f"❌ Failed to parse JSON: {e}", Colors.RED)
+             raise RuntimeError(
+                 f"Cannot parse JSON from model response: {e}\n{text}")
+
+         score = float(data.get("score", 0.0))
+         reason = data.get("reason")
+         success = score >= self.threshold
+
+         # Calculate elapsed time
+         elapsed_time = time.time() - start_time
+
+         # Log results
+         if self.verbose:
+             print(f"\n{Colors.BOLD}πŸ“Š Results:{Colors.ENDC}")
+             score_color = Colors.GREEN if success else Colors.RED
+             success_icon = "βœ…" if success else "❌"
+             print(
+                 f" {success_icon} Status: {score_color}{Colors.BOLD}{'PASSED' if success else 'FAILED'}{Colors.ENDC}")
+             print(
+                 f" πŸ“ˆ Score: {score_color}{score:.2f}{Colors.ENDC} (threshold: {self.threshold})")
+             print(f" πŸ’° Cost: {Colors.YELLOW}${cost:.6f}{Colors.ENDC}")
+             print(f" ⏱️ Time: {Colors.DIM}{elapsed_time:.2f}s{Colors.ENDC}")
+             if reason:
+                 print(
+                     f" πŸ’¬ Reason: {Colors.DIM}{reason[:100]}{'...' if len(reason) > 100 else ''}{Colors.ENDC}")
+
+         return {
+             "score": score,
+             "success": success,
+             "reason": reason,
+             "evaluation_cost": cost,
+         }
+
+
+ class ConversationalMetricPattern:
+     """
+     Base class for conversational metrics (evaluating full dialogues).
+     Used for metrics like RoleAdherence, DialogueCoherence, etc.
+     """
+     name: str
+     template_cls: Type
+
+     def __init__(self, model: str, threshold: float, verbose: bool = True):
+         self.model = model
+         self.threshold = threshold
+         self.verbose = verbose
+         if self.template_cls:
+             self.template = self.template_cls()
+         else:
+             self.template = None
+         self.chatbot_role: Optional[str] = None
+
+     def _log(self, message: str, color: str = Colors.CYAN):
+         """Log message with color if verbose mode is enabled"""
+         if self.verbose:
+             print(f"{color}{message}{Colors.ENDC}")
+
+     def _log_step(self, step_name: str, step_num: Optional[int] = None):
+         """Log evaluation step"""
+         if self.verbose:
+             prefix = f"[{step_num}] " if step_num else ""
+             print(f"{Colors.DIM} {prefix}{step_name}...{Colors.ENDC}")
+
+     async def evaluate(self, test_case: ConversationalEvalTestCase) -> Dict[str, Any]:
+         """
+         Evaluate conversational test case with logging.
+         """
+         start_time = time.time()
+
+         if self.verbose:
+             print(f"\n{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
+             print(
+                 f"{Colors.BOLD}{Colors.BLUE}πŸ’¬ Evaluating Conversation: {self.name}{Colors.ENDC}")
+             print(f"{Colors.BOLD}{Colors.BLUE}{'='*60}{Colors.ENDC}")
+             print(f"{Colors.DIM}Model: {self.model}{Colors.ENDC}")
+             print(f"{Colors.DIM}Threshold: {self.threshold}{Colors.ENDC}")
+             print(f"{Colors.DIM}Turns: {len(test_case.turns)}{Colors.ENDC}")
+
+         self._log_step("Generating evaluation prompt", 1)
+
+         # 1. Generate prompt
+         if hasattr(self.template, "generate_prompt"):
+             try:
+                 prompt = self.template.generate_prompt(
+                     test_case=test_case,
+                     threshold=self.threshold,
+                     chatbot_role=self.chatbot_role
+                 )
+             except TypeError:
+                 # Template does not accept chatbot_role
+                 prompt = self.template.generate_prompt(
+                     test_case=test_case,
+                     threshold=self.threshold
+                 )
+         else:
+             raise RuntimeError("Template is missing method generate_prompt")
+
+         self._log_step("Calling LLM", 2)
+
+         # 2. Call API
+         text, cost = await chat_complete(
+             self.model,
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0.0
+         )
+
+         self._log_step("Parsing response", 3)
+
+         # 3. Parse response
+         try:
+             data = json.loads(text)
+         except Exception as e:
+             self._log(f"❌ Failed to parse JSON: {e}", Colors.RED)
+             raise RuntimeError(
+                 f"Cannot parse JSON from model response: {e}\n{text}")
+
+         score = float(data.get("score", 0.0))
+         reason = data.get("reason")
+         success = score >= self.threshold
+
+         # Calculate elapsed time
+         elapsed_time = time.time() - start_time
+
+         # Log results
+         if self.verbose:
+             print(f"\n{Colors.BOLD}πŸ“Š Results:{Colors.ENDC}")
+             score_color = Colors.GREEN if success else Colors.RED
+             success_icon = "βœ…" if success else "❌"
+             print(
+                 f" {success_icon} Status: {score_color}{Colors.BOLD}{'PASSED' if success else 'FAILED'}{Colors.ENDC}")
+             print(
+                 f" πŸ“ˆ Score: {score_color}{score:.2f}{Colors.ENDC} (threshold: {self.threshold})")
+             print(f" πŸ’° Cost: {Colors.YELLOW}${cost:.6f}{Colors.ENDC}")
+             print(f" ⏱️ Time: {Colors.DIM}{elapsed_time:.2f}s{Colors.ENDC}")
+             if reason:
+                 print(
+                     f" πŸ’¬ Reason: {Colors.DIM}{reason[:100]}{'...' if len(reason) > 100 else ''}{Colors.ENDC}")
+
+         return {
+             "score": score,
+             "success": success,
+             "reason": reason,
+             "evaluation_cost": cost,
+         }
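
The base evaluate() above depends on a contract that is only implicit in this file: the subclass must attach a prompt template whose prompt asks the judge model to reply with a JSON object carrying "score" and "reason" keys, which is what the json.loads() branch expects. A minimal reviewer sketch of such a subclass follows; EchoTemplate and EchoMetric are invented names used only for illustration, while MetricPattern, EvalTestCase and chat_complete are the real names from this diff.

# Reviewer sketch, not part of the package.
from eval_lib.metric_pattern import MetricPattern
from eval_lib.testcases_schema import EvalTestCase


class EchoTemplate:
    """Hypothetical template: builds the judge prompt for one test case."""

    def generate_prompt(self, test_case: EvalTestCase, threshold: float) -> str:
        return (
            "Rate from 0.0 to 1.0 how well the actual output matches the expected output.\n"
            f"Expected: {test_case.expected_output}\n"
            f"Actual: {test_case.actual_output}\n"
            'Reply with JSON only: {"score": <float>, "reason": "<short explanation>"}'
        )


class EchoMetric(MetricPattern):
    name = "echoMetric"

    def __init__(self, model: str, threshold: float = 0.5, verbose: bool = True):
        super().__init__(model=model, threshold=threshold, verbose=verbose)
        # Base evaluate() reads self.template.generate_prompt(...)
        self.template = EchoTemplate()

An instance is then awaited like any other metric, for example await EchoMetric(model="<judge-model-id>", threshold=0.7).evaluate(test_case), and returns the score/success/reason/evaluation_cost dictionary built at the end of evaluate().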
eval_lib/metrics/__init__.py
@@ -0,0 +1,25 @@
+ from eval_lib.metrics.answer_relevancy_metric.answer_relevancy import AnswerRelevancyMetric
+ from eval_lib.metrics.faithfulness_metric.faithfulness import FaithfulnessMetric
+ from eval_lib.metrics.contextual_relevancy_metric.contextual_relevancy import ContextualRelevancyMetric
+ from eval_lib.metrics.contextual_precision_metric.contextual_precision import ContextualPrecisionMetric
+ from eval_lib.metrics.contextual_recall_metric.contextual_recall import ContextualRecallMetric
+ from eval_lib.metrics.bias_metric.bias import BiasMetric
+ from eval_lib.metrics.toxicity_metric.toxicity import ToxicityMetric
+ from eval_lib.metrics.geval.geval import GEval
+ from eval_lib.metrics.custom_metric.custom_eval import CustomEvalMetric
+ from eval_lib.metrics.restricted_refusal_metric.restricted_refusal import RestrictedRefusalMetric
+ from eval_lib.metrics.answer_precision_metric.answer_precision import AnswerPrecisionMetric
+
+ __all__ = [
+     "AnswerRelevancyMetric",
+     "AnswerPrecisionMetric",
+     "FaithfulnessMetric",
+     "ContextualRelevancyMetric",
+     "ContextualPrecisionMetric",
+     "ContextualRecallMetric",
+     "BiasMetric",
+     "ToxicityMetric",
+     "GEval",
+     "RestrictedRefusalMetric",
+     "CustomEvalMetric"
+ ]
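
This __init__.py flattens the nested per-metric packages into a single import surface. A short illustration (the model string is a placeholder; constructor signatures other than AnswerPrecisionMetric's, shown in the next hunk, are not visible in this diff):

# Import from the package root instead of the nested module path
# eval_lib.metrics.answer_precision_metric.answer_precision.
from eval_lib.metrics import AnswerPrecisionMetric

metric = AnswerPrecisionMetric(model="<judge-model-id>", threshold=0.8)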
eval_lib/metrics/answer_precision_metric/answer_precision.py
@@ -0,0 +1,405 @@
+ '''
+ Answer Precision Metric: evaluates how precisely a model's answer matches the expected answer
+ using multiple text similarity components and weighted aggregation.
+
+ Score components (combined with a weighted power mean):
+ - Token Containment: overlap coefficient between the actual and expected token sets.
+ - Character Similarity: ratio of matching characters after normalization.
+ - Token Precision: proportion of tokens in the actual answer that also appear in the expected answer.
+ - Numeric Agreement: proportion of expected numeric values correctly represented in the actual answer.
+
+ Exact match and normalized match are also computed, but only feed the reason text
+ and evaluation log; they do not enter the weighted aggregation.
+ '''
+
+ from __future__ import annotations
+
+ import math
+ import re
+ import unicodedata
+ from dataclasses import dataclass, field
+ from difflib import SequenceMatcher
+ from typing import Any, Dict, List, Optional, Set, Tuple
+
+ from eval_lib.testcases_schema import EvalTestCase
+ from eval_lib.metric_pattern import MetricPattern
+
+
+ # -------------------------------
+ # Helpers to normalize and compare text
+ # -------------------------------
+
+ def _normalize_text_basic(text: str) -> str:
+     """Lowercase, strip, collapse whitespace; also strip markdown links/URLs."""
+     if text is None:
+         return ""
+     # [label](https://...) -> label
+     text = re.sub(r"\[([^\]]+)\]\(\s*https?://[^)]+\s*\)", r"\1", text)
+     # bare URLs
+     text = re.sub(r"https?://\S+", "", text)
+     text = unicodedata.normalize("NFKC", text).lower().strip()
+     text = re.sub(r"\s+", " ", text)
+     return text
+
+
+ _PUNCT_RE = re.compile(r"[!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]")
+
+
+ def _normalize_for_tokens(text: str) -> str:
+     text = _normalize_text_basic(text)
+     text = _PUNCT_RE.sub(" ", text)
+     text = re.sub(r"\s+", " ", text).strip()
+     return text
+
+
+ def _power_mean_score(components: Dict[str, float], weights: Dict[str, float], p: float = 0.3) -> float:
+     """
+     Weighted power mean:
+         M_p = ( sum_i alpha_i * s_i^p )^(1/p)
+     alpha_i = w_i / sum(w_i). Recommended 0 < p < 1.
+     """
+     keys = [k for k in components.keys() if k in weights]
+     if not keys:
+         return 0.0
+
+     raw = {k: max(0.0, float(weights[k])) for k in keys}
+     total = sum(raw.values())
+     if total <= 0:
+         alpha = {k: 1.0 / len(keys) for k in keys}
+     else:
+         alpha = {k: raw[k] / total for k in keys}
+
+     acc = 0.0
+     for k in keys:
+         s = max(0.0, min(1.0, float(components[k])))
+         acc += alpha[k] * (s ** p)
+     score = acc ** (1.0 / p)
+     return max(0.0, min(1.0, score))
+
+
+ STOPWORDS: Set[str] = {
+     "the", "a", "an", "and", "or", "but", "if", "then", "else",
+     "to", "of", "in", "on", "for", "with", "as", "by", "at",
+     "from", "that", "this", "it", "is", "are", "was", "were",
+ }
+
+
+ def _line_word_diffs(actual: str, expected: str, drop_stopwords: bool = True) -> List[Dict[str, Any]]:
+     """
+     Per-line diagnostics: words added/removed per line (human-readable).
+     """
+     a_lines = (actual or "").splitlines()
+     e_lines = (expected or "").splitlines()
+     n = max(len(a_lines), len(e_lines))
+     diffs: List[Dict[str, Any]] = []
+
+     def words(s: str) -> List[str]:
+         return _tokenize(s, drop_stopwords=drop_stopwords)
+
+     for i in range(n):
+         e_line = e_lines[i] if i < len(e_lines) else ""
+         a_line = a_lines[i] if i < len(a_lines) else ""
+         if _normalize_text_basic(e_line) == _normalize_text_basic(a_line):
+             continue
+         e_set = set(words(e_line))
+         a_set = set(words(a_line))
+         removed = sorted(list(e_set - a_set))
+         added = sorted(list(a_set - e_set))
+         if removed or added:
+             diffs.append({
+                 "line_no": i + 1,  # 1-based
+                 "expected": e_line,
+                 "actual": a_line,
+                 "removed": removed,
+                 "added": added,
+             })
+     return diffs
+
+
+ def _tokenize(text: str, drop_stopwords: bool = True) -> List[str]:
+     text = _normalize_for_tokens(text)
+     tokens = [t for t in text.split(" ") if t]
+     if drop_stopwords:
+         tokens = [t for t in tokens if t not in STOPWORDS]
+     return tokens
+
+
+ _NUM_RE = re.compile(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?")
+
+
+ def _extract_numbers(text: str) -> List[float]:
+     nums: List[float] = []
+     for m in _NUM_RE.finditer(text or ""):
+         try:
+             nums.append(float(m.group(0)))
+         except Exception:
+             pass
+     return nums
+
+
+ def _token_sets(a: str, e: str) -> Tuple[Set[str], Set[str]]:
+     a_tokens = set(_tokenize(a))
+     e_tokens = set(_tokenize(e))
+     return a_tokens, e_tokens
+
+
+ def _token_overlap_coefficient(a: str, e: str) -> float:
+     """
+     Overlap coefficient: |A ∩ E| / min(|A|, |E|).
+     """
+     A, E = _token_sets(a, e)
+     if not A and not E:
+         return 1.0
+     if not A or not E:
+         return 0.0
+     inter = len(A & E)
+     return inter / min(len(A), len(E))
+
+
+ @dataclass
+ class PrecisionConfig:
+     token_stopwords: Set[str] = field(default_factory=lambda: set(STOPWORDS))
+     numeric_tolerance_abs: float = 0.0
+     numeric_tolerance_rel: float = 0.0
+     require_expected_present: bool = True
+     weights: Optional[Dict[str, float]] = None
+     power_p: float = 0.3
+
+     def __post_init__(self):
+         if self.weights is None:
+             self.weights = {
+                 "contains": 0.48,
+                 "char_ratio": 0.32,
+                 "token_precision": 0.15,
+                 "numeric": 0.35,
+             }
+
+
+ class AnswerPrecisionMetric(MetricPattern):
+     name = "answerPrecisionMetric"
+
+     def __init__(self, model: str, threshold: float = 0.8, config: Optional[PrecisionConfig] = None):
+         super().__init__(model=model, threshold=threshold)
+         self.config = config or PrecisionConfig()
+
+     # --- core similarity components ---
+     def _exact_match(self, a: str, e: str) -> float:
+         return 1.0 if (a or "") == (e or "") else 0.0
+
+     def _normalized_match(self, a: str, e: str) -> float:
+         return 1.0 if _normalize_text_basic(a) == _normalize_text_basic(e) else 0.0
+
+     def _char_similarity(self, a: str, e: str) -> float:
+         a_norm = _normalize_text_basic(a)
+         e_norm = _normalize_text_basic(e)
+         if not a_norm and not e_norm:
+             return 1.0
+         return SequenceMatcher(a=a_norm, b=e_norm).ratio()
+
+     def _token_precision(self, a: str, e: str) -> Tuple[float, Dict[str, Any]]:
+         a_tokens = set(_tokenize(a))
+         e_tokens = set(_tokenize(e))
+         if not a_tokens:
+             return 1.0 if not e_tokens else 0.0, {
+                 "actual_tokens": [],
+                 "expected_tokens": sorted(list(e_tokens)),
+                 "true_positive": [],
+                 "false_positive": [],
+                 "false_negative": sorted(list(e_tokens)),
+             }
+         tp = sorted(list(a_tokens & e_tokens))
+         fp = sorted(list(a_tokens - e_tokens))
+         fn = sorted(list(e_tokens - a_tokens))
+         precision = len(tp) / (len(tp) + len(fp)
+                                ) if (len(tp) + len(fp)) > 0 else 0.0
+         return precision, {
+             "actual_tokens": sorted(list(a_tokens)),
+             "expected_tokens": sorted(list(e_tokens)),
+             "true_positive": tp,
+             "false_positive": fp,
+             "false_negative": fn,
+         }
+
+     def _numeric_agreement(self, a: str, e: str) -> Tuple[float, Dict[str, Any]]:
+         a_nums = _extract_numbers(a)
+         e_nums = _extract_numbers(e)
+         if not e_nums and not a_nums:
+             return 1.0, {"actual_numbers": [], "expected_numbers": [], "matches": [], "mismatches": []}
+         if not e_nums and a_nums:
+             return 0.0, {"actual_numbers": a_nums, "expected_numbers": [], "matches": [], "mismatches": a_nums}
+         tol_abs = self.config.numeric_tolerance_abs
+         tol_rel = self.config.numeric_tolerance_rel
+         used_idx: Set[int] = set()
+         matches: List[Tuple[float, float]] = []
+         mismatches: List[Tuple[float, float, float]] = []
+         for exp_v in e_nums:
+             best_i = None
+             best_err = math.inf
+             for i, act_v in enumerate(a_nums):
+                 if i in used_idx:
+                     continue
+                 err = abs(act_v - exp_v)
+                 if err < best_err:
+                     best_err = err
+                     best_i = i
+             if best_i is None:
+                 mismatches.append((exp_v, float("nan"), float("inf")))
+                 continue
+             act_v = a_nums[best_i]
+             used_idx.add(best_i)
+             rel_err = abs(act_v - exp_v) / (abs(exp_v) + 1e-12)
+             within = (best_err <= tol_abs) or (rel_err <= tol_rel)
+             if within:
+                 matches.append((exp_v, act_v))
+             else:
+                 mismatches.append((exp_v, act_v, rel_err))
+         score = len(matches) / \
+             len(e_nums) if e_nums else (1.0 if not a_nums else 0.0)
+         detail = {
+             "actual_numbers": a_nums,
+             "expected_numbers": e_nums,
+             "matches": matches,
+             "mismatches": mismatches,
+             "tolerance_abs": tol_abs,
+             "tolerance_rel": tol_rel,
+         }
+         return score, detail
+
+     # --- evaluation entrypoint ---
+     async def evaluate(self, test_case: EvalTestCase) -> Dict[str, Any]:
+         actual = test_case.actual_output or ""
+         expected = test_case.expected_output or ""
+
+         if self.config.require_expected_present and not test_case.expected_output:
+             return {
+                 "score": 0.0,
+                 "success": False,
+                 "reason": "No expected output provided.",
+                 "evaluation_cost": 0.0,
+                 "evaluation_log": {"note": "expected_output is required for AnswerPrecisionMetric"},
+             }
+
+         w = self.config.weights
+
+         exact = self._exact_match(actual, expected)
+         normalized = self._normalized_match(actual, expected)
+         contains = _token_overlap_coefficient(actual, expected)
+         char_ratio = self._char_similarity(actual, expected)
+         token_prec, token_detail = self._token_precision(actual, expected)
+         numeric_score, numeric_detail = self._numeric_agreement(
+             actual, expected)
+
+         # ------- Diagnostics / Issues -------
+         missing_terms = token_detail.get("false_negative", [])
+         extra_terms = token_detail.get("false_positive", [])
+
+         numeric_mismatches_raw = numeric_detail.get("mismatches", [])
+         line_diffs = _line_word_diffs(actual, expected, drop_stopwords=True)
+
+         # Human-readable diagnostics
+         num_msgs: List[str] = []
+         normalized_numeric_mismatches: List[Dict[str, Any]] = []
+         for m in numeric_mismatches_raw:
+             if isinstance(m, (list, tuple)) and len(m) >= 2:
+                 exp = m[0]
+                 act = m[1]
+                 rel = float(m[2]) if (isinstance(m, (list, tuple))
+                                       and len(m) > 2) else None
+                 num_msgs.append(f"expected {exp}, got {act}" + (
+                     f" (rel_err={rel:.4g})" if rel is not None and rel != float('inf') else ""))
+                 normalized_numeric_mismatches.append(
+                     {"expected": exp, "actual": act, "rel_err": rel})
+             else:
+                 # unexpected format
+                 num_msgs.append(f"unexpected number {m}")
+                 normalized_numeric_mismatches.append(
+                     {"expected": None, "actual": m, "rel_err": None})
+
+         if num_msgs:
+             numbers_summary = "numeric mismatches: " + "; ".join(num_msgs)
+         else:
+             numbers_summary = "Numbers: all matched."
+
+         missing_summary = f"Missing terms: {missing_terms}" if missing_terms else "Missing terms: none."
+         extra_summary = f"Extra terms: {extra_terms}" if extra_terms else "Extra terms: none."
+         diagnostics = f"{missing_summary} {extra_summary} {numbers_summary}"
+
+         # --- Aggregation via power mean ---
+         component_scores = {
+             "contains": contains,
+             "char_ratio": char_ratio,
+             "token_precision": token_prec,
+             "numeric": numeric_score,
+         }
+         base_score = _power_mean_score(
+             component_scores, w, p=self.config.power_p)
+
+         # Penalties
+         fp = token_detail["false_positive"]
+         heavy_penalty = 0.0
+         if len(fp) >= 5 and token_prec < 0.7:
+             heavy_penalty = 0.05
+
+         final_score = base_score * (1.0 - heavy_penalty)
+         final_score = max(0.0, min(1.0, final_score))
+         success = final_score >= self.threshold
+
+         # Human-readable reason
+         reason_bits: List[str] = []
+         if exact == 1.0:
+             reason_bits.append("exact match")
+         elif normalized == 1.0:
+             reason_bits.append("normalized match")
+         else:
+             reason_bits.append(f"char similarity {char_ratio:.2f}")
+             reason_bits.append(f"token precision {token_prec:.2f}")
+             if numeric_detail["expected_numbers"] or numeric_detail["actual_numbers"]:
+                 reason_bits.append(f"numeric agreement {numeric_score:.2f}")
+             if contains == 1.0:
+                 reason_bits.append("full containment")
+             elif contains > 0:
+                 reason_bits.append(f"containment {contains:.2f}")
+         if heavy_penalty > 0:
+             reason_bits.append("penalized for many extra tokens")
+         reason_bits.append(
+             f"power-mean aggregation (p={self.config.power_p:.2f}); {missing_summary} {extra_summary}")
+         reason = ", ".join(reason_bits)  # fix: build the reason string
+
+         evaluation_log = {
+             "actual": actual,
+             "expected": expected,
+             "components": {
+                 "exact": exact,
+                 "normalized": normalized,
+                 "contains": contains,
+                 "char_ratio": char_ratio,
+                 "token_precision": token_prec,
+                 "numeric": numeric_score,
+                 "weights": w,
+                 "heavy_penalty": heavy_penalty,
+             },
+             "token_detail": token_detail,
+             "numeric_detail": numeric_detail,
+             "issues": {
+                 "missing_terms": missing_terms,
+                 "extra_terms": extra_terms,
+                 "numeric_mismatches": normalized_numeric_mismatches,
+                 "line_diffs": line_diffs,
+             },
+             "threshold": self.threshold,
+             "config": {
+                 "numeric_tolerance_abs": self.config.numeric_tolerance_abs,
+                 "numeric_tolerance_rel": self.config.numeric_tolerance_rel,
+                 "require_expected_present": self.config.require_expected_present,
+                 "stopwords_count": len(self.config.token_stopwords),
+             },
+         }
+
+         return {
+             "score": round(final_score, 4),
+             "success": success,
+             "reason": reason,
+             "diagnostics": diagnostics,
+             "evaluation_cost": 0.0,
+             "evaluation_log": evaluation_log,
+         }
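
Unlike the LLM-judged metrics built on MetricPattern, AnswerPrecisionMetric never calls chat_complete: the score comes entirely from the four lexical and numeric components combined by the weighted power mean, so evaluation_cost is always 0.0 and the model argument is only stored by the base class. With the default weights and p = 0.3, a single weak component pulls the aggregate down harder than a plain weighted average would. A hedged end-to-end sketch follows; it assumes EvalTestCase can be constructed with just the two fields this metric reads (the real schema in eval_lib/testcases_schema.py is not shown in this diff and may require more fields).

# Reviewer sketch, not from the package docs.
import asyncio

from eval_lib.metrics import AnswerPrecisionMetric
from eval_lib.testcases_schema import EvalTestCase


async def main() -> None:
    metric = AnswerPrecisionMetric(model="unused-by-this-metric", threshold=0.8)
    case = EvalTestCase(
        actual_output="The invoice total is 42.50 EUR.",
        expected_output="Invoice total: 42.5 EUR",
    )
    result = await metric.evaluate(case)
    # Keys per the return statement above:
    # score, success, reason, diagnostics, evaluation_cost, evaluation_log
    print(result["score"], result["success"])
    print(result["diagnostics"])


asyncio.run(main())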