onetool-mcp 1.0.0b1 (onetool_mcp-1.0.0b1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bench/__init__.py +5 -0
- bench/cli.py +69 -0
- bench/harness/__init__.py +66 -0
- bench/harness/client.py +692 -0
- bench/harness/config.py +397 -0
- bench/harness/csv_writer.py +109 -0
- bench/harness/evaluate.py +512 -0
- bench/harness/metrics.py +283 -0
- bench/harness/runner.py +899 -0
- bench/py.typed +0 -0
- bench/reporter.py +629 -0
- bench/run.py +487 -0
- bench/secrets.py +101 -0
- bench/utils.py +16 -0
- onetool/__init__.py +4 -0
- onetool/cli.py +391 -0
- onetool/py.typed +0 -0
- onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
- onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
- onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
- onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
- onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
- ot/__init__.py +37 -0
- ot/__main__.py +6 -0
- ot/_cli.py +107 -0
- ot/_tui.py +53 -0
- ot/config/__init__.py +46 -0
- ot/config/defaults/bench.yaml +4 -0
- ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
- ot/config/defaults/diagram-templates/c4-context.puml +30 -0
- ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
- ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
- ot/config/defaults/diagram-templates/microservices.d2 +81 -0
- ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
- ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
- ot/config/defaults/onetool.yaml +25 -0
- ot/config/defaults/prompts.yaml +97 -0
- ot/config/defaults/servers.yaml +7 -0
- ot/config/defaults/snippets.yaml +4 -0
- ot/config/defaults/tool_templates/__init__.py +7 -0
- ot/config/defaults/tool_templates/extension.py +52 -0
- ot/config/defaults/tool_templates/isolated.py +61 -0
- ot/config/dynamic.py +121 -0
- ot/config/global_templates/__init__.py +2 -0
- ot/config/global_templates/bench-secrets-template.yaml +6 -0
- ot/config/global_templates/bench.yaml +9 -0
- ot/config/global_templates/onetool.yaml +27 -0
- ot/config/global_templates/secrets-template.yaml +44 -0
- ot/config/global_templates/servers.yaml +18 -0
- ot/config/global_templates/snippets.yaml +235 -0
- ot/config/loader.py +1087 -0
- ot/config/mcp.py +145 -0
- ot/config/secrets.py +190 -0
- ot/config/tool_config.py +125 -0
- ot/decorators.py +116 -0
- ot/executor/__init__.py +35 -0
- ot/executor/base.py +16 -0
- ot/executor/fence_processor.py +83 -0
- ot/executor/linter.py +142 -0
- ot/executor/pack_proxy.py +260 -0
- ot/executor/param_resolver.py +140 -0
- ot/executor/pep723.py +288 -0
- ot/executor/result_store.py +369 -0
- ot/executor/runner.py +496 -0
- ot/executor/simple.py +163 -0
- ot/executor/tool_loader.py +396 -0
- ot/executor/validator.py +398 -0
- ot/executor/worker_pool.py +388 -0
- ot/executor/worker_proxy.py +189 -0
- ot/http_client.py +145 -0
- ot/logging/__init__.py +37 -0
- ot/logging/config.py +315 -0
- ot/logging/entry.py +213 -0
- ot/logging/format.py +188 -0
- ot/logging/span.py +349 -0
- ot/meta.py +1555 -0
- ot/paths.py +453 -0
- ot/prompts.py +218 -0
- ot/proxy/__init__.py +21 -0
- ot/proxy/manager.py +396 -0
- ot/py.typed +0 -0
- ot/registry/__init__.py +189 -0
- ot/registry/models.py +57 -0
- ot/registry/parser.py +269 -0
- ot/registry/registry.py +413 -0
- ot/server.py +315 -0
- ot/shortcuts/__init__.py +15 -0
- ot/shortcuts/aliases.py +87 -0
- ot/shortcuts/snippets.py +258 -0
- ot/stats/__init__.py +35 -0
- ot/stats/html.py +250 -0
- ot/stats/jsonl_writer.py +283 -0
- ot/stats/reader.py +354 -0
- ot/stats/timing.py +57 -0
- ot/support.py +63 -0
- ot/tools.py +114 -0
- ot/utils/__init__.py +81 -0
- ot/utils/batch.py +161 -0
- ot/utils/cache.py +120 -0
- ot/utils/deps.py +403 -0
- ot/utils/exceptions.py +23 -0
- ot/utils/factory.py +179 -0
- ot/utils/format.py +65 -0
- ot/utils/http.py +202 -0
- ot/utils/platform.py +45 -0
- ot/utils/sanitize.py +130 -0
- ot/utils/truncate.py +69 -0
- ot_tools/__init__.py +4 -0
- ot_tools/_convert/__init__.py +12 -0
- ot_tools/_convert/excel.py +279 -0
- ot_tools/_convert/pdf.py +254 -0
- ot_tools/_convert/powerpoint.py +268 -0
- ot_tools/_convert/utils.py +358 -0
- ot_tools/_convert/word.py +283 -0
- ot_tools/brave_search.py +604 -0
- ot_tools/code_search.py +736 -0
- ot_tools/context7.py +495 -0
- ot_tools/convert.py +614 -0
- ot_tools/db.py +415 -0
- ot_tools/diagram.py +1604 -0
- ot_tools/diagram.yaml +167 -0
- ot_tools/excel.py +1372 -0
- ot_tools/file.py +1348 -0
- ot_tools/firecrawl.py +732 -0
- ot_tools/grounding_search.py +646 -0
- ot_tools/package.py +604 -0
- ot_tools/py.typed +0 -0
- ot_tools/ripgrep.py +544 -0
- ot_tools/scaffold.py +471 -0
- ot_tools/transform.py +213 -0
- ot_tools/web_fetch.py +384 -0
bench/harness/evaluate.py
@@ -0,0 +1,512 @@
"""Evaluation module for benchmark responses.

Supports three evaluation methods:
1. Regex - Pattern matching with expect_match flag
2. Deterministic - Contains checks for strings, lists, dicts, scalars
3. LLM-as-judge - AI-based evaluation with custom prompts

Usage:
    The main entry point is `evaluate_task()` which routes to the appropriate
    evaluation method based on the EvaluateConfig.

    Evaluation is called AFTER task completion to ensure task duration excludes
    evaluation time. The runner handles this in the task loop.
"""

from __future__ import annotations

import json
import re
from typing import TYPE_CHECKING, Any

from loguru import logger
from openai import OpenAI

from bench.harness.metrics import EvaluationResult, TaskResult
from bench.secrets import get_bench_secret

if TYPE_CHECKING:
    from bench.harness.config import EvaluateConfig, HarnessConfig, TaskConfig


# =============================================================================
# Helper Functions
# =============================================================================


def _normalize_value(value: Any) -> str:
    """Convert a value to string for comparison."""
    if isinstance(value, str):
        return value
    if isinstance(value, bool):
        # Return both Python and JSON representations for matching
        return str(value)  # "True" or "False"
    return json.dumps(value, sort_keys=True)


def _check_pattern(pattern: Any, response: str) -> bool:
    """Check if a pattern matches the response.

    Args:
        pattern: String, dict with 'regex' key, or other value
        response: Response text to check

    Returns:
        True if pattern matches
    """
    if isinstance(pattern, dict) and "regex" in pattern:
        return bool(re.search(pattern["regex"], response))
    elif isinstance(pattern, str):
        return pattern in response
    else:
        # For numbers, bools, etc - convert to string and check contains
        return _normalize_value(pattern) in response


def _list_is_expected_output(lst: list[Any]) -> bool:
    """Check if list represents an expected output (not patterns to check).

    Returns True for lists like [97, 101] or [True, False, True] that should
    be checked as serialized JSON. Returns False for lists with strings or
    regex patterns that should be checked individually.
    """
    # Lists with any strings are treated as patterns to check
    if any(isinstance(item, str) for item in lst):
        return False
    # Lists with regex dicts are patterns
    if any(isinstance(item, dict) and "regex" in item for item in lst):
        return False
    # Pure numeric/boolean lists are expected outputs
    return all(isinstance(item, (int, float, bool)) or item is None for item in lst)


def _truncate(s: str, max_len: int = 100) -> str:
    """Truncate string for display."""
    if len(s) <= max_len:
        return s
    return s[:max_len] + "..."


def _find_actual_match(
    response: str, pattern: str, max_context: int = 50
) -> str | None:
    """Find where a pattern appears in response and extract context."""
    idx = response.find(pattern)
    if idx == -1:
        return None
    start = max(0, idx - 10)
    end = min(len(response), idx + len(pattern) + max_context)
    return response[start:end]


# =============================================================================
# Evaluation Methods
# =============================================================================


def evaluate_deterministic(
    response: str,
    expected: str | list[Any] | dict[str, Any] | int | float | bool,
    expect_error: bool = False,
) -> EvaluationResult:
    """Evaluate response against expected value(s) deterministically.

    Args:
        response: The response text to evaluate
        expected: Expected value(s) - string, list, dict, or scalar
        expect_error: If True, test expects an error. Failure to match means LLM fixed the code.

    Returns:
        EvaluationResult with pass/fail status and expected/actual values
    """
    if isinstance(expected, list):
        # For lists of pure numbers/booleans, check if the serialized list appears
        if _list_is_expected_output(expected):
            # Try multiple representations
            representations = [
                json.dumps(expected),  # [true, false, ...]
                str(expected),  # [True, False, ...] - Python repr
                repr(expected),  # [True, False, ...]
            ]
            for rep in representations:
                if rep in response:
                    actual = _find_actual_match(response, rep)
                    return EvaluationResult(
                        score=100,
                        reason="Expected list found",
                        eval_type="pass_fail",
                        passed=True,
                        expected=_truncate(rep),
                        actual=actual,
                    )
            return EvaluationResult(
                score=0,
                reason="Expected list not found",
                eval_type="pass_fail",
                passed=False,
                expected=_truncate(representations[0]),
                actual=_truncate(response, 200),
            )

        # For lists with patterns (regex dicts), check each item
        missing = []
        found = []
        for item in expected:
            if _check_pattern(item, response):
                if isinstance(item, dict) and "regex" in item:
                    found.append(f"regex:{item['regex'][:30]}")
                else:
                    found.append(str(item)[:30])
            else:
                if isinstance(item, dict) and "regex" in item:
                    missing.append(f"regex:{item['regex']}")
                else:
                    missing.append(str(item))

        if missing:
            # When expect_error=True and LLM fixed the code (no error patterns matched),
            # this is a PASS - demonstrates LLM's ability to fix small errors
            if expect_error:
                return EvaluationResult(
                    score=100,
                    reason="LLM fixed the error",
                    eval_type="pass_fail",
                    passed=True,
                    expected="error or fix",
                    actual="LLM fixed code",
                )
            reason = (
                f"Missing: {', '.join(missing[:3])}{'...' if len(missing) > 3 else ''}"
            )
            return EvaluationResult(
                score=0,
                reason=reason,
                eval_type="pass_fail",
                passed=False,
                expected=_truncate(str(expected)),
                actual=f"Found: {', '.join(found[:3])}" if found else "None matched",
            )
        # When expect_error=True and error patterns matched, the error was returned
        reason = (
            "Error returned"
            if expect_error
            else f"All {len(expected)} expected items found"
        )
        actual = (
            "Error in response"
            if expect_error
            else f"All {len(expected)} patterns matched"
        )
        return EvaluationResult(
            score=100,
            reason=reason,
            eval_type="pass_fail",
            passed=True,
            expected="error or fix" if expect_error else _truncate(str(expected)),
            actual=actual,
        )

    elif isinstance(expected, dict):
        # Check if dict is in response (JSON serialized)
        expected_str = _normalize_value(expected)
        if expected_str in response:
            actual = _find_actual_match(response, expected_str)
            return EvaluationResult(
                score=100,
                reason="Expected dict found in response",
                eval_type="pass_fail",
                passed=True,
                expected=_truncate(expected_str),
                actual=actual,
            )
        # Try checking each key-value pair
        missing = []
        found_keys = []
        for key, _value in expected.items():
            pattern = f'"{key}"' if isinstance(key, str) else str(key)
            if pattern not in response:
                missing.append(key)
            else:
                found_keys.append(key)
        if missing:
            return EvaluationResult(
                score=0,
                reason=f"Missing keys: {', '.join(str(k) for k in missing[:3])}",
                eval_type="pass_fail",
                passed=False,
                expected=_truncate(expected_str),
                actual=f"Found keys: {', '.join(str(k) for k in found_keys[:3])}"
                if found_keys
                else "No keys found",
            )
        return EvaluationResult(
            score=100,
            reason="All expected keys found",
            eval_type="pass_fail",
            passed=True,
            expected=_truncate(expected_str),
            actual=f"All {len(expected)} keys present",
        )

    else:
        # String or scalar - simple contains check
        expected_str = _normalize_value(expected)
        if expected_str in response:
            actual = _find_actual_match(response, expected_str)
            return EvaluationResult(
                score=100,
                reason="Expected value found",
                eval_type="pass_fail",
                passed=True,
                expected=_truncate(expected_str),
                actual=actual,
            )
        return EvaluationResult(
            score=0,
            reason="Expected value not found in response",
            eval_type="pass_fail",
            passed=False,
            expected=_truncate(expected_str),
            actual=_truncate(response, 200),
        )


def evaluate_llm(
    response: str,
    config: EvaluateConfig,
    expected: Any = None,
) -> EvaluationResult:
    """Evaluate response using LLM-as-judge.

    Args:
        response: The response text to evaluate
        config: Evaluation config with prompt and model
        expected: Optional expected value for substitution

    Returns:
        EvaluationResult with score (0-100) and reason
    """
    if not config.prompt:
        return EvaluationResult(
            score=0,
            reason="No evaluation prompt configured",
            eval_type="scored",
        )

    if not config.model:
        return EvaluationResult(
            score=0,
            reason="No evaluation model configured - set evaluator.model in YAML",
            eval_type="scored",
        )

    client = OpenAI(
        api_key=get_bench_secret("OPENAI_API_KEY"),
        base_url=get_bench_secret("OPENAI_BASE_URL"),
    )

    # Format the evaluation prompt
    prompt = config.prompt.replace("{response}", response)
    if expected is not None:
        expected_str = _normalize_value(expected)
        prompt = prompt.replace("{expected}", expected_str)

    try:
        llm_response = client.chat.completions.create(
            model=config.model,
            messages=[{"role": "user", "content": prompt}],
        )

        content = llm_response.choices[0].message.content or ""

        # Strip markdown code blocks if present
        content = re.sub(r"```json\s*", "", content)
        content = re.sub(r"```\s*", "", content)

        # Try to parse JSON response
        brace_start = content.find("{")
        if brace_start != -1:
            depth = 0
            brace_end = -1
            for i, c in enumerate(content[brace_start:], brace_start):
                if c == "{":
                    depth += 1
                elif c == "}":
                    depth -= 1
                    if depth == 0:
                        brace_end = i + 1
                        break

            if brace_end > brace_start:
                json_str = content[brace_start:brace_end]
                try:
                    data = json.loads(json_str)
                    score = int(data.get("score") or 5)
                    reason = data.get("reason", "No reason provided")
                    if isinstance(reason, dict):
                        reason = json.dumps(reason)
                    return EvaluationResult(
                        score=score,
                        reason=str(reason),
                        eval_type="scored",
                    )
                except (json.JSONDecodeError, ValueError):
                    pass

        # Fallback: try to extract score from text (e.g., "7/10")
        score_match = re.search(r"(\d+)\s*/?\s*10", content)
        if score_match:
            # Scale from 0-10 to 0-100
            score = int(score_match.group(1)) * 10
            return EvaluationResult(
                score=score,
                reason=_truncate(content, 200),
                eval_type="scored",
            )

        logger.warning(f"Could not parse evaluation response: {content[:100]}")
        return EvaluationResult(
            score=5,
            reason="Could not parse evaluation response",
            eval_type="scored",
        )

    except Exception as e:
        logger.error(f"Evaluation failed: {e}")
        return EvaluationResult(
            score=0,
            reason=f"Evaluation error: {e}",
            eval_type="scored",
        )


# =============================================================================
# Evaluator Resolution and Task Evaluation
# =============================================================================


def resolve_evaluator(
    task: TaskConfig,
    harness_config: HarnessConfig,
) -> EvaluateConfig | None:
    """Resolve the evaluator config for a task.

    Args:
        task: The task configuration
        harness_config: The harness configuration with evaluators dict

    Returns:
        Resolved EvaluateConfig or None if no evaluation
    """
    if task.evaluate is None:
        return None

    if isinstance(task.evaluate, str):
        # Reference to named evaluator
        if task.evaluate in harness_config.evaluators:
            return harness_config.evaluators[task.evaluate]
        logger.warning(f"Unknown evaluator '{task.evaluate}', skipping evaluation")
        return None

    # Inline EvaluateConfig
    return task.evaluate


def evaluate_regex(
    response: str,
    pattern: str,
    expect_match: bool = True,
) -> EvaluationResult:
    """Evaluate response against a regex pattern.

    Args:
        response: The response text to evaluate
        pattern: Regex pattern to match
        expect_match: If True, pattern must match. If False, must NOT match.

    Returns:
        EvaluationResult with pass/fail status
    """
    match = re.search(pattern, response)

    if expect_match:
        if match:
            return EvaluationResult(
                score=100,
                reason="Regex pattern matched",
                eval_type="pass_fail",
                passed=True,
                expected=f"match: {_truncate(pattern, 50)}",
                actual=_truncate(match.group(0), 100),
            )
        return EvaluationResult(
            score=0,
            reason="Regex pattern did not match",
            eval_type="pass_fail",
            passed=False,
            expected=f"match: {_truncate(pattern, 50)}",
            actual=_truncate(response, 200),
        )
    else:
        # expect_match=False means pattern must NOT match
        if not match:
            return EvaluationResult(
                score=100,
                reason="Regex pattern correctly did not match",
                eval_type="pass_fail",
                passed=True,
                expected=f"no match: {_truncate(pattern, 50)}",
                actual="No match found",
            )
        return EvaluationResult(
            score=0,
            reason="Regex pattern matched when it should not",
            eval_type="pass_fail",
            passed=False,
            expected=f"no match: {_truncate(pattern, 50)}",
            actual=_truncate(match.group(0), 100),
        )


def evaluate_task(
    task_result: TaskResult,
    task: TaskConfig,
    harness_config: HarnessConfig,
) -> EvaluationResult | None:
    """Evaluate a task result.

    Args:
        task_result: The task result with response
        task: The task configuration
        harness_config: The harness configuration

    Returns:
        EvaluationResult or None if no evaluation configured
    """
    eval_config = resolve_evaluator(task, harness_config)

    if eval_config is None:
        return None

    # Regex evaluation if regex pattern is set
    if eval_config.regex is not None:
        return evaluate_regex(
            task_result.response,
            eval_config.regex,
            expect_match=eval_config.expect_match,
        )

    # Deterministic evaluation if expected is set
    if eval_config.expected is not None:
        return evaluate_deterministic(
            task_result.response,
            eval_config.expected,
            expect_error=eval_config.expect_error,
        )

    # LLM evaluation if prompt is set
    if eval_config.prompt:
        return evaluate_llm(task_result.response, eval_config)

    # No evaluation method configured
    return None
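The evaluators above are plain module-level functions, so they can be exercised outside the bench runner. Below is a minimal sketch, not part of the package, that drives the deterministic and regex paths directly; it assumes the wheel is installed so that `bench.harness.evaluate` is importable, and the sample response string is invented for illustration.

# Minimal usage sketch (illustrative only, not shipped with the package).
from bench.harness.evaluate import evaluate_deterministic, evaluate_regex

response = 'Tool output: {"status": "ok", "count": 3} in 2 calls.'

# Dict expectation: the sort_keys JSON serialization may not appear verbatim in
# the response, in which case evaluate_deterministic falls back to checking
# that each key appears.
det = evaluate_deterministic(response, {"status": "ok", "count": 3})
print(det.passed, det.score, det.reason)  # True 100 "All expected keys found"

# Regex with expect_match=False passes only when the pattern is absent.
rx = evaluate_regex(
    response, r"Traceback \(most recent call last\)", expect_match=False
)
print(rx.passed, rx.reason)  # True "Regex pattern correctly did not match"

Scored LLM-as-judge evaluation follows the same shape but additionally requires `evaluator.model` to be configured, a prompt containing `{response}` (and optionally `{expected}`) placeholders, and the OPENAI_API_KEY / OPENAI_BASE_URL bench secrets.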