tsugite-cli 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsugite/__init__.py +6 -0
- tsugite/agent_composition.py +163 -0
- tsugite/agent_inheritance.py +479 -0
- tsugite/agent_preparation.py +236 -0
- tsugite/agent_runner/__init__.py +45 -0
- tsugite/agent_runner/helpers.py +106 -0
- tsugite/agent_runner/history_integration.py +248 -0
- tsugite/agent_runner/metrics.py +100 -0
- tsugite/agent_runner/runner.py +1879 -0
- tsugite/agent_runner/validation.py +70 -0
- tsugite/agent_utils.py +167 -0
- tsugite/attachments/__init__.py +65 -0
- tsugite/attachments/auto_context.py +199 -0
- tsugite/attachments/base.py +34 -0
- tsugite/attachments/file.py +51 -0
- tsugite/attachments/inline.py +31 -0
- tsugite/attachments/storage.py +178 -0
- tsugite/attachments/url.py +59 -0
- tsugite/attachments/youtube.py +101 -0
- tsugite/benchmark/__init__.py +62 -0
- tsugite/benchmark/config.py +183 -0
- tsugite/benchmark/core.py +292 -0
- tsugite/benchmark/discovery.py +377 -0
- tsugite/benchmark/evaluators.py +671 -0
- tsugite/benchmark/execution.py +657 -0
- tsugite/benchmark/metrics.py +204 -0
- tsugite/benchmark/reports.py +420 -0
- tsugite/benchmark/utils.py +288 -0
- tsugite/builtin_agents/chat-assistant.md +53 -0
- tsugite/builtin_agents/default.md +140 -0
- tsugite/builtin_agents.py +5 -0
- tsugite/cache.py +195 -0
- tsugite/cli/__init__.py +1042 -0
- tsugite/cli/agents.py +148 -0
- tsugite/cli/attachments.py +193 -0
- tsugite/cli/benchmark.py +663 -0
- tsugite/cli/cache.py +113 -0
- tsugite/cli/config.py +272 -0
- tsugite/cli/helpers.py +534 -0
- tsugite/cli/history.py +193 -0
- tsugite/cli/init.py +387 -0
- tsugite/cli/mcp.py +193 -0
- tsugite/cli/tools.py +419 -0
- tsugite/config.py +204 -0
- tsugite/console.py +48 -0
- tsugite/constants.py +21 -0
- tsugite/core/__init__.py +19 -0
- tsugite/core/agent.py +774 -0
- tsugite/core/executor.py +300 -0
- tsugite/core/memory.py +67 -0
- tsugite/core/tools.py +271 -0
- tsugite/docker_cli.py +270 -0
- tsugite/events/__init__.py +55 -0
- tsugite/events/base.py +46 -0
- tsugite/events/bus.py +62 -0
- tsugite/events/events.py +224 -0
- tsugite/exceptions.py +40 -0
- tsugite/history/__init__.py +29 -0
- tsugite/history/index.py +210 -0
- tsugite/history/models.py +106 -0
- tsugite/history/storage.py +157 -0
- tsugite/mcp_client.py +219 -0
- tsugite/mcp_config.py +174 -0
- tsugite/md_agents.py +751 -0
- tsugite/models.py +257 -0
- tsugite/renderer.py +151 -0
- tsugite/shell_tool_config.py +265 -0
- tsugite/templates/assistant.md +14 -0
- tsugite/tools/__init__.py +265 -0
- tsugite/tools/agents.py +312 -0
- tsugite/tools/edit_strategies.py +393 -0
- tsugite/tools/fs.py +329 -0
- tsugite/tools/http.py +239 -0
- tsugite/tools/interactive.py +430 -0
- tsugite/tools/shell.py +129 -0
- tsugite/tools/shell_tools.py +214 -0
- tsugite/tools/tasks.py +339 -0
- tsugite/tsugite.py +7 -0
- tsugite/ui/__init__.py +46 -0
- tsugite/ui/base.py +638 -0
- tsugite/ui/chat.py +265 -0
- tsugite/ui/chat.tcss +92 -0
- tsugite/ui/chat_history.py +286 -0
- tsugite/ui/helpers.py +102 -0
- tsugite/ui/jsonl.py +125 -0
- tsugite/ui/live_template.py +529 -0
- tsugite/ui/plain.py +419 -0
- tsugite/ui/textual_chat.py +642 -0
- tsugite/ui/textual_handler.py +225 -0
- tsugite/ui/widgets/__init__.py +6 -0
- tsugite/ui/widgets/base_scroll_log.py +27 -0
- tsugite/ui/widgets/message_list.py +121 -0
- tsugite/ui/widgets/thought_log.py +80 -0
- tsugite/ui_context.py +90 -0
- tsugite/utils.py +367 -0
- tsugite/xdg.py +104 -0
- tsugite_cli-0.3.3.dist-info/METADATA +325 -0
- tsugite_cli-0.3.3.dist-info/RECORD +101 -0
- tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
- tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
- tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
|
@@ -0,0 +1,657 @@
|
|
|
1
|
+
"""Test execution logic for benchmark framework."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
from ..agent_runner import run_agent
|
|
12
|
+
from ..exceptions import AgentExecutionError
|
|
13
|
+
from .config import (
|
|
14
|
+
EVALUATION_WEIGHTS,
|
|
15
|
+
SIMILARITY_THRESHOLDS,
|
|
16
|
+
)
|
|
17
|
+
from .discovery import BenchmarkTest, TestCase
|
|
18
|
+
from .evaluators import CorrectnessEvaluator, LLMEvaluator
|
|
19
|
+
from .metrics import BenchmarkTestResult
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class TestExecutor:
    """Executes benchmark tests against models.

    For each benchmark test a temporary copy of the agent file is written with
    the target model injected, every test case is run through ``run_agent``,
    and the output is scored by up to four evaluators (expected-output
    correctness, custom criteria, LLM judge, planning heuristics).
    """

    # Text appended to the prompt of test cases that set ``requires_plan``.
    _PLANNING_INSTRUCTION = (
        "\n"
        "\n"
        "PLANNING REQUIREMENT: Before executing this task, you must first create a detailed plan. "
        "Your response should be structured as follows:\n"
        "\n"
        "1. **PLAN SECTION**: Start with a clear, step-by-step plan that outlines how you will approach this task\n"
        "2. **EXECUTION SECTION**: Then proceed with executing the plan\n"
        "\n"
        "Make sure your plan includes the key steps and reasoning before you start executing."
    )

    def __init__(self, output_dir: Path, llm_evaluator_model: str = "openai:gpt-4o-mini"):
        """Initialize test executor.

        Args:
            output_dir: Directory for temporary files (created if missing)
            llm_evaluator_model: Model to use for LLM evaluation
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.correctness_evaluator = CorrectnessEvaluator()
        self.llm_evaluator = LLMEvaluator(evaluator_model=llm_evaluator_model)

    @staticmethod
    def _build_execution_trace(execution_steps) -> list:
        """Convert step objects into plain dicts that can be stored in metrics."""
        return [
            {
                "step": step.step_number,
                "thought": step.thought,
                "code": step.code,
                "output": step.output,
                "tools_called": step.tools_called,
                "error": step.error,
            }
            for step in execution_steps
        ]

    async def run_test(self, model_name: str, test: BenchmarkTest) -> BenchmarkTestResult:
        """Run all test cases for a single agent against a model.

        Args:
            model_name: Name of model to test
            test: Benchmark test to run

        Returns:
            Aggregated test result. Failures while preparing or running the
            agent are reported as a failed result (score 0.0) rather than
            raised.

        Raises:
            ValueError: If the test has no test cases.
        """
        start_time = time.time()

        if not test.test_cases:
            raise ValueError(f"No test cases found for {test.test_id}")

        try:
            # Create temporary agent file with target model
            temp_agent_path = self._create_temp_agent(test, model_name)

            try:
                (
                    case_results,
                    aggregated_output,
                    total_steps,
                    total_tokens,
                    total_cost,
                ) = await self._run_all_test_cases(test, temp_agent_path, model_name)

                # The test passes only if every case passed.
                total_passed = sum(1 for r in case_results if r["passed"])
                total_cases = len(test.test_cases)
                overall_passed = total_passed == total_cases

                # Weighted mean of the per-case scores.
                total_score = sum(r["score"] * tc.weight for r, tc in zip(case_results, test.test_cases))
                total_weight = sum(tc.weight for tc in test.test_cases)
                overall_score = total_score / total_weight if total_weight > 0 else 0.0

                aggregate_metrics = {
                    "case_results": case_results,
                    "cases_passed": total_passed,
                    "total_cases": total_cases,
                    "pass_rate": total_passed / total_cases,
                }

                duration = time.time() - start_time

                # Use first case's expected output for legacy compatibility
                expected_output = test.test_cases[0].expected_output

                return BenchmarkTestResult(
                    test_id=test.test_id,
                    model=model_name,
                    passed=overall_passed,
                    score=overall_score,
                    duration=duration,
                    output=aggregated_output,
                    expected_output=expected_output or "",
                    category=test.category,
                    error=None,
                    token_usage={"total": total_tokens},
                    cost=total_cost,
                    steps_taken=total_steps,
                    metrics=aggregate_metrics,
                )

            finally:
                # Clean up temp file; it may already be gone.
                temp_agent_path.unlink(missing_ok=True)

        except AgentExecutionError as e:
            # Agent execution failed; the exception carries execution details.
            duration = time.time() - start_time

            metrics = {}
            if e.execution_steps:
                metrics["execution_trace"] = self._build_execution_trace(e.execution_steps)

            return BenchmarkTestResult(
                test_id=test.test_id,
                model=model_name,
                passed=False,
                score=0.0,
                duration=duration,
                output="",
                expected_output="",
                category=test.category,
                error=str(e),
                token_usage={"total": e.token_usage} if e.token_usage else {},
                cost=e.cost or 0.0,
                steps_taken=e.step_count,
                metrics=metrics,
            )

        except Exception as e:
            # Other unexpected errors without execution details. Deliberately
            # broad: one broken test must not abort the whole benchmark run.
            duration = time.time() - start_time

            return BenchmarkTestResult(
                test_id=test.test_id,
                model=model_name,
                passed=False,
                score=0.0,
                duration=duration,
                output="",
                expected_output="",
                category=test.category,
                error=str(e),
                token_usage={},
                cost=0.0,
                steps_taken=0,
                metrics={},
            )

    def _create_temp_agent(self, test: BenchmarkTest, model_name: str) -> Path:
        """Create temporary agent file with specified model.

        Only a fixed set of agent fields is copied from the original
        frontmatter; the ``model`` field is always overridden.

        Args:
            test: Benchmark test
            model_name: Model name to inject

        Returns:
            Path to temporary agent file

        Raises:
            ValueError: If the agent file has no valid YAML frontmatter.
        """
        original_content = test.agent_path.read_text(encoding="utf-8")

        # Find YAML frontmatter boundaries
        lines = original_content.split("\n")

        # The file must open with a '---' delimiter (a UTF-8 BOM is tolerated);
        # otherwise a later '---' (e.g. a markdown rule) would be mistaken for
        # the end of the frontmatter.
        if lines[0].lstrip("\ufeff").strip() != "---":
            raise ValueError("Invalid YAML frontmatter format")

        yaml_end = -1
        for i, line in enumerate(lines[1:], 1):
            if line.strip() == "---":
                yaml_end = i
                break

        if yaml_end == -1:
            raise ValueError("Invalid YAML frontmatter format")

        # Parse original YAML
        original_yaml = "\n".join(lines[1:yaml_end])
        yaml_data = yaml.safe_load(original_yaml) or {}
        if not isinstance(yaml_data, dict):
            # A scalar or list frontmatter cannot carry agent fields.
            raise ValueError("Invalid YAML frontmatter format")
        markdown_content = "\n".join(lines[yaml_end + 1 :])

        # Create clean YAML with only agent fields
        clean_yaml_data = {
            "name": yaml_data.get("name", test.test_id),
            "description": yaml_data.get("description", ""),
            "model": model_name,  # Override model
            "max_turns": yaml_data.get("max_turns", 5),
            "tools": yaml_data.get("tools", []),
            "text_mode": yaml_data.get("text_mode"),
            "prefetch": yaml_data.get("prefetch", []),
            "permissions_profile": yaml_data.get("permissions_profile", "default"),
            "context_budget": yaml_data.get("context_budget"),
        }

        # Remove None values and empty lists
        clean_yaml_data = {k: v for k, v in clean_yaml_data.items() if v is not None and v != []}

        # Create clean agent content
        clean_yaml = yaml.dump(clean_yaml_data, default_flow_style=False)
        clean_agent_content = f"---\n{clean_yaml}---\n\n{markdown_content}"

        # Model ids may contain ':' or '/' (and test ids arbitrary text); map
        # every character that is unsafe in a file name to '_' so the file is
        # always created directly inside output_dir.
        safe_stem = re.sub(r"[^\w.-]", "_", f"{test.test_id}_{model_name}")
        temp_agent_path = self.output_dir / f"temp_{safe_stem}.md"
        temp_agent_path.write_text(clean_agent_content, encoding="utf-8")

        return temp_agent_path

    async def _run_all_test_cases(
        self, test: BenchmarkTest, temp_agent_path: Path, model_name: str
    ) -> tuple[list[Dict[str, Any]], str, int, int, float]:
        """Run all test cases for a test.

        Args:
            test: Benchmark test
            temp_agent_path: Path to temporary agent file
            model_name: Model name

        Returns:
            Tuple of (case_results, aggregated_output, total_steps, total_tokens, total_cost)
        """
        case_results = []
        raw_outputs = []
        aggregated_output_parts = []
        total_steps = 0
        total_tokens = 0
        total_cost = 0.0

        for test_case in test.test_cases:
            case_start = time.time()

            # Prepare prompt with planning instruction if needed
            final_prompt = self._prepare_prompt(test_case)

            # Run the agent with token usage tracking to get step count.
            # NOTE(review): run_agent is called synchronously inside this
            # coroutine — confirm that blocking the event loop is intended.
            result_tuple = run_agent(
                agent_path=temp_agent_path,
                prompt=final_prompt,
                model_override=model_name,
                debug=False,
                return_token_usage=True,
            )

            # Unpack result: (output, token_count, cost, step_count, execution_steps)
            result, token_count, cost, steps, execution_steps = result_tuple
            total_steps += steps or 0
            total_tokens += token_count or 0
            total_cost += cost or 0.0

            case_duration = time.time() - case_start

            # Evaluate this test case
            case_evaluation = await self._evaluate_test_case(test_case, result, case_duration, execution_steps, test)

            case_results.append(case_evaluation)
            raw_outputs.append(str(result))
            aggregated_output_parts.append(
                f"Test Case: {test_case.name}\nPrompt: {test_case.prompt}\nOutput: {result}\n---\n"
            )

        # A single case reports its raw output; several cases get a labelled transcript.
        if len(raw_outputs) == 1:
            aggregated_output = raw_outputs[0]
        else:
            aggregated_output = "\n".join(aggregated_output_parts)

        return case_results, aggregated_output, total_steps, total_tokens, total_cost

    def _prepare_prompt(self, test_case: TestCase) -> str:
        """Prepare prompt with optional planning instruction.

        Args:
            test_case: Test case

        Returns:
            The test case prompt, with the planning instruction appended when
            the test case sets ``requires_plan``.
        """
        if not test_case.requires_plan:
            return test_case.prompt

        return test_case.prompt + self._PLANNING_INSTRUCTION

    async def _evaluate_test_case(
        self,
        test_case: TestCase,
        result: str,
        duration: float,
        execution_steps: list = None,
        test: BenchmarkTest = None,
    ) -> Dict[str, Any]:
        """Evaluate a single test case result.

        Args:
            test_case: Test case definition
            result: Agent output
            duration: Execution duration
            execution_steps: List of StepResult objects from execution
            test: BenchmarkTest object with agent configuration

        Returns:
            Evaluation dictionary with passed, score, and metrics. If an
            evaluator raises, the message is stored under ``"error"`` and the
            evaluation gathered so far is returned.
        """
        evaluation = {
            "test_case": test_case.name,
            "passed": False,
            "score": 0.0,
            "metrics": {},
        }

        try:
            # 1. Basic correctness evaluation
            if test_case.expected_output:
                evaluation = self._evaluate_correctness(test_case, result, evaluation)

            # 2. Custom evaluation criteria (includes tool checking)
            if test_case.evaluation:
                evaluation = self._evaluate_custom_criteria(test_case, result, evaluation, execution_steps, test)

            # 3. LLM evaluation (if enabled)
            if test_case.use_llm_evaluation and test_case.llm_evaluation_criteria:
                evaluation = await self._evaluate_with_llm(test_case, result, evaluation)

            # 4. Plan evaluation (if required)
            if test_case.requires_plan:
                evaluation = self._evaluate_planning(test_case, result, evaluation)

        except Exception as e:
            # Best effort: a failing evaluator marks the case, it does not abort the run.
            evaluation["error"] = str(e)

        # Add execution trace for failed tests (for debugging)
        # Only save for failures to reduce JSON size
        if execution_steps and not evaluation["passed"]:
            evaluation["execution_trace"] = self._build_execution_trace(execution_steps)

        return evaluation

    def _evaluate_correctness(self, test_case: TestCase, result: str, evaluation: Dict[str, Any]) -> Dict[str, Any]:
        """Evaluate output correctness against the test case's expected output.

        Overwrites ``passed`` and ``score`` and stores the evaluator's full
        result under ``metrics["correctness"]``.
        """
        correctness = self.correctness_evaluator.evaluate(
            output=result,
            expected=test_case.expected_output,
            output_type="string",
        )
        evaluation["passed"] = correctness["passed"]
        evaluation["score"] = correctness["score"]
        evaluation["metrics"]["correctness"] = correctness
        return evaluation

    def _evaluate_custom_criteria(
        self,
        test_case: TestCase,
        result: str,
        evaluation: Dict[str, Any],
        execution_steps: list = None,
        test: BenchmarkTest = None,
    ) -> Dict[str, Any]:
        """Evaluate custom criteria.

        Every criterion's outcome is recorded under ``metrics["custom"]``. The
        criteria only decide ``score``/``passed`` when the test case has no
        expected output; otherwise the correctness verdict is left untouched.
        """
        custom_checks = {
            criterion: self._check_criterion(criterion, expected_value, result, execution_steps, test)
            for criterion, expected_value in test_case.evaluation.items()
        }

        evaluation["metrics"]["custom"] = custom_checks

        # ALL custom criteria must pass for score of 1.0, otherwise 0.0
        if not test_case.expected_output:
            all_custom_pass = all(custom_checks.values()) if custom_checks else True
            evaluation["score"] = 1.0 if all_custom_pass else 0.0
            evaluation["passed"] = all_custom_pass

        return evaluation

    def _check_criterion(
        self,
        criterion: str,
        expected_value: Any,
        result: str,
        execution_steps: list = None,
        test: BenchmarkTest = None,
    ) -> bool:
        """Check a single custom criterion.

        Supported criteria: ``tool_called``, ``exact_match``, ``contains``,
        ``valid_json``, ``contains_keys``, ``content_pattern``, ``matches``,
        ``file_check`` and ``min_length``. Any other name falls back to a
        case-insensitive substring check.

        Args:
            criterion: Criterion name
            expected_value: Expected value
            result: Agent output
            execution_steps: List of StepResult objects from execution
            test: BenchmarkTest object with agent configuration

        Returns:
            True if criterion passes
        """
        # The agent may hand back a non-string result; compare its text form.
        if result is None:
            text = ""
        elif isinstance(result, str):
            text = result
        else:
            text = str(result)

        if criterion == "tool_called":
            # Check if specific tools were called during execution
            if not execution_steps:
                return False

            # Collect all tools called across all steps
            tools_called = set()
            for step in execution_steps:
                if hasattr(step, "tools_called") and step.tools_called:
                    tools_called.update(step.tools_called)

            # expected_value can be a single tool name or a list of required tools
            if isinstance(expected_value, list):
                # ALL listed tools must have been called
                return all(tool in tools_called for tool in expected_value)
            # Single tool must have been called
            return expected_value in tools_called

        if criterion == "exact_match":
            # Strip both sides: YAML block scalars carry a trailing newline and
            # scalar values such as 42 are not strings.
            return text.strip() == str(expected_value).strip()

        if criterion == "contains":
            text_lower = text.lower()
            if isinstance(expected_value, list):
                return all(str(item).lower() in text_lower for item in expected_value)
            return str(expected_value).lower() in text_lower

        if criterion == "valid_json":
            try:
                json.loads(text.strip())
                is_valid = True
            except (json.JSONDecodeError, ValueError):
                is_valid = False
            # A truthy expectation means "must be valid", a falsy one "must be invalid".
            return is_valid == bool(expected_value)

        if criterion == "contains_keys":
            try:
                parsed = json.loads(text.strip())
                if isinstance(parsed, dict):
                    if isinstance(expected_value, list):
                        return all(key in parsed for key in expected_value)
                    return expected_value in parsed
            except (json.JSONDecodeError, ValueError, TypeError):
                pass
            return False

        if criterion == "content_pattern":
            return bool(re.search(expected_value, text))

        if criterion == "matches":
            # Regex pattern matching on agent output
            # expected_value can be a single pattern or list of patterns
            if isinstance(expected_value, list):
                # ALL patterns must match
                return all(bool(re.search(pattern, text)) for pattern in expected_value)
            # Single pattern must match
            return bool(re.search(expected_value, text))

        if criterion == "file_check":
            return self._check_file_criterion(expected_value)

        if criterion == "min_length":
            return len(text) >= expected_value

        # Default to simple substring check
        return str(expected_value).lower() in text.lower()

    @staticmethod
    def _check_file_criterion(expected_value: Any) -> bool:
        """Verify a file on disk against a ``file_check`` specification.

        ``expected_value`` is a dict with ``path`` plus any of ``exists``,
        ``exact``, ``contains`` and ``matches``.
        """
        if not isinstance(expected_value, dict):
            return False

        file_path = expected_value.get("path")
        if not file_path:
            return False

        path = Path(file_path)
        file_exists = path.exists()

        # An explicit falsy ``exists`` asserts that the file must be absent.
        should_exist = expected_value.get("exists")
        if should_exist is not None and not should_exist:
            return not file_exists

        # Every remaining check needs the file to be present.
        if not file_exists:
            return False

        # Read file content for verification
        try:
            file_content = path.read_text(encoding="utf-8")
        except (OSError, ValueError):
            # Unreadable or not valid UTF-8 (UnicodeDecodeError is a ValueError)
            return False

        # Check exact match (surrounding whitespace ignored on both sides)
        if "exact" in expected_value:
            if file_content.strip() != str(expected_value["exact"]).strip():
                return False

        # Check contains (a string, or a list of strings that must all appear)
        if "contains" in expected_value:
            wanted = expected_value["contains"]
            if not isinstance(wanted, list):
                wanted = [wanted]
            if not all(str(item) in file_content for item in wanted):
                return False

        # Check regex matches (a pattern, or a list of patterns that must all match)
        if "matches" in expected_value:
            patterns = expected_value["matches"]
            if not isinstance(patterns, list):
                patterns = [patterns]
            if not all(bool(re.search(p, file_content)) for p in patterns):
                return False

        # All checks passed
        return True

    async def _evaluate_with_llm(self, test_case: TestCase, result: str, evaluation: Dict[str, Any]) -> Dict[str, Any]:
        """Evaluate using LLM judge.

        The judge's result is stored under ``metrics["llm_evaluation"]``. Its
        score either becomes the score (when the current score is 0.0) or is
        blended into it using ``EVALUATION_WEIGHTS``.
        """
        llm_eval = await self.llm_evaluator.evaluate(
            output=result,
            task_description=test_case.prompt,
            evaluation_criteria=test_case.llm_evaluation_criteria,
            rubric=test_case.llm_evaluation_rubric or {},
        )
        evaluation["metrics"]["llm_evaluation"] = llm_eval

        # Blend with existing score or use standalone.
        # NOTE(review): a score of 0.0 is treated as "nothing scored yet", so an
        # earlier evaluator that scored exactly 0.0 is overridden by the LLM
        # verdict — confirm this is intended.
        if evaluation["score"] == 0.0:
            evaluation["score"] = llm_eval["llm_score"]
            evaluation["passed"] = llm_eval["llm_score"] >= SIMILARITY_THRESHOLDS.llm_evaluation_threshold
        else:
            # Blend LLM score with existing evaluation; "passed" is left as it was.
            evaluation["score"] = (
                evaluation["score"] * EVALUATION_WEIGHTS.base_weight
                + llm_eval["llm_score"] * EVALUATION_WEIGHTS.llm_weight
            )

        return evaluation

    def _evaluate_planning(self, test_case: TestCase, result: str, evaluation: Dict[str, Any]) -> Dict[str, Any]:
        """Evaluate planning quality.

        The plan score is the sum of up to three parts: 0.3 for any planning
        keyword, 0.4 scaled by the fraction of expected plan elements found,
        and 0.3 scaled by the custom plan criteria score.
        """
        plan_score = 0.0
        plan_checks = {}
        result_lower = result.lower()

        # Check for planning indicators
        plan_indicators = ["plan", "step", "approach", "strategy", "outline", "first", "then", "next", "finally"]
        has_plan_structure = any(indicator in result_lower for indicator in plan_indicators)
        plan_checks["has_planning_structure"] = has_plan_structure
        if has_plan_structure:
            plan_score += 0.3

        # Check for specific plan elements
        if test_case.expected_plan_elements:
            elements_found = 0
            for element in test_case.expected_plan_elements:
                found = element.lower() in result_lower
                plan_checks[f"element_{element}"] = found
                if found:
                    elements_found += 1
            plan_score += (elements_found / len(test_case.expected_plan_elements)) * 0.4

        # Check custom plan evaluation criteria
        if test_case.plan_evaluation:
            custom_plan_score = self._evaluate_plan_criteria(test_case, result, plan_checks)
            plan_score += custom_plan_score * 0.3

        evaluation["metrics"]["planning"] = {
            "plan_score": plan_score,
            "plan_checks": plan_checks,
            "required": True,
        }

        # Blend planning score with existing evaluation (same 0.0 convention as
        # the LLM blend: a zero score means no earlier evaluator has scored).
        if evaluation["score"] == 0.0:
            evaluation["score"] = plan_score
            evaluation["passed"] = plan_score >= SIMILARITY_THRESHOLDS.behavior_pass_threshold
        else:
            evaluation["score"] = (
                evaluation["score"] * (1 - EVALUATION_WEIGHTS.planning_weight)
                + plan_score * EVALUATION_WEIGHTS.planning_weight
            )
            # Plan must meet minimum threshold
            evaluation["passed"] = evaluation["passed"] and plan_score >= EVALUATION_WEIGHTS.planning_minimum

        return evaluation

    def _evaluate_plan_criteria(self, test_case: TestCase, result: str, plan_checks: Dict[str, Any]) -> float:
        """Evaluate custom planning criteria.

        Args:
            test_case: Test case
            result: Agent output
            plan_checks: Dictionary to store check results (mutated in place)

        Returns:
            Custom plan score (0.0 to 1.0): the fraction of criteria met
        """
        criteria = test_case.plan_evaluation
        if not criteria:
            return 0.0

        criteria_met = 0

        for criterion, expected_value in criteria.items():
            if criterion == "min_plan_steps":
                # Count numbered or bulleted steps.
                # NOTE(review): the pattern is not anchored to line starts, so
                # hyphens or "2023. " inside prose are counted as steps too.
                steps = len(re.findall(r"(\d+\.|•|-|\*)\s", result))
                criterion_met = steps >= expected_value

            elif criterion == "structured_sections":
                # Look for section headers
                sections = any(header in result.upper() for header in ["PLAN", "EXECUTION", "APPROACH", "STRATEGY"])
                criterion_met = sections == expected_value

            else:
                # Default criterion check
                criterion_met = str(expected_value).lower() in result.lower()

            plan_checks[criterion] = criterion_met
            if criterion_met:
                criteria_met += 1

        # One division instead of repeated 1/n additions avoids float drift
        # (e.g. ten of ten criteria yields exactly 1.0).
        return criteria_met / len(criteria)
|