tsugite-cli 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsugite/__init__.py +6 -0
- tsugite/agent_composition.py +163 -0
- tsugite/agent_inheritance.py +479 -0
- tsugite/agent_preparation.py +236 -0
- tsugite/agent_runner/__init__.py +45 -0
- tsugite/agent_runner/helpers.py +106 -0
- tsugite/agent_runner/history_integration.py +248 -0
- tsugite/agent_runner/metrics.py +100 -0
- tsugite/agent_runner/runner.py +1879 -0
- tsugite/agent_runner/validation.py +70 -0
- tsugite/agent_utils.py +167 -0
- tsugite/attachments/__init__.py +65 -0
- tsugite/attachments/auto_context.py +199 -0
- tsugite/attachments/base.py +34 -0
- tsugite/attachments/file.py +51 -0
- tsugite/attachments/inline.py +31 -0
- tsugite/attachments/storage.py +178 -0
- tsugite/attachments/url.py +59 -0
- tsugite/attachments/youtube.py +101 -0
- tsugite/benchmark/__init__.py +62 -0
- tsugite/benchmark/config.py +183 -0
- tsugite/benchmark/core.py +292 -0
- tsugite/benchmark/discovery.py +377 -0
- tsugite/benchmark/evaluators.py +671 -0
- tsugite/benchmark/execution.py +657 -0
- tsugite/benchmark/metrics.py +204 -0
- tsugite/benchmark/reports.py +420 -0
- tsugite/benchmark/utils.py +288 -0
- tsugite/builtin_agents/chat-assistant.md +53 -0
- tsugite/builtin_agents/default.md +140 -0
- tsugite/builtin_agents.py +5 -0
- tsugite/cache.py +195 -0
- tsugite/cli/__init__.py +1042 -0
- tsugite/cli/agents.py +148 -0
- tsugite/cli/attachments.py +193 -0
- tsugite/cli/benchmark.py +663 -0
- tsugite/cli/cache.py +113 -0
- tsugite/cli/config.py +272 -0
- tsugite/cli/helpers.py +534 -0
- tsugite/cli/history.py +193 -0
- tsugite/cli/init.py +387 -0
- tsugite/cli/mcp.py +193 -0
- tsugite/cli/tools.py +419 -0
- tsugite/config.py +204 -0
- tsugite/console.py +48 -0
- tsugite/constants.py +21 -0
- tsugite/core/__init__.py +19 -0
- tsugite/core/agent.py +774 -0
- tsugite/core/executor.py +300 -0
- tsugite/core/memory.py +67 -0
- tsugite/core/tools.py +271 -0
- tsugite/docker_cli.py +270 -0
- tsugite/events/__init__.py +55 -0
- tsugite/events/base.py +46 -0
- tsugite/events/bus.py +62 -0
- tsugite/events/events.py +224 -0
- tsugite/exceptions.py +40 -0
- tsugite/history/__init__.py +29 -0
- tsugite/history/index.py +210 -0
- tsugite/history/models.py +106 -0
- tsugite/history/storage.py +157 -0
- tsugite/mcp_client.py +219 -0
- tsugite/mcp_config.py +174 -0
- tsugite/md_agents.py +751 -0
- tsugite/models.py +257 -0
- tsugite/renderer.py +151 -0
- tsugite/shell_tool_config.py +265 -0
- tsugite/templates/assistant.md +14 -0
- tsugite/tools/__init__.py +265 -0
- tsugite/tools/agents.py +312 -0
- tsugite/tools/edit_strategies.py +393 -0
- tsugite/tools/fs.py +329 -0
- tsugite/tools/http.py +239 -0
- tsugite/tools/interactive.py +430 -0
- tsugite/tools/shell.py +129 -0
- tsugite/tools/shell_tools.py +214 -0
- tsugite/tools/tasks.py +339 -0
- tsugite/tsugite.py +7 -0
- tsugite/ui/__init__.py +46 -0
- tsugite/ui/base.py +638 -0
- tsugite/ui/chat.py +265 -0
- tsugite/ui/chat.tcss +92 -0
- tsugite/ui/chat_history.py +286 -0
- tsugite/ui/helpers.py +102 -0
- tsugite/ui/jsonl.py +125 -0
- tsugite/ui/live_template.py +529 -0
- tsugite/ui/plain.py +419 -0
- tsugite/ui/textual_chat.py +642 -0
- tsugite/ui/textual_handler.py +225 -0
- tsugite/ui/widgets/__init__.py +6 -0
- tsugite/ui/widgets/base_scroll_log.py +27 -0
- tsugite/ui/widgets/message_list.py +121 -0
- tsugite/ui/widgets/thought_log.py +80 -0
- tsugite/ui_context.py +90 -0
- tsugite/utils.py +367 -0
- tsugite/xdg.py +104 -0
- tsugite_cli-0.3.3.dist-info/METADATA +325 -0
- tsugite_cli-0.3.3.dist-info/RECORD +101 -0
- tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
- tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
- tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
"""Core benchmark framework for evaluating Tsugite agents."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from .discovery import BenchmarkTest, TestDiscovery
|
|
9
|
+
from .execution import TestExecutor
|
|
10
|
+
from .metrics import BenchmarkTestResult, ModelPerformance
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class BenchmarkConfig:
    """Configuration for benchmark runs.

    An instance is handed to ``BenchmarkRunner``, which reads ``timeout``,
    ``output_dir`` and ``llm_evaluator_model`` at construction time and falls
    back to ``models`` / ``categories`` when a run does not override them.
    """

    # Model identifiers to evaluate; empty by default, and an empty list makes
    # BenchmarkRunner.run_benchmark raise ValueError unless models are passed in.
    models: List[str] = field(default_factory=list)
    # Category names (sub-directories of the benchmark directory) to search.
    categories: List[str] = field(default_factory=lambda: ["basic"])
    # Forwarded to TestDiscovery as the default per-test timeout.
    timeout: int = 120  # seconds
    # NOTE(review): not read anywhere in the visible code — confirm where it is honored.
    parallel: bool = True
    # Created (with parents) by BenchmarkRunner.__init__ and passed to TestExecutor.
    output_dir: Path = field(default_factory=lambda: Path("benchmark_results"))
    # NOTE(review): not read anywhere in the visible code — confirm where it is honored.
    repeat_count: int = 1  # Number of times to run each test for averaging
    llm_evaluator_model: str = "openai:gpt-4o-mini"  # Model to use for LLM evaluation
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class BenchmarkResult:
    """Results from a complete benchmark run.

    Built by ``BenchmarkRunner.run_benchmark`` once every requested model has
    been evaluated (or has failed).
    """

    # The configuration object the runner was constructed with.
    config: BenchmarkConfig
    # Timestamps taken with datetime.now() at the start and end of the run.
    start_time: datetime
    end_time: datetime
    # (end_time - start_time) expressed in seconds.
    total_duration: float
    # Aggregated metrics keyed by model name; a model whose evaluation raised
    # has no entry here.
    model_performances: Dict[str, ModelPerformance]
    test_results: Dict[str, Dict[str, BenchmarkTestResult]]  # model -> test_id -> result
    # Ranking/accuracy overview produced by BenchmarkRunner._generate_summary.
    summary: Dict[str, Any]
    # Human-readable error messages collected across all models and tests.
    errors: List[str] = field(default_factory=list)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class BenchmarkRunner:
    """Main benchmark runner for evaluating models across test suites.

    The runner wires together a ``TestDiscovery`` (finds tests under
    ``benchmark_dir``) and a ``TestExecutor`` (runs one test against one
    model), then aggregates per-model metrics and a cross-model summary.
    """

    # Class-level default location of the benchmark suites. It is copied onto
    # each instance in ``__init__`` so a subclass can override the default.
    benchmark_dir: Path = Path("benchmarks")

    def __init__(self, config: BenchmarkConfig):
        """Initialize benchmark runner.

        Creates ``config.output_dir`` (including parents) if it does not exist.

        Args:
            config: Benchmark configuration
        """
        self.config = config
        self.benchmark_dir = Path(type(self).benchmark_dir)

        # Initialize components
        self.discovery = TestDiscovery(self.benchmark_dir, config.timeout)
        self.executor = TestExecutor(config.output_dir, config.llm_evaluator_model)

        # Create output directory
        self.config.output_dir.mkdir(parents=True, exist_ok=True)

    def discover_tests(
        self, categories: Optional[List[str]] = None, agent_path: Optional[Path] = None
    ) -> List[BenchmarkTest]:
        """Discover benchmark tests.

        Args:
            categories: Categories to search (e.g., ["basic", "tools"]);
                falls back to ``self.config.categories`` when ``None``
            agent_path: Specific agent file to test

        Returns:
            List of discovered tests
        """
        if categories is None:
            categories = self.config.categories

        return self.discovery.discover_tests(categories, agent_path)

    async def run_benchmark(
        self,
        models: Optional[List[str]] = None,
        categories: Optional[List[str]] = None,
        test_filter: Optional[str] = None,
        agent_path: Optional[Path] = None,
    ) -> BenchmarkResult:
        """Run benchmark suite against specified models.

        Args:
            models: List of models to test; defaults to ``self.config.models``
            categories: Test categories to run; defaults to
                ``self.config.categories``
            test_filter: Filter tests by name/ID substring (case-insensitive);
                only applied when no ``agent_path`` is given
            agent_path: Specific agent file to test

        Returns:
            Benchmark results

        Raises:
            ValueError: If no models or tests found
        """
        start_time = datetime.now()

        # Validate models
        if models is None:
            models = self.config.models
        if not models:
            raise ValueError("No models specified for benchmarking")

        # Discover tests. Going through self.discover_tests (rather than the
        # discovery component directly) makes an omitted `categories` fall
        # back to the configured categories instead of discovery's built-in
        # default.
        tests = self.discover_tests(categories, agent_path)
        if test_filter and not agent_path:
            needle = test_filter.lower()
            tests = [t for t in tests if needle in t.name.lower() or needle in t.test_id.lower()]

        if not tests:
            raise ValueError("No tests found matching criteria")

        print(f"Running {len(tests)} tests across {len(models)} models...")

        # Run tests for each model
        model_performances: Dict[str, ModelPerformance] = {}
        test_results: Dict[str, Dict[str, BenchmarkTestResult]] = {model: {} for model in models}
        errors: List[str] = []

        for model_name in models:
            print(f"\nEvaluating model: {model_name}")
            try:
                model_perf, model_test_results, model_errors = await self._run_model_tests(model_name, tests)
                model_performances[model_name] = model_perf
                test_results[model_name] = model_test_results
                errors.extend(model_errors)
            except Exception as e:
                # One failing model must not abort the remaining models; the
                # failure is recorded and the model keeps its empty result map.
                error_msg = f"Failed to evaluate model {model_name}: {e}"
                errors.append(error_msg)
                print(f"Error: {error_msg}")

        end_time = datetime.now()
        total_duration = (end_time - start_time).total_seconds()

        # Generate summary
        summary = self._generate_summary(model_performances, test_results)

        return BenchmarkResult(
            config=self.config,
            start_time=start_time,
            end_time=end_time,
            total_duration=total_duration,
            model_performances=model_performances,
            test_results=test_results,
            summary=summary,
            errors=errors,
        )

    async def _run_model_tests(
        self, model_name: str, tests: List[BenchmarkTest]
    ) -> tuple[ModelPerformance, Dict[str, BenchmarkTestResult], List[str]]:
        """Run all tests for a single model.

        Tests are awaited one after another. A test whose execution raises is
        recorded as a failed result and still counts towards ``total_tests``,
        but contributes nothing to the duration/token/cost/step totals.

        Args:
            model_name: Model to test
            tests: List of tests to run

        Returns:
            Tuple of (model_performance, test_results, errors)
        """
        model_test_results: Dict[str, BenchmarkTestResult] = {}
        model_errors: List[str] = []

        # Aggregate metrics
        total_tests = len(tests)
        passed_tests = 0
        total_duration = 0.0
        total_tokens = 0
        total_cost = 0.0
        total_steps = 0

        for test in tests:
            print(f"  Running test: {test.test_id}")

            try:
                test_result = await self.executor.run_test(model_name, test)
                model_test_results[test.test_id] = test_result

                if test_result.error:
                    model_errors.append(f"Test {test.test_id} for model {model_name}: {test_result.error}")

                if test_result.passed:
                    passed_tests += 1

                total_duration += test_result.duration
                total_tokens += test_result.token_usage.get("total", 0)
                total_cost += test_result.cost
                total_steps += test_result.steps_taken

            except Exception as e:
                error_msg = f"Test {test.test_id} for model {model_name} failed: {e}"
                model_errors.append(error_msg)
                print(f"    Error: {error_msg}")

                # Create failed test result so every test has an entry
                model_test_results[test.test_id] = BenchmarkTestResult(
                    test_id=test.test_id,
                    model=model_name,
                    passed=False,
                    score=0.0,
                    duration=0.0,
                    output="",
                    expected_output=test.expected_output or "",
                    error=str(e),
                    token_usage={},
                    cost=0.0,
                    metrics={},
                )

        # Calculate overall metrics (guarding against an empty test list)
        accuracy = passed_tests / total_tests if total_tests > 0 else 0.0
        avg_duration = total_duration / total_tests if total_tests > 0 else 0.0
        avg_steps = total_steps / total_tests if total_tests > 0 else 0.0

        model_performance = ModelPerformance(
            model=model_name,
            total_tests=total_tests,
            passed_tests=passed_tests,
            accuracy=accuracy,
            average_duration=avg_duration,
            total_duration=total_duration,
            total_tokens=total_tokens,
            total_cost=total_cost,
            scores_by_category={},  # Will be calculated in summary
            average_steps=avg_steps,
        )

        return model_performance, model_test_results, model_errors

    def _generate_summary(
        self,
        model_performances: Dict[str, ModelPerformance],
        test_results: Dict[str, Dict[str, BenchmarkTestResult]],
    ) -> Dict[str, Any]:
        """Generate summary statistics from benchmark results.

        Args:
            model_performances: Performance data per model
            test_results: Detailed test results per model (currently unused)

        Returns:
            Summary dictionary with the keys ``total_models``,
            ``model_rankings``, ``category_performance``, ``best_model``,
            ``worst_model``, ``average_accuracy`` and ``total_tests``. When no
            model produced performance data the defaults are returned as-is.
        """
        summary: Dict[str, Any] = {
            "total_models": len(model_performances),
            "model_rankings": [],
            "category_performance": {},
            "best_model": None,
            "worst_model": None,
            "average_accuracy": 0.0,
            "total_tests": 0,
        }

        if not model_performances:
            return summary

        # Rank models by accuracy, best first. sorted() is stable, so models
        # with equal accuracy keep their insertion order.
        ranked_models = sorted(model_performances.items(), key=lambda item: item[1].accuracy, reverse=True)

        summary["model_rankings"] = [
            {
                "model": model,
                "accuracy": perf.accuracy,
                "avg_duration": perf.average_duration,
                "total_cost": perf.total_cost,
            }
            for model, perf in ranked_models
        ]

        # ranked_models is non-empty here thanks to the early return above.
        summary["best_model"] = ranked_models[0][0]
        summary["worst_model"] = ranked_models[-1][0]

        # Calculate average accuracy
        total_accuracy = sum(perf.accuracy for perf in model_performances.values())
        summary["average_accuracy"] = total_accuracy / len(model_performances)

        # Every model runs the same test list, so the first model's count is
        # used as the overall total.
        summary["total_tests"] = next(iter(model_performances.values())).total_tests

        return summary
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
"""Test discovery and parsing for benchmark framework."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from ..utils import parse_yaml_frontmatter
|
|
9
|
+
from .utils import (
|
|
10
|
+
extract_block,
|
|
11
|
+
extract_inline_field,
|
|
12
|
+
extract_prompt_from_markdown,
|
|
13
|
+
parse_bullet_list,
|
|
14
|
+
parse_key_value_block,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class TestCase:
    """Individual test case within a benchmark test.

    Instances are produced by ``TestDiscovery``: one per ``##`` section of a
    ``.test.md`` file, or a single ``<test_id>_default`` case for a benchmark
    file with embedded expectations.
    """

    # Section header text, or "<test_id>_default" for embedded benchmarks.
    name: str
    # Prompt text for this case; TestDiscovery skips sections without one.
    prompt: str
    expected_output: Optional[str] = None
    # Key/value evaluation settings parsed from the "Evaluation" block or the
    # "evaluation" metadata key.
    evaluation: Dict[str, Any] = field(default_factory=dict)
    weight: float = 1.0
    # Planning expectations ("Requires Plan", "Expected Plan Elements",
    # "Plan Evaluation" in the test definition).
    requires_plan: bool = False
    expected_plan_elements: List[str] = field(default_factory=list)
    plan_evaluation: Dict[str, Any] = field(default_factory=dict)
    # LLM evaluation fields
    use_llm_evaluation: bool = False
    llm_evaluation_criteria: str = ""
    llm_evaluation_rubric: Dict[str, Any] = field(default_factory=dict)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class BenchmarkTest:
    """Test specification for an agent benchmark.

    Combines the agent file under test with the metadata and test cases that
    ``TestDiscovery`` parsed for it.
    """

    name: str
    # Path to the agent (or self-contained benchmark) markdown file.
    agent_path: Path
    # Taken from the "test_id" metadata key, falling back to the agent file stem.
    test_id: str
    # Category directory name, or "custom" for agents outside the benchmark tree.
    category: str
    description: str = ""
    expected_output: Optional[str] = None
    expected_type: str = "string"
    # Seconds. TestDiscovery always passes an explicit value (metadata
    # "timeout" or its own default), so this default applies only to direct
    # construction.
    timeout: int = 60
    weight: float = 1.0
    requires_tools: List[str] = field(default_factory=list)
    test_cases: List[TestCase] = field(default_factory=list)
    evaluation_criteria: Dict[str, Any] = field(default_factory=dict)
    # Path of the companion ".test.md" file; None when the expectations are
    # embedded in the agent file itself.
    test_path: Optional[Path] = None
    # LLM evaluation fields
    use_llm_evaluation: bool = False
    llm_evaluation_criteria: str = ""
    llm_evaluation_rubric: Dict[str, Any] = field(default_factory=dict)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class TestDiscovery:
    """Handles discovery and parsing of benchmark tests.

    Tests live under ``benchmark_dir/<category>/``. Each ``*.md`` agent file
    is either paired with a sibling ``<stem>.test.md`` definition or treated
    as a self-contained benchmark whose expectations sit in its own YAML
    frontmatter. Per-category results are cached on the instance.
    """

    # A level-2 markdown header on its own line. Anchoring per line (rather
    # than on surrounding newlines) also catches a header on the first line,
    # on the last line without a trailing newline, or directly after another
    # header.
    _CASE_HEADER_RE = re.compile(r"^## (.+)$", re.MULTILINE)

    # Inline field values (lower-cased, stripped) that count as "true".
    _TRUTHY_VALUES = frozenset({"true", "yes", "1"})

    def __init__(self, benchmark_dir: Path, default_timeout: int = 120):
        """Initialize test discovery.

        Args:
            benchmark_dir: Root directory containing benchmark tests
            default_timeout: Default timeout for tests in seconds
        """
        self.benchmark_dir = Path(benchmark_dir)
        self.default_timeout = default_timeout
        # category name -> tests already discovered for that category
        self.test_cache: Dict[str, List[BenchmarkTest]] = {}

    def discover_tests(
        self, categories: Optional[List[str]] = None, agent_path: Optional[Path] = None
    ) -> List[BenchmarkTest]:
        """Discover agent + test.md pairs in specified categories or for a specific agent.

        Args:
            categories: List of category names to search (e.g., ["basic", "tools"]);
                defaults to ["basic"]
            agent_path: Specific agent file to test (bypasses category search
                and the cache)

        Returns:
            List of discovered benchmark tests

        Raises:
            ValueError: If ``agent_path`` is given and cannot be parsed
        """
        # Test a specific agent
        if agent_path:
            agent_path = Path(agent_path)
            # Path equality is purely lexical, so the agent only gets its
            # directory name as category when it is addressed through the
            # same path spelling as benchmark_dir; otherwise it is "custom".
            in_benchmark_tree = agent_path.parent.parent == self.benchmark_dir
            category = agent_path.parent.name if in_benchmark_tree else "custom"
            test_file = agent_path.with_suffix(".test.md")

            if test_file.exists():
                return [self._parse_agent_test_pair(agent_path, test_file, category)]
            return [self._parse_benchmark_test(agent_path, category)]

        # Discover tests from categories
        if categories is None:
            categories = ["basic"]

        tests: List[BenchmarkTest] = []
        for category in categories:
            # Check cache first
            if category in self.test_cache:
                tests.extend(self.test_cache[category])
                continue

            # Discover from filesystem
            category_dir = self.benchmark_dir / category
            if not category_dir.exists():
                print(f"Warning: Category directory not found: {category_dir}")
                continue

            category_tests = self._discover_category_tests(category)
            self.test_cache[category] = category_tests
            tests.extend(category_tests)

        return tests

    def _discover_category_tests(self, category: str) -> List[BenchmarkTest]:
        """Discover all tests in a category directory.

        Files that fail to parse are reported on stdout and skipped so one
        broken definition does not hide the rest of the category.

        Args:
            category: Category name

        Returns:
            List of tests in this category, ordered by agent file path
        """
        category_dir = self.benchmark_dir / category
        category_tests: List[BenchmarkTest] = []

        # glob() yields entries in filesystem order; sort for reproducible runs.
        for agent_file in sorted(category_dir.glob("*.md")):
            # Skip test definition files and documentation
            if agent_file.name.endswith(".test.md") or agent_file.name == "README.md":
                continue

            test_file = agent_file.with_suffix(".test.md")

            try:
                if test_file.exists():
                    test = self._parse_agent_test_pair(agent_file, test_file, category)
                else:
                    test = self._parse_benchmark_test(agent_file, category)
                category_tests.append(test)
            except Exception as e:
                print(f"Error parsing benchmark test {agent_file}: {e}")

        return category_tests

    def _parse_agent_test_pair(self, agent_path: Path, test_path: Path, category: str) -> BenchmarkTest:
        """Parse an agent + test.md pair into a BenchmarkTest.

        Args:
            agent_path: Path to agent definition file
            test_path: Path to test definition file
            category: Test category

        Returns:
            Parsed benchmark test

        Raises:
            ValueError: If parsing fails or the agent file does not exist
        """
        try:
            # Validate agent file exists before doing any parsing work
            if not agent_path.exists():
                raise ValueError(f"Agent file not found: {agent_path}")

            # Explicit encoding: the platform default is not UTF-8 everywhere.
            test_metadata, markdown_content = parse_yaml_frontmatter(
                test_path.read_text(encoding="utf-8"), "Test file"
            )

            common = self._extract_common_test_fields(
                test_metadata,
                fallback_id=agent_path.stem,
                fallback_description=f"Test for {agent_path.stem}",
            )

            test_cases = self._parse_test_cases(markdown_content)

            # Use first test case's expected output if no global output specified
            expected_output = common["expected_output"]
            if expected_output is None and test_cases:
                expected_output = test_cases[0].expected_output

            return BenchmarkTest(
                name=agent_path.stem,
                agent_path=agent_path,
                test_id=common["test_id"],
                category=category,
                description=common["description"],
                expected_output=expected_output,
                expected_type=common["expected_type"],
                timeout=common["timeout"],
                weight=common["weight"],
                requires_tools=common["requires_tools"],
                test_cases=test_cases,
                evaluation_criteria=common["evaluation_criteria"],
                test_path=test_path,
                use_llm_evaluation=test_metadata.get("use_llm_evaluation", False),
                llm_evaluation_criteria=test_metadata.get("llm_evaluation_criteria", ""),
                llm_evaluation_rubric=test_metadata.get("llm_evaluation_rubric", {}),
            )
        except Exception as e:
            # Chain the cause so the original traceback stays available.
            raise ValueError(f"Failed to parse agent test pair {agent_path}/{test_path}: {e}") from e

    def _parse_benchmark_test(self, agent_path: Path, category: str) -> BenchmarkTest:
        """Parse a single benchmark markdown file with embedded expectations.

        The file's frontmatter supplies the metadata; the prompt comes from
        the ``prompt`` metadata key or, failing that, from the markdown body.
        Exactly one test case named ``<test_id>_default`` is created.

        Args:
            agent_path: Path to agent/benchmark file
            category: Test category

        Returns:
            Parsed benchmark test

        Raises:
            ValueError: If parsing fails
        """
        try:
            metadata, markdown_content = parse_yaml_frontmatter(
                agent_path.read_text(encoding="utf-8"), "Benchmark file"
            )

            common = self._extract_common_test_fields(
                metadata,
                fallback_id=agent_path.stem,
                fallback_description=metadata.get("name", ""),
            )

            prompt = metadata.get("prompt") or extract_prompt_from_markdown(markdown_content)

            test_case = TestCase(
                name=f"{common['test_id']}_default",
                prompt=prompt,
                expected_output=common["expected_output"],
                evaluation=metadata.get("evaluation", {}),
                weight=common["weight"],
                requires_plan=metadata.get("requires_plan", False),
                expected_plan_elements=metadata.get("expected_plan_elements", []),
                plan_evaluation=metadata.get("plan_evaluation", {}),
                use_llm_evaluation=metadata.get("use_llm_evaluation", False),
                llm_evaluation_criteria=metadata.get("llm_evaluation_criteria", ""),
                llm_evaluation_rubric=metadata.get("llm_evaluation_rubric", {}),
            )

            return BenchmarkTest(
                name=metadata.get("name", agent_path.stem),
                agent_path=agent_path,
                test_id=common["test_id"],
                category=category,
                description=common["description"],
                expected_output=common["expected_output"],
                expected_type=common["expected_type"],
                timeout=common["timeout"],
                weight=common["weight"],
                requires_tools=common["requires_tools"],
                test_cases=[test_case],
                evaluation_criteria=common["evaluation_criteria"],
                test_path=None,
                use_llm_evaluation=metadata.get("use_llm_evaluation", False),
                llm_evaluation_criteria=metadata.get("llm_evaluation_criteria", ""),
                llm_evaluation_rubric=metadata.get("llm_evaluation_rubric", {}),
            )
        except Exception as e:
            raise ValueError(f"Failed to parse benchmark test {agent_path}: {e}") from e

    def _extract_common_test_fields(
        self,
        metadata: Dict[str, Any],
        fallback_id: str,
        fallback_description: str,
    ) -> Dict[str, Any]:
        """Extract common metadata fields from test definition.

        Args:
            metadata: Parsed YAML frontmatter
            fallback_id: ID to use if not specified
            fallback_description: Description to use if not specified

        Returns:
            Dictionary with the keys ``test_id``, ``description``,
            ``timeout``, ``requires_tools``, ``weight``, ``expected_output``,
            ``expected_type`` and ``evaluation_criteria``
        """
        # A weight that cannot be converted to float silently becomes 1.0.
        try:
            weight = float(metadata.get("weight", 1.0))
        except (TypeError, ValueError):
            weight = 1.0

        # "tools" is accepted as an alias when "requires_tools" is absent.
        requires_tools = metadata.get("requires_tools", metadata.get("tools", []))

        return {
            "test_id": metadata.get("test_id", fallback_id),
            "description": metadata.get("description", fallback_description),
            "timeout": metadata.get("timeout", self.default_timeout),
            "requires_tools": requires_tools,
            "weight": weight,
            "expected_output": metadata.get("expected_output"),
            "expected_type": metadata.get("expected_type", "string"),
            "evaluation_criteria": metadata.get("evaluation_criteria", {}),
        }

    def _parse_test_cases(self, markdown_content: str) -> List[TestCase]:
        """Parse test cases from markdown content.

        Every ``## <name>`` header starts a test case whose content runs up
        to the next such header. Text before the first header is ignored, and
        sections that do not yield a test case are skipped.

        Args:
            markdown_content: Markdown content containing test cases

        Returns:
            List of parsed test cases
        """
        # With one capture group, split() returns
        # [intro, name1, body1, name2, body2, ...].
        sections = self._CASE_HEADER_RE.split(markdown_content)

        test_cases: List[TestCase] = []
        for case_name, case_content in zip(sections[1::2], sections[2::2]):
            test_case = self._parse_single_test_case(case_name.strip(), case_content.strip())
            if test_case:
                test_cases.append(test_case)

        return test_cases

    def _is_truthy(self, text: Optional[str]) -> bool:
        """Return True when an inline field value spells "true", "yes" or "1"."""
        return bool(text and text.strip().lower() in self._TRUTHY_VALUES)

    def _parse_single_test_case(self, name: str, content: str) -> Optional[TestCase]:
        """Parse a single test case from markdown content.

        Args:
            name: Test case name
            content: Test case markdown content

        Returns:
            Parsed test case, or None if the section has no prompt
        """
        # Extract prompt - required
        prompt = extract_inline_field(content, "Prompt")
        if not prompt:
            return None

        # Extract expected output
        expected_output = extract_inline_field(content, "Expected Output")

        # Parse evaluation criteria
        evaluation = parse_key_value_block(extract_block(content, "Evaluation"))

        # Parse planning requirements
        requires_plan = self._is_truthy(extract_inline_field(content, "Requires Plan"))
        expected_plan_elements = parse_bullet_list(extract_block(content, "Expected Plan Elements"))
        plan_evaluation = parse_key_value_block(extract_block(content, "Plan Evaluation"))

        # Parse LLM evaluation fields
        use_llm_evaluation = self._is_truthy(extract_inline_field(content, "Use LLM Evaluation"))
        llm_evaluation_criteria = extract_inline_field(content, "LLM Evaluation Criteria") or ""
        llm_evaluation_rubric = parse_key_value_block(extract_block(content, "LLM Evaluation Rubric"))

        return TestCase(
            name=name,
            prompt=prompt,
            expected_output=expected_output,
            evaluation=evaluation,
            weight=1.0,
            requires_plan=requires_plan,
            expected_plan_elements=expected_plan_elements,
            plan_evaluation=plan_evaluation,
            use_llm_evaluation=use_llm_evaluation,
            llm_evaluation_criteria=llm_evaluation_criteria,
            llm_evaluation_rubric=llm_evaluation_rubric,
        )
|