tsugite-cli 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. tsugite/__init__.py +6 -0
  2. tsugite/agent_composition.py +163 -0
  3. tsugite/agent_inheritance.py +479 -0
  4. tsugite/agent_preparation.py +236 -0
  5. tsugite/agent_runner/__init__.py +45 -0
  6. tsugite/agent_runner/helpers.py +106 -0
  7. tsugite/agent_runner/history_integration.py +248 -0
  8. tsugite/agent_runner/metrics.py +100 -0
  9. tsugite/agent_runner/runner.py +1879 -0
  10. tsugite/agent_runner/validation.py +70 -0
  11. tsugite/agent_utils.py +167 -0
  12. tsugite/attachments/__init__.py +65 -0
  13. tsugite/attachments/auto_context.py +199 -0
  14. tsugite/attachments/base.py +34 -0
  15. tsugite/attachments/file.py +51 -0
  16. tsugite/attachments/inline.py +31 -0
  17. tsugite/attachments/storage.py +178 -0
  18. tsugite/attachments/url.py +59 -0
  19. tsugite/attachments/youtube.py +101 -0
  20. tsugite/benchmark/__init__.py +62 -0
  21. tsugite/benchmark/config.py +183 -0
  22. tsugite/benchmark/core.py +292 -0
  23. tsugite/benchmark/discovery.py +377 -0
  24. tsugite/benchmark/evaluators.py +671 -0
  25. tsugite/benchmark/execution.py +657 -0
  26. tsugite/benchmark/metrics.py +204 -0
  27. tsugite/benchmark/reports.py +420 -0
  28. tsugite/benchmark/utils.py +288 -0
  29. tsugite/builtin_agents/chat-assistant.md +53 -0
  30. tsugite/builtin_agents/default.md +140 -0
  31. tsugite/builtin_agents.py +5 -0
  32. tsugite/cache.py +195 -0
  33. tsugite/cli/__init__.py +1042 -0
  34. tsugite/cli/agents.py +148 -0
  35. tsugite/cli/attachments.py +193 -0
  36. tsugite/cli/benchmark.py +663 -0
  37. tsugite/cli/cache.py +113 -0
  38. tsugite/cli/config.py +272 -0
  39. tsugite/cli/helpers.py +534 -0
  40. tsugite/cli/history.py +193 -0
  41. tsugite/cli/init.py +387 -0
  42. tsugite/cli/mcp.py +193 -0
  43. tsugite/cli/tools.py +419 -0
  44. tsugite/config.py +204 -0
  45. tsugite/console.py +48 -0
  46. tsugite/constants.py +21 -0
  47. tsugite/core/__init__.py +19 -0
  48. tsugite/core/agent.py +774 -0
  49. tsugite/core/executor.py +300 -0
  50. tsugite/core/memory.py +67 -0
  51. tsugite/core/tools.py +271 -0
  52. tsugite/docker_cli.py +270 -0
  53. tsugite/events/__init__.py +55 -0
  54. tsugite/events/base.py +46 -0
  55. tsugite/events/bus.py +62 -0
  56. tsugite/events/events.py +224 -0
  57. tsugite/exceptions.py +40 -0
  58. tsugite/history/__init__.py +29 -0
  59. tsugite/history/index.py +210 -0
  60. tsugite/history/models.py +106 -0
  61. tsugite/history/storage.py +157 -0
  62. tsugite/mcp_client.py +219 -0
  63. tsugite/mcp_config.py +174 -0
  64. tsugite/md_agents.py +751 -0
  65. tsugite/models.py +257 -0
  66. tsugite/renderer.py +151 -0
  67. tsugite/shell_tool_config.py +265 -0
  68. tsugite/templates/assistant.md +14 -0
  69. tsugite/tools/__init__.py +265 -0
  70. tsugite/tools/agents.py +312 -0
  71. tsugite/tools/edit_strategies.py +393 -0
  72. tsugite/tools/fs.py +329 -0
  73. tsugite/tools/http.py +239 -0
  74. tsugite/tools/interactive.py +430 -0
  75. tsugite/tools/shell.py +129 -0
  76. tsugite/tools/shell_tools.py +214 -0
  77. tsugite/tools/tasks.py +339 -0
  78. tsugite/tsugite.py +7 -0
  79. tsugite/ui/__init__.py +46 -0
  80. tsugite/ui/base.py +638 -0
  81. tsugite/ui/chat.py +265 -0
  82. tsugite/ui/chat.tcss +92 -0
  83. tsugite/ui/chat_history.py +286 -0
  84. tsugite/ui/helpers.py +102 -0
  85. tsugite/ui/jsonl.py +125 -0
  86. tsugite/ui/live_template.py +529 -0
  87. tsugite/ui/plain.py +419 -0
  88. tsugite/ui/textual_chat.py +642 -0
  89. tsugite/ui/textual_handler.py +225 -0
  90. tsugite/ui/widgets/__init__.py +6 -0
  91. tsugite/ui/widgets/base_scroll_log.py +27 -0
  92. tsugite/ui/widgets/message_list.py +121 -0
  93. tsugite/ui/widgets/thought_log.py +80 -0
  94. tsugite/ui_context.py +90 -0
  95. tsugite/utils.py +367 -0
  96. tsugite/xdg.py +104 -0
  97. tsugite_cli-0.3.3.dist-info/METADATA +325 -0
  98. tsugite_cli-0.3.3.dist-info/RECORD +101 -0
  99. tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
  100. tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
  101. tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
@@ -0,0 +1,292 @@
1
+ """Core benchmark framework for evaluating Tsugite agents."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from .discovery import BenchmarkTest, TestDiscovery
9
+ from .execution import TestExecutor
10
+ from .metrics import BenchmarkTestResult, ModelPerformance
11
+
12
+
13
@dataclass
class BenchmarkConfig:
    """Settings that control a benchmark run.

    All fields have defaults, so ``BenchmarkConfig()`` is a valid configuration.
    Mutable defaults are created per instance through ``default_factory``.
    """

    # Models to evaluate; ``BenchmarkRunner.run_benchmark`` uses this list when
    # it is called without an explicit ``models`` argument.
    models: List[str] = field(default_factory=list)

    # Test categories; ``BenchmarkRunner.discover_tests`` falls back to these
    # when it is called without explicit categories.
    categories: List[str] = field(default_factory=lambda: ["basic"])

    # Handed to ``TestDiscovery`` as the default per-test timeout.
    timeout: int = 120  # seconds

    # NOTE(review): not read anywhere in this module; presumably consumed by
    # the executor or the CLI -- confirm before relying on it.
    parallel: bool = True

    # Directory created by ``BenchmarkRunner.__init__`` and passed to ``TestExecutor``.
    output_dir: Path = field(default_factory=lambda: Path("benchmark_results"))

    # NOTE(review): not read anywhere in this module -- confirm where it is applied.
    repeat_count: int = 1  # Number of times to run each test for averaging

    # Passed to ``TestExecutor`` by ``BenchmarkRunner.__init__``.
    llm_evaluator_model: str = "openai:gpt-4o-mini"  # Model to use for LLM evaluation
24
+
25
+
26
@dataclass
class BenchmarkResult:
    """Everything produced by one complete benchmark run.

    ``BenchmarkRunner.run_benchmark`` builds and returns one of these.
    """

    # Configuration the run was executed with.
    config: BenchmarkConfig

    # Wall-clock bounds of the run and their difference in seconds.
    start_time: datetime
    end_time: datetime
    total_duration: float

    # Aggregated metrics, keyed by model name.
    model_performances: Dict[str, ModelPerformance]

    # Individual results, nested by model name and then test id.
    test_results: Dict[str, Dict[str, BenchmarkTestResult]]  # model -> test_id -> result

    # Cross-model summary statistics.
    summary: Dict[str, Any]

    # Human-readable error messages collected during the run.
    errors: List[str] = field(default_factory=list)
38
+
39
+
40
class BenchmarkRunner:
    """Main benchmark runner for evaluating models across test suites.

    Combines a ``TestDiscovery`` (finds tests below ``benchmark_dir``) with a
    ``TestExecutor`` (runs a single test against a single model) and folds the
    individual results into per-model performance records plus a summary.
    """

    # Default root directory of the benchmark definitions. ``__init__`` reads it
    # through ``type(self)`` so that overriding it on the class or a subclass
    # changes where newly created runners look for tests.
    benchmark_dir: Path = Path("benchmarks")

    def __init__(self, config: BenchmarkConfig):
        """Initialize benchmark runner.

        Creates ``config.output_dir`` (including parents) if it does not exist.

        Args:
            config: Benchmark configuration
        """
        self.config = config
        self.benchmark_dir = Path(type(self).benchmark_dir)

        # Initialize components
        self.discovery = TestDiscovery(self.benchmark_dir, config.timeout)
        self.executor = TestExecutor(config.output_dir, config.llm_evaluator_model)

        # Create output directory
        self.config.output_dir.mkdir(parents=True, exist_ok=True)

    def discover_tests(
        self, categories: Optional[List[str]] = None, agent_path: Optional[Path] = None
    ) -> List[BenchmarkTest]:
        """Discover benchmark tests.

        Args:
            categories: Categories to search (e.g., ["basic", "tools"]).
                Defaults to ``config.categories`` when None.
            agent_path: Specific agent file to test

        Returns:
            List of discovered tests
        """
        if categories is None:
            categories = self.config.categories

        return self.discovery.discover_tests(categories, agent_path)

    async def run_benchmark(
        self,
        models: Optional[List[str]] = None,
        categories: Optional[List[str]] = None,
        test_filter: Optional[str] = None,
        agent_path: Optional[Path] = None,
    ) -> BenchmarkResult:
        """Run benchmark suite against specified models.

        A failure while evaluating one model is recorded in the result's
        ``errors`` list and does not abort the remaining models.

        Args:
            models: List of models to test. Defaults to ``config.models``.
            categories: Test categories to run. Defaults to ``config.categories``.
            test_filter: Case-insensitive substring matched against test name
                and test ID; applied to category discovery only.
            agent_path: Specific agent file to test (takes precedence over
                ``categories``)

        Returns:
            Benchmark results

        Raises:
            ValueError: If no models or tests found
        """
        start_time = datetime.now()

        # Validate models
        if models is None:
            models = self.config.models
        if not models:
            raise ValueError("No models specified for benchmarking")

        # Discover tests. Going through self.discover_tests (rather than the
        # discovery object directly) makes a missing ``categories`` argument
        # fall back to config.categories instead of a hard-coded default.
        if agent_path:
            tests = self.discover_tests(agent_path=agent_path)
        else:
            tests = self.discover_tests(categories)
            if test_filter:
                needle = test_filter.lower()
                tests = [t for t in tests if needle in t.name.lower() or needle in t.test_id.lower()]

        if not tests:
            raise ValueError("No tests found matching criteria")

        print(f"Running {len(tests)} tests across {len(models)} models...")

        # Run tests for each model
        model_performances: Dict[str, ModelPerformance] = {}
        # Pre-seed every model so a model that fails outright still has an entry.
        test_results: Dict[str, Dict[str, BenchmarkTestResult]] = {model: {} for model in models}
        errors: List[str] = []

        for model_name in models:
            print(f"\nEvaluating model: {model_name}")
            try:
                model_perf, model_test_results, model_errors = await self._run_model_tests(model_name, tests)
                model_performances[model_name] = model_perf
                test_results[model_name] = model_test_results
                errors.extend(model_errors)
            except Exception as e:
                error_msg = f"Failed to evaluate model {model_name}: {e}"
                errors.append(error_msg)
                print(f"Error: {error_msg}")

        end_time = datetime.now()
        total_duration = (end_time - start_time).total_seconds()

        # Generate summary
        summary = self._generate_summary(model_performances, test_results)

        return BenchmarkResult(
            config=self.config,
            start_time=start_time,
            end_time=end_time,
            total_duration=total_duration,
            model_performances=model_performances,
            test_results=test_results,
            summary=summary,
            errors=errors,
        )

    async def _run_model_tests(
        self, model_name: str, tests: List[BenchmarkTest]
    ) -> tuple[ModelPerformance, Dict[str, BenchmarkTestResult], List[str]]:
        """Run all tests for a single model.

        Tests run one after another. A test whose execution raises is recorded
        as a failed result with zero duration, tokens and cost; it still counts
        towards ``total_tests`` and therefore lowers the averages.

        Args:
            model_name: Model to test
            tests: List of tests to run

        Returns:
            Tuple of (model_performance, test_results, errors)
        """
        model_test_results: Dict[str, BenchmarkTestResult] = {}
        model_errors: List[str] = []

        # Aggregate metrics
        total_tests = len(tests)
        passed_tests = 0
        total_duration = 0.0
        total_tokens = 0
        total_cost = 0.0
        total_steps = 0

        for test in tests:
            print(f"  Running test: {test.test_id}")

            try:
                test_result = await self.executor.run_test(model_name, test)
                model_test_results[test.test_id] = test_result

                # A result can carry an error message without raising.
                if test_result.error:
                    model_errors.append(f"Test {test.test_id} for model {model_name}: {test_result.error}")

                if test_result.passed:
                    passed_tests += 1

                total_duration += test_result.duration
                total_tokens += test_result.token_usage.get("total", 0)
                total_cost += test_result.cost
                total_steps += test_result.steps_taken

            except Exception as e:
                error_msg = f"Test {test.test_id} for model {model_name} failed: {e}"
                model_errors.append(error_msg)
                print(f"    Error: {error_msg}")

                # Create failed test result
                model_test_results[test.test_id] = BenchmarkTestResult(
                    test_id=test.test_id,
                    model=model_name,
                    passed=False,
                    score=0.0,
                    duration=0.0,
                    output="",
                    expected_output=test.expected_output or "",
                    error=str(e),
                    token_usage={},
                    cost=0.0,
                    metrics={},
                )

        # Calculate overall metrics (guarding against an empty test list)
        accuracy = passed_tests / total_tests if total_tests > 0 else 0.0
        avg_duration = total_duration / total_tests if total_tests > 0 else 0.0
        avg_steps = total_steps / total_tests if total_tests > 0 else 0.0

        model_performance = ModelPerformance(
            model=model_name,
            total_tests=total_tests,
            passed_tests=passed_tests,
            accuracy=accuracy,
            average_duration=avg_duration,
            total_duration=total_duration,
            total_tokens=total_tokens,
            total_cost=total_cost,
            scores_by_category={},  # Will be calculated in summary
            average_steps=avg_steps,
        )

        return model_performance, model_test_results, model_errors

    def _generate_summary(
        self,
        model_performances: Dict[str, ModelPerformance],
        test_results: Dict[str, Dict[str, BenchmarkTestResult]],
    ) -> Dict[str, Any]:
        """Generate summary statistics from benchmark results.

        Models are ranked by accuracy, highest first; ties keep the insertion
        order of ``model_performances``. ``category_performance`` is always
        returned empty by this method.

        Args:
            model_performances: Performance data per model
            test_results: Detailed test results per model (currently unused)

        Returns:
            Summary dictionary
        """
        summary: Dict[str, Any] = {
            "total_models": len(model_performances),
            "model_rankings": [],
            "category_performance": {},
            "best_model": None,
            "worst_model": None,
            "average_accuracy": 0.0,
            "total_tests": 0,
        }

        if not model_performances:
            return summary

        # From here on there is at least one model, so the ranking is non-empty.
        ranked_models = sorted(model_performances.items(), key=lambda item: item[1].accuracy, reverse=True)

        summary["model_rankings"] = [
            {
                "model": model,
                "accuracy": perf.accuracy,
                "avg_duration": perf.average_duration,
                "total_cost": perf.total_cost,
            }
            for model, perf in ranked_models
        ]

        summary["best_model"] = ranked_models[0][0]
        summary["worst_model"] = ranked_models[-1][0]

        # Calculate average accuracy
        total_accuracy = sum(perf.accuracy for perf in model_performances.values())
        summary["average_accuracy"] = total_accuracy / len(model_performances)

        # Get total tests from first model (every model runs the same test list)
        first_model = next(iter(model_performances.values()))
        summary["total_tests"] = first_model.total_tests

        return summary
@@ -0,0 +1,377 @@
1
+ """Test discovery and parsing for benchmark framework."""
2
+
3
+ import re
4
+ from dataclasses import dataclass, field
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ from ..utils import parse_yaml_frontmatter
9
+ from .utils import (
10
+ extract_block,
11
+ extract_inline_field,
12
+ extract_prompt_from_markdown,
13
+ parse_bullet_list,
14
+ parse_key_value_block,
15
+ )
16
+
17
+
18
@dataclass
class TestCase:
    """One prompt/expectation pair belonging to a benchmark test."""

    # Identification and input
    name: str
    prompt: str

    # Output expectations
    expected_output: Optional[str] = None
    evaluation: Dict[str, Any] = field(default_factory=dict)
    weight: float = 1.0

    # Planning expectations
    requires_plan: bool = False
    expected_plan_elements: List[str] = field(default_factory=list)
    plan_evaluation: Dict[str, Any] = field(default_factory=dict)

    # LLM evaluation fields
    use_llm_evaluation: bool = False
    llm_evaluation_criteria: str = ""
    llm_evaluation_rubric: Dict[str, Any] = field(default_factory=dict)
34
+
35
+
36
@dataclass
class BenchmarkTest:
    """A benchmark test: an agent file plus the expectations to check it against."""

    # Required identification
    name: str
    agent_path: Path
    test_id: str
    category: str

    # Descriptive metadata and expectations
    description: str = ""
    expected_output: Optional[str] = None
    expected_type: str = "string"
    timeout: int = 60
    weight: float = 1.0
    requires_tools: List[str] = field(default_factory=list)

    # Individual cases and test-wide evaluation settings
    test_cases: List[TestCase] = field(default_factory=list)
    evaluation_criteria: Dict[str, Any] = field(default_factory=dict)

    # Companion ``.test.md`` file when the test came from an agent/test pair;
    # ``TestDiscovery`` sets it to None for single-file benchmarks.
    test_path: Optional[Path] = None

    # LLM evaluation fields
    use_llm_evaluation: bool = False
    llm_evaluation_criteria: str = ""
    llm_evaluation_rubric: Dict[str, Any] = field(default_factory=dict)
57
+
58
+
59
class TestDiscovery:
    """Handles discovery and parsing of benchmark tests.

    Tests live below ``benchmark_dir`` in one sub-directory per category. Each
    ``*.md`` file there (other than ``*.test.md`` and ``README.md``) is an
    agent; an optional sibling ``<agent>.test.md`` holds its test cases.
    """

    # Matches a level-2 markdown heading ("## Name") occupying a whole line.
    # MULTILINE anchors are used instead of literal newlines so that a heading
    # on the first line, on the last line without a trailing newline, or
    # directly after another heading is still recognized.
    _CASE_HEADER_RE = re.compile(r"^## (.+?)[ \t]*$", re.MULTILINE)

    def __init__(self, benchmark_dir: Path, default_timeout: int = 120):
        """Initialize test discovery.

        Args:
            benchmark_dir: Root directory containing benchmark tests
            default_timeout: Default timeout for tests in seconds
        """
        self.benchmark_dir = Path(benchmark_dir)
        self.default_timeout = default_timeout
        # Parsed tests per category name; filled lazily by discover_tests.
        self.test_cache: Dict[str, List[BenchmarkTest]] = {}

    @staticmethod
    def _parse_flag(text: Optional[str]) -> bool:
        """Return True when *text* is "true", "yes" or "1" (case-insensitive, whitespace ignored)."""
        return bool(text) and text.strip().lower() in {"true", "yes", "1"}

    def discover_tests(
        self, categories: Optional[List[str]] = None, agent_path: Optional[Path] = None
    ) -> List[BenchmarkTest]:
        """Discover agent + test.md pairs in specified categories or for a specific agent.

        Args:
            categories: List of category names to search (e.g., ["basic", "tools"]).
                Defaults to ``["basic"]``. Ignored when ``agent_path`` is given.
            agent_path: Specific agent file to test (bypasses category search
                and the category cache)

        Returns:
            List of discovered benchmark tests

        Raises:
            ValueError: If ``agent_path`` is given and its test cannot be parsed
        """
        # Test a specific agent
        if agent_path:
            agent_path = Path(agent_path)
            # The category is the parent directory name only when the agent sits
            # exactly at <benchmark_dir>/<category>/<agent>.md.
            category = agent_path.parent.name if agent_path.parent.parent == self.benchmark_dir else "custom"
            test_file = agent_path.with_suffix(".test.md")

            if test_file.exists():
                return [self._parse_agent_test_pair(agent_path, test_file, category)]
            return [self._parse_benchmark_test(agent_path, category)]

        # Discover tests from categories
        if categories is None:
            categories = ["basic"]

        tests: List[BenchmarkTest] = []
        for category in categories:
            # Check cache first
            if category in self.test_cache:
                tests.extend(self.test_cache[category])
                continue

            # Discover from filesystem
            category_dir = self.benchmark_dir / category
            if not category_dir.exists():
                print(f"Warning: Category directory not found: {category_dir}")
                continue

            category_tests = self._discover_category_tests(category)
            self.test_cache[category] = category_tests
            tests.extend(category_tests)

        return tests

    def _discover_category_tests(self, category: str) -> List[BenchmarkTest]:
        """Discover all tests in a category directory.

        Files that fail to parse are reported on stdout and skipped.

        Args:
            category: Category name

        Returns:
            List of tests in this category, ordered by agent file path
        """
        category_dir = self.benchmark_dir / category
        category_tests: List[BenchmarkTest] = []

        # glob() yields entries in filesystem order; sort for a stable test order.
        for agent_file in sorted(category_dir.glob("*.md")):
            # Skip test definition files and documentation
            if agent_file.name.endswith(".test.md") or agent_file.name == "README.md":
                continue

            test_file = agent_file.with_suffix(".test.md")

            try:
                if test_file.exists():
                    test = self._parse_agent_test_pair(agent_file, test_file, category)
                else:
                    test = self._parse_benchmark_test(agent_file, category)
                category_tests.append(test)
            except Exception as e:
                print(f"Error parsing benchmark test {agent_file}: {e}")

        return category_tests

    def _parse_agent_test_pair(self, agent_path: Path, test_path: Path, category: str) -> BenchmarkTest:
        """Parse an agent + test.md pair into a BenchmarkTest.

        Args:
            agent_path: Path to agent definition file
            test_path: Path to test definition file
            category: Test category

        Returns:
            Parsed benchmark test

        Raises:
            ValueError: If parsing fails or the agent file does not exist
        """
        try:
            test_metadata, markdown_content = parse_yaml_frontmatter(
                test_path.read_text(encoding="utf-8"), "Test file"
            )

            common = self._extract_common_test_fields(
                test_metadata,
                fallback_id=agent_path.stem,
                fallback_description=f"Test for {agent_path.stem}",
            )

            test_cases = self._parse_test_cases(markdown_content)

            # Validate agent file exists
            if not agent_path.exists():
                raise ValueError(f"Agent file not found: {agent_path}")

            # Use first test case's expected output if no global output specified
            expected_output = common["expected_output"]
            if expected_output is None and test_cases:
                expected_output = test_cases[0].expected_output

            return BenchmarkTest(
                name=agent_path.stem,
                agent_path=agent_path,
                test_id=common["test_id"],
                category=category,
                description=common["description"],
                expected_output=expected_output,
                expected_type=common["expected_type"],
                timeout=common["timeout"],
                weight=common["weight"],
                requires_tools=common["requires_tools"],
                test_cases=test_cases,
                evaluation_criteria=common["evaluation_criteria"],
                test_path=test_path,
                use_llm_evaluation=test_metadata.get("use_llm_evaluation", False),
                llm_evaluation_criteria=test_metadata.get("llm_evaluation_criteria", ""),
                llm_evaluation_rubric=test_metadata.get("llm_evaluation_rubric", {}),
            )
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Failed to parse agent test pair {agent_path}/{test_path}: {e}") from e

    def _parse_benchmark_test(self, agent_path: Path, category: str) -> BenchmarkTest:
        """Parse a single benchmark markdown file with embedded expectations.

        The file's frontmatter supplies the expectations; the prompt comes from
        the ``prompt`` metadata key or, failing that, from the markdown body.
        The result carries exactly one test case named ``<test_id>_default``.

        Args:
            agent_path: Path to agent/benchmark file
            category: Test category

        Returns:
            Parsed benchmark test

        Raises:
            ValueError: If parsing fails
        """
        try:
            metadata, markdown_content = parse_yaml_frontmatter(
                agent_path.read_text(encoding="utf-8"), "Benchmark file"
            )

            common = self._extract_common_test_fields(
                metadata,
                fallback_id=agent_path.stem,
                fallback_description=metadata.get("name", ""),
            )

            prompt = metadata.get("prompt") or extract_prompt_from_markdown(markdown_content)

            test_case = TestCase(
                name=f"{common['test_id']}_default",
                prompt=prompt,
                expected_output=common["expected_output"],
                evaluation=metadata.get("evaluation", {}),
                weight=common["weight"],
                requires_plan=metadata.get("requires_plan", False),
                expected_plan_elements=metadata.get("expected_plan_elements", []),
                plan_evaluation=metadata.get("plan_evaluation", {}),
                use_llm_evaluation=metadata.get("use_llm_evaluation", False),
                llm_evaluation_criteria=metadata.get("llm_evaluation_criteria", ""),
                llm_evaluation_rubric=metadata.get("llm_evaluation_rubric", {}),
            )

            return BenchmarkTest(
                name=metadata.get("name", agent_path.stem),
                agent_path=agent_path,
                test_id=common["test_id"],
                category=category,
                description=common["description"],
                expected_output=common["expected_output"],
                expected_type=common["expected_type"],
                timeout=common["timeout"],
                weight=common["weight"],
                requires_tools=common["requires_tools"],
                test_cases=[test_case],
                evaluation_criteria=common["evaluation_criteria"],
                test_path=None,
                use_llm_evaluation=metadata.get("use_llm_evaluation", False),
                llm_evaluation_criteria=metadata.get("llm_evaluation_criteria", ""),
                llm_evaluation_rubric=metadata.get("llm_evaluation_rubric", {}),
            )
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Failed to parse benchmark test {agent_path}: {e}") from e

    def _extract_common_test_fields(
        self,
        metadata: Dict[str, Any],
        fallback_id: str,
        fallback_description: str,
    ) -> Dict[str, Any]:
        """Extract common metadata fields from test definition.

        Args:
            metadata: Parsed YAML frontmatter
            fallback_id: ID to use if not specified
            fallback_description: Description to use if not specified

        Returns:
            Dictionary with the keys ``test_id``, ``description``, ``timeout``,
            ``requires_tools``, ``weight``, ``expected_output``,
            ``expected_type`` and ``evaluation_criteria``
        """
        # A weight that cannot be converted to float falls back to 1.0.
        try:
            weight = float(metadata.get("weight", 1.0))
        except (TypeError, ValueError):
            weight = 1.0

        # "tools" is accepted as an alternative key for "requires_tools".
        requires_tools = metadata.get("requires_tools", metadata.get("tools", []))

        return {
            "test_id": metadata.get("test_id", fallback_id),
            "description": metadata.get("description", fallback_description),
            "timeout": metadata.get("timeout", self.default_timeout),
            "requires_tools": requires_tools,
            "weight": weight,
            "expected_output": metadata.get("expected_output"),
            "expected_type": metadata.get("expected_type", "string"),
            "evaluation_criteria": metadata.get("evaluation_criteria", {}),
        }

    def _parse_test_cases(self, markdown_content: str) -> List[TestCase]:
        """Parse test cases from markdown content.

        Every ``## <name>`` heading starts a test case whose body runs to the
        next such heading or to the end of the content. Text before the first
        heading is ignored, as are sections that yield no test case.

        Args:
            markdown_content: Markdown content containing test cases

        Returns:
            List of parsed test cases
        """
        # With one capture group, split() returns
        # [intro, name1, body1, name2, body2, ...].
        sections = self._CASE_HEADER_RE.split(markdown_content)

        test_cases: List[TestCase] = []
        for raw_name, raw_body in zip(sections[1::2], sections[2::2]):
            test_case = self._parse_single_test_case(raw_name.strip(), raw_body.strip())
            if test_case:
                test_cases.append(test_case)

        return test_cases

    def _parse_single_test_case(self, name: str, content: str) -> Optional[TestCase]:
        """Parse a single test case from markdown content.

        Args:
            name: Test case name
            content: Test case markdown content

        Returns:
            Parsed test case, or None if the content has no "Prompt" field
        """
        # Extract prompt - required
        prompt = extract_inline_field(content, "Prompt")
        if not prompt:
            return None

        return TestCase(
            name=name,
            prompt=prompt,
            expected_output=extract_inline_field(content, "Expected Output"),
            evaluation=parse_key_value_block(extract_block(content, "Evaluation")),
            weight=1.0,
            requires_plan=self._parse_flag(extract_inline_field(content, "Requires Plan")),
            expected_plan_elements=parse_bullet_list(extract_block(content, "Expected Plan Elements")),
            plan_evaluation=parse_key_value_block(extract_block(content, "Plan Evaluation")),
            use_llm_evaluation=self._parse_flag(extract_inline_field(content, "Use LLM Evaluation")),
            llm_evaluation_criteria=extract_inline_field(content, "LLM Evaluation Criteria") or "",
            llm_evaluation_rubric=parse_key_value_block(extract_block(content, "LLM Evaluation Rubric")),
        )