tsugite-cli 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsugite/__init__.py +6 -0
- tsugite/agent_composition.py +163 -0
- tsugite/agent_inheritance.py +479 -0
- tsugite/agent_preparation.py +236 -0
- tsugite/agent_runner/__init__.py +45 -0
- tsugite/agent_runner/helpers.py +106 -0
- tsugite/agent_runner/history_integration.py +248 -0
- tsugite/agent_runner/metrics.py +100 -0
- tsugite/agent_runner/runner.py +1879 -0
- tsugite/agent_runner/validation.py +70 -0
- tsugite/agent_utils.py +167 -0
- tsugite/attachments/__init__.py +65 -0
- tsugite/attachments/auto_context.py +199 -0
- tsugite/attachments/base.py +34 -0
- tsugite/attachments/file.py +51 -0
- tsugite/attachments/inline.py +31 -0
- tsugite/attachments/storage.py +178 -0
- tsugite/attachments/url.py +59 -0
- tsugite/attachments/youtube.py +101 -0
- tsugite/benchmark/__init__.py +62 -0
- tsugite/benchmark/config.py +183 -0
- tsugite/benchmark/core.py +292 -0
- tsugite/benchmark/discovery.py +377 -0
- tsugite/benchmark/evaluators.py +671 -0
- tsugite/benchmark/execution.py +657 -0
- tsugite/benchmark/metrics.py +204 -0
- tsugite/benchmark/reports.py +420 -0
- tsugite/benchmark/utils.py +288 -0
- tsugite/builtin_agents/chat-assistant.md +53 -0
- tsugite/builtin_agents/default.md +140 -0
- tsugite/builtin_agents.py +5 -0
- tsugite/cache.py +195 -0
- tsugite/cli/__init__.py +1042 -0
- tsugite/cli/agents.py +148 -0
- tsugite/cli/attachments.py +193 -0
- tsugite/cli/benchmark.py +663 -0
- tsugite/cli/cache.py +113 -0
- tsugite/cli/config.py +272 -0
- tsugite/cli/helpers.py +534 -0
- tsugite/cli/history.py +193 -0
- tsugite/cli/init.py +387 -0
- tsugite/cli/mcp.py +193 -0
- tsugite/cli/tools.py +419 -0
- tsugite/config.py +204 -0
- tsugite/console.py +48 -0
- tsugite/constants.py +21 -0
- tsugite/core/__init__.py +19 -0
- tsugite/core/agent.py +774 -0
- tsugite/core/executor.py +300 -0
- tsugite/core/memory.py +67 -0
- tsugite/core/tools.py +271 -0
- tsugite/docker_cli.py +270 -0
- tsugite/events/__init__.py +55 -0
- tsugite/events/base.py +46 -0
- tsugite/events/bus.py +62 -0
- tsugite/events/events.py +224 -0
- tsugite/exceptions.py +40 -0
- tsugite/history/__init__.py +29 -0
- tsugite/history/index.py +210 -0
- tsugite/history/models.py +106 -0
- tsugite/history/storage.py +157 -0
- tsugite/mcp_client.py +219 -0
- tsugite/mcp_config.py +174 -0
- tsugite/md_agents.py +751 -0
- tsugite/models.py +257 -0
- tsugite/renderer.py +151 -0
- tsugite/shell_tool_config.py +265 -0
- tsugite/templates/assistant.md +14 -0
- tsugite/tools/__init__.py +265 -0
- tsugite/tools/agents.py +312 -0
- tsugite/tools/edit_strategies.py +393 -0
- tsugite/tools/fs.py +329 -0
- tsugite/tools/http.py +239 -0
- tsugite/tools/interactive.py +430 -0
- tsugite/tools/shell.py +129 -0
- tsugite/tools/shell_tools.py +214 -0
- tsugite/tools/tasks.py +339 -0
- tsugite/tsugite.py +7 -0
- tsugite/ui/__init__.py +46 -0
- tsugite/ui/base.py +638 -0
- tsugite/ui/chat.py +265 -0
- tsugite/ui/chat.tcss +92 -0
- tsugite/ui/chat_history.py +286 -0
- tsugite/ui/helpers.py +102 -0
- tsugite/ui/jsonl.py +125 -0
- tsugite/ui/live_template.py +529 -0
- tsugite/ui/plain.py +419 -0
- tsugite/ui/textual_chat.py +642 -0
- tsugite/ui/textual_handler.py +225 -0
- tsugite/ui/widgets/__init__.py +6 -0
- tsugite/ui/widgets/base_scroll_log.py +27 -0
- tsugite/ui/widgets/message_list.py +121 -0
- tsugite/ui/widgets/thought_log.py +80 -0
- tsugite/ui_context.py +90 -0
- tsugite/utils.py +367 -0
- tsugite/xdg.py +104 -0
- tsugite_cli-0.3.3.dist-info/METADATA +325 -0
- tsugite_cli-0.3.3.dist-info/RECORD +101 -0
- tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
- tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
- tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
tsugite/cli/benchmark.py
ADDED
|
@@ -0,0 +1,663 @@
|
|
|
1
|
+
"""Benchmark CLI commands."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.panel import Panel
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
|
|
11
|
+
# Module-level Rich console; benchmark_command passes it to every action handler.
console = Console()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def benchmark_command(
    action: str = typer.Argument(help="Action: run, view, list"),
    models: Optional[str] = typer.Option(None, "--models", help="Comma-separated list of models to test"),
    categories: Optional[str] = typer.Option(None, "--categories", help="Comma-separated list of categories to test"),
    agent_path: Optional[str] = typer.Option(None, "--agent", help="Path to specific agent file to test"),
    output: Optional[str] = typer.Option(None, "--output", help="Output file for report"),
    format: Optional[str] = typer.Option("markdown", "--format", help="Report format: json, markdown, html, csv"),
    test_filter: Optional[str] = typer.Option(None, "--filter", help="Filter tests by name/ID"),
    parallel: bool = typer.Option(True, "--parallel/--sequential", help="Run tests in parallel"),
    repeat: int = typer.Option(1, "--repeat", help="Number of times to repeat each test"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed test outputs and case breakdowns"),
):
    """Run benchmarks and generate reports."""
    # NOTE(review): if this function is registered as a Typer command, the
    # docstring above is shown as its help text, so it is left unchanged.
    #
    # Dispatch on the positional ``action``; each handler receives the shared
    # module-level console plus only the options it understands.
    if action == "list":
        list_benchmark_action(console=console)
        return

    if action == "view":
        view_benchmark_action(console=console, output=output, verbose=verbose)
        return

    if action != "run":
        # Anything else is a usage error: report it and exit non-zero.
        console.print(f"[red]Unknown action: {action}[/red]")
        console.print("Available actions: run, view, list")
        raise typer.Exit(1)

    run_benchmark_action(
        console=console,
        models=models,
        categories=categories,
        agent_path=agent_path,
        output=output,
        format=format,
        test_filter=test_filter,
        parallel=parallel,
        repeat=repeat,
    )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def parse_benchmark_run_args(
    models: Optional[str], categories: Optional[str], console: Console
) -> tuple[List[str], List[str]]:
    """Parse and validate the models and categories arguments.

    Both values are comma-separated lists. Whitespace around each entry is
    stripped and empty entries (for example from ``"a,,b"``, a trailing comma
    or a whitespace-only value) are discarded, so an empty string is never
    returned as a model or category name.

    Args:
        models: Comma-separated list of models (required)
        categories: Comma-separated list of categories (optional)
        console: Rich console for error output

    Returns:
        Tuple of (model_list, category_list). category_list falls back to
        ["basic"] when no usable category was supplied.

    Raises:
        typer.Exit: If no usable model name was supplied
    """
    import typer

    def split_csv(raw: Optional[str]) -> List[str]:
        # Strip every entry and drop the ones that end up empty.
        stripped = (part.strip() for part in (raw or "").split(","))
        return [part for part in stripped if part]

    # Models are required: fail when nothing usable remains after cleaning.
    model_list = split_csv(models)
    if not model_list:
        console.print("[red]Error: --models is required for run action[/red]")
        raise typer.Exit(1)

    # Categories are optional and default to ["basic"].
    category_list = split_csv(categories) or ["basic"]

    return model_list, category_list
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def print_benchmark_summary(
    console: Console,
    duration: float,
    model_count: int,
    total_tests: int,
    avg_accuracy: float,
    best_model: Optional[str] = None,
) -> None:
    """Write the headline numbers of a benchmark run to the console.

    Args:
        console: Rich console that receives the output
        duration: Total run time in seconds (printed with two decimals)
        model_count: How many models were tested
        total_tests: How many tests were executed
        avg_accuracy: Mean accuracy as a fraction, printed as a percentage
        best_model: Best performing model; the line is omitted when falsy
    """
    emit = console.print
    separator = "=" * 50

    # Banner framed by two separator lines; the first one is preceded by a newline.
    emit("\n" + separator)
    emit("[bold green]Benchmark Complete[/bold green]")
    emit(separator)

    # One line per metric, in a fixed order.
    emit(f"Duration: {duration:.2f}s")
    emit(f"Models: {model_count}")
    emit(f"Tests: {total_tests}")
    emit(f"Average Accuracy: {avg_accuracy:.1%}")

    if not best_model:
        return
    emit(f"Best Model: {best_model}")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def print_model_performance(console: Console, model_performances: Dict[str, Any]) -> None:
    """Write one performance summary per model to the console.

    Each entry may be a plain dict (as loaded from a JSON report) or an object
    exposing the same fields as attributes (as produced by a live run).

    Args:
        console: Rich console that receives the output
        model_performances: Mapping of model name to its performance data;
            nothing is printed when it is empty
    """
    if not model_performances:
        return

    required_keys = ("accuracy", "passed_tests", "total_tests", "average_duration")

    console.print("\n[bold]Model Performance:[/bold]")
    for model_name, perf in model_performances.items():
        if isinstance(perf, dict):
            # The four core metrics must be present; steps and cost are optional.
            accuracy, passed, total, avg_dur = (perf[key] for key in required_keys)
            avg_steps = perf.get("average_steps", 0)
            total_cost = perf.get("total_cost", 0)
        else:
            accuracy, passed, total, avg_dur, avg_steps, total_cost = (
                perf.accuracy,
                perf.passed_tests,
                perf.total_tests,
                perf.average_duration,
                perf.average_steps,
                perf.total_cost,
            )

        lines = [
            f" [cyan]{model_name}[/cyan]:",
            f" Accuracy: {accuracy:.1%} ({passed}/{total} passed)",
            f" Avg Duration: {avg_dur:.2f}s",
            f" Avg Steps: {avg_steps:.1f}",
        ]
        # The cost line is only shown when a positive cost was recorded.
        if total_cost > 0:
            lines.append(f" Total Cost: ${total_cost:.4f}")

        for line in lines:
            console.print(line)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def print_test_results_table(
    console: Console,
    model_name: str,
    test_results: Dict[str, Any],
) -> None:
    """Render the per-test results of one model as a Rich table.

    When there are more than 20 tests only the first 15 and the last 5 are
    listed, with a dimmed placeholder row in between stating how many rows
    were left out.

    Args:
        console: Rich console that receives the output
        model_name: Model whose results are shown (used in the heading)
        test_results: Mapping of test ID to result data; each value is either
            a dict (loaded from JSON) or an object with equivalent attributes
    """
    head_rows, tail_rows, max_rows = 15, 5, 20

    console.print(f"\n[bold]Test Results - {model_name}:[/bold]")

    table = Table(show_header=True, header_style="bold magenta")
    column_specs = (
        ("Test ID", {"style": "dim", "width": 30}),
        ("Category", {"width": 12}),
        ("Status", {"justify": "center", "width": 10}),
        ("Score", {"justify": "right", "width": 8}),
        ("Duration", {"justify": "right", "width": 10}),
        ("Steps", {"justify": "right", "width": 8}),
        ("Cost", {"justify": "right", "width": 10}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    all_items = list(test_results.items())
    truncated = len(all_items) > max_rows
    if truncated:
        rows_to_show = all_items[:head_rows] + all_items[-tail_rows:]
        num_hidden = len(all_items) - max_rows
    else:
        rows_to_show = all_items
        num_hidden = 0

    for position, (test_id, test_result) in enumerate(rows_to_show):
        # The placeholder row sits between the head and tail slices.
        if truncated and position == head_rows:
            table.add_row(f"... {num_hidden} more tests ...", "", "", "", "", "", "", style="dim italic")

        if isinstance(test_result, dict):
            # JSON-loaded results: every field is optional.
            category = test_result.get("category", "unknown")
            passed = test_result.get("passed", False)
            score = test_result.get("score", 0)
            duration = test_result.get("duration", 0)
            steps = test_result.get("steps_taken", 0)
            cost_val = test_result.get("cost", 0)
        else:
            category, passed, score, duration, steps, cost_val = (
                test_result.category,
                test_result.passed,
                test_result.score,
                test_result.duration,
                test_result.steps_taken,
                test_result.cost,
            )

        if passed:
            status, status_style = "✅ PASS", "green"
        else:
            status, status_style = "❌ FAIL", "red"

        table.add_row(
            test_id,
            category,
            f"[{status_style}]{status}[/{status_style}]",
            f"{score:.2f}",
            f"{duration:.2f}s",
            str(steps),
            f"${cost_val:.4f}" if cost_val > 0 else "$0.00",
        )

    console.print(table)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def print_detailed_test_results(console: Console, detailed_results: Dict[str, Dict[str, Any]]) -> None:
    """Print detailed test results with outputs and errors (verbose mode).

    Free-form text taken from the report (actual output, expected output and
    error messages) is escaped before printing, so square brackets in it are
    shown literally instead of being parsed as Rich markup. A missing or null
    "metrics" entry is treated as empty, and non-string output/error values
    are converted with str().

    Args:
        console: Rich console for output
        detailed_results: Dict mapping model names to test results, where each
            test result is a dict
    """
    # Imported locally so this block does not depend on a new module-level import.
    from rich.markup import escape

    def as_text(value: Any) -> str:
        # Falsy values count as "no text"; anything else is shown via str().
        return str(value) if value else ""

    console.print("\n")
    console.rule("[bold cyan]Detailed Test Results[/bold cyan]")

    for tests in detailed_results.values():
        for test_id, test_result in tests.items():
            # Test header
            passed = test_result.get("passed", False)
            status_icon = "✅" if passed else "❌"
            status_text = "PASS" if passed else "FAIL"
            status_color = "green" if passed else "red"

            console.print(f"\n📝 [bold]{test_id}[/bold]")
            cost_val = test_result.get("cost", 0)
            cost_str = f"${cost_val:.4f}" if cost_val > 0 else "$0.00"
            console.print(
                f" Status: [{status_color}]{status_icon} {status_text}[/{status_color}] | "
                f"Score: {test_result.get('score', 0):.2f} | "
                f"Duration: {test_result.get('duration', 0):.2f}s | "
                f"Steps: {test_result.get('steps_taken', 0)} | "
                f"Cost: {cost_str}"
            )

            # Individual test cases (if available); "metrics" may be absent or null
            metrics = test_result.get("metrics") or {}
            case_results = metrics.get("case_results") or []
            if case_results:
                total_cases = metrics.get("total_cases", len(case_results))
                console.print(f"\n Test Cases ({total_cases}):")
                for case in case_results:
                    case_icon = "✅" if case.get("passed", False) else "❌"
                    case_score = case.get("score", 0)
                    case_name = case.get("test_case", "Unknown")
                    console.print(f" • {case_name} {case_icon} (score: {case_score:.2f})")

            # Actual vs Expected Output
            actual = as_text(test_result.get("output", ""))
            expected = as_text(test_result.get("expected_output", ""))

            if actual or expected:
                console.print("\n [bold]Actual Output:[/bold]")
                if actual:
                    # Truncate first, then escape, so an escape sequence is never cut in half
                    display_actual = actual if len(actual) <= 500 else actual[:500] + "\n... (truncated)"
                    console.print(Panel(escape(display_actual), border_style="cyan", padding=(0, 1)))
                else:
                    console.print(" [dim](empty)[/dim]")

                if expected:
                    console.print("\n [bold]Expected Output:[/bold]")
                    display_expected = expected if len(expected) <= 200 else expected[:200] + "..."
                    console.print(f" [yellow]{escape(display_expected)}[/yellow]")

            # Error details
            error = test_result.get("error")
            if error:
                console.print("\n [bold red]Error:[/bold red]")
                console.print(Panel(escape(str(error)), border_style="red", padding=(0, 1)))

            console.print()  # Blank line between tests
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def print_error_summary(console: Console, errors: List[str], verbose: bool = False) -> None:
    """Print error summary.

    Error messages are escaped before printing, so square brackets in them are
    shown literally instead of being parsed as Rich markup.

    Args:
        console: Rich console for output
        errors: List of error messages; nothing is printed when it is empty
        verbose: If True, show all errors; otherwise show first 3
    """
    # Imported locally so this block does not depend on a new module-level import.
    from rich.markup import escape

    if not errors:
        return

    error_count = len(errors)
    console.print(f"\n[bold red]Errors ({error_count}):[/bold red]")

    # In verbose mode, show all errors; otherwise show first 3
    display_errors = errors if verbose else errors[:3]
    for error in display_errors:
        console.print(f" [red]•[/red] {escape(str(error))}")

    # Tell the user how many messages were left out and how to see them.
    remaining = error_count - len(display_errors)
    if remaining > 0:
        console.print(f" [dim]... and {remaining} more errors (use --verbose to see all)[/dim]")
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def load_benchmark_file(file_path: Path, console: Console) -> Dict[str, Any]:
    """Load benchmark JSON file.

    The file is read as UTF-8. The top-level JSON value must be an object,
    because the returned value is typed as a dict.

    Args:
        file_path: Path to benchmark JSON file
        console: Rich console for error output

    Returns:
        Loaded JSON data as dict

    Raises:
        typer.Exit: If the file doesn't exist, can't be read, isn't valid
            JSON, or doesn't contain a JSON object at the top level
    """
    import json

    import typer

    if not file_path.exists():
        console.print(f"[red]File not found: {file_path}[/red]")
        raise typer.Exit(1)

    try:
        with open(file_path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers I/O failures; json.JSONDecodeError and
        # UnicodeDecodeError are both ValueError subclasses.
        console.print(f"[red]Failed to load benchmark data: {e}[/red]")
        raise typer.Exit(1) from e

    if not isinstance(data, dict):
        found = type(data).__name__
        console.print(f"[red]Failed to load benchmark data: expected a JSON object, got {found}[/red]")
        raise typer.Exit(1)

    return data
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def find_latest_benchmark() -> Optional[Path]:
    """Locate the newest ``benchmark_*.json`` file under ``benchmark_results``.

    "Newest" is decided by file modification time. The directory is resolved
    relative to the current working directory.

    Returns:
        Path of the most recently modified benchmark file, or None when the
        directory is missing or holds no matching files
    """
    results_dir = Path("benchmark_results")
    if not results_dir.exists():
        return None

    candidates = list(results_dir.glob("benchmark_*.json"))
    if not candidates:
        return None

    # max() returns the first maximal element, which matches taking index 0
    # after a stable descending sort.
    return max(candidates, key=lambda candidate: candidate.stat().st_mtime)
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
def resolve_benchmark_file(output: Optional[str], console: Console) -> Path:
    """Decide which benchmark result file the 'view' action should open.

    An explicit path is returned as given, without checking that it exists.
    The literal value "latest", or no value at all, selects the newest file
    reported by find_latest_benchmark().

    Args:
        output: User-specified file path, "latest", or None
        console: Rich console for error output

    Returns:
        Path to the benchmark file

    Raises:
        typer.Exit: If the latest file was requested but none exists
    """
    import typer

    wants_explicit_file = bool(output) and output != "latest"
    if wants_explicit_file:
        return Path(output)

    latest = find_latest_benchmark()
    if latest:
        return latest

    console.print("[red]No benchmark results found in benchmark_results/[/red]")
    # The "how to run" hint is only shown when no --output value was given.
    if not output:
        console.print("[dim]Run benchmarks first with: tsugite benchmark run --models <model>[/dim]")
    raise typer.Exit(1)
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def run_benchmark_action(
    console: Console,
    models: Optional[str],
    categories: Optional[str],
    agent_path: Optional[str],
    output: Optional[str],
    format: str,
    test_filter: Optional[str],
    parallel: bool,
    repeat: int,
) -> None:
    """Handle the 'run' benchmark action.

    The report format is validated before any benchmark is executed, so a
    mistyped --format fails immediately instead of after the full run. A JSON
    report is always written to benchmark_results/; the report in the
    requested format is written to ``output`` or, by default, next to it.

    Args:
        console: Rich console for output
        models: Comma-separated list of models
        categories: Comma-separated list of categories
        agent_path: Path to specific agent file
        output: Output file path
        format: Report format (json, markdown, html, csv)
        test_filter: Filter tests by name/ID
        parallel: Run tests in parallel
        repeat: Number of times to repeat each test

    Raises:
        typer.Exit: If the arguments are invalid, the format is unknown, or
            the benchmark run or report generation fails
    """
    import asyncio

    import typer

    from tsugite.benchmark import BenchmarkConfig, BenchmarkRunner
    from tsugite.benchmark.reports import ReportGenerator

    # Parse and validate arguments
    model_list, category_list = parse_benchmark_run_args(models, categories, console)

    # Name of the ReportGenerator method that writes each supported format.
    report_writers = {
        "json": "generate_json_report",
        "markdown": "generate_markdown_report",
        "html": "generate_html_report",
        "csv": "generate_csv_summary",
    }
    if format not in report_writers:
        console.print(f"[red]Unknown format: {format}[/red]")
        raise typer.Exit(1)

    output_dir = Path("benchmark_results")

    # Create config
    config = BenchmarkConfig(
        models=model_list,
        categories=category_list,
        parallel=parallel,
        repeat_count=repeat,
        output_dir=output_dir,
    )

    console.print("[cyan]Running benchmarks...[/cyan]")
    console.print(f"Models: {', '.join(model_list)}")

    if agent_path:
        console.print(f"Agent: {agent_path}")
    else:
        console.print(f"Categories: {', '.join(config.categories)}")

    # Run benchmark
    runner = BenchmarkRunner(config)
    try:
        result = asyncio.run(
            runner.run_benchmark(
                models=model_list,
                categories=category_list,
                test_filter=test_filter,
                agent_path=Path(agent_path) if agent_path else None,
            )
        )

        # Generate reports
        output_dir.mkdir(exist_ok=True)
        timestamp = result.start_time.strftime("%Y%m%d_%H%M%S")
        report_gen = ReportGenerator(result)

        # JSON report (always generated for data)
        json_path = output_dir / f"benchmark_{timestamp}.json"
        report_gen.generate_json_report(json_path)
        console.print(f"[green]JSON report saved: {json_path}[/green]")

        # Main report in requested format
        output_path = Path(output) if output else output_dir / f"benchmark_{timestamp}.{format}"
        if output_path != json_path:
            # Skipped when the target is the JSON file that was just written.
            getattr(report_gen, report_writers[format])(output_path)
        console.print(f"[green]Report saved: {output_path}[/green]")

        # Print summary
        summary = result.summary
        print_benchmark_summary(
            console,
            duration=result.total_duration,
            model_count=len(result.model_performances),
            total_tests=summary.get("total_tests", 0),
            avg_accuracy=summary.get("average_accuracy", 0),
            best_model=summary.get("best_model"),
        )

        # Print model performance
        print_model_performance(console, result.model_performances)

        # Print test results tables
        for model_name in result.model_performances or {}:
            model_tests = result.test_results.get(model_name, {})
            print_test_results_table(console, model_name, model_tests)

        # Print errors
        print_error_summary(console, result.errors, verbose=False)

    except typer.Exit:
        # A deliberate exit must not be reported as a benchmark failure.
        raise
    except Exception as e:
        console.print(f"[red]Benchmark failed: {e}[/red]")
        raise typer.Exit(1) from e
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def view_benchmark_action(
    console: Console,
    output: Optional[str],
    verbose: bool,
) -> None:
    """Handle the 'view' benchmark action: show a stored benchmark report.

    Args:
        console: Rich console that receives the output
        output: File path or "latest"; the file choice is delegated to
            resolve_benchmark_file()
        verbose: Also print the per-test breakdown and list every error
    """
    # Locate the report and read it.
    report_path = resolve_benchmark_file(output, console)
    report = load_benchmark_file(report_path, console)

    separator = "=" * 50

    # Heading: which file is shown and when it was generated.
    console.print("\n[bold cyan]Benchmark Results[/bold cyan]")
    console.print(f"[dim]File: {report_path}[/dim]")
    console.print(f"[dim]Generated: {report.get('generated_at', 'Unknown')}[/dim]\n")

    console.print(separator)
    console.print("[bold green]Summary[/bold green]")
    console.print(separator)

    # Headline numbers come from two sections of the report.
    run_info = report.get("benchmark_info", {})
    totals = report.get("summary", {})
    print_benchmark_summary(
        console,
        duration=run_info.get("total_duration", 0),
        model_count=len(run_info.get("models_tested", [])),
        total_tests=run_info.get("total_tests", 0),
        avg_accuracy=totals.get("average_accuracy", 0),
        best_model=totals.get("best_model"),
    )

    # Per-model aggregate metrics.
    print_model_performance(console, report.get("model_performances", {}))

    # One results table per model.
    per_model_results = report.get("detailed_results", {})
    for model_name, model_tests in per_model_results.items():
        print_test_results_table(console, model_name, model_tests)

    # Full per-test breakdown only in verbose mode.
    if verbose:
        print_detailed_test_results(console, per_model_results)

    # Error summary; verbose controls whether every error is listed.
    print_error_summary(console, report.get("errors", []), verbose)
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
def list_benchmark_action(console: Console) -> None:
    """Handle the 'list' benchmark action: tabulate stored benchmark reports.

    Reads every ``benchmark_*.json`` file under ``benchmark_results`` and
    prints one table row per file, newest first by modification time. A file
    that cannot be read or parsed gets an "Error" row instead of aborting the
    listing.

    Args:
        console: Rich console that receives the output

    Raises:
        typer.Exit: If the results directory is missing or holds no reports
    """
    import json
    from datetime import datetime

    import typer

    run_hint = "[dim]Run benchmarks first with: tsugite benchmark run --models <model>[/dim]"

    results_dir = Path("benchmark_results")
    if not results_dir.exists():
        console.print("[red]No benchmark results directory found[/red]")
        console.print(run_hint)
        raise typer.Exit(1)

    # Collect the reports; bail out when there are none.
    report_files = list(results_dir.glob("benchmark_*.json"))
    if not report_files:
        console.print("[red]No benchmark results found in benchmark_results/[/red]")
        console.print(run_hint)
        raise typer.Exit(1)

    # Newest first, judged by modification time.
    report_files.sort(key=lambda path: path.stat().st_mtime, reverse=True)

    console.print("\n[bold cyan]Benchmark Results[/bold cyan]")
    console.print(f"Found {len(report_files)} result file(s) in {results_dir}/\n")

    table = Table(show_header=True, header_style="bold magenta")
    column_specs = (
        ("", {"width": 3}),  # Marker for latest
        ("Timestamp", {"style": "cyan", "width": 20}),
        ("Models", {"width": 15}),
        ("Tests", {"justify": "right", "width": 7}),
        ("Accuracy", {"justify": "right", "width": 10}),
        ("Duration", {"justify": "right", "width": 10}),
        ("File", {"style": "dim", "width": 30}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    for index, file_path in enumerate(report_files):
        try:
            with open(file_path) as handle:
                report = json.load(handle)

            run_info = report.get("benchmark_info", {})
            totals = report.get("summary", {})

            # Start time is stored as an ISO string; reformat it for display.
            started = run_info.get("start_time", "")
            timestamp = (
                datetime.fromisoformat(started).strftime("%Y-%m-%d %H:%M:%S") if started else "Unknown"
            )

            # Keep the model list within the 15-character column.
            tested = run_info.get("models_tested", [])
            model_str = ", ".join(tested) if tested else "Unknown"
            if len(model_str) > 15:
                model_str = model_str[:12] + "..."

            tests = run_info.get("total_tests", 0)
            accuracy = totals.get("average_accuracy", 0)
            duration = run_info.get("total_duration", 0)

            # Only the first (most recent) row carries the arrow marker.
            table.add_row(
                "[green]→[/green]" if index == 0 else "",
                timestamp,
                model_str,
                str(tests),
                f"{accuracy:.1%}",
                f"{duration:.1f}s",
                file_path.name,
            )
        except Exception:
            # Best effort: an unreadable or malformed file is listed as an error row.
            table.add_row("", "Error", "-", "-", "-", "-", f"[red]{file_path.name}[/red]")

    console.print(table)
    console.print("\n[dim]View details: tsugite benchmark view [file|latest][/dim]")
    console.print("[dim]Most recent run is marked with →[/dim]")
|