tsugite-cli 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. tsugite/__init__.py +6 -0
  2. tsugite/agent_composition.py +163 -0
  3. tsugite/agent_inheritance.py +479 -0
  4. tsugite/agent_preparation.py +236 -0
  5. tsugite/agent_runner/__init__.py +45 -0
  6. tsugite/agent_runner/helpers.py +106 -0
  7. tsugite/agent_runner/history_integration.py +248 -0
  8. tsugite/agent_runner/metrics.py +100 -0
  9. tsugite/agent_runner/runner.py +1879 -0
  10. tsugite/agent_runner/validation.py +70 -0
  11. tsugite/agent_utils.py +167 -0
  12. tsugite/attachments/__init__.py +65 -0
  13. tsugite/attachments/auto_context.py +199 -0
  14. tsugite/attachments/base.py +34 -0
  15. tsugite/attachments/file.py +51 -0
  16. tsugite/attachments/inline.py +31 -0
  17. tsugite/attachments/storage.py +178 -0
  18. tsugite/attachments/url.py +59 -0
  19. tsugite/attachments/youtube.py +101 -0
  20. tsugite/benchmark/__init__.py +62 -0
  21. tsugite/benchmark/config.py +183 -0
  22. tsugite/benchmark/core.py +292 -0
  23. tsugite/benchmark/discovery.py +377 -0
  24. tsugite/benchmark/evaluators.py +671 -0
  25. tsugite/benchmark/execution.py +657 -0
  26. tsugite/benchmark/metrics.py +204 -0
  27. tsugite/benchmark/reports.py +420 -0
  28. tsugite/benchmark/utils.py +288 -0
  29. tsugite/builtin_agents/chat-assistant.md +53 -0
  30. tsugite/builtin_agents/default.md +140 -0
  31. tsugite/builtin_agents.py +5 -0
  32. tsugite/cache.py +195 -0
  33. tsugite/cli/__init__.py +1042 -0
  34. tsugite/cli/agents.py +148 -0
  35. tsugite/cli/attachments.py +193 -0
  36. tsugite/cli/benchmark.py +663 -0
  37. tsugite/cli/cache.py +113 -0
  38. tsugite/cli/config.py +272 -0
  39. tsugite/cli/helpers.py +534 -0
  40. tsugite/cli/history.py +193 -0
  41. tsugite/cli/init.py +387 -0
  42. tsugite/cli/mcp.py +193 -0
  43. tsugite/cli/tools.py +419 -0
  44. tsugite/config.py +204 -0
  45. tsugite/console.py +48 -0
  46. tsugite/constants.py +21 -0
  47. tsugite/core/__init__.py +19 -0
  48. tsugite/core/agent.py +774 -0
  49. tsugite/core/executor.py +300 -0
  50. tsugite/core/memory.py +67 -0
  51. tsugite/core/tools.py +271 -0
  52. tsugite/docker_cli.py +270 -0
  53. tsugite/events/__init__.py +55 -0
  54. tsugite/events/base.py +46 -0
  55. tsugite/events/bus.py +62 -0
  56. tsugite/events/events.py +224 -0
  57. tsugite/exceptions.py +40 -0
  58. tsugite/history/__init__.py +29 -0
  59. tsugite/history/index.py +210 -0
  60. tsugite/history/models.py +106 -0
  61. tsugite/history/storage.py +157 -0
  62. tsugite/mcp_client.py +219 -0
  63. tsugite/mcp_config.py +174 -0
  64. tsugite/md_agents.py +751 -0
  65. tsugite/models.py +257 -0
  66. tsugite/renderer.py +151 -0
  67. tsugite/shell_tool_config.py +265 -0
  68. tsugite/templates/assistant.md +14 -0
  69. tsugite/tools/__init__.py +265 -0
  70. tsugite/tools/agents.py +312 -0
  71. tsugite/tools/edit_strategies.py +393 -0
  72. tsugite/tools/fs.py +329 -0
  73. tsugite/tools/http.py +239 -0
  74. tsugite/tools/interactive.py +430 -0
  75. tsugite/tools/shell.py +129 -0
  76. tsugite/tools/shell_tools.py +214 -0
  77. tsugite/tools/tasks.py +339 -0
  78. tsugite/tsugite.py +7 -0
  79. tsugite/ui/__init__.py +46 -0
  80. tsugite/ui/base.py +638 -0
  81. tsugite/ui/chat.py +265 -0
  82. tsugite/ui/chat.tcss +92 -0
  83. tsugite/ui/chat_history.py +286 -0
  84. tsugite/ui/helpers.py +102 -0
  85. tsugite/ui/jsonl.py +125 -0
  86. tsugite/ui/live_template.py +529 -0
  87. tsugite/ui/plain.py +419 -0
  88. tsugite/ui/textual_chat.py +642 -0
  89. tsugite/ui/textual_handler.py +225 -0
  90. tsugite/ui/widgets/__init__.py +6 -0
  91. tsugite/ui/widgets/base_scroll_log.py +27 -0
  92. tsugite/ui/widgets/message_list.py +121 -0
  93. tsugite/ui/widgets/thought_log.py +80 -0
  94. tsugite/ui_context.py +90 -0
  95. tsugite/utils.py +367 -0
  96. tsugite/xdg.py +104 -0
  97. tsugite_cli-0.3.3.dist-info/METADATA +325 -0
  98. tsugite_cli-0.3.3.dist-info/RECORD +101 -0
  99. tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
  100. tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
  101. tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
@@ -0,0 +1,663 @@
1
+ """Benchmark CLI commands."""
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Dict, List, Optional
5
+
6
+ import typer
7
+ from rich.console import Console
8
+ from rich.panel import Panel
9
+ from rich.table import Table
10
+
11
+ console = Console()
12
+
13
+
14
def benchmark_command(
    action: str = typer.Argument(help="Action: run, view, list"),
    models: Optional[str] = typer.Option(None, "--models", help="Comma-separated list of models to test"),
    categories: Optional[str] = typer.Option(None, "--categories", help="Comma-separated list of categories to test"),
    agent_path: Optional[str] = typer.Option(None, "--agent", help="Path to specific agent file to test"),
    output: Optional[str] = typer.Option(None, "--output", help="Output file for report"),
    format: Optional[str] = typer.Option("markdown", "--format", help="Report format: json, markdown, html, csv"),
    test_filter: Optional[str] = typer.Option(None, "--filter", help="Filter tests by name/ID"),
    parallel: bool = typer.Option(True, "--parallel/--sequential", help="Run tests in parallel"),
    repeat: int = typer.Option(1, "--repeat", help="Number of times to repeat each test"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed test outputs and case breakdowns"),
):
    """Run benchmarks and generate reports."""
    # NOTE: the docstring above is kept verbatim; Typer may surface it as the
    # command's help text, so it is treated as user-facing output.
    #
    # Dispatch on ``action``: each recognised action hands off to its handler
    # together with the module-level console, then returns.
    if action == "list":
        list_benchmark_action(console=console)
        return

    if action == "view":
        # ``output`` doubles as the file to view (a path or "latest").
        view_benchmark_action(console=console, output=output, verbose=verbose)
        return

    if action == "run":
        run_benchmark_action(
            console=console,
            models=models,
            categories=categories,
            agent_path=agent_path,
            output=output,
            format=format,
            test_filter=test_filter,
            parallel=parallel,
            repeat=repeat,
        )
        return

    # Anything else is a usage error: report it and exit with status 1.
    console.print(f"[red]Unknown action: {action}[/red]")
    console.print("Available actions: run, view, list")
    raise typer.Exit(1)
47
+
48
+
49
def parse_benchmark_run_args(
    models: Optional[str], categories: Optional[str], console: Console
) -> tuple[List[str], List[str]]:
    """Parse and validate the models and categories arguments of a run.

    Both arguments are comma-separated strings. Entries are stripped of
    surrounding whitespace, and entries that are empty after stripping
    (e.g. from ``"a,,b"`` or a trailing comma) are dropped so that no blank
    model or category name is passed on.

    Args:
        models: Comma-separated list of models (required)
        categories: Comma-separated list of categories (optional)
        console: Rich console for error output

    Returns:
        Tuple of (model_list, category_list). ``category_list`` is
        ``["basic"]`` when no category is supplied.

    Raises:
        typer.Exit: If no model name is supplied
    """

    def _split(raw: Optional[str]) -> List[str]:
        # Split on commas, trim each entry, and discard blank entries.
        if not raw:
            return []
        return [entry.strip() for entry in raw.split(",") if entry.strip()]

    # Models are required; a value made only of commas/whitespace counts as missing.
    model_list = _split(models)
    if not model_list:
        import typer

        console.print("[red]Error: --models is required for run action[/red]")
        raise typer.Exit(1)

    # Categories are optional and fall back to the "basic" category.
    category_list = _split(categories) or ["basic"]

    return model_list, category_list
80
+
81
+
82
def print_benchmark_summary(
    console: Console,
    duration: float,
    model_count: int,
    total_tests: int,
    avg_accuracy: float,
    best_model: Optional[str] = None,
) -> None:
    """Print the headline figures of a benchmark run.

    Args:
        console: Rich console for output
        duration: Total duration in seconds
        model_count: Number of models tested
        total_tests: Total number of tests
        avg_accuracy: Average accuracy (0.0-1.0), rendered as a percentage
        best_model: Name of best performing model; the line is omitted when falsy
    """
    separator = "=" * 50

    # Banner followed by one line per metric, each printed separately.
    summary_lines = [
        "\n" + separator,
        "[bold green]Benchmark Complete[/bold green]",
        separator,
        f"Duration: {duration:.2f}s",
        f"Models: {model_count}",
        f"Tests: {total_tests}",
        f"Average Accuracy: {avg_accuracy:.1%}",
    ]
    if best_model:
        summary_lines.append(f"Best Model: {best_model}")

    for text in summary_lines:
        console.print(text)
109
+
110
+
111
def print_model_performance(console: Console, model_performances: Dict[str, Any]) -> None:
    """Print accuracy, duration, step and cost figures for each model.

    Args:
        console: Rich console for output
        model_performances: Dict mapping model names to performance data,
            either as plain dicts (loaded from JSON) or as objects exposing
            the same fields as attributes. Nothing is printed when empty.
    """
    if not model_performances:
        return

    console.print("\n[bold]Model Performance:[/bold]")

    for name, perf in model_performances.items():
        if isinstance(perf, dict):
            # JSON form: the first four keys are mandatory, the rest default to 0.
            stats = (
                perf["accuracy"],
                perf["passed_tests"],
                perf["total_tests"],
                perf["average_duration"],
                perf.get("average_steps", 0),
                perf.get("total_cost", 0),
            )
        else:
            # Object form: every field is read as an attribute.
            stats = (
                perf.accuracy,
                perf.passed_tests,
                perf.total_tests,
                perf.average_duration,
                perf.average_steps,
                perf.total_cost,
            )
        accuracy, passed, total, avg_dur, avg_steps, total_cost = stats

        console.print(f" [cyan]{name}[/cyan]:")
        console.print(f" Accuracy: {accuracy:.1%} ({passed}/{total} passed)")
        console.print(f" Avg Duration: {avg_dur:.2f}s")
        console.print(f" Avg Steps: {avg_steps:.1f}")
        # The cost line only appears when a positive cost was recorded.
        if total_cost > 0:
            console.print(f" Total Cost: ${total_cost:.4f}")
145
+
146
+
147
def print_test_results_table(
    console: Console,
    model_name: str,
    test_results: Dict[str, Any],
) -> None:
    """Render one model's test results as a Rich table.

    At most 20 rows of results are shown: when there are more, the first 15
    and the last 5 are kept and a placeholder row reports how many were hidden.

    Args:
        console: Rich console for output
        model_name: Name of the model
        test_results: Dict mapping test IDs to test result data, either as
            plain dicts (loaded from JSON) or as objects with matching attributes
    """
    console.print(f"\n[bold]Test Results - {model_name}:[/bold]")

    table = Table(show_header=True, header_style="bold magenta")
    column_specs = (
        ("Test ID", {"style": "dim", "width": 30}),
        ("Category", {"width": 12}),
        ("Status", {"justify": "center", "width": 10}),
        ("Score", {"justify": "right", "width": 8}),
        ("Duration", {"justify": "right", "width": 10}),
        ("Steps", {"justify": "right", "width": 8}),
        ("Cost", {"justify": "right", "width": 10}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    entries = list(test_results.items())

    # Keep the head and tail of long result sets and remember how many were cut.
    truncated = len(entries) > 20
    if truncated:
        shown = entries[:15] + entries[-5:]
        hidden_count = len(entries) - 20
    else:
        shown = entries
        hidden_count = 0

    for position, (test_id, outcome) in enumerate(shown):
        # The placeholder row sits between the head (15 rows) and the tail.
        if truncated and position == 15:
            table.add_row(f"... {hidden_count} more tests ...", "", "", "", "", "", "", style="dim italic")

        if isinstance(outcome, dict):
            # JSON form: missing keys fall back to neutral defaults.
            category = outcome.get("category", "unknown")
            passed = outcome.get("passed", False)
            score = outcome.get("score", 0)
            duration = outcome.get("duration", 0)
            steps = outcome.get("steps_taken", 0)
            cost_val = outcome.get("cost", 0)
        else:
            # Object form: fields are read as attributes.
            category = outcome.category
            passed = outcome.passed
            score = outcome.score
            duration = outcome.duration
            steps = outcome.steps_taken
            cost_val = outcome.cost

        if passed:
            status, status_style = "✅ PASS", "green"
        else:
            status, status_style = "❌ FAIL", "red"

        table.add_row(
            test_id,
            category,
            f"[{status_style}]{status}[/{status_style}]",
            f"{score:.2f}",
            f"{duration:.2f}s",
            str(steps),
            f"${cost_val:.4f}" if cost_val > 0 else "$0.00",
        )

    console.print(table)
221
+
222
+
223
def print_detailed_test_results(console: Console, detailed_results: Dict[str, Dict[str, Any]]) -> None:
    """Print a per-test breakdown with outputs and errors (verbose mode).

    For every test this prints a status line, the individual case results
    when present, the actual output (cut at 500 characters), the expected
    output (cut at 200 characters) and any recorded error.

    Args:
        console: Rich console for output
        detailed_results: Dict mapping model names to test results, where
            each test result is a dict
    """
    console.print("\n")
    console.rule("[bold cyan]Detailed Test Results[/bold cyan]")

    # Model names are not shown here; only each model's tests are walked.
    for tests in detailed_results.values():
        for test_id, outcome in tests.items():
            # Header: test id followed by a one-line status summary.
            if outcome.get("passed", False):
                status_icon, status_text, status_color = "✅", "PASS", "green"
            else:
                status_icon, status_text, status_color = "❌", "FAIL", "red"

            console.print(f"\n📝 [bold]{test_id}[/bold]")
            cost_val = outcome.get("cost", 0)
            cost_str = f"${cost_val:.4f}" if cost_val > 0 else "$0.00"
            console.print(
                f" Status: [{status_color}]{status_icon} {status_text}[/{status_color}] | "
                f"Score: {outcome.get('score', 0):.2f} | "
                f"Duration: {outcome.get('duration', 0):.2f}s | "
                f"Steps: {outcome.get('steps_taken', 0)} | "
                f"Cost: {cost_str}"
            )

            # Individual test cases, listed only when the metrics carry them.
            metrics = outcome.get("metrics", {})
            case_results = metrics.get("case_results", [])
            if case_results:
                total_cases = metrics.get("total_cases", len(case_results))
                console.print(f"\n Test Cases ({total_cases}):")
                for case in case_results:
                    case_icon = "✅" if case.get("passed", False) else "❌"
                    case_score = case.get("score", 0)
                    case_name = case.get("test_case", "Unknown")
                    console.print(f" • {case_name} {case_icon} (score: {case_score:.2f})")

            # Actual vs expected output; the section is skipped when both are empty.
            actual = outcome.get("output", "")
            expected = outcome.get("expected_output", "")

            if actual or expected:
                console.print("\n [bold]Actual Output:[/bold]")
                if not actual:
                    console.print(" [dim](empty)[/dim]")
                else:
                    # Long outputs are cut so a single test cannot flood the terminal.
                    shown_actual = actual if len(actual) <= 500 else actual[:500] + "\n... (truncated)"
                    console.print(Panel(shown_actual, border_style="cyan", padding=(0, 1)))

            if expected:
                console.print("\n [bold]Expected Output:[/bold]")
                shown_expected = expected if len(expected) <= 200 else expected[:200] + "..."
                console.print(f" [yellow]{shown_expected}[/yellow]")

            # Error details, if the test recorded one.
            error = outcome.get("error")
            if error:
                console.print("\n [bold red]Error:[/bold red]")
                console.print(Panel(error, border_style="red", padding=(0, 1)))

            console.print()  # Blank line between tests
292
+
293
+
294
def print_error_summary(console: Console, errors: List[str], verbose: bool = False) -> None:
    """Print the errors collected during a benchmark run.

    Args:
        console: Rich console for output
        errors: List of error messages; nothing is printed when empty
        verbose: If True, show all errors; otherwise show the first 3 and a
            note with the number of errors left out
    """
    if not errors:
        return

    console.print(f"\n[bold red]Errors ({len(errors)}):[/bold red]")

    shown = errors if verbose else errors[:3]
    for message in shown:
        console.print(f" [red]•[/red] {message}")

    # Non-zero only in non-verbose mode with more than three errors.
    hidden = len(errors) - len(shown)
    if hidden > 0:
        console.print(f" [dim]... and {hidden} more errors (use --verbose to see all)[/dim]")
317
+
318
+
319
def load_benchmark_file(file_path: Path, console: Console) -> Dict[str, Any]:
    """Load a benchmark JSON file.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding, and the top-level JSON value must be an object so that the
    declared dict return type holds for callers.

    Args:
        file_path: Path to benchmark JSON file
        console: Rich console for error output

    Returns:
        Loaded JSON data as dict

    Raises:
        typer.Exit: If the file doesn't exist, can't be read or parsed, or
            does not contain a JSON object at the top level
    """
    import json

    cause = None
    if not file_path.exists():
        problem = f"[red]File not found: {file_path}[/red]"
    else:
        try:
            with open(file_path, encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            # Broad on purpose: any read/parse failure becomes a clean CLI exit.
            problem = f"[red]Failed to load benchmark data: {e}[/red]"
            cause = e
        else:
            if isinstance(data, dict):
                return data
            problem = "[red]Failed to load benchmark data: expected a JSON object at the top level[/red]"

    # Only failure paths reach this point: report the problem and exit with status 1.
    import typer

    console.print(problem)
    raise typer.Exit(1) from cause
346
+
347
+
348
def find_latest_benchmark() -> Optional[Path]:
    """Locate the newest benchmark JSON file by modification time.

    Looks for ``benchmark_*.json`` files in the ``benchmark_results``
    directory, resolved relative to the current working directory.

    Returns:
        Path to latest benchmark file, or None if the directory is missing
        or holds no matching file
    """
    results_dir = Path("benchmark_results")
    if not results_dir.exists():
        return None

    # The largest modification time wins; ``default`` covers an empty directory.
    return max(
        results_dir.glob("benchmark_*.json"),
        key=lambda candidate: candidate.stat().st_mtime,
        default=None,
    )
366
+
367
+
368
def resolve_benchmark_file(output: Optional[str], console: Console) -> Path:
    """Decide which benchmark file the ``view`` action should open.

    Args:
        output: User-specified file path, or "latest"; when empty the
            latest benchmark file is used as well
        console: Rich console for error output

    Returns:
        Path to benchmark file

    Raises:
        typer.Exit: If the latest file is requested but none is found
    """
    # An explicit path is taken as-is; its existence is not checked here.
    if output and output != "latest":
        return Path(output)

    latest = find_latest_benchmark()
    if latest:
        return latest

    import typer

    console.print("[red]No benchmark results found in benchmark_results/[/red]")
    # The "how to run" hint is only shown when no file was asked for at all.
    if not output:
        console.print("[dim]Run benchmarks first with: tsugite benchmark run --models <model>[/dim]")
    raise typer.Exit(1)
401
+
402
+
403
def run_benchmark_action(
    console: Console,
    models: Optional[str],
    categories: Optional[str],
    agent_path: Optional[str],
    output: Optional[str],
    format: str,
    test_filter: Optional[str],
    parallel: bool,
    repeat: int,
) -> None:
    """Handle the 'run' benchmark action.

    Runs the benchmark, always writes a JSON report into
    ``benchmark_results/``, writes the main report in the requested format,
    then prints the summary, per-model performance, per-model result tables
    and the first errors.

    Args:
        console: Rich console for output
        models: Comma-separated list of models
        categories: Comma-separated list of categories
        agent_path: Path to specific agent file
        output: Output file path; defaults to a timestamped file in
            ``benchmark_results/`` with the format as its extension
        format: Report format (json, markdown, html, csv)
        test_filter: Filter tests by name/ID
        parallel: Run tests in parallel
        repeat: Number of times to repeat each test

    Raises:
        typer.Exit: If the arguments are invalid, the format is unknown, or
            the benchmark or report generation fails
    """
    import asyncio

    import typer

    from tsugite.benchmark import BenchmarkConfig, BenchmarkRunner
    from tsugite.benchmark.reports import ReportGenerator

    # Parse and validate arguments
    model_list, category_list = parse_benchmark_run_args(models, categories, console)

    # Reject an unsupported format up front: this avoids running the whole
    # benchmark first, and keeps the exit out of the broad handler below.
    supported_formats = ("json", "markdown", "html", "csv")
    if format not in supported_formats:
        console.print(f"[red]Unknown format: {format}[/red]")
        raise typer.Exit(1)

    output_dir = Path("benchmark_results")

    # Create config
    config = BenchmarkConfig(
        models=model_list,
        categories=category_list,
        parallel=parallel,
        repeat_count=repeat,
        output_dir=output_dir,
    )

    console.print("[cyan]Running benchmarks...[/cyan]")
    console.print(f"Models: {', '.join(model_list)}")

    if agent_path:
        console.print(f"Agent: {agent_path}")
    else:
        console.print(f"Categories: {', '.join(config.categories)}")

    # Run benchmark
    runner = BenchmarkRunner(config)
    try:
        result = asyncio.run(
            runner.run_benchmark(
                models=model_list,
                categories=category_list,
                test_filter=test_filter,
                agent_path=Path(agent_path) if agent_path else None,
            )
        )

        # Generate reports
        output_dir.mkdir(exist_ok=True)
        timestamp = result.start_time.strftime("%Y%m%d_%H%M%S")
        report_gen = ReportGenerator(result)

        # JSON report (always generated for data)
        json_path = output_dir / f"benchmark_{timestamp}.json"
        report_gen.generate_json_report(json_path)
        console.print(f"[green]JSON report saved: {json_path}[/green]")

        # Main report in requested format
        if output:
            output_path = Path(output)
        else:
            output_path = output_dir / f"benchmark_{timestamp}.{format}"

        # ``format`` was validated above, so the lookup cannot miss.
        report_writers = {
            "json": report_gen.generate_json_report,
            "markdown": report_gen.generate_markdown_report,
            "html": report_gen.generate_html_report,
            "csv": report_gen.generate_csv_summary,
        }
        report_writers[format](output_path)

        console.print(f"[green]Report saved: {output_path}[/green]")

        # Print summary
        summary = result.summary
        print_benchmark_summary(
            console,
            duration=result.total_duration,
            model_count=len(result.model_performances),
            total_tests=summary.get("total_tests", 0),
            avg_accuracy=summary.get("average_accuracy", 0),
            best_model=summary.get("best_model"),
        )

        # Print model performance
        print_model_performance(console, result.model_performances)

        # Print test results tables, one per model
        for model_name in result.model_performances:
            model_tests = result.test_results.get(model_name, {})
            print_test_results_table(console, model_name, model_tests)

        # Print errors
        print_error_summary(console, result.errors, verbose=False)

    except typer.Exit:
        # An intentional exit is an Exception subclass too; pass it through
        # instead of reporting it as a benchmark failure.
        raise
    except Exception as e:
        console.print(f"[red]Benchmark failed: {e}[/red]")
        raise typer.Exit(1) from e
523
+
524
+
525
def view_benchmark_action(
    console: Console,
    output: Optional[str],
    verbose: bool,
) -> None:
    """Handle the 'view' benchmark action.

    Loads a saved benchmark JSON report and prints its summary, per-model
    performance, per-model result tables and errors.

    Args:
        console: Rich console for output
        output: File path or "latest"
        verbose: Also show the detailed per-test breakdown and all errors
    """
    # Pick the report file and read it; both helpers exit on failure.
    report_path = resolve_benchmark_file(output, console)
    report = load_benchmark_file(report_path, console)

    # Heading with the file's origin.
    console.print("\n[bold cyan]Benchmark Results[/bold cyan]")
    console.print(f"[dim]File: {report_path}[/dim]")
    console.print(f"[dim]Generated: {report.get('generated_at', 'Unknown')}[/dim]\n")

    divider = "=" * 50
    for banner_line in (divider, "[bold green]Summary[/bold green]", divider):
        console.print(banner_line)

    # Summary figures come from two sections of the report.
    info = report.get("benchmark_info", {})
    totals = report.get("summary", {})
    print_benchmark_summary(
        console,
        duration=info.get("total_duration", 0),
        model_count=len(info.get("models_tested", [])),
        total_tests=info.get("total_tests", 0),
        avg_accuracy=totals.get("average_accuracy", 0),
        best_model=totals.get("best_model"),
    )

    # Per-model performance.
    print_model_performance(console, report.get("model_performances", {}))

    # One results table per model.
    per_model_results = report.get("detailed_results", {})
    for model_name, tests in per_model_results.items():
        print_test_results_table(console, model_name, tests)

    # Detailed test breakdown (verbose mode only).
    if verbose:
        print_detailed_test_results(console, per_model_results)

    # Errors recorded in the report.
    print_error_summary(console, report.get("errors", []), verbose)
579
+
580
+
581
def list_benchmark_action(console: Console) -> None:
    """Handle the 'list' benchmark action.

    Prints a table with one row per ``benchmark_*.json`` file found in
    ``benchmark_results/``, newest first by modification time. Files that
    cannot be read or do not have the expected content are shown as an
    "Error" row rather than aborting the listing.

    Args:
        console: Rich console for output

    Raises:
        typer.Exit: If the results directory or result files are missing
    """
    import json
    from datetime import datetime

    import typer

    results_dir = Path("benchmark_results")
    if not results_dir.exists():
        console.print("[red]No benchmark results directory found[/red]")
        console.print("[dim]Run benchmarks first with: tsugite benchmark run --models <model>[/dim]")
        raise typer.Exit(1)

    # Find all benchmark JSON files
    json_files = list(results_dir.glob("benchmark_*.json"))
    if not json_files:
        console.print("[red]No benchmark results found in benchmark_results/[/red]")
        console.print("[dim]Run benchmarks first with: tsugite benchmark run --models <model>[/dim]")
        raise typer.Exit(1)

    # Sort by modification time, most recent first
    json_files.sort(key=lambda p: p.stat().st_mtime, reverse=True)

    console.print("\n[bold cyan]Benchmark Results[/bold cyan]")
    console.print(f"Found {len(json_files)} result file(s) in {results_dir}/\n")

    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("", width=3)  # Marker for latest
    table.add_column("Timestamp", style="cyan", width=20)
    table.add_column("Models", width=15)
    table.add_column("Tests", justify="right", width=7)
    table.add_column("Accuracy", justify="right", width=10)
    table.add_column("Duration", justify="right", width=10)
    table.add_column("File", style="dim", width=30)

    for i, file_path in enumerate(json_files):
        try:
            # Decode as UTF-8 explicitly instead of the platform default.
            with open(file_path, encoding="utf-8") as f:
                data = json.load(f)

            benchmark_info = data.get("benchmark_info", {})
            summary = data.get("summary", {})

            # Extract info
            timestamp_str = benchmark_info.get("start_time", "")
            if timestamp_str:
                timestamp = datetime.fromisoformat(timestamp_str).strftime("%Y-%m-%d %H:%M:%S")
            else:
                timestamp = "Unknown"

            models = benchmark_info.get("models_tested", [])
            model_str = ", ".join(models) if models else "Unknown"
            # Shorten to fit the 15-character "Models" column.
            if len(model_str) > 15:
                model_str = model_str[:12] + "..."

            tests = benchmark_info.get("total_tests", 0)
            accuracy = summary.get("average_accuracy", 0)
            duration = benchmark_info.get("total_duration", 0)

            # Mark most recent
            marker = "→" if i == 0 else ""

            table.add_row(
                f"[green]{marker}[/green]" if marker else "",
                timestamp,
                model_str,
                str(tests),
                f"{accuracy:.1%}",
                f"{duration:.1f}s",
                file_path.name,
            )

        except Exception:
            # Deliberately best-effort: a file that can't be loaded or has
            # unexpected content gets an error row and the listing continues.
            table.add_row("", "Error", "-", "-", "-", "-", f"[red]{file_path.name}[/red]")

    console.print(table)
    # Angle brackets are used for the placeholder because square brackets are
    # Rich markup delimiters; the file is selected through the --output option.
    console.print("\n[dim]View details: tsugite benchmark view --output <file|latest>[/dim]")
    console.print("[dim]Most recent run is marked with →[/dim]")