tsugite-cli 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. tsugite/__init__.py +6 -0
  2. tsugite/agent_composition.py +163 -0
  3. tsugite/agent_inheritance.py +479 -0
  4. tsugite/agent_preparation.py +236 -0
  5. tsugite/agent_runner/__init__.py +45 -0
  6. tsugite/agent_runner/helpers.py +106 -0
  7. tsugite/agent_runner/history_integration.py +248 -0
  8. tsugite/agent_runner/metrics.py +100 -0
  9. tsugite/agent_runner/runner.py +1879 -0
  10. tsugite/agent_runner/validation.py +70 -0
  11. tsugite/agent_utils.py +167 -0
  12. tsugite/attachments/__init__.py +65 -0
  13. tsugite/attachments/auto_context.py +199 -0
  14. tsugite/attachments/base.py +34 -0
  15. tsugite/attachments/file.py +51 -0
  16. tsugite/attachments/inline.py +31 -0
  17. tsugite/attachments/storage.py +178 -0
  18. tsugite/attachments/url.py +59 -0
  19. tsugite/attachments/youtube.py +101 -0
  20. tsugite/benchmark/__init__.py +62 -0
  21. tsugite/benchmark/config.py +183 -0
  22. tsugite/benchmark/core.py +292 -0
  23. tsugite/benchmark/discovery.py +377 -0
  24. tsugite/benchmark/evaluators.py +671 -0
  25. tsugite/benchmark/execution.py +657 -0
  26. tsugite/benchmark/metrics.py +204 -0
  27. tsugite/benchmark/reports.py +420 -0
  28. tsugite/benchmark/utils.py +288 -0
  29. tsugite/builtin_agents/chat-assistant.md +53 -0
  30. tsugite/builtin_agents/default.md +140 -0
  31. tsugite/builtin_agents.py +5 -0
  32. tsugite/cache.py +195 -0
  33. tsugite/cli/__init__.py +1042 -0
  34. tsugite/cli/agents.py +148 -0
  35. tsugite/cli/attachments.py +193 -0
  36. tsugite/cli/benchmark.py +663 -0
  37. tsugite/cli/cache.py +113 -0
  38. tsugite/cli/config.py +272 -0
  39. tsugite/cli/helpers.py +534 -0
  40. tsugite/cli/history.py +193 -0
  41. tsugite/cli/init.py +387 -0
  42. tsugite/cli/mcp.py +193 -0
  43. tsugite/cli/tools.py +419 -0
  44. tsugite/config.py +204 -0
  45. tsugite/console.py +48 -0
  46. tsugite/constants.py +21 -0
  47. tsugite/core/__init__.py +19 -0
  48. tsugite/core/agent.py +774 -0
  49. tsugite/core/executor.py +300 -0
  50. tsugite/core/memory.py +67 -0
  51. tsugite/core/tools.py +271 -0
  52. tsugite/docker_cli.py +270 -0
  53. tsugite/events/__init__.py +55 -0
  54. tsugite/events/base.py +46 -0
  55. tsugite/events/bus.py +62 -0
  56. tsugite/events/events.py +224 -0
  57. tsugite/exceptions.py +40 -0
  58. tsugite/history/__init__.py +29 -0
  59. tsugite/history/index.py +210 -0
  60. tsugite/history/models.py +106 -0
  61. tsugite/history/storage.py +157 -0
  62. tsugite/mcp_client.py +219 -0
  63. tsugite/mcp_config.py +174 -0
  64. tsugite/md_agents.py +751 -0
  65. tsugite/models.py +257 -0
  66. tsugite/renderer.py +151 -0
  67. tsugite/shell_tool_config.py +265 -0
  68. tsugite/templates/assistant.md +14 -0
  69. tsugite/tools/__init__.py +265 -0
  70. tsugite/tools/agents.py +312 -0
  71. tsugite/tools/edit_strategies.py +393 -0
  72. tsugite/tools/fs.py +329 -0
  73. tsugite/tools/http.py +239 -0
  74. tsugite/tools/interactive.py +430 -0
  75. tsugite/tools/shell.py +129 -0
  76. tsugite/tools/shell_tools.py +214 -0
  77. tsugite/tools/tasks.py +339 -0
  78. tsugite/tsugite.py +7 -0
  79. tsugite/ui/__init__.py +46 -0
  80. tsugite/ui/base.py +638 -0
  81. tsugite/ui/chat.py +265 -0
  82. tsugite/ui/chat.tcss +92 -0
  83. tsugite/ui/chat_history.py +286 -0
  84. tsugite/ui/helpers.py +102 -0
  85. tsugite/ui/jsonl.py +125 -0
  86. tsugite/ui/live_template.py +529 -0
  87. tsugite/ui/plain.py +419 -0
  88. tsugite/ui/textual_chat.py +642 -0
  89. tsugite/ui/textual_handler.py +225 -0
  90. tsugite/ui/widgets/__init__.py +6 -0
  91. tsugite/ui/widgets/base_scroll_log.py +27 -0
  92. tsugite/ui/widgets/message_list.py +121 -0
  93. tsugite/ui/widgets/thought_log.py +80 -0
  94. tsugite/ui_context.py +90 -0
  95. tsugite/utils.py +367 -0
  96. tsugite/xdg.py +104 -0
  97. tsugite_cli-0.3.3.dist-info/METADATA +325 -0
  98. tsugite_cli-0.3.3.dist-info/RECORD +101 -0
  99. tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
  100. tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
  101. tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
@@ -0,0 +1,204 @@
1
+ """Metrics and data structures for benchmark results."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime
5
+ from typing import Any, Dict, List, Optional
6
+
7
+
8
@dataclass
class BenchmarkTestResult:
    """Outcome of running one benchmark test against one model.

    The first seven fields are required and positional; the remaining ones
    carry defaults so a minimal result can be built from the core facts alone.
    """

    # --- required ---------------------------------------------------------
    test_id: str
    model: str
    passed: bool
    score: float  # 0.0 to 1.0
    duration: float  # seconds
    output: str
    expected_output: str

    # --- optional ---------------------------------------------------------
    category: str = "unknown"
    error: Optional[str] = None  # None means the run raised no error
    token_usage: Dict[str, int] = field(default_factory=dict)
    cost: float = 0.0
    steps_taken: int = 0  # Number of reasoning steps/attempts
    metrics: Dict[str, Any] = field(default_factory=dict)
    # Captured when the instance is created (naive local time from datetime.now).
    timestamp: datetime = field(default_factory=datetime.now)
26
+
27
+
28
@dataclass
class ModelPerformance:
    """Roll-up of one model's results across a benchmark run."""

    # --- required ---------------------------------------------------------
    model: str
    total_tests: int
    passed_tests: int
    accuracy: float  # passed_tests / total_tests
    average_duration: float
    total_duration: float
    total_tokens: int
    total_cost: float

    # --- optional ---------------------------------------------------------
    scores_by_category: Dict[str, float] = field(default_factory=dict)
    error_rate: float = 0.0
    reliability_score: float = 0.0
    average_steps: float = 0.0
44
+
45
+
46
@dataclass
class CategoryMetrics:
    """Per-category comparison of how the tested models scored."""

    category: str
    total_tests: int
    model_scores: Dict[str, float]  # model -> average score
    average_score: float
    best_model: str
    worst_model: str
56
+
57
+
58
class BenchmarkMetrics:
    """Namespace of stateless helpers for calculating benchmark metrics.

    Every method is a ``@staticmethod``; the class holds no state.
    """

    @staticmethod
    def calculate_accuracy(passed: int, total: int) -> float:
        """Return ``passed / total`` as a fraction (not a percentage).

        Returns 0.0 when ``total`` is 0 so an empty run does not divide by zero.
        """
        if total == 0:
            return 0.0
        return passed / total

    @staticmethod
    def calculate_efficiency_score(duration: float, baseline_duration: float) -> float:
        """Return the efficiency relative to a baseline (higher is better).

        The score is ``baseline_duration / duration`` capped at 1.0, so runs
        at least as fast as the baseline all score 1.0.

        Returns 1.0 when there is no usable baseline (``baseline_duration <= 0``)
        and also when ``duration <= 0``: a zero duration is the limit of the
        capped ratio, and dividing by it would raise ``ZeroDivisionError``.
        """
        if baseline_duration <= 0 or duration <= 0:
            return 1.0
        return min(baseline_duration / duration, 1.0)

    @staticmethod
    def calculate_cost_efficiency(score: float, cost: float) -> float:
        """Return score per unit cost; a free run (``cost <= 0``) yields ``score``."""
        if cost <= 0:
            return score
        return score / cost

    @staticmethod
    def calculate_weighted_score(scores: Dict[str, float], weights: Dict[str, float]) -> float:
        """Return the weighted average of ``scores``.

        Only metrics named in ``weights`` contribute; a metric missing from
        ``scores`` counts as 0.0. Returns 0.0 when the weights sum to zero.
        """
        total_weight = sum(weights.values())
        if total_weight == 0:
            return 0.0

        weighted_sum = sum(scores.get(metric, 0.0) * weight for metric, weight in weights.items())
        return weighted_sum / total_weight

    @staticmethod
    def calculate_reliability_score(test_results: List["BenchmarkTestResult"]) -> float:
        """Return a reliability score in ``[0.0, 1.0]`` based on error patterns.

        Each result with an error costs 0.5; an error whose message contains
        "timeout" (case-insensitive) costs a further 0.3. The summed penalty is
        averaged over all results and subtracted from 1.0, floored at 0.0.
        Returns 0.0 for an empty list.
        """
        if not test_results:
            return 0.0

        total_tests = len(test_results)
        error_count = sum(1 for result in test_results if result.error is not None)
        timeout_count = sum(1 for result in test_results if result.error and "timeout" in result.error.lower())

        # Timeouts are counted in error_count as well, so they are penalized twice.
        error_penalty = (error_count * 0.5 + timeout_count * 0.3) / total_tests
        return max(0.0, 1.0 - error_penalty)

    @staticmethod
    def calculate_category_performance(
        test_results: Dict[str, "BenchmarkTestResult"], category: str
    ) -> "CategoryMetrics":
        """Calculate performance metrics for a specific category.

        Per-model averages and the overall average are rounded to 4 decimals.
        When no result matches, an empty ``CategoryMetrics`` is returned with
        zero tests, no scores and empty best/worst model names.
        """
        # NOTE(review): membership is decided by the test-ID prefix, not by
        # result.category. Confirm that test IDs always start with their
        # category name; a category that is a prefix of another would also match.
        category_results = [result for result in test_results.values() if result.test_id.startswith(category)]

        if not category_results:
            return CategoryMetrics(
                category=category,
                total_tests=0,
                model_scores={},
                average_score=0.0,
                best_model="",
                worst_model="",
            )

        # Group scores by model.
        scores_by_model: Dict[str, List[float]] = {}
        for result in category_results:
            scores_by_model.setdefault(result.model, []).append(result.score)

        avg_model_scores = {model: round(sum(scores) / len(scores), 4) for model, scores in scores_by_model.items()}

        # category_results is non-empty here, so at least one model is present.
        # Ties resolve to the model encountered first.
        best_model = max(avg_model_scores, key=avg_model_scores.get)
        worst_model = min(avg_model_scores, key=avg_model_scores.get)
        average_score = round(sum(avg_model_scores.values()) / len(avg_model_scores), 4)

        return CategoryMetrics(
            category=category,
            total_tests=len(category_results),
            model_scores=avg_model_scores,
            average_score=average_score,
            best_model=best_model,
            worst_model=worst_model,
        )

    @staticmethod
    def get_performance_tier(accuracy: float) -> str:
        """Classify an accuracy fraction into a named tier.

        Lower bounds are inclusive: 0.9 "Excellent", 0.75 "Good", 0.6 "Fair",
        0.4 "Poor"; anything below is "Very Poor".
        """
        if accuracy >= 0.9:
            return "Excellent"
        if accuracy >= 0.75:
            return "Good"
        if accuracy >= 0.6:
            return "Fair"
        if accuracy >= 0.4:
            return "Poor"
        return "Very Poor"

    @staticmethod
    def calculate_improvement_percentage(baseline_score: float, current_score: float) -> float:
        """Return the percentage change of ``current_score`` over ``baseline_score``.

        The result is rounded to 4 decimals. Returns 0.0 when the baseline is
        not positive, since a relative change is undefined there.
        """
        if baseline_score <= 0:
            return 0.0

        improvement = ((current_score - baseline_score) / baseline_score) * 100
        return round(improvement, 4)

    @staticmethod
    def aggregate_test_results(results: List["BenchmarkTestResult"]) -> Dict[str, Any]:
        """Aggregate multiple test results into summary statistics.

        The returned dict always has the same keys: ``total_tests``,
        ``passed_tests``, ``accuracy``, ``average_score``, ``average_duration``,
        ``total_duration``, ``total_cost`` and ``error_rate``. For an empty
        input every value is zero.
        """
        if not results:
            return {
                "total_tests": 0,
                "passed_tests": 0,
                "accuracy": 0.0,
                "average_score": 0.0,
                "average_duration": 0.0,
                # Present so consumers can index the same keys as for a non-empty run.
                "total_duration": 0.0,
                "total_cost": 0.0,
                "error_rate": 0.0,
            }

        total_tests = len(results)
        passed_tests = sum(1 for r in results if r.passed)
        total_score = sum(r.score for r in results)
        total_duration = sum(r.duration for r in results)
        total_cost = sum(r.cost for r in results)
        error_count = sum(1 for r in results if r.error is not None)

        return {
            "total_tests": total_tests,
            "passed_tests": passed_tests,
            "accuracy": passed_tests / total_tests,
            "average_score": total_score / total_tests,
            "average_duration": total_duration / total_tests,
            "total_duration": total_duration,
            "total_cost": total_cost,
            "error_rate": error_count / total_tests,
        }
@@ -0,0 +1,420 @@
1
+ """Report generation for benchmark results."""
2
+
3
+ import json
4
+ from dataclasses import asdict
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List
8
+
9
+ from .config import get_performance_tier
10
+ from .core import BenchmarkResult
11
+
12
+
13
class ReportGenerator:
    """Generate JSON, Markdown, HTML and CSV reports from a benchmark result.

    The wrapped ``result`` is read through these attributes: ``start_time``,
    ``end_time``, ``total_duration``, ``config``, ``summary``,
    ``model_performances``, ``test_results`` (model -> test ID -> test result)
    and ``errors``.

    Every writer opens its target as UTF-8. The Markdown and HTML reports
    contain non-ASCII characters and the HTML declares ``charset="UTF-8"``,
    so relying on the platform's locale encoding could fail or mismatch.
    """

    # Inclusive lower accuracy bounds mapped to the CSS class suffix; anything
    # below the last bound is "very-poor". Shared by tier and progress classes.
    _TIER_SLUGS = ((0.9, "excellent"), (0.75, "good"), (0.6, "fair"), (0.4, "poor"))

    # Static stylesheet for the HTML report. Kept outside the f-string template
    # so the CSS braces need no escaping.
    _HTML_STYLE = """
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            padding: 30px;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        h1, h2, h3 {
            color: #333;
        }
        .summary {
            background: #f8f9fa;
            padding: 20px;
            border-radius: 6px;
            margin: 20px 0;
        }
        .metric {
            display: inline-block;
            margin: 10px 20px 10px 0;
        }
        .metric-value {
            font-size: 1.5em;
            font-weight: bold;
            color: #007bff;
        }
        .metric-label {
            font-size: 0.9em;
            color: #666;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
        }
        th, td {
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid #ddd;
        }
        th {
            background-color: #f8f9fa;
            font-weight: 600;
        }
        .pass { color: #28a745; }
        .fail { color: #dc3545; }
        .tier-excellent { color: #28a745; font-weight: bold; }
        .tier-good { color: #17a2b8; }
        .tier-fair { color: #ffc107; }
        .tier-poor { color: #fd7e14; }
        .tier-very-poor { color: #dc3545; }
        .progress-bar {
            background: #e9ecef;
            border-radius: 10px;
            height: 20px;
            margin: 5px 0;
        }
        .progress-fill {
            height: 100%;
            border-radius: 10px;
            transition: width 0.3s ease;
        }
        .progress-excellent { background: #28a745; }
        .progress-good { background: #17a2b8; }
        .progress-fair { background: #ffc107; }
        .progress-poor { background: #fd7e14; }
        .progress-very-poor { background: #dc3545; }
    """

    def __init__(self, result: "BenchmarkResult"):
        """Store the benchmark result that all report methods render."""
        self.result = result

    def generate_json_report(self, output_path: Path) -> None:
        """Write a comprehensive JSON report to ``output_path``.

        Dataclass values are expanded with ``asdict``; anything ``json`` cannot
        serialize natively (for example datetimes) is stringified via
        ``default=str``.
        """
        report_data = {
            "benchmark_info": {
                "start_time": self.result.start_time.isoformat(),
                "end_time": self.result.end_time.isoformat(),
                "total_duration": self.result.total_duration,
                "models_tested": list(self.result.model_performances.keys()),
                "total_tests": len(self._get_all_test_ids()),
            },
            "config": asdict(self.result.config),
            "summary": self.result.summary,
            "model_performances": {model: asdict(perf) for model, perf in self.result.model_performances.items()},
            "detailed_results": {
                model: {test_id: asdict(result) for test_id, result in tests.items()}
                for model, tests in self.result.test_results.items()
            },
            "errors": self.result.errors,
            "generated_at": datetime.now().isoformat(),
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(report_data, f, indent=2, default=str)

    def generate_markdown_report(self, output_path: Path) -> None:
        """Write a human-readable Markdown report to ``output_path``.

        Sections, in order: summary, model rankings (only when the summary has
        ``"model_rankings"``), per-model performance, per-category breakdown,
        per-test tables, and errors (only when there are any).
        """
        summary = self.result.summary
        report = [
            # Header
            "# Tsugite Benchmark Report",
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "",
            # Summary
            "## Summary",
            f"- **Duration**: {self.result.total_duration:.2f} seconds",
            f"- **Models Tested**: {len(self.result.model_performances)}",
            f"- **Total Tests**: {summary.get('total_tests', 0)}",
            f"- **Average Accuracy**: {summary.get('average_accuracy', 0):.1%}",
            "",
        ]

        # Model rankings
        if "model_rankings" in summary:
            report.append("## Model Rankings")
            report.append("| Rank | Model | Accuracy | Avg Duration | Total Cost |")
            report.append("|------|-------|----------|--------------|------------|")

            for i, ranking in enumerate(summary["model_rankings"], 1):
                model = ranking["model"]
                accuracy = f"{ranking['accuracy']:.1%}"
                duration = f"{ranking['avg_duration']:.2f}s"
                cost = f"${ranking['total_cost']:.4f}"
                report.append(f"| {i} | {model} | {accuracy} | {duration} | {cost} |")
            report.append("")

        # Detailed performance
        report.append("## Detailed Performance")
        for model, performance in self.result.model_performances.items():
            report.append(f"### {model}")
            report.append(
                f"- **Accuracy**: {performance.accuracy:.1%} ({performance.passed_tests}/{performance.total_tests})"
            )
            report.append(f"- **Average Duration**: {performance.average_duration:.2f}s")
            report.append(f"- **Average Steps**: {performance.average_steps:.1f}")
            report.append(f"- **Total Cost**: ${performance.total_cost:.4f}")
            report.append(f"- **Performance Tier**: {get_performance_tier(performance.accuracy)}")
            report.append("")

        # Category breakdown
        category_breakdown = self._calculate_category_breakdown()
        if category_breakdown:
            report.append("## Performance by Category")
            for category, data in category_breakdown.items():
                report.append(f"### {category.title()}")
                report.append(f"- **Best Model**: {data['best_model']} ({data['best_score']:.1%})")
                report.append(f"- **Average Score**: {data['average_score']:.1%}")
                report.append("")

        # Test details
        report.append("## Test Results Details")
        for model in self.result.model_performances:
            report.append(f"### {model}")
            report.append("| Test ID | Category | Result | Score | Duration | Steps | Cost |")
            report.append("|---------|----------|--------|-------|----------|-------|------|")

            model_tests = self.result.test_results.get(model, {})
            for test_id, test_result in model_tests.items():
                category = test_result.category
                result_status = "✅ PASS" if test_result.passed else "❌ FAIL"
                score = f"{test_result.score:.2f}"
                duration = f"{test_result.duration:.2f}s"
                steps = str(test_result.steps_taken)
                cost = f"${test_result.cost:.4f}" if test_result.cost > 0 else "$0.00"
                report.append(f"| {test_id} | {category} | {result_status} | {score} | {duration} | {steps} | {cost} |")
            report.append("")

        # Errors
        if self.result.errors:
            report.append("## Errors")
            for error in self.result.errors:
                report.append(f"- {error}")
            report.append("")

        # The status cells contain emoji, so the encoding must be explicit.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n".join(report))

    def generate_html_report(self, output_path: Path) -> None:
        """Write the HTML report to ``output_path`` as UTF-8.

        The encoding matches the ``<meta charset="UTF-8">`` the page declares.
        """
        html_content = self._generate_html_content()

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html_content)

    def generate_csv_summary(self, output_path: Path) -> None:
        """Write one CSV row per (model, test) to ``output_path`` for data analysis.

        The ``Token_Usage`` column is the ``"total"`` entry of each result's
        ``token_usage`` mapping (0 when absent); a missing error becomes an
        empty string.
        """
        import csv

        # newline="" lets the csv module control line endings itself.
        with open(output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)

            # Header
            writer.writerow(
                [
                    "Model",
                    "Test_ID",
                    "Category",
                    "Passed",
                    "Score",
                    "Duration",
                    "Steps",
                    "Token_Usage",
                    "Cost",
                    "Error",
                ]
            )

            # Data rows
            for model, tests in self.result.test_results.items():
                for test_id, test_result in tests.items():
                    writer.writerow(
                        [
                            model,
                            test_id,
                            test_result.category,
                            test_result.passed,
                            test_result.score,
                            test_result.duration,
                            test_result.steps_taken,
                            test_result.token_usage.get("total", 0),
                            test_result.cost,
                            test_result.error or "",
                        ]
                    )

    def _get_all_test_ids(self) -> List[str]:
        """Return the unique test IDs across all models (order not guaranteed)."""
        all_test_ids = set()
        for tests in self.result.test_results.values():
            all_test_ids.update(tests.keys())
        return list(all_test_ids)

    def _calculate_category_breakdown(self) -> Dict[str, Any]:
        """Return per-category statistics keyed by category name.

        Each value holds ``best_model``, ``best_score``, ``average_score`` (the
        mean of the per-model averages) and ``model_scores`` (model -> average).
        """
        # category -> model -> list of scores
        categories: Dict[str, Dict[str, List[float]]] = {}
        for model, tests in self.result.test_results.items():
            for test_result in tests.values():
                by_model = categories.setdefault(test_result.category, {})
                by_model.setdefault(model, []).append(test_result.score)

        breakdown = {}
        for category, model_scores in categories.items():
            # Every category was created by appending a score, so it is never empty.
            avg_scores = {model: sum(scores) / len(scores) for model, scores in model_scores.items()}
            best_model, best_score = max(avg_scores.items(), key=lambda item: item[1])

            breakdown[category] = {
                "best_model": best_model,
                "best_score": best_score,
                "average_score": sum(avg_scores.values()) / len(avg_scores),
                "model_scores": avg_scores,
            }

        return breakdown

    def _generate_html_content(self) -> str:
        """Build and return the complete HTML document for the report.

        Model names are HTML-escaped before interpolation. The rankings table
        is always emitted; it has rows only when the summary contains
        ``"model_rankings"``.
        """
        # Local import: only the HTML path needs it.
        from html import escape

        summary = self.result.summary

        # Collect fragments and join once rather than growing a string with +=.
        chunks = [
            f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Tsugite Benchmark Report</title>
            <style>{self._HTML_STYLE}</style>
        </head>
        <body>
            <div class="container">
                <h1>🚀 Tsugite Benchmark Report</h1>
                <p>Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>

                <div class="summary">
                    <h2>📊 Summary</h2>
                    <div class="metric">
                        <div class="metric-value">{self.result.total_duration:.1f}s</div>
                        <div class="metric-label">Total Duration</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value">{len(self.result.model_performances)}</div>
                        <div class="metric-label">Models Tested</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value">{summary.get("total_tests", 0)}</div>
                        <div class="metric-label">Total Tests</div>
                    </div>
                    <div class="metric">
                        <div class="metric-value">{summary.get("average_accuracy", 0):.1%}</div>
                        <div class="metric-label">Average Accuracy</div>
                    </div>
                </div>

                <h2>🏆 Model Rankings</h2>
                <table>
                    <thead>
                        <tr>
                            <th>Rank</th>
                            <th>Model</th>
                            <th>Accuracy</th>
                            <th>Performance</th>
                            <th>Avg Duration</th>
                            <th>Total Cost</th>
                        </tr>
                    </thead>
                    <tbody>
        """
        ]

        # Model rankings table
        if "model_rankings" in summary:
            for i, ranking in enumerate(summary["model_rankings"], 1):
                accuracy = ranking["accuracy"]
                tier_class = self._get_tier_class(accuracy)
                progress_class = self._get_progress_class(accuracy)
                model_name = escape(str(ranking["model"]))

                chunks.append(
                    f"""
                        <tr>
                            <td>{i}</td>
                            <td><strong>{model_name}</strong></td>
                            <td>
                                {accuracy:.1%}
                                <div class="progress-bar">
                                    <div class="progress-fill {progress_class}" style="width: {accuracy * 100}%"></div>
                                </div>
                            </td>
                            <td><span class="{tier_class}">{get_performance_tier(accuracy)}</span></td>
                            <td>{ranking["avg_duration"]:.2f}s</td>
                            <td>${ranking["total_cost"]:.4f}</td>
                        </tr>
        """
                )

        chunks.append(
            """
                    </tbody>
                </table>

                <h2>📈 Detailed Results</h2>
        """
        )

        # Detailed results for each model
        for model, performance in self.result.model_performances.items():
            tier_class = self._get_tier_class(performance.accuracy)
            chunks.append(
                f"""
                <h3>{escape(str(model))}</h3>
                <p>
                    <span class="{tier_class}">
                        {get_performance_tier(performance.accuracy)}
                    </span>
                    - {performance.accuracy:.1%} accuracy
                    ({performance.passed_tests}/{performance.total_tests} tests passed)
                </p>
        """
            )

        chunks.append(
            """
            </div>
        </body>
        </html>
        """
        )

        return "".join(chunks)

    @classmethod
    def _tier_slug(cls, accuracy: float) -> str:
        """Return the CSS suffix ("excellent" ... "very-poor") for ``accuracy``."""
        for lower_bound, slug in cls._TIER_SLUGS:
            if accuracy >= lower_bound:
                return slug
        return "very-poor"

    def _get_tier_class(self, accuracy: float) -> str:
        """Get CSS class for performance tier."""
        return f"tier-{self._tier_slug(accuracy)}"

    def _get_progress_class(self, accuracy: float) -> str:
        """Get CSS class for progress bar."""
        return f"progress-{self._tier_slug(accuracy)}"