tsugite-cli 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsugite/__init__.py +6 -0
- tsugite/agent_composition.py +163 -0
- tsugite/agent_inheritance.py +479 -0
- tsugite/agent_preparation.py +236 -0
- tsugite/agent_runner/__init__.py +45 -0
- tsugite/agent_runner/helpers.py +106 -0
- tsugite/agent_runner/history_integration.py +248 -0
- tsugite/agent_runner/metrics.py +100 -0
- tsugite/agent_runner/runner.py +1879 -0
- tsugite/agent_runner/validation.py +70 -0
- tsugite/agent_utils.py +167 -0
- tsugite/attachments/__init__.py +65 -0
- tsugite/attachments/auto_context.py +199 -0
- tsugite/attachments/base.py +34 -0
- tsugite/attachments/file.py +51 -0
- tsugite/attachments/inline.py +31 -0
- tsugite/attachments/storage.py +178 -0
- tsugite/attachments/url.py +59 -0
- tsugite/attachments/youtube.py +101 -0
- tsugite/benchmark/__init__.py +62 -0
- tsugite/benchmark/config.py +183 -0
- tsugite/benchmark/core.py +292 -0
- tsugite/benchmark/discovery.py +377 -0
- tsugite/benchmark/evaluators.py +671 -0
- tsugite/benchmark/execution.py +657 -0
- tsugite/benchmark/metrics.py +204 -0
- tsugite/benchmark/reports.py +420 -0
- tsugite/benchmark/utils.py +288 -0
- tsugite/builtin_agents/chat-assistant.md +53 -0
- tsugite/builtin_agents/default.md +140 -0
- tsugite/builtin_agents.py +5 -0
- tsugite/cache.py +195 -0
- tsugite/cli/__init__.py +1042 -0
- tsugite/cli/agents.py +148 -0
- tsugite/cli/attachments.py +193 -0
- tsugite/cli/benchmark.py +663 -0
- tsugite/cli/cache.py +113 -0
- tsugite/cli/config.py +272 -0
- tsugite/cli/helpers.py +534 -0
- tsugite/cli/history.py +193 -0
- tsugite/cli/init.py +387 -0
- tsugite/cli/mcp.py +193 -0
- tsugite/cli/tools.py +419 -0
- tsugite/config.py +204 -0
- tsugite/console.py +48 -0
- tsugite/constants.py +21 -0
- tsugite/core/__init__.py +19 -0
- tsugite/core/agent.py +774 -0
- tsugite/core/executor.py +300 -0
- tsugite/core/memory.py +67 -0
- tsugite/core/tools.py +271 -0
- tsugite/docker_cli.py +270 -0
- tsugite/events/__init__.py +55 -0
- tsugite/events/base.py +46 -0
- tsugite/events/bus.py +62 -0
- tsugite/events/events.py +224 -0
- tsugite/exceptions.py +40 -0
- tsugite/history/__init__.py +29 -0
- tsugite/history/index.py +210 -0
- tsugite/history/models.py +106 -0
- tsugite/history/storage.py +157 -0
- tsugite/mcp_client.py +219 -0
- tsugite/mcp_config.py +174 -0
- tsugite/md_agents.py +751 -0
- tsugite/models.py +257 -0
- tsugite/renderer.py +151 -0
- tsugite/shell_tool_config.py +265 -0
- tsugite/templates/assistant.md +14 -0
- tsugite/tools/__init__.py +265 -0
- tsugite/tools/agents.py +312 -0
- tsugite/tools/edit_strategies.py +393 -0
- tsugite/tools/fs.py +329 -0
- tsugite/tools/http.py +239 -0
- tsugite/tools/interactive.py +430 -0
- tsugite/tools/shell.py +129 -0
- tsugite/tools/shell_tools.py +214 -0
- tsugite/tools/tasks.py +339 -0
- tsugite/tsugite.py +7 -0
- tsugite/ui/__init__.py +46 -0
- tsugite/ui/base.py +638 -0
- tsugite/ui/chat.py +265 -0
- tsugite/ui/chat.tcss +92 -0
- tsugite/ui/chat_history.py +286 -0
- tsugite/ui/helpers.py +102 -0
- tsugite/ui/jsonl.py +125 -0
- tsugite/ui/live_template.py +529 -0
- tsugite/ui/plain.py +419 -0
- tsugite/ui/textual_chat.py +642 -0
- tsugite/ui/textual_handler.py +225 -0
- tsugite/ui/widgets/__init__.py +6 -0
- tsugite/ui/widgets/base_scroll_log.py +27 -0
- tsugite/ui/widgets/message_list.py +121 -0
- tsugite/ui/widgets/thought_log.py +80 -0
- tsugite/ui_context.py +90 -0
- tsugite/utils.py +367 -0
- tsugite/xdg.py +104 -0
- tsugite_cli-0.3.3.dist-info/METADATA +325 -0
- tsugite_cli-0.3.3.dist-info/RECORD +101 -0
- tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
- tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
- tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Metrics and data structures for benchmark results."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class BenchmarkTestResult:
    """Outcome of executing one benchmark test against one model.

    The first seven fields are required (and positional in the generated
    ``__init__``); the remaining fields carry defaults so a caller only has
    to supply what it actually measured.
    """

    # --- required fields -------------------------------------------------
    test_id: str
    model: str
    passed: bool
    score: float  # 0.0 to 1.0
    duration: float  # seconds
    output: str
    expected_output: str

    # --- optional fields -------------------------------------------------
    category: str = "unknown"
    error: Optional[str] = None  # None when no error was recorded
    token_usage: Dict[str, int] = field(default_factory=dict)  # fresh dict per instance
    cost: float = 0.0
    steps_taken: int = 0  # number of reasoning steps/attempts
    metrics: Dict[str, Any] = field(default_factory=dict)  # fresh dict per instance
    timestamp: datetime = field(default_factory=datetime.now)  # set when the instance is created
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class ModelPerformance:
    """Roll-up of one model's results across a benchmark run.

    The first eight fields are required; the rest default to "nothing
    measured" values.
    """

    # --- required fields -------------------------------------------------
    model: str
    total_tests: int
    passed_tests: int
    accuracy: float  # passed_tests / total_tests
    average_duration: float
    total_duration: float
    total_tokens: int
    total_cost: float

    # --- optional fields -------------------------------------------------
    scores_by_category: Dict[str, float] = field(default_factory=dict)  # fresh dict per instance
    error_rate: float = 0.0
    reliability_score: float = 0.0
    average_steps: float = 0.0
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class CategoryMetrics:
    """Summary of how the tested models scored within one test category.

    Every field is required; there are no defaults.
    """

    category: str
    total_tests: int
    model_scores: Dict[str, float]  # model -> average score
    average_score: float
    best_model: str
    worst_model: str
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class BenchmarkMetrics:
    """Stateless helpers that turn raw benchmark results into summary numbers.

    Every method is a ``@staticmethod``; the class only serves as a namespace.
    """

    @staticmethod
    def calculate_accuracy(passed: int, total: int) -> float:
        """Return ``passed / total`` as a fraction, or 0.0 when ``total`` is 0."""
        if total == 0:
            return 0.0
        return passed / total

    @staticmethod
    def calculate_efficiency_score(duration: float, baseline_duration: float) -> float:
        """Calculate efficiency relative to a baseline (higher is better, capped at 1.0).

        Args:
            duration: Measured duration of the run being scored.
            baseline_duration: Reference duration to compare against.

        Returns:
            ``baseline_duration / duration`` capped at 1.0. Returns 1.0 when
            there is no usable baseline (``baseline_duration <= 0``) or when
            ``duration`` is not positive.
        """
        if baseline_duration <= 0:
            return 1.0
        if duration <= 0:
            # A zero duration used to raise ZeroDivisionError. A run with no
            # measurable duration cannot be slower than the baseline, so it
            # gets the capped best score.
            return 1.0
        return min(baseline_duration / duration, 1.0)

    @staticmethod
    def calculate_cost_efficiency(score: float, cost: float) -> float:
        """Return score per unit cost; the bare ``score`` when ``cost <= 0``."""
        if cost <= 0:
            return score
        return score / cost

    @staticmethod
    def calculate_weighted_score(scores: Dict[str, float], weights: Dict[str, float]) -> float:
        """Calculate the weighted average of ``scores``.

        Only metrics named in ``weights`` contribute; a metric missing from
        ``scores`` counts as 0.0. Returns 0.0 when the weights sum to zero.
        """
        total_weight = sum(weights.values())
        if total_weight == 0:
            return 0.0

        weighted_sum = sum(scores.get(metric, 0.0) * weight for metric, weight in weights.items())
        return weighted_sum / total_weight

    @staticmethod
    def calculate_reliability_score(test_results: List["BenchmarkTestResult"]) -> float:
        """Calculate a 0.0-1.0 reliability score from the recorded errors.

        Each result with a non-``None`` error costs 0.5. An error whose text
        contains "timeout" (case-insensitive) costs a further 0.3. The summed
        penalty is averaged over all results and subtracted from 1.0, floored
        at 0.0. Returns 0.0 for an empty list.
        """
        if not test_results:
            return 0.0

        total_tests = len(test_results)
        error_count = sum(1 for result in test_results if result.error is not None)
        timeout_count = sum(1 for result in test_results if result.error and "timeout" in result.error.lower())

        # A timeout is also counted as an error, so it is penalised twice (0.5 + 0.3).
        error_penalty = (error_count * 0.5 + timeout_count * 0.3) / total_tests
        return max(0.0, 1.0 - error_penalty)

    @staticmethod
    def calculate_category_performance(
        test_results: Dict[str, "BenchmarkTestResult"], category: str
    ) -> "CategoryMetrics":
        """Calculate per-model average scores for one category.

        Results are selected by ``test_id`` prefix (``startswith(category)``),
        not by the result's ``category`` field.
        NOTE(review): confirm that test IDs are always prefixed with their category.

        Args:
            test_results: Mapping whose values are the results to consider.
            category: Prefix that selects the results by ``test_id``.

        Returns:
            A ``CategoryMetrics`` with per-model averages rounded to 4 places.
            When nothing matches, all counts are zero and the model names are
            empty strings.
        """
        category_results = [result for result in test_results.values() if result.test_id.startswith(category)]

        if not category_results:
            return CategoryMetrics(
                category=category,
                total_tests=0,
                model_scores={},
                average_score=0.0,
                best_model="",
                worst_model="",
            )

        # Group the raw scores by model.
        scores_by_model: Dict[str, List[float]] = {}
        for result in category_results:
            scores_by_model.setdefault(result.model, []).append(result.score)

        avg_model_scores = {model: round(sum(scores) / len(scores), 4) for model, scores in scores_by_model.items()}

        # category_results is non-empty here, so there is at least one model;
        # the original fallback branch for an empty mapping was unreachable.
        best_model = max(avg_model_scores.items(), key=lambda item: item[1])[0]
        worst_model = min(avg_model_scores.items(), key=lambda item: item[1])[0]
        average_score = round(sum(avg_model_scores.values()) / len(avg_model_scores), 4)

        return CategoryMetrics(
            category=category,
            total_tests=len(category_results),
            model_scores=avg_model_scores,
            average_score=average_score,
            best_model=best_model,
            worst_model=worst_model,
        )

    @staticmethod
    def get_performance_tier(accuracy: float) -> str:
        """Classify an accuracy fraction into a named tier.

        Thresholds: >= 0.9 "Excellent", >= 0.75 "Good", >= 0.6 "Fair",
        >= 0.4 "Poor", otherwise "Very Poor".
        """
        if accuracy >= 0.9:
            return "Excellent"
        if accuracy >= 0.75:
            return "Good"
        if accuracy >= 0.6:
            return "Fair"
        if accuracy >= 0.4:
            return "Poor"
        return "Very Poor"

    @staticmethod
    def calculate_improvement_percentage(baseline_score: float, current_score: float) -> float:
        """Return the percentage change from the baseline, rounded to 4 places.

        Returns 0.0 when ``baseline_score <= 0``, since a relative change is
        undefined there.
        """
        if baseline_score <= 0:
            return 0.0

        improvement = ((current_score - baseline_score) / baseline_score) * 100
        return round(improvement, 4)

    @staticmethod
    def aggregate_test_results(results: List["BenchmarkTestResult"]) -> Dict[str, Any]:
        """Aggregate multiple test results into summary statistics.

        Returns:
            A dict with the keys ``total_tests``, ``passed_tests``,
            ``accuracy``, ``average_score``, ``average_duration``,
            ``total_duration``, ``total_cost`` and ``error_rate``. The same
            keys are present for empty input, all set to zero.
        """
        if not results:
            return {
                "total_tests": 0,
                "passed_tests": 0,
                "accuracy": 0.0,
                "average_score": 0.0,
                "average_duration": 0.0,
                # This key was missing from the empty case, so callers reading
                # it got a KeyError only when there were no results.
                "total_duration": 0.0,
                "total_cost": 0.0,
                "error_rate": 0.0,
            }

        total_tests = len(results)
        passed_tests = sum(1 for r in results if r.passed)
        total_score = sum(r.score for r in results)
        total_duration = sum(r.duration for r in results)
        total_cost = sum(r.cost for r in results)
        error_count = sum(1 for r in results if r.error is not None)

        return {
            "total_tests": total_tests,
            "passed_tests": passed_tests,
            "accuracy": passed_tests / total_tests,
            "average_score": total_score / total_tests,
            "average_duration": total_duration / total_tests,
            "total_duration": total_duration,
            "total_cost": total_cost,
            "error_rate": error_count / total_tests,
        }
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
"""Report generation for benchmark results."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from dataclasses import asdict
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, List
|
|
8
|
+
|
|
9
|
+
from .config import get_performance_tier
|
|
10
|
+
from .core import BenchmarkResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ReportGenerator:
    """Render one benchmark result as JSON, Markdown, HTML or CSV files.

    All writers open their output as UTF-8. The Markdown and HTML reports
    contain emoji and the HTML declares ``charset="UTF-8"``, so relying on
    the platform default encoding can fail or produce a mis-declared file.
    """

    # (lower bound, slug) pairs, highest first. Shared by the tier and
    # progress-bar CSS class helpers so the thresholds live in one place.
    _TIER_SLUGS = (
        (0.9, "excellent"),
        (0.75, "good"),
        (0.6, "fair"),
        (0.4, "poor"),
    )

    # Static stylesheet for the HTML report. Kept outside the f-strings so
    # the CSS braces do not need doubling.
    _HTML_STYLE = """
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            padding: 30px;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        h1, h2, h3 {
            color: #333;
        }
        .summary {
            background: #f8f9fa;
            padding: 20px;
            border-radius: 6px;
            margin: 20px 0;
        }
        .metric {
            display: inline-block;
            margin: 10px 20px 10px 0;
        }
        .metric-value {
            font-size: 1.5em;
            font-weight: bold;
            color: #007bff;
        }
        .metric-label {
            font-size: 0.9em;
            color: #666;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
        }
        th, td {
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid #ddd;
        }
        th {
            background-color: #f8f9fa;
            font-weight: 600;
        }
        .pass { color: #28a745; }
        .fail { color: #dc3545; }
        .tier-excellent { color: #28a745; font-weight: bold; }
        .tier-good { color: #17a2b8; }
        .tier-fair { color: #ffc107; }
        .tier-poor { color: #fd7e14; }
        .tier-very-poor { color: #dc3545; }
        .progress-bar {
            background: #e9ecef;
            border-radius: 10px;
            height: 20px;
            margin: 5px 0;
        }
        .progress-fill {
            height: 100%;
            border-radius: 10px;
            transition: width 0.3s ease;
        }
        .progress-excellent { background: #28a745; }
        .progress-good { background: #17a2b8; }
        .progress-fair { background: #ffc107; }
        .progress-poor { background: #fd7e14; }
        .progress-very-poor { background: #dc3545; }
    """

    def __init__(self, result: "BenchmarkResult"):
        """Store the benchmark result that every report is rendered from."""
        self.result = result

    def generate_json_report(self, output_path: Path) -> None:
        """Write a comprehensive JSON report to ``output_path``.

        Dataclass values (config, model performances, individual test
        results) are converted with ``asdict``. Anything ``json`` cannot
        serialise natively is stringified via ``default=str``.
        """
        report_data = {
            "benchmark_info": {
                "start_time": self.result.start_time.isoformat(),
                "end_time": self.result.end_time.isoformat(),
                "total_duration": self.result.total_duration,
                "models_tested": list(self.result.model_performances.keys()),
                "total_tests": len(self._get_all_test_ids()),
            },
            "config": asdict(self.result.config),
            "summary": self.result.summary,
            "model_performances": {model: asdict(perf) for model, perf in self.result.model_performances.items()},
            "detailed_results": {
                model: {test_id: asdict(result) for test_id, result in tests.items()}
                for model, tests in self.result.test_results.items()
            },
            "errors": self.result.errors,
            "generated_at": datetime.now().isoformat(),
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(report_data, f, indent=2, default=str)

    def generate_markdown_report(self, output_path: Path) -> None:
        """Write a human-readable Markdown report to ``output_path``.

        Sections: summary, model rankings (only when the summary has a
        ``model_rankings`` entry), per-model performance, per-category
        breakdown, per-test tables, and errors (only when any were recorded).
        """
        summary = self.result.summary
        lines = [
            "# Tsugite Benchmark Report",
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "",
            "## Summary",
            f"- **Duration**: {self.result.total_duration:.2f} seconds",
            f"- **Models Tested**: {len(self.result.model_performances)}",
            f"- **Total Tests**: {summary.get('total_tests', 0)}",
            f"- **Average Accuracy**: {summary.get('average_accuracy', 0):.1%}",
            "",
        ]

        # Model rankings
        if "model_rankings" in summary:
            lines.append("## Model Rankings")
            lines.append("| Rank | Model | Accuracy | Avg Duration | Total Cost |")
            lines.append("|------|-------|----------|--------------|------------|")

            for i, ranking in enumerate(summary["model_rankings"], 1):
                model = ranking["model"]
                accuracy = f"{ranking['accuracy']:.1%}"
                duration = f"{ranking['avg_duration']:.2f}s"
                cost = f"${ranking['total_cost']:.4f}"
                lines.append(f"| {i} | {model} | {accuracy} | {duration} | {cost} |")
            lines.append("")

        # Detailed performance
        lines.append("## Detailed Performance")
        for model, performance in self.result.model_performances.items():
            lines.append(f"### {model}")
            lines.append(
                f"- **Accuracy**: {performance.accuracy:.1%} ({performance.passed_tests}/{performance.total_tests})"
            )
            lines.append(f"- **Average Duration**: {performance.average_duration:.2f}s")
            lines.append(f"- **Average Steps**: {performance.average_steps:.1f}")
            lines.append(f"- **Total Cost**: ${performance.total_cost:.4f}")
            lines.append(f"- **Performance Tier**: {get_performance_tier(performance.accuracy)}")
            lines.append("")

        # Category breakdown
        category_breakdown = self._calculate_category_breakdown()
        if category_breakdown:
            lines.append("## Performance by Category")
            for category, data in category_breakdown.items():
                lines.append(f"### {category.title()}")
                lines.append(f"- **Best Model**: {data['best_model']} ({data['best_score']:.1%})")
                lines.append(f"- **Average Score**: {data['average_score']:.1%}")
                lines.append("")

        # Test details
        lines.append("## Test Results Details")
        for model in self.result.model_performances.keys():
            lines.append(f"### {model}")
            lines.append("| Test ID | Category | Result | Score | Duration | Steps | Cost |")
            lines.append("|---------|----------|--------|-------|----------|-------|------|")

            model_tests = self.result.test_results.get(model, {})
            for test_id, test_result in model_tests.items():
                category = test_result.category
                result_status = "✅ PASS" if test_result.passed else "❌ FAIL"
                score = f"{test_result.score:.2f}"
                duration = f"{test_result.duration:.2f}s"
                steps = str(test_result.steps_taken)
                cost = f"${test_result.cost:.4f}" if test_result.cost > 0 else "$0.00"
                lines.append(f"| {test_id} | {category} | {result_status} | {score} | {duration} | {steps} | {cost} |")
            lines.append("")

        # Errors
        if self.result.errors:
            lines.append("## Errors")
            for error in self.result.errors:
                lines.append(f"- {error}")
            lines.append("")

        # The status cells contain emoji, so the encoding must be explicit.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))

    def generate_html_report(self, output_path: Path) -> None:
        """Write the HTML report to ``output_path`` as UTF-8.

        The document declares ``charset="UTF-8"``, so the file is written
        with that encoding regardless of the platform default.
        """
        html_content = self._generate_html_content()

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html_content)

    def generate_csv_summary(self, output_path: Path) -> None:
        """Write one CSV row per (model, test) pair to ``output_path``.

        The ``Token_Usage`` column holds the ``"total"`` entry of each
        result's ``token_usage`` mapping (0 when absent). ``Error`` is an
        empty string when no error was recorded.
        """
        import csv

        header = [
            "Model",
            "Test_ID",
            "Category",
            "Passed",
            "Score",
            "Duration",
            "Steps",
            "Token_Usage",
            "Cost",
            "Error",
        ]

        # newline="" lets the csv module control line endings itself.
        with open(output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(header)

            for model, tests in self.result.test_results.items():
                for test_id, test_result in tests.items():
                    writer.writerow(
                        [
                            model,
                            test_id,
                            test_result.category,
                            test_result.passed,
                            test_result.score,
                            test_result.duration,
                            test_result.steps_taken,
                            test_result.token_usage.get("total", 0),
                            test_result.cost,
                            test_result.error or "",
                        ]
                    )

    def _get_all_test_ids(self) -> List[str]:
        """Return the unique test IDs seen across all models (order unspecified)."""
        all_test_ids = set()
        for tests in self.result.test_results.values():
            all_test_ids.update(tests.keys())
        return list(all_test_ids)

    def _calculate_category_breakdown(self) -> Dict[str, Any]:
        """Calculate the per-category performance breakdown.

        Returns:
            A mapping ``category -> {"best_model", "best_score",
            "average_score", "model_scores"}``. ``model_scores`` holds each
            model's mean score in that category, and ``average_score`` is the
            mean of those per-model means.
        """
        # category -> model -> list of raw scores
        scores_by_category: Dict[str, Dict[str, List[float]]] = {}
        for model, tests in self.result.test_results.items():
            for test_result in tests.values():
                per_model = scores_by_category.setdefault(test_result.category, {})
                per_model.setdefault(model, []).append(test_result.score)

        breakdown = {}
        for category, model_scores in scores_by_category.items():
            # Each category entry holds at least one model with at least one
            # score, so the averages and max() below never see empty input.
            avg_scores = {model: sum(scores) / len(scores) for model, scores in model_scores.items()}
            best_model, best_score = max(avg_scores.items(), key=lambda item: item[1])

            breakdown[category] = {
                "best_model": best_model,
                "best_score": best_score,
                "average_score": sum(avg_scores.values()) / len(avg_scores),
                "model_scores": avg_scores,
            }

        return breakdown

    def _generate_html_content(self) -> str:
        """Build the complete HTML document for the report as one string.

        Model names are HTML-escaped before interpolation so characters such
        as ``<`` or ``&`` cannot break the markup.
        """
        from html import escape  # local import: the module's import block stays untouched

        summary = self.result.summary
        generated = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        total_tests = summary.get("total_tests", 0)
        average_accuracy = summary.get("average_accuracy", 0)

        chunks = [
            f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Tsugite Benchmark Report</title>
    <style>{self._HTML_STYLE}</style>
</head>
<body>
    <div class="container">
        <h1>🚀 Tsugite Benchmark Report</h1>
        <p>Generated: {generated}</p>

        <div class="summary">
            <h2>📊 Summary</h2>
            <div class="metric">
                <div class="metric-value">{self.result.total_duration:.1f}s</div>
                <div class="metric-label">Total Duration</div>
            </div>
            <div class="metric">
                <div class="metric-value">{len(self.result.model_performances)}</div>
                <div class="metric-label">Models Tested</div>
            </div>
            <div class="metric">
                <div class="metric-value">{total_tests}</div>
                <div class="metric-label">Total Tests</div>
            </div>
            <div class="metric">
                <div class="metric-value">{average_accuracy:.1%}</div>
                <div class="metric-label">Average Accuracy</div>
            </div>
        </div>

        <h2>🏆 Model Rankings</h2>
        <table>
            <thead>
                <tr>
                    <th>Rank</th>
                    <th>Model</th>
                    <th>Accuracy</th>
                    <th>Performance</th>
                    <th>Avg Duration</th>
                    <th>Total Cost</th>
                </tr>
            </thead>
            <tbody>
"""
        ]

        # Model rankings table (left empty when the summary has no rankings).
        if "model_rankings" in summary:
            for rank, ranking in enumerate(summary["model_rankings"], 1):
                accuracy = ranking["accuracy"]
                tier_class = self._get_tier_class(accuracy)
                progress_class = self._get_progress_class(accuracy)
                model_name = escape(str(ranking["model"]))
                avg_duration = ranking["avg_duration"]
                total_cost = ranking["total_cost"]

                chunks.append(
                    f"""
                <tr>
                    <td>{rank}</td>
                    <td><strong>{model_name}</strong></td>
                    <td>
                        {accuracy:.1%}
                        <div class="progress-bar">
                            <div class="progress-fill {progress_class}" style="width: {accuracy * 100}%"></div>
                        </div>
                    </td>
                    <td><span class="{tier_class}">{get_performance_tier(accuracy)}</span></td>
                    <td>{avg_duration:.2f}s</td>
                    <td>${total_cost:.4f}</td>
                </tr>
"""
                )

        chunks.append(
            """
            </tbody>
        </table>

        <h2>📈 Detailed Results</h2>
"""
        )

        # One short paragraph per model.
        for model, performance in self.result.model_performances.items():
            tier_class = self._get_tier_class(performance.accuracy)
            chunks.append(
                f"""
        <h3>{escape(str(model))}</h3>
        <p>
            <span class="{tier_class}">
                {get_performance_tier(performance.accuracy)}
            </span>
            - {performance.accuracy:.1%} accuracy
            ({performance.passed_tests}/{performance.total_tests} tests passed)
        </p>
"""
            )

        chunks.append(
            """
    </div>
</body>
</html>
"""
        )

        return "".join(chunks)

    @classmethod
    def _tier_slug(cls, accuracy: float) -> str:
        """Map an accuracy fraction to its tier slug (e.g. ``"good"``)."""
        for lower_bound, slug in cls._TIER_SLUGS:
            if accuracy >= lower_bound:
                return slug
        return "very-poor"

    def _get_tier_class(self, accuracy: float) -> str:
        """Return the ``tier-*`` CSS class for ``accuracy``."""
        return f"tier-{self._tier_slug(accuracy)}"

    def _get_progress_class(self, accuracy: float) -> str:
        """Return the ``progress-*`` CSS class for ``accuracy``."""
        return f"progress-{self._tier_slug(accuracy)}"
|