code-context-control 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/_hook_utils.py +99 -0
- cli/c3.py +6152 -0
- cli/commands/__init__.py +1 -0
- cli/commands/common.py +312 -0
- cli/commands/parser.py +286 -0
- cli/docs.html +3178 -0
- cli/edits.html +878 -0
- cli/hook_auto_snapshot.py +142 -0
- cli/hook_c3_signal.py +61 -0
- cli/hook_c3read.py +116 -0
- cli/hook_edit_ledger.py +213 -0
- cli/hook_edit_unlock.py +170 -0
- cli/hook_filter.py +130 -0
- cli/hook_ghost_files.py +238 -0
- cli/hook_pretool_enforce.py +334 -0
- cli/hook_read.py +200 -0
- cli/hook_session_stats.py +62 -0
- cli/hook_terse_advisor.py +190 -0
- cli/hub.html +3764 -0
- cli/hub_server.py +1619 -0
- cli/mcp_proxy.py +428 -0
- cli/mcp_server.py +660 -0
- cli/server.py +2985 -0
- cli/tools/__init__.py +4 -0
- cli/tools/_helpers.py +65 -0
- cli/tools/agent.py +1165 -0
- cli/tools/compress.py +215 -0
- cli/tools/delegate.py +1184 -0
- cli/tools/edit.py +313 -0
- cli/tools/edits.py +118 -0
- cli/tools/filter.py +285 -0
- cli/tools/impact.py +163 -0
- cli/tools/memory.py +469 -0
- cli/tools/read.py +224 -0
- cli/tools/search.py +337 -0
- cli/tools/session.py +95 -0
- cli/tools/shell.py +193 -0
- cli/tools/status.py +306 -0
- cli/tools/validate.py +310 -0
- cli/ui/api.js +36 -0
- cli/ui/app.js +207 -0
- cli/ui/components/chat.js +758 -0
- cli/ui/components/dashboard.js +689 -0
- cli/ui/components/edits.js +220 -0
- cli/ui/components/instructions.js +481 -0
- cli/ui/components/memory.js +626 -0
- cli/ui/components/sessions.js +606 -0
- cli/ui/components/settings.js +1404 -0
- cli/ui/components/sidebar.js +156 -0
- cli/ui/icons.js +51 -0
- cli/ui/shared.js +119 -0
- cli/ui/theme.js +22 -0
- cli/ui.html +168 -0
- cli/ui_legacy.html +6797 -0
- cli/ui_nano.html +503 -0
- code_context_control-2.28.0.dist-info/METADATA +248 -0
- code_context_control-2.28.0.dist-info/RECORD +150 -0
- code_context_control-2.28.0.dist-info/WHEEL +5 -0
- code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
- code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
- code_context_control-2.28.0.dist-info/top_level.txt +5 -0
- core/__init__.py +75 -0
- core/config.py +269 -0
- core/ide.py +188 -0
- oracle/__init__.py +1 -0
- oracle/config.py +75 -0
- oracle/oracle.html +3900 -0
- oracle/oracle_server.py +663 -0
- oracle/services/__init__.py +1 -0
- oracle/services/c3_bridge.py +210 -0
- oracle/services/chat_engine.py +1103 -0
- oracle/services/chat_store.py +155 -0
- oracle/services/cross_memory.py +154 -0
- oracle/services/federated_graph.py +463 -0
- oracle/services/health_checker.py +117 -0
- oracle/services/insight_engine.py +307 -0
- oracle/services/memory_reader.py +106 -0
- oracle/services/memory_writer.py +182 -0
- oracle/services/ollama_bridge.py +332 -0
- oracle/services/project_scanner.py +87 -0
- oracle/services/review_agent.py +206 -0
- services/__init__.py +1 -0
- services/activity_log.py +93 -0
- services/agent_base.py +124 -0
- services/agents.py +1529 -0
- services/auto_memory.py +407 -0
- services/bench/__init__.py +6 -0
- services/bench/external/__init__.py +29 -0
- services/bench/external/aider_polyglot.py +405 -0
- services/bench/external/swe_bench.py +485 -0
- services/benchmark_dashboard.py +596 -0
- services/claude_md.py +785 -0
- services/compressor.py +592 -0
- services/context_snapshot.py +356 -0
- services/conversation_store.py +870 -0
- services/doc_index.py +537 -0
- services/e2e_benchmark.py +2884 -0
- services/e2e_evaluator.py +396 -0
- services/e2e_tasks.py +743 -0
- services/edit_ledger.py +459 -0
- services/embedding_index.py +341 -0
- services/error_reporting.py +123 -0
- services/file_memory.py +734 -0
- services/hub_service.py +585 -0
- services/indexer.py +712 -0
- services/memory.py +318 -0
- services/memory_consolidator.py +538 -0
- services/memory_graph.py +382 -0
- services/memory_grounder.py +304 -0
- services/memory_scorer.py +246 -0
- services/metrics.py +86 -0
- services/notifications.py +209 -0
- services/ollama_client.py +201 -0
- services/output_filter.py +488 -0
- services/parser.py +1238 -0
- services/project_manager.py +579 -0
- services/protocol.py +306 -0
- services/proxy_state.py +152 -0
- services/retrieval_broker.py +129 -0
- services/router.py +414 -0
- services/runtime.py +326 -0
- services/session_benchmark.py +1945 -0
- services/session_manager.py +1026 -0
- services/session_preloader.py +251 -0
- services/text_index.py +90 -0
- services/tool_classifier.py +176 -0
- services/transcript_index.py +340 -0
- services/validation_cache.py +155 -0
- services/vector_store.py +299 -0
- services/version_tracker.py +271 -0
- services/watcher.py +192 -0
- tui/__init__.py +0 -0
- tui/backend.py +59 -0
- tui/main.py +145 -0
- tui/screens/__init__.py +1 -0
- tui/screens/benchmark_view.py +109 -0
- tui/screens/claudemd_view.py +46 -0
- tui/screens/compress_view.py +52 -0
- tui/screens/index_view.py +74 -0
- tui/screens/init_view.py +82 -0
- tui/screens/mcp_view.py +73 -0
- tui/screens/optimize_view.py +41 -0
- tui/screens/pipe_view.py +46 -0
- tui/screens/projects_view.py +355 -0
- tui/screens/search_view.py +55 -0
- tui/screens/session_view.py +143 -0
- tui/screens/stats.py +158 -0
- tui/screens/ui_view.py +54 -0
- tui/theme.tcss +335 -0
|
@@ -0,0 +1,596 @@
|
|
|
1
|
+
"""Unified benchmark dashboard.
|
|
2
|
+
|
|
3
|
+
Aggregates results from the C3 benchmark tiers into one HTML index:
|
|
4
|
+
- Quick (local synthetic) -> .c3/benchmark/runs/benchmark_*.json
|
|
5
|
+
- Session (workflow synthetic) -> .c3/session_benchmark/runs/session_*.json
|
|
6
|
+
- E2E (real AI CLI calls) -> .c3/e2e_benchmark/runs/*.json
|
|
7
|
+
- Delegate (Ollama vs Codex) -> .c3/e2e_benchmark/runs/delegate_*.json
|
|
8
|
+
|
|
9
|
+
Output: .c3/benchmarks/index.html — a single entry point with tier badges,
|
|
10
|
+
latest metrics, run history tables, and links to the detailed per-tier reports.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import html
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
# Badge shown for each benchmark tier: tier key -> (label text, CSS background
# colour). _badge() looks tiers up here and falls back to an "Unknown" badge
# for keys that are not listed.
TIER_BADGES = {
    "quick": ("Synthetic", "#818cf8"),
    "session": ("Synthetic", "#818cf8"),
    "e2e": ("Live AI", "#34d399"),
    "delegate": ("Live AI", "#34d399"),
    "external": ("External", "#fbbf24"),
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _load_runs(runs_dir: Path, prefix: str = "") -> list[dict]:
    """Load every benchmark run stored as a JSON file in *runs_dir*.

    Args:
        runs_dir: Directory holding one ``*.json`` file per run.
        prefix: When non-empty, only files whose name starts with it are read.

    Returns:
        The decoded run objects ordered by their ``"timestamp"`` value, oldest
        first. Files that cannot be read or decoded, and files whose top-level
        JSON value is not an object, are skipped. A missing directory yields
        an empty list.
    """
    if not runs_dir.is_dir():
        return []
    runs: list[dict] = []
    for f in sorted(runs_dir.glob("*.json")):
        if prefix and not f.name.startswith(prefix):
            continue
        try:
            payload = json.loads(f.read_text(encoding="utf-8"))
        except (OSError, ValueError, RecursionError):
            # Loading is best-effort: one unreadable or corrupt file must not
            # hide the remaining runs. ValueError covers both JSON and
            # UTF-8 decoding failures.
            continue
        # Every consumer calls .get() on a run, so anything that is not a JSON
        # object (a bare list, number, string, ...) cannot be a run.
        if isinstance(payload, dict):
            runs.append(payload)
    # Compare timestamps as strings so a null or numeric value in one file
    # cannot make the sort raise TypeError.
    runs.sort(key=lambda r: str(r.get("timestamp") or ""))
    return runs
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _latest_quick_metrics(run: dict) -> dict[str, Any]:
    """Extract the headline figures of one quick-benchmark run.

    Args:
        run: Decoded run JSON. Figures are read from ``scorecard.token_usage``
            and ``scorecard.performance``.

    Returns:
        A flat dict with ``timestamp``, ``token_savings_pct``,
        ``budget_multiplier``, ``quality_c3``, ``quality_baseline`` and
        ``files_considered``. Missing figures default to ``0`` (``""`` for the
        timestamp).
    """
    sc = run.get("scorecard")
    if not isinstance(sc, dict):
        sc = {}
    # A section stored as null (or any other non-object) is treated as absent
    # rather than being allowed to raise AttributeError on .get().
    tok = sc.get("token_usage")
    if not isinstance(tok, dict):
        tok = {}
    perf = sc.get("performance")
    if not isinstance(perf, dict):
        perf = {}
    return {
        "timestamp": run.get("timestamp", ""),
        "token_savings_pct": tok.get("savings_pct", 0),
        "budget_multiplier": tok.get("prompt_budget_multiplier", 0),
        "quality_c3": perf.get("with_c3_quality_pct", 0),
        "quality_baseline": perf.get("without_c3_quality_pct", 0),
        "files_considered": run.get("files_considered", 0),
    }
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _latest_session_metrics(run: dict) -> dict[str, Any]:
    """Extract the headline figures of one session-benchmark run.

    Args:
        run: Decoded run JSON. Figures are read from ``scorecard`` and
            ``session_longevity``; ``scenarios`` is only counted.

    Returns:
        A flat dict with ``timestamp``, token/budget/quality figures, the
        estimated turn counts and multiplier, and ``scenarios`` (the number of
        scenario entries). Missing figures default to ``0`` (``""`` for the
        timestamp).
    """
    # Sections stored as null (or any non-object) are treated as absent so a
    # partially written run cannot raise AttributeError here.
    sc = run.get("scorecard")
    if not isinstance(sc, dict):
        sc = {}
    lon = run.get("session_longevity")
    if not isinstance(lon, dict):
        lon = {}
    scenarios = run.get("scenarios")
    scenario_count = len(scenarios) if hasattr(scenarios, "__len__") else 0
    return {
        "timestamp": run.get("timestamp", ""),
        "token_savings_pct": sc.get("token_savings_pct", 0),
        "budget_multiplier": sc.get("budget_multiplier", 0),
        "quality_c3": sc.get("avg_quality_c3", 0),
        "quality_baseline": sc.get("avg_quality_baseline", 0),
        "turns_c3": lon.get("estimated_turns_c3", 0),
        "turns_baseline": lon.get("estimated_turns_baseline", 0),
        "turn_multiplier": lon.get("turn_multiplier", 0),
        "scenarios": scenario_count,
    }
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _latest_e2e_metrics(run: dict) -> dict[str, Any]:
    """Extract the headline figures of one end-to-end benchmark run.

    Args:
        run: Decoded run JSON. Figures are read from ``scorecard`` and
            ``efficiency_summary``.

    Returns:
        A flat dict with ``timestamp``, win rate, average scores and delta,
        time/cost/token savings, ``providers`` and ``tasks``. Missing figures
        default to ``0``, the timestamp to ``""`` and ``providers`` to ``[]``.
    """
    # Sections stored as null (or any non-object) are treated as absent so a
    # partially written run cannot raise AttributeError here.
    sc = run.get("scorecard")
    if not isinstance(sc, dict):
        sc = {}
    eff = run.get("efficiency_summary")
    if not isinstance(eff, dict):
        eff = {}
    return {
        "timestamp": run.get("timestamp", ""),
        "win_rate_c3": sc.get("win_rate_c3", 0),
        "avg_score_c3": sc.get("avg_score_c3", 0),
        "avg_score_baseline": sc.get("avg_score_baseline", 0),
        "score_delta": sc.get("avg_score_delta", 0),
        "time_saved_s": eff.get("total_time_saved_s", 0),
        "cost_saved_usd": eff.get("total_cost_saved_usd", 0),
        "tokens_saved": eff.get("total_tokens_saved", 0),
        # Callers join this value; a stored null must behave like "none listed".
        "providers": run.get("providers_tested") or [],
        "tasks": run.get("tasks_run", 0),
    }
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _latest_delegate_metrics(run: dict) -> dict[str, Any]:
    """Extract the figures of one delegate-comparison run.

    Args:
        run: Decoded run JSON.

    Returns:
        A dict with ``timestamp``, ``backends`` (the run's per-backend
        mapping, or ``{}`` when it is missing or not an object) and
        ``total_results`` (default ``0``).
    """
    backends = run.get("backends")
    if not isinstance(backends, dict):
        # Callers iterate .items(); a null or malformed value counts as
        # "no backends" instead of raising.
        backends = {}
    return {
        "timestamp": run.get("timestamp", ""),
        "backends": backends,
        "total_results": run.get("total_results", 0),
    }
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _latest_external_metrics(run: dict) -> dict[str, Any]:
    """Extract the headline figures of one external-suite benchmark run.

    Args:
        run: Decoded run JSON. Pass rates, latencies and costs are read from
            ``scorecard``.

    Returns:
        A flat dict with ``timestamp``, ``suite`` (falling back to
        ``benchmark_type`` and then ``"external"``), ``model``, ``languages``,
        ``exercises_run`` and the with-C3 / baseline pass-rate, latency and
        cost figures. Missing figures default to ``0``, strings to ``""`` and
        ``languages`` to ``[]``.
    """
    sc = run.get("scorecard")
    if not isinstance(sc, dict):
        # A null or malformed scorecard is treated as absent.
        sc = {}
    return {
        "timestamp": run.get("timestamp", ""),
        "suite": run.get("suite", run.get("benchmark_type", "external")),
        "model": run.get("model", ""),
        # Callers join this value; a stored null must behave like "none listed".
        "languages": run.get("languages") or [],
        "exercises_run": run.get("exercises_run", 0),
        "with_c3_pass_rate": sc.get("with_c3_pass_rate", 0),
        "baseline_pass_rate": sc.get("baseline_pass_rate", 0),
        "pass_rate_delta": sc.get("pass_rate_delta", 0),
        "with_c3_avg_latency_s": sc.get("with_c3_avg_latency_s", 0),
        "baseline_avg_latency_s": sc.get("baseline_avg_latency_s", 0),
        "with_c3_total_cost_usd": sc.get("with_c3_total_cost_usd", 0),
        "baseline_total_cost_usd": sc.get("baseline_total_cost_usd", 0),
    }
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _fmt_ts(ts: str) -> str:
    """Return *ts* in display form, HTML-escaped.

    Every ``T`` becomes a space and every ``Z`` is removed; an empty
    timestamp is shown as an em dash.
    """
    if ts:
        display = ts.replace("T", " ").replace("Z", "")
        return html.escape(display)
    return "—"
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _badge(tier: str) -> str:
    """Return the coloured badge ``<span>`` for *tier*.

    Label and colour come from ``TIER_BADGES``; a tier that is not listed
    there gets a grey "Unknown" badge.
    """
    fallback = ("Unknown", "#888")
    label, color = TIER_BADGES.get(tier, fallback)
    return '<span class="tier-badge" style="background:{}">{}</span>'.format(color, label)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _card_quick(runs: list[dict], detail_url: str) -> str:
    """Build the summary card for the Quick tier.

    With no runs, a placeholder card naming the command to run is returned.
    Otherwise the card shows the metrics of the last entry of *runs* and
    passes *detail_url* through for the detail link.
    """
    if runs:
        latest = _latest_quick_metrics(runs[-1])
        body = f"""
        <div class="card-metric"><span class="metric-label">Token savings</span><span class="metric-val good">{latest['token_savings_pct']}%</span></div>
        <div class="card-metric"><span class="metric-label">Budget multiplier</span><span class="metric-val">{latest['budget_multiplier']}x</span></div>
        <div class="card-metric"><span class="metric-label">Quality (C3 / base)</span><span class="metric-val">{latest['quality_c3']:.0f}% / {latest['quality_baseline']:.0f}%</span></div>
        <div class="card-metric"><span class="metric-label">Files sampled</span><span class="metric-val">{latest['files_considered']}</span></div>
    """
        return _wrap_card("Quick", "quick", body, latest["timestamp"], len(runs), detail_url)
    return _empty_card(
        "Quick",
        "quick",
        "Local synthetic micro-benchmark: compression, retrieval, file maps.",
        "c3 bench quick",
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _card_session(runs: list[dict], detail_url: str) -> str:
    """Build the summary card for the Session tier.

    With no runs, a placeholder card naming the command to run is returned.
    Otherwise the card shows the metrics of the last entry of *runs* and
    passes *detail_url* through for the detail link.
    """
    if runs:
        latest = _latest_session_metrics(runs[-1])
        body = f"""
        <div class="card-metric"><span class="metric-label">Token savings</span><span class="metric-val good">{latest['token_savings_pct']}%</span></div>
        <div class="card-metric"><span class="metric-label">Budget multiplier</span><span class="metric-val">{latest['budget_multiplier']}x</span></div>
        <div class="card-metric"><span class="metric-label">Quality (C3 / base)</span><span class="metric-val">{latest['quality_c3']:.0f}% / {latest['quality_baseline']:.0f}%</span></div>
        <div class="card-metric"><span class="metric-label">Session turns</span><span class="metric-val">{latest['turns_c3']:.0f} vs {latest['turns_baseline']:.0f} ({latest['turn_multiplier']}x)</span></div>
    """
        return _wrap_card("Session", "session", body, latest["timestamp"], len(runs), detail_url)
    return _empty_card(
        "Session",
        "session",
        "6 workflow scenarios (bug, feature, review, log, refactor, onboarding).",
        "c3 bench session",
    )
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _card_e2e(runs: list[dict], detail_url: str) -> str:
    """Build the summary card for the E2E tier.

    With no runs, a placeholder card naming the command to run is returned.
    Otherwise the card shows the metrics of the last entry of *runs*, with
    the provider list joined by commas (an em dash when it is empty), and
    passes *detail_url* through for the detail link.
    """
    if runs:
        latest = _latest_e2e_metrics(runs[-1])
        provider_text = ", ".join(latest["providers"])
        if not provider_text:
            provider_text = "—"
        body = f"""
        <div class="card-metric"><span class="metric-label">Win rate (C3)</span><span class="metric-val good">{latest['win_rate_c3']:.1f}%</span></div>
        <div class="card-metric"><span class="metric-label">Avg score (C3 / base)</span><span class="metric-val">{latest['avg_score_c3']:.3f} / {latest['avg_score_baseline']:.3f}</span></div>
        <div class="card-metric"><span class="metric-label">Time / cost saved</span><span class="metric-val">{latest['time_saved_s']:.0f}s / ${latest['cost_saved_usd']:.4f}</span></div>
        <div class="card-metric"><span class="metric-label">Providers</span><span class="metric-val">{html.escape(provider_text)}</span></div>
    """
        return _wrap_card("E2E", "e2e", body, latest["timestamp"], len(runs), detail_url)
    return _empty_card(
        "E2E",
        "e2e",
        "Real AI CLI calls (claude / gemini / codex). The most credible tier.",
        "c3 bench e2e",
    )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _card_delegate(runs: list[dict]) -> str:
    """Build the summary card for the Delegate tier.

    With no runs, a placeholder card naming the command to run is returned.
    Otherwise one metric row is rendered per backend of the last entry of
    *runs* (success rate, successes/tasks, average latency), or a single
    "No backends" row when there are none. The card carries no detail link.
    """
    if not runs:
        return _empty_card(
            "Delegate",
            "delegate",
            "Ollama vs Codex on the same tasks. Measures delegate backend quality.",
            "c3 bench delegate",
        )
    latest = _latest_delegate_metrics(runs[-1])
    backend_rows = [
        f'<div class="card-metric"><span class="metric-label">{html.escape(name)}</span>'
        f'<span class="metric-val">{info.get("success_rate", 0)}% ({info.get("successes", 0)}/{info.get("tasks_run", 0)}) '
        f'· {info.get("avg_latency_s", 0)}s</span></div>'
        for name, info in latest["backends"].items()
    ]
    if backend_rows:
        metrics_html = "\n".join(backend_rows)
    else:
        metrics_html = '<div class="card-metric"><span class="metric-label">No backends</span></div>'
    return _wrap_card("Delegate", "delegate", metrics_html, latest["timestamp"], len(runs), "")
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _card_external(runs: list[dict]) -> str:
    """Build the summary card for the External tier.

    With no runs, a placeholder card naming the command to run is returned.
    Otherwise the card shows suite, pass rates, the pass-rate delta (green
    when positive, red when negative, amber when zero) and the exercise count
    with the language list, all taken from the last entry of *runs*. The card
    carries no detail link.
    """
    if not runs:
        return _empty_card(
            "External",
            "external",
            "Aider Polyglot / SWE-bench. Third-party benchmarks for credible cross-tool comparisons.",
            "c3 bench external --suite aider-polyglot",
        )
    latest = _latest_external_metrics(runs[-1])
    delta = latest["pass_rate_delta"]
    if delta > 0:
        delta_color = "#34d399"
    elif delta < 0:
        delta_color = "#f87171"
    else:
        delta_color = "#fbbf24"
    langs = ", ".join(latest["languages"])
    if not langs:
        langs = "—"
    metrics_html = f"""
        <div class="card-metric"><span class="metric-label">Suite</span><span class="metric-val">{html.escape(latest['suite'])}</span></div>
        <div class="card-metric"><span class="metric-label">Pass rate (C3)</span><span class="metric-val good">{latest['with_c3_pass_rate']}%</span></div>
        <div class="card-metric"><span class="metric-label">Pass rate (base)</span><span class="metric-val">{latest['baseline_pass_rate']}%</span></div>
        <div class="card-metric"><span class="metric-label">Delta</span><span class="metric-val" style="color:{delta_color}">{delta:+.1f} pp</span></div>
        <div class="card-metric"><span class="metric-label">Exercises / langs</span><span class="metric-val">{latest['exercises_run']} · {html.escape(langs)}</span></div>
    """
    return _wrap_card("External", "external", metrics_html, latest["timestamp"], len(runs), "")
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _wrap_card(title: str, tier: str, metrics_html: str, ts: str, run_count: int, detail_url: str) -> str:
    """Wrap pre-rendered metric rows in the common card markup.

    Args:
        title: Card heading; HTML-escaped here.
        tier: Tier key used to pick the badge.
        metrics_html: Already-rendered HTML for the card body; inserted as is.
        ts: Timestamp of the latest run, shown in the footer.
        run_count: Number of runs, shown as "N run(s)".
        detail_url: Target of the "Open detail" link; no link is emitted when
            it is empty.

    Returns:
        The HTML of one card.
    """
    link = ""
    if detail_url:
        link = f'<a class="card-link" href="{html.escape(detail_url)}">Open detail →</a>'
    plural = "" if run_count == 1 else "s"
    badge_html = _badge(tier)
    last_run = _fmt_ts(ts)
    return f"""
    <div class="card">
      <div class="card-head">
        <span class="card-title">{html.escape(title)}</span>
        {badge_html}
      </div>
      <div class="card-metrics">{metrics_html}</div>
      <div class="card-foot">
        <span class="card-ts">Last run: {last_run}</span>
        <span class="card-runs">{run_count} run{plural}</span>
        {link}
      </div>
    </div>"""
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _empty_card(title: str, tier: str, desc: str, cmd: str) -> str:
    """Return the placeholder card shown for a tier that has no runs.

    The card carries the tier badge, the description *desc*, a "Not yet run"
    footer and the command *cmd*; *title*, *desc* and *cmd* are HTML-escaped.
    """
    safe_title = html.escape(title)
    safe_desc = html.escape(desc)
    safe_cmd = html.escape(cmd)
    badge_html = _badge(tier)
    return f"""
    <div class="card empty">
      <div class="card-head">
        <span class="card-title">{safe_title}</span>
        {badge_html}
      </div>
      <div class="card-metrics"><p class="empty-desc">{safe_desc}</p></div>
      <div class="card-foot">
        <span class="card-ts">Not yet run</span>
        <code class="card-cmd">{safe_cmd}</code>
      </div>
    </div>"""
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _runs_table(runs: list[dict], metric_extractor, headers: list[dict]) -> str:
    """Render the run-history table of one tier.

    Args:
        runs: Runs ordered oldest first. Only the last 20 are shown, newest
            first.
        metric_extractor: Callable mapping a run dict to a flat metrics dict
            that contains at least ``"timestamp"``.
        headers: Column specs, one dict per column with ``"key"`` and
            ``"label"`` and an optional ``"fmt"`` (a ``str.format`` template,
            default ``"{}"``). The first column is always rendered from the
            run timestamp, so its ``"key"`` and ``"fmt"`` are not consulted.

    Returns:
        An HTML ``<table>``, or a "No runs yet." paragraph when *runs* is
        empty. Lists are shown comma-joined; non-float falsy values (``0``,
        ``""``, ``None``) are shown as an em dash. All cell text and header
        labels are HTML-escaped.
    """
    if not runs:
        return '<p class="muted">No runs yet.</p>'
    rows = []
    for run in reversed(runs[-20:]):  # newest first, cap at 20
        m = metric_extractor(run)
        cells = [f'<td>{_fmt_ts(m["timestamp"])}</td>']
        for h in headers[1:]:
            val = m.get(h["key"], "")
            fmt = h.get("fmt", "{}")
            if isinstance(val, list):
                text = ", ".join(str(v) for v in val)
            elif isinstance(val, float) or val:
                try:
                    text = fmt.format(val)
                except (ValueError, TypeError):
                    # A value that does not fit the column's format spec (for
                    # example a string under "{:.0f}") is shown verbatim
                    # instead of aborting the whole dashboard.
                    text = str(val)
            else:
                text = "—"
            # Values come from run files on disk; escape every cell so that
            # strings such as suite or model names cannot inject markup.
            cells.append(f"<td>{html.escape(text)}</td>")
        rows.append(f'<tr>{"".join(cells)}</tr>')
    head = "".join(f'<th>{html.escape(str(h["label"]))}</th>' for h in headers)
    return f'<table class="runs-table"><thead><tr>{head}</tr></thead><tbody>{"".join(rows)}</tbody></table>'
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _project_rel(project_path: Path, target: Path) -> str:
    """Best-effort link path for *target*.

    Returns *target* relative to the parent directory of *project_path*,
    using forward slashes. When that cannot be computed, the ``file://`` URI
    of *target* is returned if it exists, otherwise an empty string.
    """
    try:
        relative = target.relative_to(project_path.parent)
    except Exception:
        if target.exists():
            return target.as_uri()
        return ""
    return str(relative).replace("\\", "/")
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def generate_dashboard(project_path: str) -> Path:
    """Generate the unified benchmark dashboard HTML.

    Loads the stored runs of every tier from ``<project>/.c3``, renders one
    summary card and one history table per tier, and writes the page to
    ``<project>/.c3/benchmarks/index.html`` (the directory is created when
    missing).

    Args:
        project_path: Project root directory; resolved to an absolute path.

    Returns:
        The path to the written HTML file.
    """
    root = Path(project_path).resolve()
    c3_dir = root / ".c3"
    out_dir = c3_dir / "benchmarks"

    quick_runs = _load_runs(c3_dir / "benchmark" / "runs")
    session_runs = _load_runs(c3_dir / "session_benchmark" / "runs")
    all_e2e = _load_runs(c3_dir / "e2e_benchmark" / "runs")
    external_runs = _load_runs(c3_dir / "external_benchmark" / "runs")

    # E2E and delegate runs share one directory; a run counts as a delegate
    # comparison when it says so or carries a per-backend breakdown.
    def _is_delegate(r: dict) -> bool:
        return r.get("benchmark_type") == "delegate_comparison" or "backends" in r

    e2e_runs = [r for r in all_e2e if not _is_delegate(r)]
    delegate_runs = [r for r in all_e2e if _is_delegate(r)]

    quick_html = c3_dir / "benchmark" / "latest.html"
    session_html = c3_dir / "session_benchmark" / "latest.html"
    e2e_html = c3_dir / "e2e_benchmark" / "latest.html"

    def _rel_or_empty(p: Path) -> str:
        # index.html is written to .c3/benchmarks/, so "../" already resolves
        # to .c3/. The remainder must therefore be relative to .c3/, not to
        # the project root; otherwise the link points at .c3/.c3/....
        return f"../{p.relative_to(c3_dir).as_posix()}" if p.exists() else ""

    cards = [
        _card_quick(quick_runs, _rel_or_empty(quick_html)),
        _card_session(session_runs, _rel_or_empty(session_html)),
        _card_e2e(e2e_runs, _rel_or_empty(e2e_html)),
        _card_delegate(delegate_runs),
        _card_external(external_runs),
    ]

    quick_table = _runs_table(
        quick_runs, _latest_quick_metrics,
        [
            {"key": "timestamp", "label": "Run"},
            {"key": "token_savings_pct", "label": "Token savings", "fmt": "{}%"},
            {"key": "budget_multiplier", "label": "Budget", "fmt": "{}x"},
            {"key": "quality_c3", "label": "Quality C3", "fmt": "{:.0f}%"},
            {"key": "quality_baseline", "label": "Quality base", "fmt": "{:.0f}%"},
            {"key": "files_considered", "label": "Files"},
        ],
    )
    session_table = _runs_table(
        session_runs, _latest_session_metrics,
        [
            {"key": "timestamp", "label": "Run"},
            {"key": "token_savings_pct", "label": "Token savings", "fmt": "{}%"},
            {"key": "budget_multiplier", "label": "Budget", "fmt": "{}x"},
            {"key": "quality_c3", "label": "Quality C3", "fmt": "{:.0f}%"},
            {"key": "turn_multiplier", "label": "Turn mult", "fmt": "{}x"},
            {"key": "scenarios", "label": "Scenarios"},
        ],
    )
    e2e_table = _runs_table(
        e2e_runs, _latest_e2e_metrics,
        [
            {"key": "timestamp", "label": "Run"},
            {"key": "win_rate_c3", "label": "Win rate", "fmt": "{:.1f}%"},
            {"key": "avg_score_c3", "label": "Score C3", "fmt": "{:.3f}"},
            {"key": "avg_score_baseline", "label": "Score base", "fmt": "{:.3f}"},
            {"key": "time_saved_s", "label": "Time saved", "fmt": "{:.0f}s"},
            {"key": "cost_saved_usd", "label": "Cost saved", "fmt": "${:.4f}"},
            {"key": "providers", "label": "Providers"},
            {"key": "tasks", "label": "Tasks"},
        ],
    )
    external_table = _runs_table(
        external_runs, _latest_external_metrics,
        [
            {"key": "timestamp", "label": "Run"},
            {"key": "suite", "label": "Suite"},
            {"key": "model", "label": "Model"},
            {"key": "with_c3_pass_rate", "label": "Pass C3", "fmt": "{}%"},
            {"key": "baseline_pass_rate", "label": "Pass base", "fmt": "{}%"},
            {"key": "pass_rate_delta", "label": "Delta", "fmt": "{:+.1f}pp"},
            {"key": "exercises_run", "label": "Exercises"},
            {"key": "languages", "label": "Languages"},
        ],
    )

    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "index.html"

    html_doc = _render_dashboard_html(
        project_path=str(root),
        cards_html="\n".join(cards),
        quick_table=quick_table,
        session_table=session_table,
        e2e_table=e2e_table,
        external_table=external_table,
        delegate_runs=delegate_runs,
        counts={
            "quick": len(quick_runs),
            "session": len(session_runs),
            "e2e": len(e2e_runs),
            "delegate": len(delegate_runs),
            "external": len(external_runs),
        },
    )
    out_path.write_text(html_doc, encoding="utf-8")
    return out_path
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def _render_dashboard_html(
|
|
375
|
+
*,
|
|
376
|
+
project_path: str,
|
|
377
|
+
cards_html: str,
|
|
378
|
+
quick_table: str,
|
|
379
|
+
session_table: str,
|
|
380
|
+
e2e_table: str,
|
|
381
|
+
external_table: str,
|
|
382
|
+
delegate_runs: list[dict],
|
|
383
|
+
counts: dict[str, int],
|
|
384
|
+
) -> str:
|
|
385
|
+
import time as _time
|
|
386
|
+
generated_at = _time.strftime("%Y-%m-%d %H:%M:%S")
|
|
387
|
+
project_name = html.escape(Path(project_path).name)
|
|
388
|
+
|
|
389
|
+
delegate_detail = ""
|
|
390
|
+
if delegate_runs:
|
|
391
|
+
latest = delegate_runs[-1]
|
|
392
|
+
backends = latest.get("backends", {})
|
|
393
|
+
rows = []
|
|
394
|
+
for backend, stats in backends.items():
|
|
395
|
+
rows.append(f"""
|
|
396
|
+
<tr>
|
|
397
|
+
<td><strong>{html.escape(backend)}</strong></td>
|
|
398
|
+
<td>{stats.get('success_rate', 0)}%</td>
|
|
399
|
+
<td>{stats.get('successes', 0)}/{stats.get('tasks_run', 0)}</td>
|
|
400
|
+
<td>{stats.get('avg_latency_s', 0)}s</td>
|
|
401
|
+
<td>{stats.get('avg_output_tokens', 0)}</td>
|
|
402
|
+
<td>{html.escape(', '.join(stats.get('models_used', [])))}</td>
|
|
403
|
+
</tr>""")
|
|
404
|
+
delegate_detail = f"""
|
|
405
|
+
<table class="runs-table">
|
|
406
|
+
<thead><tr>
|
|
407
|
+
<th>Backend</th><th>Success</th><th>Runs</th><th>Avg latency</th>
|
|
408
|
+
<th>Avg tokens</th><th>Models</th>
|
|
409
|
+
</tr></thead>
|
|
410
|
+
<tbody>{''.join(rows)}</tbody>
|
|
411
|
+
</table>"""
|
|
412
|
+
else:
|
|
413
|
+
delegate_detail = '<p class="muted">No delegate runs yet. Try: <code>c3 bench delegate</code></p>'
|
|
414
|
+
|
|
415
|
+
return f"""<!doctype html>
|
|
416
|
+
<html lang="en">
|
|
417
|
+
<head>
|
|
418
|
+
<meta charset="utf-8">
|
|
419
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
420
|
+
<title>C3 Benchmark Dashboard · {project_name}</title>
|
|
421
|
+
<style>
|
|
422
|
+
:root {{
|
|
423
|
+
--bg: #0b1020;
|
|
424
|
+
--surface: #131932;
|
|
425
|
+
--surface2: #1c2444;
|
|
426
|
+
--border: #2a3560;
|
|
427
|
+
--text: #e6ebff;
|
|
428
|
+
--text-dim: #9aa3c7;
|
|
429
|
+
--accent: #818cf8;
|
|
430
|
+
--good: #34d399;
|
|
431
|
+
--warn: #fbbf24;
|
|
432
|
+
--bad: #f87171;
|
|
433
|
+
}}
|
|
434
|
+
* {{ box-sizing: border-box; }}
|
|
435
|
+
body {{
|
|
436
|
+
margin: 0; padding: 2rem 1.5rem; background: var(--bg); color: var(--text);
|
|
437
|
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
|
438
|
+
line-height: 1.5;
|
|
439
|
+
}}
|
|
440
|
+
.container {{ max-width: 1200px; margin: 0 auto; }}
|
|
441
|
+
header {{
|
|
442
|
+
display: flex; align-items: center; justify-content: space-between;
|
|
443
|
+
margin-bottom: 2rem; padding-bottom: 1rem; border-bottom: 1px solid var(--border);
|
|
444
|
+
}}
|
|
445
|
+
h1 {{ margin: 0; font-size: 1.6rem; font-weight: 600; }}
|
|
446
|
+
.subtitle {{ color: var(--text-dim); font-size: 0.9rem; margin-top: 0.25rem; }}
|
|
447
|
+
.meta {{ text-align: right; color: var(--text-dim); font-size: 0.85rem; }}
|
|
448
|
+
.cards {{
|
|
449
|
+
display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
|
|
450
|
+
gap: 1rem; margin-bottom: 2rem;
|
|
451
|
+
}}
|
|
452
|
+
.card {{
|
|
453
|
+
background: var(--surface); border: 1px solid var(--border);
|
|
454
|
+
border-radius: 10px; padding: 1.1rem; display: flex; flex-direction: column;
|
|
455
|
+
}}
|
|
456
|
+
.card.empty {{ opacity: 0.65; border-style: dashed; }}
|
|
457
|
+
.card-head {{
|
|
458
|
+
display: flex; align-items: center; justify-content: space-between;
|
|
459
|
+
margin-bottom: 0.9rem;
|
|
460
|
+
}}
|
|
461
|
+
.card-title {{ font-size: 1.05rem; font-weight: 600; }}
|
|
462
|
+
.tier-badge {{
|
|
463
|
+
padding: 0.15rem 0.55rem; border-radius: 999px; font-size: 0.7rem;
|
|
464
|
+
font-weight: 600; color: #0b1020;
|
|
465
|
+
}}
|
|
466
|
+
.card-metrics {{ flex: 1; }}
|
|
467
|
+
.card-metric {{
|
|
468
|
+
display: flex; justify-content: space-between; padding: 0.4rem 0;
|
|
469
|
+
border-bottom: 1px dashed var(--surface2); font-size: 0.9rem;
|
|
470
|
+
}}
|
|
471
|
+
.card-metric:last-child {{ border-bottom: none; }}
|
|
472
|
+
.metric-label {{ color: var(--text-dim); }}
|
|
473
|
+
.metric-val {{ font-weight: 600; }}
|
|
474
|
+
.metric-val.good {{ color: var(--good); }}
|
|
475
|
+
.empty-desc {{ color: var(--text-dim); font-size: 0.85rem; margin: 0; }}
|
|
476
|
+
.card-foot {{
|
|
477
|
+
margin-top: 0.9rem; padding-top: 0.6rem; border-top: 1px solid var(--border);
|
|
478
|
+
display: flex; align-items: center; justify-content: space-between; gap: 0.5rem;
|
|
479
|
+
font-size: 0.8rem; color: var(--text-dim); flex-wrap: wrap;
|
|
480
|
+
}}
|
|
481
|
+
.card-link {{ color: var(--accent); text-decoration: none; font-weight: 500; }}
|
|
482
|
+
.card-link:hover {{ text-decoration: underline; }}
|
|
483
|
+
.card-cmd {{
|
|
484
|
+
background: var(--surface2); padding: 0.15rem 0.4rem; border-radius: 4px;
|
|
485
|
+
font-size: 0.75rem; color: var(--accent);
|
|
486
|
+
}}
|
|
487
|
+
.tabs {{
|
|
488
|
+
display: flex; gap: 0.25rem; margin-bottom: 1rem; border-bottom: 1px solid var(--border);
|
|
489
|
+
overflow-x: auto;
|
|
490
|
+
}}
|
|
491
|
+
.tab {{
|
|
492
|
+
padding: 0.6rem 1rem; background: none; border: none; color: var(--text-dim);
|
|
493
|
+
cursor: pointer; font-size: 0.95rem; border-bottom: 2px solid transparent;
|
|
494
|
+
font-family: inherit;
|
|
495
|
+
}}
|
|
496
|
+
.tab.active {{ color: var(--text); border-bottom-color: var(--accent); }}
|
|
497
|
+
.tab .count {{ color: var(--text-dim); font-size: 0.8rem; margin-left: 0.3rem; }}
|
|
498
|
+
.panel {{ display: none; }}
|
|
499
|
+
.panel.active {{ display: block; }}
|
|
500
|
+
h2 {{ font-size: 1.15rem; margin: 0 0 0.8rem 0; }}
|
|
501
|
+
.panel-desc {{ color: var(--text-dim); font-size: 0.9rem; margin-bottom: 1rem; }}
|
|
502
|
+
.runs-table {{ width: 100%; border-collapse: collapse; font-size: 0.85rem; }}
|
|
503
|
+
.runs-table th, .runs-table td {{
|
|
504
|
+
padding: 0.55rem 0.7rem; text-align: left; border-bottom: 1px solid var(--border);
|
|
505
|
+
}}
|
|
506
|
+
.runs-table th {{ color: var(--text-dim); font-weight: 500; background: var(--surface); }}
|
|
507
|
+
.runs-table tbody tr:hover {{ background: var(--surface); }}
|
|
508
|
+
.muted {{ color: var(--text-dim); }}
|
|
509
|
+
code {{ background: var(--surface2); padding: 0.1rem 0.35rem; border-radius: 3px; font-size: 0.85em; }}
|
|
510
|
+
footer {{ margin-top: 3rem; padding-top: 1rem; border-top: 1px solid var(--border); color: var(--text-dim); font-size: 0.8rem; }}
|
|
511
|
+
</style>
|
|
512
|
+
</head>
|
|
513
|
+
<body>
|
|
514
|
+
<div class="container">
|
|
515
|
+
<header>
|
|
516
|
+
<div>
|
|
517
|
+
<h1>C3 Benchmark Dashboard</h1>
|
|
518
|
+
<div class="subtitle">{project_name} · aggregated results across all benchmark tiers</div>
|
|
519
|
+
</div>
|
|
520
|
+
<div class="meta">Generated {generated_at}</div>
|
|
521
|
+
</header>
|
|
522
|
+
|
|
523
|
+
<section class="cards">
|
|
524
|
+
{cards_html}
|
|
525
|
+
</section>
|
|
526
|
+
|
|
527
|
+
<nav class="tabs" role="tablist">
|
|
528
|
+
<button class="tab active" data-tab="quick" role="tab">Quick<span class="count">{counts['quick']}</span></button>
|
|
529
|
+
<button class="tab" data-tab="session" role="tab">Session<span class="count">{counts['session']}</span></button>
|
|
530
|
+
<button class="tab" data-tab="e2e" role="tab">E2E<span class="count">{counts['e2e']}</span></button>
|
|
531
|
+
<button class="tab" data-tab="delegate" role="tab">Delegate<span class="count">{counts['delegate']}</span></button>
|
|
532
|
+
<button class="tab" data-tab="external" role="tab">External<span class="count">{counts['external']}</span></button>
|
|
533
|
+
<button class="tab" data-tab="about" role="tab">About</button>
|
|
534
|
+
</nav>
|
|
535
|
+
|
|
536
|
+
<section class="panel active" data-panel="quick">
|
|
537
|
+
<h2>Quick Benchmark — local synthetic <span class="tier-badge" style="background:#818cf8">Synthetic</span></h2>
|
|
538
|
+
<p class="panel-desc">Measures C3's local compression, retrieval, and file-map savings on sampled project files. No AI calls. Run with <code>c3 bench quick</code>.</p>
|
|
539
|
+
{quick_table}
|
|
540
|
+
</section>
|
|
541
|
+
|
|
542
|
+
<section class="panel" data-panel="session">
|
|
543
|
+
<h2>Session Benchmark — 6 workflow scenarios <span class="tier-badge" style="background:#818cf8">Synthetic</span></h2>
|
|
544
|
+
<p class="panel-desc">Simulates bug investigation, feature exploration, code review, log triage, refactor planning, and onboarding workflows. Baseline is synthesised heuristically. Run with <code>c3 bench session</code>.</p>
|
|
545
|
+
{session_table}
|
|
546
|
+
</section>
|
|
547
|
+
|
|
548
|
+
<section class="panel" data-panel="e2e">
|
|
549
|
+
<h2>End-to-End Benchmark <span class="tier-badge" style="background:#34d399">Live AI</span></h2>
|
|
550
|
+
<p class="panel-desc">Runs real claude / gemini / codex CLI calls against the same tasks, with and without the C3 MCP server. Most credible tier. Run with <code>c3 bench e2e</code>.</p>
|
|
551
|
+
{e2e_table}
|
|
552
|
+
</section>
|
|
553
|
+
|
|
554
|
+
<section class="panel" data-panel="delegate">
|
|
555
|
+
<h2>Delegate Benchmark — Ollama vs Codex <span class="tier-badge" style="background:#34d399">Live AI</span></h2>
|
|
556
|
+
<p class="panel-desc">Compares local Ollama against OpenAI Codex as delegate backends across the same task set. Run with <code>c3 bench delegate</code>.</p>
|
|
557
|
+
{delegate_detail}
|
|
558
|
+
</section>
|
|
559
|
+
|
|
560
|
+
<section class="panel" data-panel="external">
|
|
561
|
+
<h2>External Benchmarks — Aider Polyglot / SWE-bench <span class="tier-badge" style="background:#fbbf24">External</span></h2>
|
|
562
|
+
<p class="panel-desc">Third-party benchmark suites for credibility outside the C3 repo. Requires cloning the benchmark corpus and installing the CLI (e.g. aider-chat). Run with <code>c3 bench external --suite aider-polyglot</code>.</p>
|
|
563
|
+
{external_table}
|
|
564
|
+
</section>
|
|
565
|
+
|
|
566
|
+
<section class="panel" data-panel="about">
|
|
567
|
+
<h2>About the tiers</h2>
|
|
568
|
+
<p class="panel-desc">Five complementary benchmark tiers. Prefer E2E numbers when citing to external audiences; Quick and Session are useful for CI + development iteration.</p>
|
|
569
|
+
<table class="runs-table">
|
|
570
|
+
<thead><tr><th>Tier</th><th>Label</th><th>Cost</th><th>Runtime</th><th>What it measures</th></tr></thead>
|
|
571
|
+
<tbody>
|
|
572
|
+
<tr><td><strong>Quick</strong></td><td><span class="tier-badge" style="background:#818cf8">Synthetic</span></td><td>Free (local)</td><td>~30s</td><td>Compression + retrieval + file-map token savings on sample files</td></tr>
|
|
573
|
+
<tr><td><strong>Session</strong></td><td><span class="tier-badge" style="background:#818cf8">Synthetic</span></td><td>Free (local)</td><td>~2min</td><td>Simulated multi-step workflows; budget multiplier + session longevity</td></tr>
|
|
574
|
+
<tr><td><strong>E2E</strong></td><td><span class="tier-badge" style="background:#34d399">Live AI</span></td><td>Per-token</td><td>10–60min</td><td>Actual AI CLI calls; win rate, tool use, cost, judged quality</td></tr>
|
|
575
|
+
<tr><td><strong>Delegate</strong></td><td><span class="tier-badge" style="background:#34d399">Live AI</span></td><td>Mostly free</td><td>~5min</td><td>Ollama vs Codex delegate backend quality / latency</td></tr>
|
|
576
|
+
<tr><td><strong>External</strong></td><td><span class="tier-badge" style="background:#fbbf24">External</span></td><td>Per-token</td><td>Hours</td><td>Aider Polyglot (live); SWE-bench Lite (planned). Third-party credibility anchor.</td></tr>
|
|
577
|
+
</tbody>
|
|
578
|
+
</table>
|
|
579
|
+
</section>
|
|
580
|
+
|
|
581
|
+
<footer>
|
|
582
|
+
Regenerate: <code>c3 bench dashboard</code> · Full run: <code>c3 bench all</code>
|
|
583
|
+
</footer>
|
|
584
|
+
</div>
|
|
585
|
+
<script>
|
|
586
|
+
document.querySelectorAll('.tab').forEach(btn => {{
|
|
587
|
+
btn.addEventListener('click', () => {{
|
|
588
|
+
const tab = btn.dataset.tab;
|
|
589
|
+
document.querySelectorAll('.tab').forEach(t => t.classList.toggle('active', t === btn));
|
|
590
|
+
document.querySelectorAll('.panel').forEach(p => p.classList.toggle('active', p.dataset.panel === tab));
|
|
591
|
+
}});
|
|
592
|
+
}});
|
|
593
|
+
</script>
|
|
594
|
+
</body>
|
|
595
|
+
</html>
|
|
596
|
+
"""
|