code-context-control 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/_hook_utils.py +99 -0
- cli/c3.py +6152 -0
- cli/commands/__init__.py +1 -0
- cli/commands/common.py +312 -0
- cli/commands/parser.py +286 -0
- cli/docs.html +3178 -0
- cli/edits.html +878 -0
- cli/hook_auto_snapshot.py +142 -0
- cli/hook_c3_signal.py +61 -0
- cli/hook_c3read.py +116 -0
- cli/hook_edit_ledger.py +213 -0
- cli/hook_edit_unlock.py +170 -0
- cli/hook_filter.py +130 -0
- cli/hook_ghost_files.py +238 -0
- cli/hook_pretool_enforce.py +334 -0
- cli/hook_read.py +200 -0
- cli/hook_session_stats.py +62 -0
- cli/hook_terse_advisor.py +190 -0
- cli/hub.html +3764 -0
- cli/hub_server.py +1619 -0
- cli/mcp_proxy.py +428 -0
- cli/mcp_server.py +660 -0
- cli/server.py +2985 -0
- cli/tools/__init__.py +4 -0
- cli/tools/_helpers.py +65 -0
- cli/tools/agent.py +1165 -0
- cli/tools/compress.py +215 -0
- cli/tools/delegate.py +1184 -0
- cli/tools/edit.py +313 -0
- cli/tools/edits.py +118 -0
- cli/tools/filter.py +285 -0
- cli/tools/impact.py +163 -0
- cli/tools/memory.py +469 -0
- cli/tools/read.py +224 -0
- cli/tools/search.py +337 -0
- cli/tools/session.py +95 -0
- cli/tools/shell.py +193 -0
- cli/tools/status.py +306 -0
- cli/tools/validate.py +310 -0
- cli/ui/api.js +36 -0
- cli/ui/app.js +207 -0
- cli/ui/components/chat.js +758 -0
- cli/ui/components/dashboard.js +689 -0
- cli/ui/components/edits.js +220 -0
- cli/ui/components/instructions.js +481 -0
- cli/ui/components/memory.js +626 -0
- cli/ui/components/sessions.js +606 -0
- cli/ui/components/settings.js +1404 -0
- cli/ui/components/sidebar.js +156 -0
- cli/ui/icons.js +51 -0
- cli/ui/shared.js +119 -0
- cli/ui/theme.js +22 -0
- cli/ui.html +168 -0
- cli/ui_legacy.html +6797 -0
- cli/ui_nano.html +503 -0
- code_context_control-2.28.0.dist-info/METADATA +248 -0
- code_context_control-2.28.0.dist-info/RECORD +150 -0
- code_context_control-2.28.0.dist-info/WHEEL +5 -0
- code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
- code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
- code_context_control-2.28.0.dist-info/top_level.txt +5 -0
- core/__init__.py +75 -0
- core/config.py +269 -0
- core/ide.py +188 -0
- oracle/__init__.py +1 -0
- oracle/config.py +75 -0
- oracle/oracle.html +3900 -0
- oracle/oracle_server.py +663 -0
- oracle/services/__init__.py +1 -0
- oracle/services/c3_bridge.py +210 -0
- oracle/services/chat_engine.py +1103 -0
- oracle/services/chat_store.py +155 -0
- oracle/services/cross_memory.py +154 -0
- oracle/services/federated_graph.py +463 -0
- oracle/services/health_checker.py +117 -0
- oracle/services/insight_engine.py +307 -0
- oracle/services/memory_reader.py +106 -0
- oracle/services/memory_writer.py +182 -0
- oracle/services/ollama_bridge.py +332 -0
- oracle/services/project_scanner.py +87 -0
- oracle/services/review_agent.py +206 -0
- services/__init__.py +1 -0
- services/activity_log.py +93 -0
- services/agent_base.py +124 -0
- services/agents.py +1529 -0
- services/auto_memory.py +407 -0
- services/bench/__init__.py +6 -0
- services/bench/external/__init__.py +29 -0
- services/bench/external/aider_polyglot.py +405 -0
- services/bench/external/swe_bench.py +485 -0
- services/benchmark_dashboard.py +596 -0
- services/claude_md.py +785 -0
- services/compressor.py +592 -0
- services/context_snapshot.py +356 -0
- services/conversation_store.py +870 -0
- services/doc_index.py +537 -0
- services/e2e_benchmark.py +2884 -0
- services/e2e_evaluator.py +396 -0
- services/e2e_tasks.py +743 -0
- services/edit_ledger.py +459 -0
- services/embedding_index.py +341 -0
- services/error_reporting.py +123 -0
- services/file_memory.py +734 -0
- services/hub_service.py +585 -0
- services/indexer.py +712 -0
- services/memory.py +318 -0
- services/memory_consolidator.py +538 -0
- services/memory_graph.py +382 -0
- services/memory_grounder.py +304 -0
- services/memory_scorer.py +246 -0
- services/metrics.py +86 -0
- services/notifications.py +209 -0
- services/ollama_client.py +201 -0
- services/output_filter.py +488 -0
- services/parser.py +1238 -0
- services/project_manager.py +579 -0
- services/protocol.py +306 -0
- services/proxy_state.py +152 -0
- services/retrieval_broker.py +129 -0
- services/router.py +414 -0
- services/runtime.py +326 -0
- services/session_benchmark.py +1945 -0
- services/session_manager.py +1026 -0
- services/session_preloader.py +251 -0
- services/text_index.py +90 -0
- services/tool_classifier.py +176 -0
- services/transcript_index.py +340 -0
- services/validation_cache.py +155 -0
- services/vector_store.py +299 -0
- services/version_tracker.py +271 -0
- services/watcher.py +192 -0
- tui/__init__.py +0 -0
- tui/backend.py +59 -0
- tui/main.py +145 -0
- tui/screens/__init__.py +1 -0
- tui/screens/benchmark_view.py +109 -0
- tui/screens/claudemd_view.py +46 -0
- tui/screens/compress_view.py +52 -0
- tui/screens/index_view.py +74 -0
- tui/screens/init_view.py +82 -0
- tui/screens/mcp_view.py +73 -0
- tui/screens/optimize_view.py +41 -0
- tui/screens/pipe_view.py +46 -0
- tui/screens/projects_view.py +355 -0
- tui/screens/search_view.py +55 -0
- tui/screens/session_view.py +143 -0
- tui/screens/stats.py +158 -0
- tui/screens/ui_view.py +54 -0
- tui/theme.tcss +335 -0
|
@@ -0,0 +1,2884 @@
|
|
|
1
|
+
"""
|
|
2
|
+
E2E Benchmark Engine — runs real AI sessions comparing C3-augmented vs baseline workflows.
|
|
3
|
+
|
|
4
|
+
Mode 2: Full agent with tool access.
|
|
5
|
+
- C3 run: MCP tools available (.mcp.json present)
|
|
6
|
+
- Baseline run: No C3 MCP (strict config override)
|
|
7
|
+
- Both can use native CLI tools (Read, Grep, Bash, etc.)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import shutil
|
|
17
|
+
import subprocess
|
|
18
|
+
import sys
|
|
19
|
+
import time
|
|
20
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
from core import count_tokens
|
|
25
|
+
from services.e2e_evaluator import EvalScore, Evaluator
|
|
26
|
+
from services.e2e_tasks import DIFFICULTY_WEIGHTS, E2ETask, build_prompt
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _unicode_safe() -> bool:
|
|
30
|
+
"""True if the terminal can render Unicode box-drawing/check characters."""
|
|
31
|
+
try:
|
|
32
|
+
enc = getattr(sys.stdout, "encoding", "") or ""
|
|
33
|
+
"─✓✗".encode(enc)
|
|
34
|
+
return True
|
|
35
|
+
except (UnicodeEncodeError, LookupError, AttributeError):
|
|
36
|
+
return False
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
_UNI = _unicode_safe()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _fmt_duration(seconds: float) -> str:
|
|
43
|
+
"""Format seconds as '1m23s' or '45s'."""
|
|
44
|
+
s = int(seconds)
|
|
45
|
+
return f"{s // 60}m{s % 60:02d}s" if s >= 60 else f"{s}s"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# CLI Provider
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
@dataclass
class ToolUsage:
    """Tally of the tools invoked during one CLI run."""

    # tool_name -> call_count
    tool_counts: dict[str, int] = field(default_factory=dict)
    # category -> [tool_names]
    tool_categories: dict[str, list[str]] = field(default_factory=dict)
    total_tool_calls: int = 0
    unique_tools: int = 0
    # C3-specific vs. native tool call counts
    c3_tool_calls: int = 0
    native_tool_calls: int = 0

    def to_dict(self) -> dict:
        """Return every counter as a plain dict keyed by field name.

        The two dict-valued fields are returned by reference, not copied.
        """
        exported = (
            "tool_counts",
            "tool_categories",
            "total_tool_calls",
            "unique_tools",
            "c3_tool_calls",
            "native_tool_calls",
        )
        return {key: getattr(self, key) for key in exported}
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# NOTE: the C3 MCP tool names used to separate C3 tools from native tools are defined in _C3_TOOLS, below the pricing helpers.
|
|
75
|
+
# Anthropic API pricing ($ per million tokens) — used for cost consistency checks.
|
|
76
|
+
# Keys are model ID substrings; first match wins.
|
|
77
|
+
_MODEL_PRICING = {
|
|
78
|
+
"claude-opus-4-6": {"input": 15.0, "output": 75.0, "cache_write": 18.75, "cache_read": 1.50},
|
|
79
|
+
"claude-opus-4": {"input": 15.0, "output": 75.0, "cache_write": 18.75, "cache_read": 1.50},
|
|
80
|
+
"claude-opus-3-5": {"input": 15.0, "output": 75.0, "cache_write": 18.75, "cache_read": 1.50},
|
|
81
|
+
"claude-sonnet-4-6": {"input": 3.0, "output": 15.0, "cache_write": 3.75, "cache_read": 0.30},
|
|
82
|
+
"claude-sonnet-4": {"input": 3.0, "output": 15.0, "cache_write": 3.75, "cache_read": 0.30},
|
|
83
|
+
"claude-sonnet-3-5": {"input": 3.0, "output": 15.0, "cache_write": 3.75, "cache_read": 0.30},
|
|
84
|
+
"claude-haiku-4-5": {"input": 0.80, "output": 4.0, "cache_write": 1.0, "cache_read": 0.08},
|
|
85
|
+
"claude-haiku-4": {"input": 0.80, "output": 4.0, "cache_write": 1.0, "cache_read": 0.08},
|
|
86
|
+
"claude-haiku-3-5": {"input": 0.80, "output": 4.0, "cache_write": 1.0, "cache_read": 0.08},
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
def _get_pricing(model_id: str) -> dict | None:
|
|
90
|
+
mid = (model_id or "").lower()
|
|
91
|
+
for key, rates in _MODEL_PRICING.items():
|
|
92
|
+
if key in mid:
|
|
93
|
+
return rates
|
|
94
|
+
return None
|
|
95
|
+
|
|
96
|
+
def _compute_expected_cost(inp: int, out: int, cw: int, cr: int, model_id: str) -> float | None:
|
|
97
|
+
"""Re-derive cost from token breakdown. Returns None if model pricing is unknown."""
|
|
98
|
+
rates = _get_pricing(model_id)
|
|
99
|
+
if not rates:
|
|
100
|
+
return None
|
|
101
|
+
return (inp * rates["input"] + out * rates["output"] +
|
|
102
|
+
cw * rates["cache_write"] + cr * rates["cache_read"]) / 1_000_000
|
|
103
|
+
|
|
104
|
+
_C3_TOOLS = {"c3_compress", "c3_read", "c3_search", "c3_filter", "c3_validate",
|
|
105
|
+
"c3_memory", "c3_session", "c3_status", "c3_delegate", "c3_shell",
|
|
106
|
+
"c3_impact", "c3_edit", "c3_edits", "c3_agent"}
|
|
107
|
+
|
|
108
|
+
# Native Claude Code tools
|
|
109
|
+
_NATIVE_TOOLS = {"Read", "Write", "Edit", "Bash", "Glob", "Grep", "Agent",
|
|
110
|
+
"WebSearch", "WebFetch", "NotebookEdit", "TodoRead", "TodoWrite"}
|
|
111
|
+
|
|
112
|
+
# Tool category mapping
|
|
113
|
+
_TOOL_CATEGORIES = {
|
|
114
|
+
"c3_mcp": list(_C3_TOOLS),
|
|
115
|
+
"file_ops": ["Read", "Write", "Edit", "Glob", "c3_read", "c3_compress"],
|
|
116
|
+
"search": ["Grep", "Glob", "c3_search", "WebSearch"],
|
|
117
|
+
"execution": ["Bash", "Agent", "c3_delegate", "c3_shell"],
|
|
118
|
+
"analysis": ["c3_validate", "c3_filter", "c3_status"],
|
|
119
|
+
"context": ["c3_memory", "c3_session"],
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@dataclass
class CLIResponse:
    """Everything captured from one CLI invocation."""

    text: str = ""
    latency_ms: float = 0.0
    exit_code: int = -1
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    num_turns: int = 0
    model_used: str = ""
    raw_stdout: str = ""
    raw_stderr: str = ""
    error: str = ""
    # Rich timing from Claude JSON
    duration_ms: float = 0.0
    duration_api_ms: float = 0.0
    # Cache economics
    cache_creation_tokens: int = 0
    cache_read_tokens: int = 0
    # Model metadata
    model_id: str = ""
    context_window: int = 0
    # Full response text for side-by-side comparison
    response_text: str = ""
    # Tool usage analysis
    tool_usage: ToolUsage = field(default_factory=ToolUsage)
    # Token count reliability: "modelUsage", "usage", or "partial" (cost/token mismatch)
    token_count_source: str = ""
    # Estimated peak context window fill % (proxy via cache_read growth across turns)
    context_pressure_pct: float = 0.0

    def to_dict(self) -> dict:
        """Return a summary dict of this response.

        Raw stdout/stderr and the full ``text`` are left out (only the length
        of ``text`` is reported), ``response_text`` is cut to 800 characters
        plus an ellipsis when longer, and two derived values are added:
        ``total_tokens`` (sum of the four token counters) and
        ``computed_cost_usd`` (cost re-derived from the token counters, or
        ``None`` when the model's pricing is unknown).
        """
        token_sum = (
            self.input_tokens
            + self.output_tokens
            + self.cache_creation_tokens
            + self.cache_read_tokens
        )

        expected_cost = _compute_expected_cost(
            self.input_tokens,
            self.output_tokens,
            self.cache_creation_tokens,
            self.cache_read_tokens,
            self.model_id or self.model_used,
        )
        if expected_cost is not None:
            expected_cost = round(expected_cost, 6)

        preview = self.response_text
        if len(preview) > 800:
            preview = preview[:800] + "…"

        return {
            "text_length": len(self.text),
            "response_text": preview,
            "latency_ms": round(self.latency_ms, 1),
            "duration_ms": round(self.duration_ms, 1),
            "duration_api_ms": round(self.duration_api_ms, 1),
            "exit_code": self.exit_code,
            "input_tokens": self.input_tokens,
            "output_tokens": self.output_tokens,
            "cache_creation_tokens": self.cache_creation_tokens,
            "cache_read_tokens": self.cache_read_tokens,
            "total_tokens": token_sum,
            "cost_usd": round(self.cost_usd, 6),
            "computed_cost_usd": expected_cost,
            "token_count_source": self.token_count_source,
            "context_pressure_pct": round(self.context_pressure_pct, 1),
            "num_turns": self.num_turns,
            "model_used": self.model_used,
            "model_id": self.model_id,
            "context_window": self.context_window,
            "error": self.error,
            "tool_usage": self.tool_usage.to_dict(),
        }
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
@dataclass
|
|
189
|
+
class CLIProvider:
|
|
190
|
+
"""Wraps an AI CLI for non-interactive prompt execution."""
|
|
191
|
+
name: str
|
|
192
|
+
executable: str = ""
|
|
193
|
+
model: str | None = None
|
|
194
|
+
available: bool = False
|
|
195
|
+
permission_mode: str = "bypassPermissions"
|
|
196
|
+
|
|
197
|
+
def detect(self) -> bool:
|
|
198
|
+
"""Check if CLI is installed and accessible."""
|
|
199
|
+
exe = self.executable or self.name
|
|
200
|
+
# shutil.which resolves .cmd/.bat wrappers on Windows (e.g. gemini.CMD, codex.CMD)
|
|
201
|
+
resolved = shutil.which(exe)
|
|
202
|
+
if not resolved:
|
|
203
|
+
self.available = False
|
|
204
|
+
return False
|
|
205
|
+
try:
|
|
206
|
+
result = subprocess.run(
|
|
207
|
+
[resolved, "--version"], capture_output=True, text=True, timeout=10,
|
|
208
|
+
creationflags=subprocess.CREATE_NO_WINDOW if hasattr(subprocess, "CREATE_NO_WINDOW") else 0,
|
|
209
|
+
)
|
|
210
|
+
self.available = result.returncode == 0
|
|
211
|
+
if self.available:
|
|
212
|
+
self.executable = resolved
|
|
213
|
+
except Exception:
|
|
214
|
+
self.available = False
|
|
215
|
+
return self.available
|
|
216
|
+
|
|
217
|
+
def run(self, prompt: str, cwd: str, with_c3: bool = True,
        timeout: int = 180, multi_turn: bool = False) -> CLIResponse:
    """Run *prompt* through the provider CLI and return the structured result.

    Args:
        prompt: text passed to the CLI.
        cwd: working directory for the child process.
        with_c3: forwarded to ``_build_command`` and ``_extract_tool_usage``.
        timeout: seconds allowed for the child process.
        multi_turn: when set together with ``with_c3`` on the "claude"
            provider, delegate to ``_run_multi_turn`` instead.

    Returns:
        A ``CLIResponse``; on timeout or any other exception its ``error``
        field is set rather than the exception being raised.
    """
    if multi_turn and with_c3 and self.name == "claude":
        return self._run_multi_turn(prompt, cwd, timeout)

    outcome = CLIResponse()
    argv = self._build_command(prompt, with_c3)

    blocked = ("CLAUDECODE", "CLAUDE_CODE", "CLAUDE_CODE_ENTRYPOINT",
               "GEMINI_CLI", "CODEX_CLI")
    child_env = {key: value for key, value in os.environ.items() if key not in blocked}
    # Prevent C3 MCP server subprocesses from auto-restoring snapshots between tasks,
    # which would carry over accumulated budget from the previous task's session end.
    child_env["C3_BENCHMARK_MODE"] = "1"

    no_window = getattr(subprocess, "CREATE_NO_WINDOW", 0)
    started = time.perf_counter()
    try:
        proc = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=cwd,
            env=child_env,
            encoding="utf-8",
            errors="replace",
            stdin=subprocess.DEVNULL,
            creationflags=no_window,
        )
        outcome.latency_ms = (time.perf_counter() - started) * 1000
        outcome.exit_code = proc.returncode
        outcome.raw_stdout = proc.stdout or ""
        outcome.raw_stderr = proc.stderr or ""
        # Parsing stays inside the try so a parser failure is reported via
        # outcome.error instead of propagating.
        self._parse_output(outcome)
    except subprocess.TimeoutExpired:
        outcome.latency_ms = timeout * 1000
        outcome.error = f"Timeout after {timeout}s"
    except Exception as exc:
        outcome.latency_ms = (time.perf_counter() - started) * 1000
        outcome.error = str(exc)

    # Store full response for comparison view
    outcome.response_text = outcome.text

    # Extract tool usage
    outcome.tool_usage = self._extract_tool_usage(outcome, with_c3=with_c3)
    return outcome
|
|
263
|
+
|
|
264
|
+
def _build_command(self, prompt: str, with_c3: bool) -> list[str]:
|
|
265
|
+
"""Build CLI command for non-interactive execution."""
|
|
266
|
+
exe = self.executable or self.name
|
|
267
|
+
|
|
268
|
+
if self.name == "claude":
|
|
269
|
+
cmd = [exe, "-p", prompt, "--output-format", "json",
|
|
270
|
+
"--permission-mode", self.permission_mode]
|
|
271
|
+
if self.model:
|
|
272
|
+
cmd += ["--model", self.model]
|
|
273
|
+
if not with_c3:
|
|
274
|
+
cmd += ["--strict-mcp-config", "--mcp-config", '{"mcpServers":{}}']
|
|
275
|
+
return cmd
|
|
276
|
+
|
|
277
|
+
elif self.name == "gemini":
|
|
278
|
+
# --approval-mode yolo: auto-approve all tool calls (required for
|
|
279
|
+
# non-interactive benchmark runs; "plan" falls back to "default"
|
|
280
|
+
# which prompts interactively and causes timeout).
|
|
281
|
+
cmd = [exe, "-p", prompt, "--output-format", "json",
|
|
282
|
+
"--approval-mode", "yolo"]
|
|
283
|
+
if self.model:
|
|
284
|
+
cmd += ["-m", self.model]
|
|
285
|
+
if not with_c3:
|
|
286
|
+
# Pass a dummy server name to bypass all configured MCP servers.
|
|
287
|
+
cmd += ["--allowed-mcp-server-names", "__none__"]
|
|
288
|
+
return cmd
|
|
289
|
+
|
|
290
|
+
elif self.name == "codex":
|
|
291
|
+
cmd = [exe, "exec", prompt, "--json"]
|
|
292
|
+
if self.model:
|
|
293
|
+
cmd += ["--model", self.model]
|
|
294
|
+
if not with_c3:
|
|
295
|
+
cmd += ["-c", "mcp_servers={}"]
|
|
296
|
+
return cmd
|
|
297
|
+
|
|
298
|
+
raise ValueError(f"Unknown provider: {self.name}")
|
|
299
|
+
|
|
300
|
+
def _run_multi_turn(self, prompt: str, cwd: str, timeout: int) -> CLIResponse:
    """Two-prompt flow: explore first, then answer with --resume.

    Turn 1 asks the CLI to gather context with C3 tools; turn 2 asks for the
    answer in the same session. Turn 2's stdout is parsed as the response,
    stderr from both turns is concatenated, and turn 1's token counts are
    added on top of the parsed turn-2 counts.

    Args:
        prompt: the question to answer.
        cwd: working directory for both child processes.
        timeout: seconds allowed for EACH turn.

    Returns:
        A ``CLIResponse``; on timeout or any other exception its ``error``
        field is set rather than the exception being raised.
    """
    response = CLIResponse()
    exe = self.executable or self.name

    env = os.environ.copy()
    for block_var in ("CLAUDECODE", "CLAUDE_CODE", "CLAUDE_CODE_ENTRYPOINT",
                      "GEMINI_CLI", "CODEX_CLI"):
        env.pop(block_var, None)
    env["C3_BENCHMARK_MODE"] = "1"
    _cflags = subprocess.CREATE_NO_WINDOW if hasattr(subprocess, "CREATE_NO_WINDOW") else 0

    explore_prompt = (
        f"Using C3 MCP tools, explore the codebase to understand the following. "
        f"Do NOT answer yet — just gather context using c3_memory, c3_search, "
        f"c3_compress, and c3_read.\n\nQuestion: {prompt}"
    )
    answer_prompt = (
        f"Based on your exploration, now answer the question. "
        f"Be specific with file paths, function names, and line numbers. "
        f"Keep your answer concise (under 500 words).\n\nQuestion: {prompt}"
    )

    t0 = time.perf_counter()

    try:
        # Turn 1: Explore
        cmd1 = [exe, "-p", explore_prompt, "--output-format", "json",
                "--permission-mode", self.permission_mode]
        if self.model:
            cmd1 += ["--model", self.model]

        result1 = subprocess.run(
            cmd1, capture_output=True, text=True, timeout=timeout, cwd=cwd,
            env=env, encoding="utf-8", errors="replace",
            stdin=subprocess.DEVNULL, creationflags=_cflags,
        )

        # Parse turn 1 once; anything that is not a JSON object is treated as
        # "no metadata" so it cannot abort the run.
        data1: dict = {}
        try:
            parsed1 = json.loads(result1.stdout or "{}")
            if isinstance(parsed1, dict):
                data1 = parsed1
        except (json.JSONDecodeError, TypeError):
            pass

        # Extract session ID from turn 1 for safe --resume
        # (--continue would race under concurrent task_workers)
        session_id = data1.get("session_id") or data1.get("sessionId")

        # Turn 2: Answer — use --resume if we got a session ID, else --continue
        cmd2 = [exe, "-p", answer_prompt, "--output-format", "json",
                "--permission-mode", self.permission_mode]
        if session_id:
            cmd2 += ["--resume", session_id]
        else:
            cmd2 += ["--continue"]
        if self.model:
            cmd2 += ["--model", self.model]

        result2 = subprocess.run(
            cmd2, capture_output=True, text=True, timeout=timeout, cwd=cwd,
            env=env, encoding="utf-8", errors="replace",
            stdin=subprocess.DEVNULL, creationflags=_cflags,
        )

        response.latency_ms = (time.perf_counter() - t0) * 1000
        response.exit_code = result2.returncode
        # Use turn 2 output as the main response (it has the answer)
        response.raw_stdout = result2.stdout or ""
        # Combine stderr from both turns for tool detection
        response.raw_stderr = (result1.stderr or "") + "\n" + (result2.stderr or "")
        self._parse_output(response)

        # Merge token counts from turn 1 if available. The usage block is
        # looked up step by step: a nested .get() passed as the default of
        # dict.get() is evaluated even when "usage" exists, and "result" is
        # normally a string, so the old one-liner always raised and the merge
        # was silently skipped.
        usage1 = data1.get("usage")
        if not isinstance(usage1, dict):
            nested = data1.get("result")
            usage1 = nested.get("usage") if isinstance(nested, dict) else None
        if isinstance(usage1, dict):
            response.input_tokens += usage1.get("input_tokens", 0) or 0
            response.output_tokens += usage1.get("output_tokens", 0) or 0
            response.cache_creation_tokens += usage1.get("cache_creation_input_tokens", 0) or 0
            response.cache_read_tokens += usage1.get("cache_read_input_tokens", 0) or 0
        # NOTE(review): cost_usd, num_turns and token_count_source still come
        # from turn 2 only — confirm whether turn 1 should be folded in too.

    except subprocess.TimeoutExpired:
        # Record the time actually spent: either turn may be the one that
        # timed out, so a fixed 2 * timeout would overstate a turn-1 timeout.
        response.latency_ms = (time.perf_counter() - t0) * 1000
        response.error = f"Multi-turn timeout after {timeout}s"
    except Exception as e:
        response.latency_ms = (time.perf_counter() - t0) * 1000
        response.error = str(e)

    response.response_text = response.text
    response.tool_usage = self._extract_tool_usage(response, with_c3=True)
    return response
|
|
393
|
+
|
|
394
|
+
def _parse_output(self, response: CLIResponse):
|
|
395
|
+
"""Parse raw output into structured response fields."""
|
|
396
|
+
stdout = response.raw_stdout.strip()
|
|
397
|
+
|
|
398
|
+
if self.name == "claude":
|
|
399
|
+
self._parse_claude_json(response, stdout)
|
|
400
|
+
elif self.name == "gemini":
|
|
401
|
+
self._parse_gemini_output(response, stdout)
|
|
402
|
+
elif self.name == "codex":
|
|
403
|
+
self._parse_codex_output(response, stdout)
|
|
404
|
+
else:
|
|
405
|
+
response.text = stdout
|
|
406
|
+
|
|
407
|
+
def _parse_claude_json(self, response: CLIResponse, stdout: str):
|
|
408
|
+
"""Parse Claude's --output-format json — extract all available fields."""
|
|
409
|
+
try:
|
|
410
|
+
data = json.loads(stdout)
|
|
411
|
+
response.text = data.get("result", stdout)
|
|
412
|
+
|
|
413
|
+
# Plan mode puts the actual answer in messages[], not in result.
|
|
414
|
+
# If result is very short, scan for the last substantial assistant text.
|
|
415
|
+
if len(response.text.strip()) < 300:
|
|
416
|
+
for msg in reversed(data.get("messages", [])):
|
|
417
|
+
if not isinstance(msg, dict) or msg.get("role") != "assistant":
|
|
418
|
+
continue
|
|
419
|
+
content = msg.get("content", "")
|
|
420
|
+
if isinstance(content, str):
|
|
421
|
+
candidate = content.strip()
|
|
422
|
+
elif isinstance(content, list):
|
|
423
|
+
candidate = "\n".join(
|
|
424
|
+
b.get("text", "") for b in content
|
|
425
|
+
if isinstance(b, dict) and b.get("type") == "text"
|
|
426
|
+
).strip()
|
|
427
|
+
else:
|
|
428
|
+
continue
|
|
429
|
+
if len(candidate) > len(response.text.strip()):
|
|
430
|
+
response.text = candidate
|
|
431
|
+
break
|
|
432
|
+
response.cost_usd = data.get("total_cost_usd", 0) or data.get("cost_usd", 0) or 0
|
|
433
|
+
response.num_turns = data.get("num_turns", 0) or 0
|
|
434
|
+
response.duration_ms = data.get("duration_ms", 0) or 0
|
|
435
|
+
response.duration_api_ms = data.get("duration_api_ms", 0) or 0
|
|
436
|
+
|
|
437
|
+
# Usage block (may be last-turn only when modelUsage is absent)
|
|
438
|
+
usage = data.get("usage", {})
|
|
439
|
+
if usage:
|
|
440
|
+
response.input_tokens = usage.get("input_tokens", 0) or 0
|
|
441
|
+
response.output_tokens = usage.get("output_tokens", 0) or 0
|
|
442
|
+
response.cache_creation_tokens = usage.get("cache_creation_input_tokens", 0) or 0
|
|
443
|
+
response.cache_read_tokens = usage.get("cache_read_input_tokens", 0) or 0
|
|
444
|
+
response.token_count_source = "usage"
|
|
445
|
+
|
|
446
|
+
# Model usage block — has model ID, context window, and cumulative token totals.
|
|
447
|
+
# Preferred over `usage` when present because it is always session-cumulative.
|
|
448
|
+
model_usage = data.get("modelUsage", {})
|
|
449
|
+
if model_usage:
|
|
450
|
+
for model_id, model_data in model_usage.items():
|
|
451
|
+
response.model_id = model_id
|
|
452
|
+
response.model_used = model_id
|
|
453
|
+
response.context_window = model_data.get("contextWindow", 0) or 0
|
|
454
|
+
if model_data.get("inputTokens"):
|
|
455
|
+
response.input_tokens = model_data["inputTokens"]
|
|
456
|
+
if model_data.get("outputTokens"):
|
|
457
|
+
response.output_tokens = model_data["outputTokens"]
|
|
458
|
+
if model_data.get("cacheCreationInputTokens"):
|
|
459
|
+
response.cache_creation_tokens = model_data["cacheCreationInputTokens"]
|
|
460
|
+
if model_data.get("cacheReadInputTokens"):
|
|
461
|
+
response.cache_read_tokens = model_data["cacheReadInputTokens"]
|
|
462
|
+
break # Take first model
|
|
463
|
+
response.token_count_source = "modelUsage"
|
|
464
|
+
elif not response.model_used:
|
|
465
|
+
response.model_used = self.model or ""
|
|
466
|
+
|
|
467
|
+
# Consistency check: re-derive cost from token breakdown and compare to
|
|
468
|
+
# total_cost_usd. A delta > $0.01 means the token counts are partial
|
|
469
|
+
# (likely last-turn only from the `usage` block).
|
|
470
|
+
computed = _compute_expected_cost(
|
|
471
|
+
response.input_tokens, response.output_tokens,
|
|
472
|
+
response.cache_creation_tokens, response.cache_read_tokens,
|
|
473
|
+
response.model_id or response.model_used,
|
|
474
|
+
)
|
|
475
|
+
if computed is not None and abs(computed - response.cost_usd) > 0.01:
|
|
476
|
+
response.token_count_source = "partial"
|
|
477
|
+
|
|
478
|
+
# Context pressure: estimate peak context fill % across the session.
|
|
479
|
+
# Approximation: cache_read grows each turn; the last turn reads ~
|
|
480
|
+
# 2*total_cache_read/(num_turns+1) tokens from cache.
|
|
481
|
+
if response.context_window > 0 and response.num_turns > 0:
|
|
482
|
+
peak_ctx = 2 * response.cache_read_tokens / (response.num_turns + 1)
|
|
483
|
+
response.context_pressure_pct = min(peak_ctx / response.context_window * 100, 100.0)
|
|
484
|
+
|
|
485
|
+
except (json.JSONDecodeError, TypeError):
|
|
486
|
+
response.text = stdout
|
|
487
|
+
|
|
488
|
+
def _parse_gemini_output(self, response: CLIResponse, stdout: str):
|
|
489
|
+
"""Parse Gemini CLI output.
|
|
490
|
+
|
|
491
|
+
Gemini CLI may prefix stdout with status lines like
|
|
492
|
+
"Server 'c3' supports tool updates. Listening for changes..."
|
|
493
|
+
before the JSON blob, so we locate the first '{' and parse from there.
|
|
494
|
+
|
|
495
|
+
Token structure (as of gemini-cli 0.32+):
|
|
496
|
+
stats.models.<model_id>.tokens.{input, candidates, cached, total}
|
|
497
|
+
"""
|
|
498
|
+
# Strip any non-JSON prefix lines printed before the JSON object
|
|
499
|
+
json_start = stdout.find("{")
|
|
500
|
+
if json_start > 0:
|
|
501
|
+
stdout = stdout[json_start:]
|
|
502
|
+
|
|
503
|
+
try:
|
|
504
|
+
data = json.loads(stdout)
|
|
505
|
+
except (json.JSONDecodeError, TypeError):
|
|
506
|
+
response.text = stdout
|
|
507
|
+
return
|
|
508
|
+
|
|
509
|
+
if not isinstance(data, dict):
|
|
510
|
+
if isinstance(data, list):
|
|
511
|
+
texts = [msg.get("text", msg.get("content", ""))
|
|
512
|
+
for msg in data if isinstance(msg, dict)]
|
|
513
|
+
response.text = "\n".join(t for t in texts if t)
|
|
514
|
+
else:
|
|
515
|
+
response.text = str(data)
|
|
516
|
+
return
|
|
517
|
+
|
|
518
|
+
response.text = data.get("response", data.get("text", data.get("result", stdout)))
|
|
519
|
+
|
|
520
|
+
# Token & model data live under stats.models.<model_id>.tokens
|
|
521
|
+
stats = data.get("stats", {})
|
|
522
|
+
models = stats.get("models", {})
|
|
523
|
+
if models:
|
|
524
|
+
# Aggregate across all model entries (Gemini may split across models)
|
|
525
|
+
total_input = total_output = total_cached = total_req = 0
|
|
526
|
+
for model_id, mdata in models.items():
|
|
527
|
+
if not response.model_used:
|
|
528
|
+
response.model_used = model_id
|
|
529
|
+
|
|
530
|
+
api_stats = mdata.get("api", {})
|
|
531
|
+
total_req += api_stats.get("totalRequests", 0) or 0
|
|
532
|
+
|
|
533
|
+
tok = mdata.get("tokens", {})
|
|
534
|
+
total_input += tok.get("input", 0) or 0
|
|
535
|
+
# "candidates" = generated/output tokens
|
|
536
|
+
total_output += tok.get("candidates", 0) or 0
|
|
537
|
+
total_cached += tok.get("cached", 0) or 0
|
|
538
|
+
|
|
539
|
+
response.num_turns = total_req
|
|
540
|
+
response.input_tokens = total_input
|
|
541
|
+
response.output_tokens = total_output
|
|
542
|
+
response.cache_read_tokens = total_cached
|
|
543
|
+
|
|
544
|
+
if not response.model_used:
|
|
545
|
+
response.model_used = data.get("model", self.model or "")
|
|
546
|
+
|
|
547
|
+
def _parse_codex_output(self, response: CLIResponse, stdout: str):
    """Parse Codex ``exec --json`` JSONL output into *response*.

    Event schema:
        thread.started  - session opened
        turn.started    - new turn
        error           - transient (reconnect/transport) or fatal
        item.completed  - item.type='agent_message' has the text;
                          item.type='error' is a non-fatal item error
        turn.completed  - final event; .usage has token counts

    A session is considered terminated if no turn.completed is emitted.

    Lines that are not JSON, or that are JSON but not an object, are ignored.
    When no event object is found at all, the raw ``stdout`` is used as the
    response text.

    Args:
        response: Response object updated in place (``text``, ``error``,
            ``model_used`` and the token counters).
        stdout: Raw standard output captured from the Codex CLI.
    """
    response.model_used = self.model or ""

    events = []
    for line in stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            event = json.loads(line)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        # A bare number/string/list is valid JSON but not an event; keeping it
        # would make the ``.get`` calls below raise AttributeError.
        if isinstance(event, dict):
            events.append(event)

    if not events:
        # No JSONL at all - old codex version or plain text output
        response.text = stdout
        return

    transient_markers = ("reconnecting", "falling back", "stream disconnected",
                         "websocket", "https transport")

    def _message_of(container):
        # ``message`` may be missing or null; normalise to a string.
        raw = container.get("message", "")
        if isinstance(raw, str):
            return raw
        return "" if raw is None else str(raw)

    def _is_transient(message):
        lowered = message.lower()
        return any(marker in lowered for marker in transient_markers)

    texts = []
    fatal_errors = []
    turn_completed = False

    for ev in events:
        kind = ev.get("type", "")

        if kind == "error":
            msg = _message_of(ev)
            if not _is_transient(msg):
                fatal_errors.append(msg)

        elif kind == "item.completed":
            item = ev.get("item")
            if not isinstance(item, dict):
                continue
            item_type = item.get("type")
            if item_type == "agent_message":
                text = item.get("text", "")
                if isinstance(text, str):
                    texts.append(text)
            elif item_type == "error":
                msg = _message_of(item)
                if not _is_transient(msg):
                    fatal_errors.append(msg)

        elif kind == "turn.completed":
            turn_completed = True
            usage = ev.get("usage")
            if not isinstance(usage, dict):
                usage = {}
            response.input_tokens = usage.get("input_tokens", 0) or 0
            response.cache_read_tokens = usage.get("cached_input_tokens", 0) or 0
            response.output_tokens = usage.get("output_tokens", 0) or 0

    response.text = "\n".join(t for t in texts if t)

    if not turn_completed:
        # Session was killed before completing - surface a clear error
        termination_msg = "; ".join(fatal_errors) if fatal_errors else "session terminated before turn.completed"
        response.error = f"[codex:terminated] {termination_msg}"
    elif fatal_errors and not response.text:
        response.error = f"[codex:error] {'; '.join(fatal_errors)}"
|
|
615
|
+
|
|
616
|
+
def _extract_tool_usage(self, response: CLIResponse, with_c3: bool = True) -> ToolUsage:
    """Extract tool usage from CLI response using JSON data + text heuristics.

    Sources, in order:
        1.  Claude JSON ``messages[]`` tool_use blocks.
        1c. Claude JSON top-level ``result`` array (only if 1 found nothing).
        1b. Gemini JSON ``stats.tools.byName``.
        2.  Text heuristics on ``response.text`` (supplement JSON counts).
        3.  Tool-call patterns in stderr (only if nothing was found so far).
        4.  Estimate from ``response.num_turns`` as a last resort.

    Malformed or unexpectedly shaped JSON is treated as "no data" rather
    than raising.

    Args:
        response: Provider response carrying ``raw_stdout``, ``raw_stderr``,
            ``text`` and ``num_turns``.
        with_c3: When False, c3_* names are not detected from the text.

    Returns:
        A populated ``ToolUsage``.
    """
    usage = ToolUsage()
    counts: dict[str, int] = {}

    def _bump(name, amount=1):
        # Names come from external JSON; anything that is not a non-empty
        # string is filed under "unknown" so later str methods are safe.
        key = name if isinstance(name, str) and name else "unknown"
        counts[key] = counts.get(key, 0) + amount

    def _as_dict(value):
        return value if isinstance(value, dict) else {}

    def _load_object(raw):
        # A JSON array/scalar at top level is valid JSON but has no ``.get``.
        try:
            return _as_dict(json.loads(raw))
        except (ValueError, TypeError):
            return {}

    def _count_tool_use_blocks(blocks):
        if not isinstance(blocks, list):
            return
        for block in blocks:
            if isinstance(block, dict) and block.get("type") == "tool_use":
                _bump(block.get("name", "unknown"))

    # Source 1 / 1c: Claude JSON - parsed once and used for both shapes
    if self.name == "claude" and response.raw_stdout:
        data = _load_object(response.raw_stdout)

        # Claude may include messages array with tool_use blocks
        messages = data.get("messages", [])
        for msg in messages if isinstance(messages, list) else ():
            if not isinstance(msg, dict):
                continue
            _count_tool_use_blocks(msg.get("content", []))
            # Also check role=tool_use pattern
            if msg.get("role") == "assistant" and msg.get("type") == "tool_use":
                _bump(msg.get("name", "unknown"))

        # Top-level 'result' array, only when messages[] yielded nothing
        if not counts:
            _count_tool_use_blocks(data.get("result", []))

    # Source 1b: Gemini JSON - parse stats.tools if present
    if self.name == "gemini" and response.raw_stdout:
        stdout = response.raw_stdout
        json_start = stdout.find("{")
        if json_start >= 0:
            data = _load_object(stdout[json_start:])
            tools = _as_dict(_as_dict(data.get("stats")).get("tools"))
            by_name = _as_dict(tools.get("byName"))
            for tool_name, tool_data in by_name.items():
                count = _as_dict(tool_data).get("count", 0)
                if not isinstance(count, (int, float)) or count <= 0:
                    continue
                # Normalize tool name (e.g., mcp_c3_c3_search -> c3_search)
                norm_name = tool_name
                if norm_name.startswith("mcp_c3_"):
                    norm_name = norm_name[7:]
                _bump(norm_name, count)

    # Source 2: Heuristic - detect tool patterns from response text.
    # Supplements (not replaces) JSON-based counts: adds tools found in text
    # that weren't captured by JSON parsing (e.g. when messages[] is absent).
    # c3_* patterns are skipped for baseline runs to avoid false positives when
    # the response quotes source files that mention c3_* tool names.
    text = response.text or ""
    heuristic_counts = _detect_tools_from_text(text, include_c3=with_c3)
    if not counts:
        counts = heuristic_counts
    else:
        # Supplement: add heuristic detections for tools not captured by JSON
        for name, count in heuristic_counts.items():
            if name not in counts:
                counts[name] = count

    # Source 3: Parse stderr for tool call patterns
    if not counts and response.raw_stderr:
        import re
        stderr_tools = re.findall(r'(?:Tool|tool_use|Calling)[\s:]+(\w+)', response.raw_stderr)
        for tool_name in stderr_tools:
            if tool_name.startswith("mcp__c3__"):
                tool_name = tool_name[9:]
            elif tool_name.startswith("mcp_c3_"):
                tool_name = tool_name[7:]
            if tool_name and tool_name != "unknown":
                counts[tool_name] = counts.get(tool_name, 0) + 1

    # If we have num_turns but no tool counts, estimate from turns
    if not counts and response.num_turns > 1:
        # Each turn beyond the first likely involved tool use
        # We can't know which tools, but we know there were tool interactions
        counts["_unknown_tools"] = max(0, response.num_turns - 1)

    # Classify tools: anything that is not a C3 tool counts as native
    c3_calls = 0
    native_calls = 0
    for name, count in counts.items():
        if name in _C3_TOOLS or name.startswith("c3_"):
            c3_calls += count
        else:
            native_calls += count

    # Build categories
    categories: dict[str, list[str]] = {}
    for cat_name, cat_tools in _TOOL_CATEGORIES.items():
        matched = [t for t in counts if t in cat_tools]
        if matched:
            categories[cat_name] = matched

    usage.tool_counts = counts
    usage.tool_categories = categories
    usage.total_tool_calls = sum(counts.values())
    usage.unique_tools = len(counts)
    usage.c3_tool_calls = c3_calls
    usage.native_tool_calls = native_calls

    return usage
|
|
735
|
+
|
|
736
|
+
|
|
737
|
+
def _detect_tools_from_text(text: str, include_c3: bool = True) -> dict[str, int]:
    """Count likely tool invocations by matching phrases in *text*.

    The text is lower-cased before matching. Each regex hit adds one to the
    count of the tool it is mapped to; several regexes may feed the same tool.

    Args:
        text: Response text to scan.
        include_c3: When False the ``c3_*`` name patterns are left out (a
            baseline run may merely quote code that mentions those names).

    Returns:
        Mapping of tool name to number of matches; tools with no match are
        absent.
    """
    # C3 MCP tool names, matched as whole words.
    c3_signatures = (
        (r'\bc3_compress\b', "c3_compress"),
        (r'\bc3_read\b', "c3_read"),
        (r'\bc3_search\b', "c3_search"),
        (r'\bc3_filter\b', "c3_filter"),
        (r'\bc3_validate\b', "c3_validate"),
        (r'\bc3_memory\b', "c3_memory"),
        (r'\bc3_session\b', "c3_session"),
        (r'\bc3_status\b', "c3_status"),
    )
    # Phrases that suggest a native tool was used.
    native_signatures = (
        (r'(?:i\'ll |let me |i will )read (?:the |this )?file', "Read"),
        (r'(?:reading|read) `[^`]+`', "Read"),
        (r'(?:i\'ll |let me )search', "Grep"),
        (r'(?:searching|grep|search)(?:ing)? for', "Grep"),
        (r'(?:i\'ll |let me )(?:run|execute)', "Bash"),
        (r'(?:running|ran) (?:the )?command', "Bash"),
        (r'(?:i\'ll |let me )edit', "Edit"),
        (r'(?:i\'ll |let me )write', "Write"),
        (r'(?:looking for|finding|glob) files', "Glob"),
    )

    signatures = native_signatures
    if include_c3:
        signatures = c3_signatures + native_signatures

    haystack = text.lower()
    tally: dict[str, int] = {}
    for regex, tool in signatures:
        hits = len(re.findall(regex, haystack))
        if not hits:
            continue
        tally[tool] = tally.get(tool, 0) + hits

    return tally
|
|
776
|
+
|
|
777
|
+
|
|
778
|
+
# ---------------------------------------------------------------------------
|
|
779
|
+
# Task Result
|
|
780
|
+
# ---------------------------------------------------------------------------
|
|
781
|
+
|
|
782
|
+
@dataclass
class TaskResult:
    """Outcome of one task for one provider: a C3 run paired with a baseline run."""

    task_id: str
    task_category: str
    task_difficulty: str = "medium"
    provider: str = ""
    c3_response: CLIResponse = field(default_factory=CLIResponse)
    baseline_response: CLIResponse = field(default_factory=CLIResponse)
    c3_score: EvalScore = field(default_factory=EvalScore)
    baseline_score: EvalScore = field(default_factory=EvalScore)

    @property
    def c3_wins(self) -> bool:
        """True when C3 scored higher; on an exact tie, when C3 had lower latency."""
        ours = self.c3_score.combined_score
        theirs = self.baseline_score.combined_score
        if ours == theirs:
            # Tiebreaker: faster (lower latency) wins
            return self.c3_response.latency_ms < self.baseline_response.latency_ms
        return ours > theirs

    @property
    def score_delta(self) -> float:
        """C3 combined score minus baseline combined score."""
        return self.c3_score.combined_score - self.baseline_score.combined_score

    @property
    def difficulty_weight(self) -> float:
        """Weight looked up in DIFFICULTY_WEIGHTS (1.0 for unknown difficulties)."""
        return DIFFICULTY_WEIGHTS.get(self.task_difficulty, 1.0)

    def efficiency(self) -> dict:
        """Compute per-task efficiency metrics (time, cost, tokens saved).

        "Saved" values are baseline minus C3, so positive means C3 used less.
        Percentages are relative to the baseline and are 0.0 when the
        baseline value is zero.
        """
        with_c3 = self.c3_response
        without_c3 = self.baseline_response

        def _total_tokens(resp):
            return (resp.input_tokens + resp.output_tokens
                    + resp.cache_creation_tokens + resp.cache_read_tokens)

        def _pct(saved, total):
            if not total:
                return 0.0
            return round(saved / total * 100, 1)

        def _quality_per_dollar(score, cost):
            # Zero cost (or a zero quotient) collapses to the integer 0.
            if not cost:
                return 0
            return score.combined_score / cost or 0

        baseline_tokens = _total_tokens(without_c3)
        time_saved = without_c3.latency_ms - with_c3.latency_ms
        cost_saved = without_c3.cost_usd - with_c3.cost_usd
        tokens_saved = baseline_tokens - _total_tokens(with_c3)
        turns_saved = without_c3.num_turns - with_c3.num_turns

        c3_qpd = _quality_per_dollar(self.c3_score, with_c3.cost_usd)
        base_qpd = _quality_per_dollar(self.baseline_score, without_c3.cost_usd)

        return {
            "time_saved_ms": round(time_saved, 1),
            "time_saved_pct": _pct(time_saved, without_c3.latency_ms),
            "cost_saved_usd": round(cost_saved, 6),
            "cost_saved_pct": _pct(cost_saved, without_c3.cost_usd),
            "tokens_saved": tokens_saved,
            "tokens_saved_pct": _pct(tokens_saved, baseline_tokens),
            "turns_saved": turns_saved,
            "quality_per_dollar_c3": round(c3_qpd, 2),
            "quality_per_dollar_baseline": round(base_qpd, 2),
        }

    def to_dict(self) -> dict:
        """Serialise the result, including derived win/delta/efficiency values."""
        payload = {
            "task_id": self.task_id,
            "task_category": self.task_category,
            "task_difficulty": self.task_difficulty,
            "difficulty_weight": self.difficulty_weight,
            "provider": self.provider,
        }
        payload["c3_response"] = self.c3_response.to_dict()
        payload["baseline_response"] = self.baseline_response.to_dict()
        payload["c3_score"] = self.c3_score.to_dict()
        payload["baseline_score"] = self.baseline_score.to_dict()
        payload["c3_wins"] = self.c3_wins
        payload["score_delta"] = round(self.score_delta, 3)
        payload["efficiency"] = self.efficiency()
        return payload
|
|
856
|
+
|
|
857
|
+
|
|
858
|
+
# ---------------------------------------------------------------------------
|
|
859
|
+
# Benchmark Engine
|
|
860
|
+
# ---------------------------------------------------------------------------
|
|
861
|
+
|
|
862
|
+
def detect_providers(
    requested: list[str] | None = None,
    model_overrides: dict[str, str] | None = None,
    permission_mode: str = "bypassPermissions",
) -> list[CLIProvider]:
    """Detect available AI CLIs on the system.

    Every known provider (claude, gemini, codex) is given *permission_mode*,
    has ``detect()`` called on it, and then receives its model override if
    one is supplied.

    Args:
        requested: Provider names to keep; falsy keeps all of them.
        model_overrides: Optional mapping of provider name to model.
        permission_mode: Value assigned to each provider's ``permission_mode``.

    Returns:
        Providers whose ``available`` attribute is truthy, in the fixed
        claude, gemini, codex order.
    """
    candidates = [CLIProvider(name=cli) for cli in ("claude", "gemini", "codex")]
    overrides = model_overrides or {}

    for candidate in candidates:
        candidate.permission_mode = permission_mode
        candidate.detect()
        if candidate.name in overrides:
            candidate.model = overrides[candidate.name]

    return [
        candidate
        for candidate in candidates
        if (not requested or candidate.name in requested) and candidate.available
    ]
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
# ---------------------------------------------------------------------------
|
|
887
|
+
# Result cache helpers
|
|
888
|
+
# ---------------------------------------------------------------------------
|
|
889
|
+
|
|
890
|
+
def _task_cache_key(task: "E2ETask", providers: list["CLIProvider"]) -> str:
    """Return a 16-hex-character cache key for a task/provider-set pair.

    The key is a SHA-256 prefix over a version tag, the task id, the task
    query and each provider's ``name:model``. Providers are sorted by name,
    so their order in *providers* does not matter.
    """
    import hashlib

    cache_version = "v2"  # Bump when prompt template or scoring changes
    by_name = sorted(providers, key=lambda p: p.name)
    provider_sig = ",".join(f"{p.name}:{p.model or ''}" for p in by_name)
    payload = f"{cache_version}|{task.id}|{task.query}|{provider_sig}"
    digest = hashlib.sha256(payload.encode()).hexdigest()
    return digest[:16]
|
|
897
|
+
|
|
898
|
+
|
|
899
|
+
def _task_result_to_cache_dict(tr: "TaskResult") -> dict:
    """Serialise the cacheable parts of a TaskResult.

    Identification fields are copied as-is; the two scores and the two
    responses are stored via their own ``to_dict()``.
    """
    payload = {
        name: getattr(tr, name)
        for name in ("task_id", "task_category", "task_difficulty", "provider")
    }
    for name in ("c3_score", "baseline_score", "c3_response", "baseline_response"):
        payload[name] = getattr(tr, name).to_dict()
    return payload
|
|
911
|
+
|
|
912
|
+
|
|
913
|
+
def _load_result_cache(project_path: str) -> dict:
    """Load the benchmark result cache for *project_path*.

    Reads ``<project_path>/.c3/e2e_benchmark/result_cache.json``.

    Returns:
        The cached mapping, or an empty dict when the file is missing,
        unreadable, not valid JSON, or valid JSON that is not an object
        (callers use ``.get``/item assignment on the result).
    """
    cache_path = Path(project_path) / ".c3" / "e2e_benchmark" / "result_cache.json"
    log = logging.getLogger("c3.e2e")
    try:
        loaded = json.loads(cache_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return {}  # no cache yet - the normal first-run case, not worth logging
    except (OSError, ValueError, RecursionError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        log.debug("Failed to load result cache", exc_info=True)
        return {}
    if not isinstance(loaded, dict):
        log.debug("Ignoring result cache: expected a JSON object, got %s",
                  type(loaded).__name__)
        return {}
    return loaded
|
|
921
|
+
|
|
922
|
+
|
|
923
|
+
def _save_result_cache(project_path: str, cache: dict) -> None:
    """Persist *cache* to ``<project_path>/.c3/e2e_benchmark/result_cache.json``.

    Saving is best-effort: any failure (directory creation, serialisation,
    write, rename) is logged at debug level and swallowed. The data is
    written to a temporary sibling file and then renamed over the target, so
    an interrupted save never leaves a truncated cache behind.
    """
    cache_path = Path(project_path) / ".c3" / "e2e_benchmark" / "result_cache.json"
    # The pid keeps concurrent benchmark processes from sharing a temp file.
    tmp_path = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.tmp")
    try:
        # Serialise first: a non-serialisable cache must not touch the disk.
        payload = json.dumps(cache, indent=2)
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(cache_path)
    except (OSError, TypeError, ValueError):
        logging.getLogger("c3.e2e").debug("Failed to save result cache", exc_info=True)
        try:
            tmp_path.unlink()
        except OSError:
            pass  # temp file was never created or is already gone
|
|
930
|
+
|
|
931
|
+
|
|
932
|
+
class E2EBenchmark:
|
|
933
|
+
"""Orchestrates end-to-end benchmark runs across providers and tasks."""
|
|
934
|
+
|
|
935
|
+
def __init__(
    self,
    project_path: str,
    providers: list[CLIProvider],
    tasks: list[E2ETask],
    evaluator: Evaluator,
    timeout: int = 120,
    parallel: bool = True,
    verbose: bool = False,
    on_progress: callable = None,
    task_workers: int = 1,
    cache: bool = True,
    permission_mode: str = "bypassPermissions",
):
    """Store the benchmark configuration and load the result cache.

    Args:
        project_path: Project root; stored as a resolved absolute path.
        providers: Providers to run every task against.
        tasks: Tasks to execute.
        evaluator: Scorer used on each response.
        timeout: Value passed to each provider run.
        parallel: Run providers concurrently when there is more than one.
        verbose: Print progress to stdout.
        on_progress: Optional callback ``(completed, total, task_result)``.
        task_workers: Number of tasks run concurrently; values below 1
            are raised to 1.
        cache: Load and use the on-disk result cache.
        permission_mode: Stored for later use; "plan" disables the
            worktree sandbox in ``run_all``.
    """
    # What to run
    self.providers = providers
    self.tasks = tasks
    self.evaluator = evaluator

    # How to run it
    self.timeout = timeout
    self.parallel = parallel
    self.verbose = verbose
    self.on_progress = on_progress
    self.task_workers = max(1, task_workers)
    self.permission_mode = permission_mode

    # Where to run it; the sandbox defaults to the project itself
    self.project_path = str(Path(project_path).resolve())
    self._sandbox_path = self.project_path

    # Result bookkeeping
    self.cache = cache
    if cache:
        self._result_cache = _load_result_cache(project_path)
    else:
        self._result_cache = {}
    self.results: list[TaskResult] = []
|
|
963
|
+
|
|
964
|
+
def run_all(self) -> list[TaskResult]:
    """Run all tasks against all providers, return results.

    Task-level parallelism is controlled by self.task_workers (default 1).
    Setting task_workers > 1 runs multiple tasks concurrently - useful when
    benchmarking many tasks and Anthropic rate limits allow it.

    Unless ``permission_mode`` is "plan", tasks run inside a temporary
    detached git worktree which is removed again when the run ends (also on
    error). If the worktree cannot be created the run happens in-place.

    Returns:
        ``self.results``: one TaskResult per (task, provider) pair that
        produced a result.
    """
    import threading

    self.results = []
    self._wins = 0  # rolling C3 win count for verbose scoreboard
    self._task_elapsed: list[float] = []  # per-task wall times for ETA
    task_count = len(self.tasks)
    total = task_count * len(self.providers)
    completed = 0
    run_start = time.perf_counter()
    sep = "─" if _UNI else "-"
    lock = threading.Lock()

    worktree_dir = self._create_worktree_sandbox()
    self._sandbox_path = worktree_dir or self.project_path

    def print_header(task_idx, task, eta=""):
        label = f" Task {task_idx+1}/{task_count} | [{task.category}] {task.id} ({task.difficulty}){eta} "
        print(f"\n {sep*3}{label}{sep * max(0, 68 - len(label))}", flush=True)
        print(f" Q: {task.query[:120]}{'...' if len(task.query) > 120 else ''}", flush=True)

    def run_task(indexed_task, eta=""):
        """Run (or fetch from cache) one task; return (elapsed_s, results)."""
        task_idx, task = indexed_task
        if self.verbose:
            # The header is printed here and only here, so it appears once
            # per task in both the threaded and the sequential path.
            with lock:
                print_header(task_idx, task, eta)

        task_start = time.perf_counter()

        # Check cache first
        cached = self._get_cached_results(task)
        if cached is not None:
            if self.verbose:
                with lock:
                    print(f" >> [{task.id}] using cached results", flush=True)
            return time.perf_counter() - task_start, cached

        if self.parallel and len(self.providers) > 1:
            task_results = self._run_task_parallel(task)
        else:
            task_results = self._run_task_sequential(task)

        if self.cache:
            self._save_cached_results(task, task_results)

        return time.perf_counter() - task_start, task_results

    def record(elapsed, task_results):
        """Fold one task's results into the running totals and report them."""
        nonlocal completed
        self._task_elapsed.append(elapsed)
        for tr in task_results:
            self.results.append(tr)
            completed += 1
            if tr.c3_wins:
                self._wins += 1
            if self.on_progress:
                self.on_progress(completed, total, tr)
            if self.verbose:
                self._print_result(tr, completed, total)

    try:
        if self.task_workers > 1:
            with ThreadPoolExecutor(max_workers=self.task_workers) as pool:
                futures = {pool.submit(run_task, it): it for it in enumerate(self.tasks)}
                for future in as_completed(futures):
                    try:
                        elapsed, task_results = future.result()
                    except Exception as e:
                        # One failing task must not abort the whole run.
                        elapsed, task_results = 0.0, []
                        if self.verbose:
                            print(f" !! Task {futures[future][1].id} failed: {e}", flush=True)
                    with lock:
                        record(elapsed, task_results)
        else:
            for task_idx, task in enumerate(self.tasks):
                eta = ""
                if self.verbose and self._task_elapsed:
                    avg_s = sum(self._task_elapsed) / len(self._task_elapsed)
                    remaining_tasks = task_count - task_idx
                    eta = f" ~{_fmt_duration(avg_s * remaining_tasks)} left"
                elapsed, task_results = run_task((task_idx, task), eta)
                record(elapsed, task_results)

        if self.cache:
            _save_result_cache(self.project_path, self._result_cache)

        if self.verbose:
            elapsed = time.perf_counter() - run_start
            result_count = max(len(self.results), 1)
            c3_wins = sum(1 for r in self.results if r.c3_wins)
            avg_c3 = sum(r.c3_score.combined_score for r in self.results) / result_count
            avg_base = sum(r.baseline_score.combined_score for r in self.results) / result_count
            cached_count = sum(1 for r in self.results if getattr(r, "_from_cache", False))
            cache_note = f" ({cached_count} cached)" if cached_count else ""
            print(f"\n {sep*72}")
            print(f" Done in {_fmt_duration(elapsed)}{cache_note} | C3 won {c3_wins}/{total} tasks "
                  f"({100*c3_wins/max(total,1):.1f}%) | "
                  f"avg score: C3={avg_c3:.3f} Base={avg_base:.3f}", flush=True)

        return self.results
    finally:
        self._remove_worktree_sandbox(worktree_dir)


def _create_worktree_sandbox(self) -> str | None:
    """Create the detached git worktree used as sandbox.

    Returns:
        The worktree directory, or None when no sandbox is used
        (``permission_mode == "plan"``, not a git checkout, or creation
        failed). On failure any partially created directory is removed.
    """
    if self.permission_mode == "plan":
        return None

    # ``.git`` is a directory in a normal clone but a file in linked
    # worktrees and submodules, so test for existence rather than isdir.
    if not os.path.exists(os.path.join(self.project_path, ".git")):
        if self.verbose:
            print(" !! Not a git repo — skipping worktree sandbox")
        return None

    suffix = f"_c3bench_{os.getpid()}"
    worktree_dir = os.path.join(os.path.dirname(self.project_path), f".c3_bench{suffix}")
    try:
        proc = subprocess.run(
            ["git", "worktree", "add", worktree_dir, "HEAD", "--detach"],
            cwd=self.project_path, capture_output=True, text=True, timeout=30,
        )
        if proc.returncode != 0:
            # Without this check the copies below would create the directory
            # themselves and the benchmark would run in a tree with no sources.
            detail = (proc.stderr or proc.stdout or "").strip()
            raise RuntimeError(detail or f"git worktree add exited with {proc.returncode}")

        # Selective .c3/ copy
        src_c3 = os.path.join(self.project_path, ".c3")
        dst_c3 = os.path.join(worktree_dir, ".c3")
        if os.path.isdir(src_c3):
            os.makedirs(dst_c3, exist_ok=True)
            for item in ("index", "doc_index"):
                s = os.path.join(src_c3, item)
                if os.path.isdir(s):
                    shutil.copytree(s, os.path.join(dst_c3, item), dirs_exist_ok=True)
            for item in ("facts.json", "config.json"):
                s = os.path.join(src_c3, item)
                if os.path.isfile(s):
                    shutil.copy2(s, os.path.join(dst_c3, item))

        # CLAUDE.md is required for the C3 tool mandate; .mcp.json registers
        # the C3 MCP server with the Claude CLI.
        for name in ("CLAUDE.md", ".mcp.json"):
            s = os.path.join(self.project_path, name)
            if os.path.isfile(s):
                shutil.copy2(s, os.path.join(worktree_dir, name))

        # Copy .claude/ settings (contains MCP hooks, local config)
        src_claude = os.path.join(self.project_path, ".claude")
        dst_claude = os.path.join(worktree_dir, ".claude")
        if os.path.isdir(src_claude) and not os.path.isdir(dst_claude):
            shutil.copytree(src_claude, dst_claude, dirs_exist_ok=True)

        if not os.path.isfile(os.path.join(worktree_dir, "CLAUDE.md")):
            print(" !! Warning: CLAUDE.md not found in worktree — C3 instructions may be missing")
        if not os.path.isfile(os.path.join(worktree_dir, ".mcp.json")):
            print(" !! Warning: .mcp.json not found in worktree — MCP tools won't be available")
        if self.verbose:
            print(f" Sandbox: {worktree_dir}")
    except Exception as e:
        print(f" !! Worktree creation failed ({e}), running in-place")
        # Do not leave a half-built worktree behind.
        self._remove_worktree_sandbox(worktree_dir)
        return None
    return worktree_dir


def _remove_worktree_sandbox(self, worktree_dir) -> None:
    """Best-effort removal of the sandbox worktree; never raises."""
    if not worktree_dir or not os.path.isdir(worktree_dir):
        return
    try:
        subprocess.run(
            ["git", "worktree", "remove", worktree_dir, "--force"],
            cwd=self.project_path, capture_output=True, text=True, timeout=30,
        )
    except (OSError, subprocess.SubprocessError):
        pass  # git missing or timed out - fall through to plain deletion
    # Covers the case where git refused or only partly removed the directory.
    shutil.rmtree(worktree_dir, ignore_errors=True)
|
|
1152
|
+
|
|
1153
|
+
def _get_cached_results(self, task: E2ETask) -> list[TaskResult] | None:
    """Return cached TaskResults if valid cache entry exists, else None.

    An entry is valid when it is a dict whose ``ts`` is at most 24 hours
    old. Expired or malformed entries are dropped from the in-memory cache.
    Scores are restored attribute by attribute. For the two responses only
    scalar attributes (bool/int/float/str) are restored, so nested objects
    on the response keep their freshly constructed defaults. Every
    returned result carries ``_from_cache = True``.
    """
    key = _task_cache_key(task, self.providers)
    entry = self._result_cache.get(key)
    if not entry:
        return None
    if not isinstance(entry, dict):
        self._result_cache.pop(key, None)
        return None

    # Simple TTL: 24 hours
    ts = entry.get("ts", 0)
    if not isinstance(ts, (int, float)) or time.time() - ts > 86400:
        self._result_cache.pop(key, None)
        return None

    scalar = (bool, int, float, str)

    def _copy_attrs(target, saved, scalars_only=False):
        if not isinstance(saved, dict):
            return
        for attr, value in saved.items():
            if not hasattr(target, attr):
                continue
            if scalars_only:
                current = getattr(target, attr)
                if not isinstance(value, scalar):
                    continue
                if current is not None and not isinstance(current, scalar):
                    continue
            try:
                setattr(target, attr, value)
            except AttributeError:
                # Read-only/computed attribute: it is derived from the
                # other restored values, so skipping it is correct.
                pass

    try:
        results = []
        for saved in entry["results"]:
            tr = TaskResult(
                task_id=saved["task_id"],
                task_category=saved["task_category"],
                task_difficulty=saved.get("task_difficulty", "medium"),
                provider=saved.get("provider", ""),
            )
            # Restore scores from cached dicts
            for attr in ("c3_score", "baseline_score"):
                score = EvalScore()
                _copy_attrs(score, saved.get(attr, {}))
                setattr(tr, attr, score)
            # Restore latency/token/cost figures so efficiency metrics and
            # the latency tie-break are not computed from all-zero defaults.
            for attr in ("c3_response", "baseline_response"):
                _copy_attrs(getattr(tr, attr), saved.get(attr, {}), scalars_only=True)
            tr._from_cache = True
            results.append(tr)
        return results
    except (KeyError, TypeError, AttributeError, ValueError):
        # Corrupt or outdated entry: treat it as a cache miss.
        return None
|
|
1185
|
+
|
|
1186
|
+
def _save_cached_results(self, task: E2ETask, results: list[TaskResult]) -> None:
    """Store *results* for *task* in the in-memory cache with the current time.

    Only ``self._result_cache`` is updated here; nothing is written to disk.
    """
    cache_key = _task_cache_key(task, self.providers)
    stamped_at = time.time()
    serialised = [_task_result_to_cache_dict(item) for item in results]
    self._result_cache[cache_key] = {"ts": stamped_at, "results": serialised}
|
|
1193
|
+
|
|
1194
|
+
def _run_task_parallel(self, task: E2ETask) -> list[TaskResult]:
    """Run *task* on every provider concurrently, one worker per provider.

    Results are collected in completion order. A provider whose run raises
    contributes a TaskResult with the exception text stored as the error of
    both its C3 and its baseline response.
    """
    collected = []
    prompt = build_prompt(task)

    with ThreadPoolExecutor(max_workers=len(self.providers)) as executor:
        pending = {
            executor.submit(self._run_single, task, candidate, prompt): candidate
            for candidate in self.providers
        }

        for finished in as_completed(pending):
            try:
                outcome = finished.result()
            except Exception as exc:
                outcome = TaskResult(
                    task_id=task.id,
                    task_category=task.category,
                    task_difficulty=task.difficulty,
                    provider=pending[finished].name,
                )
                outcome.c3_response.error = str(exc)
                outcome.baseline_response.error = str(exc)
            collected.append(outcome)

    return collected
|
|
1221
|
+
|
|
1222
|
+
def _run_task_sequential(self, task: E2ETask) -> list[TaskResult]:
    """Run *task* on each provider in turn, in ``self.providers`` order.

    A provider whose run raises contributes a TaskResult with the exception
    text stored as the error of both its C3 and its baseline response, the
    same as the parallel path does, so neither side is reported as clean.
    """
    results = []
    prompt = build_prompt(task)

    for provider in self.providers:
        try:
            results.append(self._run_single(task, provider, prompt))
        except Exception as e:
            failed = TaskResult(
                task_id=task.id,
                task_category=task.category,
                task_difficulty=task.difficulty,
                provider=provider.name,
            )
            message = str(e)
            failed.c3_response.error = message
            # The exception aborted the whole provider run, baseline included.
            failed.baseline_response.error = message
            results.append(failed)

    return results
|
|
1241
|
+
|
|
1242
|
+
def _run_single(self, task: E2ETask, provider: CLIProvider,
                prompt: str) -> TaskResult:
    """Run one task on one provider, with and without C3, and score both.

    Both runs are submitted to a two-worker pool so they execute
    concurrently in ``self._sandbox_path``. A response is scored only when
    it has text; otherwise the default score is kept.
    """
    outcome = TaskResult(
        task_id=task.id,
        task_category=task.category,
        task_difficulty=task.difficulty,
        provider=provider.name,
    )

    if self.verbose:
        print(f" >> {provider.name:>7} | C3 + BASE | starting in parallel...", flush=True)

    # Run C3 and baseline concurrently - halves wall time per task
    # Always single-turn: multi-turn doubles wall time and causes timeouts
    with ThreadPoolExecutor(max_workers=2) as executor:
        c3_job, baseline_job = [
            executor.submit(provider.run, prompt, self._sandbox_path, use_c3, self.timeout, False)
            for use_c3 in (True, False)
        ]
        outcome.c3_response = c3_job.result()
        outcome.baseline_response = baseline_job.result()

    if self.verbose:
        self._print_call_result(provider.name, "C3 ", outcome.c3_response)
        self._print_call_result(provider.name, "BASE", outcome.baseline_response)

    for response_attr, score_attr in (("c3_response", "c3_score"),
                                      ("baseline_response", "baseline_score")):
        answer = getattr(outcome, response_attr).text
        if answer:
            setattr(outcome, score_attr, self.evaluator.score(answer, task.ground_truth))

    return outcome
|
|
1272
|
+
|
|
1273
|
+
def _print_call_result(self, provider: str, label: str, resp: CLIResponse):
|
|
1274
|
+
lat = f"{resp.latency_ms/1000:.1f}s"
|
|
1275
|
+
tok = resp.input_tokens + resp.output_tokens + resp.cache_read_tokens + resp.cache_creation_tokens
|
|
1276
|
+
tok_str = f"{tok:,} tok" if tok else "? tok"
|
|
1277
|
+
if resp.error:
|
|
1278
|
+
status = f"ERROR: {resp.error[:70]}"
|
|
1279
|
+
else:
|
|
1280
|
+
parts = [f"done {lat:>6}", f"{tok_str:>12}"]
|
|
1281
|
+
if resp.num_turns:
|
|
1282
|
+
parts.append(f"{resp.num_turns} turn{'s' if resp.num_turns != 1 else ''}")
|
|
1283
|
+
if resp.cost_usd:
|
|
1284
|
+
parts.append(f"${resp.cost_usd:.4f}")
|
|
1285
|
+
if resp.model_used:
|
|
1286
|
+
parts.append(f"[{resp.model_used[:20]}]")
|
|
1287
|
+
status = " ".join(parts)
|
|
1288
|
+
print(f" >> {provider:>7} | {label:<4} | {status}", flush=True)
|
|
1289
|
+
|
|
1290
|
+
def _print_result(self, tr: TaskResult, current: int, total: int):
    """Print the per-task verdict line and, on a C3 loss, the weakest dimension.

    Args:
        tr: Completed task result (scores, responses, efficiency figures).
        current: 1-based position of this result in the run.
        total: Total number of results expected in the run.

    The running win count is read from ``self._wins`` and shown as ``?``
    when that attribute has not been set.
    """
    efficiency = tr.efficiency()
    score_gap = tr.score_delta
    win_count = getattr(self, "_wins", "?")
    c3_flag = " [C3-ERR]" if tr.c3_response.error else ""
    base_flag = " [BASE-ERR]" if tr.baseline_response.error else ""

    # _UNI selects the variant containing a check mark; the fallback is ASCII.
    if tr.c3_wins:
        verdict = "C3 \u2713 wins" if _UNI else "C3 WINS "
    else:
        verdict = "BASE wins" if _UNI else "BASE WINS"

    seconds_saved = efficiency["time_saved_ms"] / 1000
    dollars_saved = efficiency["cost_saved_usd"]

    line = (
        f" [{current:>2}/{total}] {verdict} | {tr.provider} | "
        f"C3={tr.c3_score.combined_score:.3f} Base={tr.baseline_score.combined_score:.3f} "
        f"delta={score_gap:+.3f} | "
        f"time {seconds_saved:+.0f}s cost {dollars_saved:+.4f} | "
        f"[{win_count}/{current} wins]{c3_flag}{base_flag}"
    )
    print(line, flush=True)

    # Only a clean C3 loss (no C3 error) gets the dimension breakdown.
    if tr.c3_wins or tr.c3_response.error:
        return

    c3_score, base_score = tr.c3_score, tr.baseline_score
    gaps = {
        "file_mention": c3_score.file_mention_score - base_score.file_mention_score,
        "completeness": c3_score.completeness_score - base_score.completeness_score,
        "structural": c3_score.structural_score - base_score.structural_score,
        "keyword": c3_score.keyword_score - base_score.keyword_score,
    }
    # min() keeps the first dimension in declaration order when gaps tie.
    weakest = min(gaps, key=gaps.get)
    if gaps[weakest] < -0.05:
        c3_value = getattr(c3_score, f"{weakest}_score", 0)
        base_value = getattr(base_score, f"{weakest}_score", 0)
        print(f" weak dim: {weakest} C3={c3_value:.2f} Base={base_value:.2f}", flush=True)
|
|
1327
|
+
|
|
1328
|
+
|
|
1329
|
+
# ---------------------------------------------------------------------------
# Run History & Trends
# ---------------------------------------------------------------------------
|
|
1334
|
+
|
|
1335
|
+
def load_run_history(project_path: str, max_runs: int = 20) -> list[dict]:
    """Load past benchmark runs from .c3/e2e_benchmark/runs/ sorted newest-first.

    Files matching ``e2e_*.json`` are ordered by file name, descending, and
    the first ``max_runs`` of them are read.  (Newest-first therefore assumes
    the file names embed a sortable timestamp -- TODO confirm with the writer.)

    Unusable files are skipped rather than aborting the whole load: files
    that cannot be read, are not valid UTF-8, are not valid JSON, or whose
    top-level JSON value is not an object.

    Args:
        project_path: Project root containing the ``.c3`` directory.
        max_runs: Maximum number of run files to consider.

    Returns:
        One dict per readable run, each with an added ``"_file"`` key holding
        the path it was loaded from.  Empty when the runs directory is absent.
    """
    runs_dir = Path(project_path) / ".c3" / "e2e_benchmark" / "runs"
    if not runs_dir.is_dir():
        return []

    history = []
    for run_file in sorted(runs_dir.glob("e2e_*.json"), reverse=True)[:max_runs]:
        try:
            data = json.loads(run_file.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError raised for files that are not valid UTF-8.
            continue
        if not isinstance(data, dict):
            # Valid JSON, but not a report object: item assignment below
            # would raise TypeError (list) or be meaningless (scalar).
            continue
        data["_file"] = str(run_file)
        history.append(data)
    return history
|
|
1351
|
+
|
|
1352
|
+
|
|
1353
|
+
def compute_trends(current: dict, history: list[dict]) -> dict:
    """Compute trend data from current run + historical runs.

    Args:
        current: Report dict of the run being generated.
        history: Past report dicts ordered newest-first.

    Returns:
        ``{"available": False}`` when ``history`` is empty.  Otherwise a dict
        with ``run_count``, ``sparklines`` (per-run series ordered oldest to
        newest and ending with ``current``), ``since_last`` (current minus the
        newest past run), ``category_trends`` and ``moving_averages``.
    """
    if not history:
        return {"available": False}

    def _section(run, key):
        # Past runs are read back from JSON on disk; treat a missing, null or
        # non-object section as empty instead of failing on ``None.get``.
        value = run.get(key)
        return value if isinstance(value, dict) else {}

    def _mcp_ratio(run):
        # Prefer the promoted top-level value and fall back to the nested
        # tool-analysis summary, so every call site resolves it the same way.
        ratio = run.get("mcp_ratio")
        if ratio is None:
            summary = _section(_section(run, "tool_analysis"), "summary")
            ratio = summary.get("mcp_ratio") or 0
        return ratio

    # Build timeline: history (oldest-first) + current
    timeline = list(reversed(history)) + [current]

    # Extract key metrics across runs
    win_rates = []
    avg_deltas = []
    avg_c3_scores = []
    avg_base_scores = []
    total_costs_c3 = []
    total_costs_base = []
    timestamps = []
    mcp_ratios = []

    for run in timeline:
        sc = _section(run, "scorecard")
        eff = _section(run, "efficiency_summary")
        win_rates.append(sc.get("c3_win_rate", 0))
        avg_deltas.append(sc.get("avg_score_delta", 0))
        avg_c3_scores.append(sc.get("avg_score_c3", 0))
        avg_base_scores.append(sc.get("avg_score_baseline", 0))
        total_costs_c3.append(eff.get("total_cost_c3_usd", 0))
        total_costs_base.append(eff.get("total_cost_baseline_usd", 0))
        timestamps.append(run.get("timestamp", ""))
        mcp_ratios.append(_mcp_ratio(run))

    # Since-last-run deltas (compare current vs most recent past run)
    prev = history[0]  # newest past run (history is newest-first)
    prev_sc = _section(prev, "scorecard")
    prev_eff = _section(prev, "efficiency_summary")
    cur_sc = _section(current, "scorecard")
    cur_eff = _section(current, "efficiency_summary")

    since_last = {
        "win_rate_delta": round(cur_sc.get("c3_win_rate", 0) - prev_sc.get("c3_win_rate", 0), 1),
        "score_delta_delta": round(
            cur_sc.get("avg_score_delta", 0) - prev_sc.get("avg_score_delta", 0), 3
        ),
        "avg_c3_delta": round(
            cur_sc.get("avg_score_c3", 0) - prev_sc.get("avg_score_c3", 0), 3
        ),
        "cost_saved_delta": round(
            cur_eff.get("total_cost_saved_usd", 0) - prev_eff.get("total_cost_saved_usd", 0), 4
        ),
        "token_saved_delta": (
            cur_eff.get("total_tokens_saved", 0) - prev_eff.get("total_tokens_saved", 0)
        ),
        "mcp_ratio_delta": round(_mcp_ratio(current) - _mcp_ratio(prev), 1),
        "prev_timestamp": prev.get("timestamp", "unknown"),
        "prev_total_tasks": prev.get("total_results", 0),
    }

    # Per-category trends; a category missing on one side counts as 0 there.
    cur_cats = _section(current, "category_stats")
    prev_cats = _section(prev, "category_stats")
    category_trends = {}
    for cat in set(list(cur_cats.keys()) + list(prev_cats.keys())):
        cur_wr = cur_cats.get(cat, {}).get("win_rate_c3", 0)
        prev_wr = prev_cats.get(cat, {}).get("win_rate_c3", 0)
        cur_d = cur_cats.get(cat, {}).get("avg_score_delta", 0)
        prev_d = prev_cats.get(cat, {}).get("avg_score_delta", 0)
        category_trends[cat] = {
            "win_rate_delta": round(cur_wr - prev_wr, 1),
            "score_delta_delta": round(cur_d - prev_d, 3),
            "improving": cur_d > prev_d,
        }

    # Moving averages (3-run window)
    def _ma(arr, window=3):
        # With fewer runs than the window, report the latest value as-is.
        if len(arr) < window:
            return arr[-1] if arr else 0
        return round(sum(arr[-window:]) / window, 3)

    return {
        "available": True,
        "run_count": len(timeline),
        "sparklines": {
            "win_rates": [round(x, 1) for x in win_rates],
            "avg_deltas": [round(x, 3) for x in avg_deltas],
            "avg_c3_scores": [round(x, 3) for x in avg_c3_scores],
            "avg_base_scores": [round(x, 3) for x in avg_base_scores],
            "costs_c3": [round(x, 4) for x in total_costs_c3],
            "costs_base": [round(x, 4) for x in total_costs_base],
            "mcp_ratios": [round(x, 1) for x in mcp_ratios],
            "timestamps": timestamps,
        },
        "since_last": since_last,
        "category_trends": category_trends,
        "moving_averages": {
            "win_rate_3run": _ma(win_rates),
            "delta_3run": _ma(avg_deltas),
            "c3_score_3run": _ma(avg_c3_scores),
        },
    }
|
|
1456
|
+
|
|
1457
|
+
|
|
1458
|
+
# ---------------------------------------------------------------------------
|
|
1459
|
+
# Report Generation
|
|
1460
|
+
# ---------------------------------------------------------------------------
|
|
1461
|
+
|
|
1462
|
+
def generate_e2e_report(
    project_path: str,
    results: list[TaskResult],
    providers: list[CLIProvider],
    tasks: list[E2ETask],
) -> dict:
    """Generate comprehensive JSON report from benchmark results.

    Args:
        project_path: Project root; echoed into the report and used to load
            past runs for the trend section.
        results: One ``TaskResult`` per (task, provider) execution.
        providers: Providers that were benchmarked; a provider with no
            results is left out of ``provider_stats``.
        tasks: Tasks in the benchmark, serialised via ``to_dict()``.

    Returns:
        Report dict with the global scorecard, efficiency summary, dimension
        breakdown, tool analysis and adoption, per-provider and per-category
        stats, the serialised tasks and results, insights and trends.
    """

    def _tokens(resp):
        # Every token counter a response carries, cache traffic included.
        return (
            resp.input_tokens + resp.output_tokens +
            resp.cache_creation_tokens + resp.cache_read_tokens
        )

    def _pct(saved, total):
        return round(saved / total * 100, 1) if total else 0.0

    # Per-provider aggregation
    provider_stats = {}
    for prov in providers:
        rows = [r for r in results if r.provider == prov.name]
        if not rows:
            continue

        count = len(rows)
        won = sum(1 for r in rows if r.c3_wins)
        mean_c3 = sum(r.c3_score.combined_score for r in rows) / count
        mean_base = sum(r.baseline_score.combined_score for r in rows) / count
        mean_delta = sum(r.score_delta for r in rows) / count

        # Weighted win rate
        weight_won = sum(r.difficulty_weight for r in rows if r.c3_wins)
        weight_all = sum(r.difficulty_weight for r in rows)

        tokens_c3 = sum(_tokens(r.c3_response) for r in rows)
        tokens_base = sum(_tokens(r.baseline_response) for r in rows)
        cost_c3 = sum(r.c3_response.cost_usd for r in rows)
        cost_base = sum(r.baseline_response.cost_usd for r in rows)
        latency_c3 = sum(r.c3_response.latency_ms for r in rows)
        latency_base = sum(r.baseline_response.latency_ms for r in rows)

        provider_stats[prov.name] = {
            "model": prov.model or "default",
            "tasks_run": count,
            "c3_wins": won,
            "baseline_wins": count - won,
            "win_rate_c3": round(won / count * 100, 1),
            "weighted_win_rate_c3": round(weight_won / weight_all * 100, 1) if weight_all else 0,
            "avg_score_c3": round(mean_c3, 3),
            "avg_score_baseline": round(mean_base, 3),
            "avg_score_delta": round(mean_delta, 3),
            "total_tokens_c3": tokens_c3,
            "total_tokens_baseline": tokens_base,
            "total_cost_c3_usd": round(cost_c3, 4),
            "total_cost_baseline_usd": round(cost_base, 4),
            "avg_latency_c3_ms": round(latency_c3 / count, 1),
            "avg_latency_baseline_ms": round(latency_base / count, 1),
        }

    # Per-category aggregation
    category_stats = {}
    for cat in sorted(set(r.task_category for r in results)):
        rows = [r for r in results if r.task_category == cat]
        count = len(rows)
        won = sum(1 for r in rows if r.c3_wins)
        mean_delta = sum(r.score_delta for r in rows) / count
        mean_c3 = sum(r.c3_score.combined_score for r in rows) / count
        mean_base = sum(r.baseline_score.combined_score for r in rows) / count

        category_stats[cat] = {
            "tasks_run": count,
            "c3_wins": won,
            "win_rate_c3": round(won / count * 100, 1),
            "avg_score_c3": round(mean_c3, 3),
            "avg_score_baseline": round(mean_base, 3),
            "avg_score_delta": round(mean_delta, 3),
            # The category's difficulty is taken from its first result.
            "difficulty": rows[0].task_difficulty if rows else "unknown",
        }

    # Global scorecard
    total_results = len(results)
    total_c3_wins = sum(1 for r in results if r.c3_wins)
    if total_results:
        global_avg_c3 = sum(r.c3_score.combined_score for r in results) / total_results
        global_avg_base = sum(r.baseline_score.combined_score for r in results) / total_results
    else:
        global_avg_c3 = 0
        global_avg_base = 0

    # Weighted win rate
    weighted_wins = sum(r.difficulty_weight for r in results if r.c3_wins)
    weighted_total = sum(r.difficulty_weight for r in results)
    weighted_win_rate = round(weighted_wins / weighted_total * 100, 1) if weighted_total else 0

    # Efficiency summary
    total_time_c3 = sum(r.c3_response.latency_ms for r in results)
    total_time_base = sum(r.baseline_response.latency_ms for r in results)
    total_cost_c3 = sum(r.c3_response.cost_usd for r in results)
    total_cost_base = sum(r.baseline_response.cost_usd for r in results)
    total_tokens_c3 = sum(_tokens(r.c3_response) for r in results)
    total_tokens_base = sum(_tokens(r.baseline_response) for r in results)

    efficiency_summary = {
        "total_time_c3_s": round(total_time_c3 / 1000, 1),
        "total_time_baseline_s": round(total_time_base / 1000, 1),
        "total_time_saved_s": round((total_time_base - total_time_c3) / 1000, 1),
        "time_saved_pct": _pct(total_time_base - total_time_c3, total_time_base),
        "avg_time_per_task_c3_s": round(total_time_c3 / total_results / 1000, 1) if total_results else 0,
        "avg_time_per_task_baseline_s": round(total_time_base / total_results / 1000, 1) if total_results else 0,
        "total_cost_c3_usd": round(total_cost_c3, 4),
        "total_cost_baseline_usd": round(total_cost_base, 4),
        "total_cost_saved_usd": round(total_cost_base - total_cost_c3, 4),
        "cost_saved_pct": _pct(total_cost_base - total_cost_c3, total_cost_base),
        "total_tokens_c3": total_tokens_c3,
        "total_tokens_baseline": total_tokens_base,
        "total_tokens_saved": total_tokens_base - total_tokens_c3,
        "tokens_saved_pct": _pct(total_tokens_base - total_tokens_c3, total_tokens_base),
        # Projections (assuming 5 sessions/day, 22 days/month)
        "projected_daily_cost_saved_usd": round((total_cost_base - total_cost_c3) * 5, 4),
        "projected_monthly_cost_saved_usd": round((total_cost_base - total_cost_c3) * 5 * 22, 2),
    }

    # Score breakdown by dimension (averaged across all results)
    dimension_breakdown = {}
    for dim in ("keyword_score", "structural_score", "file_mention_score",
                "factual_score", "completeness_score"):
        c3_vals = [getattr(r.c3_score, dim, 0) for r in results]
        base_vals = [getattr(r.baseline_score, dim, 0) for r in results]
        entry = {"avg_c3": 0, "avg_baseline": 0, "delta": 0}
        if c3_vals:
            entry["avg_c3"] = round(sum(c3_vals) / len(c3_vals), 3)
        if base_vals:
            entry["avg_baseline"] = round(sum(base_vals) / len(base_vals), 3)
        if c3_vals and base_vals:
            # The delta is taken from the unrounded means, then rounded once.
            entry["delta"] = round(
                (sum(c3_vals) / len(c3_vals) - sum(base_vals) / len(base_vals)), 3
            )
        dimension_breakdown[dim] = entry

    # Tool usage analysis
    tool_analysis = _build_tool_analysis(results)

    # Tool adoption: how many C3-mode runs actually used MCP tools?
    tasks_using_mcp = sum(1 for r in results if r.c3_response.tool_usage.c3_tool_calls > 0)
    unique_mcp_tools_used = {
        tool_name
        for r in results
        for tool_name in r.c3_response.tool_usage.tool_counts
        if tool_name in _C3_TOOLS or tool_name.startswith("c3_")
    }
    tool_adoption = {
        "tasks_using_mcp": tasks_using_mcp,
        "total_tasks": total_results,
        "adoption_rate": round(tasks_using_mcp / total_results * 100, 1) if total_results else 0,
        "unique_mcp_tools": sorted(unique_mcp_tools_used),
        "unique_mcp_tool_count": len(unique_mcp_tools_used),
    }

    scorecard = {
        "c3_win_rate": round(total_c3_wins / total_results * 100, 1) if total_results else 0,
        "weighted_win_rate": weighted_win_rate,
        "c3_wins": total_c3_wins,
        "baseline_wins": total_results - total_c3_wins,
        "avg_score_c3": round(global_avg_c3, 3),
        "avg_score_baseline": round(global_avg_base, 3),
        "avg_score_delta": round(global_avg_c3 - global_avg_base, 3),
    }

    report_data = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "project_path": project_path,
        "benchmark_type": "e2e_agent",
        "total_tasks": len(tasks),
        "total_results": total_results,
        "providers_tested": [p.name for p in providers],
        "scorecard": scorecard,
        "efficiency_summary": efficiency_summary,
        "dimension_breakdown": dimension_breakdown,
        "tool_analysis": tool_analysis,
        "provider_stats": provider_stats,
        "category_stats": category_stats,
        "tasks": [t.to_dict() for t in tasks],
        "results": [r.to_dict() for r in results],
    }

    # Promote mcp_ratio for easy trend access
    report_data["mcp_ratio"] = tool_analysis.get("summary", {}).get("mcp_ratio", 0)
    report_data["tool_adoption"] = tool_adoption

    # Insights are derived from the assembled report, so they come after the
    # promoted fields above and before the trend section is attached.
    report_data["insights"] = _build_insights(report_data)

    # Load history and compute trends
    report_data["trends"] = compute_trends(report_data, load_run_history(project_path))

    return report_data
|
|
1660
|
+
|
|
1661
|
+
|
|
1662
|
+
def _build_tool_analysis(results: list[TaskResult]) -> dict:
    """Aggregate tool usage data across all results.

    Args:
        results: Task results whose ``c3_response.tool_usage`` and
            ``baseline_response.tool_usage`` are summed.

    Returns:
        Dict with a ``summary`` of totals, per-task averages and the MCP
        share of C3-mode calls; a ``tool_comparison`` list for the 20 most
        used tools; a per-category ``category_breakdown``; and the raw
        per-tool totals for both modes.
    """
    # Pair every result's category with its two usage records once.
    usages = [
        (r.task_category, r.c3_response.tool_usage, r.baseline_response.tool_usage)
        for r in results
    ]

    # Aggregate totals
    total_c3_calls = sum(c3.total_tool_calls for _cat, c3, _base in usages)
    total_base_calls = sum(base.total_tool_calls for _cat, _c3, base in usages)
    total_c3_unique = sum(c3.unique_tools for _cat, c3, _base in usages)
    total_base_unique = sum(base.unique_tools for _cat, _c3, base in usages)
    total_c3_mcp = sum(c3.c3_tool_calls for _cat, c3, _base in usages)
    total_c3_native = sum(c3.native_tool_calls for _cat, c3, _base in usages)
    total_base_native = sum(base.native_tool_calls for _cat, _c3, base in usages)

    # Global tool counts and per-category tool usage
    c3_tool_totals: dict[str, int] = {}
    base_tool_totals: dict[str, int] = {}
    category_tools: dict[str, dict] = {}

    for cat, c3, base in usages:
        for tool, count in c3.tool_counts.items():
            c3_tool_totals[tool] = c3_tool_totals.get(tool, 0) + count
        for tool, count in base.tool_counts.items():
            base_tool_totals[tool] = base_tool_totals.get(tool, 0) + count

        bucket = category_tools.setdefault(cat, {
            "c3_total_calls": 0, "base_total_calls": 0,
            "c3_mcp_calls": 0, "c3_native_calls": 0,
            "base_native_calls": 0,
        })
        bucket["c3_total_calls"] += c3.total_tool_calls
        bucket["base_total_calls"] += base.total_tool_calls
        bucket["c3_mcp_calls"] += c3.c3_tool_calls
        bucket["c3_native_calls"] += c3.native_tool_calls
        bucket["base_native_calls"] += base.native_tool_calls

    # Guard the per-task averages against an empty result list.
    n = len(results) or 1

    def _combined(tool):
        return c3_tool_totals.get(tool, 0) + base_tool_totals.get(tool, 0)

    # Top tools ranked by usage
    ranked = sorted(set([*c3_tool_totals, *base_tool_totals]), key=_combined, reverse=True)
    tool_comparison = []
    for tool in ranked[:20]:  # top 20
        in_c3 = c3_tool_totals.get(tool, 0)
        in_base = base_tool_totals.get(tool, 0)
        tool_comparison.append({
            "tool": tool,
            "c3_calls": in_c3,
            "baseline_calls": in_base,
            "is_c3_tool": tool in _C3_TOOLS or tool.startswith("c3_"),
            "delta": in_c3 - in_base,
        })

    summary = {
        "total_c3_tool_calls": total_c3_calls,
        "total_baseline_tool_calls": total_base_calls,
        "avg_c3_calls_per_task": round(total_c3_calls / n, 1),
        "avg_baseline_calls_per_task": round(total_base_calls / n, 1),
        "c3_mcp_calls": total_c3_mcp,
        "c3_native_calls": total_c3_native,
        "baseline_native_calls": total_base_native,
        # Tool diversity score: unique tools per task (higher = more diverse strategy)
        "c3_tool_diversity": round(total_c3_unique / n, 1),
        "baseline_tool_diversity": round(total_base_unique / n, 1),
        "mcp_ratio": round(total_c3_mcp / total_c3_calls * 100, 1) if total_c3_calls else 0,
    }

    return {
        "summary": summary,
        "tool_comparison": tool_comparison,
        "category_breakdown": category_tools,
        "c3_tool_totals": c3_tool_totals,
        "baseline_tool_totals": base_tool_totals,
    }
|
|
1751
|
+
|
|
1752
|
+
|
|
1753
|
+
# ---------------------------------------------------------------------------
|
|
1754
|
+
# Insights Engine
|
|
1755
|
+
# ---------------------------------------------------------------------------
|
|
1756
|
+
|
|
1757
|
+
# Integer rank for each insight severity label; larger numbers go to the
# "warning" and "critical" labels.
# NOTE(review): presumably used to order or filter findings -- the consumer
# is outside this view; confirm before changing the values.
_INSIGHT_SEVERITY = {"strength": 0, "info": 1, "warning": 2, "critical": 3}
|
|
1758
|
+
|
|
1759
|
+
|
|
1760
|
+
def _build_insights(report: dict) -> dict:
|
|
1761
|
+
"""Analyze benchmark results and generate actionable insights."""
|
|
1762
|
+
findings: list[dict] = []
|
|
1763
|
+
sc = report.get("scorecard", {})
|
|
1764
|
+
eff = report.get("efficiency_summary", {})
|
|
1765
|
+
dims = report.get("dimension_breakdown", {})
|
|
1766
|
+
cats = report.get("category_stats", {})
|
|
1767
|
+
ta = report.get("tool_analysis", {}).get("summary", {})
|
|
1768
|
+
results = report.get("results", [])
|
|
1769
|
+
|
|
1770
|
+
# --- Overall performance ---
|
|
1771
|
+
win_rate = sc.get("c3_win_rate", 0)
|
|
1772
|
+
weighted_wr = sc.get("weighted_win_rate", 0)
|
|
1773
|
+
delta = sc.get("avg_score_delta", 0)
|
|
1774
|
+
|
|
1775
|
+
if win_rate >= 75:
|
|
1776
|
+
findings.append({
|
|
1777
|
+
"severity": "strength", "area": "overall",
|
|
1778
|
+
"title": "Strong C3 advantage",
|
|
1779
|
+
"detail": f"C3 wins {win_rate:.0f}% of tasks with an average score delta of {delta:+.3f}.",
|
|
1780
|
+
"action": "C3 MCP tools provide consistent quality improvements across tasks.",
|
|
1781
|
+
})
|
|
1782
|
+
elif win_rate >= 50:
|
|
1783
|
+
findings.append({
|
|
1784
|
+
"severity": "info", "area": "overall",
|
|
1785
|
+
"title": "Moderate C3 advantage",
|
|
1786
|
+
"detail": f"C3 wins {win_rate:.0f}% of tasks. Weighted win rate: {weighted_wr:.0f}%.",
|
|
1787
|
+
"action": "C3 helps on harder tasks. Consider which task categories benefit most.",
|
|
1788
|
+
})
|
|
1789
|
+
elif win_rate > 0:
|
|
1790
|
+
findings.append({
|
|
1791
|
+
"severity": "warning", "area": "overall",
|
|
1792
|
+
"title": "Baseline competitive",
|
|
1793
|
+
"detail": f"C3 only wins {win_rate:.0f}% of tasks (delta: {delta:+.3f}).",
|
|
1794
|
+
"action": "Review category breakdown — C3 may excel in specific areas but not globally.",
|
|
1795
|
+
})
|
|
1796
|
+
|
|
1797
|
+
# Weighted vs unweighted divergence
|
|
1798
|
+
if abs(weighted_wr - win_rate) > 15:
|
|
1799
|
+
if weighted_wr > win_rate:
|
|
1800
|
+
findings.append({
|
|
1801
|
+
"severity": "strength", "area": "difficulty",
|
|
1802
|
+
"title": "C3 excels on harder tasks",
|
|
1803
|
+
"detail": f"Weighted win rate ({weighted_wr:.0f}%) significantly exceeds raw ({win_rate:.0f}%).",
|
|
1804
|
+
"action": "C3 tools provide more value on complex tasks where tool assistance matters most.",
|
|
1805
|
+
})
|
|
1806
|
+
else:
|
|
1807
|
+
findings.append({
|
|
1808
|
+
"severity": "warning", "area": "difficulty",
|
|
1809
|
+
"title": "C3 struggles on harder tasks",
|
|
1810
|
+
"detail": f"Weighted win rate ({weighted_wr:.0f}%) is below raw ({win_rate:.0f}%).",
|
|
1811
|
+
"action": "Investigate hard/expert task results — C3 tools may need optimization for complex queries.",
|
|
1812
|
+
})
|
|
1813
|
+
|
|
1814
|
+
# --- Efficiency insights ---
|
|
1815
|
+
cost_pct = eff.get("cost_saved_pct", 0)
|
|
1816
|
+
token_pct = eff.get("tokens_saved_pct", 0)
|
|
1817
|
+
time_pct = eff.get("time_saved_pct", 0)
|
|
1818
|
+
monthly = eff.get("projected_monthly_cost_saved_usd", 0)
|
|
1819
|
+
|
|
1820
|
+
if cost_pct > 20:
|
|
1821
|
+
findings.append({
|
|
1822
|
+
"severity": "strength", "area": "cost",
|
|
1823
|
+
"title": f"Significant cost reduction ({cost_pct:.0f}%)",
|
|
1824
|
+
"detail": f"C3 saves ${eff.get('total_cost_saved_usd', 0):.4f} per run. Projected monthly: ${monthly:.2f}.",
|
|
1825
|
+
"action": "Cost savings compound at scale — C3 pays for itself quickly.",
|
|
1826
|
+
})
|
|
1827
|
+
elif cost_pct < -20:
|
|
1828
|
+
findings.append({
|
|
1829
|
+
"severity": "warning", "area": "cost",
|
|
1830
|
+
"title": f"C3 increases cost ({abs(cost_pct):.0f}% more)",
|
|
1831
|
+
"detail": "C3 MCP tool calls add token overhead that exceeds native tool efficiency.",
|
|
1832
|
+
"action": "Consider if quality gains justify the cost. Review which C3 tools add most overhead.",
|
|
1833
|
+
})
|
|
1834
|
+
|
|
1835
|
+
if token_pct > 15:
|
|
1836
|
+
findings.append({
|
|
1837
|
+
"severity": "strength", "area": "tokens",
|
|
1838
|
+
"title": f"Token efficient ({token_pct:.0f}% fewer tokens)",
|
|
1839
|
+
"detail": f"C3 uses {eff.get('total_tokens_saved', 0):,} fewer tokens across all tasks.",
|
|
1840
|
+
"action": "C3 compress/read tools reduce context window usage effectively.",
|
|
1841
|
+
})
|
|
1842
|
+
elif token_pct < -15:
|
|
1843
|
+
findings.append({
|
|
1844
|
+
"severity": "warning", "area": "tokens",
|
|
1845
|
+
"title": f"Higher token usage ({abs(token_pct):.0f}% more)",
|
|
1846
|
+
"detail": "C3 tool calls introduce additional tokens from MCP overhead.",
|
|
1847
|
+
"action": "Check if c3_compress is being used — it should reduce token consumption.",
|
|
1848
|
+
})
|
|
1849
|
+
|
|
1850
|
+
if time_pct > 20:
|
|
1851
|
+
findings.append({
|
|
1852
|
+
"severity": "strength", "area": "speed",
|
|
1853
|
+
"title": f"Faster responses ({time_pct:.0f}% time saved)",
|
|
1854
|
+
"detail": f"C3 saves {eff.get('total_time_saved_s', 0):.0f}s total across all tasks.",
|
|
1855
|
+
"action": "C3 tools help the AI find answers faster with fewer tool-use turns.",
|
|
1856
|
+
})
|
|
1857
|
+
elif time_pct < -20:
|
|
1858
|
+
findings.append({
|
|
1859
|
+
"severity": "warning", "area": "speed",
|
|
1860
|
+
"title": f"Slower responses ({abs(time_pct):.0f}% slower)",
|
|
1861
|
+
"detail": "MCP tool round-trips add latency.",
|
|
1862
|
+
"action": "C3 tool startup overhead may dominate on simple tasks. Focus C3 on complex queries.",
|
|
1863
|
+
})
|
|
1864
|
+
|
|
1865
|
+
# --- Dimension analysis ---
|
|
1866
|
+
for dim_name, dim_data in dims.items():
|
|
1867
|
+
d = dim_data.get("delta", 0)
|
|
1868
|
+
label = dim_name.replace("_score", "").replace("_", " ").title()
|
|
1869
|
+
if d > 0.1:
|
|
1870
|
+
findings.append({
|
|
1871
|
+
"severity": "strength", "area": "dimension",
|
|
1872
|
+
"title": f"Strong in {label} (+{d:.3f})",
|
|
1873
|
+
"detail": f"C3: {dim_data['avg_c3']:.3f} vs Baseline: {dim_data['avg_baseline']:.3f}.",
|
|
1874
|
+
"action": f"C3 tools enhance {label.lower()} — a clear differentiator.",
|
|
1875
|
+
})
|
|
1876
|
+
elif d < -0.1:
|
|
1877
|
+
findings.append({
|
|
1878
|
+
"severity": "warning", "area": "dimension",
|
|
1879
|
+
"title": f"Weak in {label} ({d:+.3f})",
|
|
1880
|
+
"detail": f"C3: {dim_data['avg_c3']:.3f} vs Baseline: {dim_data['avg_baseline']:.3f}.",
|
|
1881
|
+
"action": f"Investigate why C3 underperforms on {label.lower()}. May need tool improvements.",
|
|
1882
|
+
})
|
|
1883
|
+
|
|
1884
|
+
# --- Category analysis ---
|
|
1885
|
+
weak_cats = []
|
|
1886
|
+
strong_cats = []
|
|
1887
|
+
for cat_name, cat_data in cats.items():
|
|
1888
|
+
wr = cat_data.get("win_rate_c3", 50)
|
|
1889
|
+
cat_label = cat_name.replace("_", " ").title()
|
|
1890
|
+
if wr >= 80:
|
|
1891
|
+
strong_cats.append(cat_label)
|
|
1892
|
+
elif wr <= 20:
|
|
1893
|
+
weak_cats.append((cat_label, cat_data.get("avg_score_delta", 0)))
|
|
1894
|
+
|
|
1895
|
+
if strong_cats:
|
|
1896
|
+
findings.append({
|
|
1897
|
+
"severity": "strength", "area": "category",
|
|
1898
|
+
"title": f"Dominates in {', '.join(strong_cats)}",
|
|
1899
|
+
"detail": "C3 wins 80%+ of tasks in these categories.",
|
|
1900
|
+
"action": "These are C3's sweet spots — consider marketing/documentation around these strengths.",
|
|
1901
|
+
})
|
|
1902
|
+
if weak_cats:
|
|
1903
|
+
cats_str = ", ".join(f"{c} ({d:+.3f})" for c, d in weak_cats)
|
|
1904
|
+
findings.append({
|
|
1905
|
+
"severity": "critical" if len(weak_cats) > 2 else "warning",
|
|
1906
|
+
"area": "category",
|
|
1907
|
+
"title": f"Weak categories: {', '.join(c for c, _ in weak_cats)}",
|
|
1908
|
+
"detail": f"C3 wins 20% or less: {cats_str}.",
|
|
1909
|
+
"action": "Priority improvement areas. Analyze response comparisons for these categories.",
|
|
1910
|
+
})
|
|
1911
|
+
|
|
1912
|
+
# --- Tool usage insights ---
|
|
1913
|
+
mcp_ratio = ta.get("mcp_ratio", 0)
|
|
1914
|
+
c3_diversity = ta.get("c3_tool_diversity", 0)
|
|
1915
|
+
base_diversity = ta.get("baseline_tool_diversity", 0)
|
|
1916
|
+
|
|
1917
|
+
if mcp_ratio > 0 and mcp_ratio < 20:
|
|
1918
|
+
findings.append({
|
|
1919
|
+
"severity": "warning", "area": "tools",
|
|
1920
|
+
"title": f"Low MCP tool utilization ({mcp_ratio:.0f}%)",
|
|
1921
|
+
"detail": "C3 MCP tools are available but rarely used by the AI.",
|
|
1922
|
+
"action": "The AI may not know about C3 tools. Check CLAUDE.md instructions and tool descriptions.",
|
|
1923
|
+
})
|
|
1924
|
+
elif mcp_ratio >= 50:
|
|
1925
|
+
findings.append({
|
|
1926
|
+
"severity": "strength", "area": "tools",
|
|
1927
|
+
"title": f"Heavy MCP utilization ({mcp_ratio:.0f}%)",
|
|
1928
|
+
"detail": "The AI actively leverages C3 MCP tools over native alternatives.",
|
|
1929
|
+
"action": "Good adoption. C3 tools are being discovered and preferred.",
|
|
1930
|
+
})
|
|
1931
|
+
|
|
1932
|
+
# --- Tool adoption rate ---
|
|
1933
|
+
adoption = report.get("tool_adoption", {})
|
|
1934
|
+
adoption_rate = adoption.get("adoption_rate", 0)
|
|
1935
|
+
if adoption_rate < 50:
|
|
1936
|
+
findings.append({
|
|
1937
|
+
"severity": "warning", "area": "adoption",
|
|
1938
|
+
"title": f"Low C3 tool adoption ({adoption_rate:.0f}% of tasks)",
|
|
1939
|
+
"detail": f"Only {adoption.get('tasks_using_mcp', 0)}/{adoption.get('total_tasks', 0)} C3-mode runs used any MCP tools.",
|
|
1940
|
+
"action": "Strengthen prompt instructions or check if CLAUDE.md C3 mandate is being loaded.",
|
|
1941
|
+
})
|
|
1942
|
+
elif adoption_rate >= 80:
|
|
1943
|
+
findings.append({
|
|
1944
|
+
"severity": "strength", "area": "adoption",
|
|
1945
|
+
"title": f"High C3 tool adoption ({adoption_rate:.0f}%)",
|
|
1946
|
+
"detail": f"{adoption.get('tasks_using_mcp', 0)}/{adoption.get('total_tasks', 0)} tasks used C3 MCP tools. {adoption.get('unique_mcp_tool_count', 0)} unique tools.",
|
|
1947
|
+
"action": "Good adoption across tasks.",
|
|
1948
|
+
})
|
|
1949
|
+
|
|
1950
|
+
if c3_diversity > base_diversity + 1:
|
|
1951
|
+
findings.append({
|
|
1952
|
+
"severity": "info", "area": "tools",
|
|
1953
|
+
"title": "C3 enables broader tool strategy",
|
|
1954
|
+
"detail": f"C3 mode uses {c3_diversity:.1f} unique tools/task vs {base_diversity:.1f} baseline.",
|
|
1955
|
+
"action": "More diverse tool usage suggests C3 provides richer exploration capabilities.",
|
|
1956
|
+
})
|
|
1957
|
+
|
|
1958
|
+
# --- Context pressure warnings ---
|
|
1959
|
+
# Flag any task where peak estimated context fill exceeded 70 % in either run.
|
|
1960
|
+
_PRESSURE_WARN = 70.0
|
|
1961
|
+
high_pressure = []
|
|
1962
|
+
for r in results:
|
|
1963
|
+
for label, resp_key in (("c3", "c3_response"), ("baseline", "baseline_response")):
|
|
1964
|
+
resp = r.get(resp_key, {})
|
|
1965
|
+
pct = resp.get("context_pressure_pct", 0)
|
|
1966
|
+
if pct >= _PRESSURE_WARN:
|
|
1967
|
+
high_pressure.append((r.get("task_id", "?"), label, pct,
|
|
1968
|
+
resp.get("num_turns", 0),
|
|
1969
|
+
resp.get("total_tokens", 0)))
|
|
1970
|
+
if high_pressure:
|
|
1971
|
+
worst = max(high_pressure, key=lambda x: x[2])
|
|
1972
|
+
task_list = ", ".join(f"{t} ({l}, {p:.0f}%)" for t, l, p, *_ in high_pressure)
|
|
1973
|
+
findings.append({
|
|
1974
|
+
"severity": "warning" if worst[2] < 90 else "critical",
|
|
1975
|
+
"area": "context_pressure",
|
|
1976
|
+
"title": f"{len(high_pressure)} task run(s) under high context pressure",
|
|
1977
|
+
"detail": (
|
|
1978
|
+
f"Peak context fill ≥{_PRESSURE_WARN:.0f}% in: {task_list}. "
|
|
1979
|
+
f"Worst: '{worst[0]}' ({worst[1]}) at ~{worst[2]:.0f}% over {worst[3]} turns "
|
|
1980
|
+
f"({worst[4]:,} cumulative tokens)."
|
|
1981
|
+
),
|
|
1982
|
+
"action": (
|
|
1983
|
+
"High context pressure degrades output quality in later turns. "
|
|
1984
|
+
"Consider splitting long tasks, enabling C3 snapshots mid-task, "
|
|
1985
|
+
"or adding a max-turn limit to the benchmark runner."
|
|
1986
|
+
),
|
|
1987
|
+
})
|
|
1988
|
+
|
|
1989
|
+
# --- Token count reliability ---
|
|
1990
|
+
partial_runs = []
|
|
1991
|
+
for r in results:
|
|
1992
|
+
for label, resp_key in (("c3", "c3_response"), ("baseline", "baseline_response")):
|
|
1993
|
+
resp = r.get(resp_key, {})
|
|
1994
|
+
if resp.get("token_count_source") == "partial":
|
|
1995
|
+
reported = resp.get("cost_usd", 0)
|
|
1996
|
+
computed = resp.get("computed_cost_usd") or 0
|
|
1997
|
+
partial_runs.append((r.get("task_id", "?"), label, reported, computed))
|
|
1998
|
+
if partial_runs:
|
|
1999
|
+
run_list = "; ".join(
|
|
2000
|
+
f"{t} ({l}): reported ${rep:.4f} vs computed ${cmp:.4f}"
|
|
2001
|
+
for t, l, rep, cmp in partial_runs
|
|
2002
|
+
)
|
|
2003
|
+
findings.append({
|
|
2004
|
+
"severity": "warning",
|
|
2005
|
+
"area": "data_quality",
|
|
2006
|
+
"title": f"Token counts unreliable for {len(partial_runs)} run(s)",
|
|
2007
|
+
"detail": (
|
|
2008
|
+
f"cost_usd (cumulative) and token counts (partial) disagree by >$0.01. "
|
|
2009
|
+
f"Affected: {run_list}. Likely cause: 'usage' block reflects last turn only "
|
|
2010
|
+
f"while total_cost_usd is session-cumulative. Token savings % may be understated."
|
|
2011
|
+
),
|
|
2012
|
+
"action": (
|
|
2013
|
+
"Token savings comparisons for affected baseline runs are undercounted. "
|
|
2014
|
+
"True C3 token savings are likely higher than reported. "
|
|
2015
|
+
"Fix: ensure 'modelUsage' block is present in Claude JSON output, "
|
|
2016
|
+
"or sum per-turn usage blocks in the benchmark runner."
|
|
2017
|
+
),
|
|
2018
|
+
})
|
|
2019
|
+
|
|
2020
|
+
# --- Per-result outlier detection ---
|
|
2021
|
+
if results:
|
|
2022
|
+
biggest_c3_win = max(results, key=lambda r: r.get("score_delta", 0))
|
|
2023
|
+
biggest_base_win = min(results, key=lambda r: r.get("score_delta", 0))
|
|
2024
|
+
|
|
2025
|
+
if biggest_c3_win.get("score_delta", 0) > 0.2:
|
|
2026
|
+
findings.append({
|
|
2027
|
+
"severity": "info", "area": "outlier",
|
|
2028
|
+
"title": f"Biggest C3 win: {biggest_c3_win['task_id']}",
|
|
2029
|
+
"detail": f"Delta: {biggest_c3_win['score_delta']:+.3f} ({biggest_c3_win['task_category']}).",
|
|
2030
|
+
"action": "Expand response comparison to understand what C3 did differently.",
|
|
2031
|
+
})
|
|
2032
|
+
if biggest_base_win.get("score_delta", 0) < -0.2:
|
|
2033
|
+
findings.append({
|
|
2034
|
+
"severity": "warning", "area": "outlier",
|
|
2035
|
+
"title": f"Biggest C3 loss: {biggest_base_win['task_id']}",
|
|
2036
|
+
"detail": f"Delta: {biggest_base_win['score_delta']:+.3f} ({biggest_base_win['task_category']}).",
|
|
2037
|
+
"action": "Review this task — C3 tools may have misled the AI or added noise.",
|
|
2038
|
+
})
|
|
2039
|
+
|
|
2040
|
+
# Sort: critical first, then warning, info, strength
|
|
2041
|
+
findings.sort(key=lambda f: _INSIGHT_SEVERITY.get(f.get("severity", "info"), 1), reverse=True)
|
|
2042
|
+
|
|
2043
|
+
# Summary verdict
|
|
2044
|
+
n_critical = sum(1 for f in findings if f["severity"] == "critical")
|
|
2045
|
+
n_warnings = sum(1 for f in findings if f["severity"] == "warning")
|
|
2046
|
+
n_strengths = sum(1 for f in findings if f["severity"] == "strength")
|
|
2047
|
+
|
|
2048
|
+
if n_critical > 0:
|
|
2049
|
+
verdict = "C3 has critical weak spots that need attention before production use."
|
|
2050
|
+
elif n_warnings > n_strengths:
|
|
2051
|
+
verdict = "C3 shows mixed results. Focus on weak categories and dimensions."
|
|
2052
|
+
elif n_strengths > 0 and n_warnings == 0:
|
|
2053
|
+
verdict = "C3 provides clear, consistent improvements across the board."
|
|
2054
|
+
elif n_strengths > n_warnings:
|
|
2055
|
+
verdict = "C3 is net positive with some areas for improvement."
|
|
2056
|
+
else:
|
|
2057
|
+
verdict = "Results are inconclusive. Consider running more tasks or harder categories."
|
|
2058
|
+
|
|
2059
|
+
return {
|
|
2060
|
+
"verdict": verdict,
|
|
2061
|
+
"findings": findings,
|
|
2062
|
+
"counts": {
|
|
2063
|
+
"critical": n_critical,
|
|
2064
|
+
"warnings": n_warnings,
|
|
2065
|
+
"strengths": n_strengths,
|
|
2066
|
+
"info": sum(1 for f in findings if f["severity"] == "info"),
|
|
2067
|
+
},
|
|
2068
|
+
}
|
|
2069
|
+
|
|
2070
|
+
|
|
2071
|
+
# ---------------------------------------------------------------------------
|
|
2072
|
+
# HTML Report
|
|
2073
|
+
# ---------------------------------------------------------------------------
|
|
2074
|
+
|
|
2075
|
+
def render_e2e_html(report: dict) -> str:
|
|
2076
|
+
"""Render a comprehensive visual HTML report for the E2E benchmark."""
|
|
2077
|
+
sc = report["scorecard"]
|
|
2078
|
+
eff = report.get("efficiency_summary", {})
|
|
2079
|
+
dims = report.get("dimension_breakdown", {})
|
|
2080
|
+
providers = report.get("provider_stats", {})
|
|
2081
|
+
categories = report.get("category_stats", {})
|
|
2082
|
+
results = report.get("results", [])
|
|
2083
|
+
timestamp = report.get("timestamp", "")
|
|
2084
|
+
tool_analysis = report.get("tool_analysis", {})
|
|
2085
|
+
insights = report.get("insights", {})
|
|
2086
|
+
trends = report.get("trends", {})
|
|
2087
|
+
|
|
2088
|
+
provider_names = list(providers.keys())
|
|
2089
|
+
c3_scores = [providers[p]["avg_score_c3"] for p in provider_names]
|
|
2090
|
+
base_scores = [providers[p]["avg_score_baseline"] for p in provider_names]
|
|
2091
|
+
win_rates = [providers[p]["win_rate_c3"] for p in provider_names]
|
|
2092
|
+
|
|
2093
|
+
cat_names = list(categories.keys())
|
|
2094
|
+
cat_deltas = [categories[c]["avg_score_delta"] for c in cat_names]
|
|
2095
|
+
cat_c3 = [categories[c].get("avg_score_c3", 0) for c in cat_names]
|
|
2096
|
+
cat_base = [categories[c].get("avg_score_baseline", 0) for c in cat_names]
|
|
2097
|
+
|
|
2098
|
+
dim_names = list(dims.keys())
|
|
2099
|
+
dim_c3 = [dims[d]["avg_c3"] for d in dim_names]
|
|
2100
|
+
dim_base = [dims[d]["avg_baseline"] for d in dim_names]
|
|
2101
|
+
dim_labels = [d.replace("_score", "").replace("_", " ").title() for d in dim_names]
|
|
2102
|
+
|
|
2103
|
+
# Efficiency cards
|
|
2104
|
+
eff_time_saved = eff.get("total_time_saved_s", 0)
|
|
2105
|
+
eff_cost_saved = eff.get("total_cost_saved_usd", 0)
|
|
2106
|
+
eff_tokens_saved = eff.get("total_tokens_saved", 0)
|
|
2107
|
+
|
|
2108
|
+
# Tool usage data
|
|
2109
|
+
ta_summary = tool_analysis.get("summary", {})
|
|
2110
|
+
ta_comparison = tool_analysis.get("tool_comparison", [])
|
|
2111
|
+
ta_categories = tool_analysis.get("category_breakdown", {})
|
|
2112
|
+
|
|
2113
|
+
# Build tool comparison table rows
|
|
2114
|
+
tool_rows = ""
|
|
2115
|
+
for tc in ta_comparison:
|
|
2116
|
+
is_c3 = tc.get("is_c3_tool", False)
|
|
2117
|
+
badge = '<span class="c3-badge">C3</span>' if is_c3 else ""
|
|
2118
|
+
delta = tc.get("delta", 0)
|
|
2119
|
+
delta_class = "positive" if delta > 0 else ("negative" if delta < 0 else "")
|
|
2120
|
+
tool_rows += f"""
|
|
2121
|
+
<tr>
|
|
2122
|
+
<td>{tc['tool']} {badge}</td>
|
|
2123
|
+
<td>{tc['c3_calls']}</td>
|
|
2124
|
+
<td>{tc['baseline_calls']}</td>
|
|
2125
|
+
<td class="{delta_class}">{delta:+d}</td>
|
|
2126
|
+
</tr>"""
|
|
2127
|
+
|
|
2128
|
+
# Tool category chart data
|
|
2129
|
+
ta_cat_names = list(ta_categories.keys())
|
|
2130
|
+
ta_cat_c3_mcp = [ta_categories[c].get("c3_mcp_calls", 0) for c in ta_cat_names]
|
|
2131
|
+
ta_cat_c3_native = [ta_categories[c].get("c3_native_calls", 0) for c in ta_cat_names]
|
|
2132
|
+
ta_cat_base_native = [ta_categories[c].get("base_native_calls", 0) for c in ta_cat_names]
|
|
2133
|
+
|
|
2134
|
+
# Insights HTML
|
|
2135
|
+
insights_html = ""
|
|
2136
|
+
for finding in insights.get("findings", []):
|
|
2137
|
+
sev = finding.get("severity", "info")
|
|
2138
|
+
insights_html += f"""
|
|
2139
|
+
<div class="insight {sev}">
|
|
2140
|
+
<div class="insight-title">{_html_escape(finding.get('title', ''))}</div>
|
|
2141
|
+
<div class="insight-detail">{_html_escape(finding.get('detail', ''))}</div>
|
|
2142
|
+
<div class="insight-action">{_html_escape(finding.get('action', ''))}</div>
|
|
2143
|
+
</div>"""
|
|
2144
|
+
|
|
2145
|
+
verdict = insights.get("verdict", "")
|
|
2146
|
+
ic = insights.get("counts", {})
|
|
2147
|
+
verdict_counts = f"{ic.get('strengths', 0)} strengths, {ic.get('warnings', 0)} warnings, {ic.get('critical', 0)} critical"
|
|
2148
|
+
|
|
2149
|
+
# Trend data
|
|
2150
|
+
has_trends = trends.get("available", False)
|
|
2151
|
+
sl = trends.get("since_last", {})
|
|
2152
|
+
sparklines = trends.get("sparklines", {})
|
|
2153
|
+
cat_trends = trends.get("category_trends", {})
|
|
2154
|
+
|
|
2155
|
+
# Since-last-run delta badges for scorecard
|
|
2156
|
+
def _delta_badge(val, fmt="+.1f", suffix="", invert=False):
    """Return the HTML for a since-last-run delta badge, or "" if none applies.

    Args:
        val: Numeric change relative to the previous run.
        fmt: Format spec applied to ``val`` (e.g. ``"+.1f"``).
        suffix: Text appended directly after the formatted value.
        invert: When true, a negative ``val`` counts as the improvement
            (lower is better); otherwise a positive ``val`` does.

    Returns:
        An empty string when ``has_trends`` (from the enclosing scope) is
        falsy or ``val`` equals zero; otherwise a ``<div>`` carrying the
        ``delta-badge`` class plus ``positive`` or ``negative``.
    """
    # No trend history means there is nothing to compare against.
    if not has_trends:
        return ""
    # An unchanged metric gets no badge at all.
    if val == 0:
        return ""
    if invert:
        improved = val < 0
    else:
        improved = val > 0
    css_class = "positive" if improved else "negative"
    rendered = format(val, fmt)
    return f'<div class="delta-badge {css_class}">{rendered}{suffix}</div>'
|
|
2163
|
+
|
|
2164
|
+
wr_delta_badge = _delta_badge(sl.get("win_rate_delta", 0), "+.1f", "pp")
|
|
2165
|
+
delta_delta_badge = _delta_badge(sl.get("score_delta_delta", 0), "+.3f")
|
|
2166
|
+
c3_delta_badge = _delta_badge(sl.get("avg_c3_delta", 0), "+.3f")
|
|
2167
|
+
cost_delta_badge = _delta_badge(sl.get("cost_saved_delta", 0), "+.4f", "")
|
|
2168
|
+
|
|
2169
|
+
# Category trend arrows
|
|
2170
|
+
cat_trend_arrows = {}
|
|
2171
|
+
for cat_name, ct in cat_trends.items():
|
|
2172
|
+
d = ct.get("score_delta_delta", 0)
|
|
2173
|
+
if d > 0.01:
|
|
2174
|
+
cat_trend_arrows[cat_name] = '<span class="trend-up">▲</span>'
|
|
2175
|
+
elif d < -0.01:
|
|
2176
|
+
cat_trend_arrows[cat_name] = '<span class="trend-down">▼</span>'
|
|
2177
|
+
else:
|
|
2178
|
+
cat_trend_arrows[cat_name] = '<span class="trend-flat">▶</span>'
|
|
2179
|
+
|
|
2180
|
+
# Pre-build trend HTML section (avoids nested f-string issues on Python <3.12)
|
|
2181
|
+
trend_section_html = ""
|
|
2182
|
+
trend_charts_js = ""
|
|
2183
|
+
if has_trends:
|
|
2184
|
+
run_count = trends.get("run_count", 0)
|
|
2185
|
+
sl_ts = sl.get("prev_timestamp", "?")
|
|
2186
|
+
sl_wr = sl.get("win_rate_delta", 0)
|
|
2187
|
+
sl_sd = sl.get("score_delta_delta", 0)
|
|
2188
|
+
sl_cs = sl.get("cost_saved_delta", 0)
|
|
2189
|
+
sl_ts_saved = sl.get("token_saved_delta", 0)
|
|
2190
|
+
trend_section_html = (
|
|
2191
|
+
f'<h2 class="section-title">Trend Analysis ({run_count} runs)</h2>\n'
|
|
2192
|
+
f'<div class="since-last">\n'
|
|
2193
|
+
f' Since last run ({sl_ts}):\n'
|
|
2194
|
+
f' Win rate {sl_wr:+.1f}pp |\n'
|
|
2195
|
+
f' Score delta {sl_sd:+.3f} |\n'
|
|
2196
|
+
f' Cost saved {sl_cs:+.4f} USD |\n'
|
|
2197
|
+
f' Tokens saved {sl_ts_saved:+,d} |\n'
|
|
2198
|
+
f' MCP ratio {sl.get("mcp_ratio_delta", 0):+.1f}pp\n'
|
|
2199
|
+
f'</div>\n'
|
|
2200
|
+
f'<div class="trend-section">\n'
|
|
2201
|
+
f' <div class="sparkline-grid">\n'
|
|
2202
|
+
f' <div class="card"><h3>Win Rate Over Time</h3><canvas id="trendWinRate"></canvas></div>\n'
|
|
2203
|
+
f' <div class="card"><h3>Score Delta Over Time</h3><canvas id="trendDelta"></canvas></div>\n'
|
|
2204
|
+
f' <div class="card"><h3>Avg Scores Over Time</h3><canvas id="trendScores"></canvas></div>\n'
|
|
2205
|
+
f' <div class="card"><h3>Cost Per Run Over Time</h3><canvas id="trendCost"></canvas></div>\n'
|
|
2206
|
+
f' <div class="card"><h3>MCP Ratio Over Time</h3><canvas id="trendMcpRatio"></canvas></div>\n'
|
|
2207
|
+
f' </div>\n'
|
|
2208
|
+
f'</div>\n'
|
|
2209
|
+
)
|
|
2210
|
+
sp_ts = json.dumps(sparklines.get("timestamps", []))
|
|
2211
|
+
sp_wr = json.dumps(sparklines.get("win_rates", []))
|
|
2212
|
+
sp_ad = json.dumps(sparklines.get("avg_deltas", []))
|
|
2213
|
+
sp_c3 = json.dumps(sparklines.get("avg_c3_scores", []))
|
|
2214
|
+
sp_bs = json.dumps(sparklines.get("avg_base_scores", []))
|
|
2215
|
+
sp_cc = json.dumps(sparklines.get("costs_c3", []))
|
|
2216
|
+
sp_cb = json.dumps(sparklines.get("costs_base", []))
|
|
2217
|
+
sp_mr = json.dumps(sparklines.get("mcp_ratios", []))
|
|
2218
|
+
trend_charts_js = (
|
|
2219
|
+
f"const trendLabels = {sp_ts}.map(t => t ? t.slice(5,16) : '');\n"
|
|
2220
|
+
f"const sparkOpts = {{ ...chartOpts, plugins:{{ legend:{{ display:false }} }}, "
|
|
2221
|
+
f"scales:{{ x:{{ display:false }}, y:{{ grid:{{ color:'#1e1e2e' }} }} }}, "
|
|
2222
|
+
f"elements:{{ point:{{ radius:2 }}, line:{{ tension:0.3 }} }} }};\n\n"
|
|
2223
|
+
f"new Chart(document.getElementById('trendWinRate'), {{\n"
|
|
2224
|
+
f" type:'line', data:{{ labels:trendLabels,\n"
|
|
2225
|
+
f" datasets:[{{ data:{sp_wr}, borderColor:C3, borderWidth:2, fill:false }}]\n"
|
|
2226
|
+
f" }}, options:{{ ...sparkOpts, scales:{{ ...sparkOpts.scales, y:{{ ...sparkOpts.scales.y, min:0, max:100 }} }} }}\n"
|
|
2227
|
+
f"}});\n\n"
|
|
2228
|
+
f"new Chart(document.getElementById('trendDelta'), {{\n"
|
|
2229
|
+
f" type:'line', data:{{ labels:trendLabels,\n"
|
|
2230
|
+
f" datasets:[{{ data:{sp_ad}, borderColor:ACCENT, borderWidth:2, fill:true, backgroundColor:ACCENT+'22' }}]\n"
|
|
2231
|
+
f" }}, options:sparkOpts\n"
|
|
2232
|
+
f"}});\n\n"
|
|
2233
|
+
f"new Chart(document.getElementById('trendScores'), {{\n"
|
|
2234
|
+
f" type:'line', data:{{ labels:trendLabels,\n"
|
|
2235
|
+
f" datasets:[\n"
|
|
2236
|
+
f" {{ label:'C3', data:{sp_c3}, borderColor:C3, borderWidth:2, fill:false }},\n"
|
|
2237
|
+
f" {{ label:'Baseline', data:{sp_bs}, borderColor:BASE, borderWidth:2, fill:false }}\n"
|
|
2238
|
+
f" ]\n"
|
|
2239
|
+
f" }}, options:{{ ...sparkOpts, plugins:{{ legend:{{ display:true, labels:{{ color:'#888' }} }} }} }}\n"
|
|
2240
|
+
f"}});\n\n"
|
|
2241
|
+
f"new Chart(document.getElementById('trendCost'), {{\n"
|
|
2242
|
+
f" type:'line', data:{{ labels:trendLabels,\n"
|
|
2243
|
+
f" datasets:[\n"
|
|
2244
|
+
f" {{ label:'C3', data:{sp_cc}, borderColor:C3, borderWidth:2, fill:false }},\n"
|
|
2245
|
+
f" {{ label:'Baseline', data:{sp_cb}, borderColor:BASE, borderWidth:2, fill:false }}\n"
|
|
2246
|
+
f" ]\n"
|
|
2247
|
+
f" }}, options:{{ ...sparkOpts, plugins:{{ legend:{{ display:true, labels:{{ color:'#888' }} }} }} }}\n"
|
|
2248
|
+
f"}});\n\n"
|
|
2249
|
+
f"new Chart(document.getElementById('trendMcpRatio'), {{\n"
|
|
2250
|
+
f" type:'line', data:{{ labels:trendLabels,\n"
|
|
2251
|
+
f" datasets:[{{ data:{sp_mr}, borderColor:ACCENT, borderWidth:2, fill:true, backgroundColor:ACCENT+'22' }}]\n"
|
|
2252
|
+
f" }}, options:{{ ...sparkOpts, scales:{{ ...sparkOpts.scales, y:{{ ...sparkOpts.scales.y, min:0, max:100 }} }} }}\n"
|
|
2253
|
+
f"}});\n"
|
|
2254
|
+
)
|
|
2255
|
+
|
|
2256
|
+
# Results table
|
|
2257
|
+
result_rows = ""
|
|
2258
|
+
for r in results:
|
|
2259
|
+
c3_s = r["c3_score"]["combined_score"]
|
|
2260
|
+
base_s = r["baseline_score"]["combined_score"]
|
|
2261
|
+
delta = r["score_delta"]
|
|
2262
|
+
winner = "C3" if r["c3_wins"] else "Baseline"
|
|
2263
|
+
winner_class = "c3-win" if r["c3_wins"] else "base-win"
|
|
2264
|
+
e = r.get("efficiency", {})
|
|
2265
|
+
time_saved = e.get("time_saved_ms", 0) / 1000
|
|
2266
|
+
cost_saved = e.get("cost_saved_usd", 0)
|
|
2267
|
+
|
|
2268
|
+
c3_tu = r.get("c3_response", {}).get("tool_usage", {})
|
|
2269
|
+
base_tu = r.get("baseline_response", {}).get("tool_usage", {})
|
|
2270
|
+
c3_tools = c3_tu.get("total_tool_calls", 0)
|
|
2271
|
+
base_tools = base_tu.get("total_tool_calls", 0)
|
|
2272
|
+
|
|
2273
|
+
result_rows += f"""
|
|
2274
|
+
<tr class="{winner_class}">
|
|
2275
|
+
<td>{r['provider']}</td>
|
|
2276
|
+
<td>{r['task_id']}</td>
|
|
2277
|
+
<td>{r['task_category']}</td>
|
|
2278
|
+
<td><span class="diff-badge">{r.get('task_difficulty','?')}</span></td>
|
|
2279
|
+
<td>{c3_s:.3f}</td>
|
|
2280
|
+
<td>{base_s:.3f}</td>
|
|
2281
|
+
<td>{delta:+.3f}</td>
|
|
2282
|
+
<td>{winner}</td>
|
|
2283
|
+
<td>{time_saved:+.1f}s</td>
|
|
2284
|
+
<td>${cost_saved:+.3f}</td>
|
|
2285
|
+
<td>{c3_tools}/{base_tools}</td>
|
|
2286
|
+
</tr>"""
|
|
2287
|
+
|
|
2288
|
+
# Provider detail cards
|
|
2289
|
+
provider_cards = ""
|
|
2290
|
+
for pname, pdata in providers.items():
|
|
2291
|
+
provider_cards += f"""
|
|
2292
|
+
<div class="card provider-card">
|
|
2293
|
+
<h3>{pname.title()}</h3>
|
|
2294
|
+
<div class="stat-row"><span class="label">Model</span><span class="value">{pdata['model']}</span></div>
|
|
2295
|
+
<div class="stat-row"><span class="label">Win Rate</span><span class="value highlight">{pdata['win_rate_c3']:.1f}%</span></div>
|
|
2296
|
+
<div class="stat-row"><span class="label">Weighted Win Rate</span><span class="value">{pdata.get('weighted_win_rate_c3', 0):.1f}%</span></div>
|
|
2297
|
+
<div class="stat-row"><span class="label">Avg Score (C3/Base)</span><span class="value">{pdata['avg_score_c3']:.3f} / {pdata['avg_score_baseline']:.3f}</span></div>
|
|
2298
|
+
<div class="stat-row"><span class="label">Score Delta</span><span class="value {'positive' if pdata['avg_score_delta'] >= 0 else 'negative'}">{pdata['avg_score_delta']:+.3f}</span></div>
|
|
2299
|
+
<div class="stat-row"><span class="label">Tokens (C3/Base)</span><span class="value">{pdata['total_tokens_c3']:,} / {pdata['total_tokens_baseline']:,}</span></div>
|
|
2300
|
+
<div class="stat-row"><span class="label">Cost (C3/Base)</span><span class="value">${pdata['total_cost_c3_usd']:.4f} / ${pdata['total_cost_baseline_usd']:.4f}</span></div>
|
|
2301
|
+
<div class="stat-row"><span class="label">Avg Latency (C3/Base)</span><span class="value">{pdata['avg_latency_c3_ms']/1000:.1f}s / {pdata['avg_latency_baseline_ms']/1000:.1f}s</span></div>
|
|
2302
|
+
</div>"""
|
|
2303
|
+
|
|
2304
|
+
# Response comparison (expandable details)
|
|
2305
|
+
comparison_html = ""
|
|
2306
|
+
for r in results:
|
|
2307
|
+
c3_text = r["c3_response"].get("response_text", "")
|
|
2308
|
+
base_text = r["baseline_response"].get("response_text", "")
|
|
2309
|
+
c3_s = r["c3_score"]["combined_score"]
|
|
2310
|
+
base_s = r["baseline_score"]["combined_score"]
|
|
2311
|
+
delta = r["score_delta"]
|
|
2312
|
+
tag = "c3-win" if r["c3_wins"] else "base-win"
|
|
2313
|
+
if c3_text or base_text:
|
|
2314
|
+
# Truncate for display
|
|
2315
|
+
c3_display = (c3_text[:2000] + "...") if len(c3_text) > 2000 else c3_text
|
|
2316
|
+
base_display = (base_text[:2000] + "...") if len(base_text) > 2000 else base_text
|
|
2317
|
+
e = r.get("efficiency", {})
|
|
2318
|
+
comparison_html += f"""
|
|
2319
|
+
<details class="comparison-item">
|
|
2320
|
+
<summary class="{tag}">
|
|
2321
|
+
<strong>{r['task_id']}</strong> ({r['provider']}) —
|
|
2322
|
+
C3: {c3_s:.3f} vs Base: {base_s:.3f} ({delta:+.3f})
|
|
2323
|
+
| Time: {e.get('time_saved_ms',0)/1000:+.1f}s | Cost: ${e.get('cost_saved_usd',0):+.3f}
|
|
2324
|
+
</summary>
|
|
2325
|
+
<div class="comparison-grid">
|
|
2326
|
+
<div class="comparison-col">
|
|
2327
|
+
<h4 style="color:var(--c3)">C3 Response ({r['c3_response']['latency_ms']/1000:.1f}s, ${r['c3_response']['cost_usd']:.4f})</h4>
|
|
2328
|
+
<pre>{_html_escape(c3_display)}</pre>
|
|
2329
|
+
<div class="score-details">
|
|
2330
|
+
Keyword: {r['c3_score']['keyword_score']:.2f} |
|
|
2331
|
+
Structural: {r['c3_score']['structural_score']:.2f} |
|
|
2332
|
+
Files: {r['c3_score']['file_mention_score']:.2f} |
|
|
2333
|
+
Factual: {r['c3_score']['factual_score']:.2f} |
|
|
2334
|
+
Complete: {r['c3_score']['completeness_score']:.2f}
|
|
2335
|
+
</div>
|
|
2336
|
+
</div>
|
|
2337
|
+
<div class="comparison-col">
|
|
2338
|
+
<h4 style="color:var(--base)">Baseline Response ({r['baseline_response']['latency_ms']/1000:.1f}s, ${r['baseline_response']['cost_usd']:.4f})</h4>
|
|
2339
|
+
<pre>{_html_escape(base_display)}</pre>
|
|
2340
|
+
<div class="score-details">
|
|
2341
|
+
Keyword: {r['baseline_score']['keyword_score']:.2f} |
|
|
2342
|
+
Structural: {r['baseline_score']['structural_score']:.2f} |
|
|
2343
|
+
Files: {r['baseline_score']['file_mention_score']:.2f} |
|
|
2344
|
+
Factual: {r['baseline_score']['factual_score']:.2f} |
|
|
2345
|
+
Complete: {r['baseline_score']['completeness_score']:.2f}
|
|
2346
|
+
</div>
|
|
2347
|
+
</div>
|
|
2348
|
+
</div>
|
|
2349
|
+
</details>"""
|
|
2350
|
+
|
|
2351
|
+
return f"""<!doctype html>
|
|
2352
|
+
<html lang="en">
|
|
2353
|
+
<head>
|
|
2354
|
+
<meta charset="utf-8">
|
|
2355
|
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
2356
|
+
<title>C3 E2E Benchmark Report</title>
|
|
2357
|
+
<script src="https://cdn.jsdelivr.net/npm/chart.js@4"></script>
|
|
2358
|
+
<style>
|
|
2359
|
+
:root {{
|
|
2360
|
+
--bg: #0a0a0f; --surface: #12121a; --border: #1e1e2e;
|
|
2361
|
+
--text: #e0e0e0; --dim: #888; --accent: #6c5ce7;
|
|
2362
|
+
--c3: #00b894; --base: #e17055; --neutral: #636e72;
|
|
2363
|
+
--positive: #00b894; --negative: #e17055;
|
|
2364
|
+
}}
|
|
2365
|
+
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
|
|
2366
|
+
body {{ font-family: 'Segoe UI', system-ui, sans-serif; background: var(--bg); color: var(--text); padding: 24px; max-width: 1400px; margin: 0 auto; }}
|
|
2367
|
+
.header {{ text-align: center; padding: 32px 0; border-bottom: 1px solid var(--border); margin-bottom: 24px; }}
|
|
2368
|
+
.header h1 {{ font-size: 28px; font-weight: 300; color: var(--accent); }}
|
|
2369
|
+
.header .meta {{ color: var(--dim); font-size: 13px; margin-top: 8px; }}
|
|
2370
|
+
|
|
2371
|
+
.scorecard {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); gap: 12px; margin-bottom: 24px; }}
|
|
2372
|
+
.scorecard .card {{ background: var(--surface); border: 1px solid var(--border); border-radius: 12px; padding: 16px; text-align: center; }}
|
|
2373
|
+
.scorecard .card .big {{ font-size: 32px; font-weight: 700; }}
|
|
2374
|
+
.scorecard .card .label {{ font-size: 11px; color: var(--dim); text-transform: uppercase; letter-spacing: 1px; margin-top: 4px; }}
|
|
2375
|
+
|
|
2376
|
+
.grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(380px, 1fr)); gap: 16px; margin-bottom: 24px; }}
|
|
2377
|
+
.card {{ background: var(--surface); border: 1px solid var(--border); border-radius: 12px; padding: 20px; }}
|
|
2378
|
+
.card h3 {{ font-size: 15px; font-weight: 500; margin-bottom: 12px; color: var(--accent); }}
|
|
2379
|
+
.card canvas {{ max-height: 260px; }}
|
|
2380
|
+
|
|
2381
|
+
.provider-cards {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 12px; margin-bottom: 24px; }}
|
|
2382
|
+
.provider-card .stat-row {{ display: flex; justify-content: space-between; padding: 5px 0; border-bottom: 1px solid var(--border); font-size: 13px; }}
|
|
2383
|
+
.provider-card .stat-row .label {{ color: var(--dim); }}
|
|
2384
|
+
.provider-card .highlight {{ color: var(--accent); font-weight: 700; }}
|
|
2385
|
+
.positive {{ color: var(--positive); }}
|
|
2386
|
+
.negative {{ color: var(--negative); }}
|
|
2387
|
+
|
|
2388
|
+
table {{ width: 100%; border-collapse: collapse; font-size: 12px; }}
|
|
2389
|
+
th {{ text-align: left; padding: 8px 6px; border-bottom: 2px solid var(--border); color: var(--dim); text-transform: uppercase; font-size: 10px; letter-spacing: 1px; }}
|
|
2390
|
+
td {{ padding: 6px; border-bottom: 1px solid var(--border); }}
|
|
2391
|
+
tr.c3-win td:nth-child(8) {{ color: var(--c3); font-weight: 600; }}
|
|
2392
|
+
tr.base-win td:nth-child(8) {{ color: var(--base); font-weight: 600; }}
|
|
2393
|
+
tr:hover {{ background: rgba(108, 92, 231, 0.05); }}
|
|
2394
|
+
.diff-badge {{ font-size: 10px; padding: 2px 6px; border-radius: 4px; background: var(--border); }}
|
|
2395
|
+
|
|
2396
|
+
.section-title {{ font-size: 18px; font-weight: 400; margin: 24px 0 12px; padding-bottom: 6px; border-bottom: 1px solid var(--border); }}
|
|
2397
|
+
|
|
2398
|
+
.tool-summary {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 8px; margin-bottom: 16px; }}
|
|
2399
|
+
.tool-summary .stat {{ background: var(--surface); border: 1px solid var(--border); border-radius: 8px; padding: 12px; text-align: center; }}
|
|
2400
|
+
.tool-summary .stat .num {{ font-size: 24px; font-weight: 700; }}
|
|
2401
|
+
.tool-summary .stat .lbl {{ font-size: 10px; color: var(--dim); text-transform: uppercase; letter-spacing: 1px; margin-top: 4px; }}
|
|
2402
|
+
.c3-badge {{ font-size: 9px; padding: 1px 5px; border-radius: 3px; background: var(--c3); color: #000; font-weight: 700; vertical-align: middle; margin-left: 4px; }}
|
|
2403
|
+
.tool-table {{ margin-top: 12px; }}
|
|
2404
|
+
|
|
2405
|
+
.insight {{ padding: 12px 16px; border-radius: 8px; margin-bottom: 8px; border-left: 4px solid; }}
|
|
2406
|
+
.insight.critical {{ background: rgba(225,112,85,0.1); border-color: #e17055; }}
|
|
2407
|
+
.insight.warning {{ background: rgba(253,203,110,0.1); border-color: #fdcb6e; }}
|
|
2408
|
+
.insight.strength {{ background: rgba(0,184,148,0.1); border-color: #00b894; }}
|
|
2409
|
+
.insight.info {{ background: rgba(108,92,231,0.1); border-color: #6c5ce7; }}
|
|
2410
|
+
.insight .insight-title {{ font-weight: 600; font-size: 14px; margin-bottom: 4px; }}
|
|
2411
|
+
.insight.critical .insight-title {{ color: #e17055; }}
|
|
2412
|
+
.insight.warning .insight-title {{ color: #fdcb6e; }}
|
|
2413
|
+
.insight.strength .insight-title {{ color: #00b894; }}
|
|
2414
|
+
.insight.info .insight-title {{ color: #6c5ce7; }}
|
|
2415
|
+
.insight .insight-detail {{ font-size: 12px; color: var(--dim); }}
|
|
2416
|
+
.insight .insight-action {{ font-size: 12px; color: var(--text); margin-top: 4px; font-style: italic; }}
|
|
2417
|
+
.verdict {{ background: var(--surface); border: 2px solid var(--accent); border-radius: 12px; padding: 16px 20px; margin-bottom: 16px; text-align: center; }}
|
|
2418
|
+
.verdict .verdict-text {{ font-size: 16px; font-weight: 400; color: var(--accent); }}
|
|
2419
|
+
.verdict .verdict-counts {{ font-size: 12px; color: var(--dim); margin-top: 6px; }}
|
|
2420
|
+
|
|
2421
|
+
.guide {{ background: var(--surface); border: 1px solid var(--border); border-radius: 12px; margin-bottom: 24px; }}
|
|
2422
|
+
.guide summary {{ cursor: pointer; padding: 14px 20px; font-size: 15px; font-weight: 500; color: var(--accent); }}
|
|
2423
|
+
.guide .guide-content {{ padding: 0 20px 16px; font-size: 13px; line-height: 1.7; color: var(--dim); }}
|
|
2424
|
+
.guide .guide-content h4 {{ color: var(--text); margin: 12px 0 4px; font-size: 14px; }}
|
|
2425
|
+
.guide .guide-content dt {{ color: var(--text); font-weight: 600; margin-top: 8px; }}
|
|
2426
|
+
.guide .guide-content dd {{ margin-left: 16px; margin-bottom: 4px; }}
|
|
2427
|
+
|
|
2428
|
+
.delta-badge {{ font-size: 10px; padding: 1px 6px; border-radius: 4px; margin-top: 4px; display: inline-block; }}
|
|
2429
|
+
.delta-badge.positive {{ background: rgba(0,184,148,0.15); color: var(--positive); }}
|
|
2430
|
+
.delta-badge.negative {{ background: rgba(225,112,85,0.15); color: var(--negative); }}
|
|
2431
|
+
.trend-up {{ color: var(--positive); font-size: 10px; }}
|
|
2432
|
+
.trend-down {{ color: var(--negative); font-size: 10px; }}
|
|
2433
|
+
.trend-flat {{ color: var(--dim); font-size: 10px; }}
|
|
2434
|
+
.trend-section {{ margin-bottom: 24px; }}
|
|
2435
|
+
.trend-section .sparkline-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 12px; }}
|
|
2436
|
+
.trend-section canvas {{ max-height: 140px; }}
|
|
2437
|
+
.since-last {{ background: var(--surface); border: 1px solid var(--border); border-radius: 8px; padding: 10px 16px; margin-bottom: 12px; font-size: 13px; color: var(--dim); }}
|
|
2438
|
+
|
|
2439
|
+
.comparison-item {{ margin-bottom: 8px; }}
|
|
2440
|
+
.comparison-item summary {{ cursor: pointer; padding: 8px 12px; background: var(--surface); border: 1px solid var(--border); border-radius: 8px; font-size: 13px; }}
|
|
2441
|
+
.comparison-item summary.c3-win {{ border-left: 3px solid var(--c3); }}
|
|
2442
|
+
.comparison-item summary.base-win {{ border-left: 3px solid var(--base); }}
|
|
2443
|
+
.comparison-grid {{ display: grid; grid-template-columns: 1fr 1fr; gap: 12px; padding: 12px; }}
|
|
2444
|
+
.comparison-col {{ background: var(--bg); border-radius: 8px; padding: 12px; overflow: hidden; }}
|
|
2445
|
+
.comparison-col h4 {{ font-size: 13px; margin-bottom: 8px; }}
|
|
2446
|
+
.comparison-col pre {{ font-size: 11px; white-space: pre-wrap; word-break: break-word; max-height: 400px; overflow-y: auto; color: var(--dim); }}
|
|
2447
|
+
.score-details {{ font-size: 10px; color: var(--dim); margin-top: 8px; padding-top: 6px; border-top: 1px solid var(--border); }}
|
|
2448
|
+
</style>
|
|
2449
|
+
</head>
|
|
2450
|
+
<body>
|
|
2451
|
+
<div class="header">
|
|
2452
|
+
<h1>C3 End-to-End Benchmark <span style="background:#34d399;color:#0b1020;padding:0.15rem 0.55rem;border-radius:999px;font-size:0.7rem;font-weight:600;margin-left:0.5rem;vertical-align:middle">Live AI</span> <a href="../benchmarks/index.html" style="color:var(--text-dim,#9aa3c7);font-size:0.8rem;margin-left:0.8rem;text-decoration:none;font-weight:400">← dashboard</a></h1>
|
|
2453
|
+
<div class="meta">{timestamp} | {report.get('project_path', '')} | Providers: {', '.join(report.get('providers_tested', []))}</div>
|
|
2454
|
+
</div>
|
|
2455
|
+
|
|
2456
|
+
<details class="guide">
|
|
2457
|
+
<summary>How to Read This Report</summary>
|
|
2458
|
+
<div class="guide-content">
|
|
2459
|
+
<h4>What This Benchmark Measures</h4>
|
|
2460
|
+
<p>Each task is run twice against each AI provider: once with C3 MCP tools enabled, once without (baseline).
|
|
2461
|
+
Both modes have full native tool access (Read, Grep, Bash, etc.). The only difference is whether C3's
|
|
2462
|
+
specialized tools (c3_search, c3_compress, c3_read, etc.) are available.</p>
|
|
2463
|
+
|
|
2464
|
+
<h4>Key Metrics</h4>
|
|
2465
|
+
<dl>
|
|
2466
|
+
<dt>Win Rate</dt>
|
|
2467
|
+
<dd>Percentage of tasks where C3 mode scored higher than baseline. >60% = good, >75% = strong.</dd>
|
|
2468
|
+
<dt>Weighted Win Rate</dt>
|
|
2469
|
+
<dd>Same as win rate but harder tasks count more (easy=0.5x, medium=1x, hard=2x, expert=3x). If this exceeds raw win rate, C3 is better at hard tasks.</dd>
|
|
2470
|
+
<dt>Score Delta</dt>
|
|
2471
|
+
<dd>Average difference between C3 and baseline scores. Positive = C3 better. >+0.05 is meaningful.</dd>
|
|
2472
|
+
<dt>Efficiency Metrics</dt>
|
|
2473
|
+
<dd>Time, cost, and token savings. Positive values mean C3 used fewer resources.</dd>
|
|
2474
|
+
</dl>
|
|
2475
|
+
|
|
2476
|
+
<h4>Scoring Dimensions (0.0 - 1.0)</h4>
|
|
2477
|
+
<dl>
|
|
2478
|
+
<dt>Keyword</dt><dd>Required terms present in response, forbidden terms absent.</dd>
|
|
2479
|
+
<dt>Structural</dt><dd>Code blocks, file references, line numbers, organized structure.</dd>
|
|
2480
|
+
<dt>File Mention</dt><dd>Expected files and symbols referenced correctly.</dd>
|
|
2481
|
+
<dt>Hallucination</dt><dd>1.0 = no fabricated file paths or symbols. Lower = invented references.</dd>
|
|
2482
|
+
<dt>Factual</dt><dd>Verifiable claims about the codebase matched against ground truth.</dd>
|
|
2483
|
+
<dt>Completeness</dt><dd>All required aspects of the question addressed.</dd>
|
|
2484
|
+
</dl>
|
|
2485
|
+
|
|
2486
|
+
<h4>Tool Usage Analysis</h4>
|
|
2487
|
+
<p>Shows which tools each mode used. "MCP Ratio" is the percentage of C3 tool calls that used C3-specific
|
|
2488
|
+
tools. Higher MCP ratio + higher score = C3 tools are being used effectively. Low MCP ratio may mean
|
|
2489
|
+
the AI isn't discovering C3 tools.</p>
|
|
2490
|
+
|
|
2491
|
+
<h4>Reading the Insights</h4>
|
|
2492
|
+
<p>Insights are auto-generated from the data. Colors indicate severity:
|
|
2493
|
+
<span style="color:#00b894">green = strength</span>,
|
|
2494
|
+
<span style="color:#6c5ce7">purple = info</span>,
|
|
2495
|
+
<span style="color:#fdcb6e">yellow = warning</span>,
|
|
2496
|
+
<span style="color:#e17055">red = critical</span>.
|
|
2497
|
+
Each insight includes an actionable recommendation.</p>
|
|
2498
|
+
</div>
|
|
2499
|
+
</details>
|
|
2500
|
+
|
|
2501
|
+
<div class="verdict">
|
|
2502
|
+
<div class="verdict-text">{_html_escape(verdict)}</div>
|
|
2503
|
+
<div class="verdict-counts">{verdict_counts}</div>
|
|
2504
|
+
</div>
|
|
2505
|
+
|
|
2506
|
+
<div class="scorecard">
|
|
2507
|
+
<div class="card">
|
|
2508
|
+
<div class="big" style="color: var(--c3)">{sc['c3_win_rate']:.0f}%</div>
|
|
2509
|
+
<div class="label">C3 Win Rate</div>
|
|
2510
|
+
{wr_delta_badge}
|
|
2511
|
+
</div>
|
|
2512
|
+
<div class="card">
|
|
2513
|
+
<div class="big">{sc.get('weighted_win_rate', sc['c3_win_rate']):.0f}%</div>
|
|
2514
|
+
<div class="label">Weighted Win Rate</div>
|
|
2515
|
+
</div>
|
|
2516
|
+
<div class="card">
|
|
2517
|
+
<div class="big">{sc['c3_wins']} / {sc['c3_wins'] + sc['baseline_wins']}</div>
|
|
2518
|
+
<div class="label">C3 Wins / Total</div>
|
|
2519
|
+
</div>
|
|
2520
|
+
<div class="card">
|
|
2521
|
+
<div class="big">{sc['avg_score_c3']:.2f}</div>
|
|
2522
|
+
<div class="label">Avg C3 Score</div>
|
|
2523
|
+
{c3_delta_badge}
|
|
2524
|
+
</div>
|
|
2525
|
+
<div class="card">
|
|
2526
|
+
<div class="big {'positive' if sc['avg_score_delta'] >= 0 else 'negative'}">{sc['avg_score_delta']:+.3f}</div>
|
|
2527
|
+
<div class="label">Score Delta</div>
|
|
2528
|
+
{delta_delta_badge}
|
|
2529
|
+
</div>
|
|
2530
|
+
<div class="card">
|
|
2531
|
+
<div class="big {'positive' if eff_time_saved >= 0 else 'negative'}">{eff_time_saved:+.0f}s</div>
|
|
2532
|
+
<div class="label">Time Saved</div>
|
|
2533
|
+
</div>
|
|
2534
|
+
<div class="card">
|
|
2535
|
+
<div class="big {'positive' if eff_cost_saved >= 0 else 'negative'}">${eff_cost_saved:+.3f}</div>
|
|
2536
|
+
<div class="label">Cost Saved</div>
|
|
2537
|
+
</div>
|
|
2538
|
+
<div class="card">
|
|
2539
|
+
<div class="big">{eff.get('projected_monthly_cost_saved_usd', 0):+.1f}</div>
|
|
2540
|
+
<div class="label">$/mo Projected</div>
|
|
2541
|
+
</div>
|
|
2542
|
+
<div class="card">
|
|
2543
|
+
<div class="big" style="color: var(--accent)">{(report.get('tool_adoption') or {}).get('adoption_rate', 0):.0f}%</div>
|
|
2544
|
+
<div class="label">MCP Adoption</div>
|
|
2545
|
+
</div>
|
|
2546
|
+
</div>
|
|
2547
|
+
|
|
2548
|
+
<div class="grid">
|
|
2549
|
+
<div class="card">
|
|
2550
|
+
<h3>Provider: C3 vs Baseline Scores</h3>
|
|
2551
|
+
<canvas id="providerChart"></canvas>
|
|
2552
|
+
</div>
|
|
2553
|
+
<div class="card">
|
|
2554
|
+
<h3>Score Dimensions (C3 vs Baseline)</h3>
|
|
2555
|
+
<canvas id="dimChart"></canvas>
|
|
2556
|
+
</div>
|
|
2557
|
+
<div class="card">
|
|
2558
|
+
<h3>Category: C3 vs Baseline</h3>
|
|
2559
|
+
<canvas id="catChart"></canvas>
|
|
2560
|
+
</div>
|
|
2561
|
+
<div class="card">
|
|
2562
|
+
<h3>Win Rate by Provider</h3>
|
|
2563
|
+
<canvas id="winRateChart"></canvas>
|
|
2564
|
+
</div>
|
|
2565
|
+
</div>
|
|
2566
|
+
|
|
2567
|
+
{trend_section_html}
|
|
2568
|
+
|
|
2569
|
+
<h2 class="section-title">Insights & Recommendations</h2>
|
|
2570
|
+
{insights_html}
|
|
2571
|
+
|
|
2572
|
+
<h2 class="section-title">Provider Details</h2>
|
|
2573
|
+
<div class="provider-cards">{provider_cards}</div>
|
|
2574
|
+
|
|
2575
|
+
<h2 class="section-title">Tool Usage Analysis</h2>
|
|
2576
|
+
<div class="tool-summary">
|
|
2577
|
+
<div class="stat"><div class="num" style="color:var(--c3)">{ta_summary.get('avg_c3_calls_per_task', 0)}</div><div class="lbl">Avg C3 Tools/Task</div></div>
|
|
2578
|
+
<div class="stat"><div class="num" style="color:var(--base)">{ta_summary.get('avg_baseline_calls_per_task', 0)}</div><div class="lbl">Avg Base Tools/Task</div></div>
|
|
2579
|
+
<div class="stat"><div class="num" style="color:var(--accent)">{ta_summary.get('c3_mcp_calls', 0)}</div><div class="lbl">C3 MCP Calls</div></div>
|
|
2580
|
+
<div class="stat"><div class="num">{ta_summary.get('mcp_ratio', 0):.0f}%</div><div class="lbl">MCP Ratio</div></div>
|
|
2581
|
+
<div class="stat"><div class="num">{ta_summary.get('c3_tool_diversity', 0)}</div><div class="lbl">C3 Diversity</div></div>
|
|
2582
|
+
<div class="stat"><div class="num">{ta_summary.get('baseline_tool_diversity', 0)}</div><div class="lbl">Base Diversity</div></div>
|
|
2583
|
+
</div>
|
|
2584
|
+
<div class="grid">
|
|
2585
|
+
<div class="card">
|
|
2586
|
+
<h3>Tool Calls by Category</h3>
|
|
2587
|
+
<canvas id="toolCatChart"></canvas>
|
|
2588
|
+
</div>
|
|
2589
|
+
<div class="card">
|
|
2590
|
+
<h3>Top Tools: C3 vs Baseline</h3>
|
|
2591
|
+
<canvas id="toolCompChart"></canvas>
|
|
2592
|
+
</div>
|
|
2593
|
+
</div>
|
|
2594
|
+
<div class="card tool-table" style="overflow-x: auto;">
|
|
2595
|
+
<h3>Tool Comparison Detail</h3>
|
|
2596
|
+
<table>
|
|
2597
|
+
<thead><tr><th>Tool</th><th>C3 Calls</th><th>Base Calls</th><th>Delta</th></tr></thead>
|
|
2598
|
+
<tbody>{tool_rows}</tbody>
|
|
2599
|
+
</table>
|
|
2600
|
+
</div>
|
|
2601
|
+
|
|
2602
|
+
<h2 class="section-title">All Results</h2>
|
|
2603
|
+
<div class="card" style="overflow-x: auto;">
|
|
2604
|
+
<table>
|
|
2605
|
+
<thead><tr>
|
|
2606
|
+
<th>Provider</th><th>Task</th><th>Category</th><th>Diff</th>
|
|
2607
|
+
<th>C3</th><th>Base</th><th>Delta</th><th>Winner</th>
|
|
2608
|
+
<th>Time Saved</th><th>Cost Saved</th><th>Tools (C3/Base)</th>
|
|
2609
|
+
</tr></thead>
|
|
2610
|
+
<tbody>{result_rows}</tbody>
|
|
2611
|
+
</table>
|
|
2612
|
+
</div>
|
|
2613
|
+
|
|
2614
|
+
<h2 class="section-title">Response Comparison</h2>
|
|
2615
|
+
{comparison_html}
|
|
2616
|
+
|
|
2617
|
+
<script>
|
|
2618
|
+
const C3='#00b894', BASE='#e17055', ACCENT='#6c5ce7';
|
|
2619
|
+
const chartOpts = {{ responsive:true, plugins:{{ legend:{{ labels:{{ color:'#888' }} }} }} }};
|
|
2620
|
+
|
|
2621
|
+
new Chart(document.getElementById('providerChart'), {{
|
|
2622
|
+
type:'bar', data:{{ labels:{json.dumps(provider_names)},
|
|
2623
|
+
datasets:[{{ label:'C3', data:{json.dumps(c3_scores)}, backgroundColor:C3 }},
|
|
2624
|
+
{{ label:'Baseline', data:{json.dumps(base_scores)}, backgroundColor:BASE }}]
|
|
2625
|
+
}}, options:{{ ...chartOpts, scales:{{ y:{{ beginAtZero:true, max:1, grid:{{ color:'#1e1e2e' }} }} }} }}
|
|
2626
|
+
}});
|
|
2627
|
+
|
|
2628
|
+
new Chart(document.getElementById('dimChart'), {{
|
|
2629
|
+
type:'radar', data:{{ labels:{json.dumps(dim_labels)},
|
|
2630
|
+
datasets:[{{ label:'C3', data:{json.dumps(dim_c3)}, borderColor:C3, backgroundColor:C3+'33' }},
|
|
2631
|
+
{{ label:'Baseline', data:{json.dumps(dim_base)}, borderColor:BASE, backgroundColor:BASE+'33' }}]
|
|
2632
|
+
}}, options:{{ ...chartOpts, scales:{{ r:{{ beginAtZero:true, max:1, grid:{{ color:'#1e1e2e' }}, pointLabels:{{ color:'#888' }} }} }} }}
|
|
2633
|
+
}});
|
|
2634
|
+
|
|
2635
|
+
new Chart(document.getElementById('catChart'), {{
|
|
2636
|
+
type:'bar', data:{{ labels:{json.dumps([c.replace('_',' ').title() for c in cat_names])},
|
|
2637
|
+
datasets:[{{ label:'C3', data:{json.dumps(cat_c3)}, backgroundColor:C3 }},
|
|
2638
|
+
{{ label:'Baseline', data:{json.dumps(cat_base)}, backgroundColor:BASE }}]
|
|
2639
|
+
}}, options:{{ ...chartOpts, scales:{{ y:{{ beginAtZero:true, max:1, grid:{{ color:'#1e1e2e' }} }} }} }}
|
|
2640
|
+
}});
|
|
2641
|
+
|
|
2642
|
+
new Chart(document.getElementById('winRateChart'), {{
|
|
2643
|
+
type:'bar', data:{{ labels:{json.dumps(provider_names)},
|
|
2644
|
+
datasets:[{{ label:'Win Rate %', data:{json.dumps(win_rates)}, backgroundColor:ACCENT }}]
|
|
2645
|
+
}}, options:{{ ...chartOpts, indexAxis:'y', scales:{{ x:{{ beginAtZero:true, max:100, grid:{{ color:'#1e1e2e' }} }} }}, plugins:{{ legend:{{ display:false }} }} }}
|
|
2646
|
+
}});
|
|
2647
|
+
|
|
2648
|
+
// Tool Usage Charts
|
|
2649
|
+
new Chart(document.getElementById('toolCatChart'), {{
|
|
2650
|
+
type:'bar', data:{{ labels:{json.dumps([c.replace('_',' ').title() for c in ta_cat_names])},
|
|
2651
|
+
datasets:[
|
|
2652
|
+
{{ label:'C3 MCP', data:{json.dumps(ta_cat_c3_mcp)}, backgroundColor:'#6c5ce7' }},
|
|
2653
|
+
{{ label:'C3 Native', data:{json.dumps(ta_cat_c3_native)}, backgroundColor:C3 }},
|
|
2654
|
+
{{ label:'Baseline Native', data:{json.dumps(ta_cat_base_native)}, backgroundColor:BASE }}
|
|
2655
|
+
]
|
|
2656
|
+
}}, options:{{ ...chartOpts, scales:{{ x:{{ stacked:true }}, y:{{ stacked:true, grid:{{ color:'#1e1e2e' }} }} }} }}
|
|
2657
|
+
}});
|
|
2658
|
+
|
|
2659
|
+
const topTools = {json.dumps([t['tool'] for t in ta_comparison[:10]])};
|
|
2660
|
+
const topC3 = {json.dumps([t['c3_calls'] for t in ta_comparison[:10]])};
|
|
2661
|
+
const topBase = {json.dumps([t['baseline_calls'] for t in ta_comparison[:10]])};
|
|
2662
|
+
new Chart(document.getElementById('toolCompChart'), {{
|
|
2663
|
+
type:'bar', data:{{ labels:topTools,
|
|
2664
|
+
datasets:[{{ label:'C3', data:topC3, backgroundColor:C3 }},
|
|
2665
|
+
{{ label:'Baseline', data:topBase, backgroundColor:BASE }}]
|
|
2666
|
+
}}, options:{{ ...chartOpts, indexAxis:'y', scales:{{ x:{{ grid:{{ color:'#1e1e2e' }} }} }} }}
|
|
2667
|
+
}});
|
|
2668
|
+
|
|
2669
|
+
// Trend sparkline charts
|
|
2670
|
+
{trend_charts_js}
|
|
2671
|
+
</script>
|
|
2672
|
+
</body>
|
|
2673
|
+
</html>"""
|
|
2674
|
+
|
|
2675
|
+
|
|
2676
|
+
def _html_escape(text: str) -> str:
|
|
2677
|
+
"""Minimal HTML escaping for pre blocks."""
|
|
2678
|
+
return text.replace("&", "&").replace("<", "<").replace(">", ">")
|
|
2679
|
+
|
|
2680
|
+
|
|
2681
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
2682
|
+
# Dual-Provider Delegate Benchmark — Ollama vs Codex
|
|
2683
|
+
# ═══════════════════════════════════════════════════════════════════════════════
|
|
2684
|
+
|
|
2685
|
+
_DELEGATE_BENCH_TASKS = [
|
|
2686
|
+
{"task_type": "review", "task": "Review this code for bugs and regressions", "difficulty": "medium"},
|
|
2687
|
+
{"task_type": "explain", "task": "Explain what this code does and how it works", "difficulty": "easy"},
|
|
2688
|
+
{"task_type": "diagnose", "task": "Diagnose the root cause of failures in this code", "difficulty": "hard"},
|
|
2689
|
+
{"task_type": "improve", "task": "Suggest the most impactful improvement to this code", "difficulty": "medium"},
|
|
2690
|
+
{"task_type": "test", "task": "Design focused test cases for this code", "difficulty": "medium"},
|
|
2691
|
+
{"task_type": "summarize","task": "Summarize the key points of this code", "difficulty": "easy"},
|
|
2692
|
+
]
|
|
2693
|
+
|
|
2694
|
+
|
|
2695
|
+
class DelegateBenchmarkResult:
    """Outcome of one delegate task executed on one backend.

    Instances start out as an empty, unsuccessful result; the benchmark fills
    in ``output``, ``success``, ``latency_s``, ``output_tokens`` and ``model``
    after the run.
    """

    def __init__(self, task_type: str, backend: str, difficulty: str = "medium"):
        # Identity of the run.
        self.task_type = task_type
        self.backend = backend
        self.difficulty = difficulty
        # Measurements, populated by the benchmark once the run has finished.
        self.output: str = ""
        self.success: bool = False
        self.latency_s: float = 0.0
        self.output_tokens: int = 0
        self.model: str = ""

    def to_dict(self) -> dict:
        """Return a plain-dict view of the result.

        The full output is not included; ``output_preview`` holds at most the
        first 200 characters, followed by ``"..."`` when the output was cut.
        Latency is rounded to one decimal place.
        """
        text = self.output
        if len(text) > 200:
            preview = text[:200] + "..."
        else:
            preview = text

        summary = {}
        summary["task_type"] = self.task_type
        summary["backend"] = self.backend
        summary["difficulty"] = self.difficulty
        summary["success"] = self.success
        summary["latency_s"] = round(self.latency_s, 1)
        summary["output_tokens"] = self.output_tokens
        summary["model"] = self.model
        summary["output_preview"] = preview
        return summary
|
|
2719
|
+
|
|
2720
|
+
|
|
2721
|
+
class DelegateBenchmark:
    """Compare Ollama vs Codex delegate backends on the same tasks.

    Usage:
        bench = DelegateBenchmark(project_path, svc)
        results = bench.run_all()
        report = bench.generate_report(results)
    """

    def __init__(self, project_path: str, svc, verbose: bool = False,
                 task_types: list[str] | None = None):
        """Store the benchmark configuration.

        Args:
            project_path: Project root; stored as a resolved absolute path.
            svc: Service object. This class reads ``delegate_config``,
                ``compressor`` and ``indexer`` from it and passes it on to
                ``handle_delegate``.
            verbose: When true, progress lines are printed during ``run_all``.
            task_types: Optional collection of task types to run; ``None``
                (or an empty collection) runs every defined task.
        """
        self.project_path = str(Path(project_path).resolve())
        self.svc = svc
        self.verbose = verbose
        self.task_types = task_types  # filter to specific types, or None for all

    def run_all(self) -> list[DelegateBenchmarkResult]:
        """Run all delegate benchmark tasks through every available backend.

        ``"ollama"`` is always benchmarked. ``"codex"`` is added only when it
        is enabled in the delegate config, found on the path, and
        ``check_codex()`` reports status ``"ok"``.

        Returns:
            One result per (task, backend) pair, tasks in definition order.
        """
        from cli.tools.delegate import (
            _is_codex_on_path,
            check_codex,
        )

        results = []

        # Build context from project — compress a few key files. The same
        # context is reused for every task so the backends are comparable.
        context = self._build_context()

        tasks = _DELEGATE_BENCH_TASKS
        if self.task_types:
            tasks = [t for t in tasks if t["task_type"] in self.task_types]

        backends = ["ollama"]
        dcfg = self.svc.delegate_config or {}
        if dcfg.get("codex_enabled", False) and _is_codex_on_path():
            info = check_codex()
            if info.get("status") == "ok":
                backends.append("codex")

        if self.verbose:
            print(f" Delegate backends: {', '.join(backends)}")
            print(f" Tasks: {len(tasks)} types")

        for task_def in tasks:
            for backend in backends:
                result = self._run_single(task_def, backend, context)
                results.append(result)
                if self.verbose:
                    status = "OK" if result.success else "FAIL"
                    print(f" {result.task_type:>10} | {backend:>6} | {status} | "
                          f"{result.latency_s:.1f}s | {result.output_tokens}tok | {result.model}")

        return results

    def _run_single(self, task_def: dict, backend: str,
                    context: str) -> DelegateBenchmarkResult:
        """Run a single task through a specific backend.

        Args:
            task_def: Task definition with ``"task_type"`` and ``"task"``
                keys and an optional ``"difficulty"`` (default ``"medium"``).
            backend: Backend name forwarded to ``handle_delegate``.
            context: Context text forwarded to ``handle_delegate``.

        Returns:
            The populated result. If ``handle_delegate`` raises, the result
            is unsuccessful and its output is ``"[error] <exception>"``.
        """
        import time as _time

        from cli.tools.delegate import handle_delegate

        result = DelegateBenchmarkResult(
            task_type=task_def["task_type"],
            backend=backend,
            difficulty=task_def.get("difficulty", "medium"),
        )

        captured = {}

        def finalize(name, args, resp, summ):
            # Record what the delegate handler reports so it can be inspected
            # after the call returns; the response is passed through as-is.
            captured["args"] = args
            captured["response"] = resp
            captured["summary"] = summ
            return resp

        t0 = _time.monotonic()
        try:
            handle_delegate(
                task=task_def["task"],
                task_type=task_def["task_type"],
                context=context,
                file_path="",
                svc=self.svc,
                finalize=finalize,
                backend=backend,
            )
        except Exception as e:
            result.output = f"[error] {e}"
            result.latency_s = round(_time.monotonic() - t0, 1)
            return result

        result.latency_s = round(_time.monotonic() - t0, 1)
        # Coerce a missing or None response to "" so later consumers that
        # take len() of the output (to_dict) never see None.
        result.output = captured.get("response") or ""
        result.model = (captured.get("args") or {}).get("model", "unknown")
        result.output_tokens = count_tokens(result.output) if result.output else 0

        # Determine success: non-empty, non-error output
        error_prefixes = ("[delegate:error]", "[codex:error]")
        if result.output and not result.output.startswith(error_prefixes):
            result.success = True

        return result

    def _build_context(self) -> str:
        """Build representative context from project files.

        Searches the index once per extension (``.py``, ``.js``, ``.ts``),
        keeps the first three distinct files, and joins their ``"map"``
        compressions. This is best effort: a file that fails to compress is
        skipped, and any other failure yields the fallback text.

        Returns:
            The joined context, or ``"No context available."``.
        """
        try:
            compressor = self.svc.compressor
            # Find a few representative files
            indexer = self.svc.indexer
            files = []
            for ext in (".py", ".js", ".ts"):
                hits = indexer.search(f"main function {ext}", top_k=2, include_content=False)
                files.extend(h["file"] for h in hits)
            # dict.fromkeys de-duplicates while preserving first-seen order.
            files = list(dict.fromkeys(files))[:3]

            parts = []
            for f in files:
                try:
                    res = compressor.compress_file(
                        str(Path(self.project_path) / f), "map"
                    )
                    if isinstance(res, dict) and res.get("compressed"):
                        parts.append(f"--- {f} ---\n{res['compressed']}")
                except Exception:
                    # Deliberate best effort: one bad file must not sink the
                    # whole benchmark context.
                    continue
            return "\n".join(parts) if parts else "No context available."
        except Exception:
            return "No context available."

    @staticmethod
    def generate_report(results: list[DelegateBenchmarkResult]) -> dict:
        """Generate a comparison report from benchmark results.

        Args:
            results: Results as returned by ``run_all``.

        Returns:
            A dict with ``"benchmark_type"``, per-backend aggregates under
            ``"backends"``, the first result seen for each (task type,
            backend) pair under ``"per_task_type"``, ``"total_results"``, and
            every result as a dict under ``"all_results"``.
        """
        by_backend: dict[str, list] = {}
        for r in results:
            by_backend.setdefault(r.backend, []).append(r)

        backend_stats = {}
        for backend, res_list in by_backend.items():
            total = len(res_list)
            successes = sum(1 for r in res_list if r.success)
            avg_latency = sum(r.latency_s for r in res_list) / total if total else 0
            avg_tokens = sum(r.output_tokens for r in res_list) / total if total else 0

            backend_stats[backend] = {
                "tasks_run": total,
                "successes": successes,
                "success_rate": round(successes / total * 100, 1) if total else 0,
                "avg_latency_s": round(avg_latency, 1),
                "avg_output_tokens": round(avg_tokens),
                # Sorted so the report is identical from run to run; plain
                # set iteration order varies with string hash randomization.
                "models_used": sorted({r.model for r in res_list if r.model}),
            }

        # Per task-type comparison: the first result for each
        # (task type, backend) pair wins; later duplicates are ignored.
        by_type: dict[str, dict] = {}
        for r in results:
            per_backend = by_type.setdefault(r.task_type, {})
            if r.backend not in per_backend:
                per_backend[r.backend] = r.to_dict()

        return {
            "benchmark_type": "delegate_comparison",
            "backends": backend_stats,
            "per_task_type": by_type,
            "total_results": len(results),
            "all_results": [r.to_dict() for r in results],
        }
|