code-context-control 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. cli/__init__.py +1 -0
  2. cli/_hook_utils.py +99 -0
  3. cli/c3.py +6152 -0
  4. cli/commands/__init__.py +1 -0
  5. cli/commands/common.py +312 -0
  6. cli/commands/parser.py +286 -0
  7. cli/docs.html +3178 -0
  8. cli/edits.html +878 -0
  9. cli/hook_auto_snapshot.py +142 -0
  10. cli/hook_c3_signal.py +61 -0
  11. cli/hook_c3read.py +116 -0
  12. cli/hook_edit_ledger.py +213 -0
  13. cli/hook_edit_unlock.py +170 -0
  14. cli/hook_filter.py +130 -0
  15. cli/hook_ghost_files.py +238 -0
  16. cli/hook_pretool_enforce.py +334 -0
  17. cli/hook_read.py +200 -0
  18. cli/hook_session_stats.py +62 -0
  19. cli/hook_terse_advisor.py +190 -0
  20. cli/hub.html +3764 -0
  21. cli/hub_server.py +1619 -0
  22. cli/mcp_proxy.py +428 -0
  23. cli/mcp_server.py +660 -0
  24. cli/server.py +2985 -0
  25. cli/tools/__init__.py +4 -0
  26. cli/tools/_helpers.py +65 -0
  27. cli/tools/agent.py +1165 -0
  28. cli/tools/compress.py +215 -0
  29. cli/tools/delegate.py +1184 -0
  30. cli/tools/edit.py +313 -0
  31. cli/tools/edits.py +118 -0
  32. cli/tools/filter.py +285 -0
  33. cli/tools/impact.py +163 -0
  34. cli/tools/memory.py +469 -0
  35. cli/tools/read.py +224 -0
  36. cli/tools/search.py +337 -0
  37. cli/tools/session.py +95 -0
  38. cli/tools/shell.py +193 -0
  39. cli/tools/status.py +306 -0
  40. cli/tools/validate.py +310 -0
  41. cli/ui/api.js +36 -0
  42. cli/ui/app.js +207 -0
  43. cli/ui/components/chat.js +758 -0
  44. cli/ui/components/dashboard.js +689 -0
  45. cli/ui/components/edits.js +220 -0
  46. cli/ui/components/instructions.js +481 -0
  47. cli/ui/components/memory.js +626 -0
  48. cli/ui/components/sessions.js +606 -0
  49. cli/ui/components/settings.js +1404 -0
  50. cli/ui/components/sidebar.js +156 -0
  51. cli/ui/icons.js +51 -0
  52. cli/ui/shared.js +119 -0
  53. cli/ui/theme.js +22 -0
  54. cli/ui.html +168 -0
  55. cli/ui_legacy.html +6797 -0
  56. cli/ui_nano.html +503 -0
  57. code_context_control-2.28.0.dist-info/METADATA +248 -0
  58. code_context_control-2.28.0.dist-info/RECORD +150 -0
  59. code_context_control-2.28.0.dist-info/WHEEL +5 -0
  60. code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
  61. code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
  62. code_context_control-2.28.0.dist-info/top_level.txt +5 -0
  63. core/__init__.py +75 -0
  64. core/config.py +269 -0
  65. core/ide.py +188 -0
  66. oracle/__init__.py +1 -0
  67. oracle/config.py +75 -0
  68. oracle/oracle.html +3900 -0
  69. oracle/oracle_server.py +663 -0
  70. oracle/services/__init__.py +1 -0
  71. oracle/services/c3_bridge.py +210 -0
  72. oracle/services/chat_engine.py +1103 -0
  73. oracle/services/chat_store.py +155 -0
  74. oracle/services/cross_memory.py +154 -0
  75. oracle/services/federated_graph.py +463 -0
  76. oracle/services/health_checker.py +117 -0
  77. oracle/services/insight_engine.py +307 -0
  78. oracle/services/memory_reader.py +106 -0
  79. oracle/services/memory_writer.py +182 -0
  80. oracle/services/ollama_bridge.py +332 -0
  81. oracle/services/project_scanner.py +87 -0
  82. oracle/services/review_agent.py +206 -0
  83. services/__init__.py +1 -0
  84. services/activity_log.py +93 -0
  85. services/agent_base.py +124 -0
  86. services/agents.py +1529 -0
  87. services/auto_memory.py +407 -0
  88. services/bench/__init__.py +6 -0
  89. services/bench/external/__init__.py +29 -0
  90. services/bench/external/aider_polyglot.py +405 -0
  91. services/bench/external/swe_bench.py +485 -0
  92. services/benchmark_dashboard.py +596 -0
  93. services/claude_md.py +785 -0
  94. services/compressor.py +592 -0
  95. services/context_snapshot.py +356 -0
  96. services/conversation_store.py +870 -0
  97. services/doc_index.py +537 -0
  98. services/e2e_benchmark.py +2884 -0
  99. services/e2e_evaluator.py +396 -0
  100. services/e2e_tasks.py +743 -0
  101. services/edit_ledger.py +459 -0
  102. services/embedding_index.py +341 -0
  103. services/error_reporting.py +123 -0
  104. services/file_memory.py +734 -0
  105. services/hub_service.py +585 -0
  106. services/indexer.py +712 -0
  107. services/memory.py +318 -0
  108. services/memory_consolidator.py +538 -0
  109. services/memory_graph.py +382 -0
  110. services/memory_grounder.py +304 -0
  111. services/memory_scorer.py +246 -0
  112. services/metrics.py +86 -0
  113. services/notifications.py +209 -0
  114. services/ollama_client.py +201 -0
  115. services/output_filter.py +488 -0
  116. services/parser.py +1238 -0
  117. services/project_manager.py +579 -0
  118. services/protocol.py +306 -0
  119. services/proxy_state.py +152 -0
  120. services/retrieval_broker.py +129 -0
  121. services/router.py +414 -0
  122. services/runtime.py +326 -0
  123. services/session_benchmark.py +1945 -0
  124. services/session_manager.py +1026 -0
  125. services/session_preloader.py +251 -0
  126. services/text_index.py +90 -0
  127. services/tool_classifier.py +176 -0
  128. services/transcript_index.py +340 -0
  129. services/validation_cache.py +155 -0
  130. services/vector_store.py +299 -0
  131. services/version_tracker.py +271 -0
  132. services/watcher.py +192 -0
  133. tui/__init__.py +0 -0
  134. tui/backend.py +59 -0
  135. tui/main.py +145 -0
  136. tui/screens/__init__.py +1 -0
  137. tui/screens/benchmark_view.py +109 -0
  138. tui/screens/claudemd_view.py +46 -0
  139. tui/screens/compress_view.py +52 -0
  140. tui/screens/index_view.py +74 -0
  141. tui/screens/init_view.py +82 -0
  142. tui/screens/mcp_view.py +73 -0
  143. tui/screens/optimize_view.py +41 -0
  144. tui/screens/pipe_view.py +46 -0
  145. tui/screens/projects_view.py +355 -0
  146. tui/screens/search_view.py +55 -0
  147. tui/screens/session_view.py +143 -0
  148. tui/screens/stats.py +158 -0
  149. tui/screens/ui_view.py +54 -0
  150. tui/theme.tcss +335 -0
@@ -0,0 +1,1945 @@
1
+ """Real-world session benchmark for C3.
2
+
3
+ Simulates multi-turn AI coding workflows end-to-end, comparing
4
+ "with C3" vs "without C3" paths across realistic scenarios like
5
+ bug investigation, feature exploration, code review, etc.
6
+
7
+ Measures cumulative token usage, latency, quality, and session longevity.
8
+ Generates a visual HTML report.
9
+ """
10
+
11
+ import html
12
+ import json
13
+ import re
14
+ import time
15
+ from dataclasses import asdict, dataclass, field
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+ from core import count_tokens
20
+ from services.compressor import CodeCompressor
21
+ from services.file_memory import FileMemoryStore
22
+ from services.indexer import CodeIndex
23
+ from services.output_filter import OutputFilter
24
+ from services.parser import check_syntax_ast, check_syntax_native
25
+ from services.validation_cache import ValidationCache
26
+
27
+ # ─── Data Classes ──────────────────────────────────────────
28
+
29
@dataclass
class StepResult:
    """Outcome of one step inside a workflow scenario."""

    # Step label.
    name: str
    # C3 tool used for the step, or "native" for the baseline path.
    tool: str
    # Token count recorded for the step.
    tokens: int = 0
    # Latency recorded for the step, in milliseconds.
    latency_ms: float = 0.0
    # Quality score on a 0-100 scale.
    quality: float = 100.0
    # Optional free-form detail text.
    detail: str = ""
38
+
39
+
40
@dataclass
class ScenarioResult:
    """Paired "with C3" / baseline step results for one workflow scenario.

    The aggregate properties are derived on demand from the two step lists,
    so they always reflect whatever steps have been appended so far.
    """

    name: str
    description: str
    # Steps recorded for the "with C3" path.
    steps_c3: list = field(default_factory=list)
    # Steps recorded for the baseline path.
    steps_baseline: list = field(default_factory=list)

    @property
    def total_tokens_c3(self):
        """Sum of ``tokens`` over the C3 steps."""
        return sum(step.tokens for step in self.steps_c3)

    @property
    def total_tokens_baseline(self):
        """Sum of ``tokens`` over the baseline steps."""
        return sum(step.tokens for step in self.steps_baseline)

    @property
    def total_latency_c3(self):
        """Sum of ``latency_ms`` over the C3 steps."""
        return sum(step.latency_ms for step in self.steps_c3)

    @property
    def total_latency_baseline(self):
        """Sum of ``latency_ms`` over the baseline steps."""
        return sum(step.latency_ms for step in self.steps_baseline)

    @property
    def avg_quality_c3(self):
        """Mean ``quality`` of the C3 steps, or 0.0 when there are none."""
        steps = self.steps_c3
        return sum(step.quality for step in steps) / len(steps) if steps else 0.0

    @property
    def avg_quality_baseline(self):
        """Mean ``quality`` of the baseline steps, or 0.0 when there are none."""
        steps = self.steps_baseline
        return sum(step.quality for step in steps) / len(steps) if steps else 0.0

    @property
    def token_savings_pct(self):
        """Percentage of baseline tokens saved by C3, rounded to one decimal.

        Returns 0.0 when the baseline total is zero.
        """
        baseline = self.total_tokens_baseline
        if not baseline:
            return 0.0
        return round((baseline - self.total_tokens_c3) / baseline * 100, 1)

    @property
    def budget_multiplier(self):
        """Baseline tokens divided by C3 tokens, rounded to two decimals.

        Returns 0.0 when the C3 total is zero.
        """
        with_c3 = self.total_tokens_c3
        if not with_c3:
            return 0.0
        return round(self.total_tokens_baseline / with_c3, 2)

    def to_dict(self):
        """Return a plain-dict form of the scenario, including derived totals."""
        payload = {"name": self.name, "description": self.description}
        payload["steps_c3"] = [asdict(step) for step in self.steps_c3]
        payload["steps_baseline"] = [asdict(step) for step in self.steps_baseline]
        payload["total_tokens_c3"] = self.total_tokens_c3
        payload["total_tokens_baseline"] = self.total_tokens_baseline
        payload["total_latency_c3_ms"] = round(self.total_latency_c3, 2)
        payload["total_latency_baseline_ms"] = round(self.total_latency_baseline, 2)
        payload["avg_quality_c3"] = round(self.avg_quality_c3, 1)
        payload["avg_quality_baseline"] = round(self.avg_quality_baseline, 1)
        payload["token_savings_pct"] = self.token_savings_pct
        payload["budget_multiplier"] = self.budget_multiplier
        return payload
103
+
104
+
105
+ # ─── Performance Timing Model ─────────────────────────────────
106
+ # Estimated model input processing rates (tokens/second).
107
+ # These are conservative averages for cloud API calls including network latency.
108
+ # Real rates vary by model, provider load, and prompt complexity.
109
# Timing profiles keyed by profile id. Each profile carries the inputs that
# the turn-time estimate reads: input/output token rates, a typical output
# size per turn, and a fixed per-call network overhead in milliseconds.
PERF_PROFILES = {
    "fast_model": dict(
        label="Fast Model (Sonnet/Haiku)",
        input_tps=150_000,          # input processing rate, tokens/sec
        output_tps=120,             # output generation rate, tokens/sec
        avg_output_tokens=500,      # typical output size per turn
        network_overhead_ms=200,    # API round-trip overhead
    ),
    "capable_model": dict(
        label="Capable Model (Opus/GPT-4)",
        input_tps=80_000,
        output_tps=60,
        avg_output_tokens=800,
        network_overhead_ms=300,
    ),
}
125
+
126
+
127
+ def _estimate_turn_time_ms(tokens: int, profile: dict) -> float:
128
+ """Estimate total wall-clock time for one AI turn given input token count."""
129
+ input_ms = (tokens / profile["input_tps"]) * 1000
130
+ output_ms = (profile["avg_output_tokens"] / profile["output_tps"]) * 1000
131
+ return input_ms + output_ms + profile["network_overhead_ms"]
132
+
133
+
134
+ # ─── Session Benchmark Engine ────────────────────────────────
135
+
136
+ class SessionBenchmark:
137
+ """Runs real-world workflow scenarios comparing C3 vs native approaches."""
138
+
139
def __init__(self, project_path: str, sample_size: int = 15, min_tokens: int = 200):
    """Wire up the C3 services for ``project_path`` and prepare benchmark inputs.

    Args:
        project_path: Project root; stored resolved to an absolute path.
        sample_size: Maximum number of files kept in ``self.sample``.
        min_tokens: Token threshold applied when choosing the sample.
    """
    root = Path(project_path).resolve()
    c3_dir = root / ".c3"
    root_str = str(root)

    self.project_path = root
    self.sample_size = sample_size
    self.min_tokens = min_tokens

    # Services are created before file collection because collection
    # consults both the indexer and the compressor.
    self.indexer = CodeIndex(root_str, str(c3_dir / "index"))
    self.compressor = CodeCompressor(str(c3_dir / "cache"), project_root=root_str)
    self.file_memory = FileMemoryStore(root_str)
    self.validation_cache = ValidationCache(root_str)
    self.output_filter = OutputFilter({"HYBRID_DISABLE_TIER1": True})

    # Order matters: the sample is drawn from the files, and the fixtures
    # reference sampled paths.
    self.files = self._collect_files()
    self.sample = self._select_sample()
    self.fixtures = self._build_fixtures()
153
+
154
def _collect_files(self):
    """Collect the project's readable code files with their token counts.

    A file is kept when its lowercase suffix is in the indexer's
    ``code_exts``, none of its path components *below the project root* is in
    the indexer's ``skip_dirs``, and the compressor does not report it as
    protected. Unreadable files are skipped.

    Returns:
        A list of ``(path, content, token_count)`` tuples in sorted path
        order, so slices such as ``self.files[:20]`` are reproducible
        between runs.
    """
    skip_dirs = set(getattr(self.indexer, "skip_dirs", set()))
    code_exts = set(getattr(self.indexer, "code_exts", set()))
    files = []
    # rglob yields in filesystem order; sort for run-to-run determinism.
    for fpath in sorted(self.project_path.rglob("*")):
        if not fpath.is_file():
            continue
        if fpath.suffix.lower() not in code_exts:
            continue
        # Match skip dirs against the project-relative parts only. Using the
        # absolute parts would exclude every file whenever the project itself
        # lives under a directory whose name happens to be in skip_dirs.
        rel_parts = fpath.relative_to(self.project_path).parts
        if not skip_dirs.isdisjoint(rel_parts):
            continue
        if self.compressor.is_protected_file(fpath):
            continue
        try:
            content = fpath.read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Best effort: a file that cannot be read is left out.
            continue
        files.append((fpath, content, count_tokens(content)))
    return files
173
+
174
+ def _select_sample(self):
175
+ sample = sorted(
176
+ [f for f in self.files if f[2] >= self.min_tokens],
177
+ key=lambda x: x[2], reverse=True
178
+ )[:self.sample_size]
179
+ if not sample:
180
+ sample = sorted(self.files, key=lambda x: x[2], reverse=True)[:self.sample_size]
181
+ return sample
182
+
183
+ def _rel(self, path: Path) -> str:
184
+ return str(path.relative_to(self.project_path)).replace("\\", "/")
185
+
186
+ def _build_fixtures(self):
187
+ """Create log, JSONL, and terminal fixtures for log-related scenarios."""
188
+ fixture_dir = self.project_path / ".c3" / "session_benchmark" / "fixtures"
189
+ fixture_dir.mkdir(parents=True, exist_ok=True)
190
+
191
+ rel_paths = [self._rel(f[0]) for f in self.sample[:8]] or ["cli/c3.py"]
192
+
193
+ def stamp(idx):
194
+ return f"2026-03-10T14:{idx % 60:02d}:{(idx * 7) % 60:02d}"
195
+
196
+ # Build a realistic application log
197
+ log_lines = []
198
+ for idx in range(120):
199
+ rel = rel_paths[idx % len(rel_paths)]
200
+ log_lines.append(f"{stamp(idx)} INFO Processing {rel}")
201
+ if idx % 2 == 0:
202
+ log_lines.extend([f"{stamp(idx)} DEBUG heartbeat ok"] * 2)
203
+ if idx % 5 == 0:
204
+ log_lines.append(f"{stamp(idx)} WARN Slow parse {rel} latency={30 + idx}ms")
205
+ if idx % 8 == 0:
206
+ log_lines.append(f"{stamp(idx)} ERROR Failed to analyze {rel}")
207
+ log_lines.append("Traceback (most recent call last):")
208
+ log_lines.append(f' File "{rel}", line {10 + idx}, in process')
209
+ log_lines.append("RuntimeError: analysis timeout exceeded")
210
+ if idx % 15 == 0:
211
+ log_lines.append(f"{stamp(idx)} ERROR ConnectionError: upstream service unavailable")
212
+
213
+ log_path = fixture_dir / "session_app.log"
214
+ log_path.write_text("\n".join(log_lines) + "\n", encoding="utf-8")
215
+
216
+ # Build JSONL event stream
217
+ jsonl_entries = []
218
+ for idx in range(200):
219
+ rel = rel_paths[idx % len(rel_paths)]
220
+ jsonl_entries.append({
221
+ "ts": stamp(idx), "event": ["compress", "search", "read", "validate"][idx % 4],
222
+ "file": rel, "status": "ok" if idx % 13 else "error",
223
+ "tokens": 200 + idx * 3, "latency_ms": 2 + (idx % 20),
224
+ "user": f"user_{idx % 3}",
225
+ })
226
+
227
+ jsonl_path = fixture_dir / "session_events.jsonl"
228
+ jsonl_path.write_text("\n".join(json.dumps(e) for e in jsonl_entries) + "\n", encoding="utf-8")
229
+
230
+ # Build noisy terminal output
231
+ terminal_lines = ["\x1b[36mRunning test suite...\x1b[0m", ""]
232
+ for idx in range(150):
233
+ rel = rel_paths[idx % len(rel_paths)]
234
+ terminal_lines.append(f"tests/test_{idx:03d}.py::test_{rel.replace('/', '_')} PASSED")
235
+ if idx % 5 == 0:
236
+ terminal_lines.extend(["Installing dependencies..."] * 3)
237
+ if idx % 7 == 0:
238
+ terminal_lines.append("██████████████████████ 100%")
239
+ if idx % 10 == 0:
240
+ terminal_lines.append(f"WARN deprecation in {rel}")
241
+ if idx % 25 == 0:
242
+ terminal_lines.append(f"ERROR compilation failed for {rel}")
243
+ terminal_lines.append(f"FAILED tests/test_{idx:03d}.py - AssertionError")
244
+
245
+ terminal_path = fixture_dir / "session_terminal.txt"
246
+ terminal_path.write_text("\n".join(terminal_lines) + "\n", encoding="utf-8")
247
+
248
+ return {
249
+ "log_path": str(log_path),
250
+ "jsonl_path": str(jsonl_path),
251
+ "terminal_path": str(terminal_path),
252
+ "log_signals": ["ERROR", "WARN", "Traceback", "RuntimeError", "ConnectionError"],
253
+ "jsonl_fields": list(jsonl_entries[0].keys()) if jsonl_entries else [],
254
+ "terminal_signals": ["WARN", "ERROR", "FAILED"],
255
+ }
256
+
257
+ # ─── Workflow Scenarios ───────────────────────────────────
258
+
259
def run_all(self) -> list:
    """Run every workflow scenario in order and return their results.

    A scenario that raises does not abort the run: it is represented by an
    empty ``ScenarioResult`` whose description carries the error text.
    """
    scenario_fns = (
        self._scenario_bug_investigation,
        self._scenario_feature_exploration,
        self._scenario_code_review,
        self._scenario_log_diagnosis,
        self._scenario_refactor_planning,
        self._scenario_onboarding,
    )
    results = []
    for scenario_fn in scenario_fns:
        try:
            outcome = scenario_fn()
        except Exception as e:
            # Name the placeholder after the scenario method, minus its prefix.
            outcome = ScenarioResult(
                name=scenario_fn.__name__.replace("_scenario_", ""),
                description=f"Error: {e}",
            )
        results.append(outcome)
    return results
279
+
280
def _scenario_bug_investigation(self) -> ScenarioResult:
    """Simulate: search for error → read suspicious files → narrow to symbol → validate.

    The first sampled file is the investigation target. The C3 path records
    four steps (search, structural map, symbol read, syntax validation). The
    baseline path records a lexical grep followed by three full-file reads.

    Returns:
        The populated ``ScenarioResult``, or an empty one when fewer than two
        files were sampled.
    """
    result = ScenarioResult(
        name="bug_investigation",
        description="Search for an error pattern, read suspicious files, narrow to the relevant symbol, validate syntax.",
    )
    if len(self.sample) < 2:
        return result

    # Pick a target file and a "bug query"
    target_path, target_content, _ = self.sample[0]
    target_rel = self._rel(target_path)
    record = self.file_memory.get(target_rel)
    if not record or self.file_memory.needs_update(target_rel):
        record = self.file_memory.update(target_rel)
    sections = (record.get("sections") if record else None) or []
    symbols = [s["name"] for s in sections if s.get("type") in ("class", "function", "method")][:3]
    # Use a query that's realistic but likely to hit — reference the filename and a symbol if available
    if symbols:
        query = f"{symbols[0]} bug in {target_path.stem}"
    else:
        query = f"{target_path.stem} {target_path.suffix.lstrip('.')} error"

    # ── WITH C3 ──

    # Step 1: c3_search to find relevant files
    t0 = time.perf_counter()
    search_results = self.indexer.search(query, top_k=5, max_tokens=2000)
    context = self.indexer.get_context(query, top_k=5, max_tokens=2000)
    lat = (time.perf_counter() - t0) * 1000
    ctx_tokens = count_tokens(context)
    hit = any(target_rel in str(r.get("file", "")).replace("\\", "/") for r in search_results)
    # Partial credit: if we got results at all, the search is still useful (found related files)
    search_quality = 100.0 if hit else (80.0 if search_results else 50.0)
    result.steps_c3.append(StepResult("search", "c3_search", ctx_tokens, lat, search_quality))

    # Step 2: c3_compress(map) to understand structure
    t0 = time.perf_counter()
    map_text = self.file_memory.get_or_build_map(target_rel)
    lat = (time.perf_counter() - t0) * 1000
    map_tokens = count_tokens(map_text)
    map_ok = "[file_map] Could not" not in map_text and "[file_map:error]" not in map_text
    result.steps_c3.append(StepResult("map_structure", "c3_compress(map)", map_tokens, lat, 100.0 if map_ok else 60.0))

    # Step 3: c3_read to extract specific symbol
    t0 = time.perf_counter()
    symbol_section = None
    if symbols:
        symbol_section = next((s for s in sections if s["name"] == symbols[0]), None)
    if symbol_section is not None:
        lines = target_content.splitlines()
        extracted = "\n".join(lines[symbol_section["line_start"] - 1:symbol_section["line_end"]])
        read_tokens = count_tokens(extracted)
        read_quality = 100.0
    else:
        # No symbol to narrow to: the whole file is read, at reduced quality.
        read_tokens = count_tokens(target_content)
        read_quality = 70.0
    lat = (time.perf_counter() - t0) * 1000
    result.steps_c3.append(StepResult("read_symbol", "c3_read", read_tokens, lat, read_quality))

    # Step 4: c3_validate syntax
    t0 = time.perf_counter()
    errors = check_syntax_ast(target_content, target_path.suffix.lower())
    lat = (time.perf_counter() - t0) * 1000
    err_msg = f"Found {len(errors)} errors" if errors else "No errors"
    result.steps_c3.append(StepResult("validate", "c3_validate", count_tokens(err_msg), lat, 100.0))

    # ── WITHOUT C3 (baseline) ──

    # Step 1: lexical grep + read full files
    t0 = time.perf_counter()
    terms = [t for t in re.findall(r"[A-Za-z_]+", query.lower()) if len(t) > 2]
    baseline_context = []
    base_hit = False
    for fpath, content, _ in self.files[:20]:
        low = content.lower()
        if not any(term in low for term in terms):
            continue
        baseline_context.append(content)
        # Credit a hit only for a file that actually entered the context.
        # Scanning all matches afterwards would also count files found
        # after the five-file cut-off, which the baseline never reads.
        if target_rel in str(fpath).replace("\\", "/"):
            base_hit = True
        if len(baseline_context) >= 5:
            break
    if not baseline_context:
        # Grep found nothing: fall back to reading the target directly.
        baseline_context = [self.sample[0][1]]
    lat = (time.perf_counter() - t0) * 1000
    base_tokens = count_tokens("\n\n".join(baseline_context))
    # The context is never empty here (see fallback above), so a miss always
    # earns the partial-credit score.
    base_search_quality = 100.0 if base_hit else 75.0
    result.steps_baseline.append(StepResult("grep_search", "native", base_tokens, lat, base_search_quality))

    # Step 2: read full file
    t0 = time.perf_counter()
    full_content = target_path.read_text(encoding="utf-8", errors="replace")
    lat = (time.perf_counter() - t0) * 1000
    full_tokens = count_tokens(full_content)
    result.steps_baseline.append(StepResult("read_full_file", "native", full_tokens, lat, 100.0))

    # Step 3: read full file again (no surgical read)
    result.steps_baseline.append(StepResult("read_full_for_symbol", "native", full_tokens, 0.1, 100.0))

    # Step 4: read full file to "validate" visually
    result.steps_baseline.append(StepResult("visual_validation", "native", full_tokens, 0.1, 80.0))

    return result
385
+
386
def _scenario_feature_exploration(self) -> ScenarioResult:
    """Simulate: discover related files → map structure → read key symbols → understand deps.

    Up to five sampled files are explored. The C3 path searches, compresses
    each file, then reads up to two class/function sections from each of the
    first two files. The baseline path greps, reads every file in full, and
    re-reads them for symbol understanding.

    Returns:
        The populated ``ScenarioResult``, or an empty one when fewer than two
        files were sampled.
    """
    result = ScenarioResult(
        name="feature_exploration",
        description="Discover files related to a feature, map their structure, read key symbols, understand dependencies.",
    )
    explore_count = min(5, len(self.sample))
    if explore_count < 2:
        return result

    explore_files = self.sample[:explore_count]
    feature_stem = explore_files[0][0].stem
    query = f"how does {feature_stem} work"

    # ── WITH C3 ──

    # Step 1: c3_search to discover related files
    t0 = time.perf_counter()
    context = self.indexer.get_context(query, top_k=5, max_tokens=2000)
    lat = (time.perf_counter() - t0) * 1000
    result.steps_c3.append(StepResult("discover_files", "c3_search", count_tokens(context), lat, 100.0))

    # Step 2: c3_compress each file
    total_compressed = 0
    total_lat = 0
    successes = 0
    for fpath, content, _ in explore_files:
        t0 = time.perf_counter()
        comp = self.compressor.compress_file(str(fpath), "smart")
        total_lat += (time.perf_counter() - t0) * 1000
        # Fall back to the uncompressed size when no compressed count is reported.
        total_compressed += int(comp.get("compressed_tokens", count_tokens(content)))
        if "error" not in comp:
            successes += 1
    quality = round(successes / explore_count * 100, 1)
    result.steps_c3.append(StepResult("compress_files", "c3_compress", total_compressed, total_lat, quality))

    # Step 3: surgical read of key symbols from top 2 files
    surgical_tokens = 0
    surgical_lat = 0
    for fpath, content, _ in explore_files[:2]:
        rel = self._rel(fpath)
        t0 = time.perf_counter()
        rec = self.file_memory.get(rel)
        if not rec or self.file_memory.needs_update(rel):
            rec = self.file_memory.update(rel)
        target = []
        if rec and rec.get("sections"):
            target = [s for s in rec["sections"] if s.get("type") in ("class", "function")][:2]
        if target:
            lines = content.splitlines()
            for s in target:
                extracted = "\n".join(lines[s["line_start"] - 1:s["line_end"]])
                surgical_tokens += count_tokens(extracted)
        else:
            # No usable section (none recorded, or none of a readable type):
            # the whole file has to be read. Without this fallback such a
            # file would be charged zero tokens.
            surgical_tokens += count_tokens(content)
        surgical_lat += (time.perf_counter() - t0) * 1000
    result.steps_c3.append(StepResult("read_symbols", "c3_read", surgical_tokens, surgical_lat, 100.0))

    # ── WITHOUT C3 ──

    # Step 1: grep for related files + read them all
    t0 = time.perf_counter()
    needle = feature_stem.lower()
    base_context = []
    for _, content, _ in self.files[:30]:
        if needle in content.lower():
            base_context.append(content)
            if len(base_context) >= 5:
                break
    if not base_context:
        base_context = [f[1] for f in explore_files[:5]]
    lat = (time.perf_counter() - t0) * 1000
    result.steps_baseline.append(StepResult("grep_discover", "native", count_tokens("\n\n".join(base_context)), lat, 100.0))

    # Step 2: read all files fully
    t0 = time.perf_counter()
    full_tokens = 0
    for fpath, content, _ in explore_files:
        # The read is performed only so its cost is part of the latency;
        # tokens are counted from the content already held in memory.
        fpath.read_text(encoding="utf-8", errors="replace")
        full_tokens += count_tokens(content)
    lat = (time.perf_counter() - t0) * 1000
    result.steps_baseline.append(StepResult("read_all_files", "native", full_tokens, lat, 100.0))

    # Step 3: re-read for symbol understanding (no compression)
    result.steps_baseline.append(StepResult("reread_for_symbols", "native", full_tokens, 0.1, 100.0))

    return result
470
+
471
def _scenario_code_review(self) -> ScenarioResult:
    """Simulate: list changed files → compress each → read flagged sections → validate.

    Up to six sampled files stand in for the change set. The C3 path records
    four steps: compress, surgical read, cold validation (which also fills
    the validation cache), and cached re-validation. The baseline path
    records four full or partial re-reads of the same files.
    """
    result = ScenarioResult(
        name="code_review",
        description="Review changed files by compressing them, reading flagged sections, and validating syntax.",
    )
    review_count = min(6, len(self.sample))
    if review_count < 2:
        return result

    review_files = self.sample[:review_count]
    c3_steps = result.steps_c3
    baseline_steps = result.steps_baseline

    # ── WITH C3 ──

    # Step 1: compress all files under review
    compressed_total = 0
    compress_ms = 0
    for path, text, _ in review_files:
        started = time.perf_counter()
        comp = self.compressor.compress_file(str(path), "smart")
        compress_ms += (time.perf_counter() - started) * 1000
        compressed_total += int(comp.get("compressed_tokens", count_tokens(text)))
    c3_steps.append(StepResult("compress_review", "c3_compress", compressed_total, compress_ms, 100.0))

    # Step 2: surgical read of flagged sections (top 3 files, 1 symbol each)
    read_tokens = 0
    read_ms = 0
    for path, text, _ in review_files[:3]:
        rel = self._rel(path)
        started = time.perf_counter()
        rec = self.file_memory.get(rel)
        if not rec or self.file_memory.needs_update(rel):
            rec = self.file_memory.update(rel)
        flagged = None
        if rec and rec.get("sections"):
            candidates = [s for s in rec["sections"] if s.get("type") in ("class", "function", "method")]
            if candidates:
                flagged = candidates[0]
        if flagged is None:
            # Nothing to narrow to: the full file is read.
            read_tokens += count_tokens(text)
        else:
            lines = text.splitlines()
            snippet = "\n".join(lines[flagged["line_start"] - 1:flagged["line_end"]])
            read_tokens += count_tokens(snippet)
        read_ms += (time.perf_counter() - started) * 1000
    c3_steps.append(StepResult("read_flagged", "c3_read", read_tokens, read_ms, 100.0))

    # Step 3: validate all files (cold — native parser, populates cache)
    val_tokens = 0
    val_ms = 0
    val_ok = 0
    for path, text, _ in review_files:
        ext = path.suffix.lower()
        started = time.perf_counter()
        native_result = check_syntax_native(text, ext)
        val_ms += (time.perf_counter() - started) * 1000
        status = native_result.get("status", "")
        if status == "clean":
            msg = "ok"
        else:
            msg = f"{status}: {native_result.get('detail', '')}"[:60]
        val_tokens += count_tokens(msg)
        val_ok += 1
        # Populate cache for the next step; failures here are ignored.
        try:
            rel = self._rel(path)
            st = path.stat()
            self.validation_cache.put(rel, native_result, st.st_mtime, st.st_size)
        except Exception:
            pass
    cold_quality = round(val_ok / review_count * 100, 1)
    c3_steps.append(StepResult("validate_cold", "c3_validate", val_tokens, val_ms, cold_quality))

    # Step 4: re-validate same files (warm — cache hit, near-zero latency)
    cache_tokens = 0
    cache_ms = 0
    cache_hits = 0
    for path, _text, _ in review_files:
        rel = self._rel(path)
        started = time.perf_counter()
        cached = self.validation_cache.get(rel)
        cache_ms += (time.perf_counter() - started) * 1000
        if cached is None:
            cache_tokens += count_tokens("miss")
            continue
        status = cached.get("status", "")
        msg = "ok" if status == "clean" else f"{status}"[:30]
        cache_tokens += count_tokens(msg)
        cache_hits += 1
    warm_quality = round(cache_hits / review_count * 100, 1) if review_count else 0
    c3_steps.append(StepResult("validate_cached", "c3_validate", cache_tokens, cache_ms, warm_quality))

    # ── WITHOUT C3 ──

    # Step 1: read all files fully
    started = time.perf_counter()
    full_tokens = sum(count_tokens(text) for _, text, _ in review_files)
    full_ms = (time.perf_counter() - started) * 1000
    baseline_steps.append(StepResult("read_all_review", "native", full_tokens, full_ms, 100.0))

    # Step 2: re-read flagged files fully
    flagged_tokens = sum(count_tokens(text) for _, text, _ in review_files[:3])
    baseline_steps.append(StepResult("read_flagged_full", "native", flagged_tokens, 0.1, 100.0))

    # Step 3: re-read for "visual validation" (no syntax checker)
    baseline_steps.append(StepResult("visual_validate", "native", full_tokens, 0.1, 80.0))

    # Step 4: re-read again for second validation pass (no cache — full re-read)
    baseline_steps.append(StepResult("re_validate_full", "native", full_tokens, 0.1, 80.0))

    return result
579
+
580
def _scenario_log_diagnosis(self) -> ScenarioResult:
    """Simulate: read log → filter noise → extract errors → search for related code.

    Works on the log and terminal fixtures created by ``_build_fixtures``.
    The C3 path filters both fixtures and runs one index search. The
    baseline path reads both fixtures in full and greps the first 20
    collected files for error-related code.

    Filter quality is the share of the fixture's signal keywords still
    present after filtering.
    """
    result = ScenarioResult(
        name="log_diagnosis",
        description="Diagnose errors from a log file by filtering noise, extracting key signals, and searching for related code.",
    )

    log_path = Path(self.fixtures["log_path"])
    log_text = log_path.read_text(encoding="utf-8", errors="replace")
    log_tokens = count_tokens(log_text)

    terminal_path = Path(self.fixtures["terminal_path"])
    terminal_text = terminal_path.read_text(encoding="utf-8", errors="replace")

    log_signals = self.fixtures["log_signals"]
    terminal_signals = self.fixtures["terminal_signals"]

    # ── WITH C3 ──

    # Imported lazily, but outside the timed region, so the one-off module
    # import cost is not billed to the filter step's latency.
    from cli.c3 import _benchmark_extract_preview

    # Step 1: c3_filter the log file
    t0 = time.perf_counter()
    log_extract = _benchmark_extract_preview(log_path, self.compressor)
    lat = (time.perf_counter() - t0) * 1000
    extract_tokens = count_tokens(log_extract)
    signal_hits = sum(1 for sig in log_signals if sig in log_extract)
    signal_quality = round(signal_hits / len(log_signals) * 100, 1) if log_signals else 100.0
    result.steps_c3.append(StepResult("filter_log", "c3_filter", extract_tokens, lat, signal_quality))

    # Step 2: c3_filter terminal output
    t0 = time.perf_counter()
    filter_result = self.output_filter.filter(terminal_text, use_llm=False)
    lat = (time.perf_counter() - t0) * 1000
    filtered_tokens = filter_result.get("filtered_tokens", count_tokens(terminal_text))
    filtered_text = filter_result.get("filtered", "")
    term_signals = sum(1 for sig in terminal_signals if sig in filtered_text)
    term_quality = round(term_signals / len(terminal_signals) * 100, 1) if terminal_signals else 100.0
    result.steps_c3.append(StepResult("filter_terminal", "c3_filter", filtered_tokens, lat, term_quality))

    # Step 3: c3_search for error-related code
    t0 = time.perf_counter()
    context = self.indexer.get_context("error handling RuntimeError", top_k=3, max_tokens=1500)
    lat = (time.perf_counter() - t0) * 1000
    result.steps_c3.append(StepResult("search_error_code", "c3_search", count_tokens(context), lat, 100.0))

    # ── WITHOUT C3 ──

    # Step 1: read full log (re-read only so the read is timed)
    t0 = time.perf_counter()
    log_path.read_text(encoding="utf-8", errors="replace")
    lat = (time.perf_counter() - t0) * 1000
    result.steps_baseline.append(StepResult("read_full_log", "native", log_tokens, lat, 100.0))

    # Step 2: read full terminal output (re-read only so the read is timed)
    t0 = time.perf_counter()
    terminal_path.read_text(encoding="utf-8", errors="replace")
    lat = (time.perf_counter() - t0) * 1000
    result.steps_baseline.append(StepResult("read_full_terminal", "native", count_tokens(terminal_text), lat, 100.0))

    # Step 3: grep + read full files for error context
    t0 = time.perf_counter()
    grep_context = []
    for _, content, _ in self.files[:20]:
        low = content.lower()
        if "error" in low or "exception" in low:
            grep_context.append(content)
            if len(grep_context) >= 3:
                break
    lat = (time.perf_counter() - t0) * 1000
    base_tokens = count_tokens("\n\n".join(grep_context)) if grep_context else 0
    result.steps_baseline.append(StepResult("grep_error_code", "native", base_tokens, lat, 100.0))

    return result
648
+
649
def _scenario_refactor_planning(self) -> ScenarioResult:
    """Simulate planning a refactor of the first sampled file.

    C3 path: search the index for callers, compress the next (up to three)
    sampled files, read up to three class/function sections of the target,
    then build the target's map.  Baseline path: scan the first 30 project
    files for the target's stem, read the caller files in full, and read the
    target in full twice.

    Returns an empty ScenarioResult when fewer than three files were sampled.
    """
    result = ScenarioResult(
        name="refactor_planning",
        description="Plan a refactor by searching for usage patterns, compressing callers, reading implementations, mapping impact.",
    )
    if len(self.sample) < 3:
        return result

    target_path, target_content, _ = self.sample[0]
    target_rel = self._rel(target_path)
    query = f"functions that call {target_path.stem}"
    impact_files = self.sample[1:4]

    # ── WITH C3 ──

    # Step 1: search for callers
    started = time.perf_counter()
    caller_context = self.indexer.get_context(query, top_k=5, max_tokens=2000)
    elapsed_ms = (time.perf_counter() - started) * 1000
    result.steps_c3.append(
        StepResult("search_callers", "c3_search", count_tokens(caller_context), elapsed_ms, 100.0)
    )

    # Step 2: compress caller files (only the compress call itself is timed)
    caller_tokens = 0
    caller_ms = 0
    for caller_path, caller_text, _ in impact_files:
        started = time.perf_counter()
        compressed = self.compressor.compress_file(str(caller_path), "smart")
        caller_ms += (time.perf_counter() - started) * 1000
        caller_tokens += int(compressed.get("compressed_tokens", count_tokens(caller_text)))
    result.steps_c3.append(
        StepResult("compress_callers", "c3_compress", caller_tokens, caller_ms, 100.0)
    )

    # Step 3: surgical read of target implementation
    started = time.perf_counter()
    record = self.file_memory.get(target_rel)
    if not record or self.file_memory.needs_update(target_rel):
        record = self.file_memory.update(target_rel)
    impl_tokens = 0
    if record and record.get("sections"):
        code_sections = [
            sec for sec in record["sections"] if sec.get("type") in ("class", "function")
        ][:3]
        source_lines = target_content.splitlines()
        for sec in code_sections:
            # line_start is used as 1-based, line_end as inclusive.
            snippet = "\n".join(source_lines[sec["line_start"] - 1:sec["line_end"]])
            impl_tokens += count_tokens(snippet)
    if not impl_tokens:
        # Nothing extracted: count the whole file instead.
        impl_tokens = count_tokens(target_content)
    elapsed_ms = (time.perf_counter() - started) * 1000
    result.steps_c3.append(
        StepResult("read_implementation", "c3_read", impl_tokens, elapsed_ms, 100.0)
    )

    # Step 4: compress target for impact map
    started = time.perf_counter()
    map_text = self.file_memory.get_or_build_map(target_rel)
    elapsed_ms = (time.perf_counter() - started) * 1000
    result.steps_c3.append(
        StepResult("map_impact", "c3_compress(map)", count_tokens(map_text), elapsed_ms, 100.0)
    )

    # ── WITHOUT C3 ──

    # Step 1: grep for callers + read full files
    started = time.perf_counter()
    base_context = []
    for _path, text, _ in self.files[:30]:
        if target_path.stem.lower() not in text.lower():
            continue
        base_context.append(text)
        if len(base_context) >= 5:
            break
    elapsed_ms = (time.perf_counter() - started) * 1000
    result.steps_baseline.append(
        StepResult("grep_callers", "native", count_tokens("\n\n".join(base_context)), elapsed_ms, 100.0)
    )

    # Step 2: read all caller files fully (fixed nominal latency)
    full_tokens = sum(count_tokens(text) for _, text, _ in impact_files)
    result.steps_baseline.append(StepResult("read_all_callers", "native", full_tokens, 0.1, 100.0))

    # Step 3: read target fully
    result.steps_baseline.append(
        StepResult("read_full_target", "native", count_tokens(target_content), 0.1, 100.0)
    )

    # Step 4: re-read target for impact assessment
    result.steps_baseline.append(
        StepResult("reread_for_impact", "native", count_tokens(target_content), 0.1, 100.0)
    )

    return result
728
+
729
def _scenario_onboarding(self) -> ScenarioResult:
    """Simulate a new contributor exploring the project, with and without C3.

    C3 path: search the index for project structure, compress up to eight
    sampled files, read up to three class/function sections from each of the
    first two files, then search for configuration patterns.  Baseline path:
    read README.md, read every explored file in full, re-read the first two,
    then scan the first 30 project files for "config"/"setting".

    Returns an empty ScenarioResult when fewer than two files were sampled.
    """
    result = ScenarioResult(
        name="onboarding",
        description="New contributor explores project structure, compresses key files, reads entry points, searches for patterns.",
    )
    explore_count = min(8, len(self.sample))
    if explore_count < 2:
        return result

    explore_files = self.sample[:explore_count]

    # ── WITH C3 ──

    # Step 1: c3_search for project structure
    t0 = time.perf_counter()
    context = self.indexer.get_context("main entry point project structure", top_k=5, max_tokens=2000)
    lat = (time.perf_counter() - t0) * 1000
    result.steps_c3.append(StepResult("search_structure", "c3_search", count_tokens(context), lat, 100.0))

    # Step 2: compress all key files (only the compress call itself is timed)
    total_compressed = 0
    total_lat = 0
    for fpath, content, _ in explore_files:
        t0 = time.perf_counter()
        comp = self.compressor.compress_file(str(fpath), "smart")
        total_lat += (time.perf_counter() - t0) * 1000
        total_compressed += int(comp.get("compressed_tokens", count_tokens(content)))
    result.steps_c3.append(StepResult("compress_key_files", "c3_compress", total_compressed, total_lat, 100.0))

    # Step 3: surgical read of entry points (top 2 files)
    entry_tokens = 0
    entry_lat = 0
    for fpath, content, _ in explore_files[:2]:
        rel = self._rel(fpath)
        t0 = time.perf_counter()
        rec = self.file_memory.get(rel)
        if not rec or self.file_memory.needs_update(rel):
            rec = self.file_memory.update(rel)
        file_tokens = 0
        if rec and rec.get("sections"):
            target = [s for s in rec["sections"] if s.get("type") in ("class", "function")][:3]
            lines = content.splitlines()
            for s in target:
                file_tokens += count_tokens("\n".join(lines[s["line_start"] - 1:s["line_end"]]))
        if not file_tokens:
            # No class/function section was extracted (no record, no sections,
            # or only other section types): charge the whole file, matching
            # the fallback in _scenario_refactor_planning, rather than
            # reporting a zero-token read.
            file_tokens = count_tokens(content)
        entry_tokens += file_tokens
        entry_lat += (time.perf_counter() - t0) * 1000
    result.steps_c3.append(StepResult("read_entry_points", "c3_read", entry_tokens, entry_lat, 100.0))

    # Step 4: search for common patterns
    t0 = time.perf_counter()
    ctx2 = self.indexer.get_context("configuration and settings", top_k=3, max_tokens=1500)
    lat = (time.perf_counter() - t0) * 1000
    result.steps_c3.append(StepResult("search_patterns", "c3_search", count_tokens(ctx2), lat, 100.0))

    # ── WITHOUT C3 ──

    # Step 1: read project root listing + README (simulate)
    t0 = time.perf_counter()
    readme_path = self.project_path / "README.md"
    readme_tokens = 0
    if readme_path.exists():
        readme_tokens = count_tokens(readme_path.read_text(encoding="utf-8", errors="replace"))
    lat = (time.perf_counter() - t0) * 1000
    # A floor of 200 tokens is charged even when the README is missing or tiny.
    result.steps_baseline.append(StepResult("read_readme", "native", max(readme_tokens, 200), lat, 100.0))

    # Step 2: read all key files fully (the read result is discarded; it is
    # performed so the measured latency includes the file I/O)
    t0 = time.perf_counter()
    full_tokens = 0
    for fpath, content, _ in explore_files:
        fpath.read_text(encoding="utf-8", errors="replace")
        full_tokens += count_tokens(content)
    lat = (time.perf_counter() - t0) * 1000
    result.steps_baseline.append(StepResult("read_all_key_files", "native", full_tokens, lat, 100.0))

    # Step 3: re-read entry files for understanding (fixed nominal latency)
    entry_full = sum(count_tokens(c) for _, c, _ in explore_files[:2])
    result.steps_baseline.append(StepResult("reread_entries", "native", entry_full, 0.1, 100.0))

    # Step 4: grep for config patterns + read results
    t0 = time.perf_counter()
    config_context = []
    for _fpath, content, _ in self.files[:30]:
        if "config" in content.lower() or "setting" in content.lower():
            config_context.append(content)
            if len(config_context) >= 3:
                break
    lat = (time.perf_counter() - t0) * 1000
    result.steps_baseline.append(StepResult("grep_patterns", "native", count_tokens("\n\n".join(config_context)), lat, 100.0))

    return result
820
+
821
+
822
+ # ─── Report Generation ────────────────────────────────────────
823
+
824
def generate_report(project_path: str, scenarios: list, sample_size: int,
                    file_count: int, sampled_files: Optional[list] = None) -> dict:
    """Generate the full JSON report from scenario results.

    Args:
        project_path: Project root; used to relativize sampled file paths.
        scenarios: Scenario results.  Each must expose ``name``, ``steps_c3``,
            ``steps_baseline``, the ``total_tokens_*`` / ``total_latency_*`` /
            ``avg_quality_*`` attributes and ``to_dict()``.
        sample_size: Number of files sampled (copied into the report).
        file_count: Number of files considered (copied into the report).
        sampled_files: Optional iterable of ``(path, content, token_count)``
            tuples describing the sampled files.

    Returns:
        A dict with the scorecard, session-longevity projection, cumulative
        timeline, per-scenario data, tool contributions, cost estimates,
        sampled-file info and performance timing.
    """
    total_c3 = sum(s.total_tokens_c3 for s in scenarios)
    total_base = sum(s.total_tokens_baseline for s in scenarios)
    total_lat_c3 = sum(s.total_latency_c3 for s in scenarios)
    total_lat_base = sum(s.total_latency_baseline for s in scenarios)

    savings_pct = round((total_base - total_c3) / total_base * 100, 1) if total_base else 0.0
    budget_mult = round(total_base / total_c3, 2) if total_c3 else 0.0

    avg_quality_c3 = round(sum(s.avg_quality_c3 for s in scenarios) / len(scenarios), 1) if scenarios else 0.0
    avg_quality_base = round(sum(s.avg_quality_baseline for s in scenarios) / len(scenarios), 1) if scenarios else 0.0

    # Session longevity projection: each scenario is treated as one "turn".
    context_limit = 200_000
    avg_tokens_per_turn_c3 = total_c3 / len(scenarios) if scenarios else 1
    avg_tokens_per_turn_base = total_base / len(scenarios) if scenarios else 1
    turns_c3 = round(context_limit / avg_tokens_per_turn_c3, 1) if avg_tokens_per_turn_c3 else 0
    turns_base = round(context_limit / avg_tokens_per_turn_base, 1) if avg_tokens_per_turn_base else 0

    # Cumulative token timeline — always at least 30 points, extend to show C3 range
    max_turn = max(30, int(turns_c3) + 5, int(turns_base) + 5)
    timeline = [
        {
            "turn": turn,
            "cumulative_c3": round(avg_tokens_per_turn_c3 * turn),
            # The baseline curve is clipped at 1.5x the context limit.
            "cumulative_baseline": round(min(avg_tokens_per_turn_base * turn, context_limit * 1.5)),
        }
        for turn in range(1, max_turn + 1)
    ]

    # Tool contribution heatmap — per-scenario breakdown
    tool_contributions = {}
    tool_scenario_matrix = {}  # tool -> {scenario_name: tokens_saved}
    for s in scenarios:
        # C3 step i is compared with baseline step i.  enumerate() supplies the
        # position directly; looking it up with list.index() would return the
        # first *equal* step and rescan the list for every step.
        for idx, step in enumerate(s.steps_c3):
            tool = step.tool
            contrib = tool_contributions.setdefault(tool, {"total_tokens_saved": 0, "scenarios": []})
            per_scenario = tool_scenario_matrix.setdefault(tool, {})
            if idx >= len(s.steps_baseline):
                continue  # no baseline counterpart to compare against
            saved = max(0, s.steps_baseline[idx].tokens - step.tokens)
            contrib["total_tokens_saved"] += saved
            per_scenario[s.name] = per_scenario.get(s.name, 0) + saved
            if s.name not in contrib["scenarios"]:
                contrib["scenarios"].append(s.name)

    # Cost estimation: saved tokens are priced at each profile's input rate.
    cost_profiles = {
        "sonnet_4": {"label": "Claude Sonnet 4", "input_per_mtok": 3.0, "output_per_mtok": 15.0},
        "opus_4": {"label": "Claude Opus 4", "input_per_mtok": 15.0, "output_per_mtok": 75.0},
        "gpt4o": {"label": "GPT-4o", "input_per_mtok": 2.5, "output_per_mtok": 10.0},
    }
    cost_estimates = {}
    tokens_saved = total_base - total_c3
    for key, profile in cost_profiles.items():
        saved_cost_per_session = (tokens_saved / 1_000_000) * profile["input_per_mtok"]
        cost_estimates[key] = {
            "label": profile["label"],
            "saved_per_session": round(saved_cost_per_session, 4),
            "saved_per_day_5_sessions": round(saved_cost_per_session * 5, 3),
            # 5 sessions per day over 22 days per month.
            "saved_per_month": round(saved_cost_per_session * 5 * 22, 2),
        }

    # Sampled files info
    files_info = []
    if sampled_files:
        pp = Path(project_path).resolve()
        for fpath, _content, tok_count in sampled_files:
            try:
                rel = str(Path(fpath).relative_to(pp)).replace("\\", "/")
            except ValueError:
                # Path is not under the project root: report it unchanged.
                rel = str(fpath)
            files_info.append({
                "path": rel,
                "tokens": tok_count,
                "extension": Path(fpath).suffix.lower(),
            })

    return {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "project_path": str(project_path),
        "files_considered": file_count,
        "sample_size": sample_size,
        "scorecard": {
            "total_tokens_c3": total_c3,
            "total_tokens_baseline": total_base,
            "token_savings_pct": savings_pct,
            "budget_multiplier": budget_mult,
            "total_latency_c3_ms": round(total_lat_c3, 2),
            "total_latency_baseline_ms": round(total_lat_base, 2),
            "avg_quality_c3": avg_quality_c3,
            "avg_quality_baseline": avg_quality_base,
            "quality_delta": round(avg_quality_c3 - avg_quality_base, 1),
        },
        "session_longevity": {
            "context_limit": context_limit,
            "avg_tokens_per_turn_c3": round(avg_tokens_per_turn_c3),
            "avg_tokens_per_turn_baseline": round(avg_tokens_per_turn_base),
            "estimated_turns_c3": turns_c3,
            "estimated_turns_baseline": turns_base,
            "turn_multiplier": round(turns_c3 / turns_base, 2) if turns_base else 0.0,
        },
        "timeline": timeline,
        "scenarios": [s.to_dict() for s in scenarios],
        "tool_contributions": tool_contributions,
        "tool_scenario_matrix": tool_scenario_matrix,
        "cost_estimates": cost_estimates,
        "sampled_files": files_info,
        "performance_timing": _build_performance_timing(scenarios, total_lat_c3, total_lat_base),
    }
936
+
937
+
938
def _build_performance_timing(scenarios: list, total_local_c3_ms: float, total_local_base_ms: float) -> dict:
    """Build per-profile end-to-end timing estimates for the report.

    For every entry in PERF_PROFILES, each scenario's measured local latency is
    added to the value returned by ``_estimate_turn_time_ms`` for its token
    total, once for the C3 path and once for the baseline path.

    Args:
        scenarios: Scenario results exposing ``name``, ``total_tokens_*`` and
            ``total_latency_*``.
        total_local_c3_ms: Summed local C3 latency, echoed in every profile.
        total_local_base_ms: Summed local baseline latency, echoed likewise.

    Returns:
        ``{"profiles": {...}, "note": "..."}``.
    """

    def _saved_pct(base_ms, c3_ms):
        # Percentage of baseline time saved; 0 when there is no baseline time.
        if not base_ms:
            return 0
        return round((base_ms - c3_ms) / base_ms * 100, 1)

    profiles = {}
    for profile_key, profile in PERF_PROFILES.items():
        breakdown = []
        sum_c3 = 0
        sum_base = 0
        for scenario in scenarios:
            infer_c3 = _estimate_turn_time_ms(scenario.total_tokens_c3, profile)
            infer_base = _estimate_turn_time_ms(scenario.total_tokens_baseline, profile)
            # End-to-end = measured local latency + estimated turn time
            turn_c3 = scenario.total_latency_c3 + infer_c3
            turn_base = scenario.total_latency_baseline + infer_base
            sum_c3 += turn_c3
            sum_base += turn_base
            breakdown.append({
                "name": scenario.name,
                "e2e_c3_ms": round(turn_c3, 1),
                "e2e_baseline_ms": round(turn_base, 1),
                "inference_c3_ms": round(infer_c3, 1),
                "inference_baseline_ms": round(infer_base, 1),
                "time_saved_pct": _saved_pct(turn_base, turn_c3),
            })

        if sum_c3:
            speedup = round(sum_base / sum_c3, 2)
        else:
            speedup = 0

        profiles[profile_key] = {
            "label": profile["label"],
            "total_e2e_c3_ms": round(sum_c3, 1),
            "total_e2e_baseline_ms": round(sum_base, 1),
            "time_saved_pct": _saved_pct(sum_base, sum_c3),
            "speedup": speedup,
            "local_overhead_c3_ms": round(total_local_c3_ms, 1),
            "local_overhead_baseline_ms": round(total_local_base_ms, 1),
            "per_scenario": breakdown,
        }

    return {"profiles": profiles, "note": "End-to-end = local processing + model inference + network overhead. Model inference time scales with input token count."}
978
+
979
+
980
+ def _humanize(name: str) -> str:
981
+ """Convert snake_case to Title Case."""
982
+ return name.replace("_", " ").title()
983
+
984
+
985
+ # ─── Benchmark History ─────────────────────────────────────────────
986
+
987
+
988
def load_session_benchmark_history(project_path: str) -> list:
    """Load all saved session benchmark runs, sorted by timestamp ascending.

    Reads every ``session_*.json`` file under
    ``<project_path>/.c3/session_benchmark/runs``.  Loading is best-effort:
    files that cannot be read or parsed, and files whose top-level JSON value
    is not an object, are skipped.

    Args:
        project_path: Project root directory.

    Returns:
        A list of report dicts ordered by their ``"timestamp"`` value; reports
        with a missing or null timestamp sort first.  Empty when the runs
        directory does not exist.
    """
    runs_dir = Path(project_path).resolve() / ".c3" / "session_benchmark" / "runs"
    if not runs_dir.exists():
        return []
    reports = []
    # Sorted by file name so reports with equal timestamps keep a stable order.
    for run_file in sorted(runs_dir.glob("session_*.json")):
        try:
            payload = json.loads(run_file.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable file, bad encoding or malformed JSON: skip this run.
            continue
        if isinstance(payload, dict):
            reports.append(payload)
    # Coerce to str so a null or non-string timestamp cannot break the sort.
    reports.sort(key=lambda r: str(r.get("timestamp") or ""))
    return reports
1001
+
1002
+
1003
+ def _build_history_data(history: list) -> dict:
1004
+ """Extract trend data from a list of benchmark reports."""
1005
+ if not history:
1006
+ return {}
1007
+ labels = []
1008
+ savings = []
1009
+ multipliers = []
1010
+ turns_c3 = []
1011
+ turns_base = []
1012
+ quality_c3 = []
1013
+ latency_c3 = []
1014
+ # Per-scenario savings over time
1015
+ scenario_trends: dict = {} # scenario_name -> list of savings%
1016
+
1017
+ for r in history:
1018
+ ts = r.get("timestamp", "")
1019
+ # Short label: "Mar 11 14:30" format
1020
+ try:
1021
+ dt = time.strptime(ts[:19], "%Y-%m-%dT%H:%M:%S")
1022
+ label = time.strftime("%b %d %H:%M", dt)
1023
+ except Exception:
1024
+ label = ts[:16]
1025
+ labels.append(label)
1026
+
1027
+ sc = r.get("scorecard", {})
1028
+ savings.append(sc.get("token_savings_pct", 0))
1029
+ multipliers.append(sc.get("budget_multiplier", 0))
1030
+ quality_c3.append(sc.get("avg_quality_c3", 0))
1031
+ latency_c3.append(sc.get("total_latency_c3_ms", 0))
1032
+
1033
+ lon = r.get("session_longevity", {})
1034
+ turns_c3.append(lon.get("estimated_turns_c3", 0))
1035
+ turns_base.append(lon.get("estimated_turns_baseline", 0))
1036
+
1037
+ for s in r.get("scenarios", []):
1038
+ name = s.get("name", "")
1039
+ if name not in scenario_trends:
1040
+ scenario_trends[name] = []
1041
+ scenario_trends[name].append(s.get("token_savings_pct", 0))
1042
+
1043
+ return {
1044
+ "labels": labels,
1045
+ "savings": savings,
1046
+ "multipliers": multipliers,
1047
+ "turns_c3": turns_c3,
1048
+ "turns_base": turns_base,
1049
+ "quality_c3": quality_c3,
1050
+ "latency_c3": latency_c3,
1051
+ "scenario_trends": scenario_trends,
1052
+ "run_count": len(history),
1053
+ }
1054
+
1055
+
1056
+ def _render_history_section(hist: dict) -> str:
1057
+ """Render the HTML section for benchmark history trends."""
1058
+ if not hist or hist.get("run_count", 0) < 2:
1059
+ return '<div class="info-section"><p style="color:var(--text-dim)">Run the benchmark at least twice to see trend data here. Previous runs are saved automatically.</p></div>'
1060
+
1061
+ n = hist["run_count"]
1062
+ latest_savings = hist["savings"][-1] if hist["savings"] else 0
1063
+ first_savings = hist["savings"][0] if hist["savings"] else 0
1064
+ delta = round(latest_savings - first_savings, 1)
1065
+ delta_sign = "+" if delta >= 0 else ""
1066
+ latest_mult = hist["multipliers"][-1] if hist["multipliers"] else 0
1067
+
1068
+ # History summary table
1069
+ rows = ""
1070
+ for i in range(n):
1071
+ label = hist["labels"][i]
1072
+ sav = hist["savings"][i]
1073
+ mult = hist["multipliers"][i]
1074
+ qual = hist["quality_c3"][i]
1075
+ turns = hist["turns_c3"][i]
1076
+ lat = hist["latency_c3"][i]
1077
+ rows += f'<tr><td>{html.escape(label)}</td><td style="text-align:right">{sav:.1f}%</td><td style="text-align:right">{mult:.2f}x</td><td style="text-align:right">{qual:.1f}%</td><td style="text-align:right">{turns:.1f}</td><td style="text-align:right">{lat:.0f}ms</td></tr>'
1078
+
1079
+ return f"""
1080
+ <div class="info-section">
1081
+ <div class="stat-grid" style="margin-bottom:1.5rem">
1082
+ <div class="stat-card"><div class="stat-label">Total Runs</div><div class="stat-value">{n}</div></div>
1083
+ <div class="stat-card"><div class="stat-label">Latest Savings</div><div class="stat-value">{latest_savings:.1f}%</div></div>
1084
+ <div class="stat-card"><div class="stat-label">Trend (vs First)</div><div class="stat-value" style="color:{'var(--ok)' if delta >= 0 else 'var(--warn)'}">{delta_sign}{delta}%</div></div>
1085
+ <div class="stat-card"><div class="stat-label">Latest Multiplier</div><div class="stat-value">{latest_mult:.2f}x</div></div>
1086
+ </div>
1087
+ <div class="chart-row"><div class="chart-box"><h3>Token Savings Over Time</h3><canvas id="historySavingsChart"></canvas></div><div class="chart-box"><h3>Budget Multiplier Over Time</h3><canvas id="historyMultChart"></canvas></div></div>
1088
+ <div class="chart-row"><div class="chart-box"><h3>Session Turns Over Time</h3><canvas id="historyTurnsChart"></canvas></div><div class="chart-box"><h3>Per-Scenario Savings Trend</h3><canvas id="historyScenarioChart"></canvas></div></div>
1089
+ <h3 class="collapsible-toggle" onclick="this.classList.toggle('open'); this.nextElementSibling.classList.toggle('open')">Run History Table ({n} runs)</h3>
1090
+ <div class="collapsible-content">
1091
+ <table class="files-table"><thead><tr><th>Run</th><th style="text-align:right">Savings</th><th style="text-align:right">Multiplier</th><th style="text-align:right">Quality</th><th style="text-align:right">Turns (C3)</th><th style="text-align:right">Latency</th></tr></thead><tbody>{rows}</tbody></table>
1092
+ </div>
1093
+ </div>"""
1094
+
1095
+
1096
+ def _render_history_charts_js(hist: dict) -> str:
1097
+ """Render Chart.js code for benchmark history trends."""
1098
+ if not hist or hist.get("run_count", 0) < 2:
1099
+ return ""
1100
+
1101
+ labels = json.dumps(hist["labels"])
1102
+ savings = json.dumps(hist["savings"])
1103
+ multipliers = json.dumps(hist["multipliers"])
1104
+ turns_c3 = json.dumps(hist["turns_c3"])
1105
+ turns_base = json.dumps(hist["turns_base"])
1106
+
1107
+ # Per-scenario trend datasets
1108
+ colors = ['#818cf8', '#34d399', '#fbbf24', '#f87171', '#a78bfa', '#38bdf8', '#fb923c', '#e879f9']
1109
+ scenario_datasets = []
1110
+ for i, (name, values) in enumerate(hist.get("scenario_trends", {}).items()):
1111
+ color = colors[i % len(colors)]
1112
+ scenario_datasets.append({
1113
+ "label": _humanize(name),
1114
+ "data": values,
1115
+ "borderColor": color,
1116
+ "backgroundColor": color + "33",
1117
+ "tension": 0.3,
1118
+ "fill": False,
1119
+ })
1120
+ scenario_datasets_json = json.dumps(scenario_datasets)
1121
+
1122
+ return f"""
1123
+ // ── History Charts ──
1124
+ new Chart(document.getElementById('historySavingsChart'), {{
1125
+ type: 'line',
1126
+ data: {{
1127
+ labels: {labels},
1128
+ datasets: [{{
1129
+ label: 'Token Savings %',
1130
+ data: {savings},
1131
+ borderColor: '#818cf8',
1132
+ backgroundColor: '#818cf833',
1133
+ tension: 0.3,
1134
+ fill: true,
1135
+ pointRadius: 4,
1136
+ pointHoverRadius: 6,
1137
+ }}]
1138
+ }},
1139
+ options: {{ responsive: true, plugins: {{ legend: {{ display: false }} }}, scales: {{ y: {{ beginAtZero: true, max: 100, title: {{ display: true, text: '%' }} }} }} }}
1140
+ }});
1141
+
1142
+ new Chart(document.getElementById('historyMultChart'), {{
1143
+ type: 'line',
1144
+ data: {{
1145
+ labels: {labels},
1146
+ datasets: [{{
1147
+ label: 'Budget Multiplier',
1148
+ data: {multipliers},
1149
+ borderColor: '#34d399',
1150
+ backgroundColor: '#34d39933',
1151
+ tension: 0.3,
1152
+ fill: true,
1153
+ pointRadius: 4,
1154
+ pointHoverRadius: 6,
1155
+ }}]
1156
+ }},
1157
+ options: {{ responsive: true, plugins: {{ legend: {{ display: false }} }}, scales: {{ y: {{ beginAtZero: true, title: {{ display: true, text: 'x' }} }} }} }}
1158
+ }});
1159
+
1160
+ new Chart(document.getElementById('historyTurnsChart'), {{
1161
+ type: 'line',
1162
+ data: {{
1163
+ labels: {labels},
1164
+ datasets: [
1165
+ {{ label: 'With C3', data: {turns_c3}, borderColor: '#818cf8', tension: 0.3, pointRadius: 4 }},
1166
+ {{ label: 'Baseline', data: {turns_base}, borderColor: '#f87171', tension: 0.3, pointRadius: 4 }}
1167
+ ]
1168
+ }},
1169
+ options: {{ responsive: true, plugins: {{ legend: {{ position: 'bottom' }} }}, scales: {{ y: {{ beginAtZero: true, title: {{ display: true, text: 'Turns' }} }} }} }}
1170
+ }});
1171
+
1172
+ new Chart(document.getElementById('historyScenarioChart'), {{
1173
+ type: 'line',
1174
+ data: {{
1175
+ labels: {labels},
1176
+ datasets: {scenario_datasets_json}
1177
+ }},
1178
+ options: {{ responsive: true, plugins: {{ legend: {{ position: 'bottom' }} }}, scales: {{ y: {{ beginAtZero: true, max: 100, title: {{ display: true, text: 'Savings %' }} }} }} }}
1179
+ }});
1180
+ """
1181
+
1182
+
1183
+ def render_html(report: dict, history: Optional[list] = None) -> str:
1184
+ """Render a comprehensive visual HTML report with charts and detailed breakdowns."""
1185
+ sc = report["scorecard"]
1186
+ longevity = report["session_longevity"]
1187
+ scenarios = report["scenarios"]
1188
+ timeline = report["timeline"]
1189
+ tool_contribs = report.get("tool_contributions", {})
1190
+ tool_matrix = report.get("tool_scenario_matrix", {})
1191
+ cost_estimates = report.get("cost_estimates", {})
1192
+ sampled_files = report.get("sampled_files", [])
1193
+ perf_timing = report.get("performance_timing", {}).get("profiles", {})
1194
+ hist = _build_history_data(history or [])
1195
+
1196
+ def esc(v):
1197
+ return html.escape(str(v))
1198
+
1199
+ # ── Chart data ──
1200
+ timeline_labels = json.dumps([t["turn"] for t in timeline])
1201
+ timeline_c3 = json.dumps([t["cumulative_c3"] for t in timeline])
1202
+ timeline_base = json.dumps([t["cumulative_baseline"] for t in timeline])
1203
+
1204
+ scenario_names = json.dumps([_humanize(s["name"]) for s in scenarios])
1205
+ scenario_c3_tokens = json.dumps([s["total_tokens_c3"] for s in scenarios])
1206
+ scenario_base_tokens = json.dumps([s["total_tokens_baseline"] for s in scenarios])
1207
+ scenario_savings = json.dumps([s["token_savings_pct"] for s in scenarios])
1208
+ scenario_lat_c3 = json.dumps([round(s["total_latency_c3_ms"], 1) for s in scenarios])
1209
+ scenario_lat_base = json.dumps([round(s["total_latency_baseline_ms"], 1) for s in scenarios])
1210
+
1211
+ # Tool heatmap: stacked bar data — one dataset per tool, values per scenario
1212
+ all_tools = sorted(tool_matrix.keys())
1213
+ all_scenario_names = [s["name"] for s in scenarios]
1214
+ heatmap_datasets = []
1215
+ tool_colors = ['#818cf8', '#34d399', '#fbbf24', '#f87171', '#a78bfa', '#38bdf8', '#fb923c', '#e879f9']
1216
+ for i, tool in enumerate(all_tools):
1217
+ data = [tool_matrix.get(tool, {}).get(sn, 0) for sn in all_scenario_names]
1218
+ color = tool_colors[i % len(tool_colors)]
1219
+ heatmap_datasets.append({"label": tool, "data": data, "backgroundColor": color})
1220
+ heatmap_datasets_json = json.dumps(heatmap_datasets)
1221
+
1222
+ # Performance timing chart data
1223
+ perf_cards_html = ""
1224
+ perf_e2e_c3_data = {} # profile_key -> [ms per scenario]
1225
+ perf_e2e_base_data = {}
1226
+ for pkey, pdata in perf_timing.items():
1227
+ speedup = pdata.get("speedup", 0)
1228
+ saved_pct = pdata.get("time_saved_pct", 0)
1229
+ total_c3_s = pdata.get("total_e2e_c3_ms", 0) / 1000
1230
+ total_base_s = pdata.get("total_e2e_baseline_ms", 0) / 1000
1231
+ local_c3 = pdata.get("local_overhead_c3_ms", 0)
1232
+ local_base = pdata.get("local_overhead_baseline_ms", 0)
1233
+ perf_e2e_c3_data[pkey] = [s["e2e_c3_ms"] for s in pdata.get("per_scenario", [])]
1234
+ perf_e2e_base_data[pkey] = [s["e2e_baseline_ms"] for s in pdata.get("per_scenario", [])]
1235
+ perf_cards_html += f"""<div class="cost-card">
1236
+ <div class="cost-model">{esc(pdata['label'])}</div>
1237
+ <div class="cost-row"><span class="cost-label">Total E2E (C3)</span><span class="cost-val">{total_c3_s:.1f}s</span></div>
1238
+ <div class="cost-row"><span class="cost-label">Total E2E (Base)</span><span class="cost-val" style="color:var(--warn)">{total_base_s:.1f}s</span></div>
1239
+ <div class="cost-row"><span class="cost-label">Time Saved</span><span class="cost-val">{saved_pct}%</span></div>
1240
+ <div class="cost-row"><span class="cost-label">Speedup</span><span class="cost-val">{speedup}x faster</span></div>
1241
+ <div class="cost-row" style="margin-top:0.4rem;padding-top:0.4rem;border-top:1px solid var(--surface2)"><span class="cost-label">Local C3 overhead</span><span class="cost-val" style="color:var(--text-dim)">{local_c3:.0f}ms</span></div>
1242
+ <div class="cost-row"><span class="cost-label">Local base overhead</span><span class="cost-val" style="color:var(--text-dim)">{local_base:.0f}ms</span></div>
1243
+ </div>"""
1244
+ # Use the first profile for the per-scenario chart
1245
+ first_profile_key = list(perf_timing.keys())[0] if perf_timing else None
1246
+ perf_chart_c3 = json.dumps(perf_e2e_c3_data.get(first_profile_key, []))
1247
+ perf_chart_base = json.dumps(perf_e2e_base_data.get(first_profile_key, []))
1248
+ first_profile_label = perf_timing.get(first_profile_key, {}).get("label", "") if first_profile_key else ""
1249
+
1250
+ # ── Summary table ──
1251
+ summary_rows = ""
1252
+ for s in scenarios:
1253
+ sav = s["token_savings_pct"]
1254
+ sav_color = "#10b981" if sav > 50 else ("#f59e0b" if sav > 20 else "#ef4444")
1255
+ q_c3 = s.get("avg_quality_c3", 0)
1256
+ q_base = s.get("avg_quality_baseline", 0)
1257
+ q_icon = "" if q_c3 >= 99.9 else (' <span class="q-warn" title="Quality below 100%: some steps had imperfect retrieval/extraction">&#9888;</span>' if q_c3 < 100 else "")
1258
+ summary_rows += f"""<tr>
1259
+ <td class="td-name">{esc(_humanize(s['name']))}</td>
1260
+ <td class="num">{s['total_tokens_c3']:,}</td>
1261
+ <td class="num">{s['total_tokens_baseline']:,}</td>
1262
+ <td class="num" style="color:{sav_color};font-weight:700">{sav}%</td>
1263
+ <td class="num">{s['budget_multiplier']}x</td>
1264
+ <td class="num">{s['total_latency_c3_ms']:.0f}</td>
1265
+ <td class="num">{s['total_latency_baseline_ms']:.0f}</td>
1266
+ <td class="num">{q_c3:.0f}%{q_icon}</td>
1267
+ <td class="num">{q_base:.0f}%</td>
1268
+ </tr>"""
1269
+ # Totals row
1270
+ t_c3 = sc["total_tokens_c3"]
1271
+ t_base = sc["total_tokens_baseline"]
1272
+ summary_rows += f"""<tr class="totals-row">
1273
+ <td class="td-name"><strong>TOTAL</strong></td>
1274
+ <td class="num"><strong>{t_c3:,}</strong></td>
1275
+ <td class="num"><strong>{t_base:,}</strong></td>
1276
+ <td class="num" style="color:#10b981;font-weight:700"><strong>{sc['token_savings_pct']}%</strong></td>
1277
+ <td class="num"><strong>{sc['budget_multiplier']}x</strong></td>
1278
+ <td class="num"><strong>{sc['total_latency_c3_ms']:.0f}</strong></td>
1279
+ <td class="num"><strong>{sc['total_latency_baseline_ms']:.0f}</strong></td>
1280
+ <td class="num"><strong>{sc['avg_quality_c3']:.0f}%</strong></td>
1281
+ <td class="num"><strong>{sc['avg_quality_baseline']:.0f}%</strong></td>
1282
+ </tr>"""
1283
+
1284
+ # ── Cost cards ──
1285
+ cost_cards = ""
1286
+ for key, est in cost_estimates.items():
1287
+ cost_cards += f"""<div class="cost-card">
1288
+ <div class="cost-model">{esc(est['label'])}</div>
1289
+ <div class="cost-row"><span class="cost-label">Per session</span><span class="cost-val">${est['saved_per_session']:.4f}</span></div>
1290
+ <div class="cost-row"><span class="cost-label">Per day (5 sessions)</span><span class="cost-val">${est['saved_per_day_5_sessions']:.3f}</span></div>
1291
+ <div class="cost-row"><span class="cost-label">Per month (22 days)</span><span class="cost-val">${est['saved_per_month']:.2f}</span></div>
1292
+ </div>"""
1293
+
1294
+ # ── Sampled files ──
1295
+ files_rows = ""
1296
+ for f in sampled_files:
1297
+ files_rows += f"""<tr><td class="td-path">{esc(f['path'])}</td><td class="num">{f['tokens']:,}</td><td>{esc(f['extension'])}</td></tr>"""
1298
+
1299
+ # ── Scenario detail cards ──
1300
+ scenario_cards = []
1301
+ for s in scenarios:
1302
+ steps_c3 = s.get("steps_c3", [])
1303
+ steps_base = s.get("steps_baseline", [])
1304
+ max_step_tokens = max(
1305
+ max((st["tokens"] for st in steps_c3), default=1),
1306
+ max((st["tokens"] for st in steps_base), default=1),
1307
+ 1
1308
+ )
1309
+
1310
def _step_rows(steps, badge_cls):
    """Render one column of per-step rows for a scenario card.

    Args:
        steps: Step dicts to render. Each is read for ``name``, ``tool``,
            ``tokens`` and ``latency_ms``, plus an optional ``quality``.
        badge_cls: ``"badge-c3"`` or ``"badge-base"``. It is emitted as the
            CSS class of the tool badge and the bar, and it selects which
            sibling step list the per-step delta is computed against.

    Returns:
        The concatenated ``<div class="step-row">`` markup, or ``""`` when
        ``steps`` is empty.
    """
    # Closure: max_step_tokens, steps_base and steps_c3 come from the
    # enclosing per-scenario scope.
    rendered = []
    for idx, step in enumerate(steps):
        tokens = step["tokens"]
        # Bar width is relative to the largest step on either path, capped at 100%.
        bar_pct = min(100, tokens / max_step_tokens * 100)

        # Steps are paired with the other path purely by position; a delta is
        # shown only when this path's C3 side is strictly cheaper.
        delta_html = ""
        if badge_cls == "badge-c3":
            if idx < len(steps_base):
                saved = steps_base[idx]["tokens"] - tokens
                if saved > 0:
                    delta_html = f'<span class="step-delta">-{saved:,}</span>'
        elif badge_cls == "badge-base":
            if idx < len(steps_c3):
                extra = tokens - steps_c3[idx]["tokens"]
                if extra > 0:
                    delta_html = f'<span class="step-extra">+{extra:,}</span>'

        # A missing quality value counts as 100, i.e. no warning badge.
        quality = step.get("quality", 100)
        if quality < 100:
            q_html = f' <span class="q-warn" title="Quality: {quality:.0f}%">&#9888; {quality:.0f}%</span>'
        else:
            q_html = ""

        rendered.append(f"""<div class="step-row">
            <span class="step-name">{esc(_humanize(step['name']))}{q_html}</span>
            <span class="step-tool {badge_cls}">{esc(step['tool'])}</span>
            <span class="step-tokens">{step['tokens']:,} tok {delta_html}</span>
            <span class="step-latency">{step['latency_ms']:.1f}ms</span>
            <div class="step-bar-track"><div class="step-bar {badge_cls}" style="width:{bar_pct:.1f}%"></div></div>
        </div>""")
    return "".join(rendered)
1335
+
1336
+ sav_color = "#10b981" if s["token_savings_pct"] > 50 else ("#f59e0b" if s["token_savings_pct"] > 20 else "#ef4444")
1337
+
1338
+ scenario_cards.append(f"""<div class="scenario-card" id="scenario-{s['name']}">
1339
+ <div class="scenario-header">
1340
+ <h3>{esc(_humanize(s['name']))}</h3>
1341
+ <div class="scenario-savings" style="color:{sav_color}">{s['token_savings_pct']}% saved</div>
1342
+ </div>
1343
+ <p class="scenario-desc">{esc(s['description'])}</p>
1344
+ <div class="scenario-metrics">
1345
+ <div class="metric-pill"><span class="metric-label">C3</span><span class="metric-value">{s['total_tokens_c3']:,} tok</span></div>
1346
+ <div class="metric-pill"><span class="metric-label">Base</span><span class="metric-value">{s['total_tokens_baseline']:,} tok</span></div>
1347
+ <div class="metric-pill"><span class="metric-label">Budget</span><span class="metric-value">{s['budget_multiplier']}x</span></div>
1348
+ <div class="metric-pill"><span class="metric-label">Quality C3</span><span class="metric-value">{s['avg_quality_c3']:.0f}%</span></div>
1349
+ <div class="metric-pill"><span class="metric-label">Latency C3</span><span class="metric-value">{s['total_latency_c3_ms']:.0f}ms</span></div>
1350
+ </div>
1351
+ <div class="steps-comparison">
1352
+ <div class="steps-col">
1353
+ <h4>With C3</h4>
1354
+ {_step_rows(steps_c3, "badge-c3")}
1355
+ <div class="steps-total">Total: {s['total_tokens_c3']:,} tokens &middot; {s['total_latency_c3_ms']:.0f}ms</div>
1356
+ </div>
1357
+ <div class="steps-col">
1358
+ <h4>Without C3 (Baseline)</h4>
1359
+ {_step_rows(steps_base, "badge-base")}
1360
+ <div class="steps-total">Total: {s['total_tokens_baseline']:,} tokens &middot; {s['total_latency_baseline_ms']:.0f}ms</div>
1361
+ </div>
1362
+ </div>
1363
+ </div>""")
1364
+
1365
+ return f"""<!doctype html>
1366
+ <html lang="en">
1367
+ <head>
1368
+ <meta charset="utf-8">
1369
+ <meta name="viewport" content="width=device-width, initial-scale=1">
1370
+ <title>C3 Session Benchmark Report</title>
1371
+ <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
1372
+ <script src="https://cdn.jsdelivr.net/npm/chartjs-plugin-annotation@3"></script>
1373
+ <style>
1374
+ :root {{
1375
+ --bg: #0f172a; --surface: #1e293b; --surface2: #334155; --border: #475569;
1376
+ --text: #e2e8f0; --text-dim: #94a3b8; --accent: #818cf8; --accent2: #34d399;
1377
+ --danger: #f87171; --warn: #fbbf24; --c3: #818cf8; --base: #64748b;
1378
+ }}
1379
+ * {{ margin:0; padding:0; box-sizing:border-box; }}
1380
+ body {{ background:var(--bg); color:var(--text); font-family:'Inter',-apple-system,sans-serif; padding:0; line-height:1.6; }}
1381
+ .container {{ max-width:1440px; margin:0 auto; padding:2rem; padding-top:4rem; }}
1382
+ h1 {{ font-size:2rem; font-weight:700; margin-bottom:0.5rem; }}
1383
+ h2 {{ font-size:1.4rem; font-weight:600; margin:2.5rem 0 1rem; color:var(--accent); scroll-margin-top:3.5rem; }}
1384
+ h3 {{ font-size:1.1rem; font-weight:600; }}
1385
+ h4 {{ font-size:0.85rem; font-weight:600; color:var(--text-dim); margin-bottom:0.5rem; text-transform:uppercase; letter-spacing:0.05em; }}
1386
+ .subtitle {{ color:var(--text-dim); margin-bottom:2rem; }}
1387
+
1388
+ /* Sticky Nav */
1389
+ .sticky-nav {{ position:fixed; top:0; left:0; right:0; z-index:100; background:rgba(15,23,42,0.92); backdrop-filter:blur(12px); border-bottom:1px solid var(--border); padding:0.5rem 2rem; display:flex; align-items:center; gap:1.5rem; }}
1390
+ .sticky-nav .nav-brand {{ font-weight:700; color:var(--accent); font-size:0.9rem; white-space:nowrap; }}
1391
+ .sticky-nav a {{ color:var(--text-dim); text-decoration:none; font-size:0.8rem; white-space:nowrap; transition:color 0.15s; }}
1392
+ .sticky-nav a:hover {{ color:var(--accent); }}
1393
+ .sticky-nav .nav-actions {{ margin-left:auto; display:flex; gap:0.75rem; }}
1394
+ .btn-sm {{ background:var(--surface2); border:1px solid var(--border); color:var(--text); padding:0.25rem 0.75rem; border-radius:6px; cursor:pointer; font-size:0.75rem; }}
1395
+ .btn-sm:hover {{ background:var(--border); }}
1396
+
1397
+ /* Scorecard */
1398
+ .scorecard {{ display:grid; grid-template-columns:repeat(auto-fit, minmax(180px, 1fr)); gap:1rem; margin-bottom:2rem; }}
1399
+ .score-card {{ background:var(--surface); border:1px solid var(--border); border-radius:12px; padding:1.25rem; text-align:center; transition:transform 0.15s; }}
1400
+ .score-card:hover {{ transform:translateY(-2px); }}
1401
+ .score-card .value {{ font-size:2rem; font-weight:700; color:var(--accent); }}
1402
+ .score-card .label {{ font-size:0.75rem; color:var(--text-dim); text-transform:uppercase; letter-spacing:0.05em; margin-top:0.25rem; }}
1403
+ .score-card.green .value {{ color:var(--accent2); }}
1404
+ .score-card.warn .value {{ color:var(--warn); }}
1405
+ .score-card.cost .value {{ color:var(--accent2); font-size:1.6rem; }}
1406
+
1407
+ /* Charts */
1408
+ .chart-grid {{ display:grid; grid-template-columns:repeat(auto-fit, minmax(420px, 1fr)); gap:1.5rem; margin-bottom:2rem; }}
1409
+ .chart-box {{ background:var(--surface); border:1px solid var(--border); border-radius:12px; padding:1.5rem; }}
1410
+ .chart-box canvas {{ max-height:320px; }}
1411
+ .chart-box.wide {{ grid-column: 1 / -1; }}
1412
+
1413
+ /* Summary Table */
1414
+ .summary-table {{ width:100%; border-collapse:collapse; margin-bottom:2rem; font-size:0.85rem; }}
1415
+ .summary-table th {{ background:var(--surface2); color:var(--text-dim); padding:0.6rem 0.75rem; text-align:left; font-weight:600; font-size:0.75rem; text-transform:uppercase; letter-spacing:0.04em; cursor:pointer; user-select:none; white-space:nowrap; border-bottom:2px solid var(--border); }}
1416
+ .summary-table th:hover {{ color:var(--accent); }}
1417
+ .summary-table td {{ padding:0.5rem 0.75rem; border-bottom:1px solid var(--surface2); }}
1418
+ .summary-table .num {{ text-align:right; font-variant-numeric:tabular-nums; }}
1419
+ .summary-table .td-name {{ font-weight:600; }}
1420
+ .summary-table .totals-row td {{ border-top:2px solid var(--accent); background:rgba(129,140,248,0.05); }}
1421
+ .summary-table .q-warn {{ color:var(--warn); cursor:help; }}
1422
+
1423
+ /* Cost Estimation */
1424
+ .cost-grid {{ display:grid; grid-template-columns:repeat(auto-fit, minmax(220px, 1fr)); gap:1rem; margin-bottom:1.5rem; }}
1425
+ .cost-card {{ background:var(--surface); border:1px solid var(--border); border-radius:12px; padding:1.25rem; }}
1426
+ .cost-model {{ font-weight:600; font-size:0.9rem; margin-bottom:0.75rem; color:var(--accent); }}
1427
+ .cost-row {{ display:flex; justify-content:space-between; padding:0.25rem 0; font-size:0.8rem; }}
1428
+ .cost-label {{ color:var(--text-dim); }}
1429
+ .cost-val {{ font-weight:600; color:var(--accent2); }}
1430
+
1431
+ /* Scenario Cards */
1432
+ .scenarios-grid {{ display:grid; grid-template-columns:repeat(auto-fit, minmax(620px, 1fr)); gap:1.5rem; }}
1433
+ .scenario-card {{ background:var(--surface); border:1px solid var(--border); border-radius:12px; padding:1.5rem; scroll-margin-top:3.5rem; }}
1434
+ .scenario-header {{ display:flex; justify-content:space-between; align-items:center; margin-bottom:0.5rem; }}
1435
+ .scenario-savings {{ font-size:1.3rem; font-weight:700; }}
1436
+ .scenario-desc {{ color:var(--text-dim); font-size:0.85rem; margin-bottom:1rem; }}
1437
+ .scenario-metrics {{ display:flex; gap:0.6rem; flex-wrap:wrap; margin-bottom:1rem; }}
1438
+ .metric-pill {{ background:var(--surface2); border-radius:8px; padding:0.35rem 0.65rem; display:flex; gap:0.4rem; align-items:center; }}
1439
+ .metric-label {{ font-size:0.65rem; color:var(--text-dim); text-transform:uppercase; }}
1440
+ .metric-value {{ font-size:0.85rem; font-weight:600; }}
1441
+
1442
+ /* Steps */
1443
+ .steps-comparison {{ display:grid; grid-template-columns:1fr 1fr; gap:1rem; }}
1444
+ .steps-col {{ background:var(--bg); border-radius:8px; padding:1rem; }}
1445
+ .step-row {{ padding:0.4rem 0; font-size:0.8rem; border-bottom:1px solid var(--surface2); }}
1446
+ .step-row > span {{ display:inline-block; vertical-align:middle; }}
1447
+ .step-name {{ min-width:120px; }}
1448
+ .step-tool {{ font-size:0.7rem; padding:0.15rem 0.4rem; border-radius:4px; }}
1449
+ .badge-c3 {{ background:rgba(129,140,248,0.2); color:var(--c3); }}
1450
+ .badge-base {{ background:rgba(100,116,139,0.2); color:var(--base); }}
1451
+ .step-tokens {{ color:var(--accent2); min-width:90px; text-align:right; }}
1452
+ .step-delta {{ color:var(--accent2); font-size:0.7rem; font-weight:600; margin-left:0.3rem; }}
1453
+ .step-extra {{ color:var(--danger); font-size:0.7rem; font-weight:600; margin-left:0.3rem; }}
1454
+ .step-latency {{ color:var(--text-dim); min-width:55px; text-align:right; }}
1455
+ .step-bar-track {{ width:100%; height:4px; background:var(--surface2); border-radius:2px; margin-top:0.3rem; }}
1456
+ .step-bar {{ height:4px; border-radius:2px; transition:width 0.3s; }}
1457
+ .step-bar.badge-c3 {{ background:var(--c3); }}
1458
+ .step-bar.badge-base {{ background:var(--base); }}
1459
+ .steps-total {{ margin-top:0.5rem; padding-top:0.5rem; border-top:1px solid var(--border); font-size:0.8rem; font-weight:600; }}
1460
+ .q-warn {{ color:var(--warn); font-size:0.7rem; cursor:help; }}
1461
+
1462
+ /* Longevity */
1463
+ .longevity-box {{ background:var(--surface); border:1px solid var(--border); border-radius:12px; padding:1.5rem; margin-bottom:2rem; }}
1464
+ .longevity-grid {{ display:grid; grid-template-columns:repeat(auto-fit, minmax(180px, 1fr)); gap:1rem; text-align:center; }}
1465
+ .longevity-stat .num {{ font-size:1.8rem; font-weight:700; }}
1466
+ .longevity-stat .lbl {{ font-size:0.8rem; color:var(--text-dim); }}
1467
+
1468
+ /* Sampled Files */
1469
+ .files-table {{ width:100%; border-collapse:collapse; font-size:0.8rem; margin-top:0.5rem; }}
1470
+ .files-table th {{ text-align:left; padding:0.4rem 0.6rem; color:var(--text-dim); font-weight:600; font-size:0.7rem; text-transform:uppercase; border-bottom:1px solid var(--border); }}
1471
+ .files-table td {{ padding:0.3rem 0.6rem; border-bottom:1px solid var(--surface2); }}
1472
+ .files-table .td-path {{ font-family:monospace; font-size:0.75rem; }}
1473
+
1474
+ /* Info Sections */
1475
+ .info-section {{ background:var(--surface); border:1px solid var(--border); border-radius:12px; padding:1.5rem 2rem; margin-bottom:1.5rem; }}
1476
+ .info-section h3 {{ margin-bottom:0.75rem; }}
1477
+ .info-section p, .info-section li {{ color:var(--text-dim); font-size:0.85rem; line-height:1.7; }}
1478
+ .info-section ul {{ padding-left:1.25rem; margin:0.5rem 0; }}
1479
+ .info-section li {{ margin-bottom:0.3rem; }}
1480
+ .info-section strong {{ color:var(--text); }}
1481
+ .info-section code {{ background:var(--surface2); padding:0.15rem 0.4rem; border-radius:4px; font-size:0.8rem; color:var(--accent); }}
1482
+ .info-grid {{ display:grid; grid-template-columns:repeat(auto-fit, minmax(350px, 1fr)); gap:1.5rem; }}
1483
+ .info-grid .info-section {{ margin-bottom:0; }}
1484
+ .run-params {{ display:grid; grid-template-columns:repeat(auto-fit, minmax(160px, 1fr)); gap:0.75rem; margin-top:0.75rem; }}
1485
+ .param-box {{ background:var(--bg); border-radius:8px; padding:0.5rem 0.8rem; }}
1486
+ .param-box .param-key {{ font-size:0.65rem; color:var(--text-dim); text-transform:uppercase; letter-spacing:0.04em; }}
1487
+ .param-box .param-val {{ font-size:0.95rem; font-weight:600; color:var(--text); }}
1488
+ .collapsible-toggle {{ cursor:pointer; user-select:none; display:flex; align-items:center; gap:0.5rem; }}
1489
+ .collapsible-toggle::before {{ content:'\\25B6'; font-size:0.7rem; transition:transform 0.2s; }}
1490
+ .collapsible-toggle.open::before {{ transform:rotate(90deg); }}
1491
+ .collapsible-content {{ max-height:0; overflow:hidden; transition:max-height 0.3s ease; }}
1492
+ .collapsible-content.open {{ max-height:4000px; }}
1493
+ .scenario-explainer {{ background:var(--bg); border-radius:8px; padding:1rem; margin-top:0.5rem; }}
1494
+ .scenario-explainer .ex-title {{ font-weight:600; font-size:0.9rem; color:var(--accent); margin-bottom:0.3rem; }}
1495
+ .scenario-explainer .ex-flow {{ display:flex; gap:0.4rem; flex-wrap:wrap; align-items:center; margin:0.4rem 0; }}
1496
+ .flow-step {{ background:var(--surface2); border-radius:6px; padding:0.2rem 0.5rem; font-size:0.75rem; }}
1497
+ .flow-arrow {{ color:var(--text-dim); font-size:0.7rem; }}
1498
+ .metric-def {{ display:grid; grid-template-columns:140px 1fr; gap:0.3rem 1rem; margin-top:0.5rem; }}
1499
+ .metric-def dt {{ font-weight:600; font-size:0.8rem; color:var(--accent); }}
1500
+ .metric-def dd {{ font-size:0.8rem; color:var(--text-dim); }}
1501
+
1502
+ /* Footer */
1503
+ .footer {{ margin-top:3rem; padding:1.5rem 0; text-align:center; color:var(--text-dim); font-size:0.8rem; border-top:1px solid var(--border); }}
1504
+
1505
+ /* Print */
1506
+ @media print {{
1507
+ body {{ background:#fff; color:#1a1a1a; padding:1rem; }}
1508
+ .sticky-nav {{ display:none; }}
1509
+ .container {{ padding-top:0; }}
1510
+ .score-card, .chart-box, .scenario-card, .info-section, .longevity-box, .cost-card {{ border-color:#ccc; background:#fff; }}
1511
+ .score-card .value, h2, .scenario-savings, .cost-model {{ color:#333; }}
1512
+ .step-bar-track {{ background:#eee; }}
1513
+ .step-bar.badge-c3 {{ background:#6366f1; }}
1514
+ .step-bar.badge-base {{ background:#9ca3af; }}
1515
+ }}
1516
+ </style>
1517
+ </head>
1518
+ <body>
1519
+
1520
+ <!-- Sticky Navigation -->
1521
+ <nav class="sticky-nav">
1522
+ <span class="nav-brand">C3 Session Benchmark</span>
1523
+ <a href="#scorecard">Scorecard</a>
1524
+ <a href="#longevity">Longevity</a>
1525
+ <a href="#charts">Charts</a>
1526
+ <a href="#summary">Summary</a>
1527
+ <a href="#cost">Cost</a>
1528
+ <a href="#performance">Performance</a>
1529
+ <a href="#methodology">Methodology</a>
1530
+ <a href="#scenarios">Scenarios</a>
1531
+ <a href="#details">Details</a>
1532
+ <a href="#files">Files</a>
1533
+ <a href="#history">History</a>
1534
+ <div class="nav-actions">
1535
+ <button class="btn-sm" onclick="window.print()">Export PDF</button>
1536
+ </div>
1537
+ </nav>
1538
+
1539
+ <div class="container">
1540
+
1541
+ <h1>C3 Session Benchmark <span style="background:#818cf8;color:#0b1020;padding:0.15rem 0.55rem;border-radius:999px;font-size:0.7rem;font-weight:600;margin-left:0.5rem;vertical-align:middle">Synthetic</span> <a href="../benchmarks/index.html" style="color:var(--text-dim,#9aa3c7);font-size:0.8rem;margin-left:0.8rem;text-decoration:none;font-weight:400">← dashboard</a></h1>
1542
+ <p class="subtitle">Real-world workflow simulation &middot; {esc(report.get('timestamp', '').replace('T', ' '))} &middot; {report['files_considered']} files &middot; {len(scenarios)} scenarios</p>
1543
+
1544
+ <!-- About -->
1545
+ <div class="info-section">
1546
+ <h3 class="collapsible-toggle open" onclick="this.classList.toggle('open'); this.nextElementSibling.classList.toggle('open')">About This Benchmark</h3>
1547
+ <div class="collapsible-content open">
1548
+ <p>
1549
+ This benchmark simulates <strong>real-world AI coding session workflows</strong> end-to-end, comparing
1550
+ <strong>with C3 tools</strong> (c3_search, c3_compress, c3_read, c3_filter, c3_validate) versus <strong>without C3</strong>
1551
+ (native file reads, lexical grep, full-file context loading). Each scenario represents a common developer task broken into
1552
+ sequential steps. Both paths perform equivalent work on the same files, but C3 uses intelligent compression, surgical reading,
1553
+ and semantic search to minimize tokens loaded into context.
1554
+ </p>
1555
+ </div>
1556
+ </div>
1557
+
1558
+ <!-- Run Parameters -->
1559
+ <div class="info-section">
1560
+ <h3>Run Parameters</h3>
1561
+ <div class="run-params">
1562
+ <div class="param-box"><div class="param-key">Project</div><div class="param-val">{esc(Path(report.get('project_path','')).name)}</div></div>
1563
+ <div class="param-box"><div class="param-key">Files Eligible</div><div class="param-val">{report['files_considered']}</div></div>
1564
+ <div class="param-box"><div class="param-key">Files Sampled</div><div class="param-val">{report['sample_size']}</div></div>
1565
+ <div class="param-box"><div class="param-key">Scenarios</div><div class="param-val">{len(scenarios)}</div></div>
1566
+ <div class="param-box"><div class="param-key">Timestamp</div><div class="param-val">{esc(report.get('timestamp','').replace('T',' '))}</div></div>
1567
+ <div class="param-box"><div class="param-key">Context Limit</div><div class="param-val">{longevity['context_limit']:,} tok</div></div>
1568
+ </div>
1569
+ </div>
1570
+
1571
+ <!-- Scorecard -->
1572
+ <h2 id="scorecard">Scorecard</h2>
1573
+ <div class="scorecard">
1574
+ <div class="score-card green"><div class="value">{sc['token_savings_pct']}%</div><div class="label">Token Savings</div></div>
1575
+ <div class="score-card"><div class="value">{sc['budget_multiplier']}x</div><div class="label">Budget Multiplier</div></div>
1576
+ <div class="score-card green"><div class="value">{longevity['estimated_turns_c3']}</div><div class="label">Est. Turns (C3)</div></div>
1577
+ <div class="score-card warn"><div class="value">{longevity['estimated_turns_baseline']}</div><div class="label">Est. Turns (Base)</div></div>
1578
+ <div class="score-card"><div class="value">{longevity['turn_multiplier']}x</div><div class="label">Session Multiplier</div></div>
1579
+ <div class="score-card"><div class="value">{sc['avg_quality_c3']:.0f}%</div><div class="label">Avg Quality (C3)</div></div>
1580
+ <div class="score-card"><div class="value">{sc['total_latency_c3_ms']:.0f}ms</div><div class="label">Total Latency (C3)</div></div>
1581
+ <div class="score-card"><div class="value">{sc['total_tokens_c3']:,}</div><div class="label">Total Tokens (C3)</div></div>
1582
+ <div class="score-card green"><div class="value">{list(perf_timing.values())[0].get('speedup', 0) if perf_timing else 0}x</div><div class="label">E2E Speedup</div></div>
1583
+ </div>
1584
+
1585
+ <!-- Longevity -->
1586
+ <h2 id="longevity">Session Longevity Projection</h2>
1587
+ <div class="longevity-box" style="margin-top:0">
1588
+ <p style="color:var(--text-dim); margin-bottom:1rem; font-size:0.85rem">
1589
+ Estimated turns before hitting {longevity['context_limit']:,} token context limit. Each "turn" is one complete workflow scenario.
1590
+ </p>
1591
+ <div class="longevity-grid">
1592
+ <div class="longevity-stat"><div class="num" style="color:var(--accent)">{longevity['avg_tokens_per_turn_c3']:,}</div><div class="lbl">Avg tokens/turn (C3)</div></div>
1593
+ <div class="longevity-stat"><div class="num" style="color:var(--base)">{longevity['avg_tokens_per_turn_baseline']:,}</div><div class="lbl">Avg tokens/turn (Base)</div></div>
1594
+ <div class="longevity-stat"><div class="num" style="color:var(--accent)">{longevity['estimated_turns_c3']}</div><div class="lbl">Turns before limit (C3)</div></div>
1595
+ <div class="longevity-stat"><div class="num" style="color:var(--warn)">{longevity['estimated_turns_baseline']}</div><div class="lbl">Turns before limit (Base)</div></div>
1596
+ <div class="longevity-stat"><div class="num" style="color:var(--accent2)">{longevity['turn_multiplier']}x</div><div class="lbl">Session Multiplier</div></div>
1597
+ </div>
1598
+ </div>
1599
+
1600
+ <!-- Charts -->
1601
+ <h2 id="charts">Visual Analysis</h2>
1602
+ <div class="chart-grid">
1603
+ <div class="chart-box wide">
1604
+ <h3>Cumulative Token Usage (Turn by Turn)</h3>
1605
+ <p style="color:var(--text-dim);font-size:0.8rem;margin:0.3rem 0 0.5rem">Shows how context accumulates over turns. The gap is wasted tokens C3 prevents. Red dashed = context limit. Orange zone = danger (80%+).</p>
1606
+ <canvas id="timelineChart" style="max-height:380px"></canvas>
1607
+ </div>
1608
+ <div class="chart-box">
1609
+ <h3>Token Usage by Scenario</h3>
1610
+ <p style="color:var(--text-dim);font-size:0.8rem;margin:0.3rem 0 0.5rem">Side-by-side token consumption per workflow.</p>
1611
+ <canvas id="scenarioChart"></canvas>
1612
+ </div>
1613
+ <div class="chart-box">
1614
+ <h3>Savings by Scenario (%)</h3>
1615
+ <canvas id="savingsChart"></canvas>
1616
+ </div>
1617
+ <div class="chart-box">
1618
+ <h3>Latency Comparison (ms)</h3>
1619
+ <p style="color:var(--text-dim);font-size:0.8rem;margin:0.3rem 0 0.5rem">C3 trades local ms for massive token savings.</p>
1620
+ <canvas id="latencyChart"></canvas>
1621
+ </div>
1622
+ <div class="chart-box">
1623
+ <h3>Tool Savings by Scenario (Stacked)</h3>
1624
+ <p style="color:var(--text-dim);font-size:0.8rem;margin:0.3rem 0 0.5rem">Which C3 tools contributed most savings in each scenario.</p>
1625
+ <canvas id="heatmapChart"></canvas>
1626
+ </div>
1627
+ </div>
1628
+
1629
+ <!-- Summary Table -->
1630
+ <h2 id="summary">Summary Comparison</h2>
1631
+ <div style="overflow-x:auto">
1632
+ <table class="summary-table" id="summaryTable">
1633
+ <thead>
1634
+ <tr>
1635
+ <th onclick="sortTable(0)">Scenario</th>
1636
+ <th onclick="sortTable(1)">C3 Tokens</th>
1637
+ <th onclick="sortTable(2)">Base Tokens</th>
1638
+ <th onclick="sortTable(3)">Savings %</th>
1639
+ <th onclick="sortTable(4)">Budget x</th>
1640
+ <th onclick="sortTable(5)">C3 Latency ms</th>
1641
+ <th onclick="sortTable(6)">Base Latency ms</th>
1642
+ <th onclick="sortTable(7)">C3 Quality</th>
1643
+ <th onclick="sortTable(8)">Base Quality</th>
1644
+ </tr>
1645
+ </thead>
1646
+ <tbody>
1647
+ {summary_rows}
1648
+ </tbody>
1649
+ </table>
1650
+ </div>
1651
+
1652
+ <!-- Cost Estimation -->
1653
+ <h2 id="cost">Estimated Cost Savings</h2>
1654
+ <div class="info-section" style="margin-bottom:1rem">
1655
+ <p>Based on {sc['total_tokens_baseline'] - sc['total_tokens_c3']:,} tokens saved per session. Cost = input token pricing only (output tokens are unaffected by C3).
1656
+ Assumes 5 sessions/day, 22 working days/month.</p>
1657
+ </div>
1658
+ <div class="cost-grid">
1659
+ {cost_cards}
1660
+ </div>
1661
+
1662
+ <!-- Performance Timing -->
1663
+ <h2 id="performance">End-to-End Performance Timing</h2>
1664
+ <div class="info-section" style="margin-bottom:1rem">
1665
+ <p>Estimates total wall-clock time for an AI assistant to process each scenario, including model inference (input tokenization + output generation) and network overhead.
1666
+ Fewer input tokens = faster model processing = shorter wait times for the developer.</p>
1667
+ </div>
1668
+ <div class="cost-grid">
1669
+ {perf_cards_html}
1670
+ </div>
1671
+ <div class="chart-card" style="margin-top:1.5rem">
1672
+ <h3>Per-Scenario E2E Time &mdash; {esc(first_profile_label)}</h3>
1673
+ <canvas id="perfChart"></canvas>
1674
+ </div>
1675
+
1676
+ <!-- Methodology -->
1677
+ <h2 id="methodology">Methodology</h2>
1678
+ <div class="info-grid">
1679
+ <div class="info-section">
1680
+ <h3>How Measurements Work</h3>
1681
+ <ul>
1682
+ <li><strong>Token counting</strong> uses the same tokenizer as the AI model. Every piece of text loaded into context is counted.</li>
1683
+ <li><strong>Latency</strong> measured via <code>time.perf_counter()</code> — real wall-clock time including disk I/O, index lookups, and compression.</li>
1684
+ <li><strong>Quality</strong> scored 0&ndash;100% per step: Did search find the target file? Did the file map build? Did surgical reading extract the right symbol? Did filtering retain error signals?</li>
1685
+ <li><strong>Both paths do equivalent work.</strong> The baseline represents how a capable AI assistant actually operates without C3: full file reads, lexical grep, complete log loading.</li>
1686
+ </ul>
1687
+ </div>
1688
+ <div class="info-section">
1689
+ <h3>Metric Definitions</h3>
1690
+ <dl class="metric-def">
1691
+ <dt>Token Savings %</dt><dd><code>(baseline - c3) / baseline &times; 100</code></dd>
1692
+ <dt>Budget Multiplier</dt><dd><code>baseline_tokens / c3_tokens</code> — how many times more info fits in context.</dd>
1693
+ <dt>Est. Turns</dt><dd>Turns before hitting 200K context limit, based on avg tokens/turn.</dd>
1694
+ <dt>Session Multiplier</dt><dd><code>turns_c3 / turns_baseline</code></dd>
1695
+ <dt>Quality Score</dt><dd>Avg accuracy. 100% = perfect. &lt;100% = some info missed (marked with &#9888;).</dd>
1696
+ <dt>Latency</dt><dd>Local wall-clock ms. C3 trades local compute for token savings.</dd>
1697
+ </dl>
1698
+ </div>
1699
+ </div>
1700
+
1701
+ <!-- Scenario Explanations -->
1702
+ <h2 id="scenarios">Scenario Descriptions</h2>
1703
+ <div class="info-section">
1704
+ <p style="margin-bottom:1rem">Each scenario simulates a real developer workflow with 3&ndash;4 sequential steps.</p>
1705
+ <div class="scenario-explainer">
1706
+ <div class="ex-title">1. Bug Investigation</div>
1707
+ <p>Find, understand, and validate a fix for an error.</p>
1708
+ <div class="ex-flow"><span class="flow-step">Search error</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Map structure</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Read symbol</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Validate</span></div>
1709
+ </div>
1710
+ <div class="scenario-explainer">
1711
+ <div class="ex-title">2. Feature Exploration</div>
1712
+ <p>Understand how a feature works across multiple files.</p>
1713
+ <div class="ex-flow"><span class="flow-step">Discover files</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Compress each</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Read key symbols</span></div>
1714
+ </div>
1715
+ <div class="scenario-explainer">
1716
+ <div class="ex-title">3. Code Review</div>
1717
+ <p>Review changed files for correctness and style.</p>
1718
+ <div class="ex-flow"><span class="flow-step">Compress files</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Read flagged</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Validate all</span></div>
1719
+ </div>
1720
+ <div class="scenario-explainer">
1721
+ <div class="ex-title">4. Log Diagnosis</div>
1722
+ <p>Triage errors from logs and terminal output.</p>
1723
+ <div class="ex-flow"><span class="flow-step">Filter log</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Filter terminal</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Search code</span></div>
1724
+ </div>
1725
+ <div class="scenario-explainer">
1726
+ <div class="ex-title">5. Refactor Planning</div>
1727
+ <p>Understand callers, implementations, and impact before refactoring.</p>
1728
+ <div class="ex-flow"><span class="flow-step">Search callers</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Compress callers</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Read impl</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Map impact</span></div>
1729
+ </div>
1730
+ <div class="scenario-explainer">
1731
+ <div class="ex-title">6. Onboarding</div>
1732
+ <p>New contributor explores project structure and key files.</p>
1733
+ <div class="ex-flow"><span class="flow-step">Search structure</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Compress files</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Read entries</span><span class="flow-arrow">&rarr;</span><span class="flow-step">Search patterns</span></div>
1734
+ </div>
1735
+ </div>
1736
+
1737
+ <!-- C3 Tools Reference -->
1738
+ <div class="info-grid" style="margin-top:1rem">
1739
+ <div class="info-section">
1740
+ <h3>C3 Tools Used</h3>
1741
+ <ul>
1742
+ <li><code>c3_search</code> &mdash; TF-IDF semantic code search. Returns relevant snippets, not full files.</li>
1743
+ <li><code>c3_compress</code> &mdash; Structural summaries (classes, functions, signatures). 40&ndash;90% savings.</li>
1744
+ <li><code>c3_compress(map)</code> &mdash; Lightweight file layout map for targeted reads.</li>
1745
+ <li><code>c3_read</code> &mdash; Extract specific symbols by name without full-file reads.</li>
1746
+ <li><code>c3_filter</code> &mdash; Surface errors/warnings from logs, collapse repetition.</li>
1747
+ <li><code>c3_validate</code> &mdash; AST syntax check. Near-zero token cost.</li>
1748
+ </ul>
1749
+ </div>
1750
+ <div class="info-section">
1751
+ <h3>Baseline (Without C3)</h3>
1752
+ <ul>
1753
+ <li><strong>File reads:</strong> Full content loaded. Every byte enters the prompt.</li>
1754
+ <li><strong>Search:</strong> Lexical term matching + full-file loading.</li>
1755
+ <li><strong>Logs:</strong> Entire log loaded. AI scans visually for errors.</li>
1756
+ <li><strong>Validation:</strong> Re-reads full file. Scored at 80% (misses subtle errors).</li>
1757
+ <li><strong>Multi-file:</strong> Each file read in full, often multiple times across steps.</li>
1758
+ </ul>
1759
+ </div>
1760
+ </div>
1761
+
1762
+ <!-- Scenario Details -->
1763
+ <h2 id="details">Workflow Scenario Details</h2>
1764
+ <div class="scenarios-grid">
1765
+ {"".join(scenario_cards)}
1766
+ </div>
1767
+
1768
+ <!-- Sampled Files -->
1769
+ <h2 id="files">Sampled Files</h2>
1770
+ <div class="info-section">
1771
+ <h3 class="collapsible-toggle" onclick="this.classList.toggle('open'); this.nextElementSibling.classList.toggle('open')">Files Used in This Benchmark ({len(sampled_files)} files)</h3>
1772
+ <div class="collapsible-content">
1773
+ <table class="files-table">
1774
+ <thead><tr><th>Path</th><th style="text-align:right">Tokens</th><th>Type</th></tr></thead>
1775
+ <tbody>{files_rows}</tbody>
1776
+ </table>
1777
+ </div>
1778
+ </div>
1779
+
1780
+ <!-- How to Read -->
1781
+ <h2>How to Read This Report</h2>
1782
+ <div class="info-section">
1783
+ <ul>
1784
+ <li><strong>Scorecard</strong> &mdash; headline numbers. Token savings directly determine session longevity and available reasoning context.</li>
1785
+ <li><strong>Timeline chart</strong> &mdash; cumulative tokens turn-by-turn. The gap between lines is wasted tokens C3 prevents. Orange zone = danger. Red dashed = limit.</li>
1786
+ <li><strong>Summary table</strong> &mdash; sortable comparison of all scenarios. Click column headers to sort.</li>
1787
+ <li><strong>Cost estimation</strong> &mdash; translates token savings to dollar amounts based on common model pricing.</li>
1788
+ <li><strong>Scenario cards</strong> &mdash; step-by-step breakdown with per-step token bars, delta indicators, and quality warnings.</li>
1789
+ <li><strong>Tool heatmap</strong> &mdash; stacked bar showing which C3 tools saved the most tokens in each scenario.</li>
1790
+ </ul>
1791
+ <p style="margin-top:0.5rem"><strong>Note on latency:</strong> C3 is slightly slower than raw reads (ms of compression vs. &micro;s of disk I/O).
1792
+ This is intentional &mdash; spending a few ms locally prevents thousands of tokens from entering context, saving significant AI inference time and cost.</p>
1793
+ </div>
1794
+
1795
+ <!-- Benchmark History / Trends -->
1796
+ <h2 id="history">Benchmark History</h2>
1797
+ {_render_history_section(hist)}
1798
+
1799
+ <div class="footer">
1800
+ Generated by C3 Session Benchmark &middot; {esc(report.get('project_path', ''))} &middot;
1801
+ <code>c3 session-benchmark</code> to regenerate
1802
+ </div>
1803
+
1804
+ </div>
1805
+
1806
+ <script>
1807
+ Chart.defaults.color = '#94a3b8';
1808
+ Chart.defaults.borderColor = '#475569';
1809
+
1810
+ // Timeline with annotation
1811
+ const dangerZone = {longevity['context_limit']} * 0.8;
1812
+ new Chart(document.getElementById('timelineChart'), {{
1813
+ type: 'line',
1814
+ data: {{
1815
+ labels: {timeline_labels},
1816
+ datasets: [
1817
+ {{ label: 'With C3', data: {timeline_c3}, borderColor: '#818cf8', backgroundColor: 'rgba(129,140,248,0.08)', fill: true, tension: 0.3, pointRadius: 1 }},
1818
+ {{ label: 'Without C3', data: {timeline_base}, borderColor: '#64748b', backgroundColor: 'rgba(100,116,139,0.08)', fill: true, tension: 0.3, pointRadius: 1 }},
1819
+ {{ label: 'Context Limit ({longevity["context_limit"]:,})', data: {json.dumps([longevity['context_limit']] * len(timeline))}, borderColor: '#f87171', borderDash: [6,3], pointRadius: 0, fill: false, borderWidth: 2 }}
1820
+ ]
1821
+ }},
1822
+ options: {{
1823
+ responsive: true,
1824
+ plugins: {{
1825
+ legend: {{ position: 'bottom' }},
1826
+ annotation: {{
1827
+ annotations: {{
1828
+ dangerZone: {{
1829
+ type: 'box', yMin: dangerZone, yMax: {longevity['context_limit']},
1830
+ backgroundColor: 'rgba(251,191,36,0.06)', borderWidth: 0,
1831
+ label: {{ display: true, content: 'Danger Zone (80%+)', position: 'start', color: '#fbbf24', font: {{ size: 10 }} }}
1832
+ }},
1833
+ baseExhausted: {{
1834
+ type: 'line', xMin: {longevity['estimated_turns_baseline']}, xMax: {longevity['estimated_turns_baseline']},
1835
+ borderColor: '#f87171', borderDash: [3,3], borderWidth: 1,
1836
+ label: {{ display: true, content: 'Base exhausted', position: 'start', color: '#f87171', font: {{ size: 10 }} }}
1837
+ }}
1838
+ }}
1839
+ }}
1840
+ }},
1841
+ scales: {{
1842
+ x: {{ title: {{ display: true, text: 'Turn #' }} }},
1843
+ y: {{ title: {{ display: true, text: 'Cumulative Tokens' }}, beginAtZero: true }}
1844
+ }}
1845
+ }}
1846
+ }});
1847
+
1848
+ // Scenario tokens
1849
+ new Chart(document.getElementById('scenarioChart'), {{
1850
+ type: 'bar',
1851
+ data: {{
1852
+ labels: {scenario_names},
1853
+ datasets: [
1854
+ {{ label: 'With C3', data: {scenario_c3_tokens}, backgroundColor: '#818cf8' }},
1855
+ {{ label: 'Without C3', data: {scenario_base_tokens}, backgroundColor: '#64748b' }}
1856
+ ]
1857
+ }},
1858
+ options: {{ responsive: true, plugins: {{ legend: {{ position: 'bottom' }} }}, scales: {{ y: {{ beginAtZero: true, title: {{ display: true, text: 'Tokens' }} }} }} }}
1859
+ }});
1860
+
1861
+ // Savings
1862
+ new Chart(document.getElementById('savingsChart'), {{
1863
+ type: 'bar',
1864
+ data: {{
1865
+ labels: {scenario_names},
1866
+ datasets: [{{ label: 'Savings %', data: {scenario_savings}, backgroundColor: {scenario_savings}.map(v => v > 80 ? '#10b981' : v > 50 ? '#34d399' : '#f59e0b') }}]
1867
+ }},
1868
+ options: {{ responsive: true, indexAxis: 'y', plugins: {{ legend: {{ display: false }} }}, scales: {{ x: {{ beginAtZero: true, max: 100, title: {{ display: true, text: '%' }} }} }} }}
1869
+ }});
1870
+
1871
+ // Latency comparison (new)
1872
+ new Chart(document.getElementById('latencyChart'), {{
1873
+ type: 'bar',
1874
+ data: {{
1875
+ labels: {scenario_names},
1876
+ datasets: [
1877
+ {{ label: 'C3 Latency (ms)', data: {scenario_lat_c3}, backgroundColor: '#818cf8' }},
1878
+ {{ label: 'Baseline Latency (ms)', data: {scenario_lat_base}, backgroundColor: '#64748b' }}
1879
+ ]
1880
+ }},
1881
+ options: {{ responsive: true, plugins: {{ legend: {{ position: 'bottom' }} }}, scales: {{ y: {{ beginAtZero: true, title: {{ display: true, text: 'ms' }} }} }} }}
1882
+ }});
1883
+
1884
+ // Tool heatmap (stacked bar)
1885
+ new Chart(document.getElementById('heatmapChart'), {{
1886
+ type: 'bar',
1887
+ data: {{
1888
+ labels: {json.dumps([_humanize(n) for n in all_scenario_names])},
1889
+ datasets: {heatmap_datasets_json}
1890
+ }},
1891
+ options: {{
1892
+ responsive: true,
1893
+ plugins: {{ legend: {{ position: 'bottom' }} }},
1894
+ scales: {{ x: {{ stacked: true }}, y: {{ stacked: true, beginAtZero: true, title: {{ display: true, text: 'Tokens Saved' }} }} }}
1895
+ }}
1896
+ }});
1897
+
1898
+ // Performance timing chart
1899
+ new Chart(document.getElementById('perfChart'), {{
1900
+ type: 'bar',
1901
+ data: {{
1902
+ labels: {json.dumps([_humanize(n) for n in all_scenario_names])},
1903
+ datasets: [
1904
+ {{ label: 'With C3', data: {perf_chart_c3}.map(v => v / 1000), backgroundColor: '#818cf8' }},
1905
+ {{ label: 'Baseline', data: {perf_chart_base}.map(v => v / 1000), backgroundColor: '#f87171' }}
1906
+ ]
1907
+ }},
1908
+ options: {{
1909
+ responsive: true,
1910
+ plugins: {{ legend: {{ position: 'bottom' }} }},
1911
+ scales: {{ y: {{ beginAtZero: true, title: {{ display: true, text: 'Seconds' }} }} }}
1912
+ }}
1913
+ }});
1914
+
1915
+ // Sortable table
1916
+ function sortTable(col) {{
1917
+ const table = document.getElementById('summaryTable');
1918
+ const tbody = table.querySelector('tbody');
1919
+ const rows = Array.from(tbody.querySelectorAll('tr:not(.totals-row)'));
1920
+ const totalsRow = tbody.querySelector('.totals-row');
1921
+ const dir = table.dataset.sortDir === 'asc' ? 'desc' : 'asc';
1922
+ table.dataset.sortDir = dir;
1923
+ rows.sort((a, b) => {{
1924
+ let aVal = a.cells[col].textContent.replace(/[,%x$]/g, '').trim();
1925
+ let bVal = b.cells[col].textContent.replace(/[,%x$]/g, '').trim();
1926
+ let aNum = parseFloat(aVal), bNum = parseFloat(bVal);
1927
+ if (!isNaN(aNum) && !isNaN(bNum)) return dir === 'asc' ? aNum - bNum : bNum - aNum;
1928
+ return dir === 'asc' ? aVal.localeCompare(bVal) : bVal.localeCompare(aVal);
1929
+ }});
1930
+ rows.forEach(r => tbody.appendChild(r));
1931
+ if (totalsRow) tbody.appendChild(totalsRow);
1932
+ }}
1933
+
1934
+ // Smooth scroll
1935
+ document.querySelectorAll('.sticky-nav a[href^="#"]').forEach(a => {{
1936
+ a.addEventListener('click', e => {{
1937
+ e.preventDefault();
1938
+ document.querySelector(a.getAttribute('href')).scrollIntoView({{ behavior: 'smooth' }});
1939
+ }});
1940
+ }});
1941
+
1942
+ {_render_history_charts_js(hist)}
1943
+ </script>
1944
+ </body>
1945
+ </html>"""