code-context-control 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. cli/__init__.py +1 -0
  2. cli/_hook_utils.py +99 -0
  3. cli/c3.py +6152 -0
  4. cli/commands/__init__.py +1 -0
  5. cli/commands/common.py +312 -0
  6. cli/commands/parser.py +286 -0
  7. cli/docs.html +3178 -0
  8. cli/edits.html +878 -0
  9. cli/hook_auto_snapshot.py +142 -0
  10. cli/hook_c3_signal.py +61 -0
  11. cli/hook_c3read.py +116 -0
  12. cli/hook_edit_ledger.py +213 -0
  13. cli/hook_edit_unlock.py +170 -0
  14. cli/hook_filter.py +130 -0
  15. cli/hook_ghost_files.py +238 -0
  16. cli/hook_pretool_enforce.py +334 -0
  17. cli/hook_read.py +200 -0
  18. cli/hook_session_stats.py +62 -0
  19. cli/hook_terse_advisor.py +190 -0
  20. cli/hub.html +3764 -0
  21. cli/hub_server.py +1619 -0
  22. cli/mcp_proxy.py +428 -0
  23. cli/mcp_server.py +660 -0
  24. cli/server.py +2985 -0
  25. cli/tools/__init__.py +4 -0
  26. cli/tools/_helpers.py +65 -0
  27. cli/tools/agent.py +1165 -0
  28. cli/tools/compress.py +215 -0
  29. cli/tools/delegate.py +1184 -0
  30. cli/tools/edit.py +313 -0
  31. cli/tools/edits.py +118 -0
  32. cli/tools/filter.py +285 -0
  33. cli/tools/impact.py +163 -0
  34. cli/tools/memory.py +469 -0
  35. cli/tools/read.py +224 -0
  36. cli/tools/search.py +337 -0
  37. cli/tools/session.py +95 -0
  38. cli/tools/shell.py +193 -0
  39. cli/tools/status.py +306 -0
  40. cli/tools/validate.py +310 -0
  41. cli/ui/api.js +36 -0
  42. cli/ui/app.js +207 -0
  43. cli/ui/components/chat.js +758 -0
  44. cli/ui/components/dashboard.js +689 -0
  45. cli/ui/components/edits.js +220 -0
  46. cli/ui/components/instructions.js +481 -0
  47. cli/ui/components/memory.js +626 -0
  48. cli/ui/components/sessions.js +606 -0
  49. cli/ui/components/settings.js +1404 -0
  50. cli/ui/components/sidebar.js +156 -0
  51. cli/ui/icons.js +51 -0
  52. cli/ui/shared.js +119 -0
  53. cli/ui/theme.js +22 -0
  54. cli/ui.html +168 -0
  55. cli/ui_legacy.html +6797 -0
  56. cli/ui_nano.html +503 -0
  57. code_context_control-2.28.0.dist-info/METADATA +248 -0
  58. code_context_control-2.28.0.dist-info/RECORD +150 -0
  59. code_context_control-2.28.0.dist-info/WHEEL +5 -0
  60. code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
  61. code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
  62. code_context_control-2.28.0.dist-info/top_level.txt +5 -0
  63. core/__init__.py +75 -0
  64. core/config.py +269 -0
  65. core/ide.py +188 -0
  66. oracle/__init__.py +1 -0
  67. oracle/config.py +75 -0
  68. oracle/oracle.html +3900 -0
  69. oracle/oracle_server.py +663 -0
  70. oracle/services/__init__.py +1 -0
  71. oracle/services/c3_bridge.py +210 -0
  72. oracle/services/chat_engine.py +1103 -0
  73. oracle/services/chat_store.py +155 -0
  74. oracle/services/cross_memory.py +154 -0
  75. oracle/services/federated_graph.py +463 -0
  76. oracle/services/health_checker.py +117 -0
  77. oracle/services/insight_engine.py +307 -0
  78. oracle/services/memory_reader.py +106 -0
  79. oracle/services/memory_writer.py +182 -0
  80. oracle/services/ollama_bridge.py +332 -0
  81. oracle/services/project_scanner.py +87 -0
  82. oracle/services/review_agent.py +206 -0
  83. services/__init__.py +1 -0
  84. services/activity_log.py +93 -0
  85. services/agent_base.py +124 -0
  86. services/agents.py +1529 -0
  87. services/auto_memory.py +407 -0
  88. services/bench/__init__.py +6 -0
  89. services/bench/external/__init__.py +29 -0
  90. services/bench/external/aider_polyglot.py +405 -0
  91. services/bench/external/swe_bench.py +485 -0
  92. services/benchmark_dashboard.py +596 -0
  93. services/claude_md.py +785 -0
  94. services/compressor.py +592 -0
  95. services/context_snapshot.py +356 -0
  96. services/conversation_store.py +870 -0
  97. services/doc_index.py +537 -0
  98. services/e2e_benchmark.py +2884 -0
  99. services/e2e_evaluator.py +396 -0
  100. services/e2e_tasks.py +743 -0
  101. services/edit_ledger.py +459 -0
  102. services/embedding_index.py +341 -0
  103. services/error_reporting.py +123 -0
  104. services/file_memory.py +734 -0
  105. services/hub_service.py +585 -0
  106. services/indexer.py +712 -0
  107. services/memory.py +318 -0
  108. services/memory_consolidator.py +538 -0
  109. services/memory_graph.py +382 -0
  110. services/memory_grounder.py +304 -0
  111. services/memory_scorer.py +246 -0
  112. services/metrics.py +86 -0
  113. services/notifications.py +209 -0
  114. services/ollama_client.py +201 -0
  115. services/output_filter.py +488 -0
  116. services/parser.py +1238 -0
  117. services/project_manager.py +579 -0
  118. services/protocol.py +306 -0
  119. services/proxy_state.py +152 -0
  120. services/retrieval_broker.py +129 -0
  121. services/router.py +414 -0
  122. services/runtime.py +326 -0
  123. services/session_benchmark.py +1945 -0
  124. services/session_manager.py +1026 -0
  125. services/session_preloader.py +251 -0
  126. services/text_index.py +90 -0
  127. services/tool_classifier.py +176 -0
  128. services/transcript_index.py +340 -0
  129. services/validation_cache.py +155 -0
  130. services/vector_store.py +299 -0
  131. services/version_tracker.py +271 -0
  132. services/watcher.py +192 -0
  133. tui/__init__.py +0 -0
  134. tui/backend.py +59 -0
  135. tui/main.py +145 -0
  136. tui/screens/__init__.py +1 -0
  137. tui/screens/benchmark_view.py +109 -0
  138. tui/screens/claudemd_view.py +46 -0
  139. tui/screens/compress_view.py +52 -0
  140. tui/screens/index_view.py +74 -0
  141. tui/screens/init_view.py +82 -0
  142. tui/screens/mcp_view.py +73 -0
  143. tui/screens/optimize_view.py +41 -0
  144. tui/screens/pipe_view.py +46 -0
  145. tui/screens/projects_view.py +355 -0
  146. tui/screens/search_view.py +55 -0
  147. tui/screens/session_view.py +143 -0
  148. tui/screens/stats.py +158 -0
  149. tui/screens/ui_view.py +54 -0
  150. tui/theme.tcss +335 -0
services/e2e_tasks.py ADDED
@@ -0,0 +1,743 @@
1
+ """
2
+ E2E Benchmark Task Library — dynamically generated tasks with verifiable ground truths.
3
+
4
+ Tasks are built from codebase analysis so they work on any project.
5
+ Each task has objectively verifiable ground truths derived from the index/file memory.
6
+
7
+ Task categories:
8
+ - explanation (easy): Single-symbol explanation
9
+ - file_discovery (easy): Locate where a symbol is defined
10
+ - dependency_analysis (medium): Analyze file imports
11
+ - architecture (medium/hard): Project structure understanding
12
+ - call_chain (hard): Cross-file reference tracing
13
+ - code_review (hard): Quality analysis of complex files
14
+ - multi_file_trace (hard): Data flow across multiple files
15
+ - large_file_needle (hard): Find specific detail in a large file
16
+ - refactor_suggestion (expert): Cross-file duplication analysis
17
+ - bug_injection (medium): Detect planted syntax/logic issues
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import random
23
+ import re
24
+ import sys
25
+ import traceback
26
+ from dataclasses import dataclass, field
27
+ from pathlib import Path
28
+
29
# Per-difficulty multipliers for weighted scoring: the harder the tier,
# the more a task contributes.
DIFFICULTY_WEIGHTS = dict(
    easy=0.5,
    medium=1.0,
    hard=2.0,
    expert=3.0,
)
36
+
37
+
38
@dataclass
class GroundTruth:
    """Container for the checkable facts an answer is expected to contain.

    Every field defaults to an empty value, so callers only set what a
    given task can actually verify.
    """
    required_keywords: list[str] = field(default_factory=list)
    forbidden_keywords: list[str] = field(default_factory=list)
    expected_files: list[str] = field(default_factory=list)
    expected_symbols: list[str] = field(default_factory=list)
    expected_answer_summary: str = ""
    # Verifiable factual claims, stored as (claim_text, is_true) pairs.
    verifiable_claims: list[tuple[str, bool]] = field(default_factory=list)
    # Sub-questions of a multi-part task; each one should be addressed.
    required_aspects: list[str] = field(default_factory=list)
    # Built per instance so one task's weights can be changed without
    # affecting another's.
    scoring_weights: dict = field(default_factory=lambda: dict(
        keyword=0.15,
        structural=0.10,
        file_mention=0.15,
        factual=0.35,
        completeness=0.25,
    ))
54
+
55
+
56
@dataclass
class E2ETask:
    """One benchmark question together with the ground truth used to score it."""
    id: str
    category: str
    query: str
    ground_truth: GroundTruth
    difficulty: str = "medium"
    target_files: list[str] = field(default_factory=list)
    suggested_tools: list[str] = field(default_factory=list)
    multi_turn: bool = False

    def to_dict(self) -> dict:
        """Return the task as a plain dict.

        The nested ``ground_truth`` entry reports only the number of
        verifiable claims (``verifiable_claims_count``), not the claims
        themselves, and omits the scoring weights.
        """
        truth = self.ground_truth
        truth_view = {
            "required_keywords": truth.required_keywords,
            "forbidden_keywords": truth.forbidden_keywords,
            "expected_files": truth.expected_files,
            "expected_symbols": truth.expected_symbols,
            "expected_answer_summary": truth.expected_answer_summary,
            "verifiable_claims_count": len(truth.verifiable_claims),
            "required_aspects": truth.required_aspects,
        }

        top_level_fields = (
            "id", "category", "query", "difficulty",
            "target_files", "suggested_tools", "multi_turn",
        )
        task_view = {name: getattr(self, name) for name in top_level_fields}
        task_view["ground_truth"] = truth_view
        return task_view
87
+
88
+
89
# Prompt wrapper for a task question; ``{query}`` is the only placeholder.
TASK_PROMPT_TEMPLATE = "\n\n".join((
    "Use C3 MCP tools (not native Read/Grep/Glob). "
    "Be concise, cite file paths and line numbers.",
    "Question: {query}",
))
94
+
95
# Default category selection for benchmark runs. The remaining categories
# can still be requested ad hoc via --tasks; they are left out by default
# because they produce low or zero score delta between C3 and baseline.
BENCHMARK_CATEGORIES: set[str] = {
    "architecture",
    "bug_injection",
    "call_chain",
    "code_review",
    "multi_file_trace",
}
105
+
106
# Recommended C3 tool invocations, keyed by task category.
_CATEGORY_TOOL_HINTS: dict[str, list[str]] = dict(
    code_review=["c3_compress(mode='bug_scan')", "c3_read"],
    file_discovery=["c3_search(action='files')"],
    architecture=["c3_compress(mode='map')", "c3_memory"],
    bug_injection=["c3_compress(mode='bug_scan')", "c3_validate"],
    call_chain=["c3_search(action='code')", "c3_read(symbols=[...])"],
    explanation=["c3_read(symbols=[...])", "c3_compress"],
    refactor_suggestion=["c3_compress(mode='map')", "c3_search"],
    dependency_analysis=["c3_search(action='code')", "c3_compress(mode='map')"],
    multi_file_trace=["c3_search(action='code')", "c3_read(symbols=[...])"],
    large_file_needle=["c3_compress(mode='dense_map')", "c3_read(lines=[...])"],
)
119
+
120
+
121
+ class TaskBuilder:
122
+ """Generates benchmark tasks from codebase analysis."""
123
+
124
+ def __init__(self, project_path: str, indexer=None, file_memory=None):
125
+ self.project_path = Path(project_path).resolve()
126
+ self.indexer = indexer
127
+ self.file_memory = file_memory
128
+ self._file_records: dict[str, dict] = {}
129
+ self._all_symbols: list[tuple[str, str, dict]] = [] # (rel_path, symbol_name, section_info)
130
+
131
def build_tasks(self, max_per_category: int = 1,
                categories: set[str] | None = None) -> list[E2ETask]:
    """Scan the project and run the builders for the selected categories.

    Args:
        max_per_category: Max tasks per category (default 1).
        categories: Category names to include. A missing or empty value
            selects BENCHMARK_CATEGORIES.

    Returns:
        Tasks from every selected builder, in builder registration order.
        Empty when the scan found no symbols. A builder that raises is
        reported on stderr and skipped.
    """
    self._scan_files()
    if not self._all_symbols:
        return []

    selected = categories if categories else BENCHMARK_CATEGORIES

    # (category name, builder method) in the order tasks are emitted.
    registry = (
        ("explanation", self._symbol_explanation_tasks),
        ("file_discovery", self._file_discovery_tasks),
        ("dependency_analysis", self._dependency_analysis_tasks),
        ("architecture", self._architecture_tasks),
        ("call_chain", self._call_chain_tasks),
        ("code_review", self._code_review_tasks),
        ("multi_file_trace", self._multi_file_trace_tasks),
        ("large_file_needle", self._large_file_needle_tasks),
        ("refactor_suggestion", self._refactor_suggestion_tasks),
        ("bug_injection", self._bug_injection_tasks),
    )

    collected = []
    for cat_name, builder_fn in registry:
        if cat_name in selected:
            try:
                collected.extend(builder_fn(max_per_category))
            except Exception as exc:
                # Best effort: one failing category must not abort the others.
                print(f"  [e2e_tasks] Warning: {builder_fn.__name__} failed: {exc}", file=sys.stderr)
                if __debug__:
                    traceback.print_exc(file=sys.stderr)
    return collected
173
+
174
+ def _scan_files(self):
175
+ """Scan file memory to collect symbols and records."""
176
+ if not self.file_memory:
177
+ return
178
+ all_files = self.file_memory.list_tracked()
179
+ for rel_path in all_files:
180
+ record = self.file_memory.get(rel_path)
181
+ if not record or not record.get("sections"):
182
+ continue
183
+ self._file_records[rel_path] = record
184
+ for section in record["sections"]:
185
+ if section.get("type") in ("class", "function", "method"):
186
+ self._all_symbols.append((rel_path, section["name"], section))
187
+
188
def _base_ground_truth(self, **kwargs) -> GroundTruth:
    """Construct a GroundTruth from the given fields; unset fields keep their defaults."""
    truth = GroundTruth(**kwargs)
    return truth
191
+
192
+ def _pick_symbols(self, n: int, types: list[str] | None = None,
193
+ min_name_len: int = 0) -> list[tuple[str, str, dict]]:
194
+ """Pick n random symbols, optionally filtered by type and name length."""
195
+ pool = self._all_symbols
196
+ if types:
197
+ pool = [s for s in pool if s[2].get("type") in types]
198
+ if min_name_len:
199
+ pool = [s for s in pool if len(s[1]) >= min_name_len]
200
+ if len(pool) <= n:
201
+ return list(pool)
202
+ return random.sample(pool, n)
203
+
204
+ def _read_file_content(self, rel_path: str) -> str:
205
+ """Read file content, return empty string on failure."""
206
+ try:
207
+ return (self.project_path / rel_path).read_text(encoding="utf-8", errors="replace")
208
+ except Exception:
209
+ return ""
210
+
211
+ # ── Original categories (enhanced) ──────────────────────────────────
212
+
213
+ def _symbol_explanation_tasks(self, max_tasks: int) -> list[E2ETask]:
214
+ """'What does function X do?' — enhanced with multi-hop questions for classes."""
215
+ tasks = []
216
+ picks = self._pick_symbols(max_tasks, types=["function", "class"])
217
+ for rel_path, sym_name, section in picks:
218
+ keywords = [sym_name]
219
+ required_aspects = ["purpose"]
220
+
221
+ if section.get("type") == "class":
222
+ record = self._file_records.get(rel_path, {})
223
+ methods = [s["name"] for s in record.get("sections", [])
224
+ if s.get("type") == "method" and s.get("parent") == sym_name][:3]
225
+ keywords.extend(methods)
226
+ required_aspects.extend(["methods", "usage"])
227
+
228
+ docstring = section.get("docstring", "")
229
+ if docstring:
230
+ doc_words = [w for w in re.findall(r"[a-z_]{4,}", docstring.lower())
231
+ if w not in ("self", "this", "that", "with", "from", "none", "true", "false")][:3]
232
+ keywords.extend(doc_words)
233
+
234
+ # Build verifiable claims
235
+ claims = [(f"{sym_name} is defined in {rel_path}", True)]
236
+ params = section.get("params", "")
237
+ if params and "self" not in params.split(",")[0]:
238
+ claims.append((f"{sym_name} accepts parameters", True))
239
+
240
+ tasks.append(E2ETask(
241
+ id=f"explain_{sym_name}",
242
+ category="explanation",
243
+ query=(f"What does `{sym_name}` in `{rel_path}` do? "
244
+ f"Explain its purpose, parameters, and how it works."
245
+ + (" List its key methods and their roles." if section.get("type") == "class" else "")),
246
+ target_files=[rel_path],
247
+ difficulty="easy" if section.get("type") == "function" else "medium",
248
+ suggested_tools=_CATEGORY_TOOL_HINTS.get("explanation", []),
249
+ ground_truth=self._base_ground_truth(
250
+ required_keywords=keywords,
251
+ expected_files=[rel_path],
252
+ expected_symbols=[sym_name],
253
+ expected_answer_summary=docstring or f"Explanation of {sym_name}",
254
+ verifiable_claims=claims,
255
+ required_aspects=required_aspects,
256
+ ),
257
+ ))
258
+ return tasks
259
+
260
+ def _file_discovery_tasks(self, max_tasks: int) -> list[E2ETask]:
261
+ """'Which file contains class X?' — ground truth from index."""
262
+ tasks = []
263
+ picks = self._pick_symbols(max_tasks, types=["class"])
264
+ if not picks:
265
+ picks = self._pick_symbols(max_tasks, types=["function"])
266
+ for rel_path, sym_name, section in picks:
267
+ # Get other symbols in the same file for verification
268
+ record = self._file_records.get(rel_path, {})
269
+ other_symbols = [s["name"] for s in record.get("sections", [])
270
+ if s["name"] != sym_name and s.get("type") in ("class", "function")][:4]
271
+
272
+ tasks.append(E2ETask(
273
+ id=f"find_{sym_name}",
274
+ category="file_discovery",
275
+ query=f"Which file contains the `{sym_name}` {section.get('type', 'symbol')}? "
276
+ f"What other important symbols are defined in that same file?",
277
+ target_files=[rel_path],
278
+ difficulty="easy",
279
+ suggested_tools=_CATEGORY_TOOL_HINTS.get("file_discovery", []),
280
+ ground_truth=self._base_ground_truth(
281
+ required_keywords=[sym_name],
282
+ expected_files=[rel_path],
283
+ expected_symbols=[sym_name] + other_symbols[:2],
284
+ expected_answer_summary=f"{sym_name} is defined in {rel_path}",
285
+ verifiable_claims=[
286
+ (f"{sym_name} is in {rel_path}", True),
287
+ ] + [(f"{s} is also in this file", True) for s in other_symbols[:2]],
288
+ ),
289
+ ))
290
+ return tasks
291
+
292
+ def _dependency_analysis_tasks(self, max_tasks: int) -> list[E2ETask]:
293
+ """'What does file Y depend on?' — ground truth from imports."""
294
+ tasks = []
295
+ candidates = [(p, r) for p, r in self._file_records.items()
296
+ if r.get("imports")]
297
+ if not candidates:
298
+ return []
299
+ picks = random.sample(candidates, min(max_tasks, len(candidates)))
300
+ for rel_path, record in picks:
301
+ imports = record.get("imports", [])
302
+ import_keywords = []
303
+ for imp in imports[:5]:
304
+ mod = imp if isinstance(imp, str) else imp.get("module", "")
305
+ if mod and not mod.startswith("__"):
306
+ import_keywords.append(mod.split(".")[-1])
307
+
308
+ tasks.append(E2ETask(
309
+ id=f"deps_{Path(rel_path).stem}",
310
+ category="dependency_analysis",
311
+ query=f"What are the key dependencies of `{rel_path}`? "
312
+ f"List the modules it imports and explain what each is used for.",
313
+ target_files=[rel_path],
314
+ difficulty="medium",
315
+ suggested_tools=_CATEGORY_TOOL_HINTS.get("dependency_analysis", []),
316
+ ground_truth=self._base_ground_truth(
317
+ required_keywords=import_keywords[:4],
318
+ expected_files=[rel_path],
319
+ expected_answer_summary=f"Dependencies of {rel_path}: {', '.join(import_keywords)}",
320
+ verifiable_claims=[(f"{rel_path} imports {m}", True) for m in import_keywords[:3]],
321
+ required_aspects=["imports", "usage"],
322
+ ),
323
+ ))
324
+ return tasks
325
+
326
+ def _architecture_tasks(self, max_tasks: int) -> list[E2ETask]:
327
+ """'How is the project structured?' — ground truth from directory analysis."""
328
+ tasks = []
329
+ top_dirs = set()
330
+ for rel_path in self._file_records:
331
+ parts = Path(rel_path).parts
332
+ if len(parts) > 1:
333
+ top_dirs.add(parts[0])
334
+
335
+ if not top_dirs:
336
+ return []
337
+
338
+ dirs_list = sorted(top_dirs)
339
+ tasks.append(E2ETask(
340
+ id="architecture_overview",
341
+ category="architecture",
342
+ query="Describe the high-level architecture of this project. "
343
+ "What are the main directories/modules and what is each responsible for?",
344
+ target_files=[],
345
+ difficulty="medium",
346
+ suggested_tools=_CATEGORY_TOOL_HINTS.get("architecture", []),
347
+ ground_truth=self._base_ground_truth(
348
+ required_keywords=dirs_list[:5],
349
+ expected_answer_summary=f"Project has these main modules: {', '.join(dirs_list)}",
350
+ required_aspects=["directories", "responsibilities", "relationships"],
351
+ ),
352
+ ))
353
+
354
+ if len(dirs_list) >= 2:
355
+ target_dir = random.choice(dirs_list[:3])
356
+ dir_files = [p for p in self._file_records
357
+ if p.startswith(target_dir + "/") or p.startswith(target_dir + "\\")]
358
+ file_keywords = [Path(f).stem for f in dir_files[:5]]
359
+
360
+ tasks.append(E2ETask(
361
+ id=f"architecture_{target_dir}",
362
+ category="architecture",
363
+ query=f"Explain the purpose and internal structure of the `{target_dir}/` module. "
364
+ f"What are the key files and how do they relate to each other?",
365
+ target_files=dir_files[:5],
366
+ difficulty="hard",
367
+ suggested_tools=_CATEGORY_TOOL_HINTS.get("architecture", []),
368
+ multi_turn=True,
369
+ ground_truth=self._base_ground_truth(
370
+ required_keywords=file_keywords[:4],
371
+ expected_files=dir_files[:3],
372
+ expected_answer_summary=f"The {target_dir} module contains: {', '.join(file_keywords)}",
373
+ required_aspects=["files", "purpose", "relationships"],
374
+ ),
375
+ ))
376
+
377
+ return tasks[:max_tasks]
378
+
379
+ def _call_chain_tasks(self, max_tasks: int) -> list[E2ETask]:
380
+ """'What calls function X?' — ground truth from grep-verified call sites."""
381
+ tasks = []
382
+ picks = self._pick_symbols(max_tasks * 3, types=["function"], min_name_len=5)
383
+ for rel_path, sym_name, section in picks:
384
+ if len(tasks) >= max_tasks:
385
+ break
386
+ if sym_name.startswith("_"):
387
+ continue
388
+
389
+ call_sites = []
390
+ _call_pat = re.compile(r"\b" + re.escape(sym_name) + r"\b")
391
+ for other_path in self._file_records:
392
+ if other_path == rel_path:
393
+ continue
394
+ content = self._read_file_content(other_path)
395
+ if _call_pat.search(content):
396
+ call_sites.append(other_path)
397
+
398
+ if not call_sites:
399
+ continue
400
+
401
+ tasks.append(E2ETask(
402
+ id=f"callers_{sym_name}",
403
+ category="call_chain",
404
+ query=f"Find all files that call or reference the function `{sym_name}` (defined in `{rel_path}`). "
405
+ f"For each caller, explain why it uses this function.",
406
+ target_files=[rel_path] + call_sites[:3],
407
+ difficulty="hard",
408
+ suggested_tools=_CATEGORY_TOOL_HINTS.get("call_chain", []),
409
+ multi_turn=True,
410
+ ground_truth=self._base_ground_truth(
411
+ required_keywords=[sym_name] + [Path(c).stem for c in call_sites[:2]],
412
+ expected_files=call_sites[:3],
413
+ expected_symbols=[sym_name],
414
+ expected_answer_summary=f"{sym_name} is called from: {', '.join(call_sites[:3])}",
415
+ verifiable_claims=[
416
+ (f"{Path(c).stem} calls {sym_name}", True) for c in call_sites[:3]
417
+ ],
418
+ required_aspects=["call_sites", "reasons"],
419
+ ),
420
+ ))
421
+ return tasks
422
+
423
+ def _code_review_tasks(self, max_tasks: int) -> list[E2ETask]:
424
+ """Build code-review tasks for the files with the most sections; claims come from line count, bare excepts, TODO-style markers and long sections."""
425
+ tasks = []
426
+ complex_files = sorted(
427
+ self._file_records.items(),
428
+ key=lambda x: len(x[1].get("sections", [])),
429
+ reverse=True,
430
+ )[:max_tasks * 2]
431
+
432
+ for rel_path, record in complex_files[:max_tasks]:
433
+ symbols = [s["name"] for s in record.get("sections", [])
434
+ if s.get("type") in ("class", "function")][:5]
435
+ line_count = record.get("line_count", 0)
436
+
437
+ # Derive review aspects and claims from the record and file text; files over 500 lines get a file_length aspect
438
+ review_aspects = ["error_handling", "organization"]
439
+ claims = []
440
+ if line_count > 500:
441
+ review_aspects.append("file_length")
442
+ claims.append((f"{rel_path} is {line_count} lines long", True))
443
+ content = self._read_file_content(rel_path)
444
+ if content:
445
+ # Check for bare except (an "except" followed only by optional whitespace and a colon)
446
+ if re.search(r"except\s*:", content):
447
+ review_aspects.append("bare_except")
448
+ claims.append((f"{rel_path} has bare except clauses", True))
449
+ # Check for TODO/FIXME/HACK/XXX comment markers (case-insensitive); adds a claim but no review aspect
450
+ todo_count = len(re.findall(r"#\s*(TODO|FIXME|HACK|XXX)", content, re.IGNORECASE))
451
+ if todo_count:
452
+ claims.append((f"{rel_path} has TODO/FIXME comments", True))
453
+ # Check for long sections: line_end - line_start above 80; any section type counts, not only functions
454
+ long_fns = [s["name"] for s in record.get("sections", [])
455
+ if s.get("line_end", 0) - s.get("line_start", 0) > 80]
456
+ if long_fns:
457
+ review_aspects.append("long_functions")
458
+ claims.append((f"{rel_path} has long functions: {', '.join(long_fns[:2])}", True))
459
+
460
+ tasks.append(E2ETask(
461
+ id=f"review_{Path(rel_path).stem}",
462
+ category="code_review",
463
+ query=f"Review `{rel_path}` for code quality. Consider: error handling, "
464
+ f"code organization, naming conventions, and potential bugs. "
465
+ f"Be specific about what you find.",
466
+ target_files=[rel_path],
467
+ difficulty="hard",
468
+ suggested_tools=_CATEGORY_TOOL_HINTS.get("code_review", []),
469
+ multi_turn=True,
470
+ ground_truth=self._base_ground_truth(
471
+ required_keywords=symbols[:3],
472
+ expected_files=[rel_path],
473
+ expected_symbols=symbols[:3],
474
+ expected_answer_summary=f"Code review of {rel_path} ({line_count} lines, {len(symbols)} symbols)",
475
+ verifiable_claims=claims,
476
+ required_aspects=review_aspects,
477
+ ),
478
+ ))
479
+ return tasks
480
+
481
+ # ── New categories ──────────────────────────────────────────────────
482
+
483
+ def _multi_file_trace_tasks(self, max_tasks: int) -> list[E2ETask]:
484
+ """Trace data flow across multiple files — tests cross-file reasoning."""
485
+ tasks = []
486
+ # Find classes/functions imported and used across files
487
+ for rel_path, record in self._file_records.items():
488
+ if len(tasks) >= max_tasks:
489
+ break
490
+ imports = record.get("imports", [])
491
+ if not imports:
492
+ continue
493
+
494
+ # Find a local import (within this project)
495
+ for imp in imports:
496
+ mod = imp if isinstance(imp, str) else imp.get("module", "")
497
+ if not mod:
498
+ continue
499
+ # Check if it's an internal module
500
+ mod_parts = mod.replace(".", "/")
501
+ matching_files = [f for f in self._file_records
502
+ if mod_parts in f.replace("\\", "/")]
503
+ if not matching_files:
504
+ continue
505
+
506
+ source_file = matching_files[0]
507
+ source_record = self._file_records.get(source_file, {})
508
+ source_symbols = [s["name"] for s in source_record.get("sections", [])
509
+ if s.get("type") in ("class", "function")][:3]
510
+ if not source_symbols:
511
+ continue
512
+
513
+ target_symbol = source_symbols[0]
514
+ tasks.append(E2ETask(
515
+ id=f"trace_{Path(rel_path).stem}_to_{Path(source_file).stem}",
516
+ category="multi_file_trace",
517
+ query=(f"Trace how `{rel_path}` uses `{target_symbol}` from `{source_file}`. "
518
+ f"What data flows from the source to the consumer? "
519
+ f"What transformations happen along the way?"),
520
+ target_files=[rel_path, source_file],
521
+ difficulty="hard",
522
+ suggested_tools=_CATEGORY_TOOL_HINTS.get("multi_file_trace", []),
523
+ multi_turn=True,
524
+ ground_truth=self._base_ground_truth(
525
+ required_keywords=[target_symbol, Path(source_file).stem, Path(rel_path).stem],
526
+ expected_files=[rel_path, source_file],
527
+ expected_symbols=[target_symbol],
528
+ expected_answer_summary=(
529
+ f"{rel_path} imports {target_symbol} from {source_file} "
530
+ f"and uses it for data processing"
531
+ ),
532
+ verifiable_claims=[
533
+ (f"{rel_path} imports from {source_file}", True),
534
+ (f"{target_symbol} is used in {rel_path}", True),
535
+ ],
536
+ required_aspects=["import_chain", "data_flow", "transformations"],
537
+ ),
538
+ ))
539
+ break # One trace per file
540
+
541
+ return tasks[:max_tasks]
542
+
543
+ def _large_file_needle_tasks(self, max_tasks: int) -> list[E2ETask]:
544
+ """Find specific detail in a large file — tests surgical extraction."""
545
+ tasks = []
546
+ # Find files with many sections (large, complex files)
547
+ large_files = sorted(
548
+ self._file_records.items(),
549
+ key=lambda x: x[1].get("line_count", 0),
550
+ reverse=True,
551
+ )
552
+
553
+ for rel_path, record in large_files:
554
+ if len(tasks) >= max_tasks:
555
+ break
556
+ line_count = record.get("line_count", 0)
557
+ if line_count < 200:
558
+ continue
559
+
560
+ sections = record.get("sections", [])
561
+ # Pick a function in the bottom half of the file (harder to find)
562
+ bottom_half = [s for s in sections
563
+ if s.get("line_start", 0) > line_count // 2
564
+ and s.get("type") in ("function", "method")
565
+ and len(s.get("name", "")) >= 5]
566
+ if not bottom_half:
567
+ continue
568
+
569
+ target = random.choice(bottom_half)
570
+ target_name = target["name"]
571
+ target_line = target.get("line_start", 0)
572
+ docstring = target.get("docstring", "")
573
+ params = target.get("params", "")
574
+
575
+ claims = [
576
+ (f"{target_name} starts around line {target_line}", True),
577
+ ]
578
+ if docstring:
579
+ claims.append((f"{target_name} has a docstring", True))
580
+ if params:
581
+ claims.append((f"{target_name} accepts parameters", True))
582
+
583
+ tasks.append(E2ETask(
584
+ id=f"needle_{target_name}_in_{Path(rel_path).stem}",
585
+ category="large_file_needle",
586
+ query=(f"In `{rel_path}` ({line_count} lines), find the function `{target_name}`. "
587
+ f"What does it do, what are its parameters, and what line is it on?"),
588
+ target_files=[rel_path],
589
+ difficulty="hard",
590
+ suggested_tools=_CATEGORY_TOOL_HINTS.get("large_file_needle", []),
591
+ multi_turn=True,
592
+ ground_truth=self._base_ground_truth(
593
+ required_keywords=[target_name],
594
+ expected_files=[rel_path],
595
+ expected_symbols=[target_name],
596
+ expected_answer_summary=(
597
+ f"{target_name} at ~line {target_line} in {rel_path}: {docstring or 'no docstring'}"
598
+ ),
599
+ verifiable_claims=claims,
600
+ required_aspects=["location", "purpose", "parameters"],
601
+ ),
602
+ ))
603
+ return tasks
604
+
605
def _refactor_suggestion_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Build tasks asking for a duplication/refactoring analysis of sibling files.

    Files from ``self._file_records`` are grouped by parent directory.  For
    each directory holding at least three files, three of them are sampled at
    random and a single multi-turn ``refactor_suggestion`` task is emitted
    that asks for repeated patterns across those files.

    Args:
        max_tasks: Upper bound on the number of tasks returned.

    Returns:
        A list of at most ``max_tasks`` ``E2ETask`` objects; empty when no
        directory has three or more files or ``max_tasks`` is not positive.
    """
    tasks = []

    # Group files by parent directory; files living side by side are the
    # candidates handed to the model for a cross-file duplication review.
    dir_groups: dict[str, list[str]] = {}
    for rel_path in self._file_records:
        parent = str(Path(rel_path).parent)
        dir_groups.setdefault(parent, []).append(rel_path)

    for parent_dir, files in dir_groups.items():
        if len(tasks) >= max_tasks:
            break
        if len(files) < 3:
            continue

        # The guard above guarantees at least three files, so the sample
        # size is always exactly three.
        sample = random.sample(files, 3)
        file_stems = [Path(f).stem for f in sample]

        # Both path separators are flattened so the id looks the same
        # whether the directory was rendered with "/" or "\" (chr(92)).
        task_id = f"refactor_{parent_dir.replace('/', '_').replace(chr(92), '_')}"

        tasks.append(E2ETask(
            id=task_id,
            category="refactor_suggestion",
            query=(f"Analyze these files in `{parent_dir}/` for code duplication and refactoring opportunities: "
                   f"{', '.join('`' + f + '`' for f in sample)}. "
                   f"What patterns are repeated? How would you reduce the duplication?"),
            target_files=sample,
            difficulty="expert",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("refactor_suggestion", []),
            multi_turn=True,
            ground_truth=self._base_ground_truth(
                required_keywords=file_stems,
                expected_files=sample,
                expected_answer_summary=f"Refactoring analysis of {', '.join(file_stems)}",
                required_aspects=["duplication_patterns", "refactoring_approach", "shared_abstractions"],
            ),
        ))
    return tasks
653
+
654
def _bug_injection_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Build code-quality review tasks from anti-patterns already present in files.

    Each file in ``self._file_records`` with at least 500 characters of
    content is scanned with a few textual heuristics (bare ``except:``,
    ``except Exception:``, TODO/FIXME/HACK markers, ``.format(`` usage,
    ``type(x) ==`` comparisons, single-line ``except ...: pass``).  Files
    that trigger at least one heuristic become candidates, and up to
    ``max_tasks`` of them are sampled at random.  Nothing is injected into
    the files; the ground truth is whatever the heuristics detected.

    Args:
        max_tasks: Upper bound on the number of tasks returned.

    Returns:
        A list of ``bug_injection`` ``E2ETask`` objects; empty when
        ``max_tasks`` is not positive or no file matches any heuristic.
    """
    # Nothing to generate; also avoids random.sample raising on a negative
    # size and skips reading every file for no result.
    if max_tasks <= 0:
        return []

    # Only the path and the detected issue names are kept per candidate, so
    # file contents are not held in memory beyond the scan of each file.
    candidates: list[tuple[str, list[str]]] = []
    for rel_path in self._file_records:
        content = self._read_file_content(rel_path)
        if not content or len(content) < 500:
            continue
        issues = []
        # Detect real patterns we can ask about
        if re.search(r"except\s*:", content):
            issues.append("bare_except")
        if re.search(r"except\s+Exception\s*:", content):
            issues.append("broad_except")
        if "# TODO" in content or "# FIXME" in content or "# HACK" in content:
            issues.append("todo_markers")
        # Heuristic: .format( is only flagged when no f-string opener shows
        # up in the first 2000 characters of the file.
        if re.search(r"\.format\(", content) and "f\"" not in content[:2000]:
            issues.append("old_format_strings")
        if re.search(r"type\([\w]+\)\s*==", content):
            issues.append("type_comparison")
        # Matches only "except ...: pass" written on a single line.
        if re.search(r"except.*pass\s*$", content, re.MULTILINE):
            issues.append("silent_exception")
        if issues:
            candidates.append((rel_path, issues))

    if not candidates:
        return []

    # Keyword synonym groups — each entry is a list so the evaluator matches
    # if ANY alternative is present. Avoids fragile single-word requirements
    # (e.g. "overly" which models rarely write verbatim).
    _ISSUE_KEYWORDS: dict[str, list[str]] = {
        "bare_except": ["bare except", "except:", "catching all", "bare clause"],
        "broad_except": ["except Exception", "broad exception", "broad except", "swallow", "silent failure"],
        "todo_markers": ["TODO", "FIXME", "HACK", "tech debt"],
        "old_format_strings": [".format(", "f-string", "f\"", "f'"],
        "type_comparison": ["type()", "isinstance", "type comparison"],
        "silent_exception": ["except: pass", "swallow", "silent", "silently"],
    }
    _ISSUE_DESC: dict[str, str] = {
        "bare_except": "bare except clauses (catching all exceptions without specifying type)",
        "broad_except": "overly broad exception handling (catching base Exception)",
        "todo_markers": "unresolved TODO/FIXME/HACK comments",
        "old_format_strings": "use of .format() instead of f-strings",
        "type_comparison": "type comparison using type() == instead of isinstance()",
        "silent_exception": "silently swallowed exceptions (except: pass)",
    }

    tasks = []
    seen_ids: set[str] = set()
    for rel_path, issues in random.sample(candidates, min(max_tasks, len(candidates))):
        # Keywords and claims cover at most the first three detected issues;
        # the answer summary below still lists all of them.
        top_issues = issues[:3]
        expected_keywords = [_ISSUE_KEYWORDS.get(i, [i]) for i in top_issues]
        claims = [(f"{rel_path} has {_ISSUE_DESC.get(i, i)}", True) for i in top_issues]

        # The id is derived from the file stem; files sharing a stem (for
        # example several __init__.py) would otherwise get the same id, so
        # later duplicates receive a numeric suffix.  Unique stems keep the
        # plain "bugs_<stem>" form.
        base_id = f"bugs_{Path(rel_path).stem}"
        task_id = base_id
        suffix = 2
        while task_id in seen_ids:
            task_id = f"{base_id}_{suffix}"
            suffix += 1
        seen_ids.add(task_id)

        tasks.append(E2ETask(
            id=task_id,
            category="bug_injection",
            query=(f"Analyze `{rel_path}` for code quality issues, anti-patterns, and potential bugs. "
                   f"Focus on error handling, exception management, and code hygiene. "
                   f"List each issue with the specific line number or function name where it occurs, "
                   f"and suggest how to fix it."),
            target_files=[rel_path],
            difficulty="medium",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("bug_injection", []),
            ground_truth=self._base_ground_truth(
                required_keywords=expected_keywords,
                expected_files=[rel_path],
                expected_answer_summary=f"Issues in {rel_path}: {', '.join(issues)}",
                verifiable_claims=claims,
                required_aspects=["issues_found", "locations", "suggestions"],
                # Bug reports benefit most from factual accuracy and completeness.
                scoring_weights={
                    "keyword": 0.10,
                    "structural": 0.10,
                    "file_mention": 0.10,
                    "factual": 0.35,
                    "completeness": 0.35,
                },
            ),
        ))
    return tasks
735
+
736
+
737
def build_prompt(task: E2ETask) -> str:
    """Render the prompt text for ``task``.

    The task's query is substituted into ``TASK_PROMPT_TEMPLATE``.  When the
    task carries a non-empty ``suggested_tools`` list, a trailing line naming
    those tools (comma-separated) is appended after a blank line.
    """
    rendered = TASK_PROMPT_TEMPLATE.format(query=task.query)
    hinted_tools = task.suggested_tools
    if not hinted_tools:
        # No tool hints: the rendered template is the whole prompt.
        return rendered
    tool_list = ", ".join(hinted_tools)
    return rendered + f"\n\nSuggested C3 tools for this task: {tool_list}"