code-context-control 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/_hook_utils.py +99 -0
- cli/c3.py +6152 -0
- cli/commands/__init__.py +1 -0
- cli/commands/common.py +312 -0
- cli/commands/parser.py +286 -0
- cli/docs.html +3178 -0
- cli/edits.html +878 -0
- cli/hook_auto_snapshot.py +142 -0
- cli/hook_c3_signal.py +61 -0
- cli/hook_c3read.py +116 -0
- cli/hook_edit_ledger.py +213 -0
- cli/hook_edit_unlock.py +170 -0
- cli/hook_filter.py +130 -0
- cli/hook_ghost_files.py +238 -0
- cli/hook_pretool_enforce.py +334 -0
- cli/hook_read.py +200 -0
- cli/hook_session_stats.py +62 -0
- cli/hook_terse_advisor.py +190 -0
- cli/hub.html +3764 -0
- cli/hub_server.py +1619 -0
- cli/mcp_proxy.py +428 -0
- cli/mcp_server.py +660 -0
- cli/server.py +2985 -0
- cli/tools/__init__.py +4 -0
- cli/tools/_helpers.py +65 -0
- cli/tools/agent.py +1165 -0
- cli/tools/compress.py +215 -0
- cli/tools/delegate.py +1184 -0
- cli/tools/edit.py +313 -0
- cli/tools/edits.py +118 -0
- cli/tools/filter.py +285 -0
- cli/tools/impact.py +163 -0
- cli/tools/memory.py +469 -0
- cli/tools/read.py +224 -0
- cli/tools/search.py +337 -0
- cli/tools/session.py +95 -0
- cli/tools/shell.py +193 -0
- cli/tools/status.py +306 -0
- cli/tools/validate.py +310 -0
- cli/ui/api.js +36 -0
- cli/ui/app.js +207 -0
- cli/ui/components/chat.js +758 -0
- cli/ui/components/dashboard.js +689 -0
- cli/ui/components/edits.js +220 -0
- cli/ui/components/instructions.js +481 -0
- cli/ui/components/memory.js +626 -0
- cli/ui/components/sessions.js +606 -0
- cli/ui/components/settings.js +1404 -0
- cli/ui/components/sidebar.js +156 -0
- cli/ui/icons.js +51 -0
- cli/ui/shared.js +119 -0
- cli/ui/theme.js +22 -0
- cli/ui.html +168 -0
- cli/ui_legacy.html +6797 -0
- cli/ui_nano.html +503 -0
- code_context_control-2.28.0.dist-info/METADATA +248 -0
- code_context_control-2.28.0.dist-info/RECORD +150 -0
- code_context_control-2.28.0.dist-info/WHEEL +5 -0
- code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
- code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
- code_context_control-2.28.0.dist-info/top_level.txt +5 -0
- core/__init__.py +75 -0
- core/config.py +269 -0
- core/ide.py +188 -0
- oracle/__init__.py +1 -0
- oracle/config.py +75 -0
- oracle/oracle.html +3900 -0
- oracle/oracle_server.py +663 -0
- oracle/services/__init__.py +1 -0
- oracle/services/c3_bridge.py +210 -0
- oracle/services/chat_engine.py +1103 -0
- oracle/services/chat_store.py +155 -0
- oracle/services/cross_memory.py +154 -0
- oracle/services/federated_graph.py +463 -0
- oracle/services/health_checker.py +117 -0
- oracle/services/insight_engine.py +307 -0
- oracle/services/memory_reader.py +106 -0
- oracle/services/memory_writer.py +182 -0
- oracle/services/ollama_bridge.py +332 -0
- oracle/services/project_scanner.py +87 -0
- oracle/services/review_agent.py +206 -0
- services/__init__.py +1 -0
- services/activity_log.py +93 -0
- services/agent_base.py +124 -0
- services/agents.py +1529 -0
- services/auto_memory.py +407 -0
- services/bench/__init__.py +6 -0
- services/bench/external/__init__.py +29 -0
- services/bench/external/aider_polyglot.py +405 -0
- services/bench/external/swe_bench.py +485 -0
- services/benchmark_dashboard.py +596 -0
- services/claude_md.py +785 -0
- services/compressor.py +592 -0
- services/context_snapshot.py +356 -0
- services/conversation_store.py +870 -0
- services/doc_index.py +537 -0
- services/e2e_benchmark.py +2884 -0
- services/e2e_evaluator.py +396 -0
- services/e2e_tasks.py +743 -0
- services/edit_ledger.py +459 -0
- services/embedding_index.py +341 -0
- services/error_reporting.py +123 -0
- services/file_memory.py +734 -0
- services/hub_service.py +585 -0
- services/indexer.py +712 -0
- services/memory.py +318 -0
- services/memory_consolidator.py +538 -0
- services/memory_graph.py +382 -0
- services/memory_grounder.py +304 -0
- services/memory_scorer.py +246 -0
- services/metrics.py +86 -0
- services/notifications.py +209 -0
- services/ollama_client.py +201 -0
- services/output_filter.py +488 -0
- services/parser.py +1238 -0
- services/project_manager.py +579 -0
- services/protocol.py +306 -0
- services/proxy_state.py +152 -0
- services/retrieval_broker.py +129 -0
- services/router.py +414 -0
- services/runtime.py +326 -0
- services/session_benchmark.py +1945 -0
- services/session_manager.py +1026 -0
- services/session_preloader.py +251 -0
- services/text_index.py +90 -0
- services/tool_classifier.py +176 -0
- services/transcript_index.py +340 -0
- services/validation_cache.py +155 -0
- services/vector_store.py +299 -0
- services/version_tracker.py +271 -0
- services/watcher.py +192 -0
- tui/__init__.py +0 -0
- tui/backend.py +59 -0
- tui/main.py +145 -0
- tui/screens/__init__.py +1 -0
- tui/screens/benchmark_view.py +109 -0
- tui/screens/claudemd_view.py +46 -0
- tui/screens/compress_view.py +52 -0
- tui/screens/index_view.py +74 -0
- tui/screens/init_view.py +82 -0
- tui/screens/mcp_view.py +73 -0
- tui/screens/optimize_view.py +41 -0
- tui/screens/pipe_view.py +46 -0
- tui/screens/projects_view.py +355 -0
- tui/screens/search_view.py +55 -0
- tui/screens/session_view.py +143 -0
- tui/screens/stats.py +158 -0
- tui/screens/ui_view.py +54 -0
- tui/theme.tcss +335 -0
services/e2e_tasks.py
ADDED
|
@@ -0,0 +1,743 @@
|
|
|
1
|
+
"""
|
|
2
|
+
E2E Benchmark Task Library — dynamically generated tasks with verifiable ground truths.
|
|
3
|
+
|
|
4
|
+
Tasks are built from codebase analysis so they work on any project.
|
|
5
|
+
Each task has objectively verifiable ground truths derived from the index/file memory.
|
|
6
|
+
|
|
7
|
+
Task categories:
|
|
8
|
+
- explanation (easy): Single-symbol explanation
|
|
9
|
+
- file_discovery (easy): Locate where a symbol is defined
|
|
10
|
+
- dependency_analysis (medium): Analyze file imports
|
|
11
|
+
- architecture (medium/hard): Project structure understanding
|
|
12
|
+
- call_chain (hard): Cross-file reference tracing
|
|
13
|
+
- code_review (hard): Quality analysis of complex files
|
|
14
|
+
- multi_file_trace (hard): Data flow across multiple files
|
|
15
|
+
- large_file_needle (hard): Find specific detail in a large file
|
|
16
|
+
- refactor_suggestion (expert): Cross-file duplication analysis
|
|
17
|
+
- bug_injection (medium): Detect planted syntax/logic issues
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import random
|
|
23
|
+
import re
|
|
24
|
+
import sys
|
|
25
|
+
import traceback
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
# Per-difficulty multipliers for weighted scoring: the harder the task,
# the more it counts.
DIFFICULTY_WEIGHTS = dict(
    easy=0.5,
    medium=1.0,
    hard=2.0,
    expert=3.0,
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _default_scoring_weights() -> dict:
    """Return a fresh dict holding the default weight of each scoring dimension."""
    return dict(
        keyword=0.15,
        structural=0.10,
        file_mention=0.15,
        factual=0.35,
        completeness=0.25,
    )


@dataclass
class GroundTruth:
    """Verifiable facts describing the answer a task is expected to receive.

    Every field is optional; list fields default to a new empty list and
    ``scoring_weights`` defaults to a new copy of the default weights.
    """

    required_keywords: list[str] = field(default_factory=list)
    forbidden_keywords: list[str] = field(default_factory=list)
    expected_files: list[str] = field(default_factory=list)
    expected_symbols: list[str] = field(default_factory=list)
    expected_answer_summary: str = ""
    # Checkable factual statements, stored as (claim_text, is_true) pairs.
    verifiable_claims: list[tuple[str, bool]] = field(default_factory=list)
    # Multi-part questions: the sub-topics that should each be addressed.
    required_aspects: list[str] = field(default_factory=list)
    scoring_weights: dict = field(default_factory=_default_scoring_weights)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class E2ETask:
    """One benchmark question bundled with the ground truth used to score it."""

    id: str
    category: str
    query: str
    ground_truth: GroundTruth
    difficulty: str = "medium"
    target_files: list[str] = field(default_factory=list)
    suggested_tools: list[str] = field(default_factory=list)
    multi_turn: bool = False

    def to_dict(self) -> dict:
        """Return the task as a plain dictionary.

        The ground truth is nested under ``"ground_truth"``. The claims
        themselves are not included, only how many there are
        (``"verifiable_claims_count"``); scoring weights are left out too.
        """
        truth = self.ground_truth
        truth_view = {
            "required_keywords": truth.required_keywords,
            "forbidden_keywords": truth.forbidden_keywords,
            "expected_files": truth.expected_files,
            "expected_symbols": truth.expected_symbols,
            "expected_answer_summary": truth.expected_answer_summary,
            "verifiable_claims_count": len(truth.verifiable_claims),
            "required_aspects": truth.required_aspects,
        }

        payload = {
            "id": self.id,
            "category": self.category,
            "query": self.query,
            "difficulty": self.difficulty,
            "target_files": self.target_files,
            "suggested_tools": self.suggested_tools,
            "multi_turn": self.multi_turn,
        }
        payload["ground_truth"] = truth_view
        return payload
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# Prompt text with a ``{query}`` placeholder for the task question.
TASK_PROMPT_TEMPLATE = "".join((
    "Use C3 MCP tools (not native Read/Grep/Glob). ",
    "Be concise, cite file paths and line numbers.",
    "\n\n",
    "Question: {query}",
))

# Categories that take part in benchmark runs. The remaining categories are
# still available for ad-hoc use via --tasks, but are left out by default
# because they produce little or no score delta between C3 and baseline.
BENCHMARK_CATEGORIES: set[str] = {
    "architecture",
    "bug_injection",
    "call_chain",
    "code_review",
    "multi_file_trace",
}

# Tool-call strings that appear in more than one category below.
_HINT_BUG_SCAN = "c3_compress(mode='bug_scan')"
_HINT_MAP = "c3_compress(mode='map')"
_HINT_SEARCH_CODE = "c3_search(action='code')"
_HINT_READ_SYMBOLS = "c3_read(symbols=[...])"

# Task category -> C3 tools recommended for that category.
_CATEGORY_TOOL_HINTS: dict[str, list[str]] = {
    "code_review": [_HINT_BUG_SCAN, "c3_read"],
    "file_discovery": ["c3_search(action='files')"],
    "architecture": [_HINT_MAP, "c3_memory"],
    "bug_injection": [_HINT_BUG_SCAN, "c3_validate"],
    "call_chain": [_HINT_SEARCH_CODE, _HINT_READ_SYMBOLS],
    "explanation": [_HINT_READ_SYMBOLS, "c3_compress"],
    "refactor_suggestion": [_HINT_MAP, "c3_search"],
    "dependency_analysis": [_HINT_SEARCH_CODE, _HINT_MAP],
    "multi_file_trace": [_HINT_SEARCH_CODE, _HINT_READ_SYMBOLS],
    "large_file_needle": ["c3_compress(mode='dense_map')", "c3_read(lines=[...])"],
}
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class TaskBuilder:
|
|
122
|
+
"""Generates benchmark tasks from codebase analysis."""
|
|
123
|
+
|
|
124
|
+
def __init__(self, project_path: str, indexer=None, file_memory=None):
|
|
125
|
+
self.project_path = Path(project_path).resolve()
|
|
126
|
+
self.indexer = indexer
|
|
127
|
+
self.file_memory = file_memory
|
|
128
|
+
self._file_records: dict[str, dict] = {}
|
|
129
|
+
self._all_symbols: list[tuple[str, str, dict]] = [] # (rel_path, symbol_name, section_info)
|
|
130
|
+
|
|
131
|
+
def build_tasks(self, max_per_category: int = 1,
|
|
132
|
+
categories: set[str] | None = None) -> list[E2ETask]:
|
|
133
|
+
"""Build benchmark tasks, filtered to high-signal categories.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
max_per_category: Max tasks per category (default 1).
|
|
137
|
+
categories: Set of category names to include.
|
|
138
|
+
Defaults to BENCHMARK_CATEGORIES.
|
|
139
|
+
"""
|
|
140
|
+
self._scan_files()
|
|
141
|
+
if not self._all_symbols:
|
|
142
|
+
return []
|
|
143
|
+
|
|
144
|
+
include = categories or BENCHMARK_CATEGORIES
|
|
145
|
+
|
|
146
|
+
# Map category name -> builder method
|
|
147
|
+
all_builders = {
|
|
148
|
+
"explanation": self._symbol_explanation_tasks,
|
|
149
|
+
"file_discovery": self._file_discovery_tasks,
|
|
150
|
+
"dependency_analysis": self._dependency_analysis_tasks,
|
|
151
|
+
"architecture": self._architecture_tasks,
|
|
152
|
+
"call_chain": self._call_chain_tasks,
|
|
153
|
+
"code_review": self._code_review_tasks,
|
|
154
|
+
"multi_file_trace": self._multi_file_trace_tasks,
|
|
155
|
+
"large_file_needle": self._large_file_needle_tasks,
|
|
156
|
+
"refactor_suggestion": self._refactor_suggestion_tasks,
|
|
157
|
+
"bug_injection": self._bug_injection_tasks,
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
tasks = []
|
|
161
|
+
for cat_name, builder_fn in all_builders.items():
|
|
162
|
+
if cat_name not in include:
|
|
163
|
+
continue
|
|
164
|
+
try:
|
|
165
|
+
category_tasks = builder_fn(max_per_category)
|
|
166
|
+
tasks.extend(category_tasks)
|
|
167
|
+
except Exception as exc:
|
|
168
|
+
print(f" [e2e_tasks] Warning: {builder_fn.__name__} failed: {exc}", file=sys.stderr)
|
|
169
|
+
if __debug__:
|
|
170
|
+
traceback.print_exc(file=sys.stderr)
|
|
171
|
+
continue
|
|
172
|
+
return tasks
|
|
173
|
+
|
|
174
|
+
def _scan_files(self):
|
|
175
|
+
"""Scan file memory to collect symbols and records."""
|
|
176
|
+
if not self.file_memory:
|
|
177
|
+
return
|
|
178
|
+
all_files = self.file_memory.list_tracked()
|
|
179
|
+
for rel_path in all_files:
|
|
180
|
+
record = self.file_memory.get(rel_path)
|
|
181
|
+
if not record or not record.get("sections"):
|
|
182
|
+
continue
|
|
183
|
+
self._file_records[rel_path] = record
|
|
184
|
+
for section in record["sections"]:
|
|
185
|
+
if section.get("type") in ("class", "function", "method"):
|
|
186
|
+
self._all_symbols.append((rel_path, section["name"], section))
|
|
187
|
+
|
|
188
|
+
def _base_ground_truth(self, **kwargs) -> GroundTruth:
    """Build a ``GroundTruth`` from the given keyword arguments.

    Fields that are not passed keep the defaults declared on ``GroundTruth``.
    """
    truth = GroundTruth(**kwargs)
    return truth
|
|
191
|
+
|
|
192
|
+
def _pick_symbols(self, n: int, types: list[str] | None = None,
|
|
193
|
+
min_name_len: int = 0) -> list[tuple[str, str, dict]]:
|
|
194
|
+
"""Pick n random symbols, optionally filtered by type and name length."""
|
|
195
|
+
pool = self._all_symbols
|
|
196
|
+
if types:
|
|
197
|
+
pool = [s for s in pool if s[2].get("type") in types]
|
|
198
|
+
if min_name_len:
|
|
199
|
+
pool = [s for s in pool if len(s[1]) >= min_name_len]
|
|
200
|
+
if len(pool) <= n:
|
|
201
|
+
return list(pool)
|
|
202
|
+
return random.sample(pool, n)
|
|
203
|
+
|
|
204
|
+
def _read_file_content(self, rel_path: str) -> str:
|
|
205
|
+
"""Read file content, return empty string on failure."""
|
|
206
|
+
try:
|
|
207
|
+
return (self.project_path / rel_path).read_text(encoding="utf-8", errors="replace")
|
|
208
|
+
except Exception:
|
|
209
|
+
return ""
|
|
210
|
+
|
|
211
|
+
# ── Original categories (enhanced) ──────────────────────────────────
|
|
212
|
+
|
|
213
|
+
def _symbol_explanation_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Build 'What does X do?' tasks for randomly picked functions and classes.

    For a class, up to three of its methods are added to the required
    keywords and the query additionally asks for its key methods. Up to
    three words taken from the symbol's docstring are added as keywords too.

    Args:
        max_tasks: Maximum number of symbols to pick (one task per symbol).

    Returns:
        One "explanation" task per picked symbol; functions are rated
        "easy", classes "medium".
    """
    stop_words = ("self", "this", "that", "with", "from", "none", "true", "false")
    built = []
    for rel_path, sym_name, section in self._pick_symbols(max_tasks, types=["function", "class"]):
        sym_type = section.get("type")
        keywords = [sym_name]
        required_aspects = ["purpose"]
        question = (f"What does `{sym_name}` in `{rel_path}` do? "
                    f"Explain its purpose, parameters, and how it works.")

        if sym_type == "class":
            # Multi-hop part: the answer should also cover the class's methods.
            siblings = self._file_records.get(rel_path, {}).get("sections", [])
            own_methods = [s["name"] for s in siblings
                           if s.get("type") == "method" and s.get("parent") == sym_name]
            keywords += own_methods[:3]
            required_aspects += ["methods", "usage"]
            question += " List its key methods and their roles."

        docstring = section.get("docstring", "")
        if docstring:
            words = re.findall(r"[a-z_]{4,}", docstring.lower())
            keywords += [w for w in words if w not in stop_words][:3]

        # Facts about the symbol that can be checked against the answer.
        claims = [(f"{sym_name} is defined in {rel_path}", True)]
        params = section.get("params", "")
        if params:
            leading_param = params.split(",")[0]
            if "self" not in leading_param:
                claims.append((f"{sym_name} accepts parameters", True))

        truth = self._base_ground_truth(
            required_keywords=keywords,
            expected_files=[rel_path],
            expected_symbols=[sym_name],
            expected_answer_summary=docstring or f"Explanation of {sym_name}",
            verifiable_claims=claims,
            required_aspects=required_aspects,
        )
        built.append(E2ETask(
            id=f"explain_{sym_name}",
            category="explanation",
            query=question,
            target_files=[rel_path],
            difficulty="easy" if sym_type == "function" else "medium",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("explanation", []),
            ground_truth=truth,
        ))
    return built
|
|
259
|
+
|
|
260
|
+
def _file_discovery_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Build 'Which file contains X?' tasks from the scanned symbols.

    Classes are preferred; functions are used only when no class could be
    picked. Up to two other classes/functions from the same file are added
    to the expected symbols and to the verifiable claims.

    Args:
        max_tasks: Maximum number of symbols to pick (one task per symbol).

    Returns:
        One "file_discovery" task (difficulty "easy") per picked symbol.
    """
    picks = (self._pick_symbols(max_tasks, types=["class"])
             or self._pick_symbols(max_tasks, types=["function"]))

    found = []
    for rel_path, sym_name, section in picks:
        # Other top-level symbols of the same file, used for verification.
        neighbours = []
        for entry in self._file_records.get(rel_path, {}).get("sections", []):
            if entry["name"] != sym_name and entry.get("type") in ("class", "function"):
                neighbours.append(entry["name"])
        companions = neighbours[:2]

        kind = section.get('type', 'symbol')
        claims = [(f"{sym_name} is in {rel_path}", True)]
        claims.extend((f"{other} is also in this file", True) for other in companions)

        truth = self._base_ground_truth(
            required_keywords=[sym_name],
            expected_files=[rel_path],
            expected_symbols=[sym_name] + companions,
            expected_answer_summary=f"{sym_name} is defined in {rel_path}",
            verifiable_claims=claims,
        )
        found.append(E2ETask(
            id=f"find_{sym_name}",
            category="file_discovery",
            query=(f"Which file contains the `{sym_name}` {kind}? "
                   f"What other important symbols are defined in that same file?"),
            target_files=[rel_path],
            difficulty="easy",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("file_discovery", []),
            ground_truth=truth,
        ))
    return found
|
|
291
|
+
|
|
292
|
+
def _dependency_analysis_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Build 'What does file Y depend on?' tasks from recorded imports.

    Files are sampled at random among those whose record lists imports.
    Keywords are the last dotted component of each of the first five
    imports, skipping modules whose name starts with ``__``.

    Args:
        max_tasks: Maximum number of files to sample (one task per file).

    Returns:
        One "dependency_analysis" task (difficulty "medium") per sampled
        file; an empty list when no file has imports.
    """
    with_imports = [(path, rec) for path, rec in self._file_records.items()
                    if rec.get("imports")]
    if not with_imports:
        return []

    sample_size = min(max_tasks, len(with_imports))
    produced = []
    for rel_path, record in random.sample(with_imports, sample_size):
        import_keywords = []
        for entry in record.get("imports", [])[:5]:
            # An import is either a plain module string or a dict with "module".
            module = entry if isinstance(entry, str) else entry.get("module", "")
            if not module or module.startswith("__"):
                continue
            import_keywords.append(module.split(".")[-1])

        joined = ', '.join(import_keywords)
        truth = self._base_ground_truth(
            required_keywords=import_keywords[:4],
            expected_files=[rel_path],
            expected_answer_summary=f"Dependencies of {rel_path}: {joined}",
            verifiable_claims=[(f"{rel_path} imports {name}", True) for name in import_keywords[:3]],
            required_aspects=["imports", "usage"],
        )
        produced.append(E2ETask(
            id=f"deps_{Path(rel_path).stem}",
            category="dependency_analysis",
            query=(f"What are the key dependencies of `{rel_path}`? "
                   f"List the modules it imports and explain what each is used for."),
            target_files=[rel_path],
            difficulty="medium",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("dependency_analysis", []),
            ground_truth=truth,
        ))
    return produced
|
|
325
|
+
|
|
326
|
+
def _architecture_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Build project-structure tasks from the top-level directories.

    The first task asks for an overview of the whole project. When at least
    two top-level directories exist, a second task asks about one directory
    chosen at random among the first three (alphabetically).

    Args:
        max_tasks: Maximum number of tasks to return.

    Returns:
        At most ``max_tasks`` "architecture" tasks; an empty list when no
        scanned file lives inside a directory.
    """
    top_level = {Path(p).parts[0] for p in self._file_records if len(Path(p).parts) > 1}
    if not top_level:
        return []
    dirs_list = sorted(top_level)

    overview = E2ETask(
        id="architecture_overview",
        category="architecture",
        query=("Describe the high-level architecture of this project. "
               "What are the main directories/modules and what is each responsible for?"),
        target_files=[],
        difficulty="medium",
        suggested_tools=_CATEGORY_TOOL_HINTS.get("architecture", []),
        ground_truth=self._base_ground_truth(
            required_keywords=dirs_list[:5],
            expected_answer_summary=f"Project has these main modules: {', '.join(dirs_list)}",
            required_aspects=["directories", "responsibilities", "relationships"],
        ),
    )
    result = [overview]

    if len(dirs_list) >= 2:
        target_dir = random.choice(dirs_list[:3])
        # Accept both separator styles in the recorded relative paths.
        prefixes = (target_dir + "/", target_dir + "\\")
        dir_files = [p for p in self._file_records if p.startswith(prefixes)]
        file_keywords = [Path(name).stem for name in dir_files[:5]]

        result.append(E2ETask(
            id=f"architecture_{target_dir}",
            category="architecture",
            query=(f"Explain the purpose and internal structure of the `{target_dir}/` module. "
                   f"What are the key files and how do they relate to each other?"),
            target_files=dir_files[:5],
            difficulty="hard",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("architecture", []),
            multi_turn=True,
            ground_truth=self._base_ground_truth(
                required_keywords=file_keywords[:4],
                expected_files=dir_files[:3],
                expected_answer_summary=f"The {target_dir} module contains: {', '.join(file_keywords)}",
                required_aspects=["files", "purpose", "relationships"],
            ),
        ))

    return result[:max_tasks]
|
|
378
|
+
|
|
379
|
+
def _call_chain_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Build 'What calls function X?' tasks from text-verified references.

    Up to ``max_tasks * 3`` public functions with names of five or more
    characters are considered. A candidate becomes a task when its name
    appears as a whole word in at least one other scanned file.

    File contents are read from disk at most once per call and shared by
    all candidates, instead of re-reading every file for every candidate.

    Args:
        max_tasks: Maximum number of tasks to return.

    Returns:
        Up to ``max_tasks`` "call_chain" tasks (difficulty "hard").
    """
    tasks = []
    contents: dict[str, str] = {}  # rel_path -> file text, filled lazily

    def text_of(path: str) -> str:
        """Return the cached text of ``path``, reading it on first use."""
        if path not in contents:
            contents[path] = self._read_file_content(path)
        return contents[path]

    picks = self._pick_symbols(max_tasks * 3, types=["function"], min_name_len=5)
    for rel_path, sym_name, _section in picks:
        if len(tasks) >= max_tasks:
            break
        if sym_name.startswith("_"):
            # Private helpers are not interesting cross-file targets.
            continue

        # Whole-word match so that "load" does not match "reload".
        pattern = re.compile(r"\b" + re.escape(sym_name) + r"\b")
        call_sites = [other for other in self._file_records
                      if other != rel_path and pattern.search(text_of(other))]
        if not call_sites:
            continue

        tasks.append(E2ETask(
            id=f"callers_{sym_name}",
            category="call_chain",
            query=f"Find all files that call or reference the function `{sym_name}` (defined in `{rel_path}`). "
                  f"For each caller, explain why it uses this function.",
            target_files=[rel_path] + call_sites[:3],
            difficulty="hard",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("call_chain", []),
            multi_turn=True,
            ground_truth=self._base_ground_truth(
                required_keywords=[sym_name] + [Path(c).stem for c in call_sites[:2]],
                expected_files=call_sites[:3],
                expected_symbols=[sym_name],
                expected_answer_summary=f"{sym_name} is called from: {', '.join(call_sites[:3])}",
                verifiable_claims=[
                    (f"{Path(c).stem} calls {sym_name}", True) for c in call_sites[:3]
                ],
                required_aspects=["call_sites", "reasons"],
            ),
        ))
    return tasks
|
|
422
|
+
|
|
423
|
+
def _code_review_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Build 'Review file X' tasks for the files with the most sections.

    Ground-truth claims are derived from the file itself: overall length
    (more than 500 lines), bare ``except:`` clauses, TODO/FIXME/HACK/XXX
    comments, and functions or methods spanning more than 80 lines. The
    content-based checks run only when the file could be read.

    Only ``function`` and ``method`` sections count as long functions, so a
    large class is not reported under the "long functions" claim.

    Args:
        max_tasks: Maximum number of files to review (one task per file).

    Returns:
        Up to ``max_tasks`` "code_review" tasks (difficulty "hard").
    """
    tasks = []
    by_section_count = sorted(
        self._file_records.items(),
        key=lambda item: len(item[1].get("sections", [])),
        reverse=True,
    )

    for rel_path, record in by_section_count[:max_tasks]:
        sections = record.get("sections", [])
        symbols = [s["name"] for s in sections
                   if s.get("type") in ("class", "function")][:5]
        line_count = record.get("line_count", 0)

        # Detect reviewable patterns from the code structure.
        review_aspects = ["error_handling", "organization"]
        claims = []
        if line_count > 500:
            review_aspects.append("file_length")
            claims.append((f"{rel_path} is {line_count} lines long", True))

        content = self._read_file_content(rel_path)
        if content:
            # Bare except clauses.
            if re.search(r"except\s*:", content):
                review_aspects.append("bare_except")
                claims.append((f"{rel_path} has bare except clauses", True))
            # TODO/FIXME style markers.
            if re.search(r"#\s*(TODO|FIXME|HACK|XXX)", content, re.IGNORECASE):
                claims.append((f"{rel_path} has TODO/FIXME comments", True))
            # Long functions: callables only, never whole classes.
            long_fns = [s["name"] for s in sections
                        if s.get("type") in ("function", "method")
                        and s.get("line_end", 0) - s.get("line_start", 0) > 80]
            if long_fns:
                review_aspects.append("long_functions")
                claims.append((f"{rel_path} has long functions: {', '.join(long_fns[:2])}", True))

        tasks.append(E2ETask(
            id=f"review_{Path(rel_path).stem}",
            category="code_review",
            query=f"Review `{rel_path}` for code quality. Consider: error handling, "
                  f"code organization, naming conventions, and potential bugs. "
                  f"Be specific about what you find.",
            target_files=[rel_path],
            difficulty="hard",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("code_review", []),
            multi_turn=True,
            ground_truth=self._base_ground_truth(
                required_keywords=symbols[:3],
                expected_files=[rel_path],
                expected_symbols=symbols[:3],
                expected_answer_summary=f"Code review of {rel_path} ({line_count} lines, {len(symbols)} symbols)",
                verifiable_claims=claims,
                required_aspects=review_aspects,
            ),
        ))
    return tasks
|
|
480
|
+
|
|
481
|
+
# ── New categories ──────────────────────────────────────────────────
|
|
482
|
+
|
|
483
|
+
def _multi_file_trace_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Build cross-file data-flow tasks from project-internal imports.

    For each scanned file, the first import that resolves to another
    scanned file with at least one class or function yields one task
    (at most one task per importing file).

    An import resolves to a file only on path-component boundaries: module
    ``pkg.mod`` matches ``pkg/mod.<ext>``, ``<dir>/pkg/mod.<ext>`` or the
    package file ``pkg/mod/__init__.<ext>``. A plain substring test would
    let ``re`` match ``core/config.py`` and produce false "imports from"
    claims. The importing file itself is never taken as the source.

    Args:
        max_tasks: Maximum number of tasks to return.

    Returns:
        Up to ``max_tasks`` "multi_file_trace" tasks (difficulty "hard").
    """

    def resolves_to(path: str, module_path: str) -> bool:
        """Tell whether ``path`` is the file or package named by ``module_path``."""
        normalized = path.replace("\\", "/")
        stem, dot, ext = normalized.rpartition(".")
        # Only treat the last dot as an extension when it is in the file name.
        base = stem if dot and "/" not in ext else normalized
        if base.endswith("/__init__"):
            base = base[: -len("/__init__")]
        return base == module_path or base.endswith("/" + module_path)

    tasks = []
    for rel_path, record in self._file_records.items():
        if len(tasks) >= max_tasks:
            break

        for imp in record.get("imports") or []:
            mod = imp if isinstance(imp, str) else imp.get("module", "")
            if not mod:
                continue
            # Relative imports carry leading dots; drop them before mapping
            # the dotted name onto a path.
            module_path = mod.lstrip(".").replace(".", "/")
            if not module_path:
                continue

            source_file = next(
                (f for f in self._file_records
                 if f != rel_path and resolves_to(f, module_path)),
                None,
            )
            if source_file is None:
                continue

            source_record = self._file_records.get(source_file, {})
            source_symbols = [s["name"] for s in source_record.get("sections", [])
                              if s.get("type") in ("class", "function")][:3]
            if not source_symbols:
                continue

            target_symbol = source_symbols[0]
            tasks.append(E2ETask(
                id=f"trace_{Path(rel_path).stem}_to_{Path(source_file).stem}",
                category="multi_file_trace",
                query=(f"Trace how `{rel_path}` uses `{target_symbol}` from `{source_file}`. "
                       f"What data flows from the source to the consumer? "
                       f"What transformations happen along the way?"),
                target_files=[rel_path, source_file],
                difficulty="hard",
                suggested_tools=_CATEGORY_TOOL_HINTS.get("multi_file_trace", []),
                multi_turn=True,
                ground_truth=self._base_ground_truth(
                    required_keywords=[target_symbol, Path(source_file).stem, Path(rel_path).stem],
                    expected_files=[rel_path, source_file],
                    expected_symbols=[target_symbol],
                    expected_answer_summary=(
                        f"{rel_path} imports {target_symbol} from {source_file} "
                        f"and uses it for data processing"
                    ),
                    verifiable_claims=[
                        (f"{rel_path} imports from {source_file}", True),
                        (f"{target_symbol} is used in {rel_path}", True),
                    ],
                    required_aspects=["import_chain", "data_flow", "transformations"],
                ),
            ))
            break  # One trace per file

    return tasks[:max_tasks]
|
|
542
|
+
|
|
543
|
+
def _large_file_needle_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Build 'find this function in a large file' tasks.

    Files are visited from the highest recorded line count down; files
    under 200 lines are ignored. In each remaining file one function or
    method is chosen at random among those that start in the bottom half
    and have a name of five or more characters.

    Args:
        max_tasks: Maximum number of tasks to return.

    Returns:
        Up to ``max_tasks`` "large_file_needle" tasks (difficulty "hard").
    """
    results = []
    # Largest files first, by recorded line count.
    ranked = sorted(
        self._file_records.items(),
        key=lambda item: item[1].get("line_count", 0),
        reverse=True,
    )

    for rel_path, record in ranked:
        if len(results) >= max_tasks:
            break
        line_count = record.get("line_count", 0)
        if line_count < 200:
            continue

        # Candidates live in the bottom half of the file (harder to find).
        midpoint = line_count // 2
        candidates = []
        for sec in record.get("sections", []):
            if sec.get("line_start", 0) <= midpoint:
                continue
            if sec.get("type") not in ("function", "method"):
                continue
            if len(sec.get("name", "")) < 5:
                continue
            candidates.append(sec)
        if not candidates:
            continue

        target = random.choice(candidates)
        target_name = target["name"]
        target_line = target.get("line_start", 0)
        docstring = target.get("docstring", "")
        params = target.get("params", "")

        claims = [(f"{target_name} starts around line {target_line}", True)]
        if docstring:
            claims.append((f"{target_name} has a docstring", True))
        if params:
            claims.append((f"{target_name} accepts parameters", True))

        truth = self._base_ground_truth(
            required_keywords=[target_name],
            expected_files=[rel_path],
            expected_symbols=[target_name],
            expected_answer_summary=(
                f"{target_name} at ~line {target_line} in {rel_path}: {docstring or 'no docstring'}"
            ),
            verifiable_claims=claims,
            required_aspects=["location", "purpose", "parameters"],
        )
        results.append(E2ETask(
            id=f"needle_{target_name}_in_{Path(rel_path).stem}",
            category="large_file_needle",
            query=(f"In `{rel_path}` ({line_count} lines), find the function `{target_name}`. "
                   f"What does it do, what are its parameters, and what line is it on?"),
            target_files=[rel_path],
            difficulty="hard",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("large_file_needle", []),
            multi_turn=True,
            ground_truth=truth,
        ))
    return results
|
|
604
|
+
|
|
605
|
+
def _refactor_suggestion_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Suggest refactoring for duplication — tests cross-file pattern detection.

    Groups the paths in ``self._file_records`` by parent directory and, for
    each directory that holds at least three files, emits one
    ``refactor_suggestion`` task over three randomly drawn files.

    Args:
        max_tasks: Maximum number of tasks to generate; iteration over the
            directories stops once this many tasks have been built.

    Returns:
        The generated tasks, at most one per directory.
    """
    tasks: list[E2ETask] = []

    # Bucket files by parent directory; files that live side by side are the
    # ones this task asks the model to compare for duplication.
    dir_groups: dict[str, list[str]] = {}
    for rel_path in self._file_records:
        dir_groups.setdefault(str(Path(rel_path).parent), []).append(rel_path)

    for parent_dir, files in dir_groups.items():
        if len(tasks) >= max_tasks:
            break
        if len(files) < 3:
            continue

        # The guard above guarantees that three files are available.
        sample = random.sample(files, 3)
        file_stems = [Path(f).stem for f in sample]
        # Flatten both separator styles so the directory can be used in an id.
        dir_slug = parent_dir.replace("/", "_").replace("\\", "_")
        quoted_files = ", ".join(f"`{f}`" for f in sample)

        tasks.append(E2ETask(
            id=f"refactor_{dir_slug}",
            category="refactor_suggestion",
            query=(f"Analyze these files in `{parent_dir}/` for code duplication and refactoring opportunities: "
                   f"{quoted_files}. "
                   f"What patterns are repeated? How would you reduce the duplication?"),
            target_files=sample,
            difficulty="expert",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("refactor_suggestion", []),
            multi_turn=True,
            ground_truth=self._base_ground_truth(
                required_keywords=file_stems,
                expected_files=sample,
                expected_answer_summary=f"Refactoring analysis of {', '.join(file_stems)}",
                required_aspects=["duplication_patterns", "refactoring_approach", "shared_abstractions"],
            ),
        ))
    return tasks
|
|
653
|
+
|
|
654
|
+
def _bug_injection_tasks(self, max_tasks: int) -> list[E2ETask]:
    """Detect issues in code — tests analytical ability with planted hints.

    Each path in ``self._file_records`` is read via ``self._read_file_content``
    and, when it has at least 500 characters, scanned with a fixed set of
    textual heuristics (bare/broad ``except``, TODO-style markers,
    ``.format()`` use, ``type(x) ==`` comparisons, silently swallowed
    exceptions). Files with at least one hit become candidates; a random
    selection of them is turned into ``bug_injection`` tasks whose ground
    truth is derived from the detected issues.

    Args:
        max_tasks: Maximum number of tasks to generate. Non-positive values
            yield an empty list without reading any file.

    Returns:
        Up to ``max_tasks`` tasks, one per selected file.
    """
    tasks: list[E2ETask] = []
    if max_tasks <= 0:
        # Nothing to build; also avoids random.sample() rejecting a negative k.
        return tasks

    # An ``except ...:`` header whose body is just ``pass``, either on the same
    # line or on the next one. Anchored to the start of a line so prose that
    # merely mentions "except ... pass" is not counted.
    silent_except = re.compile(
        r"^[^\S\n]*except\b[^\n]*:[^\S\n]*(?:#[^\n]*)?\n?"
        r"[^\S\n]*pass[^\S\n]*(?:#[^\n]*)?$",
        re.MULTILINE,
    )

    # Only the path and its detected issues are kept, so file contents can be
    # released as soon as each file has been scanned.
    candidates: list[tuple[str, list[str]]] = []
    for rel_path in self._file_records:
        content = self._read_file_content(rel_path)
        if not content or len(content) < 500:
            continue
        issues = []
        # Detect real patterns we can ask about
        if re.search(r"except\s*:", content):
            issues.append("bare_except")
        if re.search(r"except\s+Exception\s*:", content):
            issues.append("broad_except")
        if "# TODO" in content or "# FIXME" in content or "# HACK" in content:
            issues.append("todo_markers")
        # Only flag .format() when the top of the file shows no f-string use.
        if re.search(r"\.format\(", content) and "f\"" not in content[:2000]:
            issues.append("old_format_strings")
        if re.search(r"type\([\w]+\)\s*==", content):
            issues.append("type_comparison")
        if silent_except.search(content):
            issues.append("silent_exception")
        if issues:
            candidates.append((rel_path, issues))

    if not candidates:
        return []

    # Keyword synonym groups — each entry is a list so the evaluator matches
    # if ANY alternative is present. Avoids fragile single-word requirements
    # (e.g. "overly" which models rarely write verbatim).
    issue_keywords: dict[str, list[str]] = {
        "bare_except": ["bare except", "except:", "catching all", "bare clause"],
        "broad_except": ["except Exception", "broad exception", "broad except", "swallow", "silent failure"],
        "todo_markers": ["TODO", "FIXME", "HACK", "tech debt"],
        "old_format_strings": [".format(", "f-string", "f\"", "f'"],
        "type_comparison": ["type()", "isinstance", "type comparison"],
        "silent_exception": ["except: pass", "swallow", "silent", "silently"],
    }
    issue_desc: dict[str, str] = {
        "bare_except": "bare except clauses (catching all exceptions without specifying type)",
        "broad_except": "overly broad exception handling (catching base Exception)",
        "todo_markers": "unresolved TODO/FIXME/HACK comments",
        "old_format_strings": "use of .format() instead of f-strings",
        "type_comparison": "type comparison using type() == instead of isinstance()",
        "silent_exception": "silently swallowed exceptions (except: pass)",
    }

    for rel_path, issues in random.sample(candidates, min(max_tasks, len(candidates))):
        # Keywords and claims cover at most the first three detected issues;
        # the answer summary below still lists all of them.
        scored_issues = issues[:3]
        expected_keywords = [issue_keywords.get(i, [i]) for i in scored_issues]
        claims = [(f"{rel_path} has {issue_desc.get(i, i)}", True) for i in scored_issues]

        tasks.append(E2ETask(
            id=f"bugs_{Path(rel_path).stem}",
            category="bug_injection",
            query=(f"Analyze `{rel_path}` for code quality issues, anti-patterns, and potential bugs. "
                   f"Focus on error handling, exception management, and code hygiene. "
                   f"List each issue with the specific line number or function name where it occurs, "
                   f"and suggest how to fix it."),
            target_files=[rel_path],
            difficulty="medium",
            suggested_tools=_CATEGORY_TOOL_HINTS.get("bug_injection", []),
            ground_truth=self._base_ground_truth(
                required_keywords=expected_keywords,
                expected_files=[rel_path],
                expected_answer_summary=f"Issues in {rel_path}: {', '.join(issues)}",
                verifiable_claims=claims,
                required_aspects=["issues_found", "locations", "suggestions"],
                # Bug reports benefit most from factual accuracy and completeness.
                scoring_weights={
                    "keyword": 0.10,
                    "structural": 0.10,
                    "file_mention": 0.10,
                    "factual": 0.35,
                    "completeness": 0.35,
                },
            ),
        ))
    return tasks
|
|
735
|
+
|
|
736
|
+
|
|
737
|
+
def build_prompt(task: E2ETask) -> str:
    """Render the complete prompt text for ``task``.

    The task's query is substituted into ``TASK_PROMPT_TEMPLATE``. When the
    task carries suggested tools, a trailing paragraph naming them
    (comma-separated) is appended; otherwise the rendered template is
    returned unchanged.
    """
    rendered = TASK_PROMPT_TEMPLATE.format(query=task.query)
    hints = task.suggested_tools
    if not hints:
        return rendered
    return rendered + f"\n\nSuggested C3 tools for this task: {', '.join(hints)}"
|