code-context-control 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/_hook_utils.py +99 -0
- cli/c3.py +6152 -0
- cli/commands/__init__.py +1 -0
- cli/commands/common.py +312 -0
- cli/commands/parser.py +286 -0
- cli/docs.html +3178 -0
- cli/edits.html +878 -0
- cli/hook_auto_snapshot.py +142 -0
- cli/hook_c3_signal.py +61 -0
- cli/hook_c3read.py +116 -0
- cli/hook_edit_ledger.py +213 -0
- cli/hook_edit_unlock.py +170 -0
- cli/hook_filter.py +130 -0
- cli/hook_ghost_files.py +238 -0
- cli/hook_pretool_enforce.py +334 -0
- cli/hook_read.py +200 -0
- cli/hook_session_stats.py +62 -0
- cli/hook_terse_advisor.py +190 -0
- cli/hub.html +3764 -0
- cli/hub_server.py +1619 -0
- cli/mcp_proxy.py +428 -0
- cli/mcp_server.py +660 -0
- cli/server.py +2985 -0
- cli/tools/__init__.py +4 -0
- cli/tools/_helpers.py +65 -0
- cli/tools/agent.py +1165 -0
- cli/tools/compress.py +215 -0
- cli/tools/delegate.py +1184 -0
- cli/tools/edit.py +313 -0
- cli/tools/edits.py +118 -0
- cli/tools/filter.py +285 -0
- cli/tools/impact.py +163 -0
- cli/tools/memory.py +469 -0
- cli/tools/read.py +224 -0
- cli/tools/search.py +337 -0
- cli/tools/session.py +95 -0
- cli/tools/shell.py +193 -0
- cli/tools/status.py +306 -0
- cli/tools/validate.py +310 -0
- cli/ui/api.js +36 -0
- cli/ui/app.js +207 -0
- cli/ui/components/chat.js +758 -0
- cli/ui/components/dashboard.js +689 -0
- cli/ui/components/edits.js +220 -0
- cli/ui/components/instructions.js +481 -0
- cli/ui/components/memory.js +626 -0
- cli/ui/components/sessions.js +606 -0
- cli/ui/components/settings.js +1404 -0
- cli/ui/components/sidebar.js +156 -0
- cli/ui/icons.js +51 -0
- cli/ui/shared.js +119 -0
- cli/ui/theme.js +22 -0
- cli/ui.html +168 -0
- cli/ui_legacy.html +6797 -0
- cli/ui_nano.html +503 -0
- code_context_control-2.28.0.dist-info/METADATA +248 -0
- code_context_control-2.28.0.dist-info/RECORD +150 -0
- code_context_control-2.28.0.dist-info/WHEEL +5 -0
- code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
- code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
- code_context_control-2.28.0.dist-info/top_level.txt +5 -0
- core/__init__.py +75 -0
- core/config.py +269 -0
- core/ide.py +188 -0
- oracle/__init__.py +1 -0
- oracle/config.py +75 -0
- oracle/oracle.html +3900 -0
- oracle/oracle_server.py +663 -0
- oracle/services/__init__.py +1 -0
- oracle/services/c3_bridge.py +210 -0
- oracle/services/chat_engine.py +1103 -0
- oracle/services/chat_store.py +155 -0
- oracle/services/cross_memory.py +154 -0
- oracle/services/federated_graph.py +463 -0
- oracle/services/health_checker.py +117 -0
- oracle/services/insight_engine.py +307 -0
- oracle/services/memory_reader.py +106 -0
- oracle/services/memory_writer.py +182 -0
- oracle/services/ollama_bridge.py +332 -0
- oracle/services/project_scanner.py +87 -0
- oracle/services/review_agent.py +206 -0
- services/__init__.py +1 -0
- services/activity_log.py +93 -0
- services/agent_base.py +124 -0
- services/agents.py +1529 -0
- services/auto_memory.py +407 -0
- services/bench/__init__.py +6 -0
- services/bench/external/__init__.py +29 -0
- services/bench/external/aider_polyglot.py +405 -0
- services/bench/external/swe_bench.py +485 -0
- services/benchmark_dashboard.py +596 -0
- services/claude_md.py +785 -0
- services/compressor.py +592 -0
- services/context_snapshot.py +356 -0
- services/conversation_store.py +870 -0
- services/doc_index.py +537 -0
- services/e2e_benchmark.py +2884 -0
- services/e2e_evaluator.py +396 -0
- services/e2e_tasks.py +743 -0
- services/edit_ledger.py +459 -0
- services/embedding_index.py +341 -0
- services/error_reporting.py +123 -0
- services/file_memory.py +734 -0
- services/hub_service.py +585 -0
- services/indexer.py +712 -0
- services/memory.py +318 -0
- services/memory_consolidator.py +538 -0
- services/memory_graph.py +382 -0
- services/memory_grounder.py +304 -0
- services/memory_scorer.py +246 -0
- services/metrics.py +86 -0
- services/notifications.py +209 -0
- services/ollama_client.py +201 -0
- services/output_filter.py +488 -0
- services/parser.py +1238 -0
- services/project_manager.py +579 -0
- services/protocol.py +306 -0
- services/proxy_state.py +152 -0
- services/retrieval_broker.py +129 -0
- services/router.py +414 -0
- services/runtime.py +326 -0
- services/session_benchmark.py +1945 -0
- services/session_manager.py +1026 -0
- services/session_preloader.py +251 -0
- services/text_index.py +90 -0
- services/tool_classifier.py +176 -0
- services/transcript_index.py +340 -0
- services/validation_cache.py +155 -0
- services/vector_store.py +299 -0
- services/version_tracker.py +271 -0
- services/watcher.py +192 -0
- tui/__init__.py +0 -0
- tui/backend.py +59 -0
- tui/main.py +145 -0
- tui/screens/__init__.py +1 -0
- tui/screens/benchmark_view.py +109 -0
- tui/screens/claudemd_view.py +46 -0
- tui/screens/compress_view.py +52 -0
- tui/screens/index_view.py +74 -0
- tui/screens/init_view.py +82 -0
- tui/screens/mcp_view.py +73 -0
- tui/screens/optimize_view.py +41 -0
- tui/screens/pipe_view.py +46 -0
- tui/screens/projects_view.py +355 -0
- tui/screens/search_view.py +55 -0
- tui/screens/session_view.py +143 -0
- tui/screens/stats.py +158 -0
- tui/screens/ui_view.py +54 -0
- tui/theme.tcss +335 -0
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
"""
|
|
2
|
+
E2E Benchmark Evaluator — scores AI responses against ground truths.
|
|
3
|
+
|
|
4
|
+
Six scoring dimensions (five free, one optional):
|
|
5
|
+
1. Keyword matching (free) — required/forbidden keyword checks; supports list-of-alternatives per keyword
|
|
6
|
+
2. Structural analysis (free) — format quality (code blocks, references, etc.)
|
|
7
|
+
3. File/symbol mentions (free) — expected files and symbols referenced
|
|
8
|
+
4. Factual accuracy (free) — verify specific claims against ground truth
|
|
9
|
+
5. Completeness (free) — all required aspects addressed
|
|
10
|
+
6. AI-as-judge (optional) — external AI rates the response
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import re
|
|
18
|
+
import subprocess
|
|
19
|
+
import time
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
|
|
22
|
+
from services.e2e_tasks import GroundTruth
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class EvalScore:
    """Per-dimension scores for one evaluated response, plus the blended total."""

    # One score per heuristic tier; each defaults to 0.0 until it is filled in.
    keyword_score: float = 0.0
    structural_score: float = 0.0
    file_mention_score: float = 0.0
    factual_score: float = 0.0
    completeness_score: float = 0.0
    # Stays None unless a judge score is assigned.
    ai_judge_score: float | None = None
    # Weighted blend of the scores above.
    combined_score: float = 0.0
    # Free-form diagnostics; a fresh dict is created for every instance.
    details: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain dict with every score rounded to three decimals.

        ``ai_judge_score`` is passed through as ``None`` when unset and
        ``details`` is returned by reference (not copied).
        """
        payload: dict = {}
        for name in (
            "keyword_score",
            "structural_score",
            "file_mention_score",
            "factual_score",
            "completeness_score",
        ):
            payload[name] = round(getattr(self, name), 3)

        judge = self.ai_judge_score
        payload["ai_judge_score"] = None if judge is None else round(judge, 3)
        payload["combined_score"] = round(self.combined_score, 3)
        payload["details"] = self.details
        return payload
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class Evaluator:
    """Scores AI responses against ground truths using multiple dimensions.

    Five heuristic tiers (keywords, structure, file/symbol mentions, factual
    claims, completeness) always run.  A sixth AI-as-judge tier runs only
    when ``judge_cli`` is set.  ``score`` blends the tier scores using
    weights that are normalised to sum to 1.
    """

    # Weight used for a dimension the ground truth's scoring_weights omits.
    _DEFAULT_WEIGHTS = {
        "keyword": 0.15,
        "structural": 0.10,
        "file_mention": 0.15,
        "factual": 0.35,
        "completeness": 0.25,
    }
    # Fixed weight given to the judge tier when it produced a score.
    _JUDGE_WEIGHT = 0.20
    # Seconds to wait for the judge CLI before giving up.
    _JUDGE_TIMEOUT_S = 90
    # Only this many leading characters of the response are sent to the judge.
    _JUDGE_RESPONSE_CHARS = 3000
    # Innermost brace-delimited object mentioning "accuracy"; excluding "{"
    # keeps the match from starting at an earlier, unrelated brace.
    _JUDGE_JSON_RE = re.compile(r'\{[^{}]*"accuracy"[^{}]*\}')

    # Common words ignored when extracting key terms from a claim.
    _CLAIM_STOPWORDS = frozenset({
        "this", "that", "from", "with", "have", "does", "file",
        "uses", "calls", "also", "true", "false", "code", "class",
        "function", "value", "list", "test", "type", "name",
        "method", "return", "param", "object", "string", "line",
        "which", "when", "each", "been", "more", "into", "some",
    })

    # Aspect name -> regex alternatives; one hit marks the aspect addressed.
    _ASPECT_PATTERNS = {
        "purpose": [r"purpose", r"responsible for", r"handles", r"used for", r"designed to"],
        "methods": [r"method", r"def\s", r"function"],
        "usage": [r"used by", r"called from", r"import", r"usage"],
        "imports": [r"import", r"from\s+\w+", r"depend"],
        "parameters": [r"param", r"argument", r"takes\s", r"accepts"],
        "location": [r"line\s*\d", r"at\s+line", r"found at", r"located"],
        "directories": [r"\bdirector(?:y|ies)\b", r"\bfolder\b", r"\bmodule\b", r"\bpackage\b", r"(?:services|cli|core|tests)/"],
        "responsibilities": [r"responsible", r"handles", r"manages", r"provides"],
        "relationships": [r"depends on", r"imports", r"calls", r"uses", r"relates"],
        "files": [r"\.py", r"\.js", r"\.ts", r"file"],
        "call_sites": [r"called from", r"referenced in", r"used in", r"calls\s"],
        "reasons": [r"because", r"in order to", r"for\s", r"to\s"],
        "import_chain": [r"import", r"from\s+", r"chain"],
        "data_flow": [r"passes", r"returns", r"flow", r"transforms", r"converts"],
        "transformations": [r"transform", r"convert", r"process", r"modify", r"format"],
        "error_handling": [r"error", r"exception", r"try", r"except", r"catch", r"raise"],
        "organization": [r"organiz", r"structure", r"split", r"module", r"separate"],
        "file_length": [r"\d+\s*lines", r"long\s+file", r"large"],
        "bare_except": [r"bare\s+except", r"except\s*:", r"catching\s+all"],
        "long_functions": [r"long\s+function", r"too\s+long", r"refactor", r"split"],
        "issues_found": [r"issue", r"problem", r"bug", r"concern", r"warning", r"anti.?pattern"],
        "locations": [
            r"line\s*\d", r"at\s+line", r"in\s+function", r"in\s+`\w",
            r"\bL\d{2,}", r"L\d+[–\-]\d+", r"#\s*\d{2,}", r":\s*\d{2,}",
            r"\(\s*L?\d{2,}\)", r"→\s*L?\d",
        ],
        "suggestions": [r"suggest", r"recommend", r"consider", r"should", r"could", r"fix\b", r"replace"],
        "duplication_patterns": [r"duplicat", r"repeat", r"similar", r"common\s+pattern"],
        "refactoring_approach": [r"refactor", r"extract", r"abstract", r"consolidat"],
        "shared_abstractions": [r"base\s+class", r"mixin", r"shared", r"common", r"abstract"],
    }

    def __init__(self, judge_cli: str | None = None, judge_model: str | None = None):
        """Store the optional judge configuration.

        Args:
            judge_cli: Name of the CLI used as judge ("claude", "gemini" or
                "codex"); a falsy value disables the judge tier.
            judge_model: Optional model name forwarded to the judge CLI.
        """
        self.judge_cli = judge_cli
        self.judge_model = judge_model

    def score(self, response_text: str, ground_truth: GroundTruth) -> EvalScore:
        """Score a response against ground truth using all available tiers.

        Args:
            response_text: The raw response to evaluate.
            ground_truth: Expectations for the task; each tier reads the
                attributes it needs from it and ``scoring_weights`` drives
                the final blend.

        Returns:
            An ``EvalScore`` with one score per tier, the per-tier details
            and the weighted ``combined_score``.
        """
        result = EvalScore()
        response_lower = response_text.lower()

        # Tier 1: Keyword matching
        result.keyword_score, kw_details = self._keyword_score(response_lower, ground_truth)
        result.details["keywords"] = kw_details

        # Tier 2: Structural analysis (needs the original casing)
        result.structural_score, struct_details = self._structural_score(response_text)
        result.details["structural"] = struct_details

        # Tier 3: File/symbol mention scoring
        result.file_mention_score, file_details = self._file_mention_score(response_lower, ground_truth)
        result.details["file_mentions"] = file_details

        # Tier 4: Factual accuracy
        result.factual_score, fact_details = self._factual_score(response_lower, ground_truth)
        result.details["factual"] = fact_details

        # Tier 5: Completeness
        result.completeness_score, comp_details = self._completeness_score(response_lower, ground_truth)
        result.details["completeness"] = comp_details

        # Tier 6: AI-as-judge (optional)
        if self.judge_cli:
            result.ai_judge_score, judge_details = self._ai_judge_score(response_text, ground_truth)
            result.details["ai_judge"] = judge_details

        result.combined_score = self._combine_scores(result, ground_truth.scoring_weights)
        return result

    def _combine_scores(self, result, weights) -> float:
        """Blend the tier scores on ``result`` into one weighted average.

        The normaliser is the sum of the weights that are actually applied
        (explicit value or default per dimension, plus the judge weight when
        a judge score exists).  Normalising by ``sum(weights.values())``
        instead would mis-scale the result whenever ``weights`` omits a
        dimension or carries unrelated keys.

        Args:
            result: Object exposing the ``*_score`` attributes and
                ``ai_judge_score``.
            weights: Mapping of dimension name to weight; may be empty or
                ``None``, in which case the defaults apply.
        """
        weights = weights or {}
        parts = [
            (weights.get(dim, default), getattr(result, f"{dim}_score"))
            for dim, default in self._DEFAULT_WEIGHTS.items()
        ]
        if result.ai_judge_score is not None:
            parts.append((self._JUDGE_WEIGHT, result.ai_judge_score))

        # Guard against an all-zero weight set.
        total_w = sum(weight for weight, _ in parts) or 1.0
        return sum(weight / total_w * value for weight, value in parts)

    def _keyword_score(self, response_lower: str, truth: GroundTruth) -> tuple[float, dict]:
        """Check required keywords presence and forbidden keywords absence.

        Each element of required_keywords may be:
          - a str — direct substring match
          - a list/tuple — any alternative in it matches (synonym group);
            the first alternative is used as the label in the details, and
            an empty group counts as missed.

        Returns:
            ``(score, details)``.  The score is the fraction of required
            keywords found minus up to 0.5 for forbidden keywords present,
            floored at 0.0; it is 0.5 when nothing is configured.
        """
        required = truth.required_keywords
        forbidden = truth.forbidden_keywords

        found_required = []
        missed_required = []
        for kw in required:
            if isinstance(kw, (list, tuple)):
                # Synonym group: at least one alternative must appear.
                label = kw[0] if kw else ""
                matched = any(alt.lower() in response_lower for alt in kw)
            else:
                label = kw
                matched = kw.lower() in response_lower
            (found_required if matched else missed_required).append(label)

        found_forbidden = [kw for kw in forbidden if kw.lower() in response_lower]

        if not required and not forbidden:
            score = 0.5
        else:
            req_score = len(found_required) / len(required) if required else 1.0
            forbid_penalty = len(found_forbidden) / len(forbidden) * 0.5 if forbidden else 0.0
            score = max(0.0, req_score - forbid_penalty)

        return score, {
            "found_required": found_required,
            "missed_required": missed_required,
            "found_forbidden": found_forbidden,
        }

    def _structural_score(self, response: str) -> tuple[float, dict]:
        """Evaluate response structure quality.

        Adds fixed credit for each signal present: fenced code blocks (0.2),
        file-like references (0.2), line references (0.15), backticked
        identifiers (0.15), a word count in a preferred band (0.04-0.15) and
        list/heading markers (0.15).  The total is capped at 1.0.
        """
        score = 0.0
        details = {}

        code_blocks = re.findall(r"```[\s\S]*?```", response)
        details["code_blocks"] = len(code_blocks)
        if code_blocks:
            score += 0.2

        file_refs = re.findall(r"[a-zA-Z_][\w/\\]*\.\w{1,4}", response)
        details["file_references"] = len(file_refs)
        if file_refs:
            score += 0.2

        line_refs = re.findall(r"(?:line|L|:)\s*\d+", response, re.IGNORECASE)
        details["line_references"] = len(line_refs)
        if line_refs:
            score += 0.15

        backtick_refs = re.findall(r"`[a-zA-Z_]\w*`", response)
        details["symbol_references"] = len(backtick_refs)
        if backtick_refs:
            score += 0.15

        word_count = len(response.split())
        details["word_count"] = word_count
        if 50 <= word_count <= 600:
            score += 0.15
        elif 20 <= word_count < 50 or 600 < word_count <= 1000:
            score += 0.08
        elif word_count > 1000:
            score += 0.04  # Minimal credit for very long responses (comprehensive but verbose)

        has_structure = bool(re.search(r"^[\s]*[-*#\d]+[.)]?\s", response, re.MULTILINE))
        details["has_structure"] = has_structure
        if has_structure:
            score += 0.15

        return min(1.0, score), details

    def _file_mention_score(self, response_lower: str, truth: GroundTruth) -> tuple[float, dict]:
        """Check if expected files and symbols are mentioned.

        A file counts as found when its base name, or its full path with
        separators normalised to "/", occurs in the response.  A symbol
        counts when its lowercased name occurs.

        Returns:
            ``(score, details)``; the score is found / expected, or 0.5 when
            nothing is expected.
        """
        expected_files = truth.expected_files
        expected_symbols = truth.expected_symbols

        # Normalise the response once instead of once per expected file.
        normalized_response = response_lower.replace("\\", "/")

        found_files = []
        for f in expected_files:
            normalized_path = f.lower().replace("\\", "/")
            fname = normalized_path.split("/")[-1]
            # An empty base name (path ending in a separator) would match any
            # text, so only the full-path check applies in that case.
            if (fname and fname in response_lower) or normalized_path in normalized_response:
                found_files.append(f)

        found_symbols = [s for s in expected_symbols if s.lower() in response_lower]

        total_expected = len(expected_files) + len(expected_symbols)
        total_found = len(found_files) + len(found_symbols)

        if total_expected == 0:
            score = 0.5
        else:
            score = total_found / total_expected

        return score, {
            "expected_files": expected_files,
            "found_files": found_files,
            "expected_symbols": expected_symbols,
            "found_symbols": found_symbols,
        }

    def _factual_score(self, response_lower: str, truth: GroundTruth) -> tuple[float, dict]:
        """Verify specific claims against ground truth. Higher = more accurate.

        ``truth.verifiable_claims`` is iterated as ``(claim_text,
        expected_true)`` pairs.  A claim is treated as mentioned when at
        least half of its key terms (words of four or more characters that
        are not stop words) occur in the response.  True claims pass when
        mentioned, false claims pass when not mentioned.  Claims without any
        key term are skipped.

        Returns:
            ``(score, details)``; the score is passed / evaluated, or 0.5
            when there is nothing to evaluate.
        """
        claims = truth.verifiable_claims
        if not claims:
            return 0.5, {"skipped": True, "reason": "no verifiable claims defined"}

        verified = []
        failed = []
        for claim_text, expected_true in claims:
            claim_terms = [
                word
                for word in re.findall(r"[a-z_]\w{3,}", claim_text.lower())
                if word not in self._CLAIM_STOPWORDS
            ]
            if not claim_terms:
                continue

            found_terms = sum(1 for term in claim_terms if term in response_lower)
            mentioned = found_terms / len(claim_terms) >= 0.5

            # A true claim should be mentioned; a false claim should not be.
            if mentioned == bool(expected_true):
                verified.append(claim_text)
            else:
                failed.append(claim_text)

        total = len(verified) + len(failed)
        score = len(verified) / total if total else 0.5

        return score, {
            "verified_claims": verified,
            "failed_claims": failed,
            "total_claims": len(claims),
        }

    @staticmethod
    def _pattern_found(pattern: str, text: str) -> bool:
        """Return True when ``pattern`` occurs in ``text``, ignoring case.

        ``pattern`` is tried as a regex first; if it is not a valid regex
        (possible for free-form aspect names) it is matched as a literal
        substring instead of raising.
        """
        try:
            return re.search(pattern, text, re.IGNORECASE) is not None
        except re.error:
            return pattern.lower() in text.lower()

    def _completeness_score(self, response_lower: str, truth: GroundTruth) -> tuple[float, dict]:
        """Check if all required aspects of the question were addressed.

        Known aspect names map to a list of detection patterns; an unknown
        name is used as its own pattern.  Matching ignores case because the
        response arrives lowercased while some patterns contain an
        uppercase ``L``.

        Returns:
            ``(score, details)``; the score is addressed / required, or 0.5
            when no aspects are defined.
        """
        aspects = truth.required_aspects
        if not aspects:
            return 0.5, {"skipped": True, "reason": "no required aspects defined"}

        addressed = []
        missed = []
        for aspect in aspects:
            patterns = self._ASPECT_PATTERNS.get(aspect, [aspect])
            if any(self._pattern_found(p, response_lower) for p in patterns):
                addressed.append(aspect)
            else:
                missed.append(aspect)

        score = len(addressed) / len(aspects)

        return score, {
            "addressed_aspects": addressed,
            "missed_aspects": missed,
            "total_aspects": len(aspects),
        }

    @classmethod
    def _parse_judge_output(cls, output: str) -> tuple[float, dict] | None:
        """Extract the judge's ratings from its raw output.

        Args:
            output: Text printed by the judge CLI.

        Returns:
            ``(normalized, ratings)`` where ``ratings`` holds accuracy,
            relevance and quality clamped to the requested 1-5 scale
            (missing keys default to 3) and ``normalized`` maps their mean
            onto 0.0-1.0.  ``None`` when no rating object is present.

        Raises:
            ValueError, TypeError: If the object is not valid JSON or a
                rating is not numeric.
        """
        match = cls._JUDGE_JSON_RE.search(output)
        if match is None:
            return None

        scores = json.loads(match.group())
        ratings = {
            # Clamp so an out-of-range rating cannot push the score past [0, 1].
            name: min(5.0, max(1.0, float(scores.get(name, 3))))
            for name in ("accuracy", "relevance", "quality")
        }
        avg = sum(ratings.values()) / 3.0
        return (avg - 1) / 4.0, ratings

    def _ai_judge_score(self, response: str, truth: GroundTruth) -> tuple[float | None, dict]:
        """Use an AI CLI as a judge to score the response quality.

        Runs the configured judge CLI with a rating prompt and parses the
        JSON ratings from its stdout.  This tier is best-effort: any failure
        (unknown CLI, missing executable, timeout, unparsable output) yields
        ``(None, {"error": ...})`` instead of raising.

        Returns:
            ``(normalized_score, details)`` on success, with the three
            ratings, ``latency_ms`` and ``judge_cli`` in the details.
        """
        judge_prompt = (
            "You are an expert code reviewer judging an AI's response about a codebase.\n\n"
            f"EXPECTED ANSWER SUMMARY: {truth.expected_answer_summary}\n\n"
            f"EXPECTED FILES: {', '.join(truth.expected_files)}\n\n"
            f"EXPECTED SYMBOLS: {', '.join(truth.expected_symbols)}\n\n"
            f"AI RESPONSE TO JUDGE:\n{response[:self._JUDGE_RESPONSE_CHARS]}\n\n"
            "Rate the response on three dimensions (1-5 each):\n"
            "1. ACCURACY: Is the information correct and complete?\n"
            "2. RELEVANCE: Does it address the question with specific details?\n"
            "3. QUALITY: Is it well-organized and actionable?\n\n"
            "Reply with ONLY a JSON object: {\"accuracy\": N, \"relevance\": N, \"quality\": N}\n"
            "No other text."
        )

        # Clean env for nested CLI calls
        env = os.environ.copy()
        for block_var in ("CLAUDECODE", "CLAUDE_CODE", "GEMINI_CLI", "CODEX_CLI"):
            env.pop(block_var, None)

        try:
            cmd = self._build_judge_command(judge_prompt)
            t0 = time.perf_counter()
            completed = subprocess.run(
                cmd, capture_output=True, text=True, timeout=self._JUDGE_TIMEOUT_S, env=env,
                # The flag only exists on Windows builds of the module.
                creationflags=getattr(subprocess, "CREATE_NO_WINDOW", 0),
            )
            latency = (time.perf_counter() - t0) * 1000
            parsed = self._parse_judge_output(completed.stdout.strip())
        except Exception as e:  # best-effort boundary: a failing judge must not abort scoring
            return None, {"error": str(e)}

        if parsed is None:
            details = {"error": "Could not parse judge response"}
            if completed.returncode != 0:
                # Keep the failure cause instead of discarding it.
                details["returncode"] = completed.returncode
                details["stderr"] = (completed.stderr or "").strip()[:500]
            return None, details

        normalized, ratings = parsed
        return normalized, {
            "accuracy": ratings["accuracy"],
            "relevance": ratings["relevance"],
            "quality": ratings["quality"],
            "latency_ms": round(latency, 1),
            "judge_cli": self.judge_cli,
        }

    def _build_judge_command(self, prompt: str) -> list[str]:
        """Build the CLI command for the AI judge.

        Args:
            prompt: Full judge prompt, passed as a single argument.

        Returns:
            The argument list for ``subprocess.run``; the model flag is
            appended only when ``judge_model`` is set.

        Raises:
            ValueError: If ``judge_cli`` is not "claude", "gemini" or "codex".
        """
        if self.judge_cli == "claude":
            cmd = ["claude", "-p", prompt, "--output-format", "text"]
            model_flag = "--model"
        elif self.judge_cli == "gemini":
            cmd = ["gemini", "-p", prompt, "--output-format", "text"]
            model_flag = "-m"
        elif self.judge_cli == "codex":
            cmd = ["codex", "exec", prompt]
            model_flag = "--model"
        else:
            raise ValueError(f"Unknown judge CLI: {self.judge_cli}")

        if self.judge_model:
            cmd += [model_flag, self.judge_model]
        return cmd
|