code-context-control 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. cli/__init__.py +1 -0
  2. cli/_hook_utils.py +99 -0
  3. cli/c3.py +6152 -0
  4. cli/commands/__init__.py +1 -0
  5. cli/commands/common.py +312 -0
  6. cli/commands/parser.py +286 -0
  7. cli/docs.html +3178 -0
  8. cli/edits.html +878 -0
  9. cli/hook_auto_snapshot.py +142 -0
  10. cli/hook_c3_signal.py +61 -0
  11. cli/hook_c3read.py +116 -0
  12. cli/hook_edit_ledger.py +213 -0
  13. cli/hook_edit_unlock.py +170 -0
  14. cli/hook_filter.py +130 -0
  15. cli/hook_ghost_files.py +238 -0
  16. cli/hook_pretool_enforce.py +334 -0
  17. cli/hook_read.py +200 -0
  18. cli/hook_session_stats.py +62 -0
  19. cli/hook_terse_advisor.py +190 -0
  20. cli/hub.html +3764 -0
  21. cli/hub_server.py +1619 -0
  22. cli/mcp_proxy.py +428 -0
  23. cli/mcp_server.py +660 -0
  24. cli/server.py +2985 -0
  25. cli/tools/__init__.py +4 -0
  26. cli/tools/_helpers.py +65 -0
  27. cli/tools/agent.py +1165 -0
  28. cli/tools/compress.py +215 -0
  29. cli/tools/delegate.py +1184 -0
  30. cli/tools/edit.py +313 -0
  31. cli/tools/edits.py +118 -0
  32. cli/tools/filter.py +285 -0
  33. cli/tools/impact.py +163 -0
  34. cli/tools/memory.py +469 -0
  35. cli/tools/read.py +224 -0
  36. cli/tools/search.py +337 -0
  37. cli/tools/session.py +95 -0
  38. cli/tools/shell.py +193 -0
  39. cli/tools/status.py +306 -0
  40. cli/tools/validate.py +310 -0
  41. cli/ui/api.js +36 -0
  42. cli/ui/app.js +207 -0
  43. cli/ui/components/chat.js +758 -0
  44. cli/ui/components/dashboard.js +689 -0
  45. cli/ui/components/edits.js +220 -0
  46. cli/ui/components/instructions.js +481 -0
  47. cli/ui/components/memory.js +626 -0
  48. cli/ui/components/sessions.js +606 -0
  49. cli/ui/components/settings.js +1404 -0
  50. cli/ui/components/sidebar.js +156 -0
  51. cli/ui/icons.js +51 -0
  52. cli/ui/shared.js +119 -0
  53. cli/ui/theme.js +22 -0
  54. cli/ui.html +168 -0
  55. cli/ui_legacy.html +6797 -0
  56. cli/ui_nano.html +503 -0
  57. code_context_control-2.28.0.dist-info/METADATA +248 -0
  58. code_context_control-2.28.0.dist-info/RECORD +150 -0
  59. code_context_control-2.28.0.dist-info/WHEEL +5 -0
  60. code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
  61. code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
  62. code_context_control-2.28.0.dist-info/top_level.txt +5 -0
  63. core/__init__.py +75 -0
  64. core/config.py +269 -0
  65. core/ide.py +188 -0
  66. oracle/__init__.py +1 -0
  67. oracle/config.py +75 -0
  68. oracle/oracle.html +3900 -0
  69. oracle/oracle_server.py +663 -0
  70. oracle/services/__init__.py +1 -0
  71. oracle/services/c3_bridge.py +210 -0
  72. oracle/services/chat_engine.py +1103 -0
  73. oracle/services/chat_store.py +155 -0
  74. oracle/services/cross_memory.py +154 -0
  75. oracle/services/federated_graph.py +463 -0
  76. oracle/services/health_checker.py +117 -0
  77. oracle/services/insight_engine.py +307 -0
  78. oracle/services/memory_reader.py +106 -0
  79. oracle/services/memory_writer.py +182 -0
  80. oracle/services/ollama_bridge.py +332 -0
  81. oracle/services/project_scanner.py +87 -0
  82. oracle/services/review_agent.py +206 -0
  83. services/__init__.py +1 -0
  84. services/activity_log.py +93 -0
  85. services/agent_base.py +124 -0
  86. services/agents.py +1529 -0
  87. services/auto_memory.py +407 -0
  88. services/bench/__init__.py +6 -0
  89. services/bench/external/__init__.py +29 -0
  90. services/bench/external/aider_polyglot.py +405 -0
  91. services/bench/external/swe_bench.py +485 -0
  92. services/benchmark_dashboard.py +596 -0
  93. services/claude_md.py +785 -0
  94. services/compressor.py +592 -0
  95. services/context_snapshot.py +356 -0
  96. services/conversation_store.py +870 -0
  97. services/doc_index.py +537 -0
  98. services/e2e_benchmark.py +2884 -0
  99. services/e2e_evaluator.py +396 -0
  100. services/e2e_tasks.py +743 -0
  101. services/edit_ledger.py +459 -0
  102. services/embedding_index.py +341 -0
  103. services/error_reporting.py +123 -0
  104. services/file_memory.py +734 -0
  105. services/hub_service.py +585 -0
  106. services/indexer.py +712 -0
  107. services/memory.py +318 -0
  108. services/memory_consolidator.py +538 -0
  109. services/memory_graph.py +382 -0
  110. services/memory_grounder.py +304 -0
  111. services/memory_scorer.py +246 -0
  112. services/metrics.py +86 -0
  113. services/notifications.py +209 -0
  114. services/ollama_client.py +201 -0
  115. services/output_filter.py +488 -0
  116. services/parser.py +1238 -0
  117. services/project_manager.py +579 -0
  118. services/protocol.py +306 -0
  119. services/proxy_state.py +152 -0
  120. services/retrieval_broker.py +129 -0
  121. services/router.py +414 -0
  122. services/runtime.py +326 -0
  123. services/session_benchmark.py +1945 -0
  124. services/session_manager.py +1026 -0
  125. services/session_preloader.py +251 -0
  126. services/text_index.py +90 -0
  127. services/tool_classifier.py +176 -0
  128. services/transcript_index.py +340 -0
  129. services/validation_cache.py +155 -0
  130. services/vector_store.py +299 -0
  131. services/version_tracker.py +271 -0
  132. services/watcher.py +192 -0
  133. tui/__init__.py +0 -0
  134. tui/backend.py +59 -0
  135. tui/main.py +145 -0
  136. tui/screens/__init__.py +1 -0
  137. tui/screens/benchmark_view.py +109 -0
  138. tui/screens/claudemd_view.py +46 -0
  139. tui/screens/compress_view.py +52 -0
  140. tui/screens/index_view.py +74 -0
  141. tui/screens/init_view.py +82 -0
  142. tui/screens/mcp_view.py +73 -0
  143. tui/screens/optimize_view.py +41 -0
  144. tui/screens/pipe_view.py +46 -0
  145. tui/screens/projects_view.py +355 -0
  146. tui/screens/search_view.py +55 -0
  147. tui/screens/session_view.py +143 -0
  148. tui/screens/stats.py +158 -0
  149. tui/screens/ui_view.py +54 -0
  150. tui/theme.tcss +335 -0
@@ -0,0 +1,396 @@
1
+ """
2
+ E2E Benchmark Evaluator — scores AI responses against ground truths.
3
+
4
+ Six scoring dimensions (five free heuristics plus one optional AI judge):
5
+ 1. Keyword matching (free) — required/forbidden keyword checks; supports list-of-alternatives per keyword
6
+ 2. Structural analysis (free) — format quality (code blocks, references, etc.)
7
+ 3. File/symbol mentions (free) — expected files and symbols referenced
8
+ 4. Factual accuracy (free) — verify specific claims against ground truth
9
+ 5. Completeness (free) — all required aspects addressed
10
+ 6. AI-as-judge (optional) — external AI rates the response
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import os
17
+ import re
18
+ import subprocess
19
+ import time
20
+ from dataclasses import dataclass, field
21
+
22
+ from services.e2e_tasks import GroundTruth
23
+
24
+
25
+ @dataclass
26
+ class EvalScore:
27
+ """Detailed scoring breakdown for a single response."""
28
+ keyword_score: float = 0.0
29
+ structural_score: float = 0.0
30
+ file_mention_score: float = 0.0
31
+ factual_score: float = 0.0
32
+ completeness_score: float = 0.0
33
+ ai_judge_score: float | None = None
34
+ combined_score: float = 0.0
35
+ details: dict = field(default_factory=dict)
36
+
37
+ def to_dict(self) -> dict:
38
+ return {
39
+ "keyword_score": round(self.keyword_score, 3),
40
+ "structural_score": round(self.structural_score, 3),
41
+ "file_mention_score": round(self.file_mention_score, 3),
42
+ "factual_score": round(self.factual_score, 3),
43
+ "completeness_score": round(self.completeness_score, 3),
44
+ "ai_judge_score": round(self.ai_judge_score, 3) if self.ai_judge_score is not None else None,
45
+ "combined_score": round(self.combined_score, 3),
46
+ "details": self.details,
47
+ }
48
+
49
+
50
+ class Evaluator:
51
+ """Scores AI responses against ground truths using multiple dimensions."""
52
+
53
+ def __init__(self, judge_cli: str | None = None, judge_model: str | None = None):
54
+ self.judge_cli = judge_cli
55
+ self.judge_model = judge_model
56
+
57
+ def score(self, response_text: str, ground_truth: GroundTruth) -> EvalScore:
58
+ """Score a response against ground truth using all available tiers."""
59
+ result = EvalScore()
60
+ response_lower = response_text.lower()
61
+
62
+ # Tier 1: Keyword matching
63
+ result.keyword_score, kw_details = self._keyword_score(response_lower, ground_truth)
64
+ result.details["keywords"] = kw_details
65
+
66
+ # Tier 2: Structural analysis
67
+ result.structural_score, struct_details = self._structural_score(response_text)
68
+ result.details["structural"] = struct_details
69
+
70
+ # Tier 3: File/symbol mention scoring
71
+ result.file_mention_score, file_details = self._file_mention_score(response_lower, ground_truth)
72
+ result.details["file_mentions"] = file_details
73
+
74
+ # Tier 4: Factual accuracy
75
+ result.factual_score, fact_details = self._factual_score(response_lower, ground_truth)
76
+ result.details["factual"] = fact_details
77
+
78
+ # Tier 5: Completeness
79
+ result.completeness_score, comp_details = self._completeness_score(response_lower, ground_truth)
80
+ result.details["completeness"] = comp_details
81
+
82
+ # Tier 6: AI-as-judge (optional)
83
+ if self.judge_cli:
84
+ result.ai_judge_score, judge_details = self._ai_judge_score(response_text, ground_truth)
85
+ result.details["ai_judge"] = judge_details
86
+
87
+ # Combine scores
88
+ weights = ground_truth.scoring_weights
89
+ if result.ai_judge_score is not None:
90
+ # With judge: redistribute weights to include it
91
+ total_w = sum(weights.values()) + 0.20
92
+ result.combined_score = (
93
+ weights.get("keyword", 0.15) / total_w * result.keyword_score +
94
+ weights.get("structural", 0.10) / total_w * result.structural_score +
95
+ weights.get("file_mention", 0.15) / total_w * result.file_mention_score +
96
+ weights.get("factual", 0.35) / total_w * result.factual_score +
97
+ weights.get("completeness", 0.25) / total_w * result.completeness_score +
98
+ 0.20 / total_w * result.ai_judge_score
99
+ )
100
+ else:
101
+ total_w = sum(weights.values()) or 1.0
102
+ result.combined_score = (
103
+ weights.get("keyword", 0.15) / total_w * result.keyword_score +
104
+ weights.get("structural", 0.10) / total_w * result.structural_score +
105
+ weights.get("file_mention", 0.15) / total_w * result.file_mention_score +
106
+ weights.get("factual", 0.35) / total_w * result.factual_score +
107
+ weights.get("completeness", 0.25) / total_w * result.completeness_score
108
+ )
109
+
110
+ return result
111
+
112
+ def _keyword_score(self, response_lower: str, truth: GroundTruth) -> tuple[float, dict]:
113
+ """Check required keywords presence and forbidden keywords absence.
114
+
115
+ Each element of required_keywords may be:
116
+ - a str — direct substring match
117
+ - a list — any alternative in the list matches (synonym group)
118
+ """
119
+ required = truth.required_keywords
120
+ forbidden = truth.forbidden_keywords
121
+
122
+ found_required = []
123
+ missed_required = []
124
+ for kw in required:
125
+ if isinstance(kw, list):
126
+ # Synonym group: at least one alternative must appear
127
+ label = kw[0]
128
+ if any(alt.lower() in response_lower for alt in kw):
129
+ found_required.append(label)
130
+ else:
131
+ missed_required.append(label)
132
+ else:
133
+ if kw.lower() in response_lower:
134
+ found_required.append(kw)
135
+ else:
136
+ missed_required.append(kw)
137
+
138
+ found_forbidden = [kw for kw in forbidden if kw.lower() in response_lower]
139
+
140
+ if not required and not forbidden:
141
+ score = 0.5
142
+ else:
143
+ req_score = len(found_required) / len(required) if required else 1.0
144
+ forbid_penalty = len(found_forbidden) / len(forbidden) * 0.5 if forbidden else 0.0
145
+ score = max(0.0, req_score - forbid_penalty)
146
+
147
+ return score, {
148
+ "found_required": found_required,
149
+ "missed_required": missed_required,
150
+ "found_forbidden": found_forbidden,
151
+ }
152
+
153
+ def _structural_score(self, response: str) -> tuple[float, dict]:
154
+ """Evaluate response structure quality."""
155
+ score = 0.0
156
+ details = {}
157
+
158
+ code_blocks = re.findall(r"```[\s\S]*?```", response)
159
+ details["code_blocks"] = len(code_blocks)
160
+ if code_blocks:
161
+ score += 0.2
162
+
163
+ file_refs = re.findall(r"[a-zA-Z_][\w/\\]*\.\w{1,4}", response)
164
+ details["file_references"] = len(file_refs)
165
+ if file_refs:
166
+ score += 0.2
167
+
168
+ line_refs = re.findall(r"(?:line|L|:)\s*\d+", response, re.IGNORECASE)
169
+ details["line_references"] = len(line_refs)
170
+ if line_refs:
171
+ score += 0.15
172
+
173
+ backtick_refs = re.findall(r"`[a-zA-Z_]\w*`", response)
174
+ details["symbol_references"] = len(backtick_refs)
175
+ if backtick_refs:
176
+ score += 0.15
177
+
178
+ word_count = len(response.split())
179
+ details["word_count"] = word_count
180
+ if 50 <= word_count <= 600:
181
+ score += 0.15
182
+ elif 20 <= word_count < 50 or 600 < word_count <= 1000:
183
+ score += 0.08
184
+ elif word_count > 1000:
185
+ score += 0.04 # Minimal credit for very long responses (comprehensive but verbose)
186
+
187
+ has_structure = bool(re.search(r"^[\s]*[-*#\d]+[.)]?\s", response, re.MULTILINE))
188
+ details["has_structure"] = has_structure
189
+ if has_structure:
190
+ score += 0.15
191
+
192
+ return min(1.0, score), details
193
+
194
+ def _file_mention_score(self, response_lower: str, truth: GroundTruth) -> tuple[float, dict]:
195
+ """Check if expected files and symbols are mentioned."""
196
+ expected_files = truth.expected_files
197
+ expected_symbols = truth.expected_symbols
198
+
199
+ found_files = []
200
+ for f in expected_files:
201
+ fname = f.replace("\\", "/").split("/")[-1].lower()
202
+ if fname in response_lower or f.lower().replace("\\", "/") in response_lower.replace("\\", "/"):
203
+ found_files.append(f)
204
+
205
+ found_symbols = [s for s in expected_symbols if s.lower() in response_lower]
206
+
207
+ total_expected = len(expected_files) + len(expected_symbols)
208
+ total_found = len(found_files) + len(found_symbols)
209
+
210
+ if total_expected == 0:
211
+ score = 0.5
212
+ else:
213
+ score = total_found / total_expected
214
+
215
+ return score, {
216
+ "expected_files": expected_files,
217
+ "found_files": found_files,
218
+ "expected_symbols": expected_symbols,
219
+ "found_symbols": found_symbols,
220
+ }
221
+
222
+ def _factual_score(self, response_lower: str, truth: GroundTruth) -> tuple[float, dict]:
223
+ """Verify specific claims against ground truth. Higher = more accurate."""
224
+ claims = truth.verifiable_claims
225
+ if not claims:
226
+ return 0.5, {"skipped": True, "reason": "no verifiable claims defined"}
227
+
228
+ verified = []
229
+ failed = []
230
+ for claim_text, expected_true in claims:
231
+ # Check if the claim's key elements appear in the response
232
+ claim_lower = claim_text.lower()
233
+ # Extract key terms from the claim (words > 3 chars)
234
+ claim_terms = [w for w in re.findall(r"[a-z_]\w{3,}", claim_lower)
235
+ if w not in ("this", "that", "from", "with", "have", "does", "file",
236
+ "uses", "calls", "also", "true", "false", "code", "class",
237
+ "function", "value", "list", "test", "type", "name",
238
+ "method", "return", "param", "object", "string", "line",
239
+ "which", "when", "each", "been", "more", "into", "some")]
240
+ if not claim_terms:
241
+ continue
242
+
243
+ # A claim is "verified" if most of its key terms appear in the response
244
+ found_terms = sum(1 for t in claim_terms if t in response_lower)
245
+ match_ratio = found_terms / len(claim_terms)
246
+
247
+ if expected_true:
248
+ # True claim: good if mentioned
249
+ if match_ratio >= 0.5:
250
+ verified.append(claim_text)
251
+ else:
252
+ failed.append(claim_text)
253
+ else:
254
+ # False claim: good if NOT mentioned
255
+ if match_ratio < 0.5:
256
+ verified.append(claim_text)
257
+ else:
258
+ failed.append(claim_text)
259
+
260
+ total = len(verified) + len(failed)
261
+ score = len(verified) / total if total else 0.5
262
+
263
+ return score, {
264
+ "verified_claims": verified,
265
+ "failed_claims": failed,
266
+ "total_claims": len(claims),
267
+ }
268
+
269
+ def _completeness_score(self, response_lower: str, truth: GroundTruth) -> tuple[float, dict]:
270
+ """Check if all required aspects of the question were addressed."""
271
+ aspects = truth.required_aspects
272
+ if not aspects:
273
+ return 0.5, {"skipped": True, "reason": "no required aspects defined"}
274
+
275
+ # Map aspect names to detection patterns
276
+ aspect_patterns = {
277
+ "purpose": [r"purpose", r"responsible for", r"handles", r"used for", r"designed to"],
278
+ "methods": [r"method", r"def\s", r"function"],
279
+ "usage": [r"used by", r"called from", r"import", r"usage"],
280
+ "imports": [r"import", r"from\s+\w+", r"depend"],
281
+ "parameters": [r"param", r"argument", r"takes\s", r"accepts"],
282
+ "location": [r"line\s*\d", r"at\s+line", r"found at", r"located"],
283
+ "directories": [r"\bdirector(?:y|ies)\b", r"\bfolder\b", r"\bmodule\b", r"\bpackage\b", r"(?:services|cli|core|tests)/"],
284
+ "responsibilities": [r"responsible", r"handles", r"manages", r"provides"],
285
+ "relationships": [r"depends on", r"imports", r"calls", r"uses", r"relates"],
286
+ "files": [r"\.py", r"\.js", r"\.ts", r"file"],
287
+ "call_sites": [r"called from", r"referenced in", r"used in", r"calls\s"],
288
+ "reasons": [r"because", r"in order to", r"for\s", r"to\s"],
289
+ "import_chain": [r"import", r"from\s+", r"chain"],
290
+ "data_flow": [r"passes", r"returns", r"flow", r"transforms", r"converts"],
291
+ "transformations": [r"transform", r"convert", r"process", r"modify", r"format"],
292
+ "error_handling": [r"error", r"exception", r"try", r"except", r"catch", r"raise"],
293
+ "organization": [r"organiz", r"structure", r"split", r"module", r"separate"],
294
+ "file_length": [r"\d+\s*lines", r"long\s+file", r"large"],
295
+ "bare_except": [r"bare\s+except", r"except\s*:", r"catching\s+all"],
296
+ "long_functions": [r"long\s+function", r"too\s+long", r"refactor", r"split"],
297
+ "issues_found": [r"issue", r"problem", r"bug", r"concern", r"warning", r"anti.?pattern"],
298
+ "locations": [
299
+ r"line\s*\d", r"at\s+line", r"in\s+function", r"in\s+`\w",
300
+ r"\bL\d{2,}", r"L\d+[–\-]\d+", r"#\s*\d{2,}", r":\s*\d{2,}",
301
+ r"\(\s*L?\d{2,}\)", r"→\s*L?\d",
302
+ ],
303
+ "suggestions": [r"suggest", r"recommend", r"consider", r"should", r"could", r"fix\b", r"replace"],
304
+ "duplication_patterns": [r"duplicat", r"repeat", r"similar", r"common\s+pattern"],
305
+ "refactoring_approach": [r"refactor", r"extract", r"abstract", r"consolidat"],
306
+ "shared_abstractions": [r"base\s+class", r"mixin", r"shared", r"common", r"abstract"],
307
+ }
308
+
309
+ addressed = []
310
+ missed = []
311
+ for aspect in aspects:
312
+ patterns = aspect_patterns.get(aspect, [aspect])
313
+ found = any(re.search(p, response_lower) for p in patterns)
314
+ if found:
315
+ addressed.append(aspect)
316
+ else:
317
+ missed.append(aspect)
318
+
319
+ score = len(addressed) / len(aspects) if aspects else 0.5
320
+
321
+ return score, {
322
+ "addressed_aspects": addressed,
323
+ "missed_aspects": missed,
324
+ "total_aspects": len(aspects),
325
+ }
326
+
327
+ def _ai_judge_score(self, response: str, truth: GroundTruth) -> tuple[float, dict]:
328
+ """Use an AI CLI as a judge to score the response quality."""
329
+ judge_prompt = (
330
+ "You are an expert code reviewer judging an AI's response about a codebase.\n\n"
331
+ f"EXPECTED ANSWER SUMMARY: {truth.expected_answer_summary}\n\n"
332
+ f"EXPECTED FILES: {', '.join(truth.expected_files)}\n\n"
333
+ f"EXPECTED SYMBOLS: {', '.join(truth.expected_symbols)}\n\n"
334
+ f"AI RESPONSE TO JUDGE:\n{response[:3000]}\n\n"
335
+ "Rate the response on three dimensions (1-5 each):\n"
336
+ "1. ACCURACY: Is the information correct and complete?\n"
337
+ "2. RELEVANCE: Does it address the question with specific details?\n"
338
+ "3. QUALITY: Is it well-organized and actionable?\n\n"
339
+ "Reply with ONLY a JSON object: {\"accuracy\": N, \"relevance\": N, \"quality\": N}\n"
340
+ "No other text."
341
+ )
342
+
343
+ # Clean env for nested CLI calls
344
+ env = os.environ.copy()
345
+ for block_var in ("CLAUDECODE", "CLAUDE_CODE", "GEMINI_CLI", "CODEX_CLI"):
346
+ env.pop(block_var, None)
347
+
348
+ try:
349
+ cmd = self._build_judge_command(judge_prompt)
350
+ t0 = time.perf_counter()
351
+ result = subprocess.run(
352
+ cmd, capture_output=True, text=True, timeout=90, env=env,
353
+ creationflags=subprocess.CREATE_NO_WINDOW if hasattr(subprocess, "CREATE_NO_WINDOW") else 0,
354
+ )
355
+ latency = (time.perf_counter() - t0) * 1000
356
+
357
+ output = result.stdout.strip()
358
+ json_match = re.search(r"\{[^}]*\"accuracy\"[^}]*\}", output, re.DOTALL)
359
+ if json_match:
360
+ scores = json.loads(json_match.group())
361
+ accuracy = float(scores.get("accuracy", 3))
362
+ relevance = float(scores.get("relevance", 3))
363
+ quality = float(scores.get("quality", 3))
364
+ avg = (accuracy + relevance + quality) / 3.0
365
+ normalized = (avg - 1) / 4.0
366
+ return normalized, {
367
+ "accuracy": accuracy,
368
+ "relevance": relevance,
369
+ "quality": quality,
370
+ "latency_ms": round(latency, 1),
371
+ "judge_cli": self.judge_cli,
372
+ }
373
+ except Exception as e:
374
+ return None, {"error": str(e)}
375
+
376
+ return None, {"error": "Could not parse judge response"}
377
+
378
+ def _build_judge_command(self, prompt: str) -> list[str]:
379
+ """Build the CLI command for the AI judge."""
380
+ if self.judge_cli == "claude":
381
+ cmd = ["claude", "-p", prompt, "--output-format", "text"]
382
+ if self.judge_model:
383
+ cmd += ["--model", self.judge_model]
384
+ return cmd
385
+ elif self.judge_cli == "gemini":
386
+ cmd = ["gemini", "-p", prompt, "--output-format", "text"]
387
+ if self.judge_model:
388
+ cmd += ["-m", self.judge_model]
389
+ return cmd
390
+ elif self.judge_cli == "codex":
391
+ cmd = ["codex", "exec", prompt]
392
+ if self.judge_model:
393
+ cmd += ["--model", self.judge_model]
394
+ return cmd
395
+ else:
396
+ raise ValueError(f"Unknown judge CLI: {self.judge_cli}")