code-context-control 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/_hook_utils.py +99 -0
- cli/c3.py +6152 -0
- cli/commands/__init__.py +1 -0
- cli/commands/common.py +312 -0
- cli/commands/parser.py +286 -0
- cli/docs.html +3178 -0
- cli/edits.html +878 -0
- cli/hook_auto_snapshot.py +142 -0
- cli/hook_c3_signal.py +61 -0
- cli/hook_c3read.py +116 -0
- cli/hook_edit_ledger.py +213 -0
- cli/hook_edit_unlock.py +170 -0
- cli/hook_filter.py +130 -0
- cli/hook_ghost_files.py +238 -0
- cli/hook_pretool_enforce.py +334 -0
- cli/hook_read.py +200 -0
- cli/hook_session_stats.py +62 -0
- cli/hook_terse_advisor.py +190 -0
- cli/hub.html +3764 -0
- cli/hub_server.py +1619 -0
- cli/mcp_proxy.py +428 -0
- cli/mcp_server.py +660 -0
- cli/server.py +2985 -0
- cli/tools/__init__.py +4 -0
- cli/tools/_helpers.py +65 -0
- cli/tools/agent.py +1165 -0
- cli/tools/compress.py +215 -0
- cli/tools/delegate.py +1184 -0
- cli/tools/edit.py +313 -0
- cli/tools/edits.py +118 -0
- cli/tools/filter.py +285 -0
- cli/tools/impact.py +163 -0
- cli/tools/memory.py +469 -0
- cli/tools/read.py +224 -0
- cli/tools/search.py +337 -0
- cli/tools/session.py +95 -0
- cli/tools/shell.py +193 -0
- cli/tools/status.py +306 -0
- cli/tools/validate.py +310 -0
- cli/ui/api.js +36 -0
- cli/ui/app.js +207 -0
- cli/ui/components/chat.js +758 -0
- cli/ui/components/dashboard.js +689 -0
- cli/ui/components/edits.js +220 -0
- cli/ui/components/instructions.js +481 -0
- cli/ui/components/memory.js +626 -0
- cli/ui/components/sessions.js +606 -0
- cli/ui/components/settings.js +1404 -0
- cli/ui/components/sidebar.js +156 -0
- cli/ui/icons.js +51 -0
- cli/ui/shared.js +119 -0
- cli/ui/theme.js +22 -0
- cli/ui.html +168 -0
- cli/ui_legacy.html +6797 -0
- cli/ui_nano.html +503 -0
- code_context_control-2.28.0.dist-info/METADATA +248 -0
- code_context_control-2.28.0.dist-info/RECORD +150 -0
- code_context_control-2.28.0.dist-info/WHEEL +5 -0
- code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
- code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
- code_context_control-2.28.0.dist-info/top_level.txt +5 -0
- core/__init__.py +75 -0
- core/config.py +269 -0
- core/ide.py +188 -0
- oracle/__init__.py +1 -0
- oracle/config.py +75 -0
- oracle/oracle.html +3900 -0
- oracle/oracle_server.py +663 -0
- oracle/services/__init__.py +1 -0
- oracle/services/c3_bridge.py +210 -0
- oracle/services/chat_engine.py +1103 -0
- oracle/services/chat_store.py +155 -0
- oracle/services/cross_memory.py +154 -0
- oracle/services/federated_graph.py +463 -0
- oracle/services/health_checker.py +117 -0
- oracle/services/insight_engine.py +307 -0
- oracle/services/memory_reader.py +106 -0
- oracle/services/memory_writer.py +182 -0
- oracle/services/ollama_bridge.py +332 -0
- oracle/services/project_scanner.py +87 -0
- oracle/services/review_agent.py +206 -0
- services/__init__.py +1 -0
- services/activity_log.py +93 -0
- services/agent_base.py +124 -0
- services/agents.py +1529 -0
- services/auto_memory.py +407 -0
- services/bench/__init__.py +6 -0
- services/bench/external/__init__.py +29 -0
- services/bench/external/aider_polyglot.py +405 -0
- services/bench/external/swe_bench.py +485 -0
- services/benchmark_dashboard.py +596 -0
- services/claude_md.py +785 -0
- services/compressor.py +592 -0
- services/context_snapshot.py +356 -0
- services/conversation_store.py +870 -0
- services/doc_index.py +537 -0
- services/e2e_benchmark.py +2884 -0
- services/e2e_evaluator.py +396 -0
- services/e2e_tasks.py +743 -0
- services/edit_ledger.py +459 -0
- services/embedding_index.py +341 -0
- services/error_reporting.py +123 -0
- services/file_memory.py +734 -0
- services/hub_service.py +585 -0
- services/indexer.py +712 -0
- services/memory.py +318 -0
- services/memory_consolidator.py +538 -0
- services/memory_graph.py +382 -0
- services/memory_grounder.py +304 -0
- services/memory_scorer.py +246 -0
- services/metrics.py +86 -0
- services/notifications.py +209 -0
- services/ollama_client.py +201 -0
- services/output_filter.py +488 -0
- services/parser.py +1238 -0
- services/project_manager.py +579 -0
- services/protocol.py +306 -0
- services/proxy_state.py +152 -0
- services/retrieval_broker.py +129 -0
- services/router.py +414 -0
- services/runtime.py +326 -0
- services/session_benchmark.py +1945 -0
- services/session_manager.py +1026 -0
- services/session_preloader.py +251 -0
- services/text_index.py +90 -0
- services/tool_classifier.py +176 -0
- services/transcript_index.py +340 -0
- services/validation_cache.py +155 -0
- services/vector_store.py +299 -0
- services/version_tracker.py +271 -0
- services/watcher.py +192 -0
- tui/__init__.py +0 -0
- tui/backend.py +59 -0
- tui/main.py +145 -0
- tui/screens/__init__.py +1 -0
- tui/screens/benchmark_view.py +109 -0
- tui/screens/claudemd_view.py +46 -0
- tui/screens/compress_view.py +52 -0
- tui/screens/index_view.py +74 -0
- tui/screens/init_view.py +82 -0
- tui/screens/mcp_view.py +73 -0
- tui/screens/optimize_view.py +41 -0
- tui/screens/pipe_view.py +46 -0
- tui/screens/projects_view.py +355 -0
- tui/screens/search_view.py +55 -0
- tui/screens/session_view.py +143 -0
- tui/screens/stats.py +158 -0
- tui/screens/ui_view.py +54 -0
- tui/theme.tcss +335 -0
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
"""Aider Polyglot benchmark adapter.
|
|
2
|
+
|
|
3
|
+
Wraps the Aider CLI + polyglot-benchmark corpus (225 Exercism exercises in 6
|
|
4
|
+
languages) to measure how C3 MCP affects an agent's edit success rate.
|
|
5
|
+
|
|
6
|
+
Setup (one-time):
|
|
7
|
+
git clone https://github.com/Aider-AI/polyglot-benchmark /tmp/polyglot-benchmark
|
|
8
|
+
pip install aider-chat
|
|
9
|
+
|
|
10
|
+
Run:
|
|
11
|
+
c3 bench external --suite aider-polyglot --path /tmp/polyglot-benchmark \\
|
|
12
|
+
--languages python --max-exercises 5 --model gpt-4o-mini
|
|
13
|
+
|
|
14
|
+
What it measures:
|
|
15
|
+
For each selected exercise, runs Aider twice against the same starter code:
|
|
16
|
+
1. With C3 MCP server attached (c3_* tools available)
|
|
17
|
+
2. Without any MCP servers (pure Aider baseline)
|
|
18
|
+
After each run, executes the exercise's test command to record pass/fail,
|
|
19
|
+
runtime, and token usage. Aggregate metrics: pass rate delta (C3 minus
|
|
20
|
+
baseline), average tries-to-pass, token cost.
|
|
21
|
+
|
|
22
|
+
Limitations / honest caveats:
|
|
23
|
+
- Requires `aider` CLI and a cloned polyglot-benchmark repo.
|
|
24
|
+
- Each run burns real API tokens (cost scales linearly with exercises x 2).
|
|
25
|
+
- MCP support in Aider is still evolving; this adapter uses a .mcp.json
|
|
26
|
+
file in the exercise directory to enable C3 tools. If the installed Aider
|
|
27
|
+
build does not yet honor MCP, the "with_c3" run degrades to the equivalent of
|
|
28
|
+
"baseline" and the adapter records that case rather than silently passing.
|
|
29
|
+
- Test commands are language-specific and must match what polyglot-benchmark
|
|
30
|
+
expects — see LANGUAGE_TEST_COMMANDS below.
|
|
31
|
+
"""
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import json
|
|
35
|
+
import os
|
|
36
|
+
import shutil
|
|
37
|
+
import subprocess
|
|
38
|
+
import tempfile
|
|
39
|
+
import time
|
|
40
|
+
from dataclasses import asdict, dataclass, field
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from typing import Optional
|
|
43
|
+
|
|
44
|
+
# Polyglot-benchmark exercise structure:
|
|
45
|
+
# {repo}/{language}/exercises/practice/{exercise}/
|
|
46
|
+
# .meta/config.json -> files.solution = primary edit target(s)
|
|
47
|
+
# .docs/instructions.md -> prompt given to the agent
|
|
48
|
+
# <solution files> -> starter code (must edit these to pass)
|
|
49
|
+
# <test files> -> tests (agent must make pass)
|
|
50
|
+
|
|
51
|
+
# Per-language test command, expressed as an argv list for subprocess.run.
# The keys double as the set of supported languages: find_polyglot_repo()
# probes for "<key>/exercises/practice" under each candidate checkout, and
# AiderPolyglotBenchmark.run_all() skips any requested language that is not
# a key here. Each command is executed with the exercise workspace as the
# working directory; a zero exit status is recorded as a pass.
LANGUAGE_TEST_COMMANDS: dict[str, list[str]] = {
    "python": ["python", "-m", "pytest", "-q"],
    "javascript": ["npx", "jest", "--silent"],
    "go": ["go", "test", "./..."],
    "rust": ["cargo", "test", "--quiet"],
    "java": ["./gradlew", "test", "--quiet"],
    "cpp": ["make", "test"],
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class AiderPolyglotResult:
    """Outcome of one Aider run on one exercise in one mode."""

    # Identity of the run.
    exercise: str
    language: str
    mode: str  # "with_c3" | "baseline"

    # Outcome and measurements; the defaults describe a run that never
    # produced a test verdict.
    passed: bool = False
    tries: int = 0
    latency_s: float = 0.0
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    model: str = ""
    error: str = ""
    test_output_tail: str = ""  # last ~500 chars of test output for triage

    def to_dict(self) -> dict:
        """Return every field of this result as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class AiderPolyglotReport:
    """Aggregate of one benchmark invocation: run metadata plus every result.

    ``results`` holds one entry per (exercise, mode) pair; ``to_dict`` emits
    the JSON-ready form, including a computed ``scorecard`` that compares the
    "with_c3" runs against the "baseline" runs.
    """

    timestamp: str
    project_path: str
    suite: str = "aider-polyglot"
    tier: str = "external"
    model: str = ""
    languages: list[str] = field(default_factory=list)
    exercises_run: int = 0
    results: list[AiderPolyglotResult] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the report as a JSON-serializable dictionary."""
        return {
            "timestamp": self.timestamp,
            "project_path": self.project_path,
            "suite": self.suite,
            "tier": self.tier,
            "benchmark_type": "aider_polyglot",
            "model": self.model,
            "languages": self.languages,
            "exercises_run": self.exercises_run,
            "results": [r.to_dict() for r in self.results],
            "scorecard": self._scorecard(),
        }

    def _scorecard(self) -> dict:
        """Summarize pass rate, latency and cost per mode.

        Rates are percentages rounded to one decimal. Groups with no results
        report 0.0 instead of dividing by zero.
        """
        with_c3 = [r for r in self.results if r.mode == "with_c3"]
        base = [r for r in self.results if r.mode == "baseline"]

        def pct(hits, total):
            return round(100.0 * hits / total, 1) if total else 0.0

        def avg_latency(runs):
            return round(sum(r.latency_s for r in runs) / len(runs), 1) if runs else 0.0

        with_c3_rate = pct(sum(1 for r in with_c3 if r.passed), len(with_c3))
        base_rate = pct(sum(1 for r in base if r.passed), len(base))

        return {
            "with_c3_pass_rate": with_c3_rate,
            "baseline_pass_rate": base_rate,
            # Re-round after subtracting: the difference of two one-decimal
            # floats is not itself a clean one-decimal float
            # (66.7 - 33.3 == 33.400000000000006).
            "pass_rate_delta": round(with_c3_rate - base_rate, 1),
            "with_c3_avg_latency_s": avg_latency(with_c3),
            "baseline_avg_latency_s": avg_latency(base),
            "with_c3_total_cost_usd": round(sum(r.cost_usd for r in with_c3), 4),
            "baseline_total_cost_usd": round(sum(r.cost_usd for r in base), 4),
            "with_c3_count": len(with_c3),
            "baseline_count": len(base),
        }
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def detect_aider() -> Optional[str]:
    """Look up the `aider` executable on PATH; None when it is not found."""
    aider_path = shutil.which("aider")
    return aider_path
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def find_polyglot_repo(path: Optional[str] = None) -> Optional[Path]:
    """Locate a polyglot-benchmark checkout.

    Search order: the explicit ``path`` argument, the
    ``POLYGLOT_BENCHMARK_PATH`` environment variable, then a few common
    locations. A candidate is accepted when it is a directory containing
    ``<language>/exercises/practice/`` for at least one language listed in
    ``LANGUAGE_TEST_COMMANDS``.

    Args:
        path: Optional user-supplied checkout location; a leading ``~`` is
            expanded to the user's home directory.

    Returns:
        The resolved path of the first matching candidate, or None when no
        candidate qualifies.
    """
    candidates: list[Path] = []
    # User-supplied values may come from config files or process environments
    # where no shell ever expanded "~", so expand it here.
    if path:
        candidates.append(Path(path).expanduser())
    env = os.environ.get("POLYGLOT_BENCHMARK_PATH")
    if env:
        candidates.append(Path(env).expanduser())
    candidates += [
        Path.home() / "polyglot-benchmark",
        Path.home() / "src" / "polyglot-benchmark",
        Path("/tmp/polyglot-benchmark"),
        Path("/opt/polyglot-benchmark"),
        Path.cwd() / "polyglot-benchmark",
    ]

    for candidate in candidates:
        # Both the checkout and the practice folder must be directories; a
        # regular file with a matching name is not a usable checkout.
        if not candidate.is_dir():
            continue
        if any(
            (candidate / lang / "exercises" / "practice").is_dir()
            for lang in LANGUAGE_TEST_COMMANDS
        ):
            return candidate.resolve()
    return None
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _list_exercises(repo: Path, language: str, limit: int) -> list[Path]:
|
|
168
|
+
practice = repo / language / "exercises" / "practice"
|
|
169
|
+
if not practice.exists():
|
|
170
|
+
return []
|
|
171
|
+
dirs = sorted([d for d in practice.iterdir() if d.is_dir()])
|
|
172
|
+
return dirs[:limit]
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _read_exercise_meta(ex_dir: Path) -> dict:
|
|
176
|
+
meta_path = ex_dir / ".meta" / "config.json"
|
|
177
|
+
if not meta_path.exists():
|
|
178
|
+
return {}
|
|
179
|
+
try:
|
|
180
|
+
return json.loads(meta_path.read_text(encoding="utf-8"))
|
|
181
|
+
except Exception:
|
|
182
|
+
return {}
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _read_instructions(ex_dir: Path) -> str:
|
|
186
|
+
docs = ex_dir / ".docs" / "instructions.md"
|
|
187
|
+
append = ex_dir / ".docs" / "instructions.append.md"
|
|
188
|
+
parts = []
|
|
189
|
+
if docs.exists():
|
|
190
|
+
parts.append(docs.read_text(encoding="utf-8", errors="replace"))
|
|
191
|
+
if append.exists():
|
|
192
|
+
parts.append(append.read_text(encoding="utf-8", errors="replace"))
|
|
193
|
+
return "\n\n".join(parts) or f"Complete the exercise in {ex_dir.name}."
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _write_c3_mcp_config(workspace: Path) -> None:
|
|
197
|
+
"""Drop an .mcp.json into workspace so aider can load C3 tools.
|
|
198
|
+
|
|
199
|
+
This assumes the `c3` CLI is installed and runnable as an MCP server.
|
|
200
|
+
If the installed Aider build doesn't honor MCP yet, this file is simply
|
|
201
|
+
ignored (no crash, no fake success).
|
|
202
|
+
"""
|
|
203
|
+
config = {
|
|
204
|
+
"mcpServers": {
|
|
205
|
+
"c3": {
|
|
206
|
+
"command": "python",
|
|
207
|
+
"args": ["-m", "cli.mcp_server"],
|
|
208
|
+
"env": {"C3_PROJECT_PATH": str(workspace)},
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
(workspace / ".mcp.json").write_text(json.dumps(config, indent=2), encoding="utf-8")
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class AiderPolyglotBenchmark:
    """Run polyglot-benchmark exercises through Aider, with and without C3.

    Every selected exercise is executed twice ("baseline", then "with_c3"),
    each time in a fresh temporary copy of the exercise directory, and the
    exercise's test command decides pass/fail.
    """

    def __init__(
        self,
        repo_path: Path,
        project_path: Path,
        *,
        languages: list[str],
        max_exercises: int = 5,
        model: str = "gpt-4o-mini",
        timeout_per_exercise: int = 300,
        verbose: bool = False,
        test_timeout: int = 120,
    ):
        """Store the run configuration.

        Args:
            repo_path: Root of the polyglot-benchmark checkout.
            project_path: Project the report is attributed to.
            languages: Language directory names to run.
            max_exercises: Maximum number of exercises per language.
            model: Model name passed to aider via ``--model``.
            timeout_per_exercise: Seconds allowed for one aider invocation.
            verbose: Print per-exercise progress when True.
            test_timeout: Seconds allowed for one test-command invocation.
        """
        self.repo = repo_path
        self.project = project_path
        self.languages = languages
        self.max_exercises = max_exercises
        self.model = model
        self.timeout = timeout_per_exercise
        self.verbose = verbose
        self.test_timeout = test_timeout

    def run_all(self) -> AiderPolyglotReport:
        """Run every configured language and return the collected report.

        Languages without a test command, or without exercises in the
        checkout, are skipped with a printed notice.
        """
        timestamp = time.strftime("%Y-%m-%dT%H:%M:%S")
        report = AiderPolyglotReport(
            timestamp=timestamp,
            project_path=str(self.project),
            model=self.model,
            languages=list(self.languages),
        )

        for lang in self.languages:
            if lang not in LANGUAGE_TEST_COMMANDS:
                print(f" [skip] Unsupported language: {lang}")
                continue
            exercises = _list_exercises(self.repo, lang, self.max_exercises)
            if not exercises:
                print(f" [skip] No {lang} exercises under {self.repo}")
                continue

            for ex in exercises:
                report.exercises_run += 1
                if self.verbose:
                    print(f"\n [{lang}] {ex.name}")

                for mode in ("baseline", "with_c3"):
                    result = self._run_exercise(ex, lang, mode)
                    report.results.append(result)
                    if self.verbose:
                        status = "PASS" if result.passed else "FAIL"
                        print(
                            f" {mode:<9} {status} "
                            f"t={result.latency_s:.1f}s "
                            f"tok={result.input_tokens + result.output_tokens}"
                        )

        return report

    def _run_exercise(self, ex_dir: Path, language: str, mode: str) -> AiderPolyglotResult:
        """Run aider once on a copy of `ex_dir`, then run the tests.

        Args:
            ex_dir: Exercise directory inside the benchmark checkout.
            language: Key into ``LANGUAGE_TEST_COMMANDS``.
            mode: "with_c3" writes an .mcp.json into the workspace first;
                any other value runs plain aider.

        Returns:
            A result whose ``error`` is set when the run could not be
            completed (no solution files, aider missing or timed out, test
            runner missing or timed out). ``passed`` is True only when the
            test command exits with status 0.
        """
        result = AiderPolyglotResult(
            exercise=ex_dir.name, language=language, mode=mode, model=self.model, tries=1
        )

        meta = _read_exercise_meta(ex_dir)
        # Tolerate malformed metadata: a non-object document or a non-object
        # "files" entry is treated the same as having no solution files.
        files_section = meta.get("files") if isinstance(meta, dict) else None
        solution_files = (
            files_section.get("solution", []) if isinstance(files_section, dict) else []
        )
        if not solution_files:
            result.error = "no solution files in .meta/config.json"
            return result

        # Check for aider before staging anything, so a missing CLI does not
        # cost a full copy of the exercise for every run.
        aider = detect_aider()
        if not aider:
            result.error = "aider CLI not on PATH"
            return result

        instructions = _read_instructions(ex_dir)
        prompt = (
            f"{instructions}\n\n"
            f"Edit the solution file(s) so the existing tests pass. "
            f"Do not modify the test files."
        )
        cmd = [
            aider,
            "--model", self.model,
            "--yes-always",
            "--no-auto-commits",
            "--no-pretty",
            "--no-stream",
            "--message", prompt,
            *solution_files,
        ]

        with tempfile.TemporaryDirectory(prefix=f"c3-aider-{mode}-") as tmp:
            workspace = Path(tmp)
            # Copy full exercise dir (code + tests + docs)
            for child in ex_dir.iterdir():
                target = workspace / child.name
                if child.is_dir():
                    shutil.copytree(child, target)
                else:
                    shutil.copy2(child, target)

            if mode == "with_c3":
                _write_c3_mcp_config(workspace)

            t0 = time.monotonic()
            try:
                proc = subprocess.run(
                    cmd,
                    cwd=workspace,
                    capture_output=True,
                    text=True,
                    # Tool output is not guaranteed to be valid in the locale
                    # encoding; never let one bad byte abort the benchmark.
                    errors="replace",
                    timeout=self.timeout,
                )
            except subprocess.TimeoutExpired:
                result.latency_s = float(self.timeout)
                result.error = "aider timed out"
                return result
            except FileNotFoundError:
                result.error = "aider not invocable"
                return result
            result.latency_s = round(time.monotonic() - t0, 1)
            result.input_tokens, result.output_tokens, result.cost_usd = \
                _parse_aider_tokens_cost(proc.stdout + proc.stderr)

            test_cmd = LANGUAGE_TEST_COMMANDS[language]
            try:
                tp = subprocess.run(
                    test_cmd,
                    cwd=workspace,
                    capture_output=True,
                    text=True,
                    errors="replace",
                    timeout=self.test_timeout,
                )
            except subprocess.TimeoutExpired:
                result.error = "tests timed out"
            except FileNotFoundError:
                result.error = f"test runner missing: {' '.join(test_cmd)}"
            else:
                result.passed = tp.returncode == 0
                result.test_output_tail = (tp.stdout + tp.stderr)[-500:]

        return result
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _parse_aider_tokens_cost(output: str) -> tuple[int, int, float]:
|
|
359
|
+
"""Best-effort parse of aider's trailing 'Tokens: ... Cost: ...' line.
|
|
360
|
+
|
|
361
|
+
Aider prints something like:
|
|
362
|
+
Tokens: 2.3k sent, 450 received.
|
|
363
|
+
Cost: $0.0123 message, $0.0456 session.
|
|
364
|
+
We extract totals for the message. Fall back to zeros on parse failure.
|
|
365
|
+
"""
|
|
366
|
+
import re
|
|
367
|
+
|
|
368
|
+
inp = out = 0
|
|
369
|
+
cost = 0.0
|
|
370
|
+
for line in output.splitlines()[-20:]:
|
|
371
|
+
line = line.strip()
|
|
372
|
+
if line.startswith("Tokens:"):
|
|
373
|
+
m = re.search(r"([\d.]+)([kKmM]?)\s*sent", line)
|
|
374
|
+
if m:
|
|
375
|
+
inp = _to_int(m.group(1), m.group(2))
|
|
376
|
+
m = re.search(r"([\d.]+)([kKmM]?)\s*received", line)
|
|
377
|
+
if m:
|
|
378
|
+
out = _to_int(m.group(1), m.group(2))
|
|
379
|
+
elif line.startswith("Cost:"):
|
|
380
|
+
m = re.search(r"\$\s*([\d.]+)\s*message", line)
|
|
381
|
+
if m:
|
|
382
|
+
try:
|
|
383
|
+
cost = float(m.group(1))
|
|
384
|
+
except ValueError:
|
|
385
|
+
cost = 0.0
|
|
386
|
+
return inp, out, cost
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _to_int(val: str, suffix: str) -> int:
|
|
390
|
+
mult = {"k": 1_000, "K": 1_000, "m": 1_000_000, "M": 1_000_000}.get(suffix, 1)
|
|
391
|
+
try:
|
|
392
|
+
return int(float(val) * mult)
|
|
393
|
+
except ValueError:
|
|
394
|
+
return 0
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def save_report(project_path: Path, report: AiderPolyglotReport) -> Path:
    """Write the report as JSON under ``<project>/.c3/external_benchmark``.

    Two files with identical content are written: a timestamped file in
    ``runs/`` and ``latest.json`` next to that folder (overwritten on every
    call).

    Returns:
        Path of the timestamped run file.
    """
    bench_dir = project_path / ".c3" / "external_benchmark"
    runs_dir = bench_dir / "runs"
    runs_dir.mkdir(parents=True, exist_ok=True)

    # Serialize once so both files are guaranteed identical and the
    # scorecard is not recomputed for the second write.
    payload = json.dumps(report.to_dict(), indent=2)

    ts = time.strftime("%Y%m%d_%H%M%S")
    out = runs_dir / f"aider_polyglot_{ts}.json"
    out.write_text(payload, encoding="utf-8")
    (bench_dir / "latest.json").write_text(payload, encoding="utf-8")
    return out
|