code-context-control 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/_hook_utils.py +99 -0
- cli/c3.py +6152 -0
- cli/commands/__init__.py +1 -0
- cli/commands/common.py +312 -0
- cli/commands/parser.py +286 -0
- cli/docs.html +3178 -0
- cli/edits.html +878 -0
- cli/hook_auto_snapshot.py +142 -0
- cli/hook_c3_signal.py +61 -0
- cli/hook_c3read.py +116 -0
- cli/hook_edit_ledger.py +213 -0
- cli/hook_edit_unlock.py +170 -0
- cli/hook_filter.py +130 -0
- cli/hook_ghost_files.py +238 -0
- cli/hook_pretool_enforce.py +334 -0
- cli/hook_read.py +200 -0
- cli/hook_session_stats.py +62 -0
- cli/hook_terse_advisor.py +190 -0
- cli/hub.html +3764 -0
- cli/hub_server.py +1619 -0
- cli/mcp_proxy.py +428 -0
- cli/mcp_server.py +660 -0
- cli/server.py +2985 -0
- cli/tools/__init__.py +4 -0
- cli/tools/_helpers.py +65 -0
- cli/tools/agent.py +1165 -0
- cli/tools/compress.py +215 -0
- cli/tools/delegate.py +1184 -0
- cli/tools/edit.py +313 -0
- cli/tools/edits.py +118 -0
- cli/tools/filter.py +285 -0
- cli/tools/impact.py +163 -0
- cli/tools/memory.py +469 -0
- cli/tools/read.py +224 -0
- cli/tools/search.py +337 -0
- cli/tools/session.py +95 -0
- cli/tools/shell.py +193 -0
- cli/tools/status.py +306 -0
- cli/tools/validate.py +310 -0
- cli/ui/api.js +36 -0
- cli/ui/app.js +207 -0
- cli/ui/components/chat.js +758 -0
- cli/ui/components/dashboard.js +689 -0
- cli/ui/components/edits.js +220 -0
- cli/ui/components/instructions.js +481 -0
- cli/ui/components/memory.js +626 -0
- cli/ui/components/sessions.js +606 -0
- cli/ui/components/settings.js +1404 -0
- cli/ui/components/sidebar.js +156 -0
- cli/ui/icons.js +51 -0
- cli/ui/shared.js +119 -0
- cli/ui/theme.js +22 -0
- cli/ui.html +168 -0
- cli/ui_legacy.html +6797 -0
- cli/ui_nano.html +503 -0
- code_context_control-2.28.0.dist-info/METADATA +248 -0
- code_context_control-2.28.0.dist-info/RECORD +150 -0
- code_context_control-2.28.0.dist-info/WHEEL +5 -0
- code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
- code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
- code_context_control-2.28.0.dist-info/top_level.txt +5 -0
- core/__init__.py +75 -0
- core/config.py +269 -0
- core/ide.py +188 -0
- oracle/__init__.py +1 -0
- oracle/config.py +75 -0
- oracle/oracle.html +3900 -0
- oracle/oracle_server.py +663 -0
- oracle/services/__init__.py +1 -0
- oracle/services/c3_bridge.py +210 -0
- oracle/services/chat_engine.py +1103 -0
- oracle/services/chat_store.py +155 -0
- oracle/services/cross_memory.py +154 -0
- oracle/services/federated_graph.py +463 -0
- oracle/services/health_checker.py +117 -0
- oracle/services/insight_engine.py +307 -0
- oracle/services/memory_reader.py +106 -0
- oracle/services/memory_writer.py +182 -0
- oracle/services/ollama_bridge.py +332 -0
- oracle/services/project_scanner.py +87 -0
- oracle/services/review_agent.py +206 -0
- services/__init__.py +1 -0
- services/activity_log.py +93 -0
- services/agent_base.py +124 -0
- services/agents.py +1529 -0
- services/auto_memory.py +407 -0
- services/bench/__init__.py +6 -0
- services/bench/external/__init__.py +29 -0
- services/bench/external/aider_polyglot.py +405 -0
- services/bench/external/swe_bench.py +485 -0
- services/benchmark_dashboard.py +596 -0
- services/claude_md.py +785 -0
- services/compressor.py +592 -0
- services/context_snapshot.py +356 -0
- services/conversation_store.py +870 -0
- services/doc_index.py +537 -0
- services/e2e_benchmark.py +2884 -0
- services/e2e_evaluator.py +396 -0
- services/e2e_tasks.py +743 -0
- services/edit_ledger.py +459 -0
- services/embedding_index.py +341 -0
- services/error_reporting.py +123 -0
- services/file_memory.py +734 -0
- services/hub_service.py +585 -0
- services/indexer.py +712 -0
- services/memory.py +318 -0
- services/memory_consolidator.py +538 -0
- services/memory_graph.py +382 -0
- services/memory_grounder.py +304 -0
- services/memory_scorer.py +246 -0
- services/metrics.py +86 -0
- services/notifications.py +209 -0
- services/ollama_client.py +201 -0
- services/output_filter.py +488 -0
- services/parser.py +1238 -0
- services/project_manager.py +579 -0
- services/protocol.py +306 -0
- services/proxy_state.py +152 -0
- services/retrieval_broker.py +129 -0
- services/router.py +414 -0
- services/runtime.py +326 -0
- services/session_benchmark.py +1945 -0
- services/session_manager.py +1026 -0
- services/session_preloader.py +251 -0
- services/text_index.py +90 -0
- services/tool_classifier.py +176 -0
- services/transcript_index.py +340 -0
- services/validation_cache.py +155 -0
- services/vector_store.py +299 -0
- services/version_tracker.py +271 -0
- services/watcher.py +192 -0
- tui/__init__.py +0 -0
- tui/backend.py +59 -0
- tui/main.py +145 -0
- tui/screens/__init__.py +1 -0
- tui/screens/benchmark_view.py +109 -0
- tui/screens/claudemd_view.py +46 -0
- tui/screens/compress_view.py +52 -0
- tui/screens/index_view.py +74 -0
- tui/screens/init_view.py +82 -0
- tui/screens/mcp_view.py +73 -0
- tui/screens/optimize_view.py +41 -0
- tui/screens/pipe_view.py +46 -0
- tui/screens/projects_view.py +355 -0
- tui/screens/search_view.py +55 -0
- tui/screens/session_view.py +143 -0
- tui/screens/stats.py +158 -0
- tui/screens/ui_view.py +54 -0
- tui/theme.tcss +335 -0
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
"""SWE-bench Lite external benchmark adapter.
|
|
2
|
+
|
|
3
|
+
Wraps SWE-bench (https://www.swebench.com) — 300 real GitHub issues from 12
|
|
4
|
+
popular Python repos in the Lite subset. For each instance, an agent reads
|
|
5
|
+
the issue and produces a patch; the harness then runs the repo's tests in an
|
|
6
|
+
isolated Docker image to decide if the issue was resolved.
|
|
7
|
+
|
|
8
|
+
Setup (one-time):
|
|
9
|
+
# Dataset:
|
|
10
|
+
pip install datasets
|
|
11
|
+
# Evaluation (optional, requires Docker):
|
|
12
|
+
pip install swebench
|
|
13
|
+
|
|
14
|
+
# OR download the Lite dataset as a JSON file once:
|
|
15
|
+
python -c "from datasets import load_dataset; \\
|
|
16
|
+
ds = load_dataset('princeton-nlp/SWE-bench_Lite', split='test'); \\
|
|
17
|
+
ds.to_json('swe_bench_lite.jsonl')"
|
|
18
|
+
|
|
19
|
+
Run:
|
|
20
|
+
c3 bench external --suite swe-bench-lite --dataset swe_bench_lite.jsonl \\
|
|
21
|
+
--agent aider --model gpt-4o-mini --max-tasks 5
|
|
22
|
+
|
|
23
|
+
What it produces:
|
|
24
|
+
1. Predictions JSONL: .c3/external_benchmark/runs/swebench_<ts>_{with_c3,baseline}.jsonl
|
|
25
|
+
Each line: {"instance_id": "...", "model_patch": "diff --git ...",
|
|
26
|
+
"model_name_or_path": "c3+aider-gpt4o"}
|
|
27
|
+
Directly consumable by the official SWE-bench evaluation harness.
|
|
28
|
+
2. Summary JSON: .c3/external_benchmark/runs/swe_bench_lite_<ts>.json
|
|
29
|
+
Aggregated resolution rate, latency, cost (populated after evaluation).
|
|
30
|
+
3. Instructions to run the Docker-based evaluator if swebench is installed.
|
|
31
|
+
|
|
32
|
+
Honest caveats:
|
|
33
|
+
- Patch generation is reliable without Docker. Resolution evaluation REQUIRES
|
|
34
|
+
Docker (one image per instance) — absent, we record "unevaluated".
|
|
35
|
+
- Some repos require specific Python versions + deps that only install
|
|
36
|
+
cleanly inside their official instance image. Do not try to run tests
|
|
37
|
+
outside Docker.
|
|
38
|
+
- Real SWE-bench Lite runs are slow (many minutes per task). Start small
|
|
39
|
+
(--max-tasks 2–5) to iterate, then scale up.
|
|
40
|
+
"""
|
|
41
|
+
from __future__ import annotations
|
|
42
|
+
|
|
43
|
+
import json
|
|
44
|
+
import subprocess
|
|
45
|
+
import tempfile
|
|
46
|
+
import time
|
|
47
|
+
from dataclasses import asdict, dataclass, field
|
|
48
|
+
from pathlib import Path
|
|
49
|
+
from typing import Optional
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class SWEBenchTask:
    """One SWE-bench instance: the issue text plus the metadata around it."""

    instance_id: str
    repo: str
    base_commit: str
    problem_statement: str
    hints_text: str = ""
    test_patch: str = ""
    patch: str = ""  # gold patch (for reference only — do NOT feed to agent)
    fail_to_pass: list[str] = field(default_factory=list)
    pass_to_pass: list[str] = field(default_factory=list)
    version: str = ""

    @classmethod
    def from_dict(cls, d: dict) -> "SWEBenchTask":
        """Build a task from one dataset row.

        Missing keys fall back to empty strings / empty lists. The test-id
        lists are looked up under the upper-case key first
        (``FAIL_TO_PASS`` / ``PASS_TO_PASS``) and then the lower-case one,
        and may arrive either as real lists or as JSON-encoded strings.
        """

        def _parse_list(val):
            # Always hand back a list, whatever shape the row used.
            if isinstance(val, list):
                return val
            if not isinstance(val, str):
                return []
            try:
                parsed = json.loads(val)
            except ValueError:
                # Not JSON at all: treat the raw text as a single test id.
                return [val] if val else []
            if isinstance(parsed, list):
                return parsed
            if isinstance(parsed, str):
                # A JSON-encoded scalar string is a single test id.
                return [parsed] if parsed else []
            # Valid JSON but neither a list nor a string ("null", "123",
            # "{...}"): never leak a non-list into the list-typed fields.
            return [] if parsed is None else [val]

        return cls(
            instance_id=d.get("instance_id", ""),
            repo=d.get("repo", ""),
            base_commit=d.get("base_commit", ""),
            problem_statement=d.get("problem_statement", ""),
            hints_text=d.get("hints_text", ""),
            test_patch=d.get("test_patch", ""),
            patch=d.get("patch", ""),
            fail_to_pass=_parse_list(d.get("FAIL_TO_PASS", d.get("fail_to_pass", []))),
            pass_to_pass=_parse_list(d.get("PASS_TO_PASS", d.get("pass_to_pass", []))),
            version=str(d.get("version", "")),
        )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass
class SWEBenchResult:
    """What happened when one task was attempted in one mode."""

    instance_id: str
    repo: str
    mode: str  # either "with_c3" or "baseline"
    model_patch: str = ""
    patch_empty: bool = True
    patch_lines: int = 0
    latency_s: float = 0.0
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    resolved: Optional[bool] = None  # stays None while unevaluated (no Docker)
    error: str = ""

    def to_dict(self) -> dict:
        """Return all fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataclass
class SWEBenchReport:
    """Aggregate of one benchmark run: run metadata plus per-task results."""

    timestamp: str
    project_path: str
    agent: str
    model: str
    dataset: str
    tasks_run: int = 0
    evaluation_method: str = "none"  # "swebench-docker" | "none"
    results: list[SWEBenchResult] = field(default_factory=list)
    predictions_with_c3: str = ""
    predictions_baseline: str = ""

    def to_dict(self) -> dict:
        """Return the report as a JSON-serialisable dict, scorecard included."""
        return {
            "timestamp": self.timestamp,
            "project_path": self.project_path,
            "suite": "swe-bench-lite",
            "tier": "external",
            "benchmark_type": "swe_bench_lite",
            "agent": self.agent,
            "model": self.model,
            "dataset": self.dataset,
            "tasks_run": self.tasks_run,
            "evaluation_method": self.evaluation_method,
            "results": [r.to_dict() for r in self.results],
            "predictions_with_c3": self.predictions_with_c3,
            "predictions_baseline": self.predictions_baseline,
            "scorecard": self._scorecard(),
        }

    def _scorecard(self) -> dict:
        """Summarise the results per mode.

        Patch-rate, latency and cost figures are always computed. The
        pass-rate figures are ``None`` unless at least one result carries a
        resolution verdict (``resolved`` is not ``None``). Pass rates are
        taken over *all* results of a mode, so unevaluated results count as
        not resolved.
        """
        with_c3 = [r for r in self.results if r.mode == "with_c3"]
        base = [r for r in self.results if r.mode == "baseline"]

        def pct(hits, total):
            return round(100.0 * hits / total, 1) if total else 0.0

        def avg_latency(rows):
            return round(sum(r.latency_s for r in rows) / len(rows), 1) if rows else 0

        # Patch-generation metrics (always available)
        with_c3_patched = sum(1 for r in with_c3 if not r.patch_empty)
        base_patched = sum(1 for r in base if not r.patch_empty)

        # Resolution metrics (only if evaluated)
        evaluated = any(r.resolved is not None for r in self.results)
        if evaluated:
            with_c3_rate = pct(sum(1 for r in with_c3 if r.resolved is True), len(with_c3))
            base_rate = pct(sum(1 for r in base if r.resolved is True), len(base))
            # Re-round: subtracting two 1-decimal floats can yield values
            # such as 16.599999999999998 instead of 16.6.
            delta = round(with_c3_rate - base_rate, 1)
        else:
            with_c3_rate = base_rate = delta = None

        return {
            "evaluated": evaluated,
            "with_c3_patch_rate": pct(with_c3_patched, len(with_c3)),
            "baseline_patch_rate": pct(base_patched, len(base)),
            # Resolution delta — the headline metric (null if unevaluated)
            "with_c3_pass_rate": with_c3_rate,
            "baseline_pass_rate": base_rate,
            "pass_rate_delta": delta,
            "with_c3_avg_latency_s": avg_latency(with_c3),
            "baseline_avg_latency_s": avg_latency(base),
            "with_c3_total_cost_usd": round(sum(r.cost_usd for r in with_c3), 4),
            "baseline_total_cost_usd": round(sum(r.cost_usd for r in base), 4),
            "with_c3_count": len(with_c3),
            "baseline_count": len(base),
        }
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def load_tasks(dataset_path: str) -> list[SWEBenchTask]:
    """Load SWE-bench tasks from a JSON, JSONL, or HuggingFace dataset name.

    Accepted forms:
    - "path/to/swe_bench_lite.jsonl" — one JSON object per line
    - "path/to/tasks.json" — a JSON array
    - "princeton-nlp/SWE-bench_Lite" — HuggingFace dataset id (lazy import)

    Raises:
        ValueError: the file is empty, a JSONL line is not an object, or the
            content is not valid JSON (``json.JSONDecodeError``).
        RuntimeError: ``dataset_path`` is not a local file and the
            ``datasets`` package is not installed.
    """
    p = Path(dataset_path)
    # Only a regular file is read locally; a directory that merely shares
    # the name of a dataset id must not be opened as text.
    if p.is_file():
        # "utf-8-sig" reads plain UTF-8 too, and drops a BOM that would
        # otherwise defeat both the '[' check and the JSON parser.
        text = p.read_text(encoding="utf-8-sig").strip()
        # JSON array form: starts with '['
        if text.startswith("["):
            data = json.loads(text)
            if isinstance(data, list):
                return [SWEBenchTask.from_dict(r) for r in data]
            raise ValueError(f"Unrecognised dataset format: {dataset_path}")
        # Otherwise JSONL: one JSON object per line. Split on "\n" only:
        # str.splitlines() also breaks on U+2028, U+0085 and similar
        # characters, which may legally appear unescaped inside JSON strings.
        rows = []
        for line in text.split("\n"):
            line = line.strip()  # also removes the "\r" of CRLF files
            if not line:
                continue
            obj = json.loads(line)
            if not isinstance(obj, dict):
                raise ValueError(f"Expected object per JSONL line, got {type(obj).__name__}")
            rows.append(obj)
        if rows:
            return [SWEBenchTask.from_dict(r) for r in rows]
        raise ValueError(f"Empty dataset: {dataset_path}")

    # HuggingFace id (e.g. "princeton-nlp/SWE-bench_Lite")
    try:
        from datasets import load_dataset  # type: ignore
    except ImportError as e:
        raise RuntimeError(
            f"Dataset file not found at {dataset_path!r}, and `datasets` package "
            "is not installed. Install with `pip install datasets` or pass a "
            "local JSONL path."
        ) from e
    ds = load_dataset(dataset_path, split="test")
    return [SWEBenchTask.from_dict(r) for r in ds]
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _clone_and_checkout(task: SWEBenchTask, workspace: Path) -> Optional[str]:
|
|
224
|
+
"""Shallow-clone + checkout the base commit. Returns error string or None."""
|
|
225
|
+
url = f"https://github.com/{task.repo}.git"
|
|
226
|
+
try:
|
|
227
|
+
subprocess.run(
|
|
228
|
+
["git", "clone", "--quiet", url, str(workspace)],
|
|
229
|
+
check=True, capture_output=True, text=True, timeout=300,
|
|
230
|
+
)
|
|
231
|
+
subprocess.run(
|
|
232
|
+
["git", "-C", str(workspace), "fetch", "--quiet", "origin", task.base_commit],
|
|
233
|
+
check=False, capture_output=True, text=True, timeout=120,
|
|
234
|
+
)
|
|
235
|
+
subprocess.run(
|
|
236
|
+
["git", "-C", str(workspace), "checkout", "--quiet", task.base_commit],
|
|
237
|
+
check=True, capture_output=True, text=True, timeout=60,
|
|
238
|
+
)
|
|
239
|
+
except subprocess.CalledProcessError as e:
|
|
240
|
+
return f"git: {e.stderr.strip()[:200]}"
|
|
241
|
+
except subprocess.TimeoutExpired:
|
|
242
|
+
return "git clone/checkout timed out"
|
|
243
|
+
except FileNotFoundError:
|
|
244
|
+
return "git not on PATH"
|
|
245
|
+
return None
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _diff_workspace(workspace: Path) -> str:
|
|
249
|
+
"""Return the unified diff of workspace vs base commit (the patch)."""
|
|
250
|
+
try:
|
|
251
|
+
proc = subprocess.run(
|
|
252
|
+
["git", "-C", str(workspace), "diff", "HEAD"],
|
|
253
|
+
capture_output=True, text=True, timeout=30,
|
|
254
|
+
)
|
|
255
|
+
return proc.stdout
|
|
256
|
+
except Exception:
|
|
257
|
+
return ""
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _run_aider_on_task(
    task: SWEBenchTask, workspace: Path, model: str, timeout: int,
) -> tuple[float, int, int, float, str]:
    """Invoke aider against the task. Returns (latency_s, input_tok, output_tok, cost, error).

    The prompt is the issue text plus at most 2000 characters of hints, and
    is passed to aider with ``--message`` while running inside *workspace*.
    ``error`` is ``""`` when aider ran to completion; aider's exit status is
    not inspected. On timeout the latency reported is *timeout* itself.
    """
    # Imported lazily, as in the rest of this module's optional dependencies.
    # NOTE(review): detect_aider() is assumed to return the executable to run
    # or a falsy value — confirm against aider_polyglot.
    from services.bench.external.aider_polyglot import (
        _parse_aider_tokens_cost,
        detect_aider,
    )

    aider = detect_aider()
    if not aider:
        return (0.0, 0, 0, 0.0, "aider CLI not on PATH")

    prompt = (
        "Resolve this GitHub issue in this repository. "
        "Make minimal focused changes — do not modify tests.\n\n"
        f"=== Issue ===\n{task.problem_statement}\n"
    )
    if task.hints_text:
        prompt += f"\n=== Hints ===\n{task.hints_text[:2000]}\n"

    cmd = [
        aider,
        "--model", model,
        "--yes-always",
        "--no-auto-commits",
        "--no-pretty",
        "--no-stream",
        "--map-tokens", "4096",
        "--message", prompt,
    ]

    t0 = time.monotonic()
    try:
        proc = subprocess.run(
            cmd, cwd=workspace, capture_output=True,
            # Explicit decoding so a non-UTF-8 locale cannot crash the run.
            text=True, encoding="utf-8", errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return (float(timeout), 0, 0, 0.0, "aider timed out")
    except OSError as e:
        # E.g. "Argument list too long" for a very large issue text, or the
        # executable disappearing; report it instead of aborting the run.
        return (round(time.monotonic() - t0, 1), 0, 0, 0.0, f"aider failed to start: {e}")

    latency = round(time.monotonic() - t0, 1)
    inp, out, cost = _parse_aider_tokens_cost(proc.stdout + proc.stderr)
    return (latency, inp, out, cost, "")
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _write_c3_mcp_config(workspace: Path) -> None:
|
|
305
|
+
(workspace / ".mcp.json").write_text(
|
|
306
|
+
json.dumps({
|
|
307
|
+
"mcpServers": {
|
|
308
|
+
"c3": {
|
|
309
|
+
"command": "python",
|
|
310
|
+
"args": ["-m", "cli.mcp_server"],
|
|
311
|
+
"env": {"C3_PROJECT_PATH": str(workspace)},
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
}, indent=2),
|
|
315
|
+
encoding="utf-8",
|
|
316
|
+
)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
class SWEBenchAdapter:
    """Drives a benchmark run: every task is attempted in both modes."""

    def __init__(
        self,
        project_path: Path,
        tasks: list[SWEBenchTask],
        *,
        agent: str = "aider",
        model: str = "gpt-4o-mini",
        timeout_per_task: int = 600,
        verbose: bool = False,
    ):
        """Store the run configuration; no work happens until ``run_all``."""
        self.project_path = project_path
        self.tasks = tasks
        self.agent = agent
        self.model = model
        self.timeout = timeout_per_task
        self.verbose = verbose

    def run_all(self, dataset_label: str = "") -> SWEBenchReport:
        """Run each task as "baseline" then "with_c3" and write predictions files.

        One JSONL predictions file per mode is written under
        ``<project_path>/.c3/external_benchmark/runs``; their paths are
        recorded on the returned report.
        """
        report = SWEBenchReport(
            timestamp=time.strftime("%Y-%m-%dT%H:%M:%S"),
            project_path=str(self.project_path),
            agent=self.agent,
            model=self.model,
            dataset=dataset_label,
            tasks_run=len(self.tasks),
        )

        # Predictions are bucketed by mode; each bucket becomes one file.
        by_mode: dict[str, list[dict]] = {"with_c3": [], "baseline": []}

        for task in self.tasks:
            if self.verbose:
                print(f"\n [{task.repo}] {task.instance_id}")
            for mode in ("baseline", "with_c3"):
                outcome = self._run_one(task, mode)
                report.results.append(outcome)

                name_prefix = "c3+" if mode == "with_c3" else ""
                by_mode[mode].append({
                    "instance_id": task.instance_id,
                    "model_patch": outcome.model_patch,
                    "model_name_or_path": f"{name_prefix}{self.agent}-{self.model}",
                })

                if self.verbose:
                    status = "EMPTY" if outcome.patch_empty else "patched"
                    print(f" {mode:<9} {status} t={outcome.latency_s:.1f}s "
                          f"tok={outcome.input_tokens + outcome.output_tokens}")

        # Save predictions JSONL for both modes
        runs_dir = self.project_path / ".c3" / "external_benchmark" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        stamp = time.strftime("%Y%m%d_%H%M%S")
        c3_file = runs_dir / f"swebench_{stamp}_with_c3.jsonl"
        baseline_file = runs_dir / f"swebench_{stamp}_baseline.jsonl"
        c3_lines = [json.dumps(entry) for entry in by_mode["with_c3"]]
        baseline_lines = [json.dumps(entry) for entry in by_mode["baseline"]]
        c3_file.write_text("\n".join(c3_lines), encoding="utf-8")
        baseline_file.write_text("\n".join(baseline_lines), encoding="utf-8")
        report.predictions_with_c3 = str(c3_file)
        report.predictions_baseline = str(baseline_file)

        return report

    def _run_one(self, task: SWEBenchTask, mode: str) -> SWEBenchResult:
        """Attempt one task in one mode inside a throw-away clone.

        Any failure (clone, unsupported agent, agent error) is recorded in
        ``result.error`` and the result is returned without a patch.
        """
        result = SWEBenchResult(
            instance_id=task.instance_id, repo=task.repo, mode=mode,
        )
        with tempfile.TemporaryDirectory(prefix=f"c3-swe-{mode}-") as tmp:
            workspace = Path(tmp)

            clone_error = _clone_and_checkout(task, workspace)
            if clone_error:
                result.error = clone_error
                return result

            # The config file is written before the agent check, so it
            # exists even when the agent turns out to be unsupported.
            if mode == "with_c3":
                _write_c3_mcp_config(workspace)

            if self.agent != "aider":
                result.error = f"agent not supported yet: {self.agent}"
                return result

            (
                result.latency_s,
                result.input_tokens,
                result.output_tokens,
                result.cost_usd,
                agent_error,
            ) = _run_aider_on_task(task, workspace, self.model, self.timeout)
            if agent_error:
                result.error = agent_error
                return result

            patch = _diff_workspace(workspace)
            result.model_patch = patch
            result.patch_empty = not patch.strip()
            result.patch_lines = patch.count("\n")

        return result
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def evaluate_with_docker(
    predictions_path: Path,
    dataset_path: str,
    run_id: str = "c3-bench",
    max_workers: int = 1,
    timeout: int = 1800,
) -> Optional[dict]:
    """Run the official SWE-bench evaluation harness if swebench + Docker are available.

    Returns:
        - the parsed results JSON written by the harness, or
        - ``{"error": "..."}`` when launching/running the harness failed, or
        - ``None`` when swebench is not importable, ``docker version`` does
          not succeed, or no readable results file is found afterwards.
    """
    try:
        import swebench.harness.run_evaluation  # noqa: F401
    except ImportError:
        return None
    # Docker check
    try:
        subprocess.run(
            ["docker", "version"], check=True, capture_output=True, timeout=10,
        )
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired):
        return None

    import sys  # local: only needed to locate the running interpreter

    cmd = [
        # Use the interpreter that just imported swebench; a bare "python"
        # may be missing from PATH or belong to another environment.
        sys.executable or "python", "-m", "swebench.harness.run_evaluation",
        "--predictions_path", str(predictions_path),
        "--dataset_name", dataset_path,
        "--max_workers", str(max_workers),
        "--run_id", run_id,
        "--timeout", str(timeout),
    ]
    try:
        # Predictions are written as UTF-8 JSONL; count non-blank lines.
        n_predictions = sum(
            1
            for line in predictions_path.read_text(encoding="utf-8").splitlines()
            if line.strip()
        )
        # At least one per-instance budget: an empty file would otherwise
        # give timeout=0, which expires immediately.
        subprocess.run(cmd, check=True, timeout=timeout * max(1, n_predictions))
    except Exception as e:
        return {"error": str(e)}

    # Parse the harness-generated results report. The third pattern covers
    # the "<model_name>.<run_id>.json" name written by recent swebench
    # releases (NOTE(review): confirm against the installed version).
    cwd = Path.cwd()
    patterns = (
        f"*{run_id}*results.json",
        f"results-*{run_id}.json",
        f"*.{run_id}.json",
    )
    candidates: dict[Path, None] = {}
    for pattern in patterns:
        for path in cwd.glob(pattern):
            candidates.setdefault(path, None)  # de-duplicate, keep order

    def _mtime(path: Path) -> float:
        try:
            return path.stat().st_mtime
        except OSError:
            return 0.0

    # Newest first, so a stale report from an earlier run with the same
    # run_id does not shadow the one just produced.
    for c in sorted(candidates, key=_mtime, reverse=True):
        try:
            return json.loads(c.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            continue
    return None
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def apply_resolution_results(
    report: SWEBenchReport, eval_result: dict, mode: str,
) -> None:
    """Merge resolved/unresolved sets from an evaluation into the report results.

    Only results whose ``mode`` equals *mode* are touched. Ids are read from
    ``resolved_ids`` / ``unresolved_ids``, falling back to ``resolved`` /
    ``unresolved`` when the first key is missing or empty. Results whose id
    appears in neither set keep their current ``resolved`` value.

    A missing evaluation (``None``) or values that are not collections of
    ids (e.g. integer counts) are ignored rather than raising.
    """
    if not isinstance(eval_result, dict):
        # evaluate_with_docker() returns None when the harness is unusable.
        return

    def _ids(*keys):
        # First key that holds a non-empty collection of ids wins.
        for key in keys:
            value = eval_result.get(key)
            if not value or isinstance(value, (str, bytes)):
                continue
            try:
                return set(value)
            except TypeError:
                continue
        return set()

    resolved = _ids("resolved_ids", "resolved")
    unresolved = _ids("unresolved_ids", "unresolved")
    for r in report.results:
        if r.mode != mode:
            continue
        if r.instance_id in resolved:
            r.resolved = True
        elif r.instance_id in unresolved:
            r.resolved = False
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def save_report(project_path: Path, report: SWEBenchReport) -> Path:
    """Write the report as JSON and return the path of the timestamped copy.

    Two files are written under ``<project_path>/.c3/external_benchmark``:
    ``runs/swe_bench_lite_<timestamp>.json`` and ``latest.json``, which is
    overwritten on every call.
    """
    bench_dir = project_path / ".c3" / "external_benchmark"
    runs_dir = bench_dir / "runs"
    runs_dir.mkdir(parents=True, exist_ok=True)

    # Serialise once so both files are guaranteed to hold identical content.
    payload = json.dumps(report.to_dict(), indent=2)

    ts = time.strftime("%Y%m%d_%H%M%S")
    out = runs_dir / f"swe_bench_lite_{ts}.json"
    out.write_text(payload, encoding="utf-8")
    (bench_dir / "latest.json").write_text(payload, encoding="utf-8")
    return out
|