code-context-control 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. cli/__init__.py +1 -0
  2. cli/_hook_utils.py +99 -0
  3. cli/c3.py +6152 -0
  4. cli/commands/__init__.py +1 -0
  5. cli/commands/common.py +312 -0
  6. cli/commands/parser.py +286 -0
  7. cli/docs.html +3178 -0
  8. cli/edits.html +878 -0
  9. cli/hook_auto_snapshot.py +142 -0
  10. cli/hook_c3_signal.py +61 -0
  11. cli/hook_c3read.py +116 -0
  12. cli/hook_edit_ledger.py +213 -0
  13. cli/hook_edit_unlock.py +170 -0
  14. cli/hook_filter.py +130 -0
  15. cli/hook_ghost_files.py +238 -0
  16. cli/hook_pretool_enforce.py +334 -0
  17. cli/hook_read.py +200 -0
  18. cli/hook_session_stats.py +62 -0
  19. cli/hook_terse_advisor.py +190 -0
  20. cli/hub.html +3764 -0
  21. cli/hub_server.py +1619 -0
  22. cli/mcp_proxy.py +428 -0
  23. cli/mcp_server.py +660 -0
  24. cli/server.py +2985 -0
  25. cli/tools/__init__.py +4 -0
  26. cli/tools/_helpers.py +65 -0
  27. cli/tools/agent.py +1165 -0
  28. cli/tools/compress.py +215 -0
  29. cli/tools/delegate.py +1184 -0
  30. cli/tools/edit.py +313 -0
  31. cli/tools/edits.py +118 -0
  32. cli/tools/filter.py +285 -0
  33. cli/tools/impact.py +163 -0
  34. cli/tools/memory.py +469 -0
  35. cli/tools/read.py +224 -0
  36. cli/tools/search.py +337 -0
  37. cli/tools/session.py +95 -0
  38. cli/tools/shell.py +193 -0
  39. cli/tools/status.py +306 -0
  40. cli/tools/validate.py +310 -0
  41. cli/ui/api.js +36 -0
  42. cli/ui/app.js +207 -0
  43. cli/ui/components/chat.js +758 -0
  44. cli/ui/components/dashboard.js +689 -0
  45. cli/ui/components/edits.js +220 -0
  46. cli/ui/components/instructions.js +481 -0
  47. cli/ui/components/memory.js +626 -0
  48. cli/ui/components/sessions.js +606 -0
  49. cli/ui/components/settings.js +1404 -0
  50. cli/ui/components/sidebar.js +156 -0
  51. cli/ui/icons.js +51 -0
  52. cli/ui/shared.js +119 -0
  53. cli/ui/theme.js +22 -0
  54. cli/ui.html +168 -0
  55. cli/ui_legacy.html +6797 -0
  56. cli/ui_nano.html +503 -0
  57. code_context_control-2.28.0.dist-info/METADATA +248 -0
  58. code_context_control-2.28.0.dist-info/RECORD +150 -0
  59. code_context_control-2.28.0.dist-info/WHEEL +5 -0
  60. code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
  61. code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
  62. code_context_control-2.28.0.dist-info/top_level.txt +5 -0
  63. core/__init__.py +75 -0
  64. core/config.py +269 -0
  65. core/ide.py +188 -0
  66. oracle/__init__.py +1 -0
  67. oracle/config.py +75 -0
  68. oracle/oracle.html +3900 -0
  69. oracle/oracle_server.py +663 -0
  70. oracle/services/__init__.py +1 -0
  71. oracle/services/c3_bridge.py +210 -0
  72. oracle/services/chat_engine.py +1103 -0
  73. oracle/services/chat_store.py +155 -0
  74. oracle/services/cross_memory.py +154 -0
  75. oracle/services/federated_graph.py +463 -0
  76. oracle/services/health_checker.py +117 -0
  77. oracle/services/insight_engine.py +307 -0
  78. oracle/services/memory_reader.py +106 -0
  79. oracle/services/memory_writer.py +182 -0
  80. oracle/services/ollama_bridge.py +332 -0
  81. oracle/services/project_scanner.py +87 -0
  82. oracle/services/review_agent.py +206 -0
  83. services/__init__.py +1 -0
  84. services/activity_log.py +93 -0
  85. services/agent_base.py +124 -0
  86. services/agents.py +1529 -0
  87. services/auto_memory.py +407 -0
  88. services/bench/__init__.py +6 -0
  89. services/bench/external/__init__.py +29 -0
  90. services/bench/external/aider_polyglot.py +405 -0
  91. services/bench/external/swe_bench.py +485 -0
  92. services/benchmark_dashboard.py +596 -0
  93. services/claude_md.py +785 -0
  94. services/compressor.py +592 -0
  95. services/context_snapshot.py +356 -0
  96. services/conversation_store.py +870 -0
  97. services/doc_index.py +537 -0
  98. services/e2e_benchmark.py +2884 -0
  99. services/e2e_evaluator.py +396 -0
  100. services/e2e_tasks.py +743 -0
  101. services/edit_ledger.py +459 -0
  102. services/embedding_index.py +341 -0
  103. services/error_reporting.py +123 -0
  104. services/file_memory.py +734 -0
  105. services/hub_service.py +585 -0
  106. services/indexer.py +712 -0
  107. services/memory.py +318 -0
  108. services/memory_consolidator.py +538 -0
  109. services/memory_graph.py +382 -0
  110. services/memory_grounder.py +304 -0
  111. services/memory_scorer.py +246 -0
  112. services/metrics.py +86 -0
  113. services/notifications.py +209 -0
  114. services/ollama_client.py +201 -0
  115. services/output_filter.py +488 -0
  116. services/parser.py +1238 -0
  117. services/project_manager.py +579 -0
  118. services/protocol.py +306 -0
  119. services/proxy_state.py +152 -0
  120. services/retrieval_broker.py +129 -0
  121. services/router.py +414 -0
  122. services/runtime.py +326 -0
  123. services/session_benchmark.py +1945 -0
  124. services/session_manager.py +1026 -0
  125. services/session_preloader.py +251 -0
  126. services/text_index.py +90 -0
  127. services/tool_classifier.py +176 -0
  128. services/transcript_index.py +340 -0
  129. services/validation_cache.py +155 -0
  130. services/vector_store.py +299 -0
  131. services/version_tracker.py +271 -0
  132. services/watcher.py +192 -0
  133. tui/__init__.py +0 -0
  134. tui/backend.py +59 -0
  135. tui/main.py +145 -0
  136. tui/screens/__init__.py +1 -0
  137. tui/screens/benchmark_view.py +109 -0
  138. tui/screens/claudemd_view.py +46 -0
  139. tui/screens/compress_view.py +52 -0
  140. tui/screens/index_view.py +74 -0
  141. tui/screens/init_view.py +82 -0
  142. tui/screens/mcp_view.py +73 -0
  143. tui/screens/optimize_view.py +41 -0
  144. tui/screens/pipe_view.py +46 -0
  145. tui/screens/projects_view.py +355 -0
  146. tui/screens/search_view.py +55 -0
  147. tui/screens/session_view.py +143 -0
  148. tui/screens/stats.py +158 -0
  149. tui/screens/ui_view.py +54 -0
  150. tui/theme.tcss +335 -0
@@ -0,0 +1,485 @@
1
+ """SWE-bench Lite external benchmark adapter.
2
+
3
+ Wraps SWE-bench (https://www.swebench.com) — 300 real GitHub issues from 12
4
+ popular Python repos in the Lite subset. For each instance, an agent reads
5
+ the issue and produces a patch; the harness then runs the repo's tests in an
6
+ isolated Docker image to decide if the issue was resolved.
7
+
8
+ Setup (one-time):
9
+ # Dataset:
10
+ pip install datasets
11
+ # Evaluation (optional, requires Docker):
12
+ pip install swebench
13
+
14
+ # OR download the Lite dataset as a JSON file once:
15
+ python -c "from datasets import load_dataset; \\
16
+ ds = load_dataset('princeton-nlp/SWE-bench_Lite', split='test'); \\
17
+ ds.to_json('swe_bench_lite.jsonl')"
18
+
19
+ Run:
20
+ c3 bench external --suite swe-bench-lite --dataset swe_bench_lite.jsonl \\
21
+ --agent aider --model gpt-4o-mini --max-tasks 5
22
+
23
+ What it produces:
24
+ 1. Predictions JSONL: .c3/external_benchmark/runs/swebench_<ts>_{with_c3,baseline}.jsonl
25
+ Each line: {"instance_id": "...", "model_patch": "diff --git ...",
26
+ "model_name_or_path": "c3+aider-gpt4o"}
27
+ Directly consumable by the official SWE-bench evaluation harness.
28
+ 2. Summary JSON: .c3/external_benchmark/runs/swe_bench_lite_<ts>.json
29
+ Aggregated resolution rate, latency, cost (populated after evaluation).
30
+ 3. Instructions to run the Docker-based evaluator if swebench is installed.
31
+
32
+ Honest caveats:
33
+ - Patch generation is reliable without Docker. Resolution evaluation REQUIRES
34
+ Docker (one image per instance) — absent, we record "unevaluated".
35
+ - Some repos require specific Python versions + deps that only install
36
+ cleanly inside their official instance image. Do not try to run tests
37
+ outside Docker.
38
+ - Real SWE-bench Lite runs are slow (many minutes per task). Start small
39
+ (--max-tasks 2–5) to iterate, then scale up.
40
+ """
41
+ from __future__ import annotations
42
+
43
+ import json
44
+ import subprocess
45
+ import tempfile
46
+ import time
47
+ from dataclasses import asdict, dataclass, field
48
+ from pathlib import Path
49
+ from typing import Optional
50
+
51
+
52
@dataclass
class SWEBenchTask:
    """One SWE-bench instance: an issue pinned to a repository commit.

    Fields mirror the keys read by :meth:`from_dict`. ``fail_to_pass`` and
    ``pass_to_pass`` hold the test identifiers taken from the dataset row.
    """

    instance_id: str
    repo: str
    base_commit: str
    problem_statement: str
    hints_text: str = ""
    test_patch: str = ""
    patch: str = ""  # gold patch (for reference only — do NOT feed to agent)
    fail_to_pass: list[str] = field(default_factory=list)
    pass_to_pass: list[str] = field(default_factory=list)
    version: str = ""

    @classmethod
    def from_dict(cls, d: dict) -> "SWEBenchTask":
        """Build a task from one dataset row.

        Args:
            d: Mapping with the dataset keys. The test lists are looked up as
                ``FAIL_TO_PASS`` / ``PASS_TO_PASS`` first, then lowercase.

        Returns:
            A populated ``SWEBenchTask``. Missing or ``None`` text fields
            become ``""``; the two test lists are always real lists.
        """

        def _text(key: str) -> str:
            # A JSON null must not leak through as None (or the string
            # "None") into fields that are declared as str.
            val = d.get(key)
            return "" if val is None else str(val)

        def _parse_list(val) -> list:
            if isinstance(val, (list, tuple)):
                return list(val)
            if not isinstance(val, str) or not val.strip():
                return []
            try:
                parsed = json.loads(val)
            except ValueError:
                # Not JSON at all: treat the raw string as a single test id.
                return [val]
            if isinstance(parsed, list):
                return parsed
            # A JSON scalar decoded fine but is not a list; keep the declared
            # list type instead of returning a bare str/number.
            return [parsed] if isinstance(parsed, str) else [val]

        return cls(
            instance_id=_text("instance_id"),
            repo=_text("repo"),
            base_commit=_text("base_commit"),
            problem_statement=_text("problem_statement"),
            hints_text=_text("hints_text"),
            test_patch=_text("test_patch"),
            patch=_text("patch"),
            fail_to_pass=_parse_list(d.get("FAIL_TO_PASS", d.get("fail_to_pass", []))),
            pass_to_pass=_parse_list(d.get("PASS_TO_PASS", d.get("pass_to_pass", []))),
            version=_text("version"),
        )
89
+
90
+
91
@dataclass
class SWEBenchResult:
    """Outcome of running one task in one mode."""

    # Identity of the run.
    instance_id: str
    repo: str
    mode: str  # "with_c3" | "baseline"

    # Patch produced by the agent.
    model_patch: str = ""
    patch_empty: bool = True
    patch_lines: int = 0

    # Measurements reported for the agent invocation.
    latency_s: float = 0.0
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0

    # Evaluation outcome and failure detail.
    resolved: Optional[bool] = None  # None = unevaluated (no Docker)
    error: str = ""

    def to_dict(self) -> dict:
        """Return every field as a plain dict, keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
108
+
109
+
110
@dataclass
class SWEBenchReport:
    """Aggregate record of one benchmark run covering both modes.

    ``results`` holds one entry per (task, mode) pair; ``predictions_*`` hold
    the paths of the predictions files as strings.
    """

    timestamp: str
    project_path: str
    agent: str
    model: str
    dataset: str
    tasks_run: int = 0
    evaluation_method: str = "none"  # "swebench-docker" | "none"
    results: list[SWEBenchResult] = field(default_factory=list)
    predictions_with_c3: str = ""
    predictions_baseline: str = ""

    def to_dict(self) -> dict:
        """Return the report as a JSON-serialisable dict, scorecard included."""
        return {
            "timestamp": self.timestamp,
            "project_path": self.project_path,
            "suite": "swe-bench-lite",
            "tier": "external",
            "benchmark_type": "swe_bench_lite",
            "agent": self.agent,
            "model": self.model,
            "dataset": self.dataset,
            "tasks_run": self.tasks_run,
            "evaluation_method": self.evaluation_method,
            "results": [r.to_dict() for r in self.results],
            "predictions_with_c3": self.predictions_with_c3,
            "predictions_baseline": self.predictions_baseline,
            "scorecard": self._scorecard(),
        }

    def _scorecard(self) -> dict:
        """Summarise patch rate, pass rate, latency and cost per mode.

        Pass-rate fields are ``None`` until at least one result carries a
        non-``None`` ``resolved`` value. Rates are percentages rounded to one
        decimal, with the full per-mode result count as denominator.
        """
        with_c3 = [r for r in self.results if r.mode == "with_c3"]
        base = [r for r in self.results if r.mode == "baseline"]

        def pct(hits, total):
            return round(100.0 * hits / total, 1) if total else 0.0

        def avg_latency(rows):
            return round(sum(r.latency_s for r in rows) / len(rows), 1) if rows else 0

        # Patch-generation metrics (always available)
        with_c3_patched = sum(1 for r in with_c3 if not r.patch_empty)
        base_patched = sum(1 for r in base if not r.patch_empty)

        # Resolution metrics (only if evaluated)
        evaluated = any(r.resolved is not None for r in self.results)
        with_c3_pass = base_pass = delta = None
        if evaluated:
            with_c3_pass = pct(sum(1 for r in with_c3 if r.resolved is True), len(with_c3))
            base_pass = pct(sum(1 for r in base if r.resolved is True), len(base))
            # Subtracting two rounded floats reintroduces binary noise
            # (e.g. 33.400000000000006), so round the difference as well.
            delta = round(with_c3_pass - base_pass, 1)

        return {
            "evaluated": evaluated,
            "with_c3_patch_rate": pct(with_c3_patched, len(with_c3)),
            "baseline_patch_rate": pct(base_patched, len(base)),
            # Resolution delta — the headline metric (null if unevaluated)
            "with_c3_pass_rate": with_c3_pass,
            "baseline_pass_rate": base_pass,
            "pass_rate_delta": delta,
            "with_c3_avg_latency_s": avg_latency(with_c3),
            "baseline_avg_latency_s": avg_latency(base),
            "with_c3_total_cost_usd": round(sum(r.cost_usd for r in with_c3), 4),
            "baseline_total_cost_usd": round(sum(r.cost_usd for r in base), 4),
            "with_c3_count": len(with_c3),
            "baseline_count": len(base),
        }
177
+
178
+
179
def load_tasks(dataset_path: str) -> list[SWEBenchTask]:
    """Load SWE-bench tasks from a JSON, JSONL, or HuggingFace dataset name.

    Accepted forms:
    - "path/to/swe_bench_lite.jsonl" — one JSON object per line
    - "path/to/tasks.json" — a JSON array
    - "princeton-nlp/SWE-bench_Lite" — HuggingFace dataset id (lazy import)

    Anything that is not an existing regular file (including a directory) is
    handed to ``datasets.load_dataset``.

    Raises:
        ValueError: the file is empty, or contains something other than JSON
            objects (``json.JSONDecodeError`` for malformed JSON is a
            subclass of ``ValueError``).
        RuntimeError: no such file exists and ``datasets`` is not installed.
    """
    p = Path(dataset_path)
    # is_file(), not exists(): a directory must not be fed to read_text().
    if p.is_file():
        # utf-8-sig also accepts plain UTF-8 and drops a leading BOM that
        # would otherwise make json.loads fail on the first character.
        text = p.read_text(encoding="utf-8-sig").strip()
        # JSON array form: starts with '['
        if text.startswith("["):
            data = json.loads(text)
            if not isinstance(data, list):
                raise ValueError(f"Unrecognised dataset format: {dataset_path}")
            for item in data:
                if not isinstance(item, dict):
                    raise ValueError(
                        f"Expected object per JSON array element, got {type(item).__name__}"
                    )
            return [SWEBenchTask.from_dict(r) for r in data]
        # Otherwise JSONL: one JSON object per line
        rows = []
        for line in text.splitlines():
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            if not isinstance(obj, dict):
                raise ValueError(f"Expected object per JSONL line, got {type(obj).__name__}")
            rows.append(obj)
        if not rows:
            raise ValueError(f"Empty dataset: {dataset_path}")
        return [SWEBenchTask.from_dict(r) for r in rows]

    # HuggingFace id (e.g. "princeton-nlp/SWE-bench_Lite")
    try:
        from datasets import load_dataset  # type: ignore
    except ImportError as e:
        raise RuntimeError(
            f"Dataset file not found at {dataset_path!r}, and `datasets` package "
            "is not installed. Install with `pip install datasets` or pass a "
            "local JSONL path."
        ) from e
    ds = load_dataset(dataset_path, split="test")
    return [SWEBenchTask.from_dict(r) for r in ds]
221
+
222
+
223
+ def _clone_and_checkout(task: SWEBenchTask, workspace: Path) -> Optional[str]:
224
+ """Shallow-clone + checkout the base commit. Returns error string or None."""
225
+ url = f"https://github.com/{task.repo}.git"
226
+ try:
227
+ subprocess.run(
228
+ ["git", "clone", "--quiet", url, str(workspace)],
229
+ check=True, capture_output=True, text=True, timeout=300,
230
+ )
231
+ subprocess.run(
232
+ ["git", "-C", str(workspace), "fetch", "--quiet", "origin", task.base_commit],
233
+ check=False, capture_output=True, text=True, timeout=120,
234
+ )
235
+ subprocess.run(
236
+ ["git", "-C", str(workspace), "checkout", "--quiet", task.base_commit],
237
+ check=True, capture_output=True, text=True, timeout=60,
238
+ )
239
+ except subprocess.CalledProcessError as e:
240
+ return f"git: {e.stderr.strip()[:200]}"
241
+ except subprocess.TimeoutExpired:
242
+ return "git clone/checkout timed out"
243
+ except FileNotFoundError:
244
+ return "git not on PATH"
245
+ return None
246
+
247
+
248
+ def _diff_workspace(workspace: Path) -> str:
249
+ """Return the unified diff of workspace vs base commit (the patch)."""
250
+ try:
251
+ proc = subprocess.run(
252
+ ["git", "-C", str(workspace), "diff", "HEAD"],
253
+ capture_output=True, text=True, timeout=30,
254
+ )
255
+ return proc.stdout
256
+ except Exception:
257
+ return ""
258
+
259
+
260
def _run_aider_on_task(
    task: SWEBenchTask, workspace: Path, model: str, timeout: int,
) -> tuple[float, int, int, float, str]:
    """Invoke aider against the task inside ``workspace``.

    Args:
        task: Supplies the issue text and optional hints for the prompt.
        workspace: Checked-out repository, used as aider's working directory.
        model: Value passed to aider's ``--model``.
        timeout: Wall-clock limit in seconds for the aider process.

    Returns:
        ``(latency_s, input_tok, output_tok, cost, error)``; ``error`` is
        ``""`` when the process ran to completion. Its exit status is not
        inspected.
    """
    from services.bench.external.aider_polyglot import (
        _parse_aider_tokens_cost,
        detect_aider,
    )

    aider = detect_aider()
    if not aider:
        return (0.0, 0, 0, 0.0, "aider CLI not on PATH")

    prompt = (
        f"Resolve this GitHub issue in this repository. "
        f"Make minimal focused changes — do not modify tests.\n\n"
        f"=== Issue ===\n{task.problem_statement}\n"
    )
    if task.hints_text:
        # Hints are capped at 2000 characters before being added.
        prompt += f"\n=== Hints ===\n{task.hints_text[:2000]}\n"

    cmd = [
        aider,
        "--model", model,
        "--yes-always",
        "--no-auto-commits",
        "--no-pretty",
        "--no-stream",
        "--map-tokens", "4096",
        "--message", prompt,
    ]

    t0 = time.monotonic()
    try:
        # Decode explicitly and leniently: with the locale default, output
        # the locale codec cannot decode raises UnicodeDecodeError, which
        # nothing here catches.
        proc = subprocess.run(
            cmd, cwd=str(workspace), capture_output=True, text=True,
            encoding="utf-8", errors="replace", timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return (float(timeout), 0, 0, 0.0, "aider timed out")
    except OSError as e:
        # Report a launch failure for this task instead of aborting the run.
        return (0.0, 0, 0, 0.0, f"aider failed to start: {e}")

    latency = round(time.monotonic() - t0, 1)
    inp, out, cost = _parse_aider_tokens_cost(proc.stdout + proc.stderr)
    return (latency, inp, out, cost, "")
302
+
303
+
304
+ def _write_c3_mcp_config(workspace: Path) -> None:
305
+ (workspace / ".mcp.json").write_text(
306
+ json.dumps({
307
+ "mcpServers": {
308
+ "c3": {
309
+ "command": "python",
310
+ "args": ["-m", "cli.mcp_server"],
311
+ "env": {"C3_PROJECT_PATH": str(workspace)},
312
+ }
313
+ }
314
+ }, indent=2),
315
+ encoding="utf-8",
316
+ )
317
+
318
+
319
class SWEBenchAdapter:
    """Runs every task in both modes and collects results and predictions."""

    def __init__(
        self,
        project_path: Path,
        tasks: list[SWEBenchTask],
        *,
        agent: str = "aider",
        model: str = "gpt-4o-mini",
        timeout_per_task: int = 600,
        verbose: bool = False,
    ):
        """Store the run configuration; no work is done here."""
        self.project_path = project_path
        self.tasks = tasks
        self.agent = agent
        self.model = model
        self.timeout = timeout_per_task
        self.verbose = verbose

    def run_all(self, dataset_label: str = "") -> SWEBenchReport:
        """Run all tasks (baseline first, then with_c3) and write predictions.

        One JSONL predictions file per mode is written under
        ``<project_path>/.c3/external_benchmark/runs`` and both paths are
        recorded on the returned report.
        """
        report = SWEBenchReport(
            timestamp=time.strftime("%Y-%m-%dT%H:%M:%S"),
            project_path=str(self.project_path),
            agent=self.agent,
            model=self.model,
            dataset=dataset_label,
            tasks_run=len(self.tasks),
        )

        predictions = {"with_c3": [], "baseline": []}

        for task in self.tasks:
            if self.verbose:
                print(f"\n [{task.repo}] {task.instance_id}")
            for mode in ("baseline", "with_c3"):
                outcome = self._run_one(task, mode)
                report.results.append(outcome)
                predictions[mode].append({
                    "instance_id": task.instance_id,
                    "model_patch": outcome.model_patch,
                    "model_name_or_path": f"{'c3+' if mode == 'with_c3' else ''}{self.agent}-{self.model}",
                })
                if not self.verbose:
                    continue
                status = "EMPTY" if outcome.patch_empty else "patched"
                print(f" {mode:<9} {status} t={outcome.latency_s:.1f}s "
                      f"tok={outcome.input_tokens + outcome.output_tokens}")

        # Save predictions JSONL for both modes
        runs_dir = self.project_path / ".c3" / "external_benchmark" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        stamp = time.strftime("%Y%m%d_%H%M%S")
        c3_file = runs_dir / f"swebench_{stamp}_with_c3.jsonl"
        baseline_file = runs_dir / f"swebench_{stamp}_baseline.jsonl"
        c3_file.write_text(
            "\n".join(json.dumps(p) for p in predictions["with_c3"]), encoding="utf-8",
        )
        baseline_file.write_text(
            "\n".join(json.dumps(p) for p in predictions["baseline"]), encoding="utf-8",
        )
        report.predictions_with_c3 = str(c3_file)
        report.predictions_baseline = str(baseline_file)

        return report

    def _run_one(self, task: SWEBenchTask, mode: str) -> SWEBenchResult:
        """Run one task in one mode inside a throwaway clone.

        Returns a result whose ``error`` is set when cloning fails, the agent
        is unsupported, or the agent reports an error; otherwise the
        workspace diff is stored as the patch.
        """
        result = SWEBenchResult(
            instance_id=task.instance_id, repo=task.repo, mode=mode,
        )
        with tempfile.TemporaryDirectory(prefix=f"c3-swe-{mode}-") as tmp:
            workspace = Path(tmp)

            clone_error = _clone_and_checkout(task, workspace)
            if clone_error:
                result.error = clone_error
                return result

            if mode == "with_c3":
                _write_c3_mcp_config(workspace)

            if self.agent != "aider":
                result.error = f"agent not supported yet: {self.agent}"
                return result

            (
                result.latency_s,
                result.input_tokens,
                result.output_tokens,
                result.cost_usd,
                agent_error,
            ) = _run_aider_on_task(task, workspace, self.model, self.timeout)
            if agent_error:
                result.error = agent_error
                return result

            # The diff must be taken before the temporary directory is removed.
            patch = _diff_workspace(workspace)
            result.model_patch = patch
            result.patch_empty = not patch.strip()
            result.patch_lines = patch.count("\n") if patch else 0

        return result
413
+
414
+
415
def evaluate_with_docker(
    predictions_path: Path,
    dataset_path: str,
    run_id: str = "c3-bench",
    max_workers: int = 1,
    timeout: int = 1800,
) -> Optional[dict]:
    """Run the official SWE-bench evaluation harness if swebench + Docker are available.

    Args:
        predictions_path: Predictions JSONL file to evaluate.
        dataset_path: Passed to the harness as ``--dataset_name``.
        run_id: Harness run id; also used to find the results file.
        max_workers: Passed to the harness as ``--max_workers``.
        timeout: Passed to the harness as ``--timeout``; the overall process
            limit is this value times the number of predictions.

    Returns:
        The parsed results JSON, ``{"error": ...}`` if the harness run
        failed, or ``None`` if the harness or Docker is not usable or no
        results file was found.
    """
    import sys

    try:
        import swebench.harness.run_evaluation  # noqa: F401
    except ImportError:
        return None
    # Docker check
    try:
        subprocess.run(
            ["docker", "version"], check=True, capture_output=True, timeout=10,
        )
    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired):
        return None

    cmd = [
        # Run the harness with the interpreter that just imported swebench;
        # a bare "python" from PATH may be a different environment.
        sys.executable or "python", "-m", "swebench.harness.run_evaluation",
        "--predictions_path", str(predictions_path),
        "--dataset_name", dataset_path,
        "--max_workers", str(max_workers),
        "--run_id", run_id,
        "--timeout", str(timeout),
    ]
    try:
        lines = predictions_path.read_text(encoding="utf-8").splitlines()
        prediction_count = sum(1 for line in lines if line.strip())
        # max(1, ...): an empty predictions file must not produce timeout=0.
        subprocess.run(cmd, check=True, timeout=timeout * max(1, prediction_count))
    except Exception as e:
        return {"error": str(e)}

    def _mtime(path: Path) -> float:
        try:
            return path.stat().st_mtime
        except OSError:
            return 0.0

    # Parse the harness-generated results report. Newest first, so a stale
    # file left by an earlier run with the same run_id is not preferred.
    cwd = Path.cwd()
    candidates = list(cwd.glob(f"*{run_id}*results.json")) + \
        list(cwd.glob(f"results-*{run_id}.json"))
    for candidate in sorted(candidates, key=_mtime, reverse=True):
        try:
            return json.loads(candidate.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            continue
    return None
460
+
461
+
462
def apply_resolution_results(
    report: SWEBenchReport, eval_result: dict, mode: str,
) -> None:
    """Merge resolved/unresolved sets from an evaluation into the report results.

    Only results whose ``mode`` matches are touched. Instances listed in
    neither set keep their current ``resolved`` value. A falsy
    ``eval_result``, or one without usable id lists, changes nothing.

    Args:
        report: Report whose ``results`` are updated in place.
        eval_result: Evaluation output; ids are read from ``resolved_ids`` /
            ``unresolved_ids``, falling back to ``resolved`` / ``unresolved``.
        mode: The mode these evaluation results belong to.
    """
    if not eval_result:
        return

    def _id_set(primary: str, fallback: str) -> set:
        value = eval_result.get(primary) or eval_result.get(fallback)
        # Accept real collections only: an integer count would make set()
        # raise, and a string would be split into single characters.
        return set(value) if isinstance(value, (list, tuple, set, frozenset)) else set()

    resolved = _id_set("resolved_ids", "resolved")
    unresolved = _id_set("unresolved_ids", "unresolved")
    for r in report.results:
        if r.mode != mode:
            continue
        if r.instance_id in resolved:
            r.resolved = True
        elif r.instance_id in unresolved:
            r.resolved = False
475
+
476
+
477
def save_report(project_path: Path, report: SWEBenchReport) -> Path:
    """Write the report as JSON and refresh ``latest.json``.

    The report goes to
    ``<project_path>/.c3/external_benchmark/runs/swe_bench_lite_<ts>.json``
    and the same text to ``<project_path>/.c3/external_benchmark/latest.json``.

    Returns:
        Path of the timestamped report file.
    """
    benchmark_dir = project_path / ".c3" / "external_benchmark"
    runs_dir = benchmark_dir / "runs"
    runs_dir.mkdir(parents=True, exist_ok=True)

    # Serialise once so both files are guaranteed to hold identical content.
    payload = json.dumps(report.to_dict(), indent=2)

    ts = time.strftime("%Y%m%d_%H%M%S")
    out = runs_dir / f"swe_bench_lite_{ts}.json"
    out.write_text(payload, encoding="utf-8")
    (benchmark_dir / "latest.json").write_text(payload, encoding="utf-8")
    return out