sin-code-bundle 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sin_code_bundle/__init__.py +6 -0
- sin_code_bundle/agents_md.py +245 -0
- sin_code_bundle/ast_edit.py +323 -0
- sin_code_bundle/bench.py +506 -0
- sin_code_bundle/budget.py +51 -0
- sin_code_bundle/cache.py +131 -0
- sin_code_bundle/checkpoint.py +230 -0
- sin_code_bundle/cli.py +1943 -0
- sin_code_bundle/codocs.py +328 -0
- sin_code_bundle/dap_bridge.py +135 -0
- sin_code_bundle/data/codocs/SKILL.md +280 -0
- sin_code_bundle/gitnexus.py +368 -0
- sin_code_bundle/hashline.py +216 -0
- sin_code_bundle/hooks.py +249 -0
- sin_code_bundle/immortal_commit.py +288 -0
- sin_code_bundle/interceptor.py +119 -0
- sin_code_bundle/lsp_backend.py +303 -0
- sin_code_bundle/lsp_bootstrap.py +85 -0
- sin_code_bundle/markitdown.py +254 -0
- sin_code_bundle/mcp_config.py +455 -0
- sin_code_bundle/mcp_server.py +963 -0
- sin_code_bundle/memory.py +208 -0
- sin_code_bundle/merge_safety.py +313 -0
- sin_code_bundle/orchestration_worktrees.py +102 -0
- sin_code_bundle/policy.py +224 -0
- sin_code_bundle/preflight.py +152 -0
- sin_code_bundle/programming_workflow.py +541 -0
- sin_code_bundle/rtk.py +154 -0
- sin_code_bundle/safety.py +52 -0
- sin_code_bundle/session_warmup.py +247 -0
- sin_code_bundle/skills.py +188 -0
- sin_code_bundle/symbol_resolve.py +166 -0
- sin_code_bundle/tools/__init__.py +4 -0
- sin_code_bundle/tools/pypi_setup.py +289 -0
- sin_code_bundle/vfs.py +264 -0
- sin_code_bundle-0.9.2.dist-info/METADATA +470 -0
- sin_code_bundle-0.9.2.dist-info/RECORD +41 -0
- sin_code_bundle-0.9.2.dist-info/WHEEL +5 -0
- sin_code_bundle-0.9.2.dist-info/entry_points.txt +4 -0
- sin_code_bundle-0.9.2.dist-info/licenses/LICENSE +21 -0
- sin_code_bundle-0.9.2.dist-info/top_level.txt +1 -0
sin_code_bundle/bench.py
ADDED
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
# SPDX-License-Identifier: MIT
|
|
2
|
+
"""SWE-bench-style A/B evaluation harness for the SIN-Code Bundle.
|
|
3
|
+
|
|
4
|
+
Goal: produce an objective, reproducible number that answers
|
|
5
|
+
"do the SIN tools (impact / semantic_diff / verify / oracle) actually improve
|
|
6
|
+
an agent's pass-rate?"
|
|
7
|
+
|
|
8
|
+
Design
|
|
9
|
+
------
|
|
10
|
+
- Loads a task set (SWE-bench Lite subset by default, or a local JSONL file).
|
|
11
|
+
- Runs each task twice through a pluggable agent runner:
|
|
12
|
+
* arm "control" -> SIN tools DISABLED (SIN_ENFORCE=0)
|
|
13
|
+
* arm "sin" -> SIN tools ENABLED (SIN_ENFORCE=1)
|
|
14
|
+
- Applies the produced patch in an isolated git worktree and runs the task's
|
|
15
|
+
FAIL_TO_PASS tests (PASS_TO_PASS ids are loaded but not executed yet).
|
|
16
|
+
- Reports resolved-rate per arm, the delta, and a per-task breakdown.
|
|
17
|
+
|
|
18
|
+
The harness is intentionally runner-agnostic: you wire in opencode / codex /
|
|
19
|
+
hermes via a small AgentRunner. A DryRunRunner is included so `sin bench`
|
|
20
|
+
works end-to-end without any LLM credits.
|
|
21
|
+
|
|
22
|
+
Docs: bench.doc.md
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import json
|
|
28
|
+
import statistics
|
|
29
|
+
import subprocess
|
|
30
|
+
import tempfile
|
|
31
|
+
import time
|
|
32
|
+
from dataclasses import asdict, dataclass, field
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
from typing import Callable, Iterable, Literal, Optional, Protocol
|
|
35
|
+
|
|
36
|
+
Arm = Literal["control", "sin"]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# --------------------------------------------------------------------------- #
|
|
40
|
+
# ── Task + Result Models: SWE-bench compatible dataclasses ────────────────── #
|
|
41
|
+
# --------------------------------------------------------------------------- #
|
|
42
|
+
@dataclass(frozen=True)
class Task:
    """Immutable description of one benchmark problem (SWE-bench field subset).

    Attributes:
        instance_id: Task identifier; also used (with ``/`` replaced) as the
            name of the per-task working directory.
        repo: ``owner/name`` slug; the harness clones
            ``https://github.com/<repo>.git``.
        base_commit: Revision that is checked out before the agent runs.
        problem_statement: Free-text issue description handed to the agent.
        fail_to_pass: Test ids that must pass for the task to count as solved.
        pass_to_pass: Regression test ids. They are carried on the task but
            the scoring code in this module does not run them.
        setup_cmds: Shell commands run (via ``bash -lc``) after checkout.
        test_cmd: Shell command used to run tests; a test id is appended to
            it when ``fail_to_pass`` is non-empty.
    """

    # Required identity / location fields.
    instance_id: str
    repo: str
    base_commit: str
    problem_statement: str
    # Optional fields; each instance gets its own fresh list.
    fail_to_pass: list[str] = field(default_factory=list)
    pass_to_pass: list[str] = field(default_factory=list)
    setup_cmds: list[str] = field(default_factory=list)
    test_cmd: str = "pytest -q"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class TaskResult:
    """Outcome of running one task in one arm, as built by :func:`_eval_one`.

    Attributes:
        instance_id: Id of the :class:`Task` this record belongs to.
        arm: ``"control"`` (SIN tools off) or ``"sin"`` (SIN tools on).
        resolved: ``True`` only when the patch applied and every counted
            FAIL_TO_PASS test passed — the headline "solved?" flag.
        duration_s: Wall-clock seconds covering clone, agent run, patch
            application and test execution.
        patch_applied: Whether the agent's diff was accepted by ``git apply``.
        fail_to_pass_passed: How many FAIL_TO_PASS tests passed.
        fail_to_pass_total: Number of FAIL_TO_PASS tests, or ``1`` when the
            task names none and a single ``test_cmd`` run is scored instead.
        error: ``str()`` of the exception when the harness itself failed;
            ``None`` otherwise. Distinct from "the agent's patch was wrong".
    """

    instance_id: str
    arm: Arm
    resolved: bool
    duration_s: float
    patch_applied: bool
    fail_to_pass_passed: int
    fail_to_pass_total: int
    error: Optional[str] = None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass
class ArmSummary:
    """Roll-up of every :class:`TaskResult` belonging to a single arm.

    Attributes:
        arm: ``"control"`` or ``"sin"``.
        total: How many tasks were attempted in this arm.
        resolved: How many of those tasks ended with ``resolved=True``.
        resolved_rate: ``resolved / total``; ``0.0`` when ``total`` is zero.
        mean_duration_s: Arithmetic mean of the per-task durations.
    """

    arm: Arm
    total: int
    resolved: int
    resolved_rate: float
    mean_duration_s: float
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclass
class BenchReport:
    """Complete result of a benchmark run: summaries plus raw records.

    Attributes:
        arms: Mapping of arm name to its :class:`ArmSummary`.
        delta_resolved_rate: ``sin.resolved_rate - control.resolved_rate``,
            i.e. the lift expressed as a fraction (percentage points / 100).
            A positive value means the SIN arm solved more tasks.
        per_task: Every :class:`TaskResult` from every arm, in the order the
            harness produced them.
        started_at: Timestamp taken when the run began, formatted
            ``YYYY-MM-DDTHH:MM:SS`` (local time, no timezone suffix).
        finished_at: Timestamp taken when the run ended, same format.
    """

    arms: dict[str, ArmSummary]
    delta_resolved_rate: float
    per_task: list[TaskResult]
    started_at: str
    finished_at: str

    def to_json(self) -> str:
        """Return the report as an indented (2-space) JSON document.

        The nested summary and result dataclasses are flattened with
        :func:`dataclasses.asdict`, so the output contains only plain JSON
        types.
        """
        arm_dicts = {name: asdict(summary) for name, summary in self.arms.items()}
        task_dicts = [asdict(record) for record in self.per_task]
        payload = {
            "arms": arm_dicts,
            "delta_resolved_rate": self.delta_resolved_rate,
            "per_task": task_dicts,
            "started_at": self.started_at,
            "finished_at": self.finished_at,
        }
        return json.dumps(payload, indent=2)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# --------------------------------------------------------------------------- #
|
|
144
|
+
# ── Agent Runner Protocol: pluggable backends (opencode / codex / dry-run) ── #
|
|
145
|
+
# --------------------------------------------------------------------------- #
|
|
146
|
+
class AgentRunner(Protocol):
    """Structural interface every agent backend must satisfy.

    A runner attempts ``task`` inside ``workdir`` and hands back a unified
    diff. ``sin_enabled`` says whether the SIN MCP tools should be exposed to
    the underlying agent. The returned string may be empty when the agent
    changed nothing.
    """

    def run(self, task: Task, workdir: Path, sin_enabled: bool) -> str:
        """Attempt ``task`` in ``workdir`` and return the unified diff.

        This is the protocol stub; it has no behaviour of its own.
        Implementations are expected to leave their edits in ``workdir``
        (typically uncommitted) and return those edits as diff text.
        """
        ...
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class DryRunRunner:
    """No-op runner used to smoke-test the harness without an LLM.

    It never produces a patch, so every task is scored as unresolved, but the
    surrounding clone / apply / test plumbing still executes.
    """

    def run(self, task: Task, workdir: Path, sin_enabled: bool) -> str:  # noqa: ARG002
        """Ignore all arguments and return an empty diff.

        The arguments exist only to satisfy the :class:`AgentRunner`
        signature; no agent is invoked and nothing in ``workdir`` is touched.
        """
        empty_diff = ""
        return empty_diff
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class CommandRunner:
    """Runs an external agent CLI and captures the diff it leaves in the repo.

    Example wiring for opencode:
        CommandRunner(
            build_cmd=lambda task, sin: [
                "opencode", "run",
                "-m", task.problem_statement,
            ],
        )
    """

    def __init__(
        self,
        build_cmd: Callable[[Task, bool], list[str]],
        timeout_s: int = 1800,
        env_for: Optional[Callable[[Task, bool], dict[str, str]]] = None,
    ) -> None:
        """Store the command factory, the timeout and the optional env hook.

        Args:
            build_cmd: Called as ``build_cmd(task, sin_enabled)``; returns the
                argv list to execute.
            timeout_s: Seconds the agent process may run before it is killed.
            env_for: Optional ``env_for(task, sin_enabled)`` returning extra
                environment variables layered over the current environment.
        """
        self._build_cmd = build_cmd
        # 1800s = 30 min — generous enough for slow LLM rollouts but caps
        # runaway agents so a single bad task can't stall the whole sweep.
        self._timeout_s = timeout_s
        self._env_for = env_for

    def run(self, task: Task, workdir: Path, sin_enabled: bool) -> str:
        """Invoke the external agent, then return whatever ``git diff`` shows.

        The agent is expected to mutate files inside ``workdir`` directly;
        its stdout is captured but not parsed. ``SIN_ENFORCE`` is exported
        into the agent's env (``"1"`` for the SIN arm, ``"0"`` for control)
        and always overrides any value supplied by ``env_for``.

        Returns:
            Unified-diff text of the uncommitted changes to tracked files.
            Empty string if the agent produced no edits or hit the timeout.
            A non-zero agent exit code is deliberately ignored — a broken
            agent is a "failed task", not a harness error.
        """
        import os

        cmd = self._build_cmd(task, sin_enabled)
        env = {**os.environ}
        if self._env_for:
            env.update(self._env_for(task, sin_enabled))
        env["SIN_ENFORCE"] = "1" if sin_enabled else "0"

        try:
            subprocess.run(
                cmd,
                cwd=workdir,
                env=env,
                timeout=self._timeout_s,
                check=False,
                capture_output=True,
                text=True,
            )
        except subprocess.TimeoutExpired:
            # subprocess.run has already killed the child. A timed-out agent
            # counts as "no patch" (documented contract) rather than as a
            # harness error, and half-written edits are not scored.
            return ""

        # NOTE: plain `git diff` only reports tracked files; files newly
        # created by the agent are not part of the returned patch.
        diff = subprocess.run(
            ["git", "diff"],
            cwd=workdir,
            check=False,
            capture_output=True,
            text=True,
        )
        return diff.stdout
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# --------------------------------------------------------------------------- #
|
|
247
|
+
# ── Git / Test Plumbing: worktree prep, patch apply, test execution ──────── #
|
|
248
|
+
# --------------------------------------------------------------------------- #
|
|
249
|
+
def _sh(cmd: list[str], cwd: Path, timeout: int = 600) -> subprocess.CompletedProcess:
    """Run ``cmd`` in ``cwd`` and return the completed process.

    Output is captured as text and a non-zero exit code does not raise; the
    caller inspects ``returncode``. The default of 600s (10 min) suits
    clone/checkout/single-test runs; callers pass a larger value where needed.
    """
    run_options = {
        "cwd": cwd,
        "check": False,
        "capture_output": True,
        "text": True,
        "timeout": timeout,
    }
    return subprocess.run(cmd, **run_options)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _prepare_worktree(task: Task, root: Path) -> Path:
    """Clone ``task.repo`` under ``root``, check out the base commit, run setup.

    Args:
        task: Task whose repository and base commit should be materialised.
        root: Parent directory; the clone lives in ``root/<instance_id>`` with
            any ``/`` in the id replaced by ``__``.

    Returns:
        Path of the prepared working directory.

    Raises:
        RuntimeError: If ``git clone`` or ``git checkout`` exits non-zero.
            Continuing would run the agent and tests against the wrong tree,
            so this is reported as a harness error instead.
    """
    work = root / task.instance_id.replace("/", "__")
    work.mkdir(parents=True, exist_ok=True)
    url = f"https://github.com/{task.repo}.git"

    # A persistent workspace may already hold this clone from an earlier run;
    # cloning into a non-empty directory would fail, so reuse it.
    if not (work / ".git").exists():
        # 900s clone timeout — large monorepos (django, sympy) routinely
        # need >5 min on a cold network; tighter would flake the harness.
        cloned = _sh(["git", "clone", "--quiet", url, "."], cwd=work, timeout=900)
        if cloned.returncode != 0:
            raise RuntimeError(f"git clone of {url} failed: {cloned.stderr.strip()}")

    checked_out = _sh(["git", "checkout", "--quiet", task.base_commit], cwd=work)
    if checked_out.returncode != 0:
        raise RuntimeError(
            f"git checkout of {task.base_commit} failed: {checked_out.stderr.strip()}"
        )

    for cmd in task.setup_cmds:
        # 1800s per setup cmd — pip installs of scientific stacks (scipy,
        # pandas) can be slow when wheels are missing for the platform.
        # Setup stays best-effort: exit codes are not checked here.
        _sh(["bash", "-lc", cmd], cwd=work, timeout=1800)
    return work
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _apply_patch(diff: str, work: Path) -> bool:
    """Make sure ``diff`` is applied in ``work``; report whether it is.

    The diff is written to ``work/.sin_patch.diff`` and passed to
    ``git apply``. Runners that follow the :class:`AgentRunner` contract
    leave their edits in ``work`` already, and a second forward apply of the
    same diff is rejected by git. So when the forward apply fails, the patch
    is checked in reverse: if it reverses cleanly, its changes are already
    present and it counts as applied.

    Args:
        diff: Unified-diff text (may be empty).
        work: Git working directory to patch.

    Returns:
        ``True`` if the patch applied or was already applied; ``False`` for
        an empty/whitespace-only diff or one that git rejects.
    """
    if not diff.strip():
        return False
    patch = work / ".sin_patch.diff"
    patch.write_text(diff, encoding="utf-8")
    res = _sh(["git", "apply", "--whitespace=nowarn", str(patch)], cwd=work)
    if res.returncode == 0:
        return True
    # --check only tests applicability and does not modify the tree.
    already = _sh(["git", "apply", "--reverse", "--check", str(patch)], cwd=work)
    return already.returncode == 0
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _run_named_tests(work: Path, task: Task) -> tuple[int, int]:
    """Run the task's FAIL_TO_PASS tests in ``work`` and count the passes.

    Only ``task.fail_to_pass`` is executed; ``task.pass_to_pass`` is not
    consulted here.

    Args:
        work: Prepared (and patched) working directory.
        task: Task supplying ``test_cmd`` and the test ids.

    Returns:
        ``(passed, total)``. With no named tests, ``test_cmd`` is run once and
        the result is ``(1, 1)`` on exit code 0, otherwise ``(0, 1)``.
    """
    import shlex

    if not task.fail_to_pass:
        # Fallback path: SWE-bench tasks usually name specific tests, but some
        # in-house tasks just ship a `test_cmd` and rely on its overall exit
        # code (0 = solved, non-zero = not solved).
        res = _sh(["bash", "-lc", task.test_cmd], cwd=work, timeout=1800)
        return (1, 1) if res.returncode == 0 else (0, 1)

    passed = 0
    for test_id in task.fail_to_pass:
        # Test ids routinely contain shell metacharacters (`[`, `(`, spaces);
        # quote the id so bash passes it through as one literal argument.
        # `test_cmd` itself is intentionally left as a shell fragment.
        command = f"{task.test_cmd} {shlex.quote(test_id)}"
        # 900s per single test — pytest selectors on huge repos (django) need
        # collection time even before the test itself runs.
        res = _sh(["bash", "-lc", command], cwd=work, timeout=900)
        if res.returncode == 0:
            passed += 1
    return passed, len(task.fail_to_pass)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
# --------------------------------------------------------------------------- #
|
|
304
|
+
# ── Core Eval Loop: drive runner + scoring per task per arm ──────────────── #
|
|
305
|
+
# --------------------------------------------------------------------------- #
|
|
306
|
+
def _eval_one(task: Task, arm: Arm, runner: AgentRunner, root: Path) -> TaskResult:
    """Score one (task, arm) pair and always return a :class:`TaskResult`.

    Steps: prepare the working directory, run the agent, apply its diff and,
    if the diff applied, run the tests. Any exception raised along the way is
    captured in ``TaskResult.error`` instead of propagating.

    Args:
        task: The task to attempt.
        arm: ``"sin"`` enables the SIN tools for the runner; anything else
            disables them.
        runner: Agent backend producing the diff.
        root: Directory under which this task's working copy is created.
    """
    began = time.time()
    try:
        worktree = _prepare_worktree(task, root)
        patch_text = runner.run(task, worktree, sin_enabled=(arm == "sin"))
        applied = _apply_patch(patch_text, worktree)
        if applied:
            passed, total = _run_named_tests(worktree, task)
        else:
            # Nothing to test: report zero passes against the nominal total.
            passed, total = 0, (len(task.fail_to_pass) or 1)
        solved = applied and total > 0 and passed == total
        elapsed = round(time.time() - began, 2)
        return TaskResult(
            instance_id=task.instance_id,
            arm=arm,
            resolved=solved,
            duration_s=elapsed,
            patch_applied=applied,
            fail_to_pass_passed=passed,
            fail_to_pass_total=total,
        )
    except Exception as exc:  # noqa: BLE001
        # Harness-level failure: keep the sweep going and record why.
        elapsed = round(time.time() - began, 2)
        nominal_total = len(task.fail_to_pass) or 1
        return TaskResult(
            instance_id=task.instance_id,
            arm=arm,
            resolved=False,
            duration_s=elapsed,
            patch_applied=False,
            fail_to_pass_passed=0,
            fail_to_pass_total=nominal_total,
            error=str(exc),
        )
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _summarize(arm: Arm, results: list[TaskResult]) -> ArmSummary:
    """Aggregate the results belonging to ``arm`` into an :class:`ArmSummary`.

    Records from other arms are ignored. The rate is rounded to 4 decimals
    and the mean duration to 2; both are ``0.0`` when the arm has no records.
    """
    durations: list[float] = []
    solved = 0
    for record in results:
        if record.arm != arm:
            continue
        durations.append(record.duration_s)
        if record.resolved:
            solved += 1

    attempted = len(durations)
    if attempted:
        rate = solved / attempted
        mean_dur = statistics.mean(durations)
    else:
        rate = 0.0
        mean_dur = 0.0

    return ArmSummary(
        arm=arm,
        total=attempted,
        resolved=solved,
        resolved_rate=round(rate, 4),
        mean_duration_s=round(mean_dur, 2),
    )
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def run_benchmark(
    tasks: Iterable[Task],
    runner: AgentRunner,
    arms: tuple[Arm, ...] = ("control", "sin"),
    workspace: Optional[Path] = None,
) -> BenchReport:
    """Evaluate every task in every arm and return the aggregated report.

    Each (task, arm) pair is evaluated in its own directory,
    ``<root>/<arm>/<instance_id>``, so one arm's edits cannot leak into the
    other's. Arms run in the order given; within an arm, tasks run in input
    order.

    Args:
        tasks: Tasks to run. The iterable is materialised into a list once.
        runner: Agent backend (e.g. :class:`DryRunRunner`,
            :class:`CommandRunner`).
        arms: Arms to execute. The default pair yields the A/B delta; with
            only one of the two present the delta is reported as ``0.0``.
        workspace: Directory to keep clones in. When omitted, a temporary
            directory is used and removed when the function returns.

    Returns:
        :class:`BenchReport` holding per-arm summaries, the resolved-rate
        delta and all per-task records.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S"
    started = time.strftime(timestamp_format)
    tasks = list(tasks)

    with tempfile.TemporaryDirectory(prefix="sin-bench-") as tmp:
        root = Path(workspace) if workspace else Path(tmp)
        root.mkdir(parents=True, exist_ok=True)
        # A separate sub-directory per arm keeps the clones isolated, so the
        # second arm never starts from the first arm's patched tree.
        results: list[TaskResult] = [
            _eval_one(task, arm, runner, root / arm)
            for arm in arms
            for task in tasks
        ]

    summaries = {}
    for arm in arms:
        summaries[arm] = _summarize(arm, results)

    if "sin" in summaries and "control" in summaries:
        lift = summaries["sin"].resolved_rate - summaries["control"].resolved_rate
        delta = round(lift, 4)
    else:
        delta = 0.0

    return BenchReport(
        arms=summaries,
        delta_resolved_rate=delta,
        per_task=results,
        started_at=started,
        finished_at=time.strftime(timestamp_format),
    )
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
# --------------------------------------------------------------------------- #
|
|
408
|
+
# ── Task Loading: JSONL + SWE-bench Lite via datasets ────────────────────── #
|
|
409
|
+
# --------------------------------------------------------------------------- #
|
|
410
|
+
def load_tasks_jsonl(path: Path, limit: Optional[int] = None) -> list[Task]:
    """Load tasks from a JSONL file (SWE-bench compatible field names).

    Blank lines are skipped. ``FAIL_TO_PASS`` / ``PASS_TO_PASS`` are read in
    upper case first, then lower case. As in :func:`load_swebench_lite`, a
    value that is a JSON-encoded string is decoded into a list. Without this,
    a string would later be iterated character by character.

    Args:
        path: UTF-8 JSONL file, one task object per line.
        limit: Stop after this many tasks; ``None`` (or ``0``) means no limit.

    Returns:
        The parsed tasks in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``instance_id``, ``repo`` or ``base_commit`` is missing.
    """
    tasks: list[Task] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line:
            continue
        d = json.loads(line)
        tasks.append(
            Task(
                instance_id=d["instance_id"],
                repo=d["repo"],
                base_commit=d["base_commit"],
                problem_statement=d.get("problem_statement", ""),
                fail_to_pass=_decode_test_ids(
                    d.get("FAIL_TO_PASS", d.get("fail_to_pass", []))
                ),
                pass_to_pass=_decode_test_ids(
                    d.get("PASS_TO_PASS", d.get("pass_to_pass", []))
                ),
                setup_cmds=d.get("setup_cmds", []),
                test_cmd=d.get("test_cmd", "pytest -q"),
            )
        )
        if limit and len(tasks) >= limit:
            break
    return tasks


def _decode_test_ids(raw):
    """Return ``raw`` as a list of test ids, decoding a JSON string if needed."""
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except json.JSONDecodeError:
            # Not JSON: treat the text as a single literal test id.
            return [raw]
        if isinstance(raw, str):
            return [raw]
    return raw
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def load_swebench_lite(limit: Optional[int] = 20) -> list[Task]:
    """Fetch SWE-bench Lite (``test`` split) through the ``datasets`` package.

    The default ``limit=20`` is meant as a smoke-test size: per the original
    author's estimate, 20 tasks take roughly 10h on a single agent (clone +
    setup + 30-min LLM rollout each). Pass ``None`` for the full set.

    Args:
        limit: Maximum number of tasks to return; ``None`` (or ``0``) means
            no limit.

    Returns:
        Tasks in dataset order, with FAIL_TO_PASS / PASS_TO_PASS decoded from
        JSON when the dataset stores them as strings.

    Raises:
        RuntimeError: If ``datasets`` is not installed.
    """
    try:
        from datasets import load_dataset  # type: ignore
    except ImportError as exc:
        raise RuntimeError(
            "SWE-bench Lite requires the 'datasets' package. "
            "Install with: pip install 'sin-code-bundle[bench]', "
            "or pass --tasks <file.jsonl>."
        ) from exc

    def _as_ids(cell):
        # The dataset may store the id lists as JSON text.
        if isinstance(cell, str):
            return json.loads(cell)
        return cell

    collected: list[Task] = []
    for row in load_dataset("princeton-nlp/SWE-bench_Lite", split="test"):
        collected.append(
            Task(
                instance_id=row["instance_id"],
                repo=row["repo"],
                base_commit=row["base_commit"],
                problem_statement=row["problem_statement"],
                fail_to_pass=_as_ids(row["FAIL_TO_PASS"]),
                pass_to_pass=_as_ids(row["PASS_TO_PASS"]),
            )
        )
        if limit and len(collected) >= limit:
            break
    return collected
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
# --------------------------------------------------------------------------- #
|
|
475
|
+
# ── Pretty Printing: human-readable terminal report ──────────────────────── #
|
|
476
|
+
# --------------------------------------------------------------------------- #
|
|
477
|
+
def format_report(report: BenchReport) -> str:
    """Turn a :class:`BenchReport` into the text block shown by ``sin bench``.

    The block starts with an empty line, then a title, one row per arm (in
    the order of ``report.arms``) and the delta in percentage points, framed
    by 40-character rules.

    Returns:
        The lines joined with ``\\n``; there is no trailing newline.
    """
    heavy_rule = "=" * 40
    light_rule = "-" * 40

    arm_rows = [
        f" {arm:<8} {s.resolved}/{s.total} resolved "
        f"({s.resolved_rate * 100:5.1f}%) mean {s.mean_duration_s}s"
        for arm, s in report.arms.items()
    ]

    # Negative numbers carry their own "-"; only non-negative ones get a "+".
    if report.delta_resolved_rate >= 0:
        sign = "+"
    else:
        sign = ""
    delta_row = (
        f" SIN delta: {sign}{report.delta_resolved_rate * 100:.1f} pp (percentage points)"
    )

    pieces = ["", "SIN-Code Bench — A/B resolved-rate", heavy_rule]
    pieces.extend(arm_rows)
    pieces.extend([light_rule, delta_row, heavy_rule])
    return "\n".join(pieces)
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Keep MCP tool outputs compact so they don't blow the agent's context window.
|
|
2
|
+
|
|
3
|
+
Every tool result is passed through `trim()` before returning. Lists are capped,
|
|
4
|
+
long strings truncated, and an explicit `_truncated` flag is added so the agent
|
|
5
|
+
knows more data exists.
|
|
6
|
+
|
|
7
|
+
Docs: budget.doc.md
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
# Default ceilings sized to fit comfortably in a 200K-token agent context even
# when many tools are called per turn — strings dominate token cost so we cap
# them harder than list arity. Override per-call via `trim(value, max_list=…)`.
MAX_LIST = 25  # max items kept per list; rest collapsed into _truncated sentinel
MAX_STR = 2000  # max characters per string; rest replaced with " ...[truncated]"


def trim(value: Any, max_list: int = MAX_LIST, max_str: int = MAX_STR) -> Any:
    """Return a size-capped copy of a JSON-shaped tool result.

    Dict values and list items are processed recursively with the same
    limits. A string longer than ``max_str`` keeps its first ``max_str``
    characters followed by ``" ...[truncated]"``. A list longer than
    ``max_list`` keeps its first ``max_list`` items followed by one marker
    dict ``{"_truncated": True, "_omitted": N}``. Any other value is returned
    as is.

    Args:
        value: The value to cap (str, list, dict or scalar).
        max_list: Number of list items to keep before truncating.
        max_str: Number of characters to keep before truncating.

    Returns:
        A value of the same shape. Lists and dicts are rebuilt, so the input
        container is not modified.
    """
    if isinstance(value, dict):
        return {key: trim(item, max_list, max_str) for key, item in value.items()}

    if isinstance(value, list):
        kept = [trim(item, max_list, max_str) for item in value[:max_list]]
        if len(value) > max_list:
            # The marker is a dict (not text) so consumers can detect
            # truncation programmatically.
            omitted = len(value) - max_list
            kept.append({"_truncated": True, "_omitted": omitted})
        return kept

    if isinstance(value, str):
        if len(value) > max_str:
            return value[:max_str] + " ...[truncated]"
        return value

    return value
|
sin_code_bundle/cache.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Incremental, content-hashed cache for SCKG / impact results.
|
|
2
|
+
|
|
3
|
+
Avoids rescanning the whole repo on every `impact()` call. Keyed by a hash of
|
|
4
|
+
the file set + their mtimes/sizes; invalidated automatically when files change.
|
|
5
|
+
Stored under .sin/cache/ as JSON.
|
|
6
|
+
|
|
7
|
+
Docs: cache.doc.md
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
import json
|
|
14
|
+
import time
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, Optional
|
|
17
|
+
|
|
18
|
+
# Directory names that never carry first-party source — pruned from the
# fingerprint walk so e.g. installing a new dep into .venv doesn't blow the
# cache, and so the SCKG cache itself (under .sin/) can't recursively
# invalidate itself.
_IGNORE = {".git", "node_modules", ".venv", "__pycache__", ".sin", "dist", "build"}


def _repo_fingerprint(root: Path, exts: tuple[str, ...]) -> str:
    """Hash the (path, mtime_ns, size) of every tracked source file under ``root``.

    A file is tracked when its lower-cased suffix is in ``exts`` and no path
    component *below* ``root`` is listed in ``_IGNORE``. Only metadata is
    hashed, never file contents.

    The ignore test deliberately looks at the path relative to ``root``. If
    it looked at the absolute path, a checkout living under e.g.
    ``/home/ci/build/`` would have every file ignored, and the fingerprint
    would never change.

    Args:
        root: Directory to walk recursively.
        exts: Accepted file suffixes, including the leading dot.

    Returns:
        Hex SHA-256 digest over the tracked files, visited in sorted order.
    """
    h = hashlib.sha256()
    for path in sorted(root.rglob("*")):
        if not path.is_file() or path.suffix.lower() not in exts:
            continue
        if any(part in _IGNORE for part in path.relative_to(root).parts):
            continue
        try:
            st = path.stat()
        except OSError:
            # File vanished mid-walk (race with a checkout/rebuild) — skip
            # rather than abort; fingerprint will still be stable next call.
            continue
        h.update(str(path).encode())
        h.update(str(st.st_mtime_ns).encode())
        h.update(str(st.st_size).encode())
    return h.hexdigest()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ── GraphCache: On-disk Cache Layer ────────────────────────────────────────
|
|
56
|
+
class GraphCache:
    """On-disk cache for expensive SCKG / impact results, keyed by repo state.

    Each cached entry is stamped with the current ``_repo_fingerprint`` of the
    source tree. On :meth:`get`, if the stored fingerprint no longer matches
    the live tree, the entry is treated as stale and ``None`` is returned —
    so the cache self-invalidates whenever any tracked file changes.

    Storage layout (under ``<root>/.sin/cache/``)::

        <sha1(key)[:16]>.json
            { "fingerprint": "<sha256>",
              "stored_at": <epoch>,
              "value": <arbitrary JSON> }
    """

    def __init__(
        self,
        root: Path = Path("."),
        exts: tuple[str, ...] = (".py", ".ts", ".tsx", ".js", ".go", ".rs"),
    ) -> None:
        """Resolve ``root`` and create ``<root>/.sin/cache`` if it is missing.

        Args:
            root: Repository root whose source files define cache validity.
            exts: File suffixes that take part in the fingerprint.
        """
        self.root = Path(root).resolve()
        self.exts = exts
        self.dir = self.root / ".sin" / "cache"
        self.dir.mkdir(parents=True, exist_ok=True)

    def _file(self, key: str) -> Path:
        """Map ``key`` to its cache file path."""
        # sha1 (not sha256) is fine here: this is a filesystem key, not a
        # security boundary. 16 hex chars = 64 bits collision space.
        safe = hashlib.sha1(key.encode()).hexdigest()[:16]
        return self.dir / f"{safe}.json"

    def get(self, key: str) -> Optional[Any]:
        """Return the cached value for ``key`` if and only if the repo is unchanged.

        Returns ``None`` when there is no entry, when the file is unreadable
        or not a valid entry (corrupt JSON, wrong top-level type), or when
        the stored fingerprint disagrees with the live repo fingerprint.
        """
        fp = self._file(key)
        try:
            data = json.loads(fp.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # OSError: missing/unreadable file. ValueError: bad JSON or bad
            # UTF-8. A cache must degrade to a miss, never crash the caller.
            return None
        if not isinstance(data, dict):
            return None
        if data.get("fingerprint") != _repo_fingerprint(self.root, self.exts):
            return None  # stale — repo changed
        return data.get("value")

    def set(self, key: str, value: Any) -> None:
        """Persist ``value`` under ``key`` together with the current repo fingerprint.

        ``value`` must be JSON-serialisable. The entry is serialised first,
        written to a sibling temp file and then renamed over the target. A
        failed serialisation or an interrupted write therefore never leaves a
        truncated entry behind.

        Raises:
            TypeError: If ``value`` cannot be serialised to JSON.
        """
        fp = self._file(key)
        payload = json.dumps(
            {
                "fingerprint": _repo_fingerprint(self.root, self.exts),
                "stored_at": time.time(),
                "value": value,
            },
            indent=2,
        )
        # The ".tmp" suffix keeps the temp file out of clear()'s "*.json" glob.
        tmp = fp.with_name(fp.name + ".tmp")
        tmp.write_text(payload, encoding="utf-8")
        tmp.replace(fp)

    def clear(self) -> int:
        """Drop every cached entry. Returns the number of files removed."""
        n = 0
        for f in self.dir.glob("*.json"):
            try:
                f.unlink()
            except FileNotFoundError:
                # Removed by someone else between glob and unlink.
                continue
            n += 1
        return n
|