sin-code-bundle 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. sin_code_bundle/__init__.py +6 -0
  2. sin_code_bundle/agents_md.py +245 -0
  3. sin_code_bundle/ast_edit.py +323 -0
  4. sin_code_bundle/bench.py +506 -0
  5. sin_code_bundle/budget.py +51 -0
  6. sin_code_bundle/cache.py +131 -0
  7. sin_code_bundle/checkpoint.py +230 -0
  8. sin_code_bundle/cli.py +1943 -0
  9. sin_code_bundle/codocs.py +328 -0
  10. sin_code_bundle/dap_bridge.py +135 -0
  11. sin_code_bundle/data/codocs/SKILL.md +280 -0
  12. sin_code_bundle/gitnexus.py +368 -0
  13. sin_code_bundle/hashline.py +216 -0
  14. sin_code_bundle/hooks.py +249 -0
  15. sin_code_bundle/immortal_commit.py +288 -0
  16. sin_code_bundle/interceptor.py +119 -0
  17. sin_code_bundle/lsp_backend.py +303 -0
  18. sin_code_bundle/lsp_bootstrap.py +85 -0
  19. sin_code_bundle/markitdown.py +254 -0
  20. sin_code_bundle/mcp_config.py +455 -0
  21. sin_code_bundle/mcp_server.py +963 -0
  22. sin_code_bundle/memory.py +208 -0
  23. sin_code_bundle/merge_safety.py +313 -0
  24. sin_code_bundle/orchestration_worktrees.py +102 -0
  25. sin_code_bundle/policy.py +224 -0
  26. sin_code_bundle/preflight.py +152 -0
  27. sin_code_bundle/programming_workflow.py +541 -0
  28. sin_code_bundle/rtk.py +154 -0
  29. sin_code_bundle/safety.py +52 -0
  30. sin_code_bundle/session_warmup.py +247 -0
  31. sin_code_bundle/skills.py +188 -0
  32. sin_code_bundle/symbol_resolve.py +166 -0
  33. sin_code_bundle/tools/__init__.py +4 -0
  34. sin_code_bundle/tools/pypi_setup.py +289 -0
  35. sin_code_bundle/vfs.py +264 -0
  36. sin_code_bundle-0.9.2.dist-info/METADATA +470 -0
  37. sin_code_bundle-0.9.2.dist-info/RECORD +41 -0
  38. sin_code_bundle-0.9.2.dist-info/WHEEL +5 -0
  39. sin_code_bundle-0.9.2.dist-info/entry_points.txt +4 -0
  40. sin_code_bundle-0.9.2.dist-info/licenses/LICENSE +21 -0
  41. sin_code_bundle-0.9.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,506 @@
1
+ # SPDX-License-Identifier: MIT
2
+ """SWE-bench-style A/B evaluation harness for the SIN-Code Bundle.
3
+
4
+ Goal: produce an objective, reproducible number that answers
5
+ "do the SIN tools (impact / semantic_diff / verify / oracle) actually improve
6
+ an agent's pass-rate?"
7
+
8
+ Design
9
+ ------
10
+ - Loads a task set (SWE-bench Lite subset by default, or a local JSONL file).
11
+ - Runs each task twice through a pluggable agent runner:
12
+ * arm "control" -> SIN tools DISABLED (SIN_ENFORCE=0)
13
+ * arm "sin" -> SIN tools ENABLED (SIN_ENFORCE=1)
14
+ - Applies the produced patch in an isolated per-arm clone of the repository and
15
+ runs the task's FAIL_TO_PASS tests (PASS_TO_PASS ids are loaded but not run).
16
+ - Reports resolved-rate per arm, the delta, and a per-task breakdown.
17
+
18
+ The harness is intentionally runner-agnostic: you wire in opencode / codex /
19
+ hermes via a small AgentRunner. A DryRunRunner is included so `sin bench`
20
+ works end-to-end without any LLM credits.
21
+
22
+ Docs: bench.doc.md
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ import statistics
29
+ import subprocess
30
+ import tempfile
31
+ import time
32
+ from dataclasses import asdict, dataclass, field
33
+ from pathlib import Path
34
+ from typing import Callable, Iterable, Literal, Optional, Protocol
35
+
36
+ Arm = Literal["control", "sin"]
37
+
38
+
39
+ # --------------------------------------------------------------------------- #
40
+ # ── Task + Result Models: SWE-bench compatible dataclasses ────────────────── #
41
+ # --------------------------------------------------------------------------- #
42
@dataclass(frozen=True)
class Task:
    """A single benchmark problem (the SWE-bench fields this harness consumes).

    Attributes:
        instance_id: Unique identifier of the task.
        repo: GitHub ``owner/name`` slug of the repository under test.
        base_commit: Commit the repository is checked out at before the run.
        problem_statement: Issue text describing what has to be fixed.
        fail_to_pass: Ids of tests expected to go from failing to passing.
        pass_to_pass: Ids of tests expected to keep passing.
        setup_cmds: Shell commands that prepare the checkout (e.g. installs).
        test_cmd: Base test command; defaults to ``pytest -q``.
    """

    # Required identity / problem fields.
    instance_id: str
    repo: str
    base_commit: str
    problem_statement: str
    # Optional fields — each instance gets its own fresh list.
    fail_to_pass: list[str] = field(default_factory=list)
    pass_to_pass: list[str] = field(default_factory=list)
    setup_cmds: list[str] = field(default_factory=list)
    test_cmd: str = "pytest -q"
54
+
55
+
56
@dataclass
class TaskResult:
    """Outcome of one task evaluated in one arm.

    Attributes:
        instance_id: Id of the :class:`Task` this record belongs to.
        arm: ``"control"`` (SIN tools off) or ``"sin"`` (SIN tools on).
        resolved: Headline bit — ``True`` only when the patch applied and
            every FAIL_TO_PASS test passed afterwards.
        duration_s: Wall-clock seconds spent on clone + agent + apply + test.
        patch_applied: Whether the agent's diff was accepted by ``git apply``.
        fail_to_pass_passed: How many FAIL_TO_PASS tests passed.
        fail_to_pass_total: Number of FAIL_TO_PASS tests, or 1 when the task
            names none and a single ``test_cmd`` run was scored instead.
        error: Text of the exception when the harness itself failed (clone
            failure, timeout, ...); ``None`` otherwise. Distinct from the
            agent merely producing a bad patch.
    """

    instance_id: str
    arm: Arm
    resolved: bool
    duration_s: float
    patch_applied: bool
    fail_to_pass_passed: int
    fail_to_pass_total: int
    error: Optional[str] = None
82
+
83
+
84
@dataclass
class ArmSummary:
    """Roll-up of every :class:`TaskResult` belonging to a single arm.

    Attributes:
        arm: ``"control"`` or ``"sin"``.
        total: How many tasks were attempted in this arm.
        resolved: How many of those tasks ended with ``resolved=True``.
        resolved_rate: ``resolved / total``; ``0.0`` when ``total`` is zero.
        mean_duration_s: Arithmetic mean of the per-task durations.
    """

    arm: Arm
    total: int
    resolved: int
    resolved_rate: float
    mean_duration_s: float
101
+
102
+
103
@dataclass
class BenchReport:
    """Complete result of a benchmark run: arm summaries plus raw task records.

    Attributes:
        arms: Mapping of arm name to its :class:`ArmSummary`.
        delta_resolved_rate: ``sin.resolved_rate - control.resolved_rate``,
            expressed as a fraction (multiply by 100 for percentage points).
            A positive value means the SIN arm resolved more tasks.
        per_task: Every :class:`TaskResult`, both arms, in execution order.
        started_at: ISO-8601 start timestamp (local time, no timezone).
        finished_at: ISO-8601 completion timestamp.
    """

    arms: dict[str, ArmSummary]
    delta_resolved_rate: float
    per_task: list[TaskResult]
    started_at: str
    finished_at: str

    def to_json(self) -> str:
        """Return the report as an indented JSON document.

        The nested dataclass instances are flattened with
        :func:`dataclasses.asdict`, so the result contains only plain JSON
        types and can be written to disk or diffed between runs.
        """
        payload = {
            "arms": {name: asdict(summary) for name, summary in self.arms.items()},
            "delta_resolved_rate": self.delta_resolved_rate,
            "per_task": [asdict(record) for record in self.per_task],
            "started_at": self.started_at,
            "finished_at": self.finished_at,
        }
        return json.dumps(payload, indent=2)
141
+
142
+
143
+ # --------------------------------------------------------------------------- #
144
+ # ── Agent Runner Protocol: pluggable backends (opencode / codex / dry-run) ── #
145
+ # --------------------------------------------------------------------------- #
146
class AgentRunner(Protocol):
    """Structural interface for anything that can attempt a :class:`Task`.

    A runner works inside ``workdir`` and hands back a unified diff of its
    attempt. The ``sin_enabled`` flag says whether the SIN MCP tools should be
    exposed to the underlying agent for this attempt. An empty string is a
    valid return value and means the agent changed nothing.
    """

    def run(self, task: Task, workdir: Path, sin_enabled: bool) -> str:
        """Attempt ``task`` in ``workdir`` and return the unified diff produced.

        This is the protocol stub; the contract is described on the class.
        Implementations are expected to leave their edits in ``workdir``
        (normally as uncommitted changes) and return those edits as diff text.
        """
        ...
162
+
163
+
164
class DryRunRunner:
    """No-op runner used to smoke-test the harness without spending LLM credits.

    It never produces a patch, so every task is reported as unresolved, while
    the surrounding clone / apply / test plumbing still gets exercised.
    """

    def run(self, task: Task, workdir: Path, sin_enabled: bool) -> str:  # noqa: ARG002
        """Return the empty diff, whatever the arguments are.

        ``task``, ``workdir`` and ``sin_enabled`` are accepted only to satisfy
        the runner interface; none of them is consulted, so both arms end up
        with ``resolved=False`` for every task.
        """
        empty_diff = ""
        return empty_diff
179
+
180
+
181
class CommandRunner:
    """Runs an external agent CLI and captures the diff it leaves in the repo.

    Example wiring for opencode:
        CommandRunner(
            build_cmd=lambda task, sin: [
                "opencode", "run",
                "-m", task.problem_statement,
            ],
        )
    """

    def __init__(
        self,
        build_cmd: Callable[[Task, bool], list[str]],
        timeout_s: int = 1800,
        env_for: Optional[Callable[[Task, bool], dict[str, str]]] = None,
    ) -> None:
        """Configure the runner.

        Args:
            build_cmd: Called with ``(task, sin_enabled)``; returns the argv
                list of the agent process to start.
            timeout_s: Upper bound, in seconds, on a single agent invocation.
            env_for: Optional callback with the same arguments returning extra
                environment variables to layer over the current environment.
        """
        self._build_cmd = build_cmd
        # 1800s = 30 min — generous enough for slow LLM rollouts but caps
        # runaway agents so a single bad task can't stall the whole sweep.
        self._timeout_s = timeout_s
        self._env_for = env_for

    def run(self, task: Task, workdir: Path, sin_enabled: bool) -> str:
        """Invoke the external agent, then return whatever ``git diff`` shows.

        The agent is expected to mutate files inside ``workdir`` directly;
        its stdout is captured but never parsed. ``SIN_ENFORCE`` is exported
        into the agent's env so MCP servers can gate themselves on it (1 = SIN
        tools available, 0 = control arm, must not be used). It is set last,
        so ``env_for`` cannot override it.

        Returns:
            Unified-diff text of the uncommitted changes ``git diff`` reports
            in ``workdir`` after the agent exits; non-zero agent exit codes
            are deliberately ignored — a broken agent is a "failed task", not
            a harness error. Empty string when the agent hits the timeout.
        """
        import os

        cmd = self._build_cmd(task, sin_enabled)
        env = {**os.environ}
        if self._env_for:
            env.update(self._env_for(task, sin_enabled))
        env["SIN_ENFORCE"] = "1" if sin_enabled else "0"

        try:
            subprocess.run(
                cmd,
                cwd=workdir,
                env=env,
                timeout=self._timeout_s,
                check=False,
                capture_output=True,
                text=True,
            )
        except subprocess.TimeoutExpired:
            # subprocess.run raises (after killing the child) when the timeout
            # elapses. Treat a runaway agent like any other failed attempt
            # instead of letting the exception surface as a harness error.
            return ""

        # NOTE(review): plain `git diff` only reports changes to tracked files;
        # files the agent newly created are not part of the returned diff.
        diff = subprocess.run(
            ["git", "diff"],
            cwd=workdir,
            check=False,
            capture_output=True,
            text=True,
        )
        return diff.stdout
244
+
245
+
246
+ # --------------------------------------------------------------------------- #
247
+ # ── Git / Test Plumbing: worktree prep, patch apply, test execution ──────── #
248
+ # --------------------------------------------------------------------------- #
249
+ def _sh(cmd: list[str], cwd: Path, timeout: int = 600) -> subprocess.CompletedProcess:
250
+ # 600s = 10 min default — fits clone/checkout/test-id runs; callers
251
+ # override (e.g. clone uses 900s, setup_cmds use 1800s).
252
+ return subprocess.run(
253
+ cmd, cwd=cwd, check=False, capture_output=True, text=True, timeout=timeout
254
+ )
255
+
256
+
257
def _prepare_worktree(task: Task, root: Path) -> Path:
    """Clone ``task.repo`` under ``root``, check out the base commit, run setup.

    The clone lives in ``root / <instance_id with "/" replaced by "__">``.

    Args:
        task: Task whose repository, base commit and setup commands are used.
        root: Directory that receives the per-task clone (created if missing).

    Returns:
        Path of the prepared clone.

    Raises:
        RuntimeError: If the clone, the checkout or any setup command exits
            non-zero. Continuing would score tests against a missing or
            half-prepared tree, so the failure is surfaced to the caller
            instead of being silently ignored.
        subprocess.TimeoutExpired: If a step exceeds its time limit.
    """
    work = root / task.instance_id.replace("/", "__")
    work.mkdir(parents=True, exist_ok=True)
    url = f"https://github.com/{task.repo}.git"

    steps: list[tuple[str, list[str], int]] = [
        # 900s clone timeout — large monorepos (django, sympy) routinely
        # need >5 min on a cold network; tighter would flake the harness.
        ("git clone", ["git", "clone", "--quiet", url, "."], 900),
        ("git checkout", ["git", "checkout", "--quiet", task.base_commit], 600),
    ]
    # 1800s per setup cmd — pip installs of scientific stacks (scipy,
    # pandas) can be slow when wheels are missing for the platform.
    steps.extend(
        (f"setup command {cmd!r}", ["bash", "-lc", cmd], 1800) for cmd in task.setup_cmds
    )

    for label, argv, limit in steps:
        res = _sh(argv, cwd=work, timeout=limit)
        if res.returncode != 0:
            # Keep only the tail of stderr so the stored error stays readable.
            detail = (res.stderr or "").strip()[-500:]
            raise RuntimeError(
                f"{label} failed for {task.instance_id} "
                f"(exit {res.returncode}): {detail}"
            )
    return work
270
+
271
+
272
def _apply_patch(diff: str, work: Path) -> bool:
    """Make sure ``diff`` is applied to the tree at ``work``.

    The diff is written to ``work/.sin_patch.diff`` and handed to
    ``git apply``. Runners are allowed to leave their edits in the working
    tree and return them as a diff; in that case a forward apply is rejected
    because the change is already there. To avoid reporting such runs as
    "patch did not apply", a rejected patch is re-checked in reverse: if it
    reverse-applies cleanly, the tree already contains it.

    Args:
        diff: Unified diff text produced by the runner.
        work: Repository checkout the diff belongs to.

    Returns:
        ``True`` if the patch applied now or was already present in the tree;
        ``False`` for an empty/whitespace-only diff or a patch git rejects.
    """
    if not diff.strip():
        return False
    patch = work / ".sin_patch.diff"
    patch.write_text(diff, encoding="utf-8")

    forward = _sh(["git", "apply", "--whitespace=nowarn", str(patch)], cwd=work)
    if forward.returncode == 0:
        return True

    # --check only verifies applicability and leaves the tree untouched.
    already_applied = _sh(
        ["git", "apply", "--reverse", "--check", "--whitespace=nowarn", str(patch)],
        cwd=work,
    )
    return already_applied.returncode == 0
279
+
280
+
281
def _run_named_tests(work: Path, task: Task) -> tuple[int, int]:
    """Run the task's FAIL_TO_PASS tests in ``work`` and count the passes.

    Args:
        work: Prepared (and patched) repository checkout.
        task: Task providing ``test_cmd`` and the ``fail_to_pass`` ids.

    Returns:
        ``(passed, total)``. When the task names no tests, ``test_cmd`` is run
        once and the result is ``(1, 1)`` on exit code 0, ``(0, 1)`` otherwise.

    Raises:
        subprocess.TimeoutExpired: If a test run exceeds its time limit.
    """
    import shlex

    if not task.fail_to_pass:
        # Fallback path: SWE-bench tasks usually name specific tests, but some
        # in-house tasks just ship a `test_cmd` and rely on its overall exit
        # code (0 = solved, non-zero = not solved).
        res = _sh(["bash", "-lc", task.test_cmd], cwd=work, timeout=1800)
        return (1, 1) if res.returncode == 0 else (0, 1)

    passed = 0
    for test_id in task.fail_to_pass:
        # Test ids routinely contain shell metacharacters (parametrised ids
        # like `test_x[a b]`, parentheses, `*`), so quote the id to hand it to
        # the test command as exactly one argument. `test_cmd` itself stays
        # unquoted on purpose: it is a shell command line.
        command = f"{task.test_cmd} {shlex.quote(test_id)}"
        # 900s per single test — pytest selectors on huge repos (django) need
        # collection time even before the test itself runs.
        res = _sh(["bash", "-lc", command], cwd=work, timeout=900)
        if res.returncode == 0:
            passed += 1
    return passed, len(task.fail_to_pass)
301
+
302
+
303
+ # --------------------------------------------------------------------------- #
304
+ # ── Core Eval Loop: drive runner + scoring per task per arm ──────────────── #
305
+ # --------------------------------------------------------------------------- #
306
def _eval_one(task: Task, arm: Arm, runner: AgentRunner, root: Path) -> TaskResult:
    """Evaluate one task in one arm and return its :class:`TaskResult`.

    Pipeline: prepare the clone under ``root``, let ``runner`` produce a diff
    (with SIN tools enabled only for the ``"sin"`` arm), apply the diff, and —
    only if it applied — run the task's tests. Any exception raised along the
    way is captured in the result's ``error`` field rather than propagated, so
    one broken task cannot abort the sweep.
    """
    t0 = time.time()
    try:
        work = _prepare_worktree(task, root)
        diff = runner.run(task, work, sin_enabled=(arm == "sin"))
        applied = _apply_patch(diff, work)
        if applied:
            passed, total = _run_named_tests(work, task)
        else:
            # Nothing to test: report zero passes out of the expected count.
            passed, total = 0, len(task.fail_to_pass) or 1
        return TaskResult(
            instance_id=task.instance_id,
            arm=arm,
            resolved=applied and passed == total and total > 0,
            duration_s=round(time.time() - t0, 2),
            patch_applied=applied,
            fail_to_pass_passed=passed,
            fail_to_pass_total=total,
        )
    except Exception as exc:  # noqa: BLE001
        # Harness-level failure (clone, timeout, ...): record it as unresolved.
        elapsed = round(time.time() - t0, 2)
        return TaskResult(
            instance_id=task.instance_id,
            arm=arm,
            resolved=False,
            duration_s=elapsed,
            patch_applied=False,
            fail_to_pass_passed=0,
            fail_to_pass_total=len(task.fail_to_pass) or 1,
            error=str(exc),
        )
336
+
337
+
338
def _summarize(arm: Arm, results: list[TaskResult]) -> ArmSummary:
    """Aggregate the results that belong to ``arm`` into an :class:`ArmSummary`.

    Records of other arms are ignored. With no matching records both the
    resolved rate and the mean duration are reported as ``0.0``. The rate is
    rounded to 4 decimals and the mean duration to 2.
    """
    mine = [record for record in results if record.arm == arm]
    attempted = len(mine)
    solved = len([record for record in mine if record.resolved])

    if attempted:
        rate = solved / attempted
        mean_dur = statistics.mean([record.duration_s for record in mine])
    else:
        rate = 0.0
        mean_dur = 0.0

    return ArmSummary(
        arm=arm,
        total=attempted,
        resolved=solved,
        resolved_rate=round(rate, 4),
        mean_duration_s=round(mean_dur, 2),
    )
351
+
352
+
353
def run_benchmark(
    tasks: Iterable[Task],
    runner: AgentRunner,
    arms: tuple[Arm, ...] = ("control", "sin"),
    workspace: Optional[Path] = None,
) -> BenchReport:
    """Evaluate every task in every arm and return the aggregated report.

    Each (task, arm) pair is evaluated in its own clone below
    ``<root>/<arm>/``, so one arm's patch state can never leak into another
    arm. Arms are processed one after the other, all tasks per arm.

    Args:
        tasks: Tasks to evaluate; the iterable is consumed once and
            materialised into a list.
        runner: The :class:`AgentRunner` that produces a diff per attempt
            (e.g. :class:`DryRunRunner`, :class:`CommandRunner`).
        arms: Arms to run. The default ``("control", "sin")`` yields the A/B
            delta; ``("sin",)`` runs a single arm.
        workspace: Directory to keep the clones in for later inspection. When
            omitted, a temporary directory is used and removed on return.

    Returns:
        A :class:`BenchReport` holding per-arm summaries, the resolved-rate
        delta (``0.0`` unless both ``"sin"`` and ``"control"`` ran) and every
        per-task record in execution order.
    """
    stamp_format = "%Y-%m-%dT%H:%M:%S"
    started = time.strftime(stamp_format)
    tasks = list(tasks)
    results: list[TaskResult] = []

    with tempfile.TemporaryDirectory(prefix="sin-bench-") as tmp:
        root = Path(workspace) if workspace else Path(tmp)
        root.mkdir(parents=True, exist_ok=True)
        # One sub-directory per arm keeps the clones strictly separated;
        # sharing a directory would hand the second arm the first arm's
        # leftover patch state.
        results.extend(
            _eval_one(task, arm, runner, root / arm)
            for arm in arms
            for task in tasks
        )

    summaries = {arm: _summarize(arm, results) for arm in arms}
    if "sin" in summaries and "control" in summaries:
        sin_rate = summaries["sin"].resolved_rate
        control_rate = summaries["control"].resolved_rate
        delta = round(sin_rate - control_rate, 4)
    else:
        delta = 0.0

    return BenchReport(
        arms=summaries,
        delta_resolved_rate=delta,
        per_task=results,
        started_at=started,
        finished_at=time.strftime(stamp_format),
    )
405
+
406
+
407
+ # --------------------------------------------------------------------------- #
408
+ # ── Task Loading: JSONL + SWE-bench Lite via datasets ────────────────────── #
409
+ # --------------------------------------------------------------------------- #
410
def load_tasks_jsonl(path: Path, limit: Optional[int] = None) -> list[Task]:
    """Load tasks from a JSONL file (SWE-bench compatible field names).

    Each non-blank line must be a JSON object with ``instance_id``, ``repo``
    and ``base_commit``; the remaining fields are optional. The test-id lists
    are read from ``FAIL_TO_PASS`` / ``PASS_TO_PASS``, falling back to the
    lower-case spellings. SWE-bench exports store these lists as JSON-encoded
    strings, so a string value is decoded into a list — otherwise the id
    string would later be iterated character by character.

    Args:
        path: JSONL file to read (UTF-8).
        limit: Stop after this many tasks; ``None`` (or 0) loads everything.

    Returns:
        The parsed tasks in file order.

    Raises:
        KeyError: If a record lacks a required field.
        json.JSONDecodeError: If a line is not valid JSON.
    """

    def _as_list(raw):
        # Accept both a real list and its JSON-encoded string form.
        return json.loads(raw) if isinstance(raw, str) else raw

    tasks: list[Task] = []
    # Iterate the file object rather than str.splitlines(): the latter also
    # splits on U+2028/U+2029 and similar characters, which may legally occur
    # unescaped inside a JSON string and would cut a record in half.
    with path.open(encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            d = json.loads(line)
            tasks.append(
                Task(
                    instance_id=d["instance_id"],
                    repo=d["repo"],
                    base_commit=d["base_commit"],
                    problem_statement=d.get("problem_statement", ""),
                    fail_to_pass=_as_list(d.get("FAIL_TO_PASS", d.get("fail_to_pass", []))),
                    pass_to_pass=_as_list(d.get("PASS_TO_PASS", d.get("pass_to_pass", []))),
                    setup_cmds=d.get("setup_cmds", []),
                    test_cmd=d.get("test_cmd", "pytest -q"),
                )
            )
            if limit and len(tasks) >= limit:
                break
    return tasks
433
+
434
+
435
def load_swebench_lite(limit: Optional[int] = 20) -> list[Task]:
    """Load SWE-bench Lite through the optional ``datasets`` package.

    The default ``limit=20`` is a smoke-test size — 20 tasks ≈ 10h on a single
    agent (clone + setup + 30-min LLM rollout per task), enough to see a
    resolved-rate delta without paying for the full 300-task run. Pass
    ``None`` for the complete benchmark.

    Raises:
        RuntimeError: If ``datasets`` is not installed.
    """
    try:
        from datasets import load_dataset  # type: ignore
    except ImportError as exc:
        raise RuntimeError(
            "SWE-bench Lite requires the 'datasets' package. "
            "Install with: pip install 'sin-code-bundle[bench]', "
            "or pass --tasks <file.jsonl>."
        ) from exc

    def _decode(cell):
        # The test-id columns may arrive as JSON-encoded strings or as lists.
        if isinstance(cell, str):
            return json.loads(cell)
        return cell

    rows = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
    loaded: list[Task] = []
    for row in rows:
        task = Task(
            instance_id=row["instance_id"],
            repo=row["repo"],
            base_commit=row["base_commit"],
            problem_statement=row["problem_statement"],
            fail_to_pass=_decode(row["FAIL_TO_PASS"]),
            pass_to_pass=_decode(row["PASS_TO_PASS"]),
        )
        loaded.append(task)
        if limit and len(loaded) >= limit:
            break
    return loaded
472
+
473
+
474
+ # --------------------------------------------------------------------------- #
475
+ # ── Pretty Printing: human-readable terminal report ──────────────────────── #
476
+ # --------------------------------------------------------------------------- #
477
def format_report(report: BenchReport) -> str:
    """Format ``report`` as the fixed-width summary printed by ``sin bench``.

    The block starts with an empty line, then a title, one row per arm
    (resolved count, resolved rate in percent, mean duration) and finally the
    SIN-vs-control delta in percentage points, framed by 40-character rules.
    Non-negative deltas are prefixed with ``+``.

    Returns:
        The lines joined with newlines, without a trailing newline.
    """
    heavy_rule = "=" * 40
    light_rule = "-" * 40

    arm_rows = [
        f" {arm:<8} {s.resolved}/{s.total} resolved "
        f"({s.resolved_rate * 100:5.1f}%) mean {s.mean_duration_s}s"
        for arm, s in report.arms.items()
    ]

    if report.delta_resolved_rate >= 0:
        sign = "+"
    else:
        sign = ""
    delta_row = (
        f" SIN delta: {sign}{report.delta_resolved_rate * 100:.1f} pp (percentage points)"
    )

    lines = ["", "SIN-Code Bench — A/B resolved-rate", heavy_rule]
    lines += arm_rows
    lines += [light_rule, delta_row, heavy_rule]
    return "\n".join(lines)
@@ -0,0 +1,51 @@
1
+ """Keep MCP tool outputs compact so they don't blow the agent's context window.
2
+
3
+ Every tool result is passed through `trim()` before returning. Lists are capped,
4
+ long strings truncated, and an explicit `_truncated` flag is added so the agent
5
+ knows more data exists.
6
+
7
+ Docs: budget.doc.md
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any
13
+
14
# Default limits, chosen so results stay small inside a 200K-token agent
# context even with many tool calls per turn. Strings dominate token cost, so
# they are capped more aggressively than list length. Both limits can be
# overridden per call, e.g. `trim(value, max_list=…)`.
MAX_LIST = 25  # items kept per list; the remainder becomes a _truncated sentinel
MAX_STR = 2000  # characters kept per string; the rest becomes " ...[truncated]"


def trim(value: Any, max_list: int = MAX_LIST, max_str: int = MAX_STR) -> Any:
    """Return a size-capped copy of a tool output.

    The value is walked recursively. Strings longer than ``max_str`` are cut
    and suffixed with ``" ...[truncated]"``. Lists longer than ``max_list``
    keep their first ``max_list`` items and gain one trailing marker dict
    ``{"_truncated": True, "_omitted": N}``. Dict values are trimmed while the
    keys are kept. Anything else is returned unchanged.

    Args:
        value: JSON-shaped value (str / list / dict / scalar), typically the
            result of an MCP tool call.
        max_list: Maximum number of list items to keep.
        max_str: Maximum number of characters to keep per string.

    Returns:
        A new structure capped to the limits; the input is not mutated.
    """
    if isinstance(value, dict):
        return {key: trim(item, max_list, max_str) for key, item in value.items()}

    if isinstance(value, list):
        kept = [trim(item, max_list, max_str) for item in value[:max_list]]
        omitted = len(value) - max_list
        if omitted > 0:
            # The marker is a dict rather than a string so that consumers can
            # detect truncation programmatically instead of scanning text.
            kept.append({"_truncated": True, "_omitted": omitted})
        return kept

    if isinstance(value, str) and len(value) > max_str:
        return value[:max_str] + " ...[truncated]"

    return value
@@ -0,0 +1,131 @@
1
+ """Incremental, content-hashed cache for SCKG / impact results.
2
+
3
+ Avoids rescanning the whole repo on every `impact()` call. Keyed by a hash of
4
+ the file set + their mtimes/sizes; invalidated automatically when files change.
5
+ Stored under .sin/cache/ as JSON.
6
+
7
+ Docs: cache.doc.md
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ import json
14
+ import time
15
+ from pathlib import Path
16
+ from typing import Any, Optional
17
+
18
+ _IGNORE = {".git", "node_modules", ".venv", "__pycache__", ".sin", "dist", "build"}
19
+ # Directory names that never carry first-party source — pruned from the
20
+ # fingerprint walk so e.g. installing a new dep into .venv doesn't blow the
21
+ # cache, and so the SCKG cache itself (under .sin/) can't recursively
22
+ # invalidate itself.
23
+
24
+
25
+ def _repo_fingerprint(root: Path, exts: tuple[str, ...]) -> str:
26
+ """Cheap content-aware hash of the repo's source tree.
27
+
28
+ Walks ``root`` recursively, filters to files whose suffix is in ``exts``
29
+ and whose path does not cross an ``_IGNORE`` directory, then hashes the
30
+ (path, mtime_ns, size) tuple of each. mtime+size is ~free compared to
31
+ reading file bytes and is sensitive enough for "did anything change?"
32
+ cache-invalidation — much cheaper than a full content hash.
33
+
34
+ Returns:
35
+ Hex SHA-256 digest. Stable across runs for unchanged trees.
36
+ """
37
+ h = hashlib.sha256()
38
+ for path in sorted(root.rglob("*")):
39
+ if not path.is_file() or path.suffix.lower() not in exts:
40
+ continue
41
+ if any(part in _IGNORE for part in path.parts):
42
+ continue
43
+ try:
44
+ st = path.stat()
45
+ except OSError:
46
+ # File vanished mid-walk (race with a checkout/rebuild) — skip
47
+ # rather than abort; fingerprint will still be stable next call.
48
+ continue
49
+ h.update(str(path).encode())
50
+ h.update(str(st.st_mtime_ns).encode())
51
+ h.update(str(st.st_size).encode())
52
+ return h.hexdigest()
53
+
54
+
55
+ # ── GraphCache: On-disk Cache Layer ────────────────────────────────────────
56
class GraphCache:
    """On-disk cache for expensive SCKG / impact results, keyed by repo state.

    Each cached entry is stamped with the current ``_repo_fingerprint`` of the
    source tree. On :meth:`get`, if the stored fingerprint no longer matches
    the live tree, the entry is treated as stale and ``None`` is returned —
    so the cache silently self-invalidates whenever any tracked file changes.

    Storage layout (under ``<root>/.sin/cache/``)::

        <sha1(key)[:16]>.json
            { "fingerprint": "<sha256>",
              "stored_at": <epoch>,
              "value": <arbitrary JSON> }

    The 16-char prefix is plenty to avoid collisions for the typical
    handful of cache keys per repo and keeps filenames human-skimmable.
    """

    def __init__(
        self,
        root: Path = Path("."),
        exts: tuple[str, ...] = (".py", ".ts", ".tsx", ".js", ".go", ".rs"),
    ) -> None:
        """Bind the cache to ``root`` and create ``<root>/.sin/cache`` if missing.

        Args:
            root: Repository root; resolved to an absolute path.
            exts: File suffixes whose changes invalidate cached entries.
        """
        self.root = Path(root).resolve()
        self.exts = exts
        self.dir = self.root / ".sin" / "cache"
        self.dir.mkdir(parents=True, exist_ok=True)

    def _file(self, key: str) -> Path:
        """Return the cache file path that belongs to ``key``."""
        # sha1 (not sha256) is fine here: this is a filesystem key, not a
        # security boundary. 16 hex chars = 64 bits collision space.
        safe = hashlib.sha1(key.encode()).hexdigest()[:16]
        return self.dir / f"{safe}.json"

    def get(self, key: str) -> Optional[Any]:
        """Return the cached value for ``key`` if and only if the repo is unchanged.

        Returns ``None`` when there is no entry, when the file is unreadable
        or corrupt (invalid JSON / not a JSON object), or when the stored
        fingerprint disagrees with the live repo fingerprint (i.e. some
        tracked source file changed since the value was stored).
        """
        fp = self._file(key)
        try:
            data = json.loads(fp.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # OSError: missing or unreadable file. ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError. A damaged entry is
            # a cache miss, never an error for the caller.
            return None
        if not isinstance(data, dict):
            return None
        if data.get("fingerprint") != _repo_fingerprint(self.root, self.exts):
            return None  # stale — repo changed
        return data.get("value")

    def set(self, key: str, value: Any) -> None:
        """Persist ``value`` under ``key`` together with the current repo fingerprint.

        ``value`` must be JSON-serialisable. The entry is written to a
        temporary file in the cache directory and then moved into place with
        :func:`os.replace`, so a concurrent or interrupted write never leaves
        a half-written entry behind for :meth:`get` to read.

        Raises:
            TypeError: If ``value`` cannot be serialised to JSON.
            OSError: If the entry cannot be written.
        """
        import os

        fp = self._file(key)
        payload = json.dumps(
            {
                "fingerprint": _repo_fingerprint(self.root, self.exts),
                "stored_at": time.time(),
                "value": value,
            },
            indent=2,
        )
        # The ".tmp" suffix keeps the scratch file out of clear()'s "*.json".
        tmp = fp.with_name(f"{fp.name}.{os.getpid()}.tmp")
        try:
            tmp.write_text(payload, encoding="utf-8")
            os.replace(tmp, fp)
        except BaseException:
            try:
                tmp.unlink()
            except OSError:
                pass
            raise

    def clear(self) -> int:
        """Drop every cached entry. Returns the number of files removed."""
        n = 0
        for f in self.dir.glob("*.json"):
            try:
                f.unlink()
            except FileNotFoundError:
                # Removed concurrently by someone else — nothing left to do.
                continue
            n += 1
        return n