touchstone-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. touchstone/__init__.py +3 -0
  2. touchstone/artifacts.py +51 -0
  3. touchstone/cli.py +212 -0
  4. touchstone/concurrency.py +34 -0
  5. touchstone/config.py +431 -0
  6. touchstone/environment.py +126 -0
  7. touchstone/executor.py +162 -0
  8. touchstone/export/__init__.py +5 -0
  9. touchstone/export/langfuse.py +102 -0
  10. touchstone/fixtures.py +53 -0
  11. touchstone/grader/__init__.py +6 -0
  12. touchstone/grader/base.py +55 -0
  13. touchstone/grader/command.py +40 -0
  14. touchstone/grader/efficiency.py +63 -0
  15. touchstone/grader/files.py +71 -0
  16. touchstone/grader/implemented.py +42 -0
  17. touchstone/grader/model_judge.py +149 -0
  18. touchstone/grader/pytest_runner.py +209 -0
  19. touchstone/grader/registry.py +40 -0
  20. touchstone/grader/swebench.py +101 -0
  21. touchstone/grader/trace.py +116 -0
  22. touchstone/harness/__init__.py +7 -0
  23. touchstone/harness/acp.py +492 -0
  24. touchstone/harness/base.py +90 -0
  25. touchstone/harness/claude_code.py +85 -0
  26. touchstone/harness/claude_stream.py +158 -0
  27. touchstone/harness/cli_agent.py +97 -0
  28. touchstone/harness/echo.py +30 -0
  29. touchstone/harness/registry.py +78 -0
  30. touchstone/interaction/__init__.py +11 -0
  31. touchstone/interaction/base.py +86 -0
  32. touchstone/interaction/policies.py +104 -0
  33. touchstone/interaction/registry.py +40 -0
  34. touchstone/interaction/responder.py +98 -0
  35. touchstone/metrics.py +125 -0
  36. touchstone/reachability.py +170 -0
  37. touchstone/report.py +433 -0
  38. touchstone/runner.py +333 -0
  39. touchstone/sandbox.py +121 -0
  40. touchstone/setup.py +58 -0
  41. touchstone/store.py +170 -0
  42. touchstone/trace.py +189 -0
  43. touchstone_eval-0.1.0.dist-info/METADATA +343 -0
  44. touchstone_eval-0.1.0.dist-info/RECORD +47 -0
  45. touchstone_eval-0.1.0.dist-info/WHEEL +4 -0
  46. touchstone_eval-0.1.0.dist-info/entry_points.txt +2 -0
  47. touchstone_eval-0.1.0.dist-info/licenses/LICENSE +21 -0
touchstone/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """touchstone: a personal model/harness eval benchmark."""
2
+
3
+ __version__ = "0.1.0"
touchstone/artifacts.py ADDED
@@ -0,0 +1,51 @@
1
+ """Resolve a case's AI artifacts into concrete paths.
2
+
3
+ Artifacts are *materialized* into a sandbox by the harness adapter (different agents
4
+ expect different layouts), so this module only resolves and validates paths. See
5
+ `harness/claude_code.py` for the Claude Code layout (`.claude/skills`, `.mcp.json`, ...).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from pathlib import Path
12
+
13
+ from .config import Case
14
+
15
+
16
@dataclass
class ArtifactSet:
    """Artifact paths belonging to a single case, already resolved to concrete locations.

    Instances are normally produced by `build_artifact_set`, which checks that each
    path exists before storing it here.
    """

    # One entry per declared skill / command / plugin; each list starts out empty.
    skills: list[Path] = field(default_factory=list)
    commands: list[Path] = field(default_factory=list)
    plugins: list[Path] = field(default_factory=list)
    # Path of the MCP config, or None when the case declares none.
    mcp: Path | None = None

    def is_empty(self) -> bool:
        """Return True when no skills, commands, plugins or MCP config are set."""
        has_any = any((self.skills, self.commands, self.plugins, self.mcp))
        return not has_any
27
+
28
+
29
+ def _resolve_all(case: Case, rels: list[str]) -> list[Path]:
30
+ out: list[Path] = []
31
+ for rel in rels:
32
+ p = case.resolve(rel)
33
+ if not p.exists():
34
+ raise FileNotFoundError(f"artifact not found: {rel} (resolved {p})")
35
+ out.append(p)
36
+ return out
37
+
38
+
39
def build_artifact_set(case: Case) -> ArtifactSet:
    """Turn the artifact declarations on `case` into an `ArtifactSet` of existing paths.

    The MCP config (when declared) is resolved and checked first, then skills,
    commands and plugins in that order. Raises FileNotFoundError for the first
    declared path that does not exist.
    """
    declared = case.artifacts

    mcp_path = None
    if declared.mcp:
        mcp_path = case.resolve(declared.mcp)
        if not mcp_path.exists():
            raise FileNotFoundError(
                f"mcp config not found: {declared.mcp} (resolved {mcp_path})"
            )

    skill_paths = _resolve_all(case, declared.skills)
    command_paths = _resolve_all(case, declared.commands)
    plugin_paths = _resolve_all(case, declared.plugins)
    return ArtifactSet(
        skills=skill_paths,
        commands=command_paths,
        plugins=plugin_paths,
        mcp=mcp_path,
    )
touchstone/cli.py ADDED
@@ -0,0 +1,212 @@
1
+ """Command-line interface: validate / run / report / list."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ from . import __version__
10
+ from .config import discover_cases, load_case
11
+ from .harness.registry import available_harnesses
12
+ from .reachability import UnavailableError, scan_cases
13
+ from .report import generate
14
+ from .runner import build_cells, run
15
+ from .store import RunStore
16
+
17
# Default value of --evals-dir: the directory searched for case directories.
DEFAULT_EVALS = "evals"
# Default value of --runs-dir: the directory past runs are listed from.
DEFAULT_RUNS = "runs"
19
+
20
+
21
def _build_parser() -> argparse.ArgumentParser:
    """Construct the `touchstone` argument parser with all of its sub-commands."""
    parser = argparse.ArgumentParser(prog="touchstone", description=__doc__)
    parser.add_argument("--version", action="version", version=f"touchstone {__version__}")
    parser.add_argument("--evals-dir", default=DEFAULT_EVALS)
    parser.add_argument("--runs-dir", default=DEFAULT_RUNS)
    # A sub-command is mandatory; its name lands in `args.cmd`.
    commands = parser.add_subparsers(dest="cmd", required=True)

    commands.add_parser("list", help="list cases and past runs")

    validate_cmd = commands.add_parser("validate", help="schema-check case.yaml files")
    validate_cmd.add_argument(
        "--check-access",
        action="store_true",
        help="also probe each case's external repos (network) and report "
             "reachability; exit non-zero if a required case is unreachable",
    )

    run_cmd = commands.add_parser("run", help="run the matrix and produce a report")
    run_cmd.add_argument("--eval", action="append", help="case id (repeatable); default all")
    run_cmd.add_argument("--harness", action="append", help="restrict to these matrix harnesses")
    run_cmd.add_argument(
        "--model",
        action="append",
        help="restrict to these declared models (filter)",
    )
    run_cmd.add_argument(
        "--with-model",
        action="append",
        metavar="[HARNESS=]MODEL",
        help="run this model even if a case did not declare it, replacing the "
             "case's models for the harness (repeatable). Use to compare "
             "models on the same harness, e.g. --harness droid "
             "--with-model A --with-model B. Prefix HARNESS= to scope to one "
             "harness when a run spans several.",
    )
    run_cmd.add_argument("--trials", type=int, help="override trial count")
    run_cmd.add_argument("--resume", help="continue an interrupted run id")
    run_cmd.add_argument(
        "--workers",
        type=int,
        default=1,
        help="run cells in parallel (default 1; manual interaction forces 1)",
    )
    run_cmd.add_argument(
        "--keep-sandboxes",
        action="store_true",
        help="do not tear down worktree sandboxes after each cell",
    )
    run_cmd.add_argument(
        "--llm-concurrency",
        type=int,
        help="max concurrent auxiliary LLM calls (judge + responder)",
    )
    run_cmd.add_argument(
        "--on-unavailable",
        choices=["fail", "skip"],
        default="fail",
        help="when a case's external repo is unreachable: 'fail' the whole run "
             "(default) or 'skip' those cases and run the rest",
    )

    report_cmd = commands.add_parser("report", help="(re)generate a run's report.md")
    report_cmd.add_argument("run_id")

    export_cmd = commands.add_parser("export", help="export a run's traces to LangFuse JSON")
    export_cmd.add_argument("run_id")
    export_cmd.add_argument(
        "--push",
        action="store_true",
        help="also push via the langfuse SDK (needs langfuse + keys)",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse `argv` (argparse falls back to `sys.argv` when None) and run the sub-command.

    Returns the integer exit status produced by the selected `_cmd_*` handler.
    """
    args = _build_parser().parse_args(argv)
    handlers = {
        "list": _cmd_list,
        "validate": _cmd_validate,
        "run": _cmd_run,
        "report": _cmd_report,
        "export": _cmd_export,
    }
    handler = handlers[args.cmd]
    return handler(args)
71
+
72
+
73
def _cmd_list(args) -> int:
    """Print discovered cases, the available harnesses and the stored runs; returns 0."""
    found = discover_cases(args.evals_dir)
    print(f"Cases ({len(found)}) in {args.evals_dir}:")
    for case in found:
        if case.observe:
            mode = "observe"
        else:
            mode = "output-only"
        matrix = case.matrix
        print(f" {case.id:30} harnesses={matrix.all_harnesses} "
              f"models={matrix.all_models} trials={matrix.trials} [{mode}]")

    print(f"\nHarnesses available: {available_harnesses()}")

    past_runs = RunStore.list_runs(args.runs_dir)
    print(f"\nRuns ({len(past_runs)}) in {args.runs_dir}:")
    for run_name in past_runs:
        print(f" {run_name}")
    return 0
86
+
87
+
88
def _cmd_validate(args) -> int:
    """Load every case directory under `--evals-dir` and report OK / FAIL per case.

    Sub-directories without a `case.yaml` are skipped silently. With `--check-access`
    the successfully loaded cases are additionally passed to `_report_access`.
    Returns 0 when everything passed, 1 otherwise (including when the evals
    directory is missing or contains no sub-directories).
    """
    root = Path(args.evals_dir)
    if root.is_dir():
        case_dirs = sorted(entry for entry in root.iterdir() if entry.is_dir())
    else:
        case_dirs = []
    if not case_dirs:
        print(f"No case directories under {root}", file=sys.stderr)
        return 1

    all_ok = True
    loaded = []
    for case_dir in case_dirs:
        if not (case_dir / "case.yaml").is_file():
            continue
        try:
            case = load_case(case_dir)
            cells = build_cells([case])
            print(f"OK {case.id}: {len(cells)} cells "
                  f"({len(case.graders)} graders)")
            loaded.append(case)
        except Exception as exc:
            # Any failure while loading/expanding a case is reported, not raised,
            # so the remaining cases are still checked.
            all_ok = False
            print(f"FAIL {case_dir.name}: {type(exc).__name__}: {exc}", file=sys.stderr)

    if args.check_access and loaded:
        access_ok = _report_access(loaded)
        all_ok = access_ok and all_ok
    return 1 if not all_ok else 0
111
+
112
+
113
def _report_access(cases) -> bool:
    """Run `scan_cases` over `cases` and print which external repos are unreachable.

    Returns True when nothing is unreachable or only cases whose `availability` is
    not "required" are affected; returns False as soon as any required case is
    unreachable.
    """
    blocked = scan_cases(cases)
    if not blocked:
        print(f"\nAccess: all {len(cases)} cases' external repos reachable.")
        return True

    print("\nAccess check:", file=sys.stderr)
    any_required = False
    for case_id in sorted(blocked):
        case = next(c for c in cases if c.id == case_id)
        is_required = case.availability == "required"
        severity = "REQUIRED" if is_required else "optional"
        any_required = any_required or is_required
        for hit in blocked[case_id]:
            print(f" UNREACHABLE [{severity}] {case_id}: {hit.external.kind} {hit.external.repo} "
                  f"({hit.result.reason}) — {hit.result.detail}", file=sys.stderr)

    total = len(cases)
    reachable = total - len(blocked)
    verdict = "required cases blocked" if any_required else "all optional — ok"
    print(f"\n{reachable}/{total} cases reachable; {len(blocked)} unreachable "
          f"({verdict}).",
          file=sys.stderr)
    return not any_required
134
+
135
+
136
+ def _parse_model_override(values: list[str] | None) -> dict[str | None, list[str]] | None:
137
+ """Parse `--with-model` values into {harness|None: [models]}.
138
+
139
+ `MODEL` -> applies to any harness (key None); `HARNESS=MODEL` -> scoped to that
140
+ harness. Repeats accumulate; later values append.
141
+ """
142
+ if not values:
143
+ return None
144
+ override: dict[str | None, list[str]] = {}
145
+ for v in values:
146
+ harness, sep, model = v.partition("=")
147
+ key, name = (harness, model) if sep else (None, harness)
148
+ name = name.strip()
149
+ if not name:
150
+ raise ValueError(f"--with-model {v!r}: empty model name")
151
+ override.setdefault(key, [])
152
+ if name not in override[key]:
153
+ override[key].append(name)
154
+ return override
155
+
156
+
157
def _cmd_run(args) -> int:
    """Execute a run with the parsed CLI options, then generate its report.

    Returns 1 (after printing to stderr) when the run raises; an `UnavailableError`
    additionally prints a hint on how to proceed. Returns 0 once the report path
    has been printed.
    """
    try:
        overrides = _parse_model_override(args.with_model)
        options = {
            "eval_filter": args.eval,
            "harnesses": args.harness,
            "models": args.model,
            "trials": args.trials,
            "model_override": overrides,
            "resume": args.resume,
            "workers": args.workers,
            "keep_sandboxes": args.keep_sandboxes,
            "llm_concurrency": args.llm_concurrency,
            "on_unavailable": args.on_unavailable,
        }
        run_id = run(args.evals_dir, args.runs_dir, **options)
    except UnavailableError as exc:
        print(str(exc), file=sys.stderr)
        print("\nFix git access for these repos, mark the cases `availability: optional`, or "
              "re-run with `--on-unavailable skip` to degrade them.", file=sys.stderr)
        return 1
    except Exception as exc:
        # Top-level boundary: report any other failure as a one-line message.
        print(f"Run failed: {type(exc).__name__}: {exc}", file=sys.stderr)
        return 1

    # Report generation happens outside the try block, so its errors propagate.
    report_path = generate(args.runs_dir, run_id)
    print(f"\nDone. Report: {report_path}")
    return 0
179
+
180
+
181
def _cmd_report(args) -> int:
    """Regenerate the report for `args.run_id`; returns 1 if `generate` raises FileNotFoundError."""
    try:
        report_path = generate(args.runs_dir, args.run_id)
    except FileNotFoundError as exc:
        print(exc, file=sys.stderr)
        return 1
    else:
        print(f"Report: {report_path}")
        return 0
189
+
190
+
191
def _cmd_export(args) -> int:
    """Export a run's traces via `to_langfuse` and optionally push them with `--push`.

    Returns 1 when the export raises FileNotFoundError or when the push raises
    anything; 0 otherwise.
    """
    # Imported here rather than at module level, so other sub-commands never load it.
    from .export import push_langfuse, to_langfuse

    try:
        payload = to_langfuse(args.runs_dir, args.run_id)
    except FileNotFoundError as exc:
        print(exc, file=sys.stderr)
        return 1

    # NOTE(review): this function only prints the path; presumably `to_langfuse`
    # is what writes langfuse.json there — confirm against touchstone/export.
    destination = Path(args.runs_dir) / args.run_id / "langfuse.json"
    print(f"Exported {len(payload['traces'])} traces -> {destination}")

    if not args.push:
        return 0
    try:
        pushed = push_langfuse(payload)
        print(f"Pushed {pushed} traces to LangFuse")
    except Exception as exc:
        print(f"Push failed: {exc}", file=sys.stderr)
        return 1
    return 0
209
+
210
+
211
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
touchstone/concurrency.py ADDED
@@ -0,0 +1,34 @@
1
+ """Shared concurrency limits for parallel runs.
2
+
3
+ When many Cells run in parallel, the auxiliary LLMs (Judge + Responder) can fan out wide
4
+ enough to trip provider rate limits. They acquire a shared `llm_slot()` so total in-flight
5
+ auxiliary LLM calls stay bounded regardless of how many Cells are executing.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import threading
11
+ from contextlib import contextmanager
12
+
13
# Limit used until set_llm_concurrency() is called.
_DEFAULT = 8
# Semaphore bounding in-flight auxiliary LLM calls; replaced wholesale by
# set_llm_concurrency(), never resized in place.
_sem = threading.Semaphore(_DEFAULT)
# The limit `_sem` was created with (what llm_concurrency() reports).
_limit = _DEFAULT


def set_llm_concurrency(n: int) -> None:
    """Set the maximum number of concurrent `llm_slot()` holders.

    `n` is coerced with `int()` and clamped to at least 1. A fresh semaphore is
    installed; slots acquired before the call keep, and later release, the semaphore
    they were taken from, so they do not count against the new limit.
    """
    global _sem, _limit
    _limit = max(1, int(n))
    _sem = threading.Semaphore(_limit)


def llm_concurrency() -> int:
    """Return the currently configured limit."""
    return _limit


@contextmanager
def llm_slot():
    """Context manager that holds one auxiliary-LLM slot for the duration of the block.

    Blocks until a slot is free. The slot is released on exit even if the body raises.
    """
    # Bind the semaphore once: if set_llm_concurrency() swaps `_sem` while this slot
    # is held, releasing the *new* semaphore would raise its capacity above the limit.
    sem = _sem
    with sem:
        yield