touchstone-eval 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- touchstone/__init__.py +3 -0
- touchstone/artifacts.py +51 -0
- touchstone/cli.py +212 -0
- touchstone/concurrency.py +34 -0
- touchstone/config.py +431 -0
- touchstone/environment.py +126 -0
- touchstone/executor.py +162 -0
- touchstone/export/__init__.py +5 -0
- touchstone/export/langfuse.py +102 -0
- touchstone/fixtures.py +53 -0
- touchstone/grader/__init__.py +6 -0
- touchstone/grader/base.py +55 -0
- touchstone/grader/command.py +40 -0
- touchstone/grader/efficiency.py +63 -0
- touchstone/grader/files.py +71 -0
- touchstone/grader/implemented.py +42 -0
- touchstone/grader/model_judge.py +149 -0
- touchstone/grader/pytest_runner.py +209 -0
- touchstone/grader/registry.py +40 -0
- touchstone/grader/swebench.py +101 -0
- touchstone/grader/trace.py +116 -0
- touchstone/harness/__init__.py +7 -0
- touchstone/harness/acp.py +492 -0
- touchstone/harness/base.py +90 -0
- touchstone/harness/claude_code.py +85 -0
- touchstone/harness/claude_stream.py +158 -0
- touchstone/harness/cli_agent.py +97 -0
- touchstone/harness/echo.py +30 -0
- touchstone/harness/registry.py +78 -0
- touchstone/interaction/__init__.py +11 -0
- touchstone/interaction/base.py +86 -0
- touchstone/interaction/policies.py +104 -0
- touchstone/interaction/registry.py +40 -0
- touchstone/interaction/responder.py +98 -0
- touchstone/metrics.py +125 -0
- touchstone/reachability.py +170 -0
- touchstone/report.py +433 -0
- touchstone/runner.py +333 -0
- touchstone/sandbox.py +121 -0
- touchstone/setup.py +58 -0
- touchstone/store.py +170 -0
- touchstone/trace.py +189 -0
- touchstone_eval-0.1.0.dist-info/METADATA +343 -0
- touchstone_eval-0.1.0.dist-info/RECORD +47 -0
- touchstone_eval-0.1.0.dist-info/WHEEL +4 -0
- touchstone_eval-0.1.0.dist-info/entry_points.txt +2 -0
- touchstone_eval-0.1.0.dist-info/licenses/LICENSE +21 -0
touchstone/__init__.py
ADDED
touchstone/artifacts.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Resolve a case's AI artifacts into concrete paths.
|
|
2
|
+
|
|
3
|
+
Artifacts are *materialized* into a sandbox by the harness adapter (different agents
|
|
4
|
+
expect different layouts), so this module only resolves and validates paths. See
|
|
5
|
+
`harness/claude_code.py` for the Claude Code layout (`.claude/skills`, `.mcp.json`, ...).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from .config import Case
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class ArtifactSet:
    """Artifact paths belonging to a single case, after resolution.

    Every list defaults to empty and `mcp` defaults to None, so a bare
    `ArtifactSet()` describes a case without any artifacts.
    """

    # One entry per declared skill / command / plugin artifact.
    skills: list[Path] = field(default_factory=list)
    commands: list[Path] = field(default_factory=list)
    plugins: list[Path] = field(default_factory=list)
    # Path of the MCP config, or None when the case declares none.
    mcp: Path | None = None

    def is_empty(self) -> bool:
        """Return True when none of the four artifact slots holds anything."""
        slots = (self.skills, self.commands, self.plugins, self.mcp)
        return not any(slots)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _resolve_all(case: Case, rels: list[str]) -> list[Path]:
|
|
30
|
+
out: list[Path] = []
|
|
31
|
+
for rel in rels:
|
|
32
|
+
p = case.resolve(rel)
|
|
33
|
+
if not p.exists():
|
|
34
|
+
raise FileNotFoundError(f"artifact not found: {rel} (resolved {p})")
|
|
35
|
+
out.append(p)
|
|
36
|
+
return out
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def build_artifact_set(case: Case) -> ArtifactSet:
    """Turn the artifacts declared on `case` into an `ArtifactSet` of resolved paths.

    The MCP config (when declared) is resolved and checked first, followed by the
    skills, commands and plugins lists, in that order.

    Raises:
        FileNotFoundError: if the MCP config or any listed artifact does not exist.
    """
    declared = case.artifacts

    mcp_path = None
    if declared.mcp:
        mcp_path = case.resolve(declared.mcp)
        if not mcp_path.exists():
            raise FileNotFoundError(f"mcp config not found: {declared.mcp} (resolved {mcp_path})")

    skills = _resolve_all(case, declared.skills)
    commands = _resolve_all(case, declared.commands)
    plugins = _resolve_all(case, declared.plugins)
    return ArtifactSet(skills=skills, commands=commands, plugins=plugins, mcp=mcp_path)
|
touchstone/cli.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""Command-line interface: validate / run / report / list."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from . import __version__
|
|
10
|
+
from .config import discover_cases, load_case
|
|
11
|
+
from .harness.registry import available_harnesses
|
|
12
|
+
from .reachability import UnavailableError, scan_cases
|
|
13
|
+
from .report import generate
|
|
14
|
+
from .runner import build_cells, run
|
|
15
|
+
from .store import RunStore
|
|
16
|
+
|
|
17
|
+
# Default value of the `--evals-dir` option (directory scanned for cases).
DEFAULT_EVALS = "evals"
|
|
18
|
+
# Default value of the `--runs-dir` option (directory holding past runs).
DEFAULT_RUNS = "runs"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and hand off to the selected subcommand.

    Args:
        argv: argument list to parse; None is passed straight through to argparse.

    Returns:
        The integer returned by the chosen `_cmd_*` handler.
    """
    root = argparse.ArgumentParser(prog="touchstone", description=__doc__)
    root.add_argument("--version", action="version", version=f"touchstone {__version__}")
    root.add_argument("--evals-dir", default=DEFAULT_EVALS)
    root.add_argument("--runs-dir", default=DEFAULT_RUNS)
    commands = root.add_subparsers(dest="cmd", required=True)

    # --- list -------------------------------------------------------------
    commands.add_parser("list", help="list cases and past runs")

    # --- validate ---------------------------------------------------------
    validate_cmd = commands.add_parser("validate", help="schema-check case.yaml files")
    check_access_help = (
        "also probe each case's external repos (network) and report "
        "reachability; exit non-zero if a required case is unreachable"
    )
    validate_cmd.add_argument("--check-access", action="store_true", help=check_access_help)

    # --- run --------------------------------------------------------------
    run_cmd = commands.add_parser("run", help="run the matrix and produce a report")
    run_cmd.add_argument("--eval", action="append", help="case id (repeatable); default all")
    run_cmd.add_argument("--harness", action="append", help="restrict to these matrix harnesses")
    run_cmd.add_argument(
        "--model",
        action="append",
        help="restrict to these declared models (filter)",
    )
    with_model_help = (
        "run this model even if a case did not declare it, replacing the "
        "case's models for the harness (repeatable). Use to compare "
        "models on the same harness, e.g. --harness droid "
        "--with-model A --with-model B. Prefix HARNESS= to scope to one "
        "harness when a run spans several."
    )
    run_cmd.add_argument(
        "--with-model",
        action="append",
        metavar="[HARNESS=]MODEL",
        help=with_model_help,
    )
    run_cmd.add_argument("--trials", type=int, help="override trial count")
    run_cmd.add_argument("--resume", help="continue an interrupted run id")
    run_cmd.add_argument(
        "--workers",
        type=int,
        default=1,
        help="run cells in parallel (default 1; manual interaction forces 1)",
    )
    run_cmd.add_argument(
        "--keep-sandboxes",
        action="store_true",
        help="do not tear down worktree sandboxes after each cell",
    )
    run_cmd.add_argument(
        "--llm-concurrency",
        type=int,
        help="max concurrent auxiliary LLM calls (judge + responder)",
    )
    on_unavailable_help = (
        "when a case's external repo is unreachable: 'fail' the whole run "
        "(default) or 'skip' those cases and run the rest"
    )
    run_cmd.add_argument(
        "--on-unavailable",
        choices=["fail", "skip"],
        default="fail",
        help=on_unavailable_help,
    )

    # --- report -----------------------------------------------------------
    report_cmd = commands.add_parser("report", help="(re)generate a run's report.md")
    report_cmd.add_argument("run_id")

    # --- export -----------------------------------------------------------
    export_cmd = commands.add_parser("export", help="export a run's traces to LangFuse JSON")
    export_cmd.add_argument("run_id")
    export_cmd.add_argument(
        "--push",
        action="store_true",
        help="also push via the langfuse SDK (needs langfuse + keys)",
    )

    parsed = root.parse_args(argv)
    # `required=True` on the subparsers means parsed.cmd is always one of these keys.
    handlers = {
        "list": _cmd_list,
        "validate": _cmd_validate,
        "run": _cmd_run,
        "report": _cmd_report,
        "export": _cmd_export,
    }
    handler = handlers[parsed.cmd]
    return handler(parsed)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _cmd_list(args) -> int:
    """Print the discovered cases, the available harnesses and the stored runs.

    Reads `args.evals_dir` and `args.runs_dir`; always returns 0.
    """
    found = discover_cases(args.evals_dir)
    print(f"Cases ({len(found)}) in {args.evals_dir}:")
    for case in found:
        if case.observe:
            mode = "observe"
        else:
            mode = "output-only"
        matrix = case.matrix
        print(f" {case.id:30} harnesses={matrix.all_harnesses} "
              f"models={matrix.all_models} trials={matrix.trials} [{mode}]")

    print(f"\nHarnesses available: {available_harnesses()}")

    past_runs = RunStore.list_runs(args.runs_dir)
    print(f"\nRuns ({len(past_runs)}) in {args.runs_dir}:")
    for run_name in past_runs:
        print(f" {run_name}")
    return 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _cmd_validate(args) -> int:
    """Load every case directory under `args.evals_dir` and report OK / FAIL per case.

    Sub-directories without a `case.yaml` are skipped. With `args.check_access`
    set, the successfully loaded cases are additionally passed to `_report_access`.

    Returns:
        0 when every case loaded (and the optional access check passed), 1
        otherwise, including when there are no sub-directories at all.
    """
    root = Path(args.evals_dir)
    case_dirs = []
    if root.is_dir():
        case_dirs = [entry for entry in sorted(root.iterdir()) if entry.is_dir()]
    if not case_dirs:
        print(f"No case directories under {root}", file=sys.stderr)
        return 1

    all_ok = True
    loaded = []
    for case_dir in case_dirs:
        if (case_dir / "case.yaml").is_file():
            try:
                case = load_case(case_dir)
                cells = build_cells([case])
                print(f"OK {case.id}: {len(cells)} cells "
                      f"({len(case.graders)} graders)")
                loaded.append(case)
            except Exception as exc:
                # One broken case must not stop the others from being checked.
                all_ok = False
                print(f"FAIL {case_dir.name}: {type(exc).__name__}: {exc}", file=sys.stderr)

    if args.check_access and loaded:
        access_ok = _report_access(loaded)
        all_ok = access_ok and all_ok
    return 0 if all_ok else 1
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _report_access(cases) -> bool:
    """Run `scan_cases` over `cases` and print which external repos are unreachable.

    Returns False only when at least one unreachable case has
    `availability == "required"`; unreachable optional cases are reported
    but do not fail the check.
    """
    blocked = scan_cases(cases)
    total = len(cases)
    if not blocked:
        print(f"\nAccess: all {total} cases' external repos reachable.")
        return True

    print("\nAccess check:", file=sys.stderr)
    any_required = False
    for case_id in sorted(blocked):
        owner = next(c for c in cases if c.id == case_id)
        is_required = owner.availability == "required"
        any_required = any_required or is_required
        label = "REQUIRED" if is_required else "optional"
        for item in blocked[case_id]:
            print(f" UNREACHABLE [{label}] {case_id}: {item.external.kind} {item.external.repo} "
                  f"({item.result.reason}) — {item.result.detail}", file=sys.stderr)

    reachable = total - len(blocked)
    verdict = "required cases blocked" if any_required else "all optional — ok"
    print(f"\n{reachable}/{total} cases reachable; {len(blocked)} unreachable "
          f"({verdict}).",
          file=sys.stderr)
    return not any_required
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _parse_model_override(values: list[str] | None) -> dict[str | None, list[str]] | None:
|
|
137
|
+
"""Parse `--with-model` values into {harness|None: [models]}.
|
|
138
|
+
|
|
139
|
+
`MODEL` -> applies to any harness (key None); `HARNESS=MODEL` -> scoped to that
|
|
140
|
+
harness. Repeats accumulate; later values append.
|
|
141
|
+
"""
|
|
142
|
+
if not values:
|
|
143
|
+
return None
|
|
144
|
+
override: dict[str | None, list[str]] = {}
|
|
145
|
+
for v in values:
|
|
146
|
+
harness, sep, model = v.partition("=")
|
|
147
|
+
key, name = (harness, model) if sep else (None, harness)
|
|
148
|
+
name = name.strip()
|
|
149
|
+
if not name:
|
|
150
|
+
raise ValueError(f"--with-model {v!r}: empty model name")
|
|
151
|
+
override.setdefault(key, [])
|
|
152
|
+
if name not in override[key]:
|
|
153
|
+
override[key].append(name)
|
|
154
|
+
return override
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _cmd_run(args) -> int:
    """Execute a run from the parsed `run` arguments, then generate its report.

    Returns:
        0 after the report has been generated; 1 when parsing `--with-model` or
        the run itself raised (the error is printed to stderr).
    """
    try:
        overrides = _parse_model_override(args.with_model)
        run_kwargs = dict(
            eval_filter=args.eval,
            harnesses=args.harness,
            models=args.model,
            trials=args.trials,
            model_override=overrides,
            resume=args.resume,
            workers=args.workers,
            keep_sandboxes=args.keep_sandboxes,
            llm_concurrency=args.llm_concurrency,
            on_unavailable=args.on_unavailable,
        )
        run_id = run(args.evals_dir, args.runs_dir, **run_kwargs)
    except UnavailableError as err:
        # Unreachable repos get their own message plus a hint on how to proceed.
        print(str(err), file=sys.stderr)
        print("\nFix git access for these repos, mark the cases `availability: optional`, or "
              "re-run with `--on-unavailable skip` to degrade them.", file=sys.stderr)
        return 1
    except Exception as err:
        print(f"Run failed: {type(err).__name__}: {err}", file=sys.stderr)
        return 1

    report_path = generate(args.runs_dir, run_id)
    print(f"\nDone. Report: {report_path}")
    return 0
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _cmd_report(args) -> int:
    """(Re)generate the report for `args.run_id` and print where it was written.

    Returns 0 on success, or 1 when `generate` raises FileNotFoundError (the
    message goes to stderr).
    """
    try:
        report_path = generate(args.runs_dir, args.run_id)
    except FileNotFoundError as err:
        print(str(err), file=sys.stderr)
        return 1
    else:
        print(f"Report: {report_path}")
        return 0
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _cmd_export(args) -> int:
    """Export a run's traces in LangFuse form and optionally push them.

    Returns:
        0 on success; 1 when `to_langfuse` raises FileNotFoundError or, with
        `args.push` set, when the push raises.
    """
    # Imported lazily so the export package is only loaded for this subcommand.
    from .export import push_langfuse, to_langfuse

    try:
        payload = to_langfuse(args.runs_dir, args.run_id)
    except FileNotFoundError as err:
        print(str(err), file=sys.stderr)
        return 1

    # NOTE(review): this function only reports the path; presumably `to_langfuse`
    # is what writes langfuse.json — confirm against the export module.
    out = Path(args.runs_dir, args.run_id, "langfuse.json")
    print(f"Exported {len(payload['traces'])} traces -> {out}")

    if not args.push:
        return 0
    try:
        pushed = push_langfuse(payload)
        print(f"Pushed {pushed} traces to LangFuse")
    except Exception as err:
        print(f"Push failed: {err}", file=sys.stderr)
        return 1
    return 0
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Shared concurrency limits for parallel runs.
|
|
2
|
+
|
|
3
|
+
When many Cells run in parallel, the auxiliary LLMs (Judge + Responder) can fan out wide
|
|
4
|
+
enough to trip provider rate limits. They acquire a shared `llm_slot()` so total in-flight
|
|
5
|
+
auxiliary LLM calls stay bounded regardless of how many Cells are executing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import threading
|
|
11
|
+
from contextlib import contextmanager
|
|
12
|
+
|
|
13
|
+
# Limit used until set_llm_concurrency() is called.
_DEFAULT = 8
# Semaphore handing out the slots, and the limit it was created with.
_sem = threading.Semaphore(_DEFAULT)
_limit = _DEFAULT


def set_llm_concurrency(n: int) -> None:
    """Replace the shared semaphore with a fresh one allowing `n` concurrent slots.

    `n` is coerced with `int()` and clamped to at least 1. Slots that are
    currently held keep (and later release) the semaphore they acquired, so
    they do not count against the new limit.
    """
    global _sem, _limit
    _limit = max(1, int(n))
    _sem = threading.Semaphore(_limit)


def llm_concurrency() -> int:
    """Return the currently configured slot limit."""
    return _limit


@contextmanager
def llm_slot():
    """Hold one auxiliary-LLM slot for the duration of the `with` body.

    Blocks until a slot is free. The slot is released on exit, including when
    the body raises.
    """
    # Bind the semaphore once: if set_llm_concurrency() swaps the global while
    # this slot is held, the release must go back to the semaphore that was
    # acquired, not to the new one (which would gain an extra permit).
    sem = _sem
    with sem:
        yield
|