@crewhaus/tool-harness-synthesizer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "name": "@crewhaus/tool-harness-synthesizer",
3
+ "version": "0.1.0",
4
+ "type": "module",
5
+ "description": "Track D / §55 — Thompson-sampled tree search over candidate verifier functions for skills and tools. Smaller LLM + synthesized verifier beats larger LLM (AutoHarness, arxiv 2603.03329).",
6
+ "main": "src/index.ts",
7
+ "types": "src/index.ts",
8
+ "exports": {
9
+ ".": "./src/index.ts"
10
+ },
11
+ "scripts": {
12
+ "test": "bun test src"
13
+ },
14
+ "dependencies": {
15
+ "@crewhaus/errors": "0.0.0",
16
+ "@crewhaus/prompt-optimizer": "0.0.0"
17
+ },
18
+ "license": "Apache-2.0",
19
+ "author": {
20
+ "name": "Max Meier",
21
+ "email": "max@studiomax.io",
22
+ "url": "https://studiomax.io"
23
+ },
24
+ "repository": {
25
+ "type": "git",
26
+ "url": "git+https://github.com/crewhaus/factory.git",
27
+ "directory": "packages/tool-harness-synthesizer"
28
+ },
29
+ "homepage": "https://github.com/crewhaus/factory/tree/main/packages/tool-harness-synthesizer#readme",
30
+ "bugs": {
31
+ "url": "https://github.com/crewhaus/factory/issues"
32
+ },
33
+ "publishConfig": {
34
+ "access": "restricted"
35
+ },
36
+ "files": [
37
+ "src",
38
+ "README.md",
39
+ "LICENSE",
40
+ "NOTICE"
41
+ ]
42
+ }
@@ -0,0 +1,136 @@
1
+ import { describe, expect, test } from "bun:test";
2
+ import {
3
+ HarnessSynthesizerError,
4
+ type VerifierSample,
5
+ runVerifier,
6
+ synthesizeVerifier,
7
+ thompsonPick,
8
+ } from "./index";
9
+
10
// Shared fixture: the outputs 0..4 with no input, labeled so that only the
// even outputs are expected to be accepted by a verifier.
const evenSamples: VerifierSample[] = [0, 1, 2, 3, 4].map((value) => ({
  input: null,
  output: value,
  expected: value % 2 === 0,
}));
17
+
18
describe("runVerifier", () => {
  // Verifier body that accepts exactly the even numbers.
  const isEvenBody = "return typeof output === 'number' && output % 2 === 0";

  test("scores a correct verifier at 1.0", () => {
    const { heuristic, errors, verdicts } = runVerifier(isEvenBody, evenSamples);
    expect(heuristic).toBe(1);
    expect(errors).toBe(0);
    expect(verdicts).toEqual([true, false, true, false, true]);
  });

  test("scores a constant-true verifier at the majority class", () => {
    // Three of the five samples expect `true`, so always-accept scores 3/5.
    const { heuristic } = runVerifier("return true", evenSamples);
    expect(heuristic).toBe(0.6);
  });

  test("captures runtime errors without throwing", () => {
    const outcome = runVerifier("throw new Error('boom')", evenSamples);
    expect(outcome.errors).toBe(5);
    // A throwing verifier counts as a rejection; two of the five samples expect rejection.
    expect(outcome.heuristic).toBe(0.4);
  });

  test("throws on uncompilable code", () => {
    const compileGarbage = () => runVerifier("not valid javascript {{{", evenSamples);
    expect(compileGarbage).toThrow(HarnessSynthesizerError);
  });
});
44
+
45
describe("thompsonPick", () => {
  test("returns 0 for a single candidate", () => {
    const idx = thompsonPick(
      [
        {
          id: "x",
          code: "return true",
          score: 1,
          heuristic: 1,
          alpha: 10,
          beta: 1,
        },
      ],
      () => 0.5,
    );
    expect(idx).toBe(0);
  });

  test("favors high-heuristic candidates when sampling is biased", () => {
    const nodes = [
      { id: "a", code: "1", score: 0.1, heuristic: 0.1, alpha: 1, beta: 9 },
      { id: "b", code: "1", score: 0.9, heuristic: 0.9, alpha: 9, beta: 1 },
    ];
    // A constant RNG makes every gamma draw deterministic: each draw is
    // accepted on its first proposal, giving roughly 0.016 for Beta(1, 9) and
    // roughly 0.984 for Beta(9, 1). The pick must therefore be the
    // high-heuristic node, not merely "some valid index".
    const idx = thompsonPick(nodes, () => 0.5);
    expect(idx).toBe(1);
  });
});
74
+
75
describe("synthesizeVerifier", () => {
  test("returns immediately when a seed already meets target", async () => {
    const result = await synthesizeVerifier({
      seedCandidates: ["return typeof output === 'number' && output % 2 === 0"],
      samples: evenSamples,
      // The seed already scores 1.0, so the refiner must never run. Throwing
      // (rather than returning a code string) makes any call fail the test.
      refiner: async () => {
        throw new Error("should not be called");
      },
      target: 1.0,
    });
    expect(result.converged).toBe(true);
    expect(result.iterations).toBe(0);
    expect(result.best.heuristic).toBe(1);
  });

  test("converges via refiner when seed is poor", async () => {
    // Start with a constant-true seed; refiner produces the correct
    // verifier on the first call. This proves the search loop wires
    // refiner → score → pool update correctly.
    const result = await synthesizeVerifier({
      seedCandidates: ["return true"],
      samples: evenSamples,
      refiner: async () => "return typeof output === 'number' && output % 2 === 0",
      target: 1.0,
      maxIterations: 3,
      rng: () => 0.5,
    });
    expect(result.converged).toBe(true);
    // The very first refinement is perfect, so exactly one iteration runs.
    expect(result.iterations).toBe(1);
    expect(result.best.heuristic).toBe(1);
  });

  test("returns best-so-far when iterations exhaust", async () => {
    const result = await synthesizeVerifier({
      seedCandidates: ["return false"], // score 0.4
      samples: evenSamples,
      refiner: async () => "return true", // score 0.6
      target: 1.0,
      maxIterations: 3,
      rng: () => 0.5,
    });
    expect(result.converged).toBe(false);
    // The whole budget is spent and the best candidate is the 3/5 refinement.
    expect(result.iterations).toBe(3);
    expect(result.best.heuristic).toBe(0.6);
  });

  test("throws on empty seed candidates", async () => {
    await expect(
      synthesizeVerifier({
        seedCandidates: [],
        samples: evenSamples,
        refiner: async () => "return true",
      }),
    ).rejects.toThrow(HarnessSynthesizerError);
  });

  test("throws on empty sample set", async () => {
    await expect(
      synthesizeVerifier({
        seedCandidates: ["return true"],
        samples: [],
        refiner: async () => "return true",
      }),
    ).rejects.toThrow(HarnessSynthesizerError);
  });
});
package/src/index.ts ADDED
@@ -0,0 +1,340 @@
1
+ /**
2
+ * Track D (§55) — `tool-harness-synthesizer`. Thompson-sampled tree
3
+ * search over candidate verifier functions for skills and tools.
4
+ *
5
+ * Source: AutoHarness (Lou et al., Google DeepMind, March 2026,
6
+ * arxiv 2603.03329). The paper's headline finding: a smaller LLM
7
+ * (Gemini-2.5-Flash) plus a synthesized code harness beats a larger
8
+ * LLM (Gemini-2.5-Pro) at near-zero inference cost. The trick is to
9
+ * have the LLM synthesize TWO functions iteratively, with the
10
+ * environment as critic:
11
+ *
12
+ * - `propose_action(obs)` — candidate generator
13
+ * - `is_legal_action(obs, action)` — verifier
14
+ *
15
+ * If the verifier returns `True` but the action is invalid, refine
16
+ * BOTH functions; if it returns `False` and the action is invalid,
17
+ * refine only the proposer. This split-refinement is the empirical
18
+ * winning move.
19
+ *
20
+ * In CrewHaus, the equivalent is to synthesize verifier code per
21
+ * skill or tool: an `is_valid_output(input, output)` function for any
22
+ * tool that has objective validity criteria. The verifier becomes a
23
+ * reusable artifact under `.crewhaus/verifiers/<name>.ts` and feeds
24
+ * into the `eval-optimizer-orchestrator` via a `MutationProvider`
25
+ * variant that proposes verifier-aware prompt edits.
26
+ *
27
+ * v0 ships:
28
+ * - `synthesizeVerifier(spec)` — pure tree search over candidate
29
+ * verifier code strings (the LLM call is supplied by the caller
30
+ * so this package stays pure)
31
+ * - `thompsonPick(nodes)` — Thompson sampling over tree nodes
32
+ * - `VerifierMutationProvider` — adapter to plug verifier search
33
+ * into the existing optimizer
34
+ *
35
+ * Cited paper: AutoHarness (arxiv 2603.03329, Lou et al., 2026-03).
36
+ */
37
+ import { CrewhausError } from "@crewhaus/errors";
38
+ import type {
39
+ MutationProvider,
40
+ OptimizerState,
41
+ ProviderMutation,
42
+ } from "@crewhaus/prompt-optimizer";
43
+
44
/**
 * Error raised by this package for invalid inputs and failed synthesis steps
 * (empty seed/sample sets, verifier code that does not compile, a refiner
 * that throws).
 */
export class HarnessSynthesizerError extends CrewhausError {
  override readonly name = "HarnessSynthesizerError";

  /**
   * @param message Human-readable description of the failure.
   * @param cause Optional underlying error, forwarded to `CrewhausError`.
   */
  constructor(message: string, cause?: unknown) {
    // The first argument is always the fixed tag "config" — presumably the
    // error category understood by @crewhaus/errors; confirm against that package.
    super("config", message, cause);
  }
}
50
+
51
/**
 * A single labeled example used to score a verifier.
 *
 * A useful sample set contains both accepting and rejecting examples:
 * with only `expected: true` samples, a verifier that blindly returns
 * `true` would score perfectly.
 */
export type VerifierSample = Readonly<{
  /** Value handed to the verifier as its `input` argument. */
  input: unknown;
  /** Value handed to the verifier as its `output` argument. */
  output: unknown;
  /** Whether the verifier should accept (`true`) or reject (`false`) this sample. */
  expected: boolean;
}>;
63
+
64
/**
 * One node of the search: a verifier's source text together with the
 * result of scoring it against the sample set.
 *
 * `code` is the *body* of a function taking `(input, output)` and
 * returning a boolean. It is kept as a string so the search can hand it
 * back to the refiner for mutation. `runVerifier` executes it by compiling
 * the body with the `Function` constructor.
 */
export type VerifierCandidate = Readonly<{
  /** Identifier of the node within one search run. */
  id: string;
  /** Function-body source of the verifier. */
  code: string;
  /** Score of the candidate; the search in this file sets it equal to `heuristic`. */
  score: number;
  /** AutoHarness's heuristic value — fraction of samples judged correctly, in [0, 1]. */
  heuristic: number;
  /** First parameter of the Beta posterior used for Thompson sampling. */
  alpha: number;
  /** Second parameter of the Beta posterior used for Thompson sampling. */
  beta: number;
}>;
82
+
83
/**
 * The Refiner: given the candidate being improved and the samples it got
 * wrong, asynchronously produces the source of a replacement verifier body.
 *
 * Production callers typically back this with a model call, while tests can
 * use a deterministic rule; the search treats both the same way.
 */
export type RefinerFn = (
  current: VerifierCandidate,
  failures: readonly VerifierSample[],
) => Promise<string>;
93
+
94
/**
 * The Critic: compiles `code` once, runs it against every sample, and
 * reports the per-sample verdicts plus the fraction judged correctly.
 *
 * `code` must be a function *body* with `input` and `output` in scope, for
 * example `"return output > 0"`. A complete function expression is not
 * unwrapped: a body with no `return` yields `undefined`, which is coerced to
 * a `false` verdict.
 *
 * SECURITY: the body is compiled with the `Function` constructor. That is
 * not a sandbox — the code runs with the privileges of the host process and
 * can reach globals. Only pass code you are prepared to execute, or run this
 * in an isolated process/worker when the code comes from a model.
 *
 * The result is deterministic only to the extent that `code` itself is.
 *
 * @param code Function-body source of the verifier.
 * @param samples Labeled samples to score against.
 * @returns `verdicts` (one boolean per sample, in order), `heuristic`
 *   (correct / total, or 0 for an empty sample list) and `errors` (number of
 *   samples on which the verifier threw; each such sample gets verdict `false`).
 * @throws HarnessSynthesizerError when `code` does not compile; the
 *   underlying error is forwarded as the cause.
 */
export function runVerifier(
  code: string,
  samples: ReadonlyArray<VerifierSample>,
): {
  readonly verdicts: ReadonlyArray<boolean>;
  readonly heuristic: number;
  readonly errors: number;
} {
  let fn: (input: unknown, output: unknown) => unknown;
  try {
    fn = new Function("input", "output", code) as (input: unknown, output: unknown) => unknown;
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    throw new HarnessSynthesizerError(`verifier code did not compile: ${detail}`, err);
  }
  const verdicts: boolean[] = [];
  let correct = 0;
  let errors = 0;
  for (const s of samples) {
    let v: boolean;
    try {
      // Coerce so truthy/falsy returns still produce a strict boolean verdict.
      v = Boolean(fn(s.input, s.output));
    } catch {
      // Deliberate best-effort: a verifier that throws is scored as a
      // rejection and counted, so one bad sample cannot abort the evaluation.
      v = false;
      errors++;
    }
    verdicts.push(v);
    if (v === s.expected) correct++;
  }
  const heuristic = samples.length === 0 ? 0 : correct / samples.length;
  return { verdicts, heuristic, errors };
}
133
+
134
/**
 * Thompson sampling over a node population: draws one value from each
 * node's Beta(alpha, beta) distribution and returns the index of the node
 * with the largest draw. Ties and non-comparable draws keep the earlier
 * index; nodes are sampled in array order, so a seeded `rng` reproduces the
 * same pick.
 *
 * @param nodes Candidates to choose among; must be non-empty.
 * @param rng Source of uniform numbers in [0, 1). Defaults to `Math.random`.
 * @returns Index into `nodes` of the selected candidate.
 * @throws HarnessSynthesizerError when `nodes` is empty.
 */
export function thompsonPick(
  nodes: ReadonlyArray<VerifierCandidate>,
  rng: () => number = Math.random,
): number {
  if (nodes.length === 0) throw new HarnessSynthesizerError("thompsonPick called on empty list");

  let winner = 0;
  let highestDraw = Number.NEGATIVE_INFINITY;
  nodes.forEach((node, position) => {
    // Skip missing entries without consuming random numbers.
    if (node === undefined) return;
    const draw = betaSample(node.alpha, node.beta, rng);
    if (draw > highestDraw) {
      highestDraw = draw;
      winner = position;
    }
  });
  return winner;
}
158
+
159
/**
 * Draws one Beta(a, b) variate as X / (X + Y), where X and Y are gamma
 * draws with shapes `a` and `b`. The draw for `a` is taken first, then the
 * draw for `b`, so the result is reproducible for a deterministic `rng`.
 */
function betaSample(a: number, b: number, rng: () => number): number {
  const fromA = gammaSample(a, rng);
  const fromB = gammaSample(b, rng);
  const total = fromA + fromB;
  return fromA / total;
}
169
+
170
/**
 * Draws one Gamma(shape, scale = 1) variate with the Marsaglia–Tsang
 * squeeze method, using Box–Muller for the underlying normal draw.
 *
 * Shapes below 1 use the Marsaglia–Tsang "boost": sample Gamma(shape + 1)
 * and multiply by U^(1/shape).
 *
 * Every proposal — including ones rejected because `1 + c*x <= 0` — counts
 * against a fixed attempt budget, so a degenerate or cycling `rng` can never
 * hang the caller. If the budget is exhausted the distribution mean
 * (`shape`) is returned.
 *
 * @param shape Shape parameter; must be a non-negative number.
 * @param rng Source of uniform numbers in [0, 1).
 * @returns A non-negative sample.
 * @throws RangeError when `shape` is negative or NaN.
 */
function gammaSample(shape: number, rng: () => number): number {
  // `!(shape >= 0)` also rejects NaN, which would otherwise propagate silently.
  if (!(shape >= 0)) {
    throw new RangeError(`gammaSample requires a non-negative shape, got ${shape}`);
  }
  if (shape < 1) {
    const boosted = gammaSample(shape + 1, rng);
    // Clamp away from 0 so the power below stays finite.
    const u = Math.max(rng(), 1e-12);
    return boosted * u ** (1 / shape);
  }
  const maxAttempts = 64;
  const d = shape - 1 / 3;
  const c = 1 / Math.sqrt(9 * d);
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // Clamp both uniforms so Math.log never sees 0.
    const u1 = Math.max(rng(), 1e-12);
    const u2 = Math.max(rng(), 1e-12);
    // Box-Muller for standard normal.
    const x = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
    const base = 1 + c * x;
    // Proposal outside the method's domain: retry, but charge it to the budget.
    if (base <= 0) continue;
    const v = base * base * base;
    const u = rng();
    // Cheap squeeze test first, exact log test second.
    if (u < 1 - 0.0331 * x * x * x * x) return d * v;
    if (Math.log(u) < 0.5 * x * x + d * (1 - v + Math.log(v))) return d * v;
  }
  // Fallback — extremely rare with a real RNG. Return the deterministic mean.
  return shape;
}
199
+
200
/** Inputs to `synthesizeVerifier`. */
export type SynthesizeOptions = Readonly<{
  /** Starting verifier bodies; at least one is required. They form the initial tree. */
  seedCandidates: readonly string[];
  /** Labeled samples every candidate is scored against. */
  samples: readonly VerifierSample[];
  /** Produces a new verifier body from a candidate and its failures — usually LLM-backed. */
  refiner: RefinerFn;
  /** Upper bound on tree-search iterations. Default: 16 (paper's median is ~14). */
  maxIterations?: number;
  /** Heuristic value at which the search stops. Default: 1.0 (100% correct). */
  target?: number;
  /** Uniform random source for Thompson sampling. Default: Math.random. */
  rng?: () => number;
}>;
214
+
215
/** Outcome of one `synthesizeVerifier` run. */
export type SynthesizeResult = Readonly<{
  /** Highest-scoring candidate found. */
  best: VerifierCandidate;
  /** Number of refinement iterations performed (0 when a seed already met the target). */
  iterations: number;
  /** Whether `best` reached the target heuristic. */
  converged: boolean;
  /** Every scored candidate in creation order: seeds first, then refinements. */
  trajectory: readonly VerifierCandidate[];
}>;
221
+
222
/**
 * Run the tree search over verifier candidates.
 *
 * Seeds are scored first; if one already reaches `target` the search returns
 * with zero iterations. Otherwise each iteration Thompson-samples a parent
 * from the pool, asks the refiner for a replacement given the parent's
 * failing samples, scores the result, and adds it to the pool. The search
 * stops when a candidate reaches `target` or the iteration budget is spent.
 *
 * All randomness comes from `opts.rng`, so passing a deterministic `rng`
 * (with a deterministic refiner) makes the run reproducible.
 *
 * @param opts Seeds, samples, refiner and optional budget/target/rng.
 * @returns The best candidate, the number of iterations actually run,
 *   whether the target was reached, and every scored candidate in order.
 * @throws HarnessSynthesizerError when seeds or samples are empty, when a
 *   seed or a refined candidate does not compile, or when the refiner throws.
 */
export async function synthesizeVerifier(opts: SynthesizeOptions): Promise<SynthesizeResult> {
  const { seedCandidates, samples, refiner } = opts;
  if (seedCandidates.length === 0) {
    throw new HarnessSynthesizerError("at least one seed candidate is required");
  }
  if (samples.length === 0) {
    throw new HarnessSynthesizerError("at least one sample is required to score the verifier");
  }
  const target = opts.target ?? 1.0;
  const rng = opts.rng ?? Math.random;
  // Normalize the budget to the number of passes the loop really makes, so
  // the reported `iterations` is never negative, NaN or fractional.
  const requestedIterations = opts.maxIterations ?? 16;
  const maxIter = Number.isNaN(requestedIterations)
    ? 0
    : Math.max(0, Math.ceil(requestedIterations));

  // Thrown values are not guaranteed to be Error instances.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  // Scores one code string and keeps the samples it misjudged, so a parent's
  // failures never have to be recomputed by re-running its code.
  const evaluate = (
    id: string,
    code: string,
  ): { candidate: VerifierCandidate; failures: ReadonlyArray<VerifierSample> } => {
    const { verdicts, heuristic } = runVerifier(code, samples);
    const failures = samples.filter((sample, i) => verdicts[i] !== sample.expected);
    const correct = samples.length - failures.length;
    return {
      candidate: {
        id,
        code,
        score: heuristic,
        heuristic,
        // Beta starts uniform (1, 1); add the observed correct/incorrect counts.
        alpha: 1 + correct,
        beta: 1 + failures.length,
      },
      failures,
    };
  };

  // Initialize the candidate pool from seeds. `poolFailures[i]` belongs to `pool[i]`.
  const pool: VerifierCandidate[] = [];
  const poolFailures: ReadonlyArray<VerifierSample>[] = [];
  for (let i = 0; i < seedCandidates.length; i++) {
    const seeded = evaluate(`seed_${i}`, seedCandidates[i] as string);
    pool.push(seeded.candidate);
    poolFailures.push(seeded.failures);
  }
  const trajectory: VerifierCandidate[] = [...pool];

  // Early exit if a seed already satisfies the target. Ties keep the earlier seed.
  let best = pool.reduce((a, b) => (a.heuristic >= b.heuristic ? a : b));
  if (best.heuristic >= target) {
    return { best, iterations: 0, converged: true, trajectory };
  }

  for (let iter = 0; iter < maxIter; iter++) {
    const pickIdx = thompsonPick(pool, rng);
    const parent = pool[pickIdx] as VerifierCandidate;
    const failures = poolFailures[pickIdx] as ReadonlyArray<VerifierSample>;

    let newCode: string;
    try {
      newCode = await refiner(parent, failures);
    } catch (err) {
      throw new HarnessSynthesizerError(
        `refiner threw on iteration ${iter}: ${describeError(err)}`,
        err,
      );
    }

    let scored: { candidate: VerifierCandidate; failures: ReadonlyArray<VerifierSample> };
    try {
      scored = evaluate(`cand_${iter}`, newCode);
    } catch (err) {
      // Still fatal, as before, but say which iteration produced the bad code.
      if (err instanceof HarnessSynthesizerError) {
        throw new HarnessSynthesizerError(
          `refiner output on iteration ${iter} is not a usable verifier: ${err.message}`,
          err,
        );
      }
      throw err;
    }

    const child = scored.candidate;
    pool.push(child);
    poolFailures.push(scored.failures);
    trajectory.push(child);
    // Strict comparison: an equally good later candidate does not displace the current best.
    if (child.heuristic > best.heuristic) best = child;
    if (best.heuristic >= target) {
      return { best, iterations: iter + 1, converged: true, trajectory };
    }
  }
  return { best, iterations: maxIter, converged: false, trajectory };
}
299
+
300
/**
 * `MutationProvider` adapter so verifier search can drop into the
 * existing eval-optimizer-orchestrator loop. Each `next()` call runs a
 * complete, bounded `synthesizeVerifier` search from the configured seeds
 * and emits a prompt edit annotated with the winning verifier's id and
 * heuristic.
 *
 * Typical wiring (programmatic): construct this provider with the
 * spec's skill samples and a `refiner` function, then pass it to
 * `optimizeSpec({ mutator: new VerifierMutationProvider(...) })`.
 *
 * This class does not write anything to disk. The synthesized verifier
 * (including its code) is available through `lastResult` after each
 * `next()` call; persisting it is the caller's responsibility.
 * (CLI `--mutator verifier-synthesis` wiring is a follow-up; the CLI
 * today exposes `rule-based` and `claude` only.)
 */
export class VerifierMutationProvider implements MutationProvider {
  /**
   * Matches one or more annotations previously appended by this provider at
   * the very end of a prompt.
   */
  private static readonly annotationSuffix = /(?:\n\n\[verifier [^\]\n]+, h=[0-9.]+\])+$/;

  readonly name = "verifier-synthesis";
  /** Number of `next()` calls so far; only used in the rationale text. */
  private synthesisIterations = 0;
  private latestResult: SynthesizeResult | undefined;

  /**
   * @param samples Labeled samples each synthesized verifier is scored against.
   * @param refiner Produces refined verifier code during the inner search.
   * @param seedCandidates Starting verifier bodies for every inner search.
   * @param maxInnerIterations Iteration budget of each inner search (default 4).
   */
  constructor(
    private readonly samples: ReadonlyArray<VerifierSample>,
    private readonly refiner: RefinerFn,
    private readonly seedCandidates: ReadonlyArray<string>,
    private readonly maxInnerIterations: number = 4,
  ) {}

  /**
   * Result of the most recent completed `next()` call, or `undefined` before
   * the first one. Exposes the winning verifier's code, which the returned
   * mutation only references by id.
   */
  get lastResult(): SynthesizeResult | undefined {
    return this.latestResult;
  }

  /**
   * Runs one inner verifier search and returns the current best prompt with
   * a single trailing `[verifier <id>, h=<heuristic>]` annotation.
   *
   * @param state Optimizer state; only `state.best.prompt` is read.
   * @returns The annotated prompt, a `rephrase-instruction` mutation marker
   *   and a rationale describing the inner search outcome.
   * @throws HarnessSynthesizerError propagated from `synthesizeVerifier`.
   */
  async next(state: OptimizerState): Promise<ProviderMutation> {
    this.synthesisIterations++;
    const result = await synthesizeVerifier({
      seedCandidates: this.seedCandidates,
      samples: this.samples,
      refiner: this.refiner,
      maxIterations: this.maxInnerIterations,
    });
    this.latestResult = result;

    const heuristicText = result.best.heuristic.toFixed(3);
    const annotation = `\n\n[verifier ${result.best.id}, h=${heuristicText}]`;
    // If an earlier annotated prompt became the optimizer's best, remove the
    // old annotation(s) first so they do not pile up pass after pass.
    const basePrompt = state.best.prompt.replace(VerifierMutationProvider.annotationSuffix, "");
    return {
      prompt: basePrompt + annotation,
      mutations: [{ kind: "rephrase-instruction" }],
      rationale: `verifier-synthesis pass ${this.synthesisIterations}: ${result.best.id} reached heuristic ${heuristicText} in ${result.iterations} inner iterations${result.converged ? " (converged)" : ""}`,
    };
  }
}