@crewhaus/tool-harness-synthesizer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +42 -0
- package/src/index.test.ts +136 -0
- package/src/index.ts +340 -0
package/package.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@crewhaus/tool-harness-synthesizer",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "Track D / §55 — Thompson-sampled tree search over candidate verifier functions for skills and tools. Smaller LLM + synthesized verifier beats larger LLM (AutoHarness, arxiv 2603.03329).",
|
|
6
|
+
"main": "src/index.ts",
|
|
7
|
+
"types": "src/index.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": "./src/index.ts"
|
|
10
|
+
},
|
|
11
|
+
"scripts": {
|
|
12
|
+
"test": "bun test src"
|
|
13
|
+
},
|
|
14
|
+
"dependencies": {
|
|
15
|
+
"@crewhaus/errors": "0.0.0",
|
|
16
|
+
"@crewhaus/prompt-optimizer": "0.0.0"
|
|
17
|
+
},
|
|
18
|
+
"license": "Apache-2.0",
|
|
19
|
+
"author": {
|
|
20
|
+
"name": "Max Meier",
|
|
21
|
+
"email": "max@studiomax.io",
|
|
22
|
+
"url": "https://studiomax.io"
|
|
23
|
+
},
|
|
24
|
+
"repository": {
|
|
25
|
+
"type": "git",
|
|
26
|
+
"url": "git+https://github.com/crewhaus/factory.git",
|
|
27
|
+
"directory": "packages/tool-harness-synthesizer"
|
|
28
|
+
},
|
|
29
|
+
"homepage": "https://github.com/crewhaus/factory/tree/main/packages/tool-harness-synthesizer#readme",
|
|
30
|
+
"bugs": {
|
|
31
|
+
"url": "https://github.com/crewhaus/factory/issues"
|
|
32
|
+
},
|
|
33
|
+
"publishConfig": {
|
|
34
|
+
"access": "restricted"
|
|
35
|
+
},
|
|
36
|
+
"files": [
|
|
37
|
+
"src",
|
|
38
|
+
"README.md",
|
|
39
|
+
"LICENSE",
|
|
40
|
+
"NOTICE"
|
|
41
|
+
]
|
|
42
|
+
}
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import {
|
|
3
|
+
HarnessSynthesizerError,
|
|
4
|
+
type VerifierSample,
|
|
5
|
+
runVerifier,
|
|
6
|
+
synthesizeVerifier,
|
|
7
|
+
thompsonPick,
|
|
8
|
+
} from "./index";
|
|
9
|
+
|
|
10
|
+
// Labelled fixture shared by every suite below: outputs 0..4 with a null input,
// where a sample is expected to be accepted exactly when its output is even.
const evenSamples: VerifierSample[] = [0, 1, 2, 3, 4].map((output) => ({
  input: null,
  output,
  expected: output % 2 === 0,
}));
|
|
17
|
+
|
|
18
|
+
describe("runVerifier", () => {
  // Function body that accepts exactly the even numbers in the fixture.
  const isEvenBody = "return typeof output === 'number' && output % 2 === 0";

  test("scores a correct verifier at 1.0", () => {
    const { heuristic, errors, verdicts } = runVerifier(isEvenBody, evenSamples);
    expect(heuristic).toBe(1);
    expect(errors).toBe(0);
    expect(verdicts).toEqual([true, false, true, false, true]);
  });

  test("scores a constant-true verifier at the majority class", () => {
    // Three of the five samples are labelled true, so always-accept scores 3/5.
    const { heuristic } = runVerifier("return true", evenSamples);
    expect(heuristic).toBe(0.6);
  });

  test("captures runtime errors without throwing", () => {
    const { errors, heuristic } = runVerifier("throw new Error('boom')", evenSamples);
    expect(errors).toBe(5);
    // Each throw is recorded as a `false` verdict, which agrees with the two odd samples.
    expect(heuristic).toBe(0.4);
  });

  test("throws on uncompilable code", () => {
    const compileGarbage = () => runVerifier("not valid javascript {{{", evenSamples);
    expect(compileGarbage).toThrow(HarnessSynthesizerError);
  });
});
|
|
44
|
+
|
|
45
|
+
describe("thompsonPick", () => {
  // A constant RNG makes the sampler fully reproducible: for the shapes used in
  // this suite every gamma draw is accepted on its first attempt, so each Beta
  // sample is a fixed function of (alpha, beta).
  const constantRng = () => 0.5;

  test("returns 0 for a single candidate", () => {
    const only = { id: "x", code: "return true", score: 1, heuristic: 1, alpha: 10, beta: 1 };
    expect(thompsonPick([only], constantRng)).toBe(0);
  });

  test("favors high-heuristic candidates when sampling is biased", () => {
    const nodes = [
      { id: "a", code: "1", score: 0.1, heuristic: 0.1, alpha: 1, beta: 9 },
      { id: "b", code: "1", score: 0.9, heuristic: 0.9, alpha: 9, beta: 1 },
    ];
    // With rng ≡ 0.5 the posterior draws are ≈0.016 for Beta(1, 9) and ≈0.984
    // for Beta(9, 1), so the high-alpha candidate must win — asserting only
    // "some valid index" would not exercise the selection logic at all.
    expect(thompsonPick(nodes, constantRng)).toBe(1);
    // The winner is determined by the posterior, not by position in the list.
    expect(thompsonPick([...nodes].reverse(), constantRng)).toBe(0);
  });
});
|
|
74
|
+
|
|
75
|
+
describe("synthesizeVerifier", () => {
  // Verifier body that is fully correct for the even/odd fixture.
  const isEvenBody = "return typeof output === 'number' && output % 2 === 0";
  // Constant RNG so Thompson sampling picks reproducibly.
  const fixedRng = () => 0.5;

  test("returns immediately when a seed already meets target", async () => {
    // The seed is already perfect, so no refinement iteration should run.
    const outcome = await synthesizeVerifier({
      seedCandidates: [isEvenBody],
      samples: evenSamples,
      refiner: async () => "throw new Error('should not be called')",
      target: 1.0,
    });
    expect(outcome.converged).toBe(true);
    expect(outcome.iterations).toBe(0);
    expect(outcome.best.heuristic).toBe(1);
  });

  test("converges via refiner when seed is poor", async () => {
    // The always-accept seed scores 0.6; the refiner hands back the correct
    // verifier on its first call, which checks that the loop scores the
    // refiner's output and promotes it to `best`.
    const outcome = await synthesizeVerifier({
      seedCandidates: ["return true"],
      samples: evenSamples,
      refiner: async () => isEvenBody,
      target: 1.0,
      maxIterations: 3,
      rng: fixedRng,
    });
    expect(outcome.converged).toBe(true);
    expect(outcome.best.heuristic).toBe(1);
  });

  test("returns best-so-far when iterations exhaust", async () => {
    // Seed "return false" scores 0.4 and the refiner can only ever offer
    // "return true" (0.6), so the 1.0 target is unreachable.
    const outcome = await synthesizeVerifier({
      seedCandidates: ["return false"],
      samples: evenSamples,
      refiner: async () => "return true",
      target: 1.0,
      maxIterations: 3,
      rng: fixedRng,
    });
    expect(outcome.converged).toBe(false);
    expect(outcome.best.heuristic).toBeGreaterThanOrEqual(0.6);
  });

  test("throws on empty seed candidates", async () => {
    const attempt = synthesizeVerifier({
      seedCandidates: [],
      samples: evenSamples,
      refiner: async () => "return true",
    });
    await expect(attempt).rejects.toThrow(HarnessSynthesizerError);
  });

  test("throws on empty sample set", async () => {
    const attempt = synthesizeVerifier({
      seedCandidates: ["return true"],
      samples: [],
      refiner: async () => "return true",
    });
    await expect(attempt).rejects.toThrow(HarnessSynthesizerError);
  });
});
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Track D (§55) — `tool-harness-synthesizer`. Thompson-sampled tree
|
|
3
|
+
* search over candidate verifier functions for skills and tools.
|
|
4
|
+
*
|
|
5
|
+
* Source: AutoHarness (Lou et al., Google DeepMind, March 2026,
|
|
6
|
+
* arxiv 2603.03329). The paper's headline finding: a smaller LLM
|
|
7
|
+
* (Gemini-2.5-Flash) plus a synthesized code harness beats a larger
|
|
8
|
+
* LLM (Gemini-2.5-Pro) at near-zero inference cost. The trick is to
|
|
9
|
+
* have the LLM synthesize TWO functions iteratively, with the
|
|
10
|
+
* environment as critic:
|
|
11
|
+
*
|
|
12
|
+
* - `propose_action(obs)` — candidate generator
|
|
13
|
+
* - `is_legal_action(obs, action)` — verifier
|
|
14
|
+
*
|
|
15
|
+
* If the verifier returns `True` but the action is invalid, refine
|
|
16
|
+
* BOTH functions; if it returns `False` and the action is invalid,
|
|
17
|
+
* refine only the proposer. This split-refinement is the empirical
|
|
18
|
+
* winning move.
|
|
19
|
+
*
|
|
20
|
+
* In CrewHaus, the equivalent is to synthesize verifier code per
|
|
21
|
+
* skill or tool: an `is_valid_output(input, output)` function for any
|
|
22
|
+
* tool that has objective validity criteria. The verifier becomes a
|
|
23
|
+
* reusable artifact under `.crewhaus/verifiers/<name>.ts` and feeds
|
|
24
|
+
* into the `eval-optimizer-orchestrator` via a `MutationProvider`
|
|
25
|
+
* variant that proposes verifier-aware prompt edits.
|
|
26
|
+
*
|
|
27
|
+
* v0 ships:
|
|
28
|
+
* - `synthesizeVerifier(spec)` — pure tree search over candidate
|
|
29
|
+
* verifier code strings (the LLM call is supplied by the caller
|
|
30
|
+
* so this package stays pure)
|
|
31
|
+
* - `thompsonPick(nodes)` — Thompson sampling over tree nodes
|
|
32
|
+
* - `VerifierMutationProvider` — adapter to plug verifier search
|
|
33
|
+
* into the existing optimizer
|
|
34
|
+
*
|
|
35
|
+
* Cited paper: AutoHarness (arxiv 2603.03329, Lou et al., 2026-03).
|
|
36
|
+
*/
|
|
37
|
+
import { CrewhausError } from "@crewhaus/errors";
|
|
38
|
+
import type {
|
|
39
|
+
MutationProvider,
|
|
40
|
+
OptimizerState,
|
|
41
|
+
ProviderMutation,
|
|
42
|
+
} from "@crewhaus/prompt-optimizer";
|
|
43
|
+
|
|
44
|
+
/**
 * Error type raised by this module: verifier code that fails to compile, a
 * refiner that throws, and invalid arguments (empty seed list, empty sample
 * set, empty node list) are all reported through it.
 *
 * The literal `"config"` is always passed as the first argument to
 * `CrewhausError` — presumably its error category; confirm against
 * `@crewhaus/errors`.
 */
export class HarnessSynthesizerError extends CrewhausError {
  override readonly name = "HarnessSynthesizerError";

  /**
   * @param message Human-readable description of the failure.
   * @param cause Optional underlying error, forwarded unchanged to the base class.
   */
  constructor(message: string, cause?: unknown) {
    super("config", message, cause);
  }
}
|
|
50
|
+
|
|
51
|
+
/**
 * A single labelled example used to score a verifier.
 *
 * `expected` states whether a correct verifier should accept (`true`) or
 * reject (`false`) the `(input, output)` pair. A useful sample set contains
 * both labels: if every sample is `expected: true`, a verifier that
 * unconditionally returns `true` already scores perfectly.
 */
export type VerifierSample = {
  /** Value handed to the verifier as its `input` argument. */
  readonly input: unknown;
  /** Value handed to the verifier as its `output` argument. */
  readonly output: unknown;
  /** The verdict a correct verifier should produce for this pair. */
  readonly expected: boolean;
};
|
|
63
|
+
|
|
64
|
+
/**
 * A candidate verifier: its source text plus the statistics recorded when it
 * was last scored.
 *
 * `code` is the *body* of a function with parameters `(input, output)` whose
 * return value is coerced to a boolean. It is kept as a string so the search
 * can hand it back to the refiner for mutation.
 *
 * NOTE: `runVerifier` compiles the body with `new Function(...)`. That is not
 * a sandbox — the code runs with the privileges of the host process — so
 * candidates must come from a trusted source or be executed inside an
 * externally isolated environment.
 */
export type VerifierCandidate = {
  /** Identifier assigned by the search (`seed_<i>` for seeds, `cand_<iter>` for refinements). */
  readonly id: string;
  /** Function-body source with `input` and `output` in scope. */
  readonly code: string;
  /** Aggregate score of the candidate; `synthesizeVerifier` sets it equal to `heuristic`. */
  readonly score: number;
  /** AutoHarness's heuristic value — fraction of samples judged correctly, in [0, 1]. */
  readonly heuristic: number;
  /**
   * Beta posterior parameters consumed by Thompson sampling. `synthesizeVerifier`
   * initialises them as 1 + (rounded count of correct samples) and
   * 1 + (rounded count of incorrect samples) respectively.
   */
  readonly alpha: number;
  readonly beta: number;
};
|
|
82
|
+
|
|
83
|
+
/**
 * Refinement step of the search.
 *
 * Receives the candidate selected for improvement together with the samples
 * it currently judges incorrectly, and resolves to the source text of a
 * replacement verifier body. Production callers typically back this with a
 * model call; tests can use a deterministic stub — the contract is the same.
 */
export type RefinerFn = (
  current: VerifierCandidate,
  failures: readonly VerifierSample[],
) => Promise<string>;
|
|
93
|
+
|
|
94
|
+
/**
 * The Critic: compiles `code` and runs it against every sample.
 *
 * `code` must be the *body* of a function with parameters `(input, output)`;
 * nothing else (for example an expression that evaluates to a function) is
 * unwrapped. Whatever the body returns is coerced with `Boolean(...)`.
 *
 * A sample whose evaluation throws is recorded as a `false` verdict and
 * counted in `errors`; it still contributes to `heuristic` like any other
 * `false` verdict.
 *
 * @param code Function-body source of the verifier.
 * @param samples Labelled samples to score against.
 * @returns `verdicts` (one boolean per sample, in order), `heuristic` (fraction
 *   of verdicts equal to `expected`, or 0 for an empty sample list) and
 *   `errors` (number of samples on which the verifier threw).
 * @throws HarnessSynthesizerError if `code` cannot be compiled.
 */
export function runVerifier(
  code: string,
  samples: ReadonlyArray<VerifierSample>,
): {
  readonly verdicts: ReadonlyArray<boolean>;
  readonly heuristic: number;
  readonly errors: number;
} {
  let fn: (input: unknown, output: unknown) => unknown;
  try {
    // SECURITY: `new Function` evaluates `code` with full access to the host
    // realm — it is NOT a sandbox. Refiner output is typically model-generated,
    // so run this only on trusted code or inside an isolated process.
    fn = new Function("input", "output", code) as (input: unknown, output: unknown) => unknown;
  } catch (err) {
    // `err` is `unknown`; narrow instead of asserting it is an Error.
    const detail = err instanceof Error ? err.message : String(err);
    throw new HarnessSynthesizerError(`verifier code did not compile: ${detail}`, err);
  }

  const verdicts: boolean[] = [];
  let correct = 0;
  let errors = 0;
  for (const sample of samples) {
    let verdict = false;
    try {
      verdict = Boolean(fn(sample.input, sample.output));
    } catch {
      // A throwing verifier is treated as a rejection of the sample.
      errors++;
    }
    verdicts.push(verdict);
    if (verdict === sample.expected) correct++;
  }

  const heuristic = samples.length === 0 ? 0 : correct / samples.length;
  return { verdicts, heuristic, errors };
}
|
|
133
|
+
|
|
134
|
+
/**
 * Thompson-sampling selection over a list of candidates.
 *
 * Draws one value from each node's Beta(`alpha`, `beta`) posterior and returns
 * the index of the node with the largest draw. Ties, and draws that are not
 * strictly greater than the current maximum (including NaN), keep the earlier
 * index; index 0 is returned if no draw ever exceeds negative infinity.
 *
 * @param nodes Candidates to choose between; must be non-empty.
 * @param rng Source of uniform randoms; defaults to `Math.random`.
 * @returns Index into `nodes` of the selected candidate.
 * @throws HarnessSynthesizerError if `nodes` is empty.
 */
export function thompsonPick(
  nodes: ReadonlyArray<VerifierCandidate>,
  rng: () => number = Math.random,
): number {
  if (nodes.length === 0) throw new HarnessSynthesizerError("thompsonPick called on empty list");

  let winner = 0;
  let highestDraw = Number.NEGATIVE_INFINITY;
  nodes.forEach((node, idx) => {
    // Defensive: tolerate undefined slots instead of dereferencing them.
    if (node === undefined) return;
    const draw = betaSample(node.alpha, node.beta, rng);
    if (draw > highestDraw) {
      highestDraw = draw;
      winner = idx;
    }
  });
  return winner;
}
|
|
158
|
+
|
|
159
|
+
/**
 * Draws from Beta(a, b) as X / (X + Y), where X and Y are independent gamma
 * draws with shapes `a` and `b`. The result is reproducible only when the
 * supplied `rng` is.
 *
 * The gamma draw for `a` consumes randoms from `rng` before the draw for `b`.
 * If both gamma draws are exactly 0 the quotient is 0/0, i.e. NaN.
 */
function betaSample(a: number, b: number, rng: () => number): number {
  const fromA = gammaSample(a, rng);
  const fromB = gammaSample(b, rng);
  const total = fromA + fromB;
  return fromA / total;
}
|
|
169
|
+
|
|
170
|
+
/**
 * Draws from a Gamma(shape, 1) distribution using the Marsaglia–Tsang
 * squeeze/rejection method, with standard normals produced by Box–Muller.
 *
 * For `shape < 1` the draw is obtained from a Gamma(shape + 1) draw multiplied
 * by U^(1/shape) (the boosting identity from the same Marsaglia–Tsang paper).
 *
 * The rejection loop is bounded: at most 64 proposals are made, counting
 * proposals discarded because `1 + c·x <= 0`. If none is accepted — which
 * requires a degenerate or adversarial `rng` — the distribution mean (`shape`)
 * is returned instead of looping forever.
 *
 * @param shape Gamma shape parameter.
 * @param rng Source of uniform randoms, expected in [0, 1).
 * @returns A gamma variate, or `shape` if every proposal was rejected.
 */
function gammaSample(shape: number, rng: () => number): number {
  if (shape < 1) {
    const boosted = gammaSample(shape + 1, rng);
    // Clamp away from 0 so the power below never collapses through log(0)-like behaviour.
    const u = Math.max(rng(), 1e-12);
    return boosted * u ** (1 / shape);
  }

  const d = shape - 1 / 3;
  const c = 1 / Math.sqrt(9 * d);
  const maxAttempts = 64;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const u1 = Math.max(rng(), 1e-12);
    const u2 = Math.max(rng(), 1e-12);
    // Box-Muller for a standard normal.
    const x = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
    const t = 1 + c * x;
    // Proposal outside the support: discard it, but charge it against the
    // attempt budget so a deterministic rng cannot spin here indefinitely.
    if (t <= 0) continue;
    const v = t * t * t;
    const u = rng();
    // Cheap squeeze test first, exact log test second.
    if (u < 1 - 0.0331 * x * x * x * x) return d * v;
    if (Math.log(u) < 0.5 * x * x + d * (1 - v + Math.log(v))) return d * v;
  }
  // Fallback — extremely rare with a real RNG. Return the distribution mean.
  return shape;
}
|
|
199
|
+
|
|
200
|
+
/** Inputs accepted by `synthesizeVerifier`. */
export type SynthesizeOptions = {
  /** Starting verifier bodies that form the initial pool. Must contain at least one entry. */
  readonly seedCandidates: readonly string[];
  /** Labelled samples every candidate is scored against. Must contain at least one entry. */
  readonly samples: readonly VerifierSample[];
  /** Produces a new verifier body from a candidate and its failures — usually LLM-backed. */
  readonly refiner: RefinerFn;
  /**
   * Upper bound on refinement iterations. Default: 16 (the original author
   * cites a median of ~14 in the paper; not verified here).
   */
  readonly maxIterations?: number;
  /** Heuristic value at which the search stops early. Default: 1.0 (100% correct). */
  readonly target?: number;
  /** Uniform random source used for Thompson sampling. Default: `Math.random`. */
  readonly rng?: () => number;
};
|
|
214
|
+
|
|
215
|
+
/** Outcome of a `synthesizeVerifier` run. */
export type SynthesizeResult = {
  /** Highest-heuristic candidate seen during the run. */
  readonly best: VerifierCandidate;
  /** Number of refinement iterations reported for the run (0 when a seed already met the target). */
  readonly iterations: number;
  /** `true` when `best` reached the target heuristic. */
  readonly converged: boolean;
  /** Every candidate scored during the run: seeds first, then refinements in creation order. */
  readonly trajectory: readonly VerifierCandidate[];
};
|
|
221
|
+
|
|
222
|
+
/**
 * Scores one verifier body and packages it as a pool entry, together with the
 * samples it judged incorrectly (kept so the parent never has to be re-run).
 */
function scoreCandidate(
  id: string,
  code: string,
  samples: ReadonlyArray<VerifierSample>,
): { readonly candidate: VerifierCandidate; readonly failures: ReadonlyArray<VerifierSample> } {
  const { verdicts, heuristic } = runVerifier(code, samples);
  const failures = samples.filter((sample, i) => verdicts[i] !== sample.expected);
  const total = samples.length;
  return {
    candidate: {
      id,
      code,
      score: heuristic,
      heuristic,
      // Uniform Beta(1, 1) prior plus the observed correct / incorrect counts.
      alpha: 1 + Math.round(heuristic * total),
      beta: 1 + Math.round((1 - heuristic) * total),
    },
    failures,
  };
}

/**
 * Runs the Thompson-sampled search over verifier candidates.
 *
 * 1. Every seed is scored; if the best seed already meets `target` the search
 *    returns immediately with `iterations: 0`.
 * 2. Otherwise, up to `maxIterations` times: pick a parent with
 *    `thompsonPick`, pass it and its failing samples to the refiner, score the
 *    returned code, and add it to the pool. The search stops as soon as the
 *    best candidate meets `target`.
 *
 * Each candidate's Beta parameters are fixed when it is scored; they are not
 * updated afterwards. Randomness comes only from `rng`, so supplying a
 * deterministic `rng` (and refiner) makes the run reproducible.
 *
 * @param opts Seeds, samples, refiner and optional limits — see `SynthesizeOptions`.
 * @returns The best candidate, the number of refinement iterations actually
 *   executed, whether the target was reached, and every candidate scored.
 * @throws HarnessSynthesizerError if there are no seeds or no samples, if any
 *   seed or refined code fails to compile (propagated from `runVerifier`), or
 *   if the refiner rejects (the original error is attached as the cause).
 */
export async function synthesizeVerifier(opts: SynthesizeOptions): Promise<SynthesizeResult> {
  if (opts.seedCandidates.length === 0) {
    throw new HarnessSynthesizerError("at least one seed candidate is required");
  }
  if (opts.samples.length === 0) {
    throw new HarnessSynthesizerError("at least one sample is required to score the verifier");
  }
  const target = opts.target ?? 1.0;
  const rng = opts.rng ?? Math.random;
  const maxIter = opts.maxIterations ?? 16;

  // Initialize the candidate pool from seeds. `failuresByIndex[i]` holds the
  // samples that `pool[i]` gets wrong.
  const pool: VerifierCandidate[] = [];
  const failuresByIndex: ReadonlyArray<VerifierSample>[] = [];
  for (const [i, code] of opts.seedCandidates.entries()) {
    const scored = scoreCandidate(`seed_${i}`, code, opts.samples);
    pool.push(scored.candidate);
    failuresByIndex.push(scored.failures);
  }
  const trajectory: VerifierCandidate[] = [...pool];

  // Early exit if a seed already satisfies the target. `>=` keeps the
  // earliest seed on ties.
  let best = pool.reduce((a, b) => (a.heuristic >= b.heuristic ? a : b));
  if (best.heuristic >= target) {
    return { best, iterations: 0, converged: true, trajectory };
  }

  // Count iterations actually executed so the result stays truthful even when
  // `maxIterations` is fractional, negative, or NaN.
  let completed = 0;
  for (let iter = 0; iter < maxIter; iter++) {
    const pickIdx = thompsonPick(pool, rng);
    const parent = pool[pickIdx];
    const failures = failuresByIndex[pickIdx];
    if (parent === undefined || failures === undefined) {
      // Unreachable while thompsonPick returns an in-range index.
      throw new HarnessSynthesizerError(`thompsonPick returned out-of-range index ${pickIdx}`);
    }

    let newCode: string;
    try {
      newCode = await opts.refiner(parent, failures);
    } catch (err) {
      const detail = err instanceof Error ? err.message : String(err);
      throw new HarnessSynthesizerError(`refiner threw on iteration ${iter}: ${detail}`, err);
    }

    const child = scoreCandidate(`cand_${iter}`, newCode, opts.samples);
    pool.push(child.candidate);
    failuresByIndex.push(child.failures);
    trajectory.push(child.candidate);
    completed = iter + 1;

    if (child.candidate.heuristic > best.heuristic) best = child.candidate;
    if (best.heuristic >= target) {
      return { best, iterations: completed, converged: true, trajectory };
    }
  }
  return { best, iterations: completed, converged: false, trajectory };
}
|
|
299
|
+
|
|
300
|
+
/**
 * `MutationProvider` adapter that lets verifier synthesis plug into an
 * optimizer loop expecting prompt mutations.
 *
 * Each `next()` call runs a complete `synthesizeVerifier` search — starting
 * again from the same seeds, capped at `maxInnerIterations`, and using the
 * default RNG — and then returns the current best prompt with a one-line
 * annotation naming the winning verifier and its heuristic.
 *
 * Only the verifier's id and heuristic are surfaced; the verifier source is
 * not included in the mutation and this class performs no persistence.
 * NOTE(review): the intended wiring appears to be
 * `optimizeSpec({ mutator: new VerifierMutationProvider(...) })`, with
 * persistence under `.crewhaus/verifiers/` and CLI exposure handled
 * elsewhere — confirm against the orchestrator.
 */
export class VerifierMutationProvider implements MutationProvider {
  readonly name = "verifier-synthesis";
  /** Number of times `next()` has been invoked on this instance. */
  private synthesisIterations = 0;

  /**
   * @param samples Labelled samples used to score candidate verifiers.
   * @param refiner Function that proposes a new verifier body from a candidate and its failures.
   * @param seedCandidates Verifier bodies every synthesis pass starts from.
   * @param maxInnerIterations Iteration cap for each synthesis pass (default 4).
   */
  constructor(
    private readonly samples: ReadonlyArray<VerifierSample>,
    private readonly refiner: RefinerFn,
    private readonly seedCandidates: ReadonlyArray<string>,
    private readonly maxInnerIterations: number = 4,
  ) {}

  /**
   * Runs one synthesis pass and describes its outcome as a prompt mutation.
   *
   * @param state Optimizer state; only `state.best.prompt` is read.
   * @returns The best prompt with the verifier annotation appended, a single
   *   `rephrase-instruction` mutation marker, and a rationale summarising the pass.
   */
  async next(state: OptimizerState): Promise<ProviderMutation> {
    this.synthesisIterations += 1;

    const { best, iterations, converged } = await synthesizeVerifier({
      seedCandidates: this.seedCandidates,
      samples: this.samples,
      refiner: this.refiner,
      maxIterations: this.maxInnerIterations,
    });

    const shownHeuristic = best.heuristic.toFixed(3);
    const annotation = `\n\n[verifier ${best.id}, h=${shownHeuristic}]`;
    const convergedSuffix = converged ? " (converged)" : "";
    const rationale = `verifier-synthesis pass ${this.synthesisIterations}: ${best.id} reached heuristic ${shownHeuristic} in ${iterations} inner iterations${convergedSuffix}`;

    return {
      prompt: state.best.prompt + annotation,
      mutations: [{ kind: "rephrase-instruction" }],
      rationale,
    };
  }
}
|