@tangle-network/agent-eval 0.41.0 → 0.43.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/index.js +2 -2
- package/dist/builder-eval/index.js +1 -1
- package/dist/campaign/index.d.ts +90 -368
- package/dist/campaign/index.js +74 -4
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
- package/dist/chunk-H4TOS272.js.map +1 -0
- package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
- package/dist/chunk-KQ26DYTQ.js.map +1 -0
- package/dist/{chunk-6QDKWHLS.js → chunk-MHQPVHXU.js} +2 -2
- package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
- package/dist/chunk-MNL6LXGQ.js.map +1 -0
- package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
- package/dist/chunk-N4SBKEPJ.js.map +1 -0
- package/dist/chunk-NSBPE2FW.js +17 -0
- package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
- package/dist/{chunk-YNMCYUWT.js → chunk-RXK7FXLV.js} +92 -37
- package/dist/chunk-RXK7FXLV.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +1 -1
- package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
- package/dist/governance/index.js +1 -1
- package/dist/index.d.ts +227 -687
- package/dist/index.js +755 -1239
- package/dist/index.js.map +1 -1
- package/dist/integrity-CTDhR1Sg.d.ts +81 -0
- package/dist/knowledge/index.js +1 -1
- package/dist/llm-client-BXVRUZyX.d.ts +234 -0
- package/dist/matrix/index.js +1 -1
- package/dist/meta-eval/index.js +1 -1
- package/dist/multishot/index.js +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +68 -4
- package/dist/pipelines/index.js.map +1 -1
- package/dist/prm/index.js +1 -1
- package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
- package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
- package/dist/reporting.d.ts +2 -3
- package/dist/reporting.js +5 -9
- package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
- package/dist/rl.d.ts +103 -221
- package/dist/rl.js +45 -200
- package/dist/rl.js.map +1 -1
- package/dist/{run-campaign-KEJK5KFT.js → run-campaign-GNDO66B4.js} +3 -3
- package/dist/sequential-DdV5ShjT.d.ts +561 -0
- package/dist/telemetry/file.js +1 -1
- package/dist/telemetry/index.js +1 -1
- package/dist/traces.d.ts +3 -2
- package/dist/traces.js +6 -6
- package/dist/types-BLbRTxoc.d.ts +367 -0
- package/dist/wire/index.d.ts +1 -1
- package/dist/wire/index.js +1 -1
- package/package.json +26 -17
- package/dist/chunk-5U2DOJU4.js.map +0 -1
- package/dist/chunk-AU2JLNSZ.js.map +0 -1
- package/dist/chunk-DMW5VENN.js +0 -1412
- package/dist/chunk-DMW5VENN.js.map +0 -1
- package/dist/chunk-EGIPWXHL.js.map +0 -1
- package/dist/chunk-MAZ26DC7.js +0 -99
- package/dist/chunk-MAZ26DC7.js.map +0 -1
- package/dist/chunk-NKLGKF2Q.js.map +0 -1
- package/dist/chunk-PZ5AY32C.js +0 -10
- package/dist/chunk-YNMCYUWT.js.map +0 -1
- package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
- package/dist/optimization.d.ts +0 -11
- package/dist/optimization.js +0 -71
- package/dist/run-campaign-KEJK5KFT.js.map +0 -1
- package/dist/sequential-5iSVfzl2.d.ts +0 -139
- package/dist/summary-report-DuZXOk7K.d.ts +0 -917
- /package/dist/{chunk-6QDKWHLS.js.map → chunk-MHQPVHXU.js.map} +0 -0
- /package/dist/{chunk-PZ5AY32C.js.map → chunk-NSBPE2FW.js.map} +0 -0
- /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
- /package/dist/{optimization.js.map → run-campaign-GNDO66B4.js.map} +0 -0
package/dist/chunk-DMW5VENN.js
DELETED
|
@@ -1,1412 +0,0 @@
|
|
|
1
|
-
import {
|
|
2
|
-
validateRunRecord
|
|
3
|
-
} from "./chunk-BWZEGTES.js";
|
|
4
|
-
import {
|
|
5
|
-
pairedBootstrap,
|
|
6
|
-
wilcoxonSignedRank
|
|
7
|
-
} from "./chunk-WP7SY7AI.js";
|
|
8
|
-
|
|
9
|
-
// src/feedback-trajectory.ts
|
|
10
|
-
var DEFAULT_SPLIT_POLICY = {
|
|
11
|
-
trainPct: 70,
|
|
12
|
-
devPct: 15,
|
|
13
|
-
testPct: 10,
|
|
14
|
-
holdoutPct: 5
|
|
15
|
-
};
|
|
16
|
-
var InMemoryFeedbackTrajectoryStore = class {
|
|
17
|
-
trajectories = /* @__PURE__ */ new Map();
|
|
18
|
-
async save(trajectory) {
|
|
19
|
-
this.trajectories.set(trajectory.id, cloneTrajectory(trajectory));
|
|
20
|
-
}
|
|
21
|
-
async get(id) {
|
|
22
|
-
const trajectory = this.trajectories.get(id);
|
|
23
|
-
return trajectory ? cloneTrajectory(trajectory) : null;
|
|
24
|
-
}
|
|
25
|
-
async list(filter = {}) {
|
|
26
|
-
return [...this.trajectories.values()].filter((trajectory) => matchesFilter(trajectory, filter)).map(cloneTrajectory);
|
|
27
|
-
}
|
|
28
|
-
async appendAttempt(id, attempt) {
|
|
29
|
-
const trajectory = this.trajectories.get(id);
|
|
30
|
-
if (!trajectory)
|
|
31
|
-
throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
|
|
32
|
-
const next = cloneTrajectory({
|
|
33
|
-
...trajectory,
|
|
34
|
-
attempts: [...trajectory.attempts, attempt],
|
|
35
|
-
updatedAt: attempt.createdAt
|
|
36
|
-
});
|
|
37
|
-
this.trajectories.set(id, next);
|
|
38
|
-
return cloneTrajectory(next);
|
|
39
|
-
}
|
|
40
|
-
async appendLabel(id, label, attemptId) {
|
|
41
|
-
const trajectory = this.trajectories.get(id);
|
|
42
|
-
if (!trajectory)
|
|
43
|
-
throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
|
|
44
|
-
const attempts = attemptId ? trajectory.attempts.map(
|
|
45
|
-
(attempt) => attempt.id === attemptId ? { ...attempt, feedback: [...attempt.feedback ?? [], label] } : attempt
|
|
46
|
-
) : trajectory.attempts;
|
|
47
|
-
const next = cloneTrajectory({
|
|
48
|
-
...trajectory,
|
|
49
|
-
attempts,
|
|
50
|
-
labels: attemptId ? trajectory.labels : [...trajectory.labels, label],
|
|
51
|
-
updatedAt: label.createdAt
|
|
52
|
-
});
|
|
53
|
-
this.trajectories.set(id, next);
|
|
54
|
-
return cloneTrajectory(next);
|
|
55
|
-
}
|
|
56
|
-
};
|
|
57
|
-
var FileSystemFeedbackTrajectoryStore = class {
|
|
58
|
-
dir;
|
|
59
|
-
memory = new InMemoryFeedbackTrajectoryStore();
|
|
60
|
-
loaded = false;
|
|
61
|
-
constructor(options) {
|
|
62
|
-
this.dir = options.dir;
|
|
63
|
-
}
|
|
64
|
-
async save(trajectory) {
|
|
65
|
-
await this.load();
|
|
66
|
-
await this.memory.save(trajectory);
|
|
67
|
-
await this.append({ op: "save", trajectory });
|
|
68
|
-
}
|
|
69
|
-
async get(id) {
|
|
70
|
-
await this.load();
|
|
71
|
-
return this.memory.get(id);
|
|
72
|
-
}
|
|
73
|
-
async list(filter = {}) {
|
|
74
|
-
await this.load();
|
|
75
|
-
return this.memory.list(filter);
|
|
76
|
-
}
|
|
77
|
-
async appendAttempt(id, attempt) {
|
|
78
|
-
await this.load();
|
|
79
|
-
const next = await this.memory.appendAttempt(id, attempt);
|
|
80
|
-
await this.append({ op: "appendAttempt", id, attempt });
|
|
81
|
-
return next;
|
|
82
|
-
}
|
|
83
|
-
async appendLabel(id, label, attemptId) {
|
|
84
|
-
await this.load();
|
|
85
|
-
const next = await this.memory.appendLabel(id, label, attemptId);
|
|
86
|
-
await this.append({ op: "appendLabel", id, label, attemptId });
|
|
87
|
-
return next;
|
|
88
|
-
}
|
|
89
|
-
async append(record) {
|
|
90
|
-
const { appendFile, mkdir } = await import("fs/promises");
|
|
91
|
-
const { join } = await import("path");
|
|
92
|
-
await mkdir(this.dir, { recursive: true });
|
|
93
|
-
await appendFile(
|
|
94
|
-
join(this.dir, "feedback-trajectories.ndjson"),
|
|
95
|
-
`${JSON.stringify(record)}
|
|
96
|
-
`,
|
|
97
|
-
"utf8"
|
|
98
|
-
);
|
|
99
|
-
}
|
|
100
|
-
async load() {
|
|
101
|
-
if (this.loaded) return;
|
|
102
|
-
const { readFile } = await import("fs/promises");
|
|
103
|
-
const { join } = await import("path");
|
|
104
|
-
const file = join(this.dir, "feedback-trajectories.ndjson");
|
|
105
|
-
try {
|
|
106
|
-
const raw = await readFile(file, "utf8");
|
|
107
|
-
for (const line of raw.split("\n")) {
|
|
108
|
-
if (!line.trim()) continue;
|
|
109
|
-
try {
|
|
110
|
-
const record = JSON.parse(line);
|
|
111
|
-
if (record.op === "save") await this.memory.save(record.trajectory);
|
|
112
|
-
if (record.op === "appendAttempt")
|
|
113
|
-
await this.memory.appendAttempt(record.id, record.attempt);
|
|
114
|
-
if (record.op === "appendLabel")
|
|
115
|
-
await this.memory.appendLabel(record.id, record.label, record.attemptId);
|
|
116
|
-
} catch {
|
|
117
|
-
}
|
|
118
|
-
}
|
|
119
|
-
} catch {
|
|
120
|
-
}
|
|
121
|
-
this.loaded = true;
|
|
122
|
-
}
|
|
123
|
-
};
|
|
124
|
-
function createFeedbackTrajectory(input) {
|
|
125
|
-
const createdAt = input.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
|
|
126
|
-
const id = input.id ?? `ft_${stableHash(`${input.projectId ?? ""}|${input.scenarioId ?? ""}|${input.task.intent}|${createdAt}`).toString(16)}`;
|
|
127
|
-
return {
|
|
128
|
-
id,
|
|
129
|
-
projectId: input.projectId,
|
|
130
|
-
scenarioId: input.scenarioId,
|
|
131
|
-
task: input.task,
|
|
132
|
-
attempts: input.attempts ?? [],
|
|
133
|
-
labels: input.labels ?? [],
|
|
134
|
-
outcome: input.outcome,
|
|
135
|
-
split: input.split,
|
|
136
|
-
tags: input.tags,
|
|
137
|
-
createdAt,
|
|
138
|
-
metadata: input.metadata
|
|
139
|
-
};
|
|
140
|
-
}
|
|
141
|
-
function assignFeedbackSplit(trajectory, policy = {}) {
|
|
142
|
-
const split = { ...DEFAULT_SPLIT_POLICY, ...policy };
|
|
143
|
-
const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
|
|
144
|
-
if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
|
|
145
|
-
const bucket = stableHash(
|
|
146
|
-
`${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`
|
|
147
|
-
) % total;
|
|
148
|
-
if (bucket < split.trainPct) return "train";
|
|
149
|
-
if (bucket < split.trainPct + split.devPct) return "dev";
|
|
150
|
-
if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
|
|
151
|
-
return "holdout";
|
|
152
|
-
}
|
|
153
|
-
function withAssignedFeedbackSplit(trajectory, policy) {
|
|
154
|
-
return {
|
|
155
|
-
...trajectory,
|
|
156
|
-
split: trajectory.split ?? assignFeedbackSplit(trajectory, policy)
|
|
157
|
-
};
|
|
158
|
-
}
|
|
159
|
-
function feedbackTrajectoryToDatasetScenario(trajectory) {
|
|
160
|
-
const withSplit = withAssignedFeedbackSplit(trajectory);
|
|
161
|
-
return {
|
|
162
|
-
id: withSplit.scenarioId ?? withSplit.id,
|
|
163
|
-
split: withSplit.split,
|
|
164
|
-
payload: withSplit,
|
|
165
|
-
tags: {
|
|
166
|
-
...withSplit.projectId ? { projectId: withSplit.projectId } : {},
|
|
167
|
-
...withSplit.tags ?? {},
|
|
168
|
-
source: "feedback-trajectory"
|
|
169
|
-
}
|
|
170
|
-
};
|
|
171
|
-
}
|
|
172
|
-
function feedbackTrajectoriesToDatasetScenarios(trajectories) {
|
|
173
|
-
return trajectories.map(feedbackTrajectoryToDatasetScenario);
|
|
174
|
-
}
|
|
175
|
-
function feedbackTrajectoryToOptimizerRow(trajectory) {
|
|
176
|
-
const labels = allLabels(trajectory);
|
|
177
|
-
return {
|
|
178
|
-
scenarioId: trajectory.scenarioId ?? trajectory.id,
|
|
179
|
-
trajectoryId: trajectory.id,
|
|
180
|
-
labelKinds: [...new Set(labels.map((label) => label.kind))],
|
|
181
|
-
score: trajectory.outcome?.score ?? scoreFromLabels(labels),
|
|
182
|
-
metadata: {
|
|
183
|
-
projectId: trajectory.projectId,
|
|
184
|
-
split: trajectory.split,
|
|
185
|
-
intent: trajectory.task.intent,
|
|
186
|
-
attempts: trajectory.attempts.length,
|
|
187
|
-
outcome: trajectory.outcome,
|
|
188
|
-
labels
|
|
189
|
-
}
|
|
190
|
-
};
|
|
191
|
-
}
|
|
192
|
-
function feedbackTrajectoriesToOptimizerRows(trajectories) {
|
|
193
|
-
return trajectories.map(feedbackTrajectoryToOptimizerRow);
|
|
194
|
-
}
|
|
195
|
-
async function replayFeedbackTrajectory(trajectory, adapter) {
|
|
196
|
-
try {
|
|
197
|
-
const result = await adapter.replay(trajectory);
|
|
198
|
-
return {
|
|
199
|
-
trajectoryId: trajectory.id,
|
|
200
|
-
...result
|
|
201
|
-
};
|
|
202
|
-
} catch (err) {
|
|
203
|
-
const createdAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
204
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
205
|
-
return {
|
|
206
|
-
trajectoryId: trajectory.id,
|
|
207
|
-
pass: false,
|
|
208
|
-
labels: [
|
|
209
|
-
{
|
|
210
|
-
source: "system",
|
|
211
|
-
kind: "reject",
|
|
212
|
-
value: false,
|
|
213
|
-
reason: message,
|
|
214
|
-
severity: "error",
|
|
215
|
-
createdAt
|
|
216
|
-
}
|
|
217
|
-
],
|
|
218
|
-
outcome: {
|
|
219
|
-
success: false,
|
|
220
|
-
score: 0,
|
|
221
|
-
detail: message,
|
|
222
|
-
observedAt: createdAt
|
|
223
|
-
},
|
|
224
|
-
metadata: { replayError: true }
|
|
225
|
-
};
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
async function replayFeedbackTrajectories(trajectories, adapter) {
|
|
229
|
-
const results = [];
|
|
230
|
-
for (const trajectory of trajectories) {
|
|
231
|
-
results.push(await replayFeedbackTrajectory(trajectory, adapter));
|
|
232
|
-
}
|
|
233
|
-
return results;
|
|
234
|
-
}
|
|
235
|
-
function summarizePreferenceMemory(trajectories, options = {}) {
|
|
236
|
-
const maxEntries = options.maxEntries ?? 20;
|
|
237
|
-
const entries = [];
|
|
238
|
-
for (const trajectory of trajectories) {
|
|
239
|
-
for (const label of allLabels(trajectory)) {
|
|
240
|
-
const instruction = instructionFromLabel(trajectory, label);
|
|
241
|
-
if (!instruction) continue;
|
|
242
|
-
entries.push({
|
|
243
|
-
instruction,
|
|
244
|
-
rationale: label.reason ?? `${label.kind} label from ${label.source}`,
|
|
245
|
-
weight: weightForLabel(label),
|
|
246
|
-
sourceTrajectoryId: trajectory.id,
|
|
247
|
-
sourceLabelId: label.id,
|
|
248
|
-
category: label.kind
|
|
249
|
-
});
|
|
250
|
-
}
|
|
251
|
-
}
|
|
252
|
-
const byInstruction = /* @__PURE__ */ new Map();
|
|
253
|
-
for (const entry of entries) {
|
|
254
|
-
const key = entry.instruction.toLowerCase().replace(/\s+/g, " ").trim();
|
|
255
|
-
const existing = byInstruction.get(key);
|
|
256
|
-
if (!existing || entry.weight > existing.weight) byInstruction.set(key, entry);
|
|
257
|
-
}
|
|
258
|
-
return [...byInstruction.values()].sort((a, b) => b.weight - a.weight).slice(0, maxEntries);
|
|
259
|
-
}
|
|
260
|
-
function renderPreferenceMemoryMarkdown(entries) {
|
|
261
|
-
const lines = ["# Preference Memory", ""];
|
|
262
|
-
for (const entry of entries) {
|
|
263
|
-
lines.push(`- ${entry.instruction}`);
|
|
264
|
-
lines.push(` Rationale: ${entry.rationale}`);
|
|
265
|
-
lines.push(` Source: ${entry.sourceTrajectoryId}`);
|
|
266
|
-
lines.push("");
|
|
267
|
-
}
|
|
268
|
-
return `${lines.join("\n").trim()}
|
|
269
|
-
`;
|
|
270
|
-
}
|
|
271
|
-
function serializeFeedbackTrajectoriesJsonl(trajectories) {
|
|
272
|
-
return `${trajectories.slice().sort((a, b) => a.id.localeCompare(b.id)).map((trajectory) => JSON.stringify(canonicalize(trajectory))).join("\n")}
|
|
273
|
-
`;
|
|
274
|
-
}
|
|
275
|
-
function parseFeedbackTrajectoriesJsonl(jsonl) {
|
|
276
|
-
const trajectories = [];
|
|
277
|
-
for (const line of jsonl.split("\n")) {
|
|
278
|
-
if (!line.trim()) continue;
|
|
279
|
-
trajectories.push(JSON.parse(line));
|
|
280
|
-
}
|
|
281
|
-
return trajectories;
|
|
282
|
-
}
|
|
283
|
-
function controlRunToFeedbackTrajectory(run, options = {}) {
|
|
284
|
-
const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
|
|
285
|
-
const trajectoryId = run.runId ?? `ft_control_${stableHash(`${run.intent}|${createdAt}`).toString(16)}`;
|
|
286
|
-
return createFeedbackTrajectory({
|
|
287
|
-
id: trajectoryId,
|
|
288
|
-
projectId: options.projectId,
|
|
289
|
-
scenarioId: options.scenarioId,
|
|
290
|
-
task: { intent: run.intent },
|
|
291
|
-
createdAt,
|
|
292
|
-
attempts: run.steps.map((step) => ({
|
|
293
|
-
id: `${trajectoryId}_step_${step.index}`,
|
|
294
|
-
stepIndex: step.index,
|
|
295
|
-
artifactType: options.artifactType ?? "action",
|
|
296
|
-
artifact: options.artifactFromStep?.(step) ?? step.actionOutcome?.result ?? step.decision,
|
|
297
|
-
proposedAction: options.proposedActionFromStep?.(step),
|
|
298
|
-
evals: step.evalsAfter,
|
|
299
|
-
createdAt: step.startedAt,
|
|
300
|
-
metadata: {
|
|
301
|
-
decision: step.decision,
|
|
302
|
-
actionOutcome: step.actionOutcome
|
|
303
|
-
}
|
|
304
|
-
})),
|
|
305
|
-
labels: [
|
|
306
|
-
{
|
|
307
|
-
source: "system",
|
|
308
|
-
kind: run.pass ? "approve" : "reject",
|
|
309
|
-
value: run.pass,
|
|
310
|
-
reason: run.reason,
|
|
311
|
-
severity: run.pass ? "info" : "error",
|
|
312
|
-
createdAt
|
|
313
|
-
}
|
|
314
|
-
],
|
|
315
|
-
outcome: {
|
|
316
|
-
success: run.pass,
|
|
317
|
-
score: run.score,
|
|
318
|
-
costUsd: run.spentCostUsd,
|
|
319
|
-
detail: run.reason,
|
|
320
|
-
observedAt: createdAt,
|
|
321
|
-
metadata: {
|
|
322
|
-
stoppedBy: run.stoppedBy,
|
|
323
|
-
failureClass: run.failureClass
|
|
324
|
-
}
|
|
325
|
-
}
|
|
326
|
-
});
|
|
327
|
-
}
|
|
328
|
-
function allLabels(trajectory) {
|
|
329
|
-
const labels = [
|
|
330
|
-
...trajectory.labels,
|
|
331
|
-
...trajectory.attempts.flatMap((attempt) => attempt.feedback ?? [])
|
|
332
|
-
];
|
|
333
|
-
const seen = /* @__PURE__ */ new Set();
|
|
334
|
-
return labels.filter((label) => {
|
|
335
|
-
const key = label.id ?? `${label.source}|${label.kind}|${label.createdAt}|${JSON.stringify(label.value)}`;
|
|
336
|
-
if (seen.has(key)) return false;
|
|
337
|
-
seen.add(key);
|
|
338
|
-
return true;
|
|
339
|
-
});
|
|
340
|
-
}
|
|
341
|
-
function scoreFromLabels(labels) {
|
|
342
|
-
if (!labels.length) return void 0;
|
|
343
|
-
const scored = labels.map((label) => {
|
|
344
|
-
if (label.kind === "approve" || label.kind === "select") return 1;
|
|
345
|
-
if (label.kind === "reject" || label.kind === "policy_block") return 0;
|
|
346
|
-
if (label.kind === "rate" && typeof label.value === "number")
|
|
347
|
-
return Math.max(0, Math.min(1, label.value));
|
|
348
|
-
return void 0;
|
|
349
|
-
}).filter((value) => typeof value === "number");
|
|
350
|
-
if (!scored.length) return void 0;
|
|
351
|
-
return Math.round(scored.reduce((sum, value) => sum + value, 0) / scored.length * 1e3) / 1e3;
|
|
352
|
-
}
|
|
353
|
-
function instructionFromLabel(trajectory, label) {
|
|
354
|
-
if (label.kind === "reject" && label.reason)
|
|
355
|
-
return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${label.reason}`;
|
|
356
|
-
if (label.kind === "revision_request" && label.reason)
|
|
357
|
-
return `Revise similar work by applying: ${label.reason}`;
|
|
358
|
-
if (label.kind === "select" && label.reason)
|
|
359
|
-
return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${label.reason}`;
|
|
360
|
-
if (label.kind === "approve" && label.reason)
|
|
361
|
-
return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${label.reason}`;
|
|
362
|
-
if (label.kind === "comment" && label.reason) return label.reason;
|
|
363
|
-
return void 0;
|
|
364
|
-
}
|
|
365
|
-
function weightForLabel(label) {
|
|
366
|
-
const severity = label.severity === "critical" ? 4 : label.severity === "error" ? 3 : label.severity === "warning" ? 2 : 1;
|
|
367
|
-
const source = label.source === "user" ? 3 : label.source === "metric" || label.source === "environment" ? 2 : 1;
|
|
368
|
-
return severity * source;
|
|
369
|
-
}
|
|
370
|
-
function matchesFilter(trajectory, filter) {
|
|
371
|
-
if (filter.projectId && trajectory.projectId !== filter.projectId) return false;
|
|
372
|
-
if (filter.scenarioId && trajectory.scenarioId !== filter.scenarioId) return false;
|
|
373
|
-
if (filter.split && trajectory.split !== filter.split) return false;
|
|
374
|
-
if (filter.tag) {
|
|
375
|
-
const [key, value] = filter.tag;
|
|
376
|
-
if (trajectory.tags?.[key] !== value) return false;
|
|
377
|
-
}
|
|
378
|
-
return true;
|
|
379
|
-
}
|
|
380
|
-
function cloneTrajectory(trajectory) {
|
|
381
|
-
return JSON.parse(JSON.stringify(trajectory));
|
|
382
|
-
}
|
|
383
|
-
function compact(value, max) {
|
|
384
|
-
const normalized = value.replace(/\s+/g, " ").trim();
|
|
385
|
-
return normalized.length > max ? `${normalized.slice(0, max).trim()}...` : normalized;
|
|
386
|
-
}
|
|
387
|
-
function stableHash(input) {
|
|
388
|
-
let hash = 2166136261;
|
|
389
|
-
for (let i = 0; i < input.length; i += 1) {
|
|
390
|
-
hash ^= input.charCodeAt(i);
|
|
391
|
-
hash = Math.imul(hash, 16777619);
|
|
392
|
-
}
|
|
393
|
-
return hash >>> 0;
|
|
394
|
-
}
|
|
395
|
-
function canonicalize(value) {
|
|
396
|
-
if (value === null || typeof value !== "object") return value;
|
|
397
|
-
if (Array.isArray(value)) return value.map(canonicalize);
|
|
398
|
-
const out = {};
|
|
399
|
-
for (const key of Object.keys(value).sort()) {
|
|
400
|
-
out[key] = canonicalize(value[key]);
|
|
401
|
-
}
|
|
402
|
-
return out;
|
|
403
|
-
}
|
|
404
|
-
|
|
405
|
-
// src/held-out-gate.ts
|
|
406
|
-
var HeldOutGate = class {
|
|
407
|
-
minProductiveRuns;
|
|
408
|
-
pairedDeltaThreshold;
|
|
409
|
-
overfitGapThreshold;
|
|
410
|
-
baselineKey;
|
|
411
|
-
confidence;
|
|
412
|
-
resamples;
|
|
413
|
-
seed;
|
|
414
|
-
constructor(config) {
|
|
415
|
-
if (!config.baselineKey) {
|
|
416
|
-
throw new Error("HeldOutGate: baselineKey is required");
|
|
417
|
-
}
|
|
418
|
-
this.minProductiveRuns = config.minProductiveRuns ?? 3;
|
|
419
|
-
this.pairedDeltaThreshold = config.pairedDeltaThreshold ?? 0;
|
|
420
|
-
this.overfitGapThreshold = config.overfitGapThreshold ?? 0.15;
|
|
421
|
-
this.baselineKey = config.baselineKey;
|
|
422
|
-
this.confidence = config.confidence ?? 0.95;
|
|
423
|
-
this.resamples = config.bootstrapResamples ?? 2e3;
|
|
424
|
-
this.seed = config.seed;
|
|
425
|
-
}
|
|
426
|
-
/** Decide whether `candidate` should replace `baseline`. Pairing
|
|
427
|
-
* is by (experimentId, seed) — identical experiment + seed pairs
|
|
428
|
-
* the candidate run with the matching baseline run. Pairs without
|
|
429
|
-
* a holdout score on both sides are dropped. */
|
|
430
|
-
evaluate(candidate, baseline) {
|
|
431
|
-
const candidateId = inferCandidateId(candidate, this.baselineKey);
|
|
432
|
-
const baselineId = this.baselineKey;
|
|
433
|
-
const baselineHoldoutByKey = indexHoldoutByKey(baseline);
|
|
434
|
-
const beforeHoldout = [];
|
|
435
|
-
const afterHoldout = [];
|
|
436
|
-
for (const run of candidate) {
|
|
437
|
-
if (run.splitTag !== "holdout") continue;
|
|
438
|
-
if (run.outcome.holdoutScore === void 0) continue;
|
|
439
|
-
const key = pairKey(run);
|
|
440
|
-
const counterpart = baselineHoldoutByKey.get(key);
|
|
441
|
-
if (counterpart === void 0) continue;
|
|
442
|
-
beforeHoldout.push(counterpart);
|
|
443
|
-
afterHoldout.push(run.outcome.holdoutScore);
|
|
444
|
-
}
|
|
445
|
-
const productiveRuns = beforeHoldout.length;
|
|
446
|
-
const candidateSearchMean = mean(scores(candidate, "searchScore", "search"));
|
|
447
|
-
const candidateHoldoutMean = mean(scores(candidate, "holdoutScore", "holdout"));
|
|
448
|
-
const baselineSearchMean = mean(scores(baseline, "searchScore", "search"));
|
|
449
|
-
const baselineHoldoutMean = mean(scores(baseline, "holdoutScore", "holdout"));
|
|
450
|
-
const overfitGap = safeDiff(candidateSearchMean, candidateHoldoutMean);
|
|
451
|
-
const baselineOverfitGap = safeDiff(baselineSearchMean, baselineHoldoutMean);
|
|
452
|
-
if (productiveRuns < this.minProductiveRuns) {
|
|
453
|
-
return {
|
|
454
|
-
promote: false,
|
|
455
|
-
candidateId,
|
|
456
|
-
baselineId,
|
|
457
|
-
evidence: {
|
|
458
|
-
productiveRuns,
|
|
459
|
-
medianPairedDelta: productiveRuns > 0 ? medianDelta(beforeHoldout, afterHoldout) : 0,
|
|
460
|
-
pairedCI: { low: 0, high: 0 },
|
|
461
|
-
pairedPValue: 1,
|
|
462
|
-
searchScore: candidateSearchMean,
|
|
463
|
-
holdoutScore: candidateHoldoutMean,
|
|
464
|
-
overfitGap,
|
|
465
|
-
baselineOverfitGap
|
|
466
|
-
},
|
|
467
|
-
reason: `few_runs: ${productiveRuns} paired holdout observation(s) < min ${this.minProductiveRuns}`,
|
|
468
|
-
rejectionCode: "few_runs"
|
|
469
|
-
};
|
|
470
|
-
}
|
|
471
|
-
const ci = pairedBootstrap(beforeHoldout, afterHoldout, {
|
|
472
|
-
confidence: this.confidence,
|
|
473
|
-
resamples: this.resamples,
|
|
474
|
-
statistic: "median",
|
|
475
|
-
seed: this.seed
|
|
476
|
-
});
|
|
477
|
-
const wilcoxon = wilcoxonSignedRank(beforeHoldout, afterHoldout);
|
|
478
|
-
const evidence = {
|
|
479
|
-
productiveRuns,
|
|
480
|
-
medianPairedDelta: ci.median,
|
|
481
|
-
pairedCI: { low: ci.low, high: ci.high },
|
|
482
|
-
pairedPValue: wilcoxon.p,
|
|
483
|
-
searchScore: candidateSearchMean,
|
|
484
|
-
holdoutScore: candidateHoldoutMean,
|
|
485
|
-
overfitGap,
|
|
486
|
-
baselineOverfitGap
|
|
487
|
-
};
|
|
488
|
-
if (!(ci.low > this.pairedDeltaThreshold)) {
|
|
489
|
-
return {
|
|
490
|
-
promote: false,
|
|
491
|
-
candidateId,
|
|
492
|
-
baselineId,
|
|
493
|
-
evidence,
|
|
494
|
-
reason: `negative_delta: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] does not clear threshold ${fmt(this.pairedDeltaThreshold)}`,
|
|
495
|
-
rejectionCode: "negative_delta"
|
|
496
|
-
};
|
|
497
|
-
}
|
|
498
|
-
if (Number.isFinite(overfitGap) && Number.isFinite(baselineOverfitGap) && overfitGap > baselineOverfitGap + this.overfitGapThreshold) {
|
|
499
|
-
return {
|
|
500
|
-
promote: false,
|
|
501
|
-
candidateId,
|
|
502
|
-
baselineId,
|
|
503
|
-
evidence,
|
|
504
|
-
reason: `overfit_gap: candidate gap=${fmt(overfitGap)} exceeds baseline gap=${fmt(baselineOverfitGap)} by more than ${fmt(this.overfitGapThreshold)}`,
|
|
505
|
-
rejectionCode: "overfit_gap"
|
|
506
|
-
};
|
|
507
|
-
}
|
|
508
|
-
return {
|
|
509
|
-
promote: true,
|
|
510
|
-
candidateId,
|
|
511
|
-
baselineId,
|
|
512
|
-
evidence,
|
|
513
|
-
reason: `promote: paired holdout median \u0394=${fmt(ci.median)} CI=[${fmt(ci.low)}, ${fmt(ci.high)}] over ${productiveRuns} pairs; overfit gap candidate=${fmt(overfitGap)} vs baseline=${fmt(baselineOverfitGap)}`,
|
|
514
|
-
rejectionCode: null
|
|
515
|
-
};
|
|
516
|
-
}
|
|
517
|
-
};
|
|
518
|
-
function inferCandidateId(candidate, baselineKey) {
|
|
519
|
-
for (const run of candidate) {
|
|
520
|
-
if (run.candidateId && run.candidateId !== baselineKey) return run.candidateId;
|
|
521
|
-
}
|
|
522
|
-
return candidate[0]?.candidateId ?? "(unknown candidate)";
|
|
523
|
-
}
|
|
524
|
-
function indexHoldoutByKey(runs) {
|
|
525
|
-
const out = /* @__PURE__ */ new Map();
|
|
526
|
-
for (const r of runs) {
|
|
527
|
-
if (r.splitTag !== "holdout") continue;
|
|
528
|
-
if (r.outcome.holdoutScore === void 0) continue;
|
|
529
|
-
out.set(pairKey(r), r.outcome.holdoutScore);
|
|
530
|
-
}
|
|
531
|
-
return out;
|
|
532
|
-
}
|
|
533
|
-
function pairKey(r) {
|
|
534
|
-
return `${r.experimentId}::${r.seed}`;
|
|
535
|
-
}
|
|
536
|
-
function scores(runs, field, splitFilter) {
|
|
537
|
-
const out = [];
|
|
538
|
-
for (const r of runs) {
|
|
539
|
-
if (r.splitTag !== splitFilter) continue;
|
|
540
|
-
const v = r.outcome[field];
|
|
541
|
-
if (typeof v === "number" && Number.isFinite(v)) out.push(v);
|
|
542
|
-
}
|
|
543
|
-
return out;
|
|
544
|
-
}
|
|
545
|
-
function mean(xs) {
|
|
546
|
-
if (xs.length === 0) return Number.NaN;
|
|
547
|
-
return xs.reduce((s, x) => s + x, 0) / xs.length;
|
|
548
|
-
}
|
|
549
|
-
function safeDiff(a, b) {
|
|
550
|
-
if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN;
|
|
551
|
-
return a - b;
|
|
552
|
-
}
|
|
553
|
-
function medianDelta(before, after) {
|
|
554
|
-
const ds = before.map((b, i) => after[i] - b).sort((x, y) => x - y);
|
|
555
|
-
if (ds.length === 0) return 0;
|
|
556
|
-
const mid = Math.floor(ds.length / 2);
|
|
557
|
-
return ds.length % 2 === 0 ? (ds[mid - 1] + ds[mid]) / 2 : ds[mid];
|
|
558
|
-
}
|
|
559
|
-
function fmt(x) {
|
|
560
|
-
if (!Number.isFinite(x)) return String(x);
|
|
561
|
-
return x.toFixed(4);
|
|
562
|
-
}
|
|
563
|
-
|
|
564
|
-
// src/pareto.ts
|
|
565
|
-
function dominates(a, b, objectives) {
|
|
566
|
-
let strictlyBetter = false;
|
|
567
|
-
for (const obj of objectives) {
|
|
568
|
-
const av = obj.value(a);
|
|
569
|
-
const bv = obj.value(b);
|
|
570
|
-
if (!Number.isFinite(av) || !Number.isFinite(bv)) return false;
|
|
571
|
-
const aIsBetter = obj.direction === "maximize" ? av > bv : av < bv;
|
|
572
|
-
const aIsWorse = obj.direction === "maximize" ? av < bv : av > bv;
|
|
573
|
-
if (aIsWorse) return false;
|
|
574
|
-
if (aIsBetter) strictlyBetter = true;
|
|
575
|
-
}
|
|
576
|
-
return strictlyBetter;
|
|
577
|
-
}
|
|
578
|
-
function paretoFrontier(candidates, objectives) {
|
|
579
|
-
if (objectives.length === 0) {
|
|
580
|
-
throw new Error("paretoFrontier: at least 1 objective required");
|
|
581
|
-
}
|
|
582
|
-
const valid = candidates.filter((c) => objectives.every((o) => Number.isFinite(o.value(c))));
|
|
583
|
-
const frontier = [];
|
|
584
|
-
const dominated = [];
|
|
585
|
-
for (const c of valid) {
|
|
586
|
-
const isDominated = valid.some((other) => other !== c && dominates(other, c, objectives));
|
|
587
|
-
if (isDominated) dominated.push(c);
|
|
588
|
-
else frontier.push(c);
|
|
589
|
-
}
|
|
590
|
-
const dominanceMap = frontier.map((d) => ({
|
|
591
|
-
dominator: d,
|
|
592
|
-
dominated: dominated.filter((x) => dominates(d, x, objectives))
|
|
593
|
-
}));
|
|
594
|
-
return { frontier, dominated, dominanceMap };
|
|
595
|
-
}
|
|
596
|
-
function scalarScore(candidates, objectives, options = {}) {
|
|
597
|
-
if (candidates.length === 0) return [];
|
|
598
|
-
const weights = options.weights ?? {};
|
|
599
|
-
const totalWeight = objectives.reduce((s, o) => s + (weights[o.name] ?? 1), 0);
|
|
600
|
-
const ranges = objectives.map((obj) => {
|
|
601
|
-
const values = candidates.map((c) => obj.value(c)).filter((v) => Number.isFinite(v));
|
|
602
|
-
if (values.length === 0) return { min: 0, max: 1 };
|
|
603
|
-
const min = Math.min(...values);
|
|
604
|
-
const max = Math.max(...values);
|
|
605
|
-
return { min, max: max === min ? min + 1 : max };
|
|
606
|
-
});
|
|
607
|
-
return candidates.map((c) => {
|
|
608
|
-
let score = 0;
|
|
609
|
-
objectives.forEach((obj, i) => {
|
|
610
|
-
const v = obj.value(c);
|
|
611
|
-
if (!Number.isFinite(v)) return;
|
|
612
|
-
const { min, max } = ranges[i];
|
|
613
|
-
const normalised = (v - min) / (max - min);
|
|
614
|
-
const directional = obj.direction === "maximize" ? normalised : 1 - normalised;
|
|
615
|
-
const weight = (weights[obj.name] ?? 1) / totalWeight;
|
|
616
|
-
score += directional * weight;
|
|
617
|
-
});
|
|
618
|
-
return { candidate: c, score };
|
|
619
|
-
});
|
|
620
|
-
}
|
|
621
|
-
function crowdingDistance(candidates, objectives) {
|
|
622
|
-
const distances = new Map(candidates.map((c) => [c, 0]));
|
|
623
|
-
for (const obj of objectives) {
|
|
624
|
-
const sorted = [...candidates].sort((a, b) => obj.value(a) - obj.value(b));
|
|
625
|
-
const min = obj.value(sorted[0]);
|
|
626
|
-
const max = obj.value(sorted[sorted.length - 1]);
|
|
627
|
-
const range = max - min || 1;
|
|
628
|
-
distances.set(sorted[0], Infinity);
|
|
629
|
-
distances.set(sorted[sorted.length - 1], Infinity);
|
|
630
|
-
for (let i = 1; i < sorted.length - 1; i++) {
|
|
631
|
-
const prev = obj.value(sorted[i - 1]);
|
|
632
|
-
const next = obj.value(sorted[i + 1]);
|
|
633
|
-
const current = distances.get(sorted[i]);
|
|
634
|
-
if (current === Infinity) continue;
|
|
635
|
-
distances.set(sorted[i], current + (next - prev) / range);
|
|
636
|
-
}
|
|
637
|
-
}
|
|
638
|
-
return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
|
|
639
|
-
}
|
|
640
|
-
/**
 * Returns the Pareto frontier of `candidates`, annotated with crowding
 * distances and ordered from the largest distance to the smallest.
 *
 * @param {Array<object>} candidates - Candidates to filter.
 * @param {Array<object>} objectives - Objectives passed to `paretoFrontier` and `crowdingDistance`.
 * @returns {Array<{candidate: object, distance: number}>} Frontier entries, or `[]` when the frontier is empty.
 */
function paretoFrontierWithCrowding(candidates, objectives) {
  const nonDominated = paretoFrontier(candidates, objectives).frontier;
  if (nonDominated.length === 0) {
    return [];
  }
  const byDistanceDescending = (left, right) => right.distance - left.distance;
  return crowdingDistance(nonDominated, objectives).sort(byDistanceDescending);
}
|
|
646
|
-
|
|
647
|
-
// src/prompt-evolution.ts
|
|
648
|
-
/**
 * Trial cache backed by a plain in-process `Map`.
 * Exposes the `get`/`set` pair that `scorePopulation` calls on `config.cache`,
 * plus `size` and `clear` helpers.
 */
var InMemoryTrialCache = class {
  /** Backing storage: cache key -> cached trial result. */
  store = /* @__PURE__ */ new Map();

  /** Returns the value stored under `key`, or `undefined` when absent. */
  get(key) {
    const { store } = this;
    return store.get(key);
  }

  /** Stores `value` under `key`, replacing any previous entry. */
  set(key, value) {
    const { store } = this;
    store.set(key, value);
  }

  /** Number of cached entries. */
  size() {
    const { store } = this;
    return store.size;
  }

  /** Removes every cached entry. */
  clear() {
    const { store } = this;
    store.clear();
  }
};
|
|
663
|
-
/**
 * Runs the generational prompt-evolution loop.
 *
 * Each generation: score the population, aggregate trials per variant,
 * compute the Pareto front, pick a winner by scalar score, record a report,
 * optionally stop early, and otherwise breed the next population.
 *
 * @param {object} config - Evolution settings. Reads `seedVariants`, `generations`,
 *   `scenarioIds`, `objectives`, `scalarWeights`, `runId`, `target`,
 *   `earlyStopOnNoImprovement` and the optional `onProgress` callback here.
 * @returns {Promise<{runId: *, target: *, generations: object[], bestVariant: object, bestAggregate: object}>}
 */
async function runPromptEvolution(config) {
  const notify = (event) => config.onProgress?.(event);
  const generations = [];
  let population = [...config.seedVariants];
  let bestVariant = population[0];
  let bestAggregate = null;

  for (let generation = 0; generation < config.generations; generation++) {
    notify({ type: "generation-start", generation, populationSize: population.length });

    const trials = await scorePopulation(population, config, generation);
    const aggregates = aggregateTrials(population, config.scenarioIds, trials);
    const front = paretoFrontierWithCrowding(aggregates, config.objectives);
    const paretoFrontIds = front.map((entry) => entry.candidate.variantId);

    const ranking = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
    ranking.sort((a, b) => b.score - a.score);
    // Fall back to the first aggregate, then the first variant, when nothing was ranked.
    const winnerId = ranking[0]?.candidate.variantId ?? aggregates[0]?.variantId ?? population[0].id;

    const report = {
      runId: config.runId,
      target: config.target,
      generation,
      variants: population,
      aggregates,
      paretoFrontIds,
      winnerId,
      trials
    };
    generations.push(report);
    notify({ type: "generation-complete", report });

    // The latest generation's winner always replaces the running best.
    const winnerAggregate = aggregates.find((aggregate) => aggregate.variantId === winnerId);
    if (winnerAggregate) {
      const winner = population.find((variant) => variant.id === winnerId);
      if (winner) bestVariant = winner;
      bestAggregate = winnerAggregate;
    }

    const earlyStopEnabled = config.earlyStopOnNoImprovement !== false;
    if (earlyStopEnabled && generations.length >= 2) {
      const previous = generations[generations.length - 2];
      const currentFrontIds = [...new Set(paretoFrontIds)];
      // "No improvement" means same winner and same set of Pareto-front ids.
      const unchanged =
        previous.winnerId === winnerId && samePopulation(previous.paretoFrontIds, currentFrontIds);
      if (unchanged) {
        notify({
          type: "converged",
          generation,
          reason: "no improvement vs previous generation"
        });
        break;
      }
    }

    // Do not breed a population that would never be scored.
    if (generation === config.generations - 1) break;
    population = await nextPopulation(population, aggregates, trials, front, config, generation + 1);
  }

  const fallbackAggregate = () =>
    aggregateTrials(population, config.scenarioIds, []).find(
      (aggregate) => aggregate.variantId === bestVariant.id
    );
  return {
    runId: config.runId,
    target: config.target,
    generations,
    bestVariant,
    bestAggregate: bestAggregate ?? fallbackAggregate()
  };
}
|
|
720
|
-
/**
 * Scores every (variant, scenario, rep) combination of a population.
 *
 * Results come from `config.cache` when a truthy entry exists under
 * `variantId|scenarioId|rep`; otherwise `config.scoreAdapter.score` is called
 * and its result is written back to the cache. A "trial-complete" progress
 * event is emitted for every trial, cached or not.
 *
 * @param {Array<object>} population - Variants to score.
 * @param {object} config - Reads `scenarioIds`, `reps`, `cache`, `scoreAdapter`, `onProgress`, `scoreConcurrency`.
 * @param {number} generation - Generation index, forwarded in progress events.
 * @returns {Promise<Array<object>>} Trial results in variant, scenario, rep order.
 */
async function scorePopulation(population, config, generation) {
  const announce = (variant, scenarioId, rep, trial, cached) =>
    config.onProgress?.({
      type: "trial-complete",
      generation,
      variantId: variant.id,
      scenarioId,
      rep,
      ok: trial.ok,
      score: trial.score,
      cached
    });

  const runTrial = async (variant, scenarioId, rep) => {
    const cacheKey = `${variant.id}|${scenarioId}|${rep}`;
    const hit = config.cache?.get(cacheKey);
    if (hit) {
      announce(variant, scenarioId, rep, hit, true);
      return hit;
    }
    const fresh = await config.scoreAdapter.score({ variant, scenarioId, rep });
    config.cache?.set(cacheKey, fresh);
    announce(variant, scenarioId, rep, fresh, false);
    return fresh;
  };

  const jobs = [];
  for (const variant of population) {
    for (const scenarioId of config.scenarioIds) {
      for (let rep = 0; rep < config.reps; rep++) {
        jobs.push(() => runTrial(variant, scenarioId, rep));
      }
    }
  }
  return runWithConcurrency(jobs, config.scoreConcurrency);
}
|
|
760
|
-
/**
 * Runs async job thunks with at most `concurrency` in flight at once.
 *
 * Each result is stored at its job's index, so the output order matches the
 * order of `jobs` regardless of completion order. If any job rejects, the
 * returned promise rejects (via `Promise.all`).
 *
 * @param {Array<() => Promise<*>>} jobs - Thunks to invoke.
 * @param {number} [concurrency] - Desired parallelism. Missing or NaN values
 *   fall back to 1 (previously they produced zero workers and a result array
 *   of holes); values below 1 are raised to 1; values above `jobs.length`
 *   (including Infinity) are capped at `jobs.length`.
 * @returns {Promise<Array<*>>} Results in job order.
 */
async function runWithConcurrency(jobs, concurrency) {
  const results = new Array(jobs.length);
  const requested =
    typeof concurrency === "number" && !Number.isNaN(concurrency) ? Math.floor(concurrency) : 1;
  // Never start more workers than there are jobs.
  const limit = Math.min(jobs.length, Math.max(1, requested));
  let next = 0;
  async function worker() {
    // `next` is shared by all workers; claiming an index is synchronous, so no job runs twice.
    while (next < jobs.length) {
      const i = next++;
      results[i] = await jobs[i]();
    }
  }
  await Promise.all(Array.from({ length: limit }, () => worker()));
  return results;
}
|
|
774
|
-
/**
 * Summarises raw trials into one aggregate per variant, each containing one
 * sub-aggregate per scenario.
 *
 * Scores, cost, duration and metrics are averaged over trials without an
 * `error` field; `okRate` is the share of all of a scenario's trials whose
 * `ok` flag is truthy. Variant-level numbers are means of the scenario-level
 * numbers.
 *
 * @param {Array<{id: string}>} population - Variants to aggregate.
 * @param {string[]} scenarioIds - Scenarios to report for every variant.
 * @param {Array<object>} trials - Trial records carrying `variantId` and `scenarioId`.
 * @returns {Array<object>} One aggregate per variant, in population order.
 */
function aggregateTrials(population, scenarioIds, trials) {
  const summariseScenario = (variant, ownTrials, sid) => {
    const scenarioTrials = ownTrials.filter((trial) => trial.scenarioId === sid);
    const passed = scenarioTrials.filter((trial) => trial.ok);
    const graded = scenarioTrials.filter((trial) => !trial.error);
    const metrics = aggregateMetrics(graded.map((trial) => trial.metrics ?? {}));
    const total = scenarioTrials.length;
    return {
      variantId: variant.id,
      scenarioId: sid,
      meanScore: mean2(graded.map((trial) => trial.score)),
      meanCost: mean2(graded.map((trial) => trial.cost ?? 0)),
      meanDurationMs: mean2(graded.map((trial) => trial.durationMs ?? 0)),
      okRate: total === 0 ? 0 : passed.length / total,
      trials: total,
      metrics
    };
  };

  return population.map((variant) => {
    const ownTrials = trials.filter((trial) => trial.variantId === variant.id);
    const scenarios = scenarioIds.map((sid) => summariseScenario(variant, ownTrials, sid));
    const averageOf = (field) => mean2(scenarios.map((scenario) => scenario[field]));
    return {
      variantId: variant.id,
      meanScore: averageOf("meanScore"),
      meanCost: averageOf("meanCost"),
      meanDurationMs: averageOf("meanDurationMs"),
      okRate: averageOf("okRate"),
      scenarios,
      metrics: aggregateMetrics(scenarios.map((scenario) => scenario.metrics))
    };
  });
}
|
|
804
|
-
/**
 * Averages numeric metrics key-by-key across a list of metric records.
 * Values that are not finite numbers are ignored; a key appears in the output
 * only if at least one row supplied a finite value for it.
 *
 * @param {Array<Record<string, *>>} rows - Metric records.
 * @returns {Record<string, number>} Mean value per metric key.
 */
function aggregateMetrics(rows) {
  const samples = /* @__PURE__ */ new Map();
  for (const row of rows) {
    for (const [key, value] of Object.entries(row)) {
      if (!Number.isFinite(value)) continue;
      if (!samples.has(key)) samples.set(key, []);
      samples.get(key).push(value);
    }
  }
  const averaged = {};
  samples.forEach((values, key) => {
    averaged[key] = mean2(values);
  });
  return averaged;
}
|
|
818
|
-
/**
 * Arithmetic mean of a list of numbers; an empty list yields 0.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean2(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  const total = xs.reduce((sum, x) => sum + x, 0);
  return total / count;
}
|
|
822
|
-
/**
 * Builds the next generation: Pareto-front members survive, and the remaining
 * slots (up to `config.populationSize`) are filled with children mutated from
 * the variant with the best scalar score.
 *
 * @param {Array<object>} current - Current population.
 * @param {Array<object>} aggregates - Per-variant aggregates for the current population.
 * @param {Array<object>} trials - Raw trials of the current generation.
 * @param {Array<{candidate: {variantId: string}}>} front - Pareto-front entries.
 * @param {object} config - Reads `objectives`, `scalarWeights`, `populationSize`, `mutateAdapter`.
 * @param {number} nextGeneration - Index stamped on the children.
 * @returns {Promise<Array<object>>} Survivors followed by the new children.
 */
async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
  const survivorIds = new Set(front.map((entry) => entry.candidate.variantId));
  const survivors = current.filter((variant) => survivorIds.has(variant.id));

  const ranked = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
  ranked.sort((a, b) => b.score - a.score);
  const parentId = ranked[0]?.candidate.variantId ?? current[0].id;
  const parent = current.find((variant) => variant.id === parentId) ?? current[0];
  const parentAggregate =
    aggregates.find((aggregate) => aggregate.variantId === parent.id) ?? aggregates[0];

  const topTrials = topKTrialsByScore(trials, parent.id, 3);
  const bottomTrials = bottomKTrialsByScore(trials, parent.id, 3);

  const childCount = Math.max(0, config.populationSize - survivors.length);
  // Written as a negated `>` so a NaN child count also skips mutation.
  if (!(childCount > 0)) {
    return [...survivors];
  }

  const proposed = await config.mutateAdapter.mutate({
    parent,
    parentAggregate,
    topTrials,
    bottomTrials,
    childCount,
    generation: nextGeneration
  });
  // Drop surplus proposals, then stamp lineage on each child.
  const children = proposed
    .slice(0, childCount)
    .map((child) => ({ ...child, generation: nextGeneration, parentId: parent.id }));
  return [...survivors, ...children];
}
|
|
848
|
-
/**
 * Returns up to `k` of a variant's `ok` trials with the highest scores,
 * best first. The input array is not modified.
 *
 * @param {Array<object>} trials - All trials.
 * @param {string} variantId - Variant whose trials are considered.
 * @param {number} k - Maximum number of trials to return.
 * @returns {Array<object>} Highest-scoring trials, in descending score order.
 */
function topKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  const byScoreDescending = (left, right) => right.score - left.score;
  eligible.sort(byScoreDescending);
  return eligible.slice(0, k);
}
|
|
851
|
-
/**
 * Returns up to `k` of a variant's `ok` trials with the lowest scores,
 * worst first. Trials whose `ok` flag is falsy are excluded. The input array
 * is not modified.
 *
 * @param {Array<object>} trials - All trials.
 * @param {string} variantId - Variant whose trials are considered.
 * @param {number} k - Maximum number of trials to return.
 * @returns {Array<object>} Lowest-scoring trials, in ascending score order.
 */
function bottomKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  const byScoreAscending = (left, right) => left.score - right.score;
  eligible.sort(byScoreAscending);
  return eligible.slice(0, k);
}
|
|
854
|
-
/**
 * Tells whether two id lists describe the same population, ignoring order.
 *
 * Lists of different length are never equal. Beyond that, both lists must
 * contain the same set of distinct ids; comparing the distinct counts closes
 * the false positive where `b` repeats an id (e.g. `["a","b"]` vs `["a","a"]`).
 *
 * @param {Array<string>} a - First id list.
 * @param {Array<string>} b - Second id list.
 * @returns {boolean} True when both lists hold the same ids.
 */
function samePopulation(a, b) {
  if (a.length !== b.length) return false;
  const setA = new Set(a);
  const setB = new Set(b);
  if (setA.size !== setB.size) return false;
  for (const id of setB) {
    if (!setA.has(id)) return false;
  }
  return true;
}
|
|
859
|
-
|
|
860
|
-
// src/multi-shot-optimization.ts
|
|
861
|
-
/**
 * Validates the config, runs prompt evolution over the search scenarios and,
 * when a gate is configured and the search winner is not the baseline, runs
 * the gate to decide whether the winner is promoted.
 *
 * The baseline is the first seed variant. If the gate declines promotion, the
 * baseline and its aggregate from the final generation are returned as the
 * promoted pair.
 *
 * @param {object} config - Multi-shot optimisation settings (checked by `validateConfig`).
 * @returns {Promise<object>} `evolution`, the search best variant/aggregate,
 *   the promoted variant/aggregate, and `gate` (null when no gate ran).
 */
async function runMultiShotOptimization(config) {
  validateConfig(config);

  const evolutionConfig = {
    runId: config.runId,
    target: config.target,
    seedVariants: config.seedVariants,
    scenarioIds: config.searchScenarioIds,
    reps: config.reps,
    generations: config.generations,
    populationSize: config.populationSize,
    scoreConcurrency: config.scoreConcurrency ?? 1,
    scoreAdapter: {
      // Every evolution trial is scored on the "search" split.
      score: ({ variant, scenarioId, rep }) => scoreOne(config, variant, scenarioId, rep, "search")
    },
    mutateAdapter: {
      mutate: (args) => {
        const { topTrials, bottomTrials } = args;
        return config.mutateAdapter.mutate({ ...args, topTrials, bottomTrials });
      }
    },
    objectives: config.objectives ?? defaultMultiShotObjectives(),
    scalarWeights: config.scalarWeights,
    earlyStopOnNoImprovement: config.earlyStopOnNoImprovement,
    cache: config.cache,
    onProgress: config.onProgress
  };
  const evolution = await runPromptEvolution(evolutionConfig);

  const baseline = config.seedVariants[0];
  const searchWinner = evolution.bestVariant;
  let gate = null;
  let promotedVariant = searchWinner;
  let promotedAggregate = evolution.bestAggregate;

  const winnerIsBaseline = searchWinner.id === baseline.id;
  if (config.gate && !winnerIsBaseline) {
    gate = await evaluateMultiShotGate(config, baseline, searchWinner);
    if (!gate.decision.promote) {
      promotedVariant = baseline;
      promotedAggregate = aggregateFor(evolution, baseline.id);
    }
  }

  return {
    evolution,
    searchBestVariant: evolution.bestVariant,
    searchBestAggregate: evolution.bestAggregate,
    promotedVariant,
    promotedAggregate,
    gate
  };
}
|
|
909
|
-
/**
 * Default objective pair used when the caller supplies none:
 * maximise `meanScore`, minimise `meanCost`.
 *
 * @returns {Array<{name: string, direction: string, value: (a: object) => number}>}
 *   A fresh array on every call.
 */
function defaultMultiShotObjectives() {
  const score = {
    name: "score",
    direction: "maximize",
    value: (aggregate) => aggregate.meanScore
  };
  const cost = {
    name: "cost",
    direction: "minimize",
    value: (aggregate) => aggregate.meanCost
  };
  return [score, cost];
}
|
|
915
|
-
/**
 * Converts a multi-shot trial into the trace shape read by
 * `buildReflectionPrompt` (id, score, inputName, expectations, emitted).
 *
 * Each `asi` item becomes an expectation; a missing `expectationId` falls back
 * to `asi-<index>` and a missing `matched` flag to `false`. When the trial has
 * no `emitted` text, an excerpt of its trace is used instead.
 *
 * @param {object} trial - Trial record as produced by `scoreOne`.
 * @returns {object} Trace record with `id`, `score`, `inputName`, `expectations`, `emitted`, `metrics`.
 */
function trialTraceFromMultiShotTrial(trial) {
  const findings = trial.asi ?? [];
  const expectations = findings.map((finding, index) => ({
    id: finding.expectationId ?? `asi-${index}`,
    phrase: finding.message,
    matched: finding.matched ?? false
  }));
  return {
    id: `${trial.variantId}/${trial.scenarioId}/r${trial.rep}`,
    score: trial.score,
    inputName: trial.scenarioId,
    expectations,
    emitted: trial.emitted ?? traceExcerpt(trial.trace),
    metrics: trial.metrics
  };
}
|
|
929
|
-
/**
 * Re-scores the baseline and the candidate on the gate's search scenarios and
 * then on its holdout scenarios, converts every trial to a validated run
 * record, and asks `HeldOutGate` for a promotion decision.
 *
 * Trials run strictly one after another: baseline first, then candidate, for
 * each (scenario, rep) pair.
 *
 * @param {object} config - Optimisation config; `config.gate` must be set.
 * @param {object} baseline - Baseline variant.
 * @param {object} candidate - Variant competing against the baseline.
 * @returns {Promise<{decision: object, candidateRuns: object[], baselineRuns: object[]}>}
 */
async function evaluateMultiShotGate(config, baseline, candidate) {
  const gateConfig = config.gate;
  const reps = gateConfig.reps ?? config.reps;
  const candidateRuns = [];
  const baselineRuns = [];

  // Search scenarios go first, holdout scenarios second; each keeps its own split label.
  const phases = [
    { split: "search", scenarioIds: gateConfig.searchScenarioIds ?? config.searchScenarioIds },
    { split: "holdout", scenarioIds: gateConfig.holdoutScenarioIds }
  ];

  for (const { split, scenarioIds } of phases) {
    for (const scenarioId of scenarioIds) {
      for (let rep = 0; rep < reps; rep++) {
        const seed = seedFor(config, scenarioId, rep);
        const baseTrial = await scoreOne(config, baseline, scenarioId, rep, split);
        const candTrial = await scoreOne(config, candidate, scenarioId, rep, split);
        baselineRuns.push(
          toValidatedRecord(config, baseline, scenarioId, rep, split, seed, baseTrial)
        );
        candidateRuns.push(
          toValidatedRecord(config, candidate, scenarioId, rep, split, seed, candTrial)
        );
      }
    }
  }

  const heldOutGate = new HeldOutGate(gateConfig.gate);
  const decision = heldOutGate.evaluate(candidateRuns, baselineRuns);
  return { decision, candidateRuns, baselineRuns };
}
|
|
964
|
-
/**
 * Runs and scores one (variant, scenario, rep) trial.
 *
 * The runner and scorer are called in sequence. Any exception from either, or
 * from assembling the result, is converted into a failed trial (`ok: false`,
 * score 0, an `error` message and a single critical ASI item) rather than
 * propagated.
 *
 * @param {object} config - Reads `runner`, `scorer`, `target` and (through `seedFor`) `seedBase`.
 * @param {{id: string}} variant - Variant under test.
 * @param {string} scenarioId - Scenario to run.
 * @param {number} rep - Repetition index.
 * @param {string} split - Split label, forwarded to the runner and scorer.
 * @returns {Promise<object>} Trial record.
 */
async function scoreOne(config, variant, scenarioId, rep, split) {
  const seed = seedFor(config, scenarioId, rep);
  const input = { variant, scenarioId, rep, split, seed };
  try {
    const execution = await config.runner.run(input);
    const verdict = await config.scorer.score({ ...input, run: execution });
    const asi = verdict.asi ?? [];
    const metrics = {
      ...numericMetrics(verdict.metrics),
      ...asiMetrics(asi)
    };
    return {
      variantId: variant.id,
      scenarioId,
      rep,
      ok: verdict.ok ?? true,
      score: clamp01(verdict.score),
      // Scorer-reported cost and duration take precedence over the runner's.
      cost: verdict.costUsd ?? execution.costUsd ?? 0,
      durationMs: verdict.durationMs ?? execution.durationMs ?? 0,
      metrics,
      split,
      seed,
      trace: execution.trace,
      asi,
      emitted: verdict.emitted ?? traceExcerpt(execution.trace),
      metadata: verdict.metadata
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    const failure = {
      severity: "critical",
      message,
      responsibleSurface: config.target
    };
    return {
      variantId: variant.id,
      scenarioId,
      rep,
      ok: false,
      score: 0,
      cost: 0,
      durationMs: 0,
      metrics: { error: 1 },
      error: message,
      split,
      seed,
      asi: [failure],
      emitted: ""
    };
  }
}
|
|
1014
|
-
/**
 * Converts a trial into a run record through the caller-supplied
 * `config.gate.toRunRecord` and passes the result to `validateRunRecord`.
 *
 * @returns {object} Whatever `validateRunRecord` returns for the record.
 */
function toValidatedRecord(config, variant, scenarioId, rep, split, seed, trial) {
  const context = { variant, scenarioId, rep, split, seed, trial };
  return validateRunRecord(config.gate.toRunRecord(context));
}
|
|
1018
|
-
/**
 * Validates a `runMultiShotOptimization` config and throws on the first
 * problem found.
 *
 * Checks, in order: non-blank `runId` and `target`; non-empty `seedVariants`
 * and `searchScenarioIds`; positive-integer `reps`, `generations`,
 * `populationSize` and (if given) `scoreConcurrency`; `populationSize` at
 * least the seed count; unique seed ids and search scenario ids. When a gate
 * is configured it also requires non-empty, unique holdout ids, a valid
 * `gate.reps`, unique `gate.searchScenarioIds`, holdout ids disjoint from the
 * search ids, and `gate.gate.baselineKey` equal to the first seed's id.
 *
 * Holdout ids must also be disjoint from the `gate.searchScenarioIds`
 * override. The gate evaluates that override as its "search" set, so an
 * overlap would run a held-out scenario under both splits.
 *
 * @param {object} config - Config to validate.
 * @throws {Error} Describing the first invalid field.
 */
function validateConfig(config) {
  if (!config.runId.trim()) throw new Error("runMultiShotOptimization: runId must not be empty");
  if (!config.target.trim()) throw new Error("runMultiShotOptimization: target must not be empty");
  if (config.seedVariants.length === 0) {
    throw new Error("runMultiShotOptimization: seedVariants must not be empty");
  }
  if (config.searchScenarioIds.length === 0) {
    throw new Error("runMultiShotOptimization: searchScenarioIds must not be empty");
  }
  requirePositiveInteger(config.reps, "reps");
  requirePositiveInteger(config.generations, "generations");
  requirePositiveInteger(config.populationSize, "populationSize");
  if (config.scoreConcurrency !== void 0) {
    requirePositiveInteger(config.scoreConcurrency, "scoreConcurrency");
  }
  if (config.populationSize < config.seedVariants.length) {
    throw new Error("runMultiShotOptimization: populationSize must be >= seedVariants.length");
  }
  assertUnique(
    config.seedVariants.map((v) => v.id),
    "seedVariants.id"
  );
  assertUnique(config.searchScenarioIds, "searchScenarioIds");
  if (!config.gate) return;

  const gate = config.gate;
  if (gate.holdoutScenarioIds.length === 0) {
    throw new Error("runMultiShotOptimization: gate.holdoutScenarioIds must not be empty");
  }
  if (gate.reps !== void 0) requirePositiveInteger(gate.reps, "gate.reps");
  assertUnique(gate.holdoutScenarioIds, "gate.holdoutScenarioIds");
  if (gate.searchScenarioIds) {
    assertUnique(gate.searchScenarioIds, "gate.searchScenarioIds");
  }
  const searchIds = new Set(config.searchScenarioIds);
  for (const id of gate.holdoutScenarioIds) {
    if (searchIds.has(id)) {
      throw new Error(
        `runMultiShotOptimization: holdout scenario "${id}" also appears in searchScenarioIds`
      );
    }
  }
  if (gate.searchScenarioIds) {
    // The gate scores this override as its search split, so it must not leak holdout scenarios.
    const gateSearchIds = new Set(gate.searchScenarioIds);
    for (const id of gate.holdoutScenarioIds) {
      if (gateSearchIds.has(id)) {
        throw new Error(
          `runMultiShotOptimization: holdout scenario "${id}" also appears in gate.searchScenarioIds`
        );
      }
    }
  }
  const baselineId = config.seedVariants[0].id;
  if (gate.gate.baselineKey !== baselineId) {
    throw new Error(
      `runMultiShotOptimization: gate.gate.baselineKey must match first seed variant id "${baselineId}"`
    );
  }
}
|
|
1064
|
-
/**
 * Throws unless `value` is an integer greater than zero.
 *
 * @param {*} value - Value to check.
 * @param {string} name - Field name used in the error message.
 * @throws {Error} When `value` is not a positive integer.
 */
function requirePositiveInteger(value, name) {
  const isPositiveInteger = Number.isInteger(value) && value > 0;
  if (isPositiveInteger) {
    return;
  }
  throw new Error(`runMultiShotOptimization: ${name} must be a positive integer`);
}
|
|
1069
|
-
/**
 * Throws if any value is blank (empty after trimming) or occurs more than
 * once. Values are checked in order and the first offence wins.
 *
 * @param {Iterable<string>} values - Strings to check.
 * @param {string} name - Field name used in error messages.
 * @throws {Error} On the first blank or duplicate value.
 */
function assertUnique(values, name) {
  const seen = /* @__PURE__ */ new Set();
  for (const value of values) {
    const isBlank = !value.trim();
    if (isBlank) {
      throw new Error(`runMultiShotOptimization: ${name} must not contain empty values`);
    }
    if (seen.has(value)) {
      throw new Error(`runMultiShotOptimization: duplicate ${name} "${value}"`);
    }
    seen.add(value);
  }
}
|
|
1078
|
-
/**
 * Looks up a variant's aggregate in the final generation report.
 *
 * @param {{generations: Array<{aggregates: object[]}>}} evolution - Evolution result.
 * @param {string} variantId - Variant to look up.
 * @returns {object} The matching aggregate.
 * @throws {Error} When there is no final generation or the variant is not in it.
 */
function aggregateFor(evolution, variantId) {
  const reports = evolution.generations;
  const finalReport = reports[reports.length - 1];
  const match = finalReport?.aggregates.find((aggregate) => aggregate.variantId === variantId);
  if (match) {
    return match;
  }
  throw new Error(`runMultiShotOptimization: missing aggregate for variant "${variantId}"`);
}
|
|
1086
|
-
function seedFor(config, scenarioId, rep) {
|
|
1087
|
-
const base = config.seedBase ?? 0;
|
|
1088
|
-
return (base + stableHash2(`${scenarioId}${rep}`)) % Number.MAX_SAFE_INTEGER;
|
|
1089
|
-
}
|
|
1090
|
-
/**
 * Deterministic 32-bit string hash in the FNV-1a style: for each UTF-16 code
 * unit, xor it into the state and multiply by the 32-bit FNV prime.
 *
 * @param {string} input - Text to hash.
 * @returns {number} Unsigned 32-bit hash value.
 */
function stableHash2(input) {
  const OFFSET_BASIS = 2166136261;
  const PRIME = 16777619;
  let hash = OFFSET_BASIS;
  for (let index = 0; index < input.length; index += 1) {
    // Math.imul keeps the multiplication in 32-bit integer arithmetic.
    hash = Math.imul(hash ^ input.charCodeAt(index), PRIME);
  }
  // Reinterpret the signed 32-bit state as unsigned.
  return hash >>> 0;
}
|
|
1098
|
-
/**
 * Clamps a number into the range [0, 1]. Anything that is not a finite
 * number (NaN, ±Infinity, non-numbers) becomes 0.
 *
 * @param {*} n - Value to clamp.
 * @returns {number} A value between 0 and 1 inclusive.
 */
function clamp01(n) {
  if (!Number.isFinite(n)) {
    return 0;
  }
  const cappedAbove = Math.min(1, n);
  return Math.max(0, cappedAbove);
}
|
|
1102
|
-
/**
 * Copies only the finite-number entries of a metrics record into a new
 * object. A missing record yields an empty object.
 *
 * @param {Record<string, *>|null|undefined} metrics - Raw metrics.
 * @returns {Record<string, number>} Finite numeric metrics only.
 */
function numericMetrics(metrics) {
  const source = metrics ?? {};
  const kept = {};
  for (const key of Object.keys(source)) {
    const value = source[key];
    if (Number.isFinite(value)) {
      kept[key] = value;
    }
  }
  return kept;
}
|
|
1109
|
-
/**
 * Turns a list of ASI items into counter metrics:
 * `asi` (total item count), `asi.<severity>` per normalised severity, and
 * `surface.<name>` per responsible surface. Only the first 1000 items feed
 * the per-severity and per-surface counters; `asi` counts them all.
 *
 * @param {Array<{severity?: string, responsibleSurface?: string}>} asi - ASI items.
 * @returns {Record<string, number>} Counter metrics.
 */
function asiMetrics(asi) {
  const counters = { asi: asi.length };
  const bump = (key) => {
    counters[key] = (counters[key] ?? 0) + 1;
  };
  for (const item of asi.slice(0, 1e3)) {
    bump(`asi.${normalizeSeverity(item.severity)}`);
    if (item.responsibleSurface) {
      bump(`surface.${metricKeySegment(item.responsibleSurface)}`);
    }
  }
  return counters;
}
|
|
1121
|
-
/**
 * Maps an arbitrary severity value onto the four recognised levels.
 * Anything other than "info", "warning", "error" or "critical" becomes "error".
 *
 * @param {*} severity - Raw severity.
 * @returns {string} One of the four recognised severity strings.
 */
function normalizeSeverity(severity) {
  switch (severity) {
    case "info":
    case "warning":
    case "error":
    case "critical":
      return severity;
    default:
      return "error";
  }
}
|
|
1127
|
-
/**
 * Sanitises free text for use as one segment of a metric key: trims it,
 * replaces every run of characters outside `[a-zA-Z0-9._-]` with a single
 * underscore, and keeps at most 80 characters. An empty result becomes
 * "unknown".
 *
 * @param {string} raw - Raw segment text.
 * @returns {string} Sanitised, non-empty key segment.
 */
function metricKeySegment(raw) {
  const trimmed = raw.trim();
  const sanitised = trimmed.replace(/[^a-zA-Z0-9._-]+/g, "_");
  const clipped = sanitised.slice(0, 80);
  return clipped || "unknown";
}
|
|
1130
|
-
/**
 * Extracts a short textual excerpt from a run trace.
 *
 * Preference order: a string `output`, then a truthy `transcript`, then the
 * first 20 `turns` serialised as JSON and cut to 2000 characters (with a
 * "more turn(s)" suffix when turns were dropped).
 *
 * @param {object|null|undefined} trace - Run trace.
 * @returns {*} The excerpt, or `undefined` when the trace has nothing usable.
 */
function traceExcerpt(trace) {
  if (!trace) {
    return undefined;
  }
  if (typeof trace.output === "string") {
    return trace.output;
  }
  if (trace.transcript) {
    return trace.transcript;
  }
  if (!trace.turns) {
    return undefined;
  }
  try {
    const clipped = trace.turns.slice(0, 20);
    const dropped = trace.turns.length - clipped.length;
    const suffix = trace.turns.length > clipped.length ? ` ... ${dropped} more turn(s)` : "";
    const serialised = JSON.stringify(clipped).slice(0, 2e3);
    return `${serialised}${suffix}`;
  } catch {
    // Turns that cannot be serialised (e.g. circular references) get a placeholder.
    return "[unserializable trace turns]";
  }
}
|
|
1145
|
-
|
|
1146
|
-
// src/reflective-mutation.ts
|
|
1147
|
-
/**
 * Mutation primitives listed in the reflection prompt when the caller does
 * not supply `mutationPrimitives`. Each entry is rendered as one bullet under
 * "Allowed mutation primitives".
 */
var DEFAULT_MUTATION_PRIMITIVES = [
  // Wording strength
  'Strengthen an imperative ("should" \u2192 "must")',
  // Evidence-driven additions
  "Add a concrete example pulled from a missed-golden phrase",
  // Pruning
  "Remove a redundant rule that did not improve recall",
  // Consequence framing
  'Add a counterfactual ("if X is missing, the score is capped at Y")',
  // Ordering
  "Reorder sections so the highest-impact rule is first",
  // Vocabulary
  "Replace abstract language with a domain-specific noun the trial misses"
];
|
|
1155
|
-
/**
 * Builds the markdown prompt that asks a model to propose mutations of a
 * prompt component.
 *
 * Sections, in order: mutation target and instructions, the current variant
 * as JSON, failures (bottom trials, with missed expectations and a truncated
 * emission), successes (top trials), the allowed mutation primitives, and the
 * expected JSON output schema. Failure and success sections are omitted when
 * their trial lists are empty.
 *
 * @param {object} ctx - Reads `target`, `parentPayload`, `topTrials`,
 *   `bottomTrials`, `childCount` and optional `mutationPrimitives`.
 * @returns {string} Prompt text with lines joined by "\n".
 */
function buildReflectionPrompt(ctx) {
  const primitives = ctx.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES;
  const inputSuffix = (trial) => (trial.inputName ? ` (${trial.inputName})` : "");

  const lines = [
    `# Mutation target: ${ctx.target}`,
    "",
    `You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${ctx.childCount === 1 ? "" : "s"} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`,
    "",
    "## Current variant",
    "```json",
    JSON.stringify(ctx.parentPayload, null, 2),
    "```",
    ""
  ];

  if (ctx.bottomTrials.length > 0) {
    lines.push("## Failures (bottom trials) \u2014 what went wrong", "");
    for (const trial of ctx.bottomTrials) {
      lines.push(
        `### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${inputSuffix(trial)}`
      );
      const missed = (trial.expectations ?? []).filter((expectation) => !expectation.matched);
      if (missed.length > 0) {
        lines.push("", "**Missed expectations:**");
        for (const m of missed) {
          lines.push(`- \`${m.id}\`: should match phrase \`${quote(m.phrase)}\``);
        }
      }
      if (trial.emitted) {
        // Emissions are capped at 600 characters to keep the prompt compact.
        const excerpt = truncate(trial.emitted, 600);
        lines.push("", "**What the agent emitted:**", "```", excerpt, "```");
      }
      lines.push("");
    }
  }

  if (ctx.topTrials.length > 0) {
    lines.push("## Successes (top trials) \u2014 what to preserve", "");
    for (const trial of ctx.topTrials) {
      lines.push(`- \`${trial.id}\`: score ${trial.score.toFixed(2)}${inputSuffix(trial)}`);
    }
    lines.push("");
  }

  lines.push("## Allowed mutation primitives", "");
  for (const primitive of primitives) {
    lines.push(`- ${primitive}`);
  }
  lines.push("");

  const schemaExample = {
    proposals: [
      {
        label: "<short label, \u2264 40 chars>",
        rationale: "<which failure this targets and which primitive you used>",
        payload: "<full payload of the new variant \u2014 same shape as the current variant>"
      }
    ]
  };
  lines.push(
    "## Output schema",
    "",
    "Respond with a JSON object \u2014 no prose, no markdown fences:",
    "```json",
    JSON.stringify(schemaExample, null, 2),
    "```"
  );

  return lines.join("\n");
}
|
|
1230
|
-
/**
 * Shortens a string to at most `max` characters, appending an ellipsis and a
 * "[truncated]" marker when anything was cut.
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum number of characters kept from `s`.
 * @returns {string} `s` itself when it fits, otherwise the marked prefix.
 */
function truncate(s, max) {
  const fits = s.length <= max;
  return fits ? s : `${s.slice(0, max)}\u2026 [truncated]`;
}
|
|
1234
|
-
/**
 * Escapes backticks with a leading backslash so the text can sit inside an
 * inline-code span of the prompt.
 *
 * @param {string} s - Text to escape.
 * @returns {string} Text with every backtick prefixed by a backslash.
 */
function quote(s) {
  const BACKTICK = "`";
  return s.split(BACKTICK).join(`\\${BACKTICK}`);
}
|
|
1237
|
-
/**
 * Best-effort repair of JSON text that was cut off mid-structure.
 *
 * Scans the text while tracking string state and open brackets, then appends
 * a closing quote (if a string was left open) followed by the closers for
 * every bracket still open, innermost first.
 *
 * @param {string} raw - Possibly truncated JSON text.
 * @returns {string|null} `raw` unchanged when already balanced, the text with
 *   closers appended when it was truncated, or `null` when a closing bracket
 *   does not match the most recent opener.
 */
function autoCloseTruncatedJson(raw) {
  const openers = [];
  let inString = false;
  let escaped = false;

  for (const ch of raw) {
    if (escaped) {
      // The character after a backslash is consumed verbatim.
      escaped = false;
      continue;
    }
    if (inString) {
      if (ch === "\\") escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    switch (ch) {
      case '"':
        inString = true;
        break;
      case "{":
      case "[":
        openers.push(ch);
        break;
      case "}":
        if (openers.pop() !== "{") return null;
        break;
      case "]":
        if (openers.pop() !== "[") return null;
        break;
      default:
        break;
    }
  }

  if (openers.length === 0 && !inString) {
    return raw;
  }
  const closers = openers
    .reverse()
    .map((opener) => (opener === "{" ? "}" : "]"))
    .join("");
  return raw + (inString ? '"' : "") + closers;
}
|
|
1277
|
-
/**
 * Parses a model's reflection response into mutation proposals.
 *
 * Accepts either a JSON object with a `proposals` array or a bare JSON array,
 * optionally wrapped in a markdown code fence or surrounded by prose. The
 * outermost `{...}` and `[...]` spans are tried (whichever opens first goes
 * first). If neither parses, each span is retried after
 * `autoCloseTruncatedJson` has appended missing closers. Entries without a
 * `payload` key are skipped; a missing label defaults to "mutation" and a
 * missing rationale to "".
 *
 * @param {string} raw - Raw model output.
 * @param {number} [maxProposals] - Maximum number of proposals to return.
 *   `undefined`/`null` means no limit; `0` (or a negative number) returns an
 *   empty list (previously one proposal slipped through because the cap was
 *   only checked after the first push).
 * @returns {Array<{label: string, rationale: string, payload: *}>} Parsed
 *   proposals, or `[]` when nothing usable was found.
 */
function parseReflectionResponse(raw, maxProposals) {
  let text = raw.trim();
  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");

  const objectStart = text.indexOf("{");
  const objectEnd = text.lastIndexOf("}");
  const arrayStart = text.indexOf("[");
  const arrayEnd = text.lastIndexOf("]");
  const objectSlice =
    objectStart >= 0 && objectEnd > objectStart ? text.slice(objectStart, objectEnd + 1) : null;
  const arraySlice =
    arrayStart >= 0 && arrayEnd > arrayStart ? text.slice(arrayStart, arrayEnd + 1) : null;
  // Try whichever structure opens first in the text.
  const tryObjectFirst = objectStart >= 0 && (arrayStart < 0 || objectStart < arrayStart);
  const ordered = tryObjectFirst ? [objectSlice, arraySlice] : [arraySlice, objectSlice];
  const candidates = ordered.filter((slice) => slice !== null);

  let parsed = null;
  for (const slice of candidates) {
    try {
      parsed = JSON.parse(slice);
      break;
    } catch {
      // Not valid JSON as-is; fall through to the next candidate.
    }
  }
  if (parsed == null) {
    // Second pass: repair truncated output by appending the missing closers.
    for (const slice of candidates) {
      const closed = autoCloseTruncatedJson(slice);
      if (closed == null || closed === slice) continue;
      try {
        parsed = JSON.parse(closed);
        break;
      } catch {
        // Repair did not yield valid JSON; try the next candidate.
      }
    }
  }
  if (parsed == null) return [];

  let proposalsRaw;
  if (Array.isArray(parsed)) {
    proposalsRaw = parsed;
  } else if (typeof parsed === "object") {
    proposalsRaw = parsed.proposals;
  }
  if (!Array.isArray(proposalsRaw)) return [];

  const limit = maxProposals == null ? Infinity : maxProposals;
  const out = [];
  for (const p of proposalsRaw) {
    // Checked before pushing so a limit of 0 yields no proposals.
    if (out.length >= limit) break;
    if (!p || typeof p !== "object") continue;
    if (!("payload" in p)) continue;
    out.push({
      label: typeof p.label === "string" ? p.label : "mutation",
      rationale: typeof p.rationale === "string" ? p.rationale : "",
      payload: p.payload
    });
  }
  return out;
}
|
|
1339
|
-
|
|
1340
|
-
// src/researcher.ts
|
|
1341
|
-
/**
 * Researcher whose four steps are delegated to a caller-supplied callbacks
 * object. Each method forwards its arguments unchanged and returns whatever
 * the matching callback returns.
 */
var CallbackResearcher = class {
  /** Callbacks object supplied at construction. */
  callbacks;

  /**
   * @param {object} callbacks - Must provide `inspectFailures`,
   *   `proposeChange`, `applyChange` and `evaluateChange`.
   */
  constructor(callbacks) {
    this.callbacks = callbacks;
  }

  /** Forwards to `callbacks.inspectFailures`. */
  inspectFailures(runs) {
    const { callbacks } = this;
    return callbacks.inspectFailures(runs);
  }

  /** Forwards to `callbacks.proposeChange`. */
  proposeChange(failures) {
    const { callbacks } = this;
    return callbacks.proposeChange(failures);
  }

  /** Forwards to `callbacks.applyChange`. */
  applyChange(changes, baseline) {
    const { callbacks } = this;
    return callbacks.applyChange(changes, baseline);
  }

  /** Forwards to `callbacks.evaluateChange`. */
  evaluateChange(plan) {
    const { callbacks } = this;
    return callbacks.evaluateChange(plan);
  }
};
|
|
1359
|
-
/**
 * Placeholder researcher: every method is async and rejects with an error
 * that starts with the configured hint and names the unimplemented method.
 */
var NoopResearcher = class {
  /** Message prefix used in every rejection. */
  hint;

  /**
   * @param {string} [hint] - Prefix for the error messages.
   */
  constructor(hint = "NoopResearcher: no implementation wired") {
    this.hint = hint;
  }

  /** Always rejects: inspecting failures is not implemented. */
  async inspectFailures(_runs) {
    const { hint } = this;
    throw new Error(`${hint} (inspectFailures not implemented)`);
  }

  /** Always rejects: proposing a change is not implemented. */
  async proposeChange(_failures) {
    const { hint } = this;
    throw new Error(`${hint} (proposeChange not implemented)`);
  }

  /** Always rejects: applying a change is not implemented. */
  async applyChange(_changes, _baseline) {
    const { hint } = this;
    throw new Error(`${hint} (applyChange not implemented)`);
  }

  /** Always rejects: evaluating a change is not implemented. */
  async evaluateChange(_plan) {
    const { hint } = this;
    throw new Error(`${hint} (evaluateChange not implemented)`);
  }
};
|
|
1377
|
-
|
|
1378
|
-
export {
|
|
1379
|
-
InMemoryFeedbackTrajectoryStore,
|
|
1380
|
-
FileSystemFeedbackTrajectoryStore,
|
|
1381
|
-
createFeedbackTrajectory,
|
|
1382
|
-
assignFeedbackSplit,
|
|
1383
|
-
withAssignedFeedbackSplit,
|
|
1384
|
-
feedbackTrajectoryToDatasetScenario,
|
|
1385
|
-
feedbackTrajectoriesToDatasetScenarios,
|
|
1386
|
-
feedbackTrajectoryToOptimizerRow,
|
|
1387
|
-
feedbackTrajectoriesToOptimizerRows,
|
|
1388
|
-
replayFeedbackTrajectory,
|
|
1389
|
-
replayFeedbackTrajectories,
|
|
1390
|
-
summarizePreferenceMemory,
|
|
1391
|
-
renderPreferenceMemoryMarkdown,
|
|
1392
|
-
serializeFeedbackTrajectoriesJsonl,
|
|
1393
|
-
parseFeedbackTrajectoriesJsonl,
|
|
1394
|
-
controlRunToFeedbackTrajectory,
|
|
1395
|
-
HeldOutGate,
|
|
1396
|
-
dominates,
|
|
1397
|
-
paretoFrontier,
|
|
1398
|
-
scalarScore,
|
|
1399
|
-
crowdingDistance,
|
|
1400
|
-
paretoFrontierWithCrowding,
|
|
1401
|
-
InMemoryTrialCache,
|
|
1402
|
-
runPromptEvolution,
|
|
1403
|
-
runMultiShotOptimization,
|
|
1404
|
-
defaultMultiShotObjectives,
|
|
1405
|
-
trialTraceFromMultiShotTrial,
|
|
1406
|
-
DEFAULT_MUTATION_PRIMITIVES,
|
|
1407
|
-
buildReflectionPrompt,
|
|
1408
|
-
parseReflectionResponse,
|
|
1409
|
-
CallbackResearcher,
|
|
1410
|
-
NoopResearcher
|
|
1411
|
-
};
|
|
1412
|
-
//# sourceMappingURL=chunk-DMW5VENN.js.map
|