@tangle-network/agent-eval 0.23.0 → 0.24.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +102 -0
- package/README.md +141 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OHEPNJQN.js +554 -0
- package/dist/chunk-OHEPNJQN.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
- package/dist/chunk-SY6WAAAD.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
- package/dist/chunk-VRJVTXRV.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +1866 -3151
- package/dist/index.js +5457 -7809
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +409 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-TDPn1cxq.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +22 -22
- package/dist/wire/index.js +4 -3
- package/package.json +35 -2
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-4W4NCYM2.js +0 -1945
- package/dist/chunk-4W4NCYM2.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
2
|
validateRunRecord
|
|
3
|
-
} from "./chunk-QBW3YBTR.js";
|
|
3
|
+
} from "./chunk-NLMNWKVM.js";
|
|
4
4
|
import {
|
|
5
5
|
pairedBootstrap,
|
|
6
6
|
pairedWilcoxon
|
|
7
|
-
} from "./chunk-IOXMGMHQ.js";
|
|
7
|
+
} from "./chunk-2A5XJB43.js";
|
|
8
8
|
|
|
9
9
|
// src/feedback-trajectory.ts
|
|
10
10
|
var DEFAULT_SPLIT_POLICY = {
|
|
@@ -27,7 +27,8 @@ var InMemoryFeedbackTrajectoryStore = class {
|
|
|
27
27
|
}
|
|
28
28
|
async appendAttempt(id, attempt) {
|
|
29
29
|
const trajectory = this.trajectories.get(id);
|
|
30
|
-
if (!trajectory)
|
|
30
|
+
if (!trajectory)
|
|
31
|
+
throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
|
|
31
32
|
const next = cloneTrajectory({
|
|
32
33
|
...trajectory,
|
|
33
34
|
attempts: [...trajectory.attempts, attempt],
|
|
@@ -38,8 +39,11 @@ var InMemoryFeedbackTrajectoryStore = class {
|
|
|
38
39
|
}
|
|
39
40
|
async appendLabel(id, label, attemptId) {
|
|
40
41
|
const trajectory = this.trajectories.get(id);
|
|
41
|
-
if (!trajectory)
|
|
42
|
-
|
|
42
|
+
if (!trajectory)
|
|
43
|
+
throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
|
|
44
|
+
const attempts = attemptId ? trajectory.attempts.map(
|
|
45
|
+
(attempt) => attempt.id === attemptId ? { ...attempt, feedback: [...attempt.feedback ?? [], label] } : attempt
|
|
46
|
+
) : trajectory.attempts;
|
|
43
47
|
const next = cloneTrajectory({
|
|
44
48
|
...trajectory,
|
|
45
49
|
attempts,
|
|
@@ -86,7 +90,12 @@ var FileSystemFeedbackTrajectoryStore = class {
|
|
|
86
90
|
const { appendFile, mkdir } = await import("fs/promises");
|
|
87
91
|
const { join } = await import("path");
|
|
88
92
|
await mkdir(this.dir, { recursive: true });
|
|
89
|
-
await appendFile(
|
|
93
|
+
await appendFile(
|
|
94
|
+
join(this.dir, "feedback-trajectories.ndjson"),
|
|
95
|
+
`${JSON.stringify(record)}
|
|
96
|
+
`,
|
|
97
|
+
"utf8"
|
|
98
|
+
);
|
|
90
99
|
}
|
|
91
100
|
async load() {
|
|
92
101
|
if (this.loaded) return;
|
|
@@ -100,8 +109,10 @@ var FileSystemFeedbackTrajectoryStore = class {
|
|
|
100
109
|
try {
|
|
101
110
|
const record = JSON.parse(line);
|
|
102
111
|
if (record.op === "save") await this.memory.save(record.trajectory);
|
|
103
|
-
if (record.op === "appendAttempt")
|
|
104
|
-
|
|
112
|
+
if (record.op === "appendAttempt")
|
|
113
|
+
await this.memory.appendAttempt(record.id, record.attempt);
|
|
114
|
+
if (record.op === "appendLabel")
|
|
115
|
+
await this.memory.appendLabel(record.id, record.label, record.attemptId);
|
|
105
116
|
} catch {
|
|
106
117
|
}
|
|
107
118
|
}
|
|
@@ -131,7 +142,9 @@ function assignFeedbackSplit(trajectory, policy = {}) {
|
|
|
131
142
|
const split = { ...DEFAULT_SPLIT_POLICY, ...policy };
|
|
132
143
|
const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
|
|
133
144
|
if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
|
|
134
|
-
const bucket = stableHash(
|
|
145
|
+
const bucket = stableHash(
|
|
146
|
+
`${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`
|
|
147
|
+
) % total;
|
|
135
148
|
if (bucket < split.trainPct) return "train";
|
|
136
149
|
if (bucket < split.trainPct + split.devPct) return "dev";
|
|
137
150
|
if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
|
|
@@ -192,14 +205,16 @@ async function replayFeedbackTrajectory(trajectory, adapter) {
|
|
|
192
205
|
return {
|
|
193
206
|
trajectoryId: trajectory.id,
|
|
194
207
|
pass: false,
|
|
195
|
-
labels: [
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
208
|
+
labels: [
|
|
209
|
+
{
|
|
210
|
+
source: "system",
|
|
211
|
+
kind: "reject",
|
|
212
|
+
value: false,
|
|
213
|
+
reason: message,
|
|
214
|
+
severity: "error",
|
|
215
|
+
createdAt
|
|
216
|
+
}
|
|
217
|
+
],
|
|
203
218
|
outcome: {
|
|
204
219
|
success: false,
|
|
205
220
|
score: 0,
|
|
@@ -250,10 +265,12 @@ function renderPreferenceMemoryMarkdown(entries) {
|
|
|
250
265
|
lines.push(` Source: ${entry.sourceTrajectoryId}`);
|
|
251
266
|
lines.push("");
|
|
252
267
|
}
|
|
253
|
-
return lines.join("\n").trim()
|
|
268
|
+
return `${lines.join("\n").trim()}
|
|
269
|
+
`;
|
|
254
270
|
}
|
|
255
271
|
function serializeFeedbackTrajectoriesJsonl(trajectories) {
|
|
256
|
-
return trajectories.slice().sort((a, b) => a.id.localeCompare(b.id)).map((trajectory) => JSON.stringify(canonicalize(trajectory))).join("\n")
|
|
272
|
+
return `${trajectories.slice().sort((a, b) => a.id.localeCompare(b.id)).map((trajectory) => JSON.stringify(canonicalize(trajectory))).join("\n")}
|
|
273
|
+
`;
|
|
257
274
|
}
|
|
258
275
|
function parseFeedbackTrajectoriesJsonl(jsonl) {
|
|
259
276
|
const trajectories = [];
|
|
@@ -326,17 +343,22 @@ function scoreFromLabels(labels) {
|
|
|
326
343
|
const scored = labels.map((label) => {
|
|
327
344
|
if (label.kind === "approve" || label.kind === "select") return 1;
|
|
328
345
|
if (label.kind === "reject" || label.kind === "policy_block") return 0;
|
|
329
|
-
if (label.kind === "rate" && typeof label.value === "number")
|
|
346
|
+
if (label.kind === "rate" && typeof label.value === "number")
|
|
347
|
+
return Math.max(0, Math.min(1, label.value));
|
|
330
348
|
return void 0;
|
|
331
349
|
}).filter((value) => typeof value === "number");
|
|
332
350
|
if (!scored.length) return void 0;
|
|
333
351
|
return Math.round(scored.reduce((sum, value) => sum + value, 0) / scored.length * 1e3) / 1e3;
|
|
334
352
|
}
|
|
335
353
|
function instructionFromLabel(trajectory, label) {
|
|
336
|
-
if (label.kind === "reject" && label.reason)
|
|
337
|
-
|
|
338
|
-
if (label.kind === "
|
|
339
|
-
|
|
354
|
+
if (label.kind === "reject" && label.reason)
|
|
355
|
+
return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${label.reason}`;
|
|
356
|
+
if (label.kind === "revision_request" && label.reason)
|
|
357
|
+
return `Revise similar work by applying: ${label.reason}`;
|
|
358
|
+
if (label.kind === "select" && label.reason)
|
|
359
|
+
return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${label.reason}`;
|
|
360
|
+
if (label.kind === "approve" && label.reason)
|
|
361
|
+
return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${label.reason}`;
|
|
340
362
|
if (label.kind === "comment" && label.reason) return label.reason;
|
|
341
363
|
return void 0;
|
|
342
364
|
}
|
|
@@ -398,9 +420,7 @@ function paretoFrontier(candidates, objectives) {
|
|
|
398
420
|
if (objectives.length === 0) {
|
|
399
421
|
throw new Error("paretoFrontier: at least 1 objective required");
|
|
400
422
|
}
|
|
401
|
-
const valid = candidates.filter(
|
|
402
|
-
(c) => objectives.every((o) => Number.isFinite(o.value(c)))
|
|
403
|
-
);
|
|
423
|
+
const valid = candidates.filter((c) => objectives.every((o) => Number.isFinite(o.value(c))));
|
|
404
424
|
const frontier = [];
|
|
405
425
|
const dominated = [];
|
|
406
426
|
for (const c of valid) {
|
|
@@ -624,44 +644,6 @@ function fmt(x) {
|
|
|
624
644
|
return x.toFixed(4);
|
|
625
645
|
}
|
|
626
646
|
|
|
627
|
-
// src/researcher.ts
|
|
628
|
-
var CallbackResearcher = class {
|
|
629
|
-
constructor(callbacks) {
|
|
630
|
-
this.callbacks = callbacks;
|
|
631
|
-
}
|
|
632
|
-
callbacks;
|
|
633
|
-
inspectFailures(runs) {
|
|
634
|
-
return this.callbacks.inspectFailures(runs);
|
|
635
|
-
}
|
|
636
|
-
proposeChange(failures) {
|
|
637
|
-
return this.callbacks.proposeChange(failures);
|
|
638
|
-
}
|
|
639
|
-
applyChange(changes, baseline) {
|
|
640
|
-
return this.callbacks.applyChange(changes, baseline);
|
|
641
|
-
}
|
|
642
|
-
evaluateChange(plan) {
|
|
643
|
-
return this.callbacks.evaluateChange(plan);
|
|
644
|
-
}
|
|
645
|
-
};
|
|
646
|
-
var NoopResearcher = class {
|
|
647
|
-
hint;
|
|
648
|
-
constructor(hint = "NoopResearcher: no implementation wired") {
|
|
649
|
-
this.hint = hint;
|
|
650
|
-
}
|
|
651
|
-
async inspectFailures(_runs) {
|
|
652
|
-
throw new Error(`${this.hint} (inspectFailures not implemented)`);
|
|
653
|
-
}
|
|
654
|
-
async proposeChange(_failures) {
|
|
655
|
-
throw new Error(`${this.hint} (proposeChange not implemented)`);
|
|
656
|
-
}
|
|
657
|
-
async applyChange(_changes, _baseline) {
|
|
658
|
-
throw new Error(`${this.hint} (applyChange not implemented)`);
|
|
659
|
-
}
|
|
660
|
-
async evaluateChange(_plan) {
|
|
661
|
-
throw new Error(`${this.hint} (evaluateChange not implemented)`);
|
|
662
|
-
}
|
|
663
|
-
};
|
|
664
|
-
|
|
665
647
|
// src/prompt-evolution.ts
|
|
666
648
|
var InMemoryTrialCache = class {
|
|
667
649
|
store = /* @__PURE__ */ new Map();
|
|
@@ -714,7 +696,11 @@ async function runPromptEvolution(config) {
|
|
|
714
696
|
const prev = generations[generations.length - 2];
|
|
715
697
|
const noChange = prev.winnerId === winnerId && samePopulation(prev.paretoFrontIds, [...frontIds]);
|
|
716
698
|
if (noChange) {
|
|
717
|
-
config.onProgress?.({
|
|
699
|
+
config.onProgress?.({
|
|
700
|
+
type: "converged",
|
|
701
|
+
generation,
|
|
702
|
+
reason: "no improvement vs previous generation"
|
|
703
|
+
});
|
|
718
704
|
break;
|
|
719
705
|
}
|
|
720
706
|
}
|
|
@@ -726,7 +712,9 @@ async function runPromptEvolution(config) {
|
|
|
726
712
|
target: config.target,
|
|
727
713
|
generations,
|
|
728
714
|
bestVariant,
|
|
729
|
-
bestAggregate: bestAggregate ?? aggregateTrials(population, config.scenarioIds, []).find(
|
|
715
|
+
bestAggregate: bestAggregate ?? aggregateTrials(population, config.scenarioIds, []).find(
|
|
716
|
+
(a) => a.variantId === bestVariant.id
|
|
717
|
+
)
|
|
730
718
|
};
|
|
731
719
|
}
|
|
732
720
|
async function scorePopulation(population, config, generation) {
|
|
@@ -834,7 +822,9 @@ function mean2(xs) {
|
|
|
834
822
|
async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
|
|
835
823
|
const survivorIds = new Set(front.map((c) => c.candidate.variantId));
|
|
836
824
|
const survivors = current.filter((v) => survivorIds.has(v.id));
|
|
837
|
-
const ranked = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights }).sort(
|
|
825
|
+
const ranked = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights }).sort(
|
|
826
|
+
(a, b) => b.score - a.score
|
|
827
|
+
);
|
|
838
828
|
const parentId = ranked[0]?.candidate.variantId ?? current[0].id;
|
|
839
829
|
const parent = current.find((v) => v.id === parentId) ?? current[0];
|
|
840
830
|
const parentAggregate = aggregates.find((a) => a.variantId === parent.id) ?? aggregates[0];
|
|
@@ -947,8 +937,12 @@ async function evaluateMultiShotGate(config, baseline, candidate) {
|
|
|
947
937
|
const seed = seedFor(config, scenarioId, rep);
|
|
948
938
|
const baseTrial = await scoreOne(config, baseline, scenarioId, rep, "search");
|
|
949
939
|
const candTrial = await scoreOne(config, candidate, scenarioId, rep, "search");
|
|
950
|
-
baselineRuns.push(
|
|
951
|
-
|
|
940
|
+
baselineRuns.push(
|
|
941
|
+
toValidatedRecord(config, baseline, scenarioId, rep, "search", seed, baseTrial)
|
|
942
|
+
);
|
|
943
|
+
candidateRuns.push(
|
|
944
|
+
toValidatedRecord(config, candidate, scenarioId, rep, "search", seed, candTrial)
|
|
945
|
+
);
|
|
952
946
|
}
|
|
953
947
|
}
|
|
954
948
|
for (const scenarioId of gateConfig.holdoutScenarioIds) {
|
|
@@ -956,8 +950,12 @@ async function evaluateMultiShotGate(config, baseline, candidate) {
|
|
|
956
950
|
const seed = seedFor(config, scenarioId, rep);
|
|
957
951
|
const baseTrial = await scoreOne(config, baseline, scenarioId, rep, "holdout");
|
|
958
952
|
const candTrial = await scoreOne(config, candidate, scenarioId, rep, "holdout");
|
|
959
|
-
baselineRuns.push(
|
|
960
|
-
|
|
953
|
+
baselineRuns.push(
|
|
954
|
+
toValidatedRecord(config, baseline, scenarioId, rep, "holdout", seed, baseTrial)
|
|
955
|
+
);
|
|
956
|
+
candidateRuns.push(
|
|
957
|
+
toValidatedRecord(config, candidate, scenarioId, rep, "holdout", seed, candTrial)
|
|
958
|
+
);
|
|
961
959
|
}
|
|
962
960
|
}
|
|
963
961
|
const decision = new HeldOutGate(gateConfig.gate).evaluate(candidateRuns, baselineRuns);
|
|
@@ -1002,11 +1000,13 @@ async function scoreOne(config, variant, scenarioId, rep, split) {
|
|
|
1002
1000
|
error: err instanceof Error ? err.message : String(err),
|
|
1003
1001
|
split,
|
|
1004
1002
|
seed,
|
|
1005
|
-
asi: [
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1003
|
+
asi: [
|
|
1004
|
+
{
|
|
1005
|
+
severity: "critical",
|
|
1006
|
+
message: err instanceof Error ? err.message : String(err),
|
|
1007
|
+
responsibleSurface: config.target
|
|
1008
|
+
}
|
|
1009
|
+
],
|
|
1010
1010
|
emitted: ""
|
|
1011
1011
|
};
|
|
1012
1012
|
}
|
|
@@ -1027,11 +1027,15 @@ function validateConfig(config) {
|
|
|
1027
1027
|
requirePositiveInteger(config.reps, "reps");
|
|
1028
1028
|
requirePositiveInteger(config.generations, "generations");
|
|
1029
1029
|
requirePositiveInteger(config.populationSize, "populationSize");
|
|
1030
|
-
if (config.scoreConcurrency !== void 0)
|
|
1030
|
+
if (config.scoreConcurrency !== void 0)
|
|
1031
|
+
requirePositiveInteger(config.scoreConcurrency, "scoreConcurrency");
|
|
1031
1032
|
if (config.populationSize < config.seedVariants.length) {
|
|
1032
1033
|
throw new Error("runMultiShotOptimization: populationSize must be >= seedVariants.length");
|
|
1033
1034
|
}
|
|
1034
|
-
assertUnique(
|
|
1035
|
+
assertUnique(
|
|
1036
|
+
config.seedVariants.map((v) => v.id),
|
|
1037
|
+
"seedVariants.id"
|
|
1038
|
+
);
|
|
1035
1039
|
assertUnique(config.searchScenarioIds, "searchScenarioIds");
|
|
1036
1040
|
if (config.gate) {
|
|
1037
1041
|
if (config.gate.holdoutScenarioIds.length === 0) {
|
|
@@ -1039,11 +1043,14 @@ function validateConfig(config) {
|
|
|
1039
1043
|
}
|
|
1040
1044
|
if (config.gate.reps !== void 0) requirePositiveInteger(config.gate.reps, "gate.reps");
|
|
1041
1045
|
assertUnique(config.gate.holdoutScenarioIds, "gate.holdoutScenarioIds");
|
|
1042
|
-
if (config.gate.searchScenarioIds)
|
|
1046
|
+
if (config.gate.searchScenarioIds)
|
|
1047
|
+
assertUnique(config.gate.searchScenarioIds, "gate.searchScenarioIds");
|
|
1043
1048
|
const searchIds = new Set(config.searchScenarioIds);
|
|
1044
1049
|
for (const id of config.gate.holdoutScenarioIds) {
|
|
1045
1050
|
if (searchIds.has(id)) {
|
|
1046
|
-
throw new Error(
|
|
1051
|
+
throw new Error(
|
|
1052
|
+
`runMultiShotOptimization: holdout scenario "${id}" also appears in searchScenarioIds`
|
|
1053
|
+
);
|
|
1047
1054
|
}
|
|
1048
1055
|
}
|
|
1049
1056
|
const baselineId = config.seedVariants[0].id;
|
|
@@ -1062,7 +1069,8 @@ function requirePositiveInteger(value, name) {
|
|
|
1062
1069
|
function assertUnique(values, name) {
|
|
1063
1070
|
const seen = /* @__PURE__ */ new Set();
|
|
1064
1071
|
for (const value of values) {
|
|
1065
|
-
if (!value.trim())
|
|
1072
|
+
if (!value.trim())
|
|
1073
|
+
throw new Error(`runMultiShotOptimization: ${name} must not contain empty values`);
|
|
1066
1074
|
if (seen.has(value)) throw new Error(`runMultiShotOptimization: duplicate ${name} "${value}"`);
|
|
1067
1075
|
seen.add(value);
|
|
1068
1076
|
}
|
|
@@ -1149,7 +1157,9 @@ function buildReflectionPrompt(ctx) {
|
|
|
1149
1157
|
const sections = [];
|
|
1150
1158
|
sections.push(`# Mutation target: ${ctx.target}`);
|
|
1151
1159
|
sections.push("");
|
|
1152
|
-
sections.push(
|
|
1160
|
+
sections.push(
|
|
1161
|
+
`You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${ctx.childCount === 1 ? "" : "s"} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`
|
|
1162
|
+
);
|
|
1153
1163
|
sections.push("");
|
|
1154
1164
|
sections.push("## Current variant");
|
|
1155
1165
|
sections.push("```json");
|
|
@@ -1160,7 +1170,9 @@ function buildReflectionPrompt(ctx) {
|
|
|
1160
1170
|
sections.push("## Failures (bottom trials) \u2014 what went wrong");
|
|
1161
1171
|
sections.push("");
|
|
1162
1172
|
for (const trial of ctx.bottomTrials) {
|
|
1163
|
-
sections.push(
|
|
1173
|
+
sections.push(
|
|
1174
|
+
`### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`
|
|
1175
|
+
);
|
|
1164
1176
|
const missed = (trial.expectations ?? []).filter((e) => !e.matched);
|
|
1165
1177
|
if (missed.length > 0) {
|
|
1166
1178
|
sections.push("");
|
|
@@ -1183,7 +1195,9 @@ function buildReflectionPrompt(ctx) {
|
|
|
1183
1195
|
sections.push("## Successes (top trials) \u2014 what to preserve");
|
|
1184
1196
|
sections.push("");
|
|
1185
1197
|
for (const trial of ctx.topTrials) {
|
|
1186
|
-
sections.push(
|
|
1198
|
+
sections.push(
|
|
1199
|
+
`- \`${trial.id}\`: score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`
|
|
1200
|
+
);
|
|
1187
1201
|
}
|
|
1188
1202
|
sections.push("");
|
|
1189
1203
|
}
|
|
@@ -1195,25 +1209,27 @@ function buildReflectionPrompt(ctx) {
|
|
|
1195
1209
|
sections.push("");
|
|
1196
1210
|
sections.push("Respond with a JSON object \u2014 no prose, no markdown fences:");
|
|
1197
1211
|
sections.push("```json");
|
|
1198
|
-
sections.push(
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1212
|
+
sections.push(
|
|
1213
|
+
JSON.stringify(
|
|
1214
|
+
{
|
|
1215
|
+
proposals: [
|
|
1216
|
+
{
|
|
1217
|
+
label: "<short label, \u2264 40 chars>",
|
|
1218
|
+
rationale: "<which failure this targets and which primitive you used>",
|
|
1219
|
+
payload: "<full payload of the new variant \u2014 same shape as the current variant>"
|
|
1220
|
+
}
|
|
1221
|
+
]
|
|
1222
|
+
},
|
|
1223
|
+
null,
|
|
1224
|
+
2
|
|
1225
|
+
)
|
|
1226
|
+
);
|
|
1211
1227
|
sections.push("```");
|
|
1212
1228
|
return sections.join("\n");
|
|
1213
1229
|
}
|
|
1214
1230
|
function truncate(s, max) {
|
|
1215
1231
|
if (s.length <= max) return s;
|
|
1216
|
-
return s.slice(0, max)
|
|
1232
|
+
return `${s.slice(0, max)}\u2026 [truncated]`;
|
|
1217
1233
|
}
|
|
1218
1234
|
function quote(s) {
|
|
1219
1235
|
return s.replace(/`/g, "\\`");
|
|
@@ -1221,15 +1237,15 @@ function quote(s) {
|
|
|
1221
1237
|
function autoCloseTruncatedJson(raw) {
|
|
1222
1238
|
const stack = [];
|
|
1223
1239
|
let inString = false;
|
|
1224
|
-
let
|
|
1240
|
+
let escaped = false;
|
|
1225
1241
|
for (const c of raw) {
|
|
1226
|
-
if (
|
|
1227
|
-
|
|
1242
|
+
if (escaped) {
|
|
1243
|
+
escaped = false;
|
|
1228
1244
|
continue;
|
|
1229
1245
|
}
|
|
1230
1246
|
if (inString) {
|
|
1231
1247
|
if (c === "\\") {
|
|
1232
|
-
|
|
1248
|
+
escaped = true;
|
|
1233
1249
|
continue;
|
|
1234
1250
|
}
|
|
1235
1251
|
if (c === '"') {
|
|
@@ -1269,11 +1285,15 @@ function parseReflectionResponse(raw, maxProposals) {
|
|
|
1269
1285
|
const tryObjectFirst = objectStart >= 0 && (arrayStart < 0 || objectStart < arrayStart);
|
|
1270
1286
|
const candidates = [];
|
|
1271
1287
|
if (tryObjectFirst) {
|
|
1272
|
-
if (objectStart >= 0 && objectEnd > objectStart)
|
|
1273
|
-
|
|
1288
|
+
if (objectStart >= 0 && objectEnd > objectStart)
|
|
1289
|
+
candidates.push(text.slice(objectStart, objectEnd + 1));
|
|
1290
|
+
if (arrayStart >= 0 && arrayEnd > arrayStart)
|
|
1291
|
+
candidates.push(text.slice(arrayStart, arrayEnd + 1));
|
|
1274
1292
|
} else {
|
|
1275
|
-
if (arrayStart >= 0 && arrayEnd > arrayStart)
|
|
1276
|
-
|
|
1293
|
+
if (arrayStart >= 0 && arrayEnd > arrayStart)
|
|
1294
|
+
candidates.push(text.slice(arrayStart, arrayEnd + 1));
|
|
1295
|
+
if (objectStart >= 0 && objectEnd > objectStart)
|
|
1296
|
+
candidates.push(text.slice(objectStart, objectEnd + 1));
|
|
1277
1297
|
}
|
|
1278
1298
|
for (const slice of candidates) {
|
|
1279
1299
|
try {
|
|
@@ -1317,6 +1337,44 @@ function parseReflectionResponse(raw, maxProposals) {
|
|
|
1317
1337
|
return out;
|
|
1318
1338
|
}
|
|
1319
1339
|
|
|
1340
|
+
// src/researcher.ts
|
|
1341
|
+
var CallbackResearcher = class {
|
|
1342
|
+
constructor(callbacks) {
|
|
1343
|
+
this.callbacks = callbacks;
|
|
1344
|
+
}
|
|
1345
|
+
callbacks;
|
|
1346
|
+
inspectFailures(runs) {
|
|
1347
|
+
return this.callbacks.inspectFailures(runs);
|
|
1348
|
+
}
|
|
1349
|
+
proposeChange(failures) {
|
|
1350
|
+
return this.callbacks.proposeChange(failures);
|
|
1351
|
+
}
|
|
1352
|
+
applyChange(changes, baseline) {
|
|
1353
|
+
return this.callbacks.applyChange(changes, baseline);
|
|
1354
|
+
}
|
|
1355
|
+
evaluateChange(plan) {
|
|
1356
|
+
return this.callbacks.evaluateChange(plan);
|
|
1357
|
+
}
|
|
1358
|
+
};
|
|
1359
|
+
var NoopResearcher = class {
|
|
1360
|
+
hint;
|
|
1361
|
+
constructor(hint = "NoopResearcher: no implementation wired") {
|
|
1362
|
+
this.hint = hint;
|
|
1363
|
+
}
|
|
1364
|
+
async inspectFailures(_runs) {
|
|
1365
|
+
throw new Error(`${this.hint} (inspectFailures not implemented)`);
|
|
1366
|
+
}
|
|
1367
|
+
async proposeChange(_failures) {
|
|
1368
|
+
throw new Error(`${this.hint} (proposeChange not implemented)`);
|
|
1369
|
+
}
|
|
1370
|
+
async applyChange(_changes, _baseline) {
|
|
1371
|
+
throw new Error(`${this.hint} (applyChange not implemented)`);
|
|
1372
|
+
}
|
|
1373
|
+
async evaluateChange(_plan) {
|
|
1374
|
+
throw new Error(`${this.hint} (evaluateChange not implemented)`);
|
|
1375
|
+
}
|
|
1376
|
+
};
|
|
1377
|
+
|
|
1320
1378
|
export {
|
|
1321
1379
|
InMemoryFeedbackTrajectoryStore,
|
|
1322
1380
|
FileSystemFeedbackTrajectoryStore,
|
|
@@ -1340,8 +1398,6 @@ export {
|
|
|
1340
1398
|
crowdingDistance,
|
|
1341
1399
|
paretoFrontierWithCrowding,
|
|
1342
1400
|
HeldOutGate,
|
|
1343
|
-
CallbackResearcher,
|
|
1344
|
-
NoopResearcher,
|
|
1345
1401
|
InMemoryTrialCache,
|
|
1346
1402
|
runPromptEvolution,
|
|
1347
1403
|
runMultiShotOptimization,
|
|
@@ -1349,6 +1405,8 @@ export {
|
|
|
1349
1405
|
trialTraceFromMultiShotTrial,
|
|
1350
1406
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
1351
1407
|
buildReflectionPrompt,
|
|
1352
|
-
parseReflectionResponse
|
|
1408
|
+
parseReflectionResponse,
|
|
1409
|
+
CallbackResearcher,
|
|
1410
|
+
NoopResearcher
|
|
1353
1411
|
};
|
|
1354
|
-
//# sourceMappingURL=chunk-VQQSPGSM.js.map
|
|
1412
|
+
//# sourceMappingURL=chunk-VRJVTXRV.js.map
|