@tangle-network/agent-eval 0.23.1 → 0.24.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (148)
  1. package/CHANGELOG.md +80 -0
  2. package/README.md +141 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  20. package/dist/chunk-6QDKWHLS.js.map +1 -0
  21. package/dist/chunk-I4MBDTY5.js +272 -0
  22. package/dist/chunk-I4MBDTY5.js.map +1 -0
  23. package/dist/chunk-K2TPS5LB.js +569 -0
  24. package/dist/chunk-K2TPS5LB.js.map +1 -0
  25. package/dist/chunk-KKHDIONI.js +414 -0
  26. package/dist/chunk-KKHDIONI.js.map +1 -0
  27. package/dist/chunk-KMPRBJK4.js +74 -0
  28. package/dist/chunk-KMPRBJK4.js.map +1 -0
  29. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  30. package/dist/chunk-KTGTIOFD.js.map +1 -0
  31. package/dist/chunk-LSH4MMOZ.js +838 -0
  32. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  33. package/dist/chunk-NG236HPC.js +57 -0
  34. package/dist/chunk-NG236HPC.js.map +1 -0
  35. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  36. package/dist/chunk-NLMNWKVM.js.map +1 -0
  37. package/dist/chunk-NU65VQ7M.js +99 -0
  38. package/dist/chunk-NU65VQ7M.js.map +1 -0
  39. package/dist/chunk-OHEPNJQN.js +554 -0
  40. package/dist/chunk-OHEPNJQN.js.map +1 -0
  41. package/dist/chunk-OWLAAMME.js +250 -0
  42. package/dist/chunk-OWLAAMME.js.map +1 -0
  43. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  44. package/dist/chunk-PC4UYEBM.js.map +1 -0
  45. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  46. package/dist/chunk-RAF443UI.js.map +1 -0
  47. package/dist/chunk-RZTMDUO7.js +49 -0
  48. package/dist/chunk-RZTMDUO7.js.map +1 -0
  49. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  50. package/dist/chunk-SESZDQPX.js.map +1 -0
  51. package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
  52. package/dist/chunk-SY6WAAAD.js.map +1 -0
  53. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  54. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  55. package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
  56. package/dist/chunk-VRJVTXRV.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +1866 -3151
  80. package/dist/index.js +5457 -7809
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +1 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +409 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-TDPn1cxq.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +22 -22
  125. package/dist/wire/index.js +4 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -1,10 +1,10 @@
1
1
  import {
2
2
  validateRunRecord
3
- } from "./chunk-QBW3YBTR.js";
3
+ } from "./chunk-NLMNWKVM.js";
4
4
  import {
5
5
  pairedBootstrap,
6
6
  pairedWilcoxon
7
- } from "./chunk-IOXMGMHQ.js";
7
+ } from "./chunk-2A5XJB43.js";
8
8
 
9
9
  // src/feedback-trajectory.ts
10
10
  var DEFAULT_SPLIT_POLICY = {
@@ -27,7 +27,8 @@ var InMemoryFeedbackTrajectoryStore = class {
27
27
  }
28
28
  async appendAttempt(id, attempt) {
29
29
  const trajectory = this.trajectories.get(id);
30
- if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
30
+ if (!trajectory)
31
+ throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
31
32
  const next = cloneTrajectory({
32
33
  ...trajectory,
33
34
  attempts: [...trajectory.attempts, attempt],
@@ -38,8 +39,11 @@ var InMemoryFeedbackTrajectoryStore = class {
38
39
  }
39
40
  async appendLabel(id, label, attemptId) {
40
41
  const trajectory = this.trajectories.get(id);
41
- if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
42
- const attempts = attemptId ? trajectory.attempts.map((attempt) => attempt.id === attemptId ? { ...attempt, feedback: [...attempt.feedback ?? [], label] } : attempt) : trajectory.attempts;
42
+ if (!trajectory)
43
+ throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
44
+ const attempts = attemptId ? trajectory.attempts.map(
45
+ (attempt) => attempt.id === attemptId ? { ...attempt, feedback: [...attempt.feedback ?? [], label] } : attempt
46
+ ) : trajectory.attempts;
43
47
  const next = cloneTrajectory({
44
48
  ...trajectory,
45
49
  attempts,
@@ -86,7 +90,12 @@ var FileSystemFeedbackTrajectoryStore = class {
86
90
  const { appendFile, mkdir } = await import("fs/promises");
87
91
  const { join } = await import("path");
88
92
  await mkdir(this.dir, { recursive: true });
89
- await appendFile(join(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
93
+ await appendFile(
94
+ join(this.dir, "feedback-trajectories.ndjson"),
95
+ `${JSON.stringify(record)}
96
+ `,
97
+ "utf8"
98
+ );
90
99
  }
91
100
  async load() {
92
101
  if (this.loaded) return;
@@ -100,8 +109,10 @@ var FileSystemFeedbackTrajectoryStore = class {
100
109
  try {
101
110
  const record = JSON.parse(line);
102
111
  if (record.op === "save") await this.memory.save(record.trajectory);
103
- if (record.op === "appendAttempt") await this.memory.appendAttempt(record.id, record.attempt);
104
- if (record.op === "appendLabel") await this.memory.appendLabel(record.id, record.label, record.attemptId);
112
+ if (record.op === "appendAttempt")
113
+ await this.memory.appendAttempt(record.id, record.attempt);
114
+ if (record.op === "appendLabel")
115
+ await this.memory.appendLabel(record.id, record.label, record.attemptId);
105
116
  } catch {
106
117
  }
107
118
  }
@@ -131,7 +142,9 @@ function assignFeedbackSplit(trajectory, policy = {}) {
131
142
  const split = { ...DEFAULT_SPLIT_POLICY, ...policy };
132
143
  const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
133
144
  if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
134
- const bucket = stableHash(`${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`) % total;
145
+ const bucket = stableHash(
146
+ `${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`
147
+ ) % total;
135
148
  if (bucket < split.trainPct) return "train";
136
149
  if (bucket < split.trainPct + split.devPct) return "dev";
137
150
  if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
@@ -192,14 +205,16 @@ async function replayFeedbackTrajectory(trajectory, adapter) {
192
205
  return {
193
206
  trajectoryId: trajectory.id,
194
207
  pass: false,
195
- labels: [{
196
- source: "system",
197
- kind: "reject",
198
- value: false,
199
- reason: message,
200
- severity: "error",
201
- createdAt
202
- }],
208
+ labels: [
209
+ {
210
+ source: "system",
211
+ kind: "reject",
212
+ value: false,
213
+ reason: message,
214
+ severity: "error",
215
+ createdAt
216
+ }
217
+ ],
203
218
  outcome: {
204
219
  success: false,
205
220
  score: 0,
@@ -250,10 +265,12 @@ function renderPreferenceMemoryMarkdown(entries) {
250
265
  lines.push(` Source: ${entry.sourceTrajectoryId}`);
251
266
  lines.push("");
252
267
  }
253
- return lines.join("\n").trim() + "\n";
268
+ return `${lines.join("\n").trim()}
269
+ `;
254
270
  }
255
271
  function serializeFeedbackTrajectoriesJsonl(trajectories) {
256
- return trajectories.slice().sort((a, b) => a.id.localeCompare(b.id)).map((trajectory) => JSON.stringify(canonicalize(trajectory))).join("\n") + "\n";
272
+ return `${trajectories.slice().sort((a, b) => a.id.localeCompare(b.id)).map((trajectory) => JSON.stringify(canonicalize(trajectory))).join("\n")}
273
+ `;
257
274
  }
258
275
  function parseFeedbackTrajectoriesJsonl(jsonl) {
259
276
  const trajectories = [];
@@ -326,17 +343,22 @@ function scoreFromLabels(labels) {
326
343
  const scored = labels.map((label) => {
327
344
  if (label.kind === "approve" || label.kind === "select") return 1;
328
345
  if (label.kind === "reject" || label.kind === "policy_block") return 0;
329
- if (label.kind === "rate" && typeof label.value === "number") return Math.max(0, Math.min(1, label.value));
346
+ if (label.kind === "rate" && typeof label.value === "number")
347
+ return Math.max(0, Math.min(1, label.value));
330
348
  return void 0;
331
349
  }).filter((value) => typeof value === "number");
332
350
  if (!scored.length) return void 0;
333
351
  return Math.round(scored.reduce((sum, value) => sum + value, 0) / scored.length * 1e3) / 1e3;
334
352
  }
335
353
  function instructionFromLabel(trajectory, label) {
336
- if (label.kind === "reject" && label.reason) return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${label.reason}`;
337
- if (label.kind === "revision_request" && label.reason) return `Revise similar work by applying: ${label.reason}`;
338
- if (label.kind === "select" && label.reason) return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${label.reason}`;
339
- if (label.kind === "approve" && label.reason) return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${label.reason}`;
354
+ if (label.kind === "reject" && label.reason)
355
+ return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${label.reason}`;
356
+ if (label.kind === "revision_request" && label.reason)
357
+ return `Revise similar work by applying: ${label.reason}`;
358
+ if (label.kind === "select" && label.reason)
359
+ return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${label.reason}`;
360
+ if (label.kind === "approve" && label.reason)
361
+ return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${label.reason}`;
340
362
  if (label.kind === "comment" && label.reason) return label.reason;
341
363
  return void 0;
342
364
  }
@@ -398,9 +420,7 @@ function paretoFrontier(candidates, objectives) {
398
420
  if (objectives.length === 0) {
399
421
  throw new Error("paretoFrontier: at least 1 objective required");
400
422
  }
401
- const valid = candidates.filter(
402
- (c) => objectives.every((o) => Number.isFinite(o.value(c)))
403
- );
423
+ const valid = candidates.filter((c) => objectives.every((o) => Number.isFinite(o.value(c))));
404
424
  const frontier = [];
405
425
  const dominated = [];
406
426
  for (const c of valid) {
@@ -624,44 +644,6 @@ function fmt(x) {
624
644
  return x.toFixed(4);
625
645
  }
626
646
 
627
- // src/researcher.ts
628
- var CallbackResearcher = class {
629
- constructor(callbacks) {
630
- this.callbacks = callbacks;
631
- }
632
- callbacks;
633
- inspectFailures(runs) {
634
- return this.callbacks.inspectFailures(runs);
635
- }
636
- proposeChange(failures) {
637
- return this.callbacks.proposeChange(failures);
638
- }
639
- applyChange(changes, baseline) {
640
- return this.callbacks.applyChange(changes, baseline);
641
- }
642
- evaluateChange(plan) {
643
- return this.callbacks.evaluateChange(plan);
644
- }
645
- };
646
- var NoopResearcher = class {
647
- hint;
648
- constructor(hint = "NoopResearcher: no implementation wired") {
649
- this.hint = hint;
650
- }
651
- async inspectFailures(_runs) {
652
- throw new Error(`${this.hint} (inspectFailures not implemented)`);
653
- }
654
- async proposeChange(_failures) {
655
- throw new Error(`${this.hint} (proposeChange not implemented)`);
656
- }
657
- async applyChange(_changes, _baseline) {
658
- throw new Error(`${this.hint} (applyChange not implemented)`);
659
- }
660
- async evaluateChange(_plan) {
661
- throw new Error(`${this.hint} (evaluateChange not implemented)`);
662
- }
663
- };
664
-
665
647
  // src/prompt-evolution.ts
666
648
  var InMemoryTrialCache = class {
667
649
  store = /* @__PURE__ */ new Map();
@@ -714,7 +696,11 @@ async function runPromptEvolution(config) {
714
696
  const prev = generations[generations.length - 2];
715
697
  const noChange = prev.winnerId === winnerId && samePopulation(prev.paretoFrontIds, [...frontIds]);
716
698
  if (noChange) {
717
- config.onProgress?.({ type: "converged", generation, reason: "no improvement vs previous generation" });
699
+ config.onProgress?.({
700
+ type: "converged",
701
+ generation,
702
+ reason: "no improvement vs previous generation"
703
+ });
718
704
  break;
719
705
  }
720
706
  }
@@ -726,7 +712,9 @@ async function runPromptEvolution(config) {
726
712
  target: config.target,
727
713
  generations,
728
714
  bestVariant,
729
- bestAggregate: bestAggregate ?? aggregateTrials(population, config.scenarioIds, []).find((a) => a.variantId === bestVariant.id)
715
+ bestAggregate: bestAggregate ?? aggregateTrials(population, config.scenarioIds, []).find(
716
+ (a) => a.variantId === bestVariant.id
717
+ )
730
718
  };
731
719
  }
732
720
  async function scorePopulation(population, config, generation) {
@@ -834,7 +822,9 @@ function mean2(xs) {
834
822
  async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
835
823
  const survivorIds = new Set(front.map((c) => c.candidate.variantId));
836
824
  const survivors = current.filter((v) => survivorIds.has(v.id));
837
- const ranked = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights }).sort((a, b) => b.score - a.score);
825
+ const ranked = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights }).sort(
826
+ (a, b) => b.score - a.score
827
+ );
838
828
  const parentId = ranked[0]?.candidate.variantId ?? current[0].id;
839
829
  const parent = current.find((v) => v.id === parentId) ?? current[0];
840
830
  const parentAggregate = aggregates.find((a) => a.variantId === parent.id) ?? aggregates[0];
@@ -947,8 +937,12 @@ async function evaluateMultiShotGate(config, baseline, candidate) {
947
937
  const seed = seedFor(config, scenarioId, rep);
948
938
  const baseTrial = await scoreOne(config, baseline, scenarioId, rep, "search");
949
939
  const candTrial = await scoreOne(config, candidate, scenarioId, rep, "search");
950
- baselineRuns.push(toValidatedRecord(config, baseline, scenarioId, rep, "search", seed, baseTrial));
951
- candidateRuns.push(toValidatedRecord(config, candidate, scenarioId, rep, "search", seed, candTrial));
940
+ baselineRuns.push(
941
+ toValidatedRecord(config, baseline, scenarioId, rep, "search", seed, baseTrial)
942
+ );
943
+ candidateRuns.push(
944
+ toValidatedRecord(config, candidate, scenarioId, rep, "search", seed, candTrial)
945
+ );
952
946
  }
953
947
  }
954
948
  for (const scenarioId of gateConfig.holdoutScenarioIds) {
@@ -956,8 +950,12 @@ async function evaluateMultiShotGate(config, baseline, candidate) {
956
950
  const seed = seedFor(config, scenarioId, rep);
957
951
  const baseTrial = await scoreOne(config, baseline, scenarioId, rep, "holdout");
958
952
  const candTrial = await scoreOne(config, candidate, scenarioId, rep, "holdout");
959
- baselineRuns.push(toValidatedRecord(config, baseline, scenarioId, rep, "holdout", seed, baseTrial));
960
- candidateRuns.push(toValidatedRecord(config, candidate, scenarioId, rep, "holdout", seed, candTrial));
953
+ baselineRuns.push(
954
+ toValidatedRecord(config, baseline, scenarioId, rep, "holdout", seed, baseTrial)
955
+ );
956
+ candidateRuns.push(
957
+ toValidatedRecord(config, candidate, scenarioId, rep, "holdout", seed, candTrial)
958
+ );
961
959
  }
962
960
  }
963
961
  const decision = new HeldOutGate(gateConfig.gate).evaluate(candidateRuns, baselineRuns);
@@ -1002,11 +1000,13 @@ async function scoreOne(config, variant, scenarioId, rep, split) {
1002
1000
  error: err instanceof Error ? err.message : String(err),
1003
1001
  split,
1004
1002
  seed,
1005
- asi: [{
1006
- severity: "critical",
1007
- message: err instanceof Error ? err.message : String(err),
1008
- responsibleSurface: config.target
1009
- }],
1003
+ asi: [
1004
+ {
1005
+ severity: "critical",
1006
+ message: err instanceof Error ? err.message : String(err),
1007
+ responsibleSurface: config.target
1008
+ }
1009
+ ],
1010
1010
  emitted: ""
1011
1011
  };
1012
1012
  }
@@ -1027,11 +1027,15 @@ function validateConfig(config) {
1027
1027
  requirePositiveInteger(config.reps, "reps");
1028
1028
  requirePositiveInteger(config.generations, "generations");
1029
1029
  requirePositiveInteger(config.populationSize, "populationSize");
1030
- if (config.scoreConcurrency !== void 0) requirePositiveInteger(config.scoreConcurrency, "scoreConcurrency");
1030
+ if (config.scoreConcurrency !== void 0)
1031
+ requirePositiveInteger(config.scoreConcurrency, "scoreConcurrency");
1031
1032
  if (config.populationSize < config.seedVariants.length) {
1032
1033
  throw new Error("runMultiShotOptimization: populationSize must be >= seedVariants.length");
1033
1034
  }
1034
- assertUnique(config.seedVariants.map((v) => v.id), "seedVariants.id");
1035
+ assertUnique(
1036
+ config.seedVariants.map((v) => v.id),
1037
+ "seedVariants.id"
1038
+ );
1035
1039
  assertUnique(config.searchScenarioIds, "searchScenarioIds");
1036
1040
  if (config.gate) {
1037
1041
  if (config.gate.holdoutScenarioIds.length === 0) {
@@ -1039,11 +1043,14 @@ function validateConfig(config) {
1039
1043
  }
1040
1044
  if (config.gate.reps !== void 0) requirePositiveInteger(config.gate.reps, "gate.reps");
1041
1045
  assertUnique(config.gate.holdoutScenarioIds, "gate.holdoutScenarioIds");
1042
- if (config.gate.searchScenarioIds) assertUnique(config.gate.searchScenarioIds, "gate.searchScenarioIds");
1046
+ if (config.gate.searchScenarioIds)
1047
+ assertUnique(config.gate.searchScenarioIds, "gate.searchScenarioIds");
1043
1048
  const searchIds = new Set(config.searchScenarioIds);
1044
1049
  for (const id of config.gate.holdoutScenarioIds) {
1045
1050
  if (searchIds.has(id)) {
1046
- throw new Error(`runMultiShotOptimization: holdout scenario "${id}" also appears in searchScenarioIds`);
1051
+ throw new Error(
1052
+ `runMultiShotOptimization: holdout scenario "${id}" also appears in searchScenarioIds`
1053
+ );
1047
1054
  }
1048
1055
  }
1049
1056
  const baselineId = config.seedVariants[0].id;
@@ -1062,7 +1069,8 @@ function requirePositiveInteger(value, name) {
1062
1069
  function assertUnique(values, name) {
1063
1070
  const seen = /* @__PURE__ */ new Set();
1064
1071
  for (const value of values) {
1065
- if (!value.trim()) throw new Error(`runMultiShotOptimization: ${name} must not contain empty values`);
1072
+ if (!value.trim())
1073
+ throw new Error(`runMultiShotOptimization: ${name} must not contain empty values`);
1066
1074
  if (seen.has(value)) throw new Error(`runMultiShotOptimization: duplicate ${name} "${value}"`);
1067
1075
  seen.add(value);
1068
1076
  }
@@ -1149,7 +1157,9 @@ function buildReflectionPrompt(ctx) {
1149
1157
  const sections = [];
1150
1158
  sections.push(`# Mutation target: ${ctx.target}`);
1151
1159
  sections.push("");
1152
- sections.push(`You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${ctx.childCount === 1 ? "" : "s"} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`);
1160
+ sections.push(
1161
+ `You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${ctx.childCount === 1 ? "" : "s"} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`
1162
+ );
1153
1163
  sections.push("");
1154
1164
  sections.push("## Current variant");
1155
1165
  sections.push("```json");
@@ -1160,7 +1170,9 @@ function buildReflectionPrompt(ctx) {
1160
1170
  sections.push("## Failures (bottom trials) \u2014 what went wrong");
1161
1171
  sections.push("");
1162
1172
  for (const trial of ctx.bottomTrials) {
1163
- sections.push(`### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`);
1173
+ sections.push(
1174
+ `### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`
1175
+ );
1164
1176
  const missed = (trial.expectations ?? []).filter((e) => !e.matched);
1165
1177
  if (missed.length > 0) {
1166
1178
  sections.push("");
@@ -1183,7 +1195,9 @@ function buildReflectionPrompt(ctx) {
1183
1195
  sections.push("## Successes (top trials) \u2014 what to preserve");
1184
1196
  sections.push("");
1185
1197
  for (const trial of ctx.topTrials) {
1186
- sections.push(`- \`${trial.id}\`: score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`);
1198
+ sections.push(
1199
+ `- \`${trial.id}\`: score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`
1200
+ );
1187
1201
  }
1188
1202
  sections.push("");
1189
1203
  }
@@ -1195,25 +1209,27 @@ function buildReflectionPrompt(ctx) {
1195
1209
  sections.push("");
1196
1210
  sections.push("Respond with a JSON object \u2014 no prose, no markdown fences:");
1197
1211
  sections.push("```json");
1198
- sections.push(JSON.stringify(
1199
- {
1200
- proposals: [
1201
- {
1202
- label: "<short label, \u2264 40 chars>",
1203
- rationale: "<which failure this targets and which primitive you used>",
1204
- payload: "<full payload of the new variant \u2014 same shape as the current variant>"
1205
- }
1206
- ]
1207
- },
1208
- null,
1209
- 2
1210
- ));
1212
+ sections.push(
1213
+ JSON.stringify(
1214
+ {
1215
+ proposals: [
1216
+ {
1217
+ label: "<short label, \u2264 40 chars>",
1218
+ rationale: "<which failure this targets and which primitive you used>",
1219
+ payload: "<full payload of the new variant \u2014 same shape as the current variant>"
1220
+ }
1221
+ ]
1222
+ },
1223
+ null,
1224
+ 2
1225
+ )
1226
+ );
1211
1227
  sections.push("```");
1212
1228
  return sections.join("\n");
1213
1229
  }
1214
1230
  function truncate(s, max) {
1215
1231
  if (s.length <= max) return s;
1216
- return s.slice(0, max) + "\u2026 [truncated]";
1232
+ return `${s.slice(0, max)}\u2026 [truncated]`;
1217
1233
  }
1218
1234
  function quote(s) {
1219
1235
  return s.replace(/`/g, "\\`");
@@ -1221,15 +1237,15 @@ function quote(s) {
1221
1237
  function autoCloseTruncatedJson(raw) {
1222
1238
  const stack = [];
1223
1239
  let inString = false;
1224
- let escape = false;
1240
+ let escaped = false;
1225
1241
  for (const c of raw) {
1226
- if (escape) {
1227
- escape = false;
1242
+ if (escaped) {
1243
+ escaped = false;
1228
1244
  continue;
1229
1245
  }
1230
1246
  if (inString) {
1231
1247
  if (c === "\\") {
1232
- escape = true;
1248
+ escaped = true;
1233
1249
  continue;
1234
1250
  }
1235
1251
  if (c === '"') {
@@ -1269,11 +1285,15 @@ function parseReflectionResponse(raw, maxProposals) {
1269
1285
  const tryObjectFirst = objectStart >= 0 && (arrayStart < 0 || objectStart < arrayStart);
1270
1286
  const candidates = [];
1271
1287
  if (tryObjectFirst) {
1272
- if (objectStart >= 0 && objectEnd > objectStart) candidates.push(text.slice(objectStart, objectEnd + 1));
1273
- if (arrayStart >= 0 && arrayEnd > arrayStart) candidates.push(text.slice(arrayStart, arrayEnd + 1));
1288
+ if (objectStart >= 0 && objectEnd > objectStart)
1289
+ candidates.push(text.slice(objectStart, objectEnd + 1));
1290
+ if (arrayStart >= 0 && arrayEnd > arrayStart)
1291
+ candidates.push(text.slice(arrayStart, arrayEnd + 1));
1274
1292
  } else {
1275
- if (arrayStart >= 0 && arrayEnd > arrayStart) candidates.push(text.slice(arrayStart, arrayEnd + 1));
1276
- if (objectStart >= 0 && objectEnd > objectStart) candidates.push(text.slice(objectStart, objectEnd + 1));
1293
+ if (arrayStart >= 0 && arrayEnd > arrayStart)
1294
+ candidates.push(text.slice(arrayStart, arrayEnd + 1));
1295
+ if (objectStart >= 0 && objectEnd > objectStart)
1296
+ candidates.push(text.slice(objectStart, objectEnd + 1));
1277
1297
  }
1278
1298
  for (const slice of candidates) {
1279
1299
  try {
@@ -1317,6 +1337,44 @@ function parseReflectionResponse(raw, maxProposals) {
1317
1337
  return out;
1318
1338
  }
1319
1339
 
1340
+ // src/researcher.ts
1341
+ var CallbackResearcher = class {
1342
+ constructor(callbacks) {
1343
+ this.callbacks = callbacks;
1344
+ }
1345
+ callbacks;
1346
+ inspectFailures(runs) {
1347
+ return this.callbacks.inspectFailures(runs);
1348
+ }
1349
+ proposeChange(failures) {
1350
+ return this.callbacks.proposeChange(failures);
1351
+ }
1352
+ applyChange(changes, baseline) {
1353
+ return this.callbacks.applyChange(changes, baseline);
1354
+ }
1355
+ evaluateChange(plan) {
1356
+ return this.callbacks.evaluateChange(plan);
1357
+ }
1358
+ };
1359
+ var NoopResearcher = class {
1360
+ hint;
1361
+ constructor(hint = "NoopResearcher: no implementation wired") {
1362
+ this.hint = hint;
1363
+ }
1364
+ async inspectFailures(_runs) {
1365
+ throw new Error(`${this.hint} (inspectFailures not implemented)`);
1366
+ }
1367
+ async proposeChange(_failures) {
1368
+ throw new Error(`${this.hint} (proposeChange not implemented)`);
1369
+ }
1370
+ async applyChange(_changes, _baseline) {
1371
+ throw new Error(`${this.hint} (applyChange not implemented)`);
1372
+ }
1373
+ async evaluateChange(_plan) {
1374
+ throw new Error(`${this.hint} (evaluateChange not implemented)`);
1375
+ }
1376
+ };
1377
+
1320
1378
  export {
1321
1379
  InMemoryFeedbackTrajectoryStore,
1322
1380
  FileSystemFeedbackTrajectoryStore,
@@ -1340,8 +1398,6 @@ export {
1340
1398
  crowdingDistance,
1341
1399
  paretoFrontierWithCrowding,
1342
1400
  HeldOutGate,
1343
- CallbackResearcher,
1344
- NoopResearcher,
1345
1401
  InMemoryTrialCache,
1346
1402
  runPromptEvolution,
1347
1403
  runMultiShotOptimization,
@@ -1349,6 +1405,8 @@ export {
1349
1405
  trialTraceFromMultiShotTrial,
1350
1406
  DEFAULT_MUTATION_PRIMITIVES,
1351
1407
  buildReflectionPrompt,
1352
- parseReflectionResponse
1408
+ parseReflectionResponse,
1409
+ CallbackResearcher,
1410
+ NoopResearcher
1353
1411
  };
1354
- //# sourceMappingURL=chunk-VQQSPGSM.js.map
1412
+ //# sourceMappingURL=chunk-VRJVTXRV.js.map