@tangle-network/agent-eval 0.62.0 → 0.64.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,11 @@ import {
3
3
  } from "./chunk-7TPYV2ER.js";
4
4
  import {
5
5
  buildReflectionPrompt,
6
+ paretoFrontier,
6
7
  parseReflectionResponse,
7
8
  runCanaries,
8
9
  scoreRedTeamOutput
9
- } from "./chunk-SS2SOBBT.js";
10
+ } from "./chunk-4ODZXQV2.js";
10
11
  import {
11
12
  summarizeBackendIntegrity
12
13
  } from "./chunk-E22YUOAL.js";
@@ -147,52 +148,120 @@ function evolutionaryDriver(opts) {
147
148
 
148
149
  // src/campaign/drivers/gepa.ts
149
150
  var REFLECTION_SYSTEM = 'You are an expert prompt engineer. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} where each `payload` is the FULL improved surface text. No prose outside the JSON.';
151
+ var COMBINE_SYSTEM = 'You are an expert prompt engineer performing a GEPA "combine complementary lessons" merge. You are given several non-dominated versions of one surface; each is uniquely best on different scenarios. Produce ONE new version that keeps what makes each version strong on its winning scenarios and resolves conflicts in favor of the more general rule. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} with exactly one proposal whose `payload` is the FULL merged surface text. No prose outside the JSON.';
150
152
  function gepaDriver(opts) {
151
153
  const evidenceK = opts.evidenceK ?? 3;
154
+ const combineParents = opts.combineParents ?? true;
155
+ const combineMaxParents = opts.combineMaxParents ?? 4;
156
+ if (combineParents && combineMaxParents < 1) {
157
+ throw new Error("gepaDriver: combineMaxParents must be >= 1 when combineParents is enabled");
158
+ }
152
159
  return {
153
160
  kind: "gepa",
154
161
  async propose(ctx) {
155
162
  const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
156
- const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
157
- const userPrompt = buildReflectionPrompt({
158
- target,
159
- parentPayload: parent,
160
- topTrials: top,
161
- bottomTrials: bottom,
162
- childCount: ctx.populationSize,
163
- mutationPrimitives: opts.mutationPrimitives
164
- });
165
- const result = await callLlm(
166
- {
167
- model: opts.model,
168
- messages: [
169
- { role: "system", content: REFLECTION_SYSTEM },
170
- { role: "user", content: userPrompt }
171
- ],
172
- jsonMode: true,
173
- temperature: opts.temperature ?? 0.7,
174
- maxTokens: opts.maxTokens ?? 6e3
175
- },
176
- opts.llm
177
- );
178
- const proposals = parseReflectionResponse(result.content, ctx.populationSize);
179
- const out = [];
180
- const seen = /* @__PURE__ */ new Set();
181
163
  const constraints = opts.constraints;
182
164
  const preserveSections = constraints?.preserveSections !== void 0 ? constraints.preserveSections.length === 0 ? extractH2Sections(parent) : constraints.preserveSections : null;
183
165
  const maxEdits = constraints?.maxSentenceEdits;
184
- for (const proposal of proposals) {
185
- const text = typeof proposal.payload === "string" ? proposal.payload.trim() : "";
186
- if (!text || text === parent || seen.has(text)) continue;
187
- if (preserveSections && !validatePreservedSections(text, preserveSections)) continue;
188
- if (maxEdits !== void 0 && countSentenceEdits(parent, text) > maxEdits * 2) continue;
166
+ const out = [];
167
+ const seen = /* @__PURE__ */ new Set();
168
+ const accept = (payload, label, rationale) => {
169
+ const text = typeof payload === "string" ? payload.trim() : "";
170
+ if (!text || text === parent || seen.has(text)) return;
171
+ if (preserveSections && !validatePreservedSections(text, preserveSections)) return;
172
+ if (maxEdits !== void 0 && countSentenceEdits(parent, text) > maxEdits * 2) return;
189
173
  seen.add(text);
190
- out.push({ surface: text, label: proposal.label, rationale: proposal.rationale });
174
+ out.push({ surface: text, label, rationale });
175
+ };
176
+ const stringParents = (combineParents ? ctx.paretoParents ?? [] : []).filter((p) => typeof p.surface === "string").sort((a, b) => b.composite - a.composite).slice(0, combineMaxParents);
177
+ if (stringParents.length > 1) {
178
+ const combinePrompt = buildCombinePrompt({
179
+ target: opts.target,
180
+ parents: stringParents,
181
+ evidenceK
182
+ });
183
+ const combineResult = await callLlm(
184
+ {
185
+ model: opts.model,
186
+ messages: [
187
+ { role: "system", content: COMBINE_SYSTEM },
188
+ { role: "user", content: combinePrompt }
189
+ ],
190
+ jsonMode: true,
191
+ temperature: opts.temperature ?? 0.7,
192
+ maxTokens: opts.maxTokens ?? 6e3
193
+ },
194
+ opts.llm
195
+ );
196
+ const merged = parseReflectionResponse(combineResult.content, 1)[0];
197
+ if (merged) {
198
+ accept(
199
+ merged.payload,
200
+ merged.label || "pareto-combine",
201
+ merged.rationale || `combined ${stringParents.length} non-dominated parents (gen ${stringParents.map((p) => p.generation).join(",")})`
202
+ );
203
+ }
191
204
  }
192
- return out;
205
+ const reflectCount = Math.max(0, ctx.populationSize - out.length);
206
+ if (reflectCount > 0) {
207
+ const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
208
+ const userPrompt = buildReflectionPrompt({
209
+ target,
210
+ parentPayload: parent,
211
+ topTrials: top,
212
+ bottomTrials: bottom,
213
+ childCount: reflectCount,
214
+ mutationPrimitives: opts.mutationPrimitives
215
+ });
216
+ const result = await callLlm(
217
+ {
218
+ model: opts.model,
219
+ messages: [
220
+ { role: "system", content: REFLECTION_SYSTEM },
221
+ { role: "user", content: userPrompt }
222
+ ],
223
+ jsonMode: true,
224
+ temperature: opts.temperature ?? 0.7,
225
+ maxTokens: opts.maxTokens ?? 6e3
226
+ },
227
+ opts.llm
228
+ );
229
+ for (const proposal of parseReflectionResponse(result.content, reflectCount)) {
230
+ accept(proposal.payload, proposal.label, proposal.rationale);
231
+ }
232
+ }
233
+ return out.slice(0, ctx.populationSize);
193
234
  }
194
235
  };
195
236
  }
237
+ function buildCombinePrompt(args) {
238
+ const lines = [
239
+ `You are merging ${args.parents.length} versions of: ${args.target}.`,
240
+ "",
241
+ "Each version is on the Pareto frontier \u2014 none dominates the others; each",
242
+ "wins on different scenarios. Combine their complementary strengths into",
243
+ "ONE version. Below, each version lists the scenarios it scores highest on.",
244
+ ""
245
+ ];
246
+ args.parents.forEach((p, i) => {
247
+ const tag = String.fromCharCode(65 + i);
248
+ const best = Object.entries(p.objectives).sort((a, b) => b[1] - a[1]).slice(0, args.evidenceK).map(([id, score]) => `${id} (${score.toFixed(2)})`);
249
+ lines.push(
250
+ `### Version ${tag} (mean ${p.composite.toFixed(2)}; strongest on: ${best.join(", ") || "n/a"})`,
251
+ "```",
252
+ p.surface,
253
+ "```",
254
+ ""
255
+ );
256
+ });
257
+ lines.push(
258
+ "Return ONE merged version that would score well on the union of every",
259
+ "version's winning scenarios. Keep each version's specific winning rule;",
260
+ "where two rules conflict, prefer the more general one and note the choice",
261
+ "in your rationale."
262
+ );
263
+ return lines.join("\n");
264
+ }
196
265
  function extractH2Sections(text) {
197
266
  const out = [];
198
267
  for (const line of text.split("\n")) {
@@ -453,9 +522,45 @@ function labelTrustRank(trust) {
453
522
  return LABEL_TRUST_RANK[trust ?? "unverified"];
454
523
  }
455
524
 
456
- // src/campaign/presets/run-eval.ts
457
- async function runEval(opts) {
458
- return runCampaign(opts);
525
+ // src/campaign/score-utils.ts
526
+ function campaignMeanComposite(campaign) {
527
+ const composites = [];
528
+ for (const cell of campaign.cells) {
529
+ const cellComposites = Object.values(cell.judgeScores).map((s) => s.composite);
530
+ if (cellComposites.length > 0) {
531
+ composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length);
532
+ }
533
+ }
534
+ return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
535
+ }
536
+ function campaignBreakdown(campaign) {
537
+ const dimSums = {};
538
+ const dimCounts = {};
539
+ const byScenario = /* @__PURE__ */ new Map();
540
+ for (const cell of campaign.cells) {
541
+ const judgeScores = Object.values(cell.judgeScores);
542
+ if (judgeScores.length === 0) continue;
543
+ const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length;
544
+ const arr = byScenario.get(cell.scenarioId) ?? [];
545
+ arr.push(cellComposite);
546
+ byScenario.set(cell.scenarioId, arr);
547
+ for (const score of judgeScores) {
548
+ for (const [key, value] of Object.entries(score.dimensions)) {
549
+ dimSums[key] = (dimSums[key] ?? 0) + value;
550
+ dimCounts[key] = (dimCounts[key] ?? 0) + 1;
551
+ }
552
+ }
553
+ }
554
+ const dimensions = {};
555
+ for (const key of Object.keys(dimSums)) {
556
+ const count = dimCounts[key] ?? 0;
557
+ dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
558
+ }
559
+ const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
560
+ scenarioId,
561
+ composite: comps.reduce((a, b) => a + b, 0) / comps.length
562
+ }));
563
+ return { dimensions, scenarios };
459
564
  }
460
565
 
461
566
  // src/campaign/presets/run-optimization.ts
@@ -472,11 +577,15 @@ async function runOptimization(opts) {
472
577
  let currentSurfaces = [opts.baselineSurface];
473
578
  let winnerSurface = opts.baselineSurface;
474
579
  let winnerSurfaceHash = surfaceHash(opts.baselineSurface);
475
- let winnerComposite = meanComposite2(baselineCampaign);
580
+ let winnerComposite = campaignMeanComposite(baselineCampaign);
476
581
  let winnerLabel;
477
582
  let winnerRationale;
583
+ const scored = [
584
+ toParetoParent(opts.baselineSurface, winnerSurfaceHash, baselineCampaign, -1)
585
+ ];
478
586
  for (let gen = 0; gen < opts.maxGenerations; gen++) {
479
587
  if (opts.driver.decide?.({ history }).stop) break;
588
+ const paretoParents = computeParetoFrontier(scored);
480
589
  const proposed = await opts.driver.propose({
481
590
  currentSurface: currentSurfaces[0] ?? opts.baselineSurface,
482
591
  history,
@@ -486,7 +595,8 @@ async function runOptimization(opts) {
486
595
  signal: new AbortController().signal,
487
596
  report: opts.report,
488
597
  dataset: opts.labeledStore && opts.labeledStore !== "off" ? opts.labeledStore : void 0,
489
- maxImprovementShots: opts.maxImprovementShots
598
+ maxImprovementShots: opts.maxImprovementShots,
599
+ paretoParents
490
600
  });
491
601
  const candidates = proposed.map(
492
602
  (p) => isProposedCandidate(p) ? p : { surface: p, label: "", rationale: "" }
@@ -500,8 +610,11 @@ async function runOptimization(opts) {
500
610
  dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
501
611
  runDir: `${opts.runDir}/gen-${gen}/candidate-${i}`
502
612
  });
503
- const composite = meanComposite2(campaign);
613
+ const composite = campaignMeanComposite(campaign);
504
614
  surfaceResults.push({ surfaceHash: hash, surface, label, rationale, campaign, composite });
615
+ scored.push(
616
+ toParetoParent(surface, hash, campaign, gen, label || void 0, rationale || void 0)
617
+ );
505
618
  }
506
619
  surfaceResults.sort((a, b) => b.composite - a.composite);
507
620
  const promoted = surfaceResults.slice(0, promoteTopK);
@@ -517,7 +630,7 @@ async function runOptimization(opts) {
517
630
  const record = {
518
631
  generationIndex: gen,
519
632
  candidates: surfaceResults.map((s) => {
520
- const breakdown = candidateBreakdown(s.campaign);
633
+ const breakdown = campaignBreakdown(s.campaign);
521
634
  const candidate = {
522
635
  surfaceHash: s.surfaceHash,
523
636
  composite: s.composite,
@@ -547,8 +660,49 @@ async function runOptimization(opts) {
547
660
  winnerSurfaceHash,
548
661
  winnerLabel,
549
662
  winnerRationale,
550
- baselineCampaign
663
+ baselineCampaign,
664
+ paretoFrontier: computeParetoFrontier(scored)
665
+ };
666
+ }
667
+ function toParetoParent(surface, hash, campaign, generation, label, rationale) {
668
+ const objectives = {};
669
+ for (const { scenarioId, composite } of campaignBreakdown(campaign).scenarios) {
670
+ objectives[scenarioId] = composite;
671
+ }
672
+ const parent = {
673
+ surface,
674
+ surfaceHash: hash,
675
+ objectives,
676
+ composite: campaignMeanComposite(campaign),
677
+ generation
551
678
  };
679
+ if (label) parent.label = label;
680
+ if (rationale) parent.rationale = rationale;
681
+ return parent;
682
+ }
683
+ function computeParetoFrontier(scored) {
684
+ if (scored.length <= 1) return [...scored];
685
+ const ids = /* @__PURE__ */ new Set();
686
+ for (const p of scored) for (const id of Object.keys(p.objectives)) ids.add(id);
687
+ if (ids.size === 0) return [...scored];
688
+ const floor = {};
689
+ for (const id of ids) {
690
+ let min = Number.POSITIVE_INFINITY;
691
+ for (const p of scored) {
692
+ const v = p.objectives[id];
693
+ if (typeof v === "number" && Number.isFinite(v) && v < min) min = v;
694
+ }
695
+ floor[id] = Number.isFinite(min) ? min : 0;
696
+ }
697
+ const objectives = [...ids].map((id) => ({
698
+ name: id,
699
+ direction: "maximize",
700
+ value: (p) => {
701
+ const v = p.objectives[id];
702
+ return typeof v === "number" && Number.isFinite(v) ? v : floor[id] ?? 0;
703
+ }
704
+ }));
705
+ return paretoFrontier(scored, objectives).frontier;
552
706
  }
553
707
  function surfaceHash(surface) {
554
708
  const material = typeof surface === "string" ? surface : JSON.stringify({
@@ -558,45 +712,6 @@ function surfaceHash(surface) {
558
712
  });
559
713
  return createHash("sha256").update(material).digest("hex").slice(0, 16);
560
714
  }
561
- function meanComposite2(campaign) {
562
- const composites = [];
563
- for (const cell of campaign.cells) {
564
- const cellComposites = Object.values(cell.judgeScores).map((s) => s.composite);
565
- if (cellComposites.length > 0) {
566
- composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length);
567
- }
568
- }
569
- return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
570
- }
571
- function candidateBreakdown(campaign) {
572
- const dimSums = {};
573
- const dimCounts = {};
574
- const byScenario = /* @__PURE__ */ new Map();
575
- for (const cell of campaign.cells) {
576
- const judgeScores = Object.values(cell.judgeScores);
577
- if (judgeScores.length === 0) continue;
578
- const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length;
579
- const arr = byScenario.get(cell.scenarioId) ?? [];
580
- arr.push(cellComposite);
581
- byScenario.set(cell.scenarioId, arr);
582
- for (const score of judgeScores) {
583
- for (const [key, value] of Object.entries(score.dimensions)) {
584
- dimSums[key] = (dimSums[key] ?? 0) + value;
585
- dimCounts[key] = (dimCounts[key] ?? 0) + 1;
586
- }
587
- }
588
- }
589
- const dimensions = {};
590
- for (const key of Object.keys(dimSums)) {
591
- const count = dimCounts[key] ?? 0;
592
- dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
593
- }
594
- const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
595
- scenarioId,
596
- composite: comps.reduce((a, b) => a + b, 0) / comps.length
597
- }));
598
- return { dimensions, scenarios };
599
- }
600
715
 
601
716
  // src/campaign/presets/run-improvement-loop.ts
602
717
  async function runImprovementLoop(opts) {
@@ -689,6 +804,11 @@ ${fmt(winnerSurface)}`;
689
804
  return lines.join("\n");
690
805
  }
691
806
 
807
+ // src/campaign/presets/run-eval.ts
808
+ async function runEval(opts) {
809
+ return runCampaign(opts);
810
+ }
811
+
692
812
  // src/campaign/provenance.ts
693
813
  import { createHash as createHash2 } from "crypto";
694
814
  import { join as join2 } from "path";
@@ -913,11 +1033,13 @@ export {
913
1033
  heldOutGate,
914
1034
  isProposedCandidate,
915
1035
  labelTrustRank,
916
- runEval,
1036
+ campaignMeanComposite,
1037
+ campaignBreakdown,
917
1038
  runOptimization,
918
1039
  surfaceHash,
919
1040
  runImprovementLoop,
920
1041
  defaultRenderDiff,
1042
+ runEval,
921
1043
  surfaceContentHash,
922
1044
  buildLoopProvenanceRecord,
923
1045
  loopProvenanceSpans,
@@ -925,4 +1047,4 @@ export {
925
1047
  provenanceSpansPath,
926
1048
  emitLoopProvenance
927
1049
  };
928
- //# sourceMappingURL=chunk-CV2BS2OV.js.map
1050
+ //# sourceMappingURL=chunk-Z7ZU7IYZ.js.map