@tangle-network/agent-eval 0.61.0 → 0.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/CHANGELOG.md +48 -8
  2. package/dist/adapters/http.d.ts +4 -1
  3. package/dist/adapters/langchain.d.ts +4 -1
  4. package/dist/adapters/otel.d.ts +4 -4
  5. package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
  6. package/dist/benchmarks/index.d.ts +2 -2
  7. package/dist/campaign/index.d.ts +388 -11
  8. package/dist/campaign/index.js +597 -12
  9. package/dist/campaign/index.js.map +1 -1
  10. package/dist/{chunk-GMXHLSLL.js → chunk-4ODZXQV2.js} +81 -98
  11. package/dist/chunk-4ODZXQV2.js.map +1 -0
  12. package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
  13. package/dist/chunk-7TPYV2ER.js.map +1 -0
  14. package/dist/chunk-E22YUOAL.js +111 -0
  15. package/dist/chunk-E22YUOAL.js.map +1 -0
  16. package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} +209 -85
  17. package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
  18. package/dist/contract/index.d.ts +9 -9
  19. package/dist/contract/index.js +4 -3
  20. package/dist/contract/index.js.map +1 -1
  21. package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
  22. package/dist/control.d.ts +2 -2
  23. package/dist/hosted/index.d.ts +4 -4
  24. package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
  25. package/dist/{index-D9dwa00f.d.ts → index-GISRh500.d.ts} +2 -2
  26. package/dist/index.d.ts +98 -14
  27. package/dist/index.js +331 -128
  28. package/dist/index.js.map +1 -1
  29. package/dist/meta-eval/index.d.ts +2 -2
  30. package/dist/multishot/index.js.map +1 -1
  31. package/dist/openapi.json +1 -1
  32. package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} +42 -11
  33. package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
  34. package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
  35. package/dist/reporting.d.ts +4 -4
  36. package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
  37. package/dist/rl.d.ts +6 -6
  38. package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
  39. package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
  40. package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
  41. package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
  42. package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} +45 -12
  43. package/package.json +1 -1
  44. package/dist/chunk-GMXHLSLL.js.map +0 -1
  45. package/dist/chunk-OLULBECP.js.map +0 -1
  46. package/dist/chunk-SUGME4OT.js.map +0 -1
  47. /package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0
@@ -1,13 +1,16 @@
1
1
  import {
2
2
  runCampaign
3
- } from "./chunk-OLULBECP.js";
3
+ } from "./chunk-7TPYV2ER.js";
4
4
  import {
5
5
  buildReflectionPrompt,
6
+ paretoFrontier,
6
7
  parseReflectionResponse,
7
8
  runCanaries,
8
- scoreRedTeamOutput,
9
+ scoreRedTeamOutput
10
+ } from "./chunk-4ODZXQV2.js";
11
+ import {
9
12
  summarizeBackendIntegrity
10
- } from "./chunk-GMXHLSLL.js";
13
+ } from "./chunk-E22YUOAL.js";
11
14
  import {
12
15
  detectRewardHacking
13
16
  } from "./chunk-YV7J7X5N.js";
@@ -145,52 +148,120 @@ function evolutionaryDriver(opts) {
145
148
 
146
149
  // src/campaign/drivers/gepa.ts
147
150
  var REFLECTION_SYSTEM = 'You are an expert prompt engineer. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} where each `payload` is the FULL improved surface text. No prose outside the JSON.';
151
+ var COMBINE_SYSTEM = 'You are an expert prompt engineer performing a GEPA "combine complementary lessons" merge. You are given several non-dominated versions of one surface; each is uniquely best on different scenarios. Produce ONE new version that keeps what makes each version strong on its winning scenarios and resolves conflicts in favor of the more general rule. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} with exactly one proposal whose `payload` is the FULL merged surface text. No prose outside the JSON.';
148
152
  function gepaDriver(opts) {
149
153
  const evidenceK = opts.evidenceK ?? 3;
154
+ const combineParents = opts.combineParents ?? true;
155
+ const combineMaxParents = opts.combineMaxParents ?? 4;
156
+ if (combineParents && combineMaxParents < 1) {
157
+ throw new Error("gepaDriver: combineMaxParents must be >= 1 when combineParents is enabled");
158
+ }
150
159
  return {
151
160
  kind: "gepa",
152
161
  async propose(ctx) {
153
162
  const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
154
- const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
155
- const userPrompt = buildReflectionPrompt({
156
- target,
157
- parentPayload: parent,
158
- topTrials: top,
159
- bottomTrials: bottom,
160
- childCount: ctx.populationSize,
161
- mutationPrimitives: opts.mutationPrimitives
162
- });
163
- const result = await callLlm(
164
- {
165
- model: opts.model,
166
- messages: [
167
- { role: "system", content: REFLECTION_SYSTEM },
168
- { role: "user", content: userPrompt }
169
- ],
170
- jsonMode: true,
171
- temperature: opts.temperature ?? 0.7,
172
- maxTokens: opts.maxTokens ?? 6e3
173
- },
174
- opts.llm
175
- );
176
- const proposals = parseReflectionResponse(result.content, ctx.populationSize);
177
- const out = [];
178
- const seen = /* @__PURE__ */ new Set();
179
163
  const constraints = opts.constraints;
180
164
  const preserveSections = constraints?.preserveSections !== void 0 ? constraints.preserveSections.length === 0 ? extractH2Sections(parent) : constraints.preserveSections : null;
181
165
  const maxEdits = constraints?.maxSentenceEdits;
182
- for (const proposal of proposals) {
183
- const text = typeof proposal.payload === "string" ? proposal.payload.trim() : "";
184
- if (!text || text === parent || seen.has(text)) continue;
185
- if (preserveSections && !validatePreservedSections(text, preserveSections)) continue;
186
- if (maxEdits !== void 0 && countSentenceEdits(parent, text) > maxEdits * 2) continue;
166
+ const out = [];
167
+ const seen = /* @__PURE__ */ new Set();
168
+ const accept = (payload, label, rationale) => {
169
+ const text = typeof payload === "string" ? payload.trim() : "";
170
+ if (!text || text === parent || seen.has(text)) return;
171
+ if (preserveSections && !validatePreservedSections(text, preserveSections)) return;
172
+ if (maxEdits !== void 0 && countSentenceEdits(parent, text) > maxEdits * 2) return;
187
173
  seen.add(text);
188
- out.push({ surface: text, label: proposal.label, rationale: proposal.rationale });
174
+ out.push({ surface: text, label, rationale });
175
+ };
176
+ const stringParents = (combineParents ? ctx.paretoParents ?? [] : []).filter((p) => typeof p.surface === "string").sort((a, b) => b.composite - a.composite).slice(0, combineMaxParents);
177
+ if (stringParents.length > 1) {
178
+ const combinePrompt = buildCombinePrompt({
179
+ target: opts.target,
180
+ parents: stringParents,
181
+ evidenceK
182
+ });
183
+ const combineResult = await callLlm(
184
+ {
185
+ model: opts.model,
186
+ messages: [
187
+ { role: "system", content: COMBINE_SYSTEM },
188
+ { role: "user", content: combinePrompt }
189
+ ],
190
+ jsonMode: true,
191
+ temperature: opts.temperature ?? 0.7,
192
+ maxTokens: opts.maxTokens ?? 6e3
193
+ },
194
+ opts.llm
195
+ );
196
+ const merged = parseReflectionResponse(combineResult.content, 1)[0];
197
+ if (merged) {
198
+ accept(
199
+ merged.payload,
200
+ merged.label || "pareto-combine",
201
+ merged.rationale || `combined ${stringParents.length} non-dominated parents (gen ${stringParents.map((p) => p.generation).join(",")})`
202
+ );
203
+ }
189
204
  }
190
- return out;
205
+ const reflectCount = Math.max(0, ctx.populationSize - out.length);
206
+ if (reflectCount > 0) {
207
+ const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
208
+ const userPrompt = buildReflectionPrompt({
209
+ target,
210
+ parentPayload: parent,
211
+ topTrials: top,
212
+ bottomTrials: bottom,
213
+ childCount: reflectCount,
214
+ mutationPrimitives: opts.mutationPrimitives
215
+ });
216
+ const result = await callLlm(
217
+ {
218
+ model: opts.model,
219
+ messages: [
220
+ { role: "system", content: REFLECTION_SYSTEM },
221
+ { role: "user", content: userPrompt }
222
+ ],
223
+ jsonMode: true,
224
+ temperature: opts.temperature ?? 0.7,
225
+ maxTokens: opts.maxTokens ?? 6e3
226
+ },
227
+ opts.llm
228
+ );
229
+ for (const proposal of parseReflectionResponse(result.content, reflectCount)) {
230
+ accept(proposal.payload, proposal.label, proposal.rationale);
231
+ }
232
+ }
233
+ return out.slice(0, ctx.populationSize);
191
234
  }
192
235
  };
193
236
  }
237
+ function buildCombinePrompt(args) {
238
+ const lines = [
239
+ `You are merging ${args.parents.length} versions of: ${args.target}.`,
240
+ "",
241
+ "Each version is on the Pareto frontier \u2014 none dominates the others; each",
242
+ "wins on different scenarios. Combine their complementary strengths into",
243
+ "ONE version. Below, each version lists the scenarios it scores highest on.",
244
+ ""
245
+ ];
246
+ args.parents.forEach((p, i) => {
247
+ const tag = String.fromCharCode(65 + i);
248
+ const best = Object.entries(p.objectives).sort((a, b) => b[1] - a[1]).slice(0, args.evidenceK).map(([id, score]) => `${id} (${score.toFixed(2)})`);
249
+ lines.push(
250
+ `### Version ${tag} (mean ${p.composite.toFixed(2)}; strongest on: ${best.join(", ") || "n/a"})`,
251
+ "```",
252
+ p.surface,
253
+ "```",
254
+ ""
255
+ );
256
+ });
257
+ lines.push(
258
+ "Return ONE merged version that would score well on the union of every",
259
+ "version's winning scenarios. Keep each version's specific winning rule;",
260
+ "where two rules conflict, prefer the more general one and note the choice",
261
+ "in your rationale."
262
+ );
263
+ return lines.join("\n");
264
+ }
194
265
  function extractH2Sections(text) {
195
266
  const out = [];
196
267
  for (const line of text.split("\n")) {
@@ -451,9 +522,45 @@ function labelTrustRank(trust) {
451
522
  return LABEL_TRUST_RANK[trust ?? "unverified"];
452
523
  }
453
524
 
454
- // src/campaign/presets/run-eval.ts
455
- async function runEval(opts) {
456
- return runCampaign(opts);
525
+ // src/campaign/score-utils.ts
526
+ function campaignMeanComposite(campaign) {
527
+ const composites = [];
528
+ for (const cell of campaign.cells) {
529
+ const cellComposites = Object.values(cell.judgeScores).map((s) => s.composite);
530
+ if (cellComposites.length > 0) {
531
+ composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length);
532
+ }
533
+ }
534
+ return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
535
+ }
536
+ function campaignBreakdown(campaign) {
537
+ const dimSums = {};
538
+ const dimCounts = {};
539
+ const byScenario = /* @__PURE__ */ new Map();
540
+ for (const cell of campaign.cells) {
541
+ const judgeScores = Object.values(cell.judgeScores);
542
+ if (judgeScores.length === 0) continue;
543
+ const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length;
544
+ const arr = byScenario.get(cell.scenarioId) ?? [];
545
+ arr.push(cellComposite);
546
+ byScenario.set(cell.scenarioId, arr);
547
+ for (const score of judgeScores) {
548
+ for (const [key, value] of Object.entries(score.dimensions)) {
549
+ dimSums[key] = (dimSums[key] ?? 0) + value;
550
+ dimCounts[key] = (dimCounts[key] ?? 0) + 1;
551
+ }
552
+ }
553
+ }
554
+ const dimensions = {};
555
+ for (const key of Object.keys(dimSums)) {
556
+ const count = dimCounts[key] ?? 0;
557
+ dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
558
+ }
559
+ const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
560
+ scenarioId,
561
+ composite: comps.reduce((a, b) => a + b, 0) / comps.length
562
+ }));
563
+ return { dimensions, scenarios };
457
564
  }
458
565
 
459
566
  // src/campaign/presets/run-optimization.ts
@@ -470,11 +577,15 @@ async function runOptimization(opts) {
470
577
  let currentSurfaces = [opts.baselineSurface];
471
578
  let winnerSurface = opts.baselineSurface;
472
579
  let winnerSurfaceHash = surfaceHash(opts.baselineSurface);
473
- let winnerComposite = meanComposite2(baselineCampaign);
580
+ let winnerComposite = campaignMeanComposite(baselineCampaign);
474
581
  let winnerLabel;
475
582
  let winnerRationale;
583
+ const scored = [
584
+ toParetoParent(opts.baselineSurface, winnerSurfaceHash, baselineCampaign, -1)
585
+ ];
476
586
  for (let gen = 0; gen < opts.maxGenerations; gen++) {
477
587
  if (opts.driver.decide?.({ history }).stop) break;
588
+ const paretoParents = computeParetoFrontier(scored);
478
589
  const proposed = await opts.driver.propose({
479
590
  currentSurface: currentSurfaces[0] ?? opts.baselineSurface,
480
591
  history,
@@ -484,7 +595,8 @@ async function runOptimization(opts) {
484
595
  signal: new AbortController().signal,
485
596
  report: opts.report,
486
597
  dataset: opts.labeledStore && opts.labeledStore !== "off" ? opts.labeledStore : void 0,
487
- maxImprovementShots: opts.maxImprovementShots
598
+ maxImprovementShots: opts.maxImprovementShots,
599
+ paretoParents
488
600
  });
489
601
  const candidates = proposed.map(
490
602
  (p) => isProposedCandidate(p) ? p : { surface: p, label: "", rationale: "" }
@@ -498,8 +610,11 @@ async function runOptimization(opts) {
498
610
  dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
499
611
  runDir: `${opts.runDir}/gen-${gen}/candidate-${i}`
500
612
  });
501
- const composite = meanComposite2(campaign);
613
+ const composite = campaignMeanComposite(campaign);
502
614
  surfaceResults.push({ surfaceHash: hash, surface, label, rationale, campaign, composite });
615
+ scored.push(
616
+ toParetoParent(surface, hash, campaign, gen, label || void 0, rationale || void 0)
617
+ );
503
618
  }
504
619
  surfaceResults.sort((a, b) => b.composite - a.composite);
505
620
  const promoted = surfaceResults.slice(0, promoteTopK);
@@ -515,7 +630,7 @@ async function runOptimization(opts) {
515
630
  const record = {
516
631
  generationIndex: gen,
517
632
  candidates: surfaceResults.map((s) => {
518
- const breakdown = candidateBreakdown(s.campaign);
633
+ const breakdown = campaignBreakdown(s.campaign);
519
634
  const candidate = {
520
635
  surfaceHash: s.surfaceHash,
521
636
  composite: s.composite,
@@ -545,8 +660,49 @@ async function runOptimization(opts) {
545
660
  winnerSurfaceHash,
546
661
  winnerLabel,
547
662
  winnerRationale,
548
- baselineCampaign
663
+ baselineCampaign,
664
+ paretoFrontier: computeParetoFrontier(scored)
665
+ };
666
+ }
667
+ function toParetoParent(surface, hash, campaign, generation, label, rationale) {
668
+ const objectives = {};
669
+ for (const { scenarioId, composite } of campaignBreakdown(campaign).scenarios) {
670
+ objectives[scenarioId] = composite;
671
+ }
672
+ const parent = {
673
+ surface,
674
+ surfaceHash: hash,
675
+ objectives,
676
+ composite: campaignMeanComposite(campaign),
677
+ generation
549
678
  };
679
+ if (label) parent.label = label;
680
+ if (rationale) parent.rationale = rationale;
681
+ return parent;
682
+ }
683
+ function computeParetoFrontier(scored) {
684
+ if (scored.length <= 1) return [...scored];
685
+ const ids = /* @__PURE__ */ new Set();
686
+ for (const p of scored) for (const id of Object.keys(p.objectives)) ids.add(id);
687
+ if (ids.size === 0) return [...scored];
688
+ const floor = {};
689
+ for (const id of ids) {
690
+ let min = Number.POSITIVE_INFINITY;
691
+ for (const p of scored) {
692
+ const v = p.objectives[id];
693
+ if (typeof v === "number" && Number.isFinite(v) && v < min) min = v;
694
+ }
695
+ floor[id] = Number.isFinite(min) ? min : 0;
696
+ }
697
+ const objectives = [...ids].map((id) => ({
698
+ name: id,
699
+ direction: "maximize",
700
+ value: (p) => {
701
+ const v = p.objectives[id];
702
+ return typeof v === "number" && Number.isFinite(v) ? v : floor[id] ?? 0;
703
+ }
704
+ }));
705
+ return paretoFrontier(scored, objectives).frontier;
550
706
  }
551
707
  function surfaceHash(surface) {
552
708
  const material = typeof surface === "string" ? surface : JSON.stringify({
@@ -556,45 +712,6 @@ function surfaceHash(surface) {
556
712
  });
557
713
  return createHash("sha256").update(material).digest("hex").slice(0, 16);
558
714
  }
559
- function meanComposite2(campaign) {
560
- const composites = [];
561
- for (const cell of campaign.cells) {
562
- const cellComposites = Object.values(cell.judgeScores).map((s) => s.composite);
563
- if (cellComposites.length > 0) {
564
- composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length);
565
- }
566
- }
567
- return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
568
- }
569
- function candidateBreakdown(campaign) {
570
- const dimSums = {};
571
- const dimCounts = {};
572
- const byScenario = /* @__PURE__ */ new Map();
573
- for (const cell of campaign.cells) {
574
- const judgeScores = Object.values(cell.judgeScores);
575
- if (judgeScores.length === 0) continue;
576
- const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length;
577
- const arr = byScenario.get(cell.scenarioId) ?? [];
578
- arr.push(cellComposite);
579
- byScenario.set(cell.scenarioId, arr);
580
- for (const score of judgeScores) {
581
- for (const [key, value] of Object.entries(score.dimensions)) {
582
- dimSums[key] = (dimSums[key] ?? 0) + value;
583
- dimCounts[key] = (dimCounts[key] ?? 0) + 1;
584
- }
585
- }
586
- }
587
- const dimensions = {};
588
- for (const key of Object.keys(dimSums)) {
589
- const count = dimCounts[key] ?? 0;
590
- dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
591
- }
592
- const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
593
- scenarioId,
594
- composite: comps.reduce((a, b) => a + b, 0) / comps.length
595
- }));
596
- return { dimensions, scenarios };
597
- }
598
715
 
599
716
  // src/campaign/presets/run-improvement-loop.ts
600
717
  async function runImprovementLoop(opts) {
@@ -612,7 +729,7 @@ async function runImprovementLoop(opts) {
612
729
  throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
613
730
  }
614
731
  const optimization = await runOptimization(opts);
615
- const { runCampaign: runCampaign2 } = await import("./run-campaign-HXPJAUZ3.js");
732
+ const { runCampaign: runCampaign2 } = await import("./run-campaign-5J3ED2UJ.js");
616
733
  const baselineOnHoldout = await runCampaign2({
617
734
  ...opts,
618
735
  scenarios: opts.holdoutScenarios,
@@ -687,6 +804,11 @@ ${fmt(winnerSurface)}`;
687
804
  return lines.join("\n");
688
805
  }
689
806
 
807
+ // src/campaign/presets/run-eval.ts
808
+ async function runEval(opts) {
809
+ return runCampaign(opts);
810
+ }
811
+
690
812
  // src/campaign/provenance.ts
691
813
  import { createHash as createHash2 } from "crypto";
692
814
  import { join as join2 } from "path";
@@ -911,11 +1033,13 @@ export {
911
1033
  heldOutGate,
912
1034
  isProposedCandidate,
913
1035
  labelTrustRank,
914
- runEval,
1036
+ campaignMeanComposite,
1037
+ campaignBreakdown,
915
1038
  runOptimization,
916
1039
  surfaceHash,
917
1040
  runImprovementLoop,
918
1041
  defaultRenderDiff,
1042
+ runEval,
919
1043
  surfaceContentHash,
920
1044
  buildLoopProvenanceRecord,
921
1045
  loopProvenanceSpans,
@@ -923,4 +1047,4 @@ export {
923
1047
  provenanceSpansPath,
924
1048
  emitLoopProvenance
925
1049
  };
926
- //# sourceMappingURL=chunk-SUGME4OT.js.map
1050
+ //# sourceMappingURL=chunk-Z7ZU7IYZ.js.map