npm - @tangle-network/agent-eval - Versions diffs - 0.61.0 → 0.63.0 - Mend

@tangle-network/agent-eval 0.61.0 → 0.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

package/CHANGELOG.md +48 -8
package/dist/adapters/http.d.ts +4 -1
package/dist/adapters/langchain.d.ts +4 -1
package/dist/adapters/otel.d.ts +4 -4
package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
package/dist/benchmarks/index.d.ts +2 -2
package/dist/campaign/index.d.ts +388 -11
package/dist/campaign/index.js +597 -12
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-GMXHLSLL.js → chunk-4ODZXQV2.js} +81 -98
package/dist/chunk-4ODZXQV2.js.map +1 -0
package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
package/dist/chunk-7TPYV2ER.js.map +1 -0
package/dist/chunk-E22YUOAL.js +111 -0
package/dist/chunk-E22YUOAL.js.map +1 -0
package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} +209 -85
package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
package/dist/contract/index.d.ts +9 -9
package/dist/contract/index.js +4 -3
package/dist/contract/index.js.map +1 -1
package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/hosted/index.d.ts +4 -4
package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
package/dist/{index-D9dwa00f.d.ts → index-GISRh500.d.ts} +2 -2
package/dist/index.d.ts +98 -14
package/dist/index.js +331 -128
package/dist/index.js.map +1 -1
package/dist/meta-eval/index.d.ts +2 -2
package/dist/multishot/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} +42 -11
package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
package/dist/reporting.d.ts +4 -4
package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
package/dist/rl.d.ts +6 -6
package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} +45 -12
package/package.json +1 -1
package/dist/chunk-GMXHLSLL.js.map +0 -1
package/dist/chunk-OLULBECP.js.map +0 -1
package/dist/chunk-SUGME4OT.js.map +0 -1
package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0

package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} RENAMED

@@ -1,13 +1,16 @@
 import {
   runCampaign
-} from "./chunk-OLULBECP.js";
+} from "./chunk-7TPYV2ER.js";
 import {
   buildReflectionPrompt,
+  paretoFrontier,
   parseReflectionResponse,
   runCanaries,
-  scoreRedTeamOutput,
+  scoreRedTeamOutput
+} from "./chunk-4ODZXQV2.js";
+import {
   summarizeBackendIntegrity
-} from "./chunk-GMXHLSLL.js";
+} from "./chunk-E22YUOAL.js";
 import {
   detectRewardHacking
 } from "./chunk-YV7J7X5N.js";
@@ -145,52 +148,120 @@ function evolutionaryDriver(opts) {
 // src/campaign/drivers/gepa.ts
 var REFLECTION_SYSTEM = 'You are an expert prompt engineer. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} where each `payload` is the FULL improved surface text. No prose outside the JSON.';
+var COMBINE_SYSTEM = 'You are an expert prompt engineer performing a GEPA "combine complementary lessons" merge. You are given several non-dominated versions of one surface; each is uniquely best on different scenarios. Produce ONE new version that keeps what makes each version strong on its winning scenarios and resolves conflicts in favor of the more general rule. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} with exactly one proposal whose `payload` is the FULL merged surface text. No prose outside the JSON.';
 function gepaDriver(opts) {
   const evidenceK = opts.evidenceK ?? 3;
+  const combineParents = opts.combineParents ?? true;
+  const combineMaxParents = opts.combineMaxParents ?? 4;
+  if (combineParents && combineMaxParents < 1) {
+    throw new Error("gepaDriver: combineMaxParents must be >= 1 when combineParents is enabled");
+  }
   return {
     kind: "gepa",
     async propose(ctx) {
       const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
-      const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
-      const userPrompt = buildReflectionPrompt({
-        target,
-        parentPayload: parent,
-        topTrials: top,
-        bottomTrials: bottom,
-        childCount: ctx.populationSize,
-        mutationPrimitives: opts.mutationPrimitives
-      });
-      const result = await callLlm(
-        {
-          model: opts.model,
-          messages: [
-            { role: "system", content: REFLECTION_SYSTEM },
-            { role: "user", content: userPrompt }
-          ],
-          jsonMode: true,
-          temperature: opts.temperature ?? 0.7,
-          maxTokens: opts.maxTokens ?? 6e3
-        },
-        opts.llm
-      );
-      const proposals = parseReflectionResponse(result.content, ctx.populationSize);
-      const out = [];
-      const seen = /* @__PURE__ */ new Set();
       const constraints = opts.constraints;
       const preserveSections = constraints?.preserveSections !== void 0 ? constraints.preserveSections.length === 0 ? extractH2Sections(parent) : constraints.preserveSections : null;
       const maxEdits = constraints?.maxSentenceEdits;
-      for (const proposal of proposals) {
-        const text = typeof proposal.payload === "string" ? proposal.payload.trim() : "";
-        if (!text || text === parent || seen.has(text)) continue;
-        if (preserveSections && !validatePreservedSections(text, preserveSections)) continue;
-        if (maxEdits !== void 0 && countSentenceEdits(parent, text) > maxEdits * 2) continue;
+      const out = [];
+      const seen = /* @__PURE__ */ new Set();
+      const accept = (payload, label, rationale) => {
+        const text = typeof payload === "string" ? payload.trim() : "";
+        if (!text || text === parent || seen.has(text)) return;
+        if (preserveSections && !validatePreservedSections(text, preserveSections)) return;
+        if (maxEdits !== void 0 && countSentenceEdits(parent, text) > maxEdits * 2) return;
         seen.add(text);
-        out.push({ surface: text, label: proposal.label, rationale: proposal.rationale });
+        out.push({ surface: text, label, rationale });
+      };
+      const stringParents = (combineParents ? ctx.paretoParents ?? [] : []).filter((p) => typeof p.surface === "string").sort((a, b) => b.composite - a.composite).slice(0, combineMaxParents);
+      if (stringParents.length > 1) {
+        const combinePrompt = buildCombinePrompt({
+          target: opts.target,
+          parents: stringParents,
+          evidenceK
+        });
+        const combineResult = await callLlm(
+          {
+            model: opts.model,
+            messages: [
+              { role: "system", content: COMBINE_SYSTEM },
+              { role: "user", content: combinePrompt }
+            ],
+            jsonMode: true,
+            temperature: opts.temperature ?? 0.7,
+            maxTokens: opts.maxTokens ?? 6e3
+          },
+          opts.llm
+        );
+        const merged = parseReflectionResponse(combineResult.content, 1)[0];
+        if (merged) {
+          accept(
+            merged.payload,
+            merged.label || "pareto-combine",
+            merged.rationale || `combined ${stringParents.length} non-dominated parents (gen ${stringParents.map((p) => p.generation).join(",")})`
+          );
+        }
       }
-      return out;
+      const reflectCount = Math.max(0, ctx.populationSize - out.length);
+      if (reflectCount > 0) {
+        const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
+        const userPrompt = buildReflectionPrompt({
+          target,
+          parentPayload: parent,
+          topTrials: top,
+          bottomTrials: bottom,
+          childCount: reflectCount,
+          mutationPrimitives: opts.mutationPrimitives
+        });
+        const result = await callLlm(
+          {
+            model: opts.model,
+            messages: [
+              { role: "system", content: REFLECTION_SYSTEM },
+              { role: "user", content: userPrompt }
+            ],
+            jsonMode: true,
+            temperature: opts.temperature ?? 0.7,
+            maxTokens: opts.maxTokens ?? 6e3
+          },
+          opts.llm
+        );
+        for (const proposal of parseReflectionResponse(result.content, reflectCount)) {
+          accept(proposal.payload, proposal.label, proposal.rationale);
+        }
+      }
+      return out.slice(0, ctx.populationSize);
     }
   };
 }
+function buildCombinePrompt(args) {
+  const lines = [
+    `You are merging ${args.parents.length} versions of: ${args.target}.`,
+    "",
+    "Each version is on the Pareto frontier \u2014 none dominates the others; each",
+    "wins on different scenarios. Combine their complementary strengths into",
+    "ONE version. Below, each version lists the scenarios it scores highest on.",
+    ""
+  ];
+  args.parents.forEach((p, i) => {
+    const tag = String.fromCharCode(65 + i);
+    const best = Object.entries(p.objectives).sort((a, b) => b[1] - a[1]).slice(0, args.evidenceK).map(([id, score]) => `${id} (${score.toFixed(2)})`);
+    lines.push(
+      `### Version ${tag} (mean ${p.composite.toFixed(2)}; strongest on: ${best.join(", ") || "n/a"})`,
+      "```",
+      p.surface,
+      "```",
+      ""
+    );
+  });
+  lines.push(
+    "Return ONE merged version that would score well on the union of every",
+    "version's winning scenarios. Keep each version's specific winning rule;",
+    "where two rules conflict, prefer the more general one and note the choice",
+    "in your rationale."
+  );
+  return lines.join("\n");
+}
 function extractH2Sections(text) {
   const out = [];
   for (const line of text.split("\n")) {
@@ -451,9 +522,45 @@ function labelTrustRank(trust) {
   return LABEL_TRUST_RANK[trust ?? "unverified"];
 }
-// src/campaign/presets/run-eval.ts
-async function runEval(opts) {
-  return runCampaign(opts);
+// src/campaign/score-utils.ts
+function campaignMeanComposite(campaign) {
+  const composites = [];
+  for (const cell of campaign.cells) {
+    const cellComposites = Object.values(cell.judgeScores).map((s) => s.composite);
+    if (cellComposites.length > 0) {
+      composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length);
+    }
+  }
+  return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
+}
+function campaignBreakdown(campaign) {
+  const dimSums = {};
+  const dimCounts = {};
+  const byScenario = /* @__PURE__ */ new Map();
+  for (const cell of campaign.cells) {
+    const judgeScores = Object.values(cell.judgeScores);
+    if (judgeScores.length === 0) continue;
+    const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length;
+    const arr = byScenario.get(cell.scenarioId) ?? [];
+    arr.push(cellComposite);
+    byScenario.set(cell.scenarioId, arr);
+    for (const score of judgeScores) {
+      for (const [key, value] of Object.entries(score.dimensions)) {
+        dimSums[key] = (dimSums[key] ?? 0) + value;
+        dimCounts[key] = (dimCounts[key] ?? 0) + 1;
+      }
+    }
+  }
+  const dimensions = {};
+  for (const key of Object.keys(dimSums)) {
+    const count = dimCounts[key] ?? 0;
+    dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
+  }
+  const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
+    scenarioId,
+    composite: comps.reduce((a, b) => a + b, 0) / comps.length
+  }));
+  return { dimensions, scenarios };
 }
 // src/campaign/presets/run-optimization.ts
@@ -470,11 +577,15 @@ async function runOptimization(opts) {
   let currentSurfaces = [opts.baselineSurface];
   let winnerSurface = opts.baselineSurface;
   let winnerSurfaceHash = surfaceHash(opts.baselineSurface);
-  let winnerComposite = meanComposite2(baselineCampaign);
+  let winnerComposite = campaignMeanComposite(baselineCampaign);
   let winnerLabel;
   let winnerRationale;
+  const scored = [
+    toParetoParent(opts.baselineSurface, winnerSurfaceHash, baselineCampaign, -1)
+  ];
   for (let gen = 0; gen < opts.maxGenerations; gen++) {
     if (opts.driver.decide?.({ history }).stop) break;
+    const paretoParents = computeParetoFrontier(scored);
     const proposed = await opts.driver.propose({
       currentSurface: currentSurfaces[0] ?? opts.baselineSurface,
       history,
@@ -484,7 +595,8 @@ async function runOptimization(opts) {
       signal: new AbortController().signal,
       report: opts.report,
       dataset: opts.labeledStore && opts.labeledStore !== "off" ? opts.labeledStore : void 0,
-      maxImprovementShots: opts.maxImprovementShots
+      maxImprovementShots: opts.maxImprovementShots,
+      paretoParents
     });
     const candidates = proposed.map(
       (p) => isProposedCandidate(p) ? p : { surface: p, label: "", rationale: "" }
@@ -498,8 +610,11 @@ async function runOptimization(opts) {
         dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
         runDir: `${opts.runDir}/gen-${gen}/candidate-${i}`
       });
-      const composite = meanComposite2(campaign);
+      const composite = campaignMeanComposite(campaign);
       surfaceResults.push({ surfaceHash: hash, surface, label, rationale, campaign, composite });
+      scored.push(
+        toParetoParent(surface, hash, campaign, gen, label || void 0, rationale || void 0)
+      );
     }
     surfaceResults.sort((a, b) => b.composite - a.composite);
     const promoted = surfaceResults.slice(0, promoteTopK);
@@ -515,7 +630,7 @@ async function runOptimization(opts) {
     const record = {
       generationIndex: gen,
       candidates: surfaceResults.map((s) => {
-        const breakdown = candidateBreakdown(s.campaign);
+        const breakdown = campaignBreakdown(s.campaign);
         const candidate = {
           surfaceHash: s.surfaceHash,
           composite: s.composite,
@@ -545,8 +660,49 @@ async function runOptimization(opts) {
     winnerSurfaceHash,
     winnerLabel,
     winnerRationale,
-    baselineCampaign
+    baselineCampaign,
+    paretoFrontier: computeParetoFrontier(scored)
+  };
+}
+function toParetoParent(surface, hash, campaign, generation, label, rationale) {
+  const objectives = {};
+  for (const { scenarioId, composite } of campaignBreakdown(campaign).scenarios) {
+    objectives[scenarioId] = composite;
+  }
+  const parent = {
+    surface,
+    surfaceHash: hash,
+    objectives,
+    composite: campaignMeanComposite(campaign),
+    generation
   };
+  if (label) parent.label = label;
+  if (rationale) parent.rationale = rationale;
+  return parent;
+}
+function computeParetoFrontier(scored) {
+  if (scored.length <= 1) return [...scored];
+  const ids = /* @__PURE__ */ new Set();
+  for (const p of scored) for (const id of Object.keys(p.objectives)) ids.add(id);
+  if (ids.size === 0) return [...scored];
+  const floor = {};
+  for (const id of ids) {
+    let min = Number.POSITIVE_INFINITY;
+    for (const p of scored) {
+      const v = p.objectives[id];
+      if (typeof v === "number" && Number.isFinite(v) && v < min) min = v;
+    }
+    floor[id] = Number.isFinite(min) ? min : 0;
+  }
+  const objectives = [...ids].map((id) => ({
+    name: id,
+    direction: "maximize",
+    value: (p) => {
+      const v = p.objectives[id];
+      return typeof v === "number" && Number.isFinite(v) ? v : floor[id] ?? 0;
+    }
+  }));
+  return paretoFrontier(scored, objectives).frontier;
 }
 function surfaceHash(surface) {
   const material = typeof surface === "string" ? surface : JSON.stringify({
@@ -556,45 +712,6 @@ function surfaceHash(surface) {
   });
   return createHash("sha256").update(material).digest("hex").slice(0, 16);
 }
-function meanComposite2(campaign) {
-  const composites = [];
-  for (const cell of campaign.cells) {
-    const cellComposites = Object.values(cell.judgeScores).map((s) => s.composite);
-    if (cellComposites.length > 0) {
-      composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length);
-    }
-  }
-  return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
-}
-function candidateBreakdown(campaign) {
-  const dimSums = {};
-  const dimCounts = {};
-  const byScenario = /* @__PURE__ */ new Map();
-  for (const cell of campaign.cells) {
-    const judgeScores = Object.values(cell.judgeScores);
-    if (judgeScores.length === 0) continue;
-    const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length;
-    const arr = byScenario.get(cell.scenarioId) ?? [];
-    arr.push(cellComposite);
-    byScenario.set(cell.scenarioId, arr);
-    for (const score of judgeScores) {
-      for (const [key, value] of Object.entries(score.dimensions)) {
-        dimSums[key] = (dimSums[key] ?? 0) + value;
-        dimCounts[key] = (dimCounts[key] ?? 0) + 1;
-      }
-    }
-  }
-  const dimensions = {};
-  for (const key of Object.keys(dimSums)) {
-    const count = dimCounts[key] ?? 0;
-    dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
-  }
-  const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
-    scenarioId,
-    composite: comps.reduce((a, b) => a + b, 0) / comps.length
-  }));
-  return { dimensions, scenarios };
-}
 // src/campaign/presets/run-improvement-loop.ts
 async function runImprovementLoop(opts) {
@@ -612,7 +729,7 @@ async function runImprovementLoop(opts) {
     throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
   }
   const optimization = await runOptimization(opts);
-  const { runCampaign: runCampaign2 } = await import("./run-campaign-HXPJAUZ3.js");
+  const { runCampaign: runCampaign2 } = await import("./run-campaign-5J3ED2UJ.js");
   const baselineOnHoldout = await runCampaign2({
     ...opts,
     scenarios: opts.holdoutScenarios,
@@ -687,6 +804,11 @@ ${fmt(winnerSurface)}`;
   return lines.join("\n");
 }
+// src/campaign/presets/run-eval.ts
+async function runEval(opts) {
+  return runCampaign(opts);
+}
 // src/campaign/provenance.ts
 import { createHash as createHash2 } from "crypto";
 import { join as join2 } from "path";
@@ -911,11 +1033,13 @@ export {
   heldOutGate,
   isProposedCandidate,
   labelTrustRank,
-  runEval,
+  campaignMeanComposite,
+  campaignBreakdown,
   runOptimization,
   surfaceHash,
   runImprovementLoop,
   defaultRenderDiff,
+  runEval,
   surfaceContentHash,
   buildLoopProvenanceRecord,
   loopProvenanceSpans,
@@ -923,4 +1047,4 @@ export {
   provenanceSpansPath,
   emitLoopProvenance
 };
-//# sourceMappingURL=chunk-SUGME4OT.js.map
+//# sourceMappingURL=chunk-Z7ZU7IYZ.js.map