npm - @tangle-network/agent-eval - Versions diffs - 0.40.5 → 0.42.0 - Mend

@tangle-network/agent-eval 0.40.5 → 0.42.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

package/dist/campaign/index.d.ts +48 -355
package/dist/campaign/index.js +106 -6
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
package/dist/chunk-H4TOS272.js.map +1 -0
package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
package/dist/chunk-KQ26DYTQ.js.map +1 -0
package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
package/dist/chunk-MNL6LXGQ.js.map +1 -0
package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
package/dist/chunk-N4SBKEPJ.js.map +1 -0
package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
package/dist/index.d.ts +227 -687
package/dist/index.js +753 -1237
package/dist/index.js.map +1 -1
package/dist/integrity-CTDhR1Sg.d.ts +81 -0
package/dist/llm-client-BXVRUZyX.d.ts +234 -0
package/dist/openapi.json +1 -1
package/dist/pipelines/index.js +67 -3
package/dist/pipelines/index.js.map +1 -1
package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
package/dist/reporting.d.ts +2 -3
package/dist/reporting.js +4 -8
package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
package/dist/rl.d.ts +103 -221
package/dist/rl.js +44 -199
package/dist/rl.js.map +1 -1
package/dist/sequential-DdV5ShjT.d.ts +561 -0
package/dist/traces.d.ts +3 -2
package/dist/traces.js +5 -5
package/dist/types-BLbRTxoc.d.ts +367 -0
package/dist/wire/index.d.ts +1 -1
package/package.json +1 -6
package/dist/chunk-5U2DOJU4.js.map +0 -1
package/dist/chunk-AU2JLNSZ.js.map +0 -1
package/dist/chunk-DMW5VENN.js +0 -1412
package/dist/chunk-DMW5VENN.js.map +0 -1
package/dist/chunk-EGIPWXHL.js.map +0 -1
package/dist/chunk-MAZ26DC7.js +0 -99
package/dist/chunk-MAZ26DC7.js.map +0 -1
package/dist/chunk-NKLGKF2Q.js.map +0 -1
package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
package/dist/optimization.d.ts +0 -11
package/dist/optimization.js +0 -71
package/dist/optimization.js.map +0 -1
package/dist/sequential-5iSVfzl2.d.ts +0 -139
package/dist/summary-report-DuZXOk7K.d.ts +0 -917
package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0

package/dist/rl.js CHANGED Viewed

@@ -6,24 +6,23 @@ import {
 } from "./chunk-YV7J7X5N.js";
 import {
   runEvalCampaign
-} from "./chunk-LCIDRYGP.js";
-import "./chunk-VXNVVBZO.js";
+} from "./chunk-PD3MH6WU.js";
 import "./chunk-BWZEGTES.js";
 import {
   rubricPredictiveValidity
 } from "./chunk-YRZ4M5GS.js";
 import {
   evaluateInterimReleaseConfidence
-} from "./chunk-MAZ26DC7.js";
-import "./chunk-EGIPWXHL.js";
+} from "./chunk-MNL6LXGQ.js";
 import {
   benjaminiHochberg,
   wilcoxonSignedRank
 } from "./chunk-WP7SY7AI.js";
 import "./chunk-UBPIXOC4.js";
-import "./chunk-PC4UYEBM.js";
 import "./chunk-TVVP3ZZQ.js";
 import "./chunk-VSMTAMNK.js";
+import "./chunk-VXNVVBZO.js";
+import "./chunk-PC4UYEBM.js";
 import {
   ValidationError
 } from "./chunk-QYJT52YW.js";
@@ -508,48 +507,44 @@ function scenarioOf(run) {
 }
 // src/rl/run-record-adapters.ts
-function trialToRunRecord(trial, ctx, opts = {}) {
+function campaignToRunRecords(campaign, ctx) {
   const splitTag = ctx.splitTag ?? "search";
-  const promptHash = typeof ctx.promptHash === "function" ? ctx.promptHash(trial) : ctx.promptHash;
-  const configHash = typeof ctx.configHash === "function" ? ctx.configHash(trial) : ctx.configHash;
-  const runId = opts.runId ?? defaultRunId(ctx, trial);
-  const experimentId = opts.experimentIdPerTrial?.(trial) ?? ctx.experimentId;
-  const costRecorded = typeof trial.cost === "number" && Number.isFinite(trial.cost);
-  const costUsd = costRecorded ? trial.cost : ctx.defaultCostUsd ?? 0;
-  const raw = { ...trial.metrics ?? {} };
-  if (!costRecorded) raw.cost_unknown = 1;
-  if (typeof trial.durationMs === "number") raw.duration_ms = trial.durationMs;
-  raw.rep = trial.rep;
-  const score = Number.isFinite(trial.score) ? trial.score : 0;
-  const outcome = { raw };
-  if (splitTag === "holdout") outcome.holdoutScore = score;
-  else outcome.searchScore = score;
-  return {
-    runId,
-    experimentId,
-    candidateId: trial.variantId,
-    seed: trial.rep,
-    model: ctx.model,
-    promptHash,
-    configHash,
-    commitSha: ctx.commitSha,
-    wallMs: trial.durationMs ?? 0,
-    costUsd,
-    tokenUsage: { input: 0, output: 0 },
-    outcome,
-    failureMode: trial.ok ? void 0 : trial.error ? "optimizer_trial_error" : "optimizer_trial_failed",
-    splitTag,
-    scenarioId: trial.scenarioId
-  };
-}
-function trialsToRunRecords(trials, ctx) {
-  return trials.map((t) => trialToRunRecord(t, ctx));
+  const candidateId = ctx.candidateId ?? campaign.manifestHash;
+  return campaign.cells.map((cell) => {
+    const composites = Object.values(cell.judgeScores).map((s) => s.composite);
+    const score = composites.length > 0 ? composites.reduce((a, b) => a + b, 0) / composites.length : 0;
+    const raw = { rep: cell.rep, duration_ms: cell.durationMs };
+    for (const judge of Object.values(cell.judgeScores)) {
+      for (const [dim, value] of Object.entries(judge.dimensions)) {
+        if (Number.isFinite(value)) raw[`dim.${dim}`] = value;
+      }
+    }
+    if (typeof cell.generation === "number") raw.generation = cell.generation;
+    const outcome = { raw };
+    if (splitTag === "holdout") outcome.holdoutScore = score;
+    else outcome.searchScore = score;
+    return {
+      runId: cell.cellId,
+      experimentId: ctx.experimentId,
+      candidateId,
+      seed: cell.seed,
+      model: ctx.model,
+      promptHash: ctx.promptHash,
+      configHash: ctx.configHash,
+      commitSha: ctx.commitSha,
+      wallMs: cell.durationMs,
+      costUsd: Number.isFinite(cell.costUsd) ? cell.costUsd : ctx.defaultCostUsd ?? 0,
+      tokenUsage: { input: 0, output: 0 },
+      outcome,
+      failureMode: cell.error ? "cell_error" : void 0,
+      splitTag,
+      scenarioId: cell.scenarioId
+    };
+  });
 }
 function verificationReportToRunRecord(report, ctx, opts = {}) {
   const splitTag = ctx.splitTag ?? "search";
   const runId = opts.runId ?? `run-${ctx.candidateId}-${ctx.experimentId}-${report.startedAt}`;
-  const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
-  const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;
   const raw = {
     pass_count: report.passCount,
     fail_count: report.failCount,
@@ -577,8 +572,8 @@ function verificationReportToRunRecord(report, ctx, opts = {}) {
     candidateId: ctx.candidateId,
     seed: 0,
     model: ctx.model,
-    promptHash,
-    configHash,
+    promptHash: ctx.promptHash,
+    configHash: ctx.configHash,
     commitSha: ctx.commitSha,
     wallMs: report.durationMs,
     costUsd: ctx.defaultCostUsd ?? 0,
@@ -589,39 +584,6 @@ function verificationReportToRunRecord(report, ctx, opts = {}) {
     scenarioId: ctx.scenarioId
   };
 }
-function variantAggregateToRunRecord(agg, ctx, opts = {}) {
-  const splitTag = ctx.splitTag ?? "search";
-  const runId = opts.runId ?? `agg-${agg.variantId}-${ctx.experimentId}`;
-  const promptHash = typeof ctx.promptHash === "function" ? "p".repeat(64) : ctx.promptHash;
-  const configHash = typeof ctx.configHash === "function" ? "c".repeat(64) : ctx.configHash;
-  const raw = {
-    ...agg.metrics,
-    ok_rate: agg.okRate,
-    duration_ms: agg.meanDurationMs,
-    n_scenarios: agg.scenarios.length
-  };
-  const outcome = { raw };
-  if (splitTag === "holdout") outcome.holdoutScore = agg.meanScore;
-  else outcome.searchScore = agg.meanScore;
-  return {
-    runId,
-    experimentId: ctx.experimentId,
-    candidateId: agg.variantId,
-    seed: 0,
-    model: ctx.model,
-    promptHash,
-    configHash,
-    commitSha: ctx.commitSha,
-    wallMs: agg.meanDurationMs,
-    costUsd: agg.meanCost,
-    tokenUsage: { input: 0, output: 0 },
-    outcome,
-    splitTag
-  };
-}
-function defaultRunId(ctx, t) {
-  return `run-${ctx.experimentId}-${t.variantId}-${t.scenarioId}-${t.rep}`;
-}
 function failureModeFromLayer(layer) {
   if (layer.status === "error") return `layer_${layer.layer}_error`;
   if (layer.status === "fail") return `layer_${layer.layer}_fail`;
@@ -1245,120 +1207,6 @@ function defaultReward(run) {
   return typeof v === "number" && Number.isFinite(v) ? v : null;
 }
-// src/rl/auto-research.ts
-async function analyzeOptimizationResult(opts) {
-  const trials = extractTrials(opts.result);
-  const runs = trialsToRunRecords(trials, opts.ctx);
-  const rewardSignals = extractVerifiableRewardsFromRecords(runs, opts.verifiableReward ?? {});
-  const preferences = extractPreferences(runs, {
-    strategy: opts.preferences?.strategy ?? "paired-by-scenario-and-seed",
-    minMargin: opts.preferences?.minMargin ?? 0.05,
-    splitTag: opts.preferences?.splitTag ?? opts.ctx.splitTag ?? "search",
-    rewardOf: opts.preferences?.rewardOf
-  });
-  let interimConfidence = null;
-  if (opts.comparator) {
-    const deltaSeries = collectPairedDeltaSeries(runs, opts.comparator);
-    if (deltaSeries.some((s) => s.deltas.length > 0)) {
-      interimConfidence = evaluateInterimReleaseConfidence({
-        deltaSeries,
-        alpha: opts.sequential?.alpha,
-        bound: opts.sequential?.bound,
-        rope: opts.sequential?.rope
-      });
-    }
-  }
-  const rewardHacking = detectRewardHacking({
-    runs,
-    verifiableRewardOptions: opts.verifiableReward
-  });
-  let predictiveValidity = null;
-  if (opts.outcomes) {
-    predictiveValidity = await rubricPredictiveValidity({
-      runs,
-      outcomes: opts.outcomes.store,
-      outcomeMetrics: opts.outcomes.metrics
-    });
-  }
-  const trainerRows = {};
-  if (opts.trainerExport?.dpo) {
-    trainerRows.dpo = await toDpoRows(preferences.pairs, opts.trainerExport.dpo);
-  }
-  if (opts.trainerExport?.grpo) {
-    trainerRows.grpo = await toGrpoRows(runs, opts.trainerExport.grpo);
-  }
-  const summary = buildSummary({
-    runs,
-    preferences,
-    interimConfidence,
-    rewardHacking,
-    predictiveValidity
-  });
-  return {
-    runs,
-    rewardSignals,
-    preferences,
-    interimConfidence,
-    rewardHacking,
-    predictiveValidity,
-    trainerRows,
-    summary
-  };
-}
-function extractTrials(result) {
-  if ("evolution" in result) {
-    return collectFromEvolution(result.evolution);
-  }
-  return collectFromEvolution(result);
-}
-function collectFromEvolution(evolution) {
-  const trials = [];
-  for (const gen of evolution.generations) {
-    for (const t of gen.trials ?? []) trials.push(t);
-  }
-  return trials;
-}
-function collectPairedDeltaSeries(runs, comparator) {
-  const baseline = /* @__PURE__ */ new Map();
-  for (const r of runs) {
-    if (r.candidateId !== comparator) continue;
-    const sid = r.scenarioId ?? r.experimentId;
-    const score = r.outcome.holdoutScore ?? r.outcome.searchScore;
-    if (typeof score !== "number" || !Number.isFinite(score)) continue;
-    baseline.set(`${sid}::${r.seed}`, score);
-  }
-  const byCandidate = /* @__PURE__ */ new Map();
-  for (const r of runs) {
-    if (r.candidateId === comparator) continue;
-    const sid = r.scenarioId ?? r.experimentId;
-    const score = r.outcome.holdoutScore ?? r.outcome.searchScore;
-    if (typeof score !== "number" || !Number.isFinite(score)) continue;
-    const baseScore = baseline.get(`${sid}::${r.seed}`);
-    if (typeof baseScore !== "number") continue;
-    const arr = byCandidate.get(r.candidateId) ?? [];
-    arr.push(score - baseScore);
-    byCandidate.set(r.candidateId, arr);
-  }
-  return [...byCandidate.entries()].map(([candidateId, deltas]) => ({ candidateId, deltas }));
-}
-function buildSummary(args) {
-  const lines = [
-    `${args.runs.length} runs analysed`,
-    `${args.preferences.pairs.length} preference pairs (${args.preferences.strategy})`,
-    `reward-hacking verdict: ${args.rewardHacking.verdict}`
-  ];
-  if (args.interimConfidence) {
-    lines.push(
-      `sequential: ${args.interimConfidence.recommendation.decision}` + (args.interimConfidence.recommendation.candidateId ? ` ${args.interimConfidence.recommendation.candidateId}` : "")
-    );
-  }
-  if (args.predictiveValidity?.ranked[0]) {
-    const top = args.predictiveValidity.ranked[0];
-    lines.push(`top-rubric: ${top.rubric} \u03C1=${top.spearman.toFixed(2)}`);
-  }
-  return lines.join(" | ");
-}
 // src/rl/predictive-validity-researcher.ts
 var PredictiveValidityResearcher = class {
   opts;
@@ -1640,7 +1488,7 @@ async function runRLCampaign(opts) {
   let interimConfidence = null;
   if (opts.report?.comparator) {
     const comparator = opts.report.comparator;
-    const deltaSeries = collectPairedDeltaSeries2(campaign.runs, comparator);
+    const deltaSeries = collectPairedDeltaSeries(campaign.runs, comparator);
     if (deltaSeries.some((s) => s.deltas.length > 0)) {
       interimConfidence = evaluateInterimReleaseConfidence({
         deltaSeries,
@@ -1672,7 +1520,7 @@ async function runRLCampaign(opts) {
   if (opts.trainerExport?.sft) {
     trainerRows.sft = await toSftRows(campaign.runs, opts.trainerExport.sft);
   }
-  const summary = buildSummary2({
+  const summary = buildSummary({
     campaign,
     preferences,
     interimConfidence,
@@ -1691,7 +1539,7 @@ async function runRLCampaign(opts) {
     kind: "agent-eval-rl-campaign"
   };
 }
-function collectPairedDeltaSeries2(runs, comparator) {
+function collectPairedDeltaSeries(runs, comparator) {
   const baseline = /* @__PURE__ */ new Map();
   for (const r of runs) {
     if (r.candidateId !== comparator) continue;
@@ -1714,7 +1562,7 @@ function collectPairedDeltaSeries2(runs, comparator) {
   }
   return [...byCandidate.entries()].map(([candidateId, deltas]) => ({ candidateId, deltas }));
 }
-function buildSummary2(args) {
+function buildSummary(args) {
   const c = args.campaign;
   const lines = [
     `${c.campaignId}: ${c.runs.length} successful runs / ${c.failedRuns.length} failed (fingerprint ${c.campaignFingerprint.slice(0, 12)}\u2026)`,
@@ -1739,10 +1587,10 @@ function buildSummary2(args) {
 export {
   PredictiveValidityResearcher,
   adversarialScenarioSearch,
-  analyzeOptimizationResult,
   applyEloUpdate,
   bestOfN,
   buildPairwiseFromCampaign,
+  campaignToRunRecords,
   compareAdaptationCurves,
   detectRewardHacking,
   doublyRobust,
@@ -1781,10 +1629,7 @@ export {
   toSftJsonl,
   toSftRows,
   toTRLFormat,
-  trialToRunRecord,
-  trialsToRunRecords,
   varianceBasedCurriculum,
-  variantAggregateToRunRecord,
   verificationReportToRunRecord
 };
 //# sourceMappingURL=rl.js.map