npm - @tangle-network/agent-eval - Version diff - 0.25.0 → 0.27.2 - Mend

@tangle-network/agent-eval 0.25.0 → 0.27.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (81)

package/CHANGELOG.md +145 -0
package/README.md +5 -5
package/dist/builder-eval/index.js +1 -1
package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
package/dist/chunk-4U4BKCXK.js.map +1 -0
package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
package/dist/chunk-5AKPEK5L.js.map +1 -0
package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
package/dist/chunk-K33INZHH.js.map +1 -0
package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
package/dist/chunk-MAZ26DC7.js.map +1 -0
package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
package/dist/chunk-NCRFYPS3.js.map +1 -0
package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
package/dist/chunk-QHF6EQKK.js.map +1 -0
package/dist/chunk-R5UQJNKC.js +722 -0
package/dist/chunk-R5UQJNKC.js.map +1 -0
package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
package/dist/chunk-RUI6SIHY.js.map +1 -0
package/dist/{chunk-EDUKQ5AM.js → chunk-SZSBQUIJ.js} +2 -2
package/dist/chunk-SZSBQUIJ.js.map +1 -0
package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
package/dist/chunk-VSMTAMNK.js.map +1 -0
package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
package/dist/chunk-XFZCM5Z3.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
package/dist/control.d.ts +3 -3
package/dist/control.js +2 -2
package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
package/dist/governance/index.d.ts +1 -1
package/dist/{index-Oj9fAPPN.d.ts → index-BhLlu-qO.d.ts} +63 -2
package/dist/index.d.ts +279 -72
package/dist/index.js +222 -136
package/dist/index.js.map +1 -1
package/dist/knowledge/index.d.ts +1 -1
package/dist/knowledge/index.js +2 -2
package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +5 -5
package/dist/optimization.js +5 -5
package/dist/pipelines/index.d.ts +1 -1
package/dist/pipelines/index.js +2 -2
package/dist/{release-report-BNgMdqPF.d.ts → release-report-CCQqnK46.d.ts} +1 -1
package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
package/dist/reporting.d.ts +4 -4
package/dist/reporting.js +5 -5
package/dist/{researcher-BPT8x_NT.d.ts → researcher-G81CWc0q.d.ts} +9 -10
package/dist/rl.d.ts +26 -44
package/dist/rl.js +5 -5
package/dist/rl.js.map +1 -1
package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
package/dist/{summary-report-C7VPYEj2.d.ts → summary-report-Dl4akLKX.d.ts} +13 -1
package/dist/traces.d.ts +1 -1
package/dist/traces.js +2 -2
package/dist/wire/index.d.ts +2 -2
package/dist/wire/index.js +1 -1
package/docs/concepts.md +11 -0
package/docs/research-report-methodology.md +4 -4
package/docs/three-package-architecture.md +12 -24
package/package.json +1 -1
package/dist/chunk-2A5XJB43.js.map +0 -1
package/dist/chunk-4F5DQN55.js.map +0 -1
package/dist/chunk-5LBB5B3Z.js.map +0 -1
package/dist/chunk-EDUKQ5AM.js.map +0 -1
package/dist/chunk-I4MBDTY5.js +0 -272
package/dist/chunk-I4MBDTY5.js.map +0 -1
package/dist/chunk-JLZQWFV3.js.map +0 -1
package/dist/chunk-K2TPS5LB.js.map +0 -1
package/dist/chunk-LSH4MMOZ.js.map +0 -1
package/dist/chunk-NU65VQ7M.js.map +0 -1
package/dist/chunk-OWLAAMME.js.map +0 -1
package/dist/chunk-SESZDQPX.js.map +0 -1
package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0

package/dist/index.js CHANGED

@@ -11,7 +11,7 @@ import {
   failureClusterView,
   iqr,
   welchsTTest
-} from "./chunk-JLZQWFV3.js";
+} from "./chunk-K33INZHH.js";
 import {
   exportTrainingData,
   toNdjson
@@ -28,7 +28,7 @@ import {
   pytestTestParser,
   runTestGradedScenario,
   vitestTestParser
-} from "./chunk-OWLAAMME.js";
+} from "./chunk-QHF6EQKK.js";
 import {
   classifyEuAiRisk,
   euAiActReport,
@@ -43,7 +43,7 @@ import {
   knowledgeReadinessTracePayload,
   scoreKnowledgeReadiness,
   userQuestionsForKnowledgeGaps
-} from "./chunk-WWYCWKUM.js";
+} from "./chunk-3CKU6VGU.js";
 import {
   controlFailureClassFromVerification,
   controlRunToRunRecord,
@@ -54,7 +54,7 @@ import {
   runProposeReview,
   runProposeReviewAsControlLoop,
   scoreFromEvals
-} from "./chunk-ZN274SWR.js";
+} from "./chunk-PALJO75S.js";
 import {
   allCriticalPassed,
   objectiveEval,
@@ -62,7 +62,7 @@ import {
   stopOnNoProgress,
   stopOnRepeatedAction,
   subjectiveEval
-} from "./chunk-LSH4MMOZ.js";
+} from "./chunk-NCRFYPS3.js";
 import {
   CallbackResearcher,
   DEFAULT_MUTATION_PRIMITIVES,
@@ -96,7 +96,7 @@ import {
   summarizePreferenceMemory,
   trialTraceFromMultiShotTrial,
   withAssignedFeedbackSplit
-} from "./chunk-EDUKQ5AM.js";
+} from "./chunk-SZSBQUIJ.js";
 import {
   RunRecordValidationError,
   isRunRecord,
@@ -111,10 +111,10 @@ import {
   judgeReplayGate,
   releaseTraceEvidenceFromMultiShotTrials,
   renderReleaseReport
-} from "./chunk-RAF443UI.js";
+} from "./chunk-DBIGN5MJ.js";
 import {
   runEvalCampaign
-} from "./chunk-SESZDQPX.js";
+} from "./chunk-RUI6SIHY.js";
 import {
   LlmCallError,
   LlmClient,
@@ -128,7 +128,7 @@ import {
 import {
   evaluateInterimReleaseConfidence,
   pairedEvalueSequence
-} from "./chunk-NU65VQ7M.js";
+} from "./chunk-MAZ26DC7.js";
 import {
   RESEARCH_REPORT_HARD_PAIR_FLOOR,
   benjaminiHochberg,
@@ -141,18 +141,26 @@ import {
   requiredSampleSize,
   researchReport,
   summaryTable
-} from "./chunk-2A5XJB43.js";
+} from "./chunk-5AKPEK5L.js";
 import {
+  calibrateJudge,
+  calibrateJudgeContinuous,
   cohensD,
   confidenceInterval,
+  continuousAgreement,
+  corpusInterRaterAgreement,
+  corpusInterRaterAgreementFromJudgeScores,
   interRaterReliability,
   mannWhitneyU,
   normalizeScores,
   pairedTTest,
   partialCredit,
+  positionalBias,
+  selfPreference,
+  verbosityBias,
   weightedMean,
   wilcoxonSignedRank
-} from "./chunk-I4MBDTY5.js";
+} from "./chunk-R5UQJNKC.js";
 import {
   DEFAULT_REDACTION_RULES,
   FileSystemTraceStore,
@@ -166,7 +174,7 @@ import {
   iterateRawCalls,
   redactString,
   redactValue
-} from "./chunk-K2TPS5LB.js";
+} from "./chunk-4U4BKCXK.js";
 import {
   aggregateLlm,
   argHash,
@@ -208,7 +216,7 @@ import {
   hashJson,
   signManifest,
   verifyManifest
-} from "./chunk-4F5DQN55.js";
+} from "./chunk-VSMTAMNK.js";
 import {
   AgentEvalError,
   CaptureIntegrityError,
@@ -425,12 +433,12 @@ function ghCliClient(opts = {}) {
       await exec("git", ["branch", "-D", input.branchName], { cwd });
       await run("git", ["checkout", "-b", input.branchName]);
       const { mkdir, writeFile } = await import("fs/promises");
-      const { dirname: dirname5, join: join3, resolve } = await import("path");
+      const { dirname: dirname5, join: join4, resolve } = await import("path");
       for (const change of input.fileChanges) {
         const abs = resolve(cwd, change.path);
         await mkdir(dirname5(abs), { recursive: true });
         await writeFile(abs, change.contents, "utf8");
-        await run("git", ["add", join3(change.path)]);
+        await run("git", ["add", join4(change.path)]);
       }
       const env = {};
       if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName;
@@ -3073,36 +3081,36 @@ var FileSystemExperimentStore = class {
     return idx.listRuns(experimentId);
   }
   async ensureDir() {
-    const fs = await import("fs/promises");
-    await fs.mkdir(this.dir, { recursive: true });
+    const fs2 = await import("fs/promises");
+    await fs2.mkdir(this.dir, { recursive: true });
   }
   async append(name, record) {
     await this.ensureDir();
-    const fs = await import("fs/promises");
+    const fs2 = await import("fs/promises");
     const path = await import("path");
     const active = path.join(this.dir, `${name}.ndjson`);
     try {
-      const stat = await fs.stat(active);
+      const stat = await fs2.stat(active);
       if (stat.size >= this.maxBytes) {
         const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
-        await fs.rename(active, rolled);
+        await fs2.rename(active, rolled);
       }
     } catch {
     }
-    await fs.appendFile(active, `${JSON.stringify(record)}
+    await fs2.appendFile(active, `${JSON.stringify(record)}
 `, "utf8");
   }
   async load() {
     if (this.loaded && this.index) return this.index;
-    const fs = await import("fs/promises");
+    const fs2 = await import("fs/promises");
     const path = await import("path");
     const store = new InMemoryExperimentStore();
     try {
-      const entries = await fs.readdir(this.dir);
+      const entries = await fs2.readdir(this.dir);
       const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
       for (const file of sorted) {
         const full = path.join(this.dir, file);
-        const content = await fs.readFile(full, "utf8");
+        const content = await fs2.readFile(full, "utf8");
         const base = file.split(".")[0];
         for (const line of content.split("\n")) {
           if (!line.trim()) continue;
@@ -4956,114 +4964,6 @@ function seededShuffle(items, seed) {
   return out;
 }
-// src/judge-calibration.ts
-function calibrateJudge(golden, candidate) {
-  const map = /* @__PURE__ */ new Map();
-  for (const g of golden) map.set(g.itemId, { h: g.humanScore, j: NaN });
-  for (const c of candidate) {
-    const entry = map.get(c.itemId);
-    if (entry) entry.j = c.score;
-  }
-  const common = [...map.values()].filter((v) => Number.isFinite(v.j));
-  const n = common.length;
-  if (n < 2) {
-    return { n, pearson: NaN, kappa: NaN, mae: NaN, worstItems: [] };
-  }
-  const humans = common.map((c) => c.h);
-  const judges = common.map((c) => c.j);
-  const pearson = pearsonR(humans, judges);
-  const kappa = weightedKappa(humans.map(Math.round), judges.map(Math.round));
-  const absDiffs = common.map((c) => Math.abs(c.j - c.h));
-  const mae = absDiffs.reduce((a, b) => a + b, 0) / n;
-  const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
-  return { n, pearson, kappa, mae, worstItems: worst2 };
-}
-function positionalBias(scores) {
-  const pairs = /* @__PURE__ */ new Map();
-  for (const s of scores) {
-    const slot = pairs.get(s.itemId) ?? {};
-    if (s.positionOfAInput === "first") slot.first = s.score;
-    else if (s.positionOfAInput === "second") slot.second = s.score;
-    pairs.set(s.itemId, slot);
-  }
-  const deltas = [];
-  for (const { first, second } of pairs.values()) {
-    if (first !== void 0 && second !== void 0) deltas.push(first - second);
-  }
-  if (deltas.length === 0) return { avgDelta: 0, n: 0 };
-  return { avgDelta: deltas.reduce((a, b) => a + b, 0) / deltas.length, n: deltas.length };
-}
-function verbosityBias(samples) {
-  const n = samples.length;
-  if (n < 3) return { pearson: NaN, n };
-  return {
-    pearson: pearsonR(
-      samples.map((s) => s.outputLen),
-      samples.map((s) => s.score)
-    ),
-    n
-  };
-}
-function selfPreference(samples) {
-  const inF = samples.filter((s) => s.inFamily).map((s) => s.score);
-  const outF = samples.filter((s) => !s.inFamily).map((s) => s.score);
-  if (inF.length === 0 || outF.length === 0)
-    return { inFamilyMean: 0, outOfFamilyMean: 0, deltaMean: 0, n: 0 };
-  const inMean = inF.reduce((a, b) => a + b, 0) / inF.length;
-  const outMean = outF.reduce((a, b) => a + b, 0) / outF.length;
-  return {
-    inFamilyMean: inMean,
-    outOfFamilyMean: outMean,
-    deltaMean: inMean - outMean,
-    n: samples.length
-  };
-}
-function pearsonR(a, b) {
-  if (a.length !== b.length || a.length < 2) return NaN;
-  const mA = a.reduce((s, v) => s + v, 0) / a.length;
-  const mB = b.reduce((s, v) => s + v, 0) / b.length;
-  let num = 0, dA = 0, dB = 0;
-  for (let i = 0; i < a.length; i++) {
-    const da = a[i] - mA;
-    const db = b[i] - mB;
-    num += da * db;
-    dA += da * da;
-    dB += db * db;
-  }
-  if (dA === 0 || dB === 0) return dA === 0 && dB === 0 ? 1 : 0;
-  return num / Math.sqrt(dA * dB);
-}
-function weightedKappa(a, b) {
-  if (a.length !== b.length || a.length === 0) return NaN;
-  const min = Math.min(...a, ...b);
-  const max = Math.max(...a, ...b);
-  const K = max - min + 1;
-  if (K < 2) return 1;
-  const observed = Array.from({ length: K }, () => new Array(K).fill(0));
-  const rowMarg = new Array(K).fill(0);
-  const colMarg = new Array(K).fill(0);
-  for (let i = 0; i < a.length; i++) {
-    const ai = a[i] - min;
-    const bi = b[i] - min;
-    const row = observed[ai];
-    row[bi] = (row[bi] ?? 0) + 1;
-    rowMarg[ai]++;
-    colMarg[bi]++;
-  }
-  let num = 0;
-  let den = 0;
-  for (let i = 0; i < K; i++) {
-    for (let j = 0; j < K; j++) {
-      const w = (i - j) ** 2 / (K - 1) ** 2;
-      const expected = rowMarg[i] * colMarg[j] / a.length;
-      num += w * observed[i][j];
-      den += w * expected;
-    }
-  }
-  if (den === 0) return 1;
-  return 1 - num / den;
-}
 // src/observability.ts
 async function toLangfuseEnvelope(store, runId) {
   const run = await store.getRun(runId);
@@ -5564,7 +5464,7 @@ async function commitBisect(options) {
 }
 async function promptBisect(options) {
   const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
-  const join3 = (paragraphs) => paragraphs.join("\n\n");
+  const join4 = (paragraphs) => paragraphs.join("\n\n");
   const goodParas = split(options.good);
   const badParas = split(options.bad);
   if (goodParas.length !== badParas.length) {
@@ -5584,7 +5484,7 @@ async function promptBisect(options) {
   const result = await bisect({
     good: goodMask,
     bad: badMask,
-    runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
+    runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
     maxIterations: options.maxIterations ?? n + 5,
     halfway: (g, b) => {
       for (let i = 0; i < g.length; i++) {
@@ -5615,12 +5515,12 @@ async function promptBisect(options) {
     }
   }
   const materializedPath = result.path.map((s) => ({
-    state: join3(paragraphsFor(s.state)),
+    state: join4(paragraphsFor(s.state)),
     score: s.score,
     pass: s.pass
   }));
   return {
-    culprit: join3(paragraphsFor(culprit)),
+    culprit: join4(paragraphsFor(culprit)),
     path: materializedPath,
     converged: result.converged,
     inputInconsistent: result.inputInconsistent,
@@ -8308,6 +8208,52 @@ function createCompositeMutator(opts) {
   };
 }
+// src/discover-personas.ts
+import { promises as fs } from "fs";
+import { basename, extname, join as join3 } from "path";
+var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;
+async function discoverPersonas(dir, opts = {}) {
+  const pattern = opts.pattern ?? DEFAULT_PATTERN;
+  const exclude = new Set(opts.exclude ?? []);
+  const include = opts.include;
+  async function walk(d) {
+    let entries;
+    try {
+      const raw = await fs.readdir(d, { withFileTypes: true });
+      entries = raw.map((e) => ({ name: e.name, isDir: e.isDirectory() }));
+    } catch (err) {
+      const code = err.code;
+      if (code === "ENOENT") return [];
+      throw err;
+    }
+    const out = [];
+    for (const entry of entries) {
+      const full = join3(d, entry.name);
+      if (entry.isDir) {
+        if (opts.recursive) out.push(...await walk(full));
+        continue;
+      }
+      if (!pattern.test(entry.name)) continue;
+      if (exclude.has(entry.name) || exclude.has(basename(entry.name, extname(entry.name))))
+        continue;
+      if (include && include.length > 0) {
+        const id = basename(entry.name, extname(entry.name));
+        const matched = include.some((needle) => entry.name.includes(needle) || id.includes(needle));
+        if (!matched) continue;
+      }
+      out.push({
+        path: full,
+        filename: entry.name,
+        id: basename(entry.name, extname(entry.name))
+      });
+    }
+    return out;
+  }
+  const results = await walk(dir);
+  results.sort((a, b) => a.filename.localeCompare(b.filename));
+  return results;
+}
 // src/evolution-telemetry.ts
 import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3, readFileSync as readFileSync4, writeFileSync } from "fs";
 import { dirname as dirname3 } from "path";
@@ -8697,6 +8643,90 @@ var JsonlTrialCache = class {
   }
 };
+// src/judge-retry.ts
+var DEFAULT_MAX_ATTEMPTS = 3;
+var DEFAULT_TIMEOUT_MS = 9e4;
+var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
+var ABORT_PATTERNS = [
+  /AbortError/i,
+  /TimeoutError/i,
+  /fetch failed/i,
+  /ECONNRESET/i,
+  /ETIMEDOUT/i,
+  /EAI_AGAIN/i,
+  /this operation was aborted/i,
+  /stream.*ended.*unexpectedly/i,
+  /socket hang up/i
+];
+var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
+function defaultIsRetryable(err) {
+  if (err instanceof Error) {
+    if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
+    const status = err.status;
+    if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
+  }
+  return false;
+}
+function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+async function withJudgeRetry(judgeFn, policy = {}) {
+  const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
+  const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+  const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
+  const isRetryable = policy.isRetryable ?? defaultIsRetryable;
+  const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
+  let totalAttempts = 0;
+  const attemptErrors = [];
+  let lastError;
+  for (const model of models) {
+    for (let attempt = 0; attempt < maxAttempts; attempt++) {
+      totalAttempts += 1;
+      const controller = new AbortController();
+      const timer = setTimeout(() => controller.abort(new Error("TimeoutError")), timeoutMs);
+      try {
+        const value = await judgeFn(model, controller.signal);
+        clearTimeout(timer);
+        return {
+          value,
+          succeeded: true,
+          attempts: totalAttempts,
+          modelUsed: model,
+          attemptErrors
+        };
+      } catch (err) {
+        clearTimeout(timer);
+        const errObj = err instanceof Error ? err : new Error(String(err));
+        lastError = errObj;
+        attemptErrors.push({
+          attempt: totalAttempts,
+          model: model ?? "(default)",
+          error: errObj.message
+        });
+        if (!isRetryable(errObj)) {
+          return {
+            value: null,
+            succeeded: false,
+            attempts: totalAttempts,
+            error: errObj,
+            attemptErrors
+          };
+        }
+        if (attempt < maxAttempts - 1) {
+          await sleep(backoff(attempt));
+        }
+      }
+    }
+  }
+  return {
+    value: null,
+    succeeded: false,
+    attempts: totalAttempts,
+    error: lastError,
+    attemptErrors
+  };
+}
 // src/orthogonality.ts
 function passOrthogonality(input) {
   const passes = input.passes;
@@ -8914,6 +8944,55 @@ function createSandboxPool(opts) {
     utilization
   };
 }
+// src/trial-aggregator.ts
+function meanOf(xs) {
+  if (xs.length === 0) return 0;
+  return xs.reduce((a, b) => a + b, 0) / xs.length;
+}
+function meanMetrics(rows) {
+  if (rows.length === 0) return {};
+  const keys = /* @__PURE__ */ new Set();
+  for (const row of rows) for (const k of Object.keys(row)) keys.add(k);
+  const out = {};
+  for (const k of keys) {
+    const xs = rows.map((r) => r[k]).filter((x) => typeof x === "number");
+    if (xs.length > 0) out[k] = meanOf(xs);
+  }
+  return out;
+}
+function aggregateTrialsByMode(trials, opts) {
+  const gradedTrials = trials.filter((t) => !t.error);
+  const judgeOk = gradedTrials.filter((t) => t.judgeSucceeded !== false);
+  const judgeFailed = gradedTrials.filter((t) => t.judgeSucceeded === false);
+  if (opts.mode === "strict-fail" && judgeFailed.length > 0) {
+    return {
+      meanScore: 0,
+      meanCost: 0,
+      meanDurationMs: 0,
+      okRate: 0,
+      countedTrials: 0,
+      excludedFailedTrials: judgeFailed.length,
+      totalTrials: trials.length,
+      metrics: {},
+      strictFailure: {
+        failedCount: judgeFailed.length,
+        firstError: judgeFailed.find((t) => t.judgeError)?.judgeError
+      }
+    };
+  }
+  const counted = opts.mode === "exclude-failed" ? judgeOk : gradedTrials;
+  return {
+    meanScore: meanOf(counted.map((t) => t.score)),
+    meanCost: meanOf(counted.map((t) => t.cost ?? 0)),
+    meanDurationMs: meanOf(counted.map((t) => t.durationMs ?? 0)),
+    okRate: gradedTrials.length === 0 ? 0 : gradedTrials.filter((t) => t.ok).length / gradedTrials.length,
+    countedTrials: counted.length,
+    excludedFailedTrials: judgeFailed.length,
+    totalTrials: trials.length,
+    metrics: meanMetrics(counted.map((t) => t.metrics ?? {}))
+  };
+}
 export {
   AgentDriver,
   AgentEvalError,
@@ -9003,6 +9082,7 @@ export {
   adversarialJudge,
   aggregateLlm,
   aggregateRunScore,
+  aggregateTrialsByMode,
   allCriticalPassed,
   analyzeAntiSlop,
   analyzeSeries,
@@ -9025,6 +9105,7 @@ export {
   buildTrajectory,
   byteLengthRange,
   calibrateJudge,
+  calibrateJudgeContinuous,
   callLlm,
   callLlmJson,
   canaryLeakView,
@@ -9049,9 +9130,12 @@ export {
   computeToolUseMetrics,
   confidenceInterval,
   containsAll,
+  continuousAgreement,
   controlFailureClassFromVerification,
   controlRunToFeedbackTrajectory,
   controlRunToRunRecord,
+  corpusInterRaterAgreement,
+  corpusInterRaterAgreementFromJudgeScores,
   createAntiSlopJudge,
   createCompositeMutator,
   createCustomJudge,
@@ -9073,6 +9157,7 @@ export {
   defaultProviderRedactor,
   defaultReferenceReplayMatcher,
   deployGateLayer,
+  discoverPersonas,
   distillPlaybook,
   dominates,
   estimateCost,
@@ -9275,6 +9360,7 @@ export {
   whitespaceCollapseMutator,
   wilcoxonSignedRank,
   withAssignedFeedbackSplit,
+  withJudgeRetry,
   wranglerDeployRunner
 };
 //# sourceMappingURL=index.js.map