@tangle-network/agent-eval 0.25.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -96,7 +96,7 @@ import {
96
96
  summarizePreferenceMemory,
97
97
  trialTraceFromMultiShotTrial,
98
98
  withAssignedFeedbackSplit
99
- } from "./chunk-EDUKQ5AM.js";
99
+ } from "./chunk-WHZMVFUV.js";
100
100
  import {
101
101
  RunRecordValidationError,
102
102
  isRunRecord,
@@ -425,12 +425,12 @@ function ghCliClient(opts = {}) {
425
425
  await exec("git", ["branch", "-D", input.branchName], { cwd });
426
426
  await run("git", ["checkout", "-b", input.branchName]);
427
427
  const { mkdir, writeFile } = await import("fs/promises");
428
- const { dirname: dirname5, join: join3, resolve } = await import("path");
428
+ const { dirname: dirname5, join: join4, resolve } = await import("path");
429
429
  for (const change of input.fileChanges) {
430
430
  const abs = resolve(cwd, change.path);
431
431
  await mkdir(dirname5(abs), { recursive: true });
432
432
  await writeFile(abs, change.contents, "utf8");
433
- await run("git", ["add", join3(change.path)]);
433
+ await run("git", ["add", join4(change.path)]);
434
434
  }
435
435
  const env = {};
436
436
  if (input.authorName) env.GIT_AUTHOR_NAME = input.authorName;
@@ -3073,36 +3073,36 @@ var FileSystemExperimentStore = class {
3073
3073
  return idx.listRuns(experimentId);
3074
3074
  }
3075
3075
  async ensureDir() {
3076
- const fs = await import("fs/promises");
3077
- await fs.mkdir(this.dir, { recursive: true });
3076
+ const fs2 = await import("fs/promises");
3077
+ await fs2.mkdir(this.dir, { recursive: true });
3078
3078
  }
3079
3079
  async append(name, record) {
3080
3080
  await this.ensureDir();
3081
- const fs = await import("fs/promises");
3081
+ const fs2 = await import("fs/promises");
3082
3082
  const path = await import("path");
3083
3083
  const active = path.join(this.dir, `${name}.ndjson`);
3084
3084
  try {
3085
- const stat = await fs.stat(active);
3085
+ const stat = await fs2.stat(active);
3086
3086
  if (stat.size >= this.maxBytes) {
3087
3087
  const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
3088
- await fs.rename(active, rolled);
3088
+ await fs2.rename(active, rolled);
3089
3089
  }
3090
3090
  } catch {
3091
3091
  }
3092
- await fs.appendFile(active, `${JSON.stringify(record)}
3092
+ await fs2.appendFile(active, `${JSON.stringify(record)}
3093
3093
  `, "utf8");
3094
3094
  }
3095
3095
  async load() {
3096
3096
  if (this.loaded && this.index) return this.index;
3097
- const fs = await import("fs/promises");
3097
+ const fs2 = await import("fs/promises");
3098
3098
  const path = await import("path");
3099
3099
  const store = new InMemoryExperimentStore();
3100
3100
  try {
3101
- const entries = await fs.readdir(this.dir);
3101
+ const entries = await fs2.readdir(this.dir);
3102
3102
  const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b));
3103
3103
  for (const file of sorted) {
3104
3104
  const full = path.join(this.dir, file);
3105
- const content = await fs.readFile(full, "utf8");
3105
+ const content = await fs2.readFile(full, "utf8");
3106
3106
  const base = file.split(".")[0];
3107
3107
  for (const line of content.split("\n")) {
3108
3108
  if (!line.trim()) continue;
@@ -5063,6 +5063,218 @@ function weightedKappa(a, b) {
5063
5063
  if (den === 0) return 1;
5064
5064
  return 1 - num / den;
5065
5065
  }
5066
/**
 * Inter-rater agreement statistics for a matrix of continuous scores.
 *
 * Each element of `scores` is one row with one score per rater. A row is dropped when it
 * has fewer than two entries, contains a non-finite value, or differs in length from the
 * first row that survived the first two checks.
 *
 * @param {number[][]} scores - Score rows, one column per rater.
 * @param {object} [opts]
 * @param {number} [opts.bootstrap=1000] - Bootstrap resamples; a value <= 0 leaves both CIs as [NaN, NaN].
 * @param {string} [opts.weights="quadratic"] - Disagreement scheme forwarded to continuousWeightedKappa.
 * @param {number} [opts.seed=12648430] - Seed handed to mulberry32 so resampling is repeatable.
 * @param {number} [opts.ciLevel=0.95] - Two-sided interval level forwarded to percentileBounds.
 * @returns {{weightedKappa: number, icc: number, pearson: number, spearman: number,
 *            ci: {icc: number[], weightedKappa: number[]}, n: number, raters: number}}
 *          Every statistic is NaN when fewer than two usable rows or fewer than two raters remain.
 */
function continuousAgreement(scores, opts = {}) {
  const resamples = opts.bootstrap ?? 1e3;
  const scheme = opts.weights ?? "quadratic";
  const rngSeed = opts.seed ?? 12648430;
  const level = opts.ciLevel ?? 0.95;

  const isUsableRow = (row) => row.length >= 2 && row.every((v) => Number.isFinite(v));
  const usable = scores.filter(isUsableRow);
  const raters = usable[0]?.length ?? 0;
  const clean = usable.filter((row) => row.length === raters);
  const n = clean.length;

  if (n < 2 || raters < 2) {
    return {
      weightedKappa: NaN,
      icc: NaN,
      pearson: NaN,
      spearman: NaN,
      ci: { icc: [NaN, NaN], weightedKappa: [NaN, NaN] },
      n,
      raters
    };
  }

  const result = {
    weightedKappa: continuousWeightedKappa(clean, scheme),
    icc: icc21(clean),
    pearson: avgPairwise(clean, pearsonR),
    spearman: avgPairwise(clean, spearmanR),
    ci: { icc: [NaN, NaN], weightedKappa: [NaN, NaN] },
    n,
    raters
  };

  if (resamples > 0) {
    const nextRandom = mulberry32(rngSeed);
    const iccDraws = [];
    const kappaDraws = [];
    for (let round = 0; round < resamples; round += 1) {
      // Resample rows with replacement; one RNG draw per row, in row order.
      const resample = Array.from({ length: n }, () => clean[Math.floor(nextRandom() * n)]);
      const iccDraw = icc21(resample);
      const kappaDraw = continuousWeightedKappa(resample, scheme);
      if (Number.isFinite(iccDraw)) iccDraws.push(iccDraw);
      if (Number.isFinite(kappaDraw)) kappaDraws.push(kappaDraw);
    }

    const [lowerQ, upperQ] = percentileBounds(level);
    // Percentile interval over the finite draws only; quantile expects ascending input.
    const percentileInterval = (draws) => {
      draws.sort((x, y) => x - y);
      return [quantile(draws, lowerQ), quantile(draws, upperQ)];
    };
    if (iccDraws.length > 0) result.ci.icc = percentileInterval(iccDraws);
    if (kappaDraws.length > 0) result.ci.weightedKappa = percentileInterval(kappaDraws);
  }

  return result;
}
5128
/**
 * Extends the result of calibrateJudge with continuous agreement statistics.
 *
 * Golden and candidate entries are paired by `itemId`. Only golden items that received a
 * finite candidate score contribute a [humanScore, judgeScore] row to continuousAgreement.
 * When an itemId repeats, the last occurrence wins on either side.
 *
 * @param {Iterable<{itemId: *, humanScore: number}>} golden - Human reference scores.
 * @param {Iterable<{itemId: *, score: number}>} candidate - Judge scores to calibrate.
 * @param {object} [opts] - Forwarded unchanged to continuousAgreement.
 * @returns {object} The calibrateJudge result plus `weightedKappaContinuous`, `icc`,
 *                   `spearman` and `ci` taken from continuousAgreement.
 */
function calibrateJudgeContinuous(golden, candidate, opts = {}) {
  const base = calibrateJudge(golden, candidate);

  // itemId -> [humanScore, judgeScore]; the judge slot stays NaN until a candidate fills it.
  const pairedById = /* @__PURE__ */ new Map();
  for (const { itemId, humanScore } of golden) {
    pairedById.set(itemId, [humanScore, NaN]);
  }
  for (const { itemId, score } of candidate) {
    const pair = pairedById.get(itemId);
    if (pair) pair[1] = score;
  }

  const rows = [...pairedById.values()].filter(([, judgeScore]) => Number.isFinite(judgeScore));
  const { weightedKappa, icc, spearman, ci } = continuousAgreement(rows, opts);

  return {
    ...base,
    weightedKappaContinuous: weightedKappa,
    icc,
    spearman,
    ci
  };
}
5149
/**
 * Weighted kappa for continuous scores, averaged over every pair of raters.
 *
 * For each rater pair the observed disagreement is the mean disagreement on the same row,
 * and the expected disagreement is the mean over all row-by-row combinations of the two
 * raters' scores. The pair's kappa is 1 - observed / expected. When the expected
 * disagreement is exactly 0 the pair scores 1 if the observed one is also 0, else 0.
 *
 * @param {number[][]} rows - Score rows, one column per rater; rows[0] fixes the rater count.
 * @param {string} scheme - "linear" uses |x - y|; any other value uses (x - y) ** 2.
 * @returns {number} Mean pairwise kappa, or NaN for no rows or fewer than two raters.
 */
function continuousWeightedKappa(rows, scheme) {
  if (rows.length === 0) return NaN;
  const raterCount = rows[0].length;
  if (raterCount < 2) return NaN;

  const disagreement = scheme === "linear"
    ? (x, y) => Math.abs(x - y)
    : (x, y) => (x - y) ** 2;
  const columnOf = (rater) => rows.map((row) => row[rater]);
  const itemCount = rows.length;

  let kappaTotal = 0;
  let pairCount = 0;
  for (let first = 0; first < raterCount; first += 1) {
    for (let second = first + 1; second < raterCount; second += 1) {
      const left = columnOf(first);
      const right = columnOf(second);

      let observed = 0;
      for (let i = 0; i < itemCount; i += 1) {
        observed += disagreement(left[i], right[i]);
      }
      observed /= itemCount;

      // Expected disagreement: every score of one rater against every score of the other.
      let expected = 0;
      for (const l of left) {
        for (const r of right) expected += disagreement(l, r);
      }
      expected /= itemCount * itemCount;

      if (expected === 0) {
        kappaTotal += observed === 0 ? 1 : 0;
      } else {
        kappaTotal += 1 - observed / expected;
      }
      pairCount += 1;
    }
  }
  return pairCount === 0 ? NaN : kappaTotal / pairCount;
}
5179
/**
 * Intraclass correlation computed from a two-way sum-of-squares decomposition
 * (rows = rated items, columns = raters).
 *
 * Result: (msR - msE) / (msR + (k - 1) * msE + k * (msC - msE) / n), where msR, msC and
 * msE are the row, column and residual mean squares, n the row count and k the column count.
 *
 * @param {number[][]} rows - Score rows; rows[0].length fixes the number of raters.
 * @returns {number} The coefficient; NaN for fewer than two rows or two raters. When the
 *                   denominator is exactly 0, returns 1 if msR and msE are both 0, else 0.
 */
function icc21(rows) {
  const subjects = rows.length;
  if (subjects < 2) return NaN;
  const raters = rows[0].length;
  if (raters < 2) return NaN;

  const subjectMeans = rows.map((row) => row.reduce((acc, v) => acc + v, 0) / raters);
  const raterMeans = Array.from({ length: raters }, (_, col) => {
    let total = 0;
    for (let r = 0; r < subjects; r += 1) total += rows[r][col];
    return total / subjects;
  });

  let grandMean = 0;
  for (const mean of subjectMeans) grandMean += mean;
  grandMean /= subjects;

  const squaredDeviation = (value) => (value - grandMean) ** 2;

  let ssRows = 0;
  for (const mean of subjectMeans) ssRows += squaredDeviation(mean);
  ssRows *= raters;

  let ssCols = 0;
  for (const mean of raterMeans) ssCols += squaredDeviation(mean);
  ssCols *= subjects;

  let ssTotal = 0;
  for (let r = 0; r < subjects; r += 1) {
    for (let c = 0; c < raters; c += 1) ssTotal += squaredDeviation(rows[r][c]);
  }

  // Residual = total minus the row and column effects.
  const ssError = ssTotal - ssRows - ssCols;
  const dfError = (subjects - 1) * (raters - 1);
  const msRows = ssRows / (subjects - 1);
  const msCols = ssCols / (raters - 1);
  const msError = dfError > 0 ? ssError / dfError : 0;

  const denominator = msRows + (raters - 1) * msError + raters * (msCols - msError) / subjects;
  if (denominator === 0) {
    return msRows === 0 && msError === 0 ? 1 : 0;
  }
  return (msRows - msError) / denominator;
}
5217
/**
 * Applies a two-series statistic to every pair of rater columns and averages the
 * finite results.
 *
 * @param {number[][]} rows - Score rows; rows[0].length fixes the number of columns.
 * @param {(a: number[], b: number[]) => number} fn - Statistic for two equally long series.
 * @returns {number} Mean of the finite pairwise values; NaN when there are fewer than two
 *                   columns or no pair produced a finite value.
 */
function avgPairwise(rows, fn) {
  const columnCount = rows[0]?.length ?? 0;
  if (columnCount < 2) return NaN;

  // Build fresh column arrays for every call so fn never sees a shared array.
  const columnOf = (index) => rows.map((row) => row[index]);

  const finiteValues = [];
  for (let left = 0; left < columnCount; left += 1) {
    for (let right = left + 1; right < columnCount; right += 1) {
      const value = fn(columnOf(left), columnOf(right));
      if (Number.isFinite(value)) finiteValues.push(value);
    }
  }

  if (finiteValues.length === 0) return NaN;
  return finiteValues.reduce((total, value) => total + value, 0) / finiteValues.length;
}
5235
/**
 * Spearman rank correlation: pearsonR applied to the tie-averaged ranks of both series.
 *
 * @param {number[]} a - First series.
 * @param {number[]} b - Second series.
 * @returns {number} NaN when the lengths differ or the series have fewer than two values;
 *                   otherwise whatever pearsonR returns for the two rank vectors.
 */
function spearmanR(a, b) {
  const comparable = a.length === b.length && !(a.length < 2);
  return comparable ? pearsonR(rankWithTies(a), rankWithTies(b)) : NaN;
}
5239
/**
 * Assigns 1-based ascending ranks to the values, giving every member of a run of equal
 * values the average of the ranks that run occupies.
 *
 * @param {number[]} xs - Values to rank; the input array is not modified.
 * @returns {number[]} Ranks aligned with the positions of `xs`.
 */
function rankWithTies(xs) {
  const total = xs.length;
  const ordered = xs
    .map((value, position) => ({ value, position }))
    .sort((left, right) => left.value - right.value);
  const ranks = new Array(total).fill(0);

  let runStart = 0;
  while (runStart < total) {
    // Extend the run while the next sorted value is identical.
    let runEnd = runStart;
    while (runEnd + 1 < total && ordered[runEnd + 1].value === ordered[runStart].value) {
      runEnd += 1;
    }
    // Mean of the 1-based ranks runStart + 1 .. runEnd + 1.
    const sharedRank = (runStart + runEnd) / 2 + 1;
    for (let k = runStart; k <= runEnd; k += 1) {
      ranks[ordered[k].position] = sharedRank;
    }
    runStart = runEnd + 1;
  }
  return ranks;
}
5254
/**
 * Creates a seeded pseudo-random generator with 32 bits of internal state.
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer.
 * @returns {() => number} Function returning the next value in [0, 1); the same seed
 *                         always yields the same sequence.
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    // Advance the state by a fixed increment, wrapping at 2^32.
    state = (state + 1831565813) >>> 0;
    let mixed = state;
    mixed = Math.imul(mixed ^ (mixed >>> 15), mixed | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    // Scale the unsigned 32-bit result by 2^32 so the output stays below 1.
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };
}
5264
/**
 * Converts a two-sided interval level into its lower and upper quantile positions.
 *
 * @param {number} ciLevel - Interval level, e.g. 0.5 gives [0.25, 0.75].
 * @returns {number[]} `[tail, 1 - tail]` with tail = (1 - ciLevel) / 2.
 */
function percentileBounds(ciLevel) {
  const halfOutside = (1 - ciLevel) / 2;
  const upper = 1 - halfOutside;
  return [halfOutside, upper];
}
5268
/**
 * Linearly interpolated quantile of an array that is already sorted ascending.
 *
 * @param {number[]} sorted - Ascending values; this function does not sort them.
 * @param {number} q - Quantile position; 0 maps to the first element, 1 to the last.
 * @returns {number} NaN for an empty array, the sole element for length 1, otherwise the
 *                   value interpolated between the two neighbours of q * (length - 1).
 */
function quantile(sorted, q) {
  const count = sorted.length;
  if (count === 0) return NaN;
  if (count === 1) return sorted[0];

  const position = q * (count - 1);
  const below = Math.floor(position);
  const above = Math.ceil(position);
  if (below === above) return sorted[below];

  const weight = position - below;
  return sorted[below] * (1 - weight) + sorted[above] * weight;
}
5066
5278
 
5067
5279
  // src/observability.ts
5068
5280
  async function toLangfuseEnvelope(store, runId) {
@@ -5564,7 +5776,7 @@ async function commitBisect(options) {
5564
5776
  }
5565
5777
  async function promptBisect(options) {
5566
5778
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
5567
- const join3 = (paragraphs) => paragraphs.join("\n\n");
5779
+ const join4 = (paragraphs) => paragraphs.join("\n\n");
5568
5780
  const goodParas = split(options.good);
5569
5781
  const badParas = split(options.bad);
5570
5782
  if (goodParas.length !== badParas.length) {
@@ -5584,7 +5796,7 @@ async function promptBisect(options) {
5584
5796
  const result = await bisect({
5585
5797
  good: goodMask,
5586
5798
  bad: badMask,
5587
- runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
5799
+ runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
5588
5800
  maxIterations: options.maxIterations ?? n + 5,
5589
5801
  halfway: (g, b) => {
5590
5802
  for (let i = 0; i < g.length; i++) {
@@ -5615,12 +5827,12 @@ async function promptBisect(options) {
5615
5827
  }
5616
5828
  }
5617
5829
  const materializedPath = result.path.map((s) => ({
5618
- state: join3(paragraphsFor(s.state)),
5830
+ state: join4(paragraphsFor(s.state)),
5619
5831
  score: s.score,
5620
5832
  pass: s.pass
5621
5833
  }));
5622
5834
  return {
5623
- culprit: join3(paragraphsFor(culprit)),
5835
+ culprit: join4(paragraphsFor(culprit)),
5624
5836
  path: materializedPath,
5625
5837
  converged: result.converged,
5626
5838
  inputInconsistent: result.inputInconsistent,
@@ -5865,7 +6077,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
5865
6077
  runCountByScenario.set(r.scenarioId, (runCountByScenario.get(r.scenarioId) ?? 0) + 1);
5866
6078
  }
5867
6079
  const runCounts = [...runCountByScenario.values()];
5868
- const p25 = runCounts.length > 0 ? quantile(runCounts, 0.25) : 0;
6080
+ const p25 = runCounts.length > 0 ? quantile2(runCounts, 0.25) : 0;
5869
6081
  for (const s of scenarios) {
5870
6082
  const count = runCountByScenario.get(s.id) ?? 0;
5871
6083
  if (count <= p25 && count < 3) {
@@ -5919,7 +6131,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
5919
6131
  }
5920
6132
  return targets.sort((a, b) => b.priority - a.priority).slice(0, topK);
5921
6133
  }
5922
- function quantile(xs, p) {
6134
+ function quantile2(xs, p) {
5923
6135
  const sorted = [...xs].sort((a, b) => a - b);
5924
6136
  const idx = p * (sorted.length - 1);
5925
6137
  const lo = Math.floor(idx);
@@ -8308,6 +8520,52 @@ function createCompositeMutator(opts) {
8308
8520
  };
8309
8521
  }
8310
8522
 
8523
+ // src/discover-personas.ts
8524
+ import { promises as fs } from "fs";
8525
+ import { basename, extname, join as join3 } from "path";
8526
// Default filename filter: two digits, a dash, a non-empty name, then one of the listed extensions.
var DEFAULT_PATTERN = /^\d{2}-.+\.(yaml|yml|json|md)$/;

/**
 * Lists persona files in a directory.
 *
 * @param {string} dir - Directory to scan. A directory that does not exist (ENOENT) yields
 *   no entries; any other readdir error is rethrown.
 * @param {object} [opts]
 * @param {RegExp} [opts.pattern] - Tested against each filename; defaults to DEFAULT_PATTERN.
 * @param {string[]} [opts.exclude] - Filenames, or filenames without extension, to skip.
 * @param {string[]} [opts.include] - When non-empty, a file is kept only if its filename
 *   contains at least one of these substrings.
 * @param {boolean} [opts.recursive] - Descend into subdirectories when truthy.
 * @returns {Promise<Array<{path: string, filename: string, id: string}>>} Matches sorted by
 *   filename, then by path; `id` is the filename without its extension.
 */
async function discoverPersonas(dir, opts = {}) {
  const pattern = opts.pattern ?? DEFAULT_PATTERN;
  const exclude = new Set(opts.exclude ?? []);
  const include = opts.include ?? [];

  const matchesPattern = (name) => {
    // A /g or /y regex resumes from lastIndex on every test() call, which would make
    // consecutive filenames fail spuriously; restart each test from the beginning.
    if (pattern.global || pattern.sticky) pattern.lastIndex = 0;
    return pattern.test(name);
  };

  async function walk(current) {
    let entries;
    try {
      entries = await fs.readdir(current, { withFileTypes: true });
    } catch (err) {
      if (err?.code === "ENOENT") return [];
      throw err;
    }

    const found = [];
    for (const entry of entries) {
      const full = join3(current, entry.name);
      if (entry.isDirectory()) {
        if (opts.recursive) found.push(...await walk(full));
        continue;
      }
      if (!matchesPattern(entry.name)) continue;

      const id = basename(entry.name, extname(entry.name));
      if (exclude.has(entry.name) || exclude.has(id)) continue;
      if (include.length > 0) {
        const wanted = include.some((needle) => entry.name.includes(needle) || id.includes(needle));
        if (!wanted) continue;
      }
      found.push({ path: full, filename: entry.name, id });
    }
    return found;
  }

  const results = await walk(dir);
  // Path is the tie-break so equal filenames from different directories get a stable order.
  results.sort((a, b) => a.filename.localeCompare(b.filename) || a.path.localeCompare(b.path));
  return results;
}
8568
+
8311
8569
  // src/evolution-telemetry.ts
8312
8570
  import { appendFileSync as appendFileSync3, existsSync as existsSync5, mkdirSync as mkdirSync3, readFileSync as readFileSync4, writeFileSync } from "fs";
8313
8571
  import { dirname as dirname3 } from "path";
@@ -8697,6 +8955,90 @@ var JsonlTrialCache = class {
8697
8955
  }
8698
8956
  };
8699
8957
 
8958
+ // src/judge-retry.ts
8959
// Defaults applied by withJudgeRetry when the policy omits a field.
var DEFAULT_MAX_ATTEMPTS = 3;
var DEFAULT_TIMEOUT_MS = 9e4;
// Exponential backoff: 500 ms doubled per zero-based attempt index, capped at 16 000 ms.
var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
// Error message/name fragments that defaultIsRetryable treats as retryable.
var ABORT_PATTERNS = [
  /AbortError/i,
  /TimeoutError/i,
  /fetch failed/i,
  /ECONNRESET/i,
  /ETIMEDOUT/i,
  /EAI_AGAIN/i,
  /this operation was aborted/i,
  /stream.*ended.*unexpectedly/i,
  /socket hang up/i
];
// Numeric `status` values that defaultIsRetryable treats as retryable.
var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);

/**
 * Default retry classifier.
 *
 * @param {*} err - Value to classify; anything that is not an Error is not retryable.
 * @returns {boolean} True when the error's message or name matches one of ABORT_PATTERNS,
 *   or its numeric `status` property is in RETRYABLE_HTTP_STATUS.
 */
function defaultIsRetryable(err) {
  if (!(err instanceof Error)) return false;
  if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
  const status = err.status;
  return typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status);
}

/**
 * Resolves after `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Calls a judge function with a per-attempt timeout, retries and model fallback.
 *
 * Models are tried in order and each gets up to `maxAttempts` attempts. An error the
 * classifier rejects ends the whole run immediately, without trying further models.
 * The timeout is cooperative: on expiry the AbortSignal given to `judgeFn` is aborted with
 * `new Error("TimeoutError")`; the pending `judgeFn` call is not otherwise interrupted.
 * This function does not throw for judge failures; they are reported in the result.
 *
 * @param {(model: (string|undefined), signal: AbortSignal) => Promise<*>} judgeFn
 * @param {object} [policy]
 * @param {number} [policy.maxAttempts=3] - Attempts per model.
 * @param {number} [policy.timeoutMs=90000] - Delay before the attempt's signal is aborted.
 * @param {(attempt: number) => number} [policy.backoffMs] - Delay before the next attempt
 *   on the same model, given the zero-based index of the attempt that just failed.
 * @param {(err: Error) => boolean} [policy.isRetryable] - Defaults to defaultIsRetryable.
 * @param {string[]} [policy.models] - Fallback order; when empty or omitted, a single
 *   `undefined` model is used.
 * @returns {Promise<object>} On success `{ value, succeeded: true, attempts, modelUsed,
 *   attemptErrors }`; on failure `{ value: null, succeeded: false, attempts, error,
 *   attemptErrors }`. `attempts` counts across all models.
 */
async function withJudgeRetry(judgeFn, policy = {}) {
  const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
  const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
  const isRetryable = policy.isRetryable ?? defaultIsRetryable;
  const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
  const attemptErrors = [];
  let totalAttempts = 0;
  let lastError;

  // Wrap non-Error throwables but keep the fields the retry classifier inspects
  // (message, name, status), so a rejected `{ status: 503 }` is still retried.
  const toError = (thrown) => {
    if (thrown instanceof Error) return thrown;
    const message = typeof thrown?.message === "string" ? thrown.message : String(thrown);
    const wrapped = new Error(message);
    wrapped.cause = thrown;
    if (typeof thrown?.name === "string") wrapped.name = thrown.name;
    if (typeof thrown?.status === "number") wrapped.status = thrown.status;
    return wrapped;
  };

  // Runs one attempt; the timer is always cleared before any backoff sleep begins.
  const runAttempt = async (model) => {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(new Error("TimeoutError")), timeoutMs);
    try {
      return { ok: true, value: await judgeFn(model, controller.signal) };
    } catch (thrown) {
      return { ok: false, error: toError(thrown) };
    } finally {
      clearTimeout(timer);
    }
  };

  const failure = (error) => ({
    value: null,
    succeeded: false,
    attempts: totalAttempts,
    error,
    attemptErrors
  });

  for (const model of models) {
    for (let attempt = 0; attempt < maxAttempts; attempt++) {
      totalAttempts += 1;
      const outcome = await runAttempt(model);
      if (outcome.ok) {
        return {
          value: outcome.value,
          succeeded: true,
          attempts: totalAttempts,
          modelUsed: model,
          attemptErrors
        };
      }
      lastError = outcome.error;
      attemptErrors.push({
        attempt: totalAttempts,
        model: model ?? "(default)",
        error: outcome.error.message
      });
      if (!isRetryable(outcome.error)) return failure(outcome.error);
      // No sleep after a model's final attempt; the next model starts right away.
      if (attempt < maxAttempts - 1) {
        await sleep(backoff(attempt));
      }
    }
  }
  return failure(lastError);
}
9041
+
8700
9042
  // src/orthogonality.ts
8701
9043
  function passOrthogonality(input) {
8702
9044
  const passes = input.passes;
@@ -8914,6 +9256,55 @@ function createSandboxPool(opts) {
8914
9256
  utilization
8915
9257
  };
8916
9258
  }
9259
+
9260
+ // src/trial-aggregator.ts
9261
/**
 * Arithmetic mean of an array of numbers.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty array.
 */
function meanOf(xs) {
  const count = xs.length;
  return count === 0 ? 0 : xs.reduce((total, x) => total + x, 0) / count;
}
9265
/**
 * Averages each metric across a list of metric records.
 *
 * For every key present in any row, the mean is taken over the rows whose value for that
 * key has type "number". Keys with no numeric value in any row are left out of the result.
 *
 * @param {object[]} rows - Metric records keyed by metric name.
 * @returns {object} Metric name mapped to its mean; `{}` when `rows` is empty.
 */
function meanMetrics(rows) {
  if (rows.length === 0) return {};

  const metricNames = new Set(rows.flatMap((row) => Object.keys(row)));
  const averages = {};
  for (const name of metricNames) {
    const samples = [];
    for (const row of rows) {
      const value = row[name];
      if (typeof value === "number") samples.push(value);
    }
    if (samples.length === 0) continue;
    averages[name] = samples.reduce((total, value) => total + value, 0) / samples.length;
  }
  return averages;
}
9276
/**
 * Aggregates trial results, treating judge failures according to `opts.mode`.
 *
 * Trials with a truthy `error` are ignored throughout, apart from `totalTrials`. Of the
 * remaining trials, those with `judgeSucceeded === false` count as judge failures.
 *
 * - "strict-fail": if any judge failure exists, every aggregate is zeroed and a
 *   `strictFailure` summary (failure count and the first `judgeError` found) is attached.
 * - "exclude-failed": means are computed over trials that are not judge failures.
 * - any other mode: means are computed over all trials without `error`.
 *
 * `okRate` is always computed over all trials without `error`, whatever the mode.
 * Missing `cost`, `durationMs` and `metrics` count as 0, 0 and `{}`.
 *
 * @param {object[]} trials - Trial records.
 * @param {{mode: string}} opts - Aggregation options.
 * @returns {object} `{ meanScore, meanCost, meanDurationMs, okRate, countedTrials,
 *   excludedFailedTrials, totalTrials, metrics }`, plus `strictFailure` in the zeroed case.
 */
function aggregateTrialsByMode(trials, opts) {
  const graded = trials.filter((trial) => !trial.error);
  const judgeFailed = graded.filter((trial) => trial.judgeSucceeded === false);
  const failedCount = judgeFailed.length;

  if (opts.mode === "strict-fail" && failedCount > 0) {
    return {
      meanScore: 0,
      meanCost: 0,
      meanDurationMs: 0,
      okRate: 0,
      countedTrials: 0,
      excludedFailedTrials: failedCount,
      totalTrials: trials.length,
      metrics: {},
      strictFailure: {
        failedCount,
        firstError: judgeFailed.find((trial) => trial.judgeError)?.judgeError
      }
    };
  }

  const counted = opts.mode === "exclude-failed"
    ? graded.filter((trial) => trial.judgeSucceeded !== false)
    : graded;
  const okCount = graded.filter((trial) => trial.ok).length;

  return {
    meanScore: meanOf(counted.map((trial) => trial.score)),
    meanCost: meanOf(counted.map((trial) => trial.cost ?? 0)),
    meanDurationMs: meanOf(counted.map((trial) => trial.durationMs ?? 0)),
    okRate: graded.length === 0 ? 0 : okCount / graded.length,
    countedTrials: counted.length,
    excludedFailedTrials: failedCount,
    totalTrials: trials.length,
    metrics: meanMetrics(counted.map((trial) => trial.metrics ?? {}))
  };
}
8917
9308
  export {
8918
9309
  AgentDriver,
8919
9310
  AgentEvalError,
@@ -9003,6 +9394,7 @@ export {
9003
9394
  adversarialJudge,
9004
9395
  aggregateLlm,
9005
9396
  aggregateRunScore,
9397
+ aggregateTrialsByMode,
9006
9398
  allCriticalPassed,
9007
9399
  analyzeAntiSlop,
9008
9400
  analyzeSeries,
@@ -9025,6 +9417,7 @@ export {
9025
9417
  buildTrajectory,
9026
9418
  byteLengthRange,
9027
9419
  calibrateJudge,
9420
+ calibrateJudgeContinuous,
9028
9421
  callLlm,
9029
9422
  callLlmJson,
9030
9423
  canaryLeakView,
@@ -9049,6 +9442,7 @@ export {
9049
9442
  computeToolUseMetrics,
9050
9443
  confidenceInterval,
9051
9444
  containsAll,
9445
+ continuousAgreement,
9052
9446
  controlFailureClassFromVerification,
9053
9447
  controlRunToFeedbackTrajectory,
9054
9448
  controlRunToRunRecord,
@@ -9073,6 +9467,7 @@ export {
9073
9467
  defaultProviderRedactor,
9074
9468
  defaultReferenceReplayMatcher,
9075
9469
  deployGateLayer,
9470
+ discoverPersonas,
9076
9471
  distillPlaybook,
9077
9472
  dominates,
9078
9473
  estimateCost,
@@ -9275,6 +9670,7 @@ export {
9275
9670
  whitespaceCollapseMutator,
9276
9671
  wilcoxonSignedRank,
9277
9672
  withAssignedFeedbackSplit,
9673
+ withJudgeRetry,
9278
9674
  wranglerDeployRunner
9279
9675
  };
9280
9676
  //# sourceMappingURL=index.js.map