@tangle-network/agent-eval 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
410
410
  if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
411
411
  if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
412
412
  const n = scores.length;
413
- const mean3 = scores.reduce((a, b) => a + b, 0) / n;
413
+ const mean4 = scores.reduce((a, b) => a + b, 0) / n;
414
414
  const B = 1e3;
415
415
  const bootstrapMeans = [];
416
416
  for (let i = 0; i < B; i++) {
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
425
425
  const lowerIdx = Math.floor(alpha / 2 * B);
426
426
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
427
427
  return {
428
- mean: mean3,
428
+ mean: mean4,
429
429
  lower: bootstrapMeans[lowerIdx],
430
430
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
431
431
  };
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
513
513
  const n = before.length;
514
514
  if (n < 2) return { t: 0, df: 0, p: 1 };
515
515
  const diffs = before.map((b, i) => after[i] - b);
516
- const mean3 = diffs.reduce((a, b) => a + b, 0) / n;
517
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean3) ** 2, 0) / (n - 1);
516
+ const mean4 = diffs.reduce((a, b) => a + b, 0) / n;
517
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean4) ** 2, 0) / (n - 1);
518
518
  const se = Math.sqrt(variance2 / n);
519
- if (se === 0) return { t: mean3 === 0 ? 0 : Infinity, df: n - 1, p: mean3 === 0 ? 1 : 0 };
520
- const t = mean3 / se;
519
+ if (se === 0) return { t: mean4 === 0 ? 0 : Infinity, df: n - 1, p: mean4 === 0 ? 1 : 0 };
520
+ const t = mean4 / se;
521
521
  const df = n - 1;
522
522
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
523
523
  return { t, df, p };
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
541
541
  }
542
542
  let wPlus = 0;
543
543
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
544
- const mean3 = n * (n + 1) / 4;
544
+ const mean4 = n * (n + 1) / 4;
545
545
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
546
- const z = (wPlus - mean3) / Math.sqrt(variance2);
546
+ const z = (wPlus - mean4) / Math.sqrt(variance2);
547
547
  const p = 2 * (1 - normalCdf(Math.abs(z)));
548
548
  return { w: wPlus, p };
549
549
  }
@@ -2135,12 +2135,14 @@ var DEFAULT_RUN_SCORE_WEIGHTS = {
2135
2135
  toolUseQuality: 1,
2136
2136
  patchQuality: 1.25,
2137
2137
  testReality: 1.5,
2138
+ finalGate: 3,
2139
+ reviewerBlockers: -2,
2138
2140
  costUsd: -0.2,
2139
2141
  wallSeconds: -0.1
2140
2142
  };
2141
2143
  function aggregateRunScore(score, weights = {}) {
2142
2144
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
2143
- return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
2145
+ return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
2144
2146
  }
2145
2147
  function clamp01(value) {
2146
2148
  if (!Number.isFinite(value)) return 0;
@@ -2180,6 +2182,9 @@ var RunCritic = class {
2180
2182
  const toolSpans2 = trace.spans.filter((s) => s.kind === "tool");
2181
2183
  const judgeSpans2 = trace.spans.filter((s) => s.kind === "judge");
2182
2184
  const sandboxSpans = trace.spans.filter((s) => s.kind === "sandbox");
2185
+ const finalGateSpans = judgeSpans2.filter(
2186
+ (span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
2187
+ );
2183
2188
  const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
2184
2189
  if (!success) notes.push("run did not complete with pass=true");
2185
2190
  const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
@@ -2194,6 +2199,15 @@ var RunCritic = class {
2194
2199
  const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
2195
2200
  const testReality = sandboxTests.length ? sandboxTests.reduce((sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
2196
2201
  if (!testReality) notes.push("no real test/build evidence recorded");
2202
+ const blockerSpans = judgeSpans2.filter(
2203
+ (span) => isBlockingJudge(span)
2204
+ );
2205
+ const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
2206
+ const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
2207
+ if (finalGateBlockers.length) notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
2208
+ else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
2209
+ const reviewerBlockers = judgeSpans2.length ? blockerSpans.length / judgeSpans2.length : 0;
2210
+ if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);
2197
2211
  const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
2198
2212
  const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
2199
2213
  const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
@@ -2209,6 +2223,8 @@ var RunCritic = class {
2209
2223
  toolUseQuality,
2210
2224
  patchQuality,
2211
2225
  testReality,
2226
+ finalGate,
2227
+ reviewerBlockers,
2212
2228
  costUsd,
2213
2229
  wallSeconds,
2214
2230
  notes
@@ -2227,6 +2243,12 @@ function normalizeJudgeScore(score) {
2227
2243
  function looksRepoGrounded(text) {
2228
2244
  return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(text);
2229
2245
  }
2246
+ function isBlockingJudge(span) {
2247
+ return span.attributes?.blocking === true || span.attributes?.verdict === "BLOCKING" || positiveNumber(span.attributes?.blockingFindings) || positiveNumber(span.attributes?.highFindings) || span.score <= 2;
2248
+ }
2249
+ function positiveNumber(value) {
2250
+ return typeof value === "number" && value > 0;
2251
+ }
2230
2252
 
2231
2253
  // src/playbook.ts
2232
2254
  function distillPlaybook(entries, options = {}) {
@@ -2430,6 +2452,144 @@ function createAxService(aiFactory, provider, apiKey, model) {
2430
2452
  });
2431
2453
  }
2432
2454
 
2455
+ // src/pareto.ts
2456
+ function dominates(a, b, objectives) {
2457
+ let strictlyBetter = false;
2458
+ for (const obj of objectives) {
2459
+ const av = obj.value(a);
2460
+ const bv = obj.value(b);
2461
+ if (!Number.isFinite(av) || !Number.isFinite(bv)) return false;
2462
+ const aIsBetter = obj.direction === "maximize" ? av > bv : av < bv;
2463
+ const aIsWorse = obj.direction === "maximize" ? av < bv : av > bv;
2464
+ if (aIsWorse) return false;
2465
+ if (aIsBetter) strictlyBetter = true;
2466
+ }
2467
+ return strictlyBetter;
2468
+ }
2469
+ function paretoFrontier(candidates, objectives) {
2470
+ if (objectives.length === 0) {
2471
+ throw new Error("paretoFrontier: at least 1 objective required");
2472
+ }
2473
+ const valid = candidates.filter(
2474
+ (c) => objectives.every((o) => Number.isFinite(o.value(c)))
2475
+ );
2476
+ const frontier = [];
2477
+ const dominated = [];
2478
+ for (const c of valid) {
2479
+ const isDominated = valid.some((other) => other !== c && dominates(other, c, objectives));
2480
+ if (isDominated) dominated.push(c);
2481
+ else frontier.push(c);
2482
+ }
2483
+ const dominanceMap = frontier.map((d) => ({
2484
+ dominator: d,
2485
+ dominated: dominated.filter((x) => dominates(d, x, objectives))
2486
+ }));
2487
+ return { frontier, dominated, dominanceMap };
2488
+ }
2489
+
2490
+ // src/harness-optimizer.ts
2491
+ var DEFAULT_HARNESS_OBJECTIVES = [
2492
+ { name: "aggregate", direction: "maximize", value: (r) => r.aggregateMean },
2493
+ { name: "pass_rate", direction: "maximize", value: (r) => r.passRate },
2494
+ { name: "cost", direction: "minimize", value: (r) => r.costUsdMean },
2495
+ { name: "wall", direction: "minimize", value: (r) => r.wallSecondsMean }
2496
+ ];
2497
+ async function runHarnessExperiment(config) {
2498
+ const jobs = buildJobs(config);
2499
+ const critic = new RunCritic({ weights: config.weights });
2500
+ const score = config.score ?? ((trace) => critic.scoreTrace(trace));
2501
+ const results = await mapLimit(jobs, config.parallelism ?? 1, async (request) => {
2502
+ const trace = await config.adapter.run(request);
2503
+ const runScore = await score(trace, request);
2504
+ const result = {
2505
+ variant: request.variant,
2506
+ scenario: request.scenario,
2507
+ trialIndex: request.trialIndex,
2508
+ trace,
2509
+ score: runScore,
2510
+ aggregate: aggregateRunScore(runScore, config.weights)
2511
+ };
2512
+ await config.onResult?.(result);
2513
+ return result;
2514
+ });
2515
+ return { results, selection: selectHarnessVariant(results, config.objectives) };
2516
+ }
2517
+ function selectHarnessVariant(results, objectives = DEFAULT_HARNESS_OBJECTIVES) {
2518
+ const reports = summarizeHarnessResults(results);
2519
+ if (reports.length === 0) throw new Error("selectHarnessVariant: no results");
2520
+ const frontier = paretoFrontier(reports, objectives);
2521
+ const candidates = frontier.frontier.length ? frontier.frontier : reports;
2522
+ const winner = [...candidates].sort((a, b) => b.aggregateMean - a.aggregateMean)[0];
2523
+ if (!winner) throw new Error("selectHarnessVariant: no winner");
2524
+ return { winner, frontier, reports };
2525
+ }
2526
+ function summarizeHarnessResults(results) {
2527
+ const byVariant = /* @__PURE__ */ new Map();
2528
+ for (const result of results) {
2529
+ byVariant.set(result.variant.id, [...byVariant.get(result.variant.id) ?? [], result]);
2530
+ }
2531
+ return [...byVariant.values()].map((runs) => {
2532
+ const variant = runs[0]?.variant;
2533
+ if (!variant) throw new Error("summarizeHarnessResults: empty variant bucket");
2534
+ return {
2535
+ variant,
2536
+ runs,
2537
+ aggregateMean: mean(runs.map((r) => r.aggregate)),
2538
+ passRate: mean(runs.map((r) => r.score.success)),
2539
+ costUsdMean: mean(runs.map((r) => r.score.costUsd)),
2540
+ wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)),
2541
+ scoreMean: meanRunScore(runs.map((r) => r.score))
2542
+ };
2543
+ }).sort((a, b) => b.aggregateMean - a.aggregateMean);
2544
+ }
2545
+ function buildJobs(config) {
2546
+ if (config.variants.length === 0) throw new Error("runHarnessExperiment: at least one variant required");
2547
+ if (config.scenarios.length === 0) throw new Error("runHarnessExperiment: at least one scenario required");
2548
+ const trials = Math.max(1, Math.floor(config.trialsPerScenario ?? 1));
2549
+ const jobs = [];
2550
+ for (const variant of config.variants) {
2551
+ for (const scenario of config.scenarios) {
2552
+ for (let trialIndex = 0; trialIndex < trials; trialIndex++) {
2553
+ jobs.push({ variant, scenario, trialIndex });
2554
+ }
2555
+ }
2556
+ }
2557
+ return jobs;
2558
+ }
2559
+ async function mapLimit(items, limit, fn) {
2560
+ const results = new Array(items.length);
2561
+ let next = 0;
2562
+ const workerCount = Math.max(1, Math.min(Math.floor(limit), items.length));
2563
+ await Promise.all(Array.from({ length: workerCount }, async () => {
2564
+ while (next < items.length) {
2565
+ const index = next++;
2566
+ const item = items[index];
2567
+ if (item === void 0) continue;
2568
+ results[index] = await fn(item);
2569
+ }
2570
+ }));
2571
+ return results;
2572
+ }
2573
+ function mean(values) {
2574
+ return values.length ? values.reduce((sum, value) => sum + value, 0) / values.length : 0;
2575
+ }
2576
+ function meanRunScore(scores) {
2577
+ return {
2578
+ success: mean(scores.map((s) => s.success)),
2579
+ goalProgress: mean(scores.map((s) => s.goalProgress)),
2580
+ repoGroundedness: mean(scores.map((s) => s.repoGroundedness)),
2581
+ driftPenalty: mean(scores.map((s) => s.driftPenalty)),
2582
+ toolUseQuality: mean(scores.map((s) => s.toolUseQuality)),
2583
+ patchQuality: mean(scores.map((s) => s.patchQuality)),
2584
+ testReality: mean(scores.map((s) => s.testReality)),
2585
+ finalGate: mean(scores.map((s) => s.finalGate)),
2586
+ reviewerBlockers: mean(scores.map((s) => s.reviewerBlockers)),
2587
+ costUsd: mean(scores.map((s) => s.costUsd)),
2588
+ wallSeconds: mean(scores.map((s) => s.wallSeconds)),
2589
+ notes: scores.flatMap((s) => s.notes ?? [])
2590
+ };
2591
+ }
2592
+
2433
2593
  // src/trace/store.ts
2434
2594
  var InMemoryTraceStore = class {
2435
2595
  runs = /* @__PURE__ */ new Map();
@@ -2875,14 +3035,22 @@ function composeParsers(...parsers) {
2875
3035
  }
2876
3036
  var SubprocessSandboxDriver = class {
2877
3037
  id = "subprocess";
3038
+ defaultCwd;
3039
+ defaultEnv;
3040
+ constructor(options = {}) {
3041
+ this.defaultCwd = options.cwd;
3042
+ this.defaultEnv = options.env;
3043
+ }
2878
3044
  async exec(phase, command, config) {
2879
3045
  const { spawn } = await import("child_process");
2880
3046
  const start = Date.now();
3047
+ const effectiveCwd = config.cwd ?? this.defaultCwd;
3048
+ const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
2881
3049
  return await new Promise((resolve) => {
2882
3050
  const child = spawn(command, {
2883
3051
  shell: true,
2884
- cwd: config.cwd,
2885
- env: { ...process.env, ...config.env ?? {} }
3052
+ cwd: effectiveCwd,
3053
+ env: effectiveEnv
2886
3054
  });
2887
3055
  let stdout = "";
2888
3056
  let stderr = "";
@@ -4308,8 +4476,8 @@ function compareToBaseline(samples, options = {}) {
4308
4476
  if (s.baseline.length < 2 || s.candidate.length < 2) {
4309
4477
  throw new Error(`compareToBaseline: need \u22652 samples per side for "${s.metric}"`);
4310
4478
  }
4311
- const bMean = mean(s.baseline);
4312
- const cMean = mean(s.candidate);
4479
+ const bMean = mean2(s.baseline);
4480
+ const cMean = mean2(s.candidate);
4313
4481
  const delta = cMean - bMean;
4314
4482
  const d = cohensD(s.baseline, s.candidate);
4315
4483
  const { t, df, p } = welchsTTest(s.baseline, s.candidate);
@@ -4348,7 +4516,7 @@ function compareToBaseline(samples, options = {}) {
4348
4516
  hasUnstable: metrics.some((m) => m.verdict === "unstable")
4349
4517
  };
4350
4518
  }
4351
- function mean(xs) {
4519
+ function mean2(xs) {
4352
4520
  return xs.reduce((a, b) => a + b, 0) / xs.length;
4353
4521
  }
4354
4522
  function iqr(xs) {
@@ -4364,8 +4532,8 @@ function iqr(xs) {
4364
4532
  }
4365
4533
  function welchsTTest(a, b) {
4366
4534
  if (a.length < 2 || b.length < 2) return { t: 0, df: 0, p: 1 };
4367
- const mA = mean(a);
4368
- const mB = mean(b);
4535
+ const mA = mean2(a);
4536
+ const mB = mean2(b);
4369
4537
  const vA = variance(a, mA);
4370
4538
  const vB = variance(b, mB);
4371
4539
  const seSquared = vA / a.length + vB / b.length;
@@ -4721,41 +4889,6 @@ function assertNonNegative(n, name) {
4721
4889
  }
4722
4890
  }
4723
4891
 
4724
- // src/pareto.ts
4725
- function dominates(a, b, objectives) {
4726
- let strictlyBetter = false;
4727
- for (const obj of objectives) {
4728
- const av = obj.value(a);
4729
- const bv = obj.value(b);
4730
- if (!Number.isFinite(av) || !Number.isFinite(bv)) return false;
4731
- const aIsBetter = obj.direction === "maximize" ? av > bv : av < bv;
4732
- const aIsWorse = obj.direction === "maximize" ? av < bv : av > bv;
4733
- if (aIsWorse) return false;
4734
- if (aIsBetter) strictlyBetter = true;
4735
- }
4736
- return strictlyBetter;
4737
- }
4738
- function paretoFrontier(candidates, objectives) {
4739
- if (objectives.length === 0) {
4740
- throw new Error("paretoFrontier: at least 1 objective required");
4741
- }
4742
- const valid = candidates.filter(
4743
- (c) => objectives.every((o) => Number.isFinite(o.value(c)))
4744
- );
4745
- const frontier = [];
4746
- const dominated = [];
4747
- for (const c of valid) {
4748
- const isDominated = valid.some((other) => other !== c && dominates(other, c, objectives));
4749
- if (isDominated) dominated.push(c);
4750
- else frontier.push(c);
4751
- }
4752
- const dominanceMap = frontier.map((d) => ({
4753
- dominator: d,
4754
- dominated: dominated.filter((x) => dominates(d, x, objectives))
4755
- }));
4756
- return { frontier, dominated, dominanceMap };
4757
- }
4758
-
4759
4892
  // src/series-convergence.ts
4760
4893
  function analyzeSeries(values, options = {}) {
4761
4894
  const window = options.window ?? 5;
@@ -4765,10 +4898,10 @@ function analyzeSeries(values, options = {}) {
4765
4898
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
4766
4899
  }
4767
4900
  const tail = values.slice(-window);
4768
- const mean3 = tail.reduce((a, b) => a + b, 0) / tail.length;
4769
- const variance2 = tail.reduce((acc, v) => acc + (v - mean3) ** 2, 0) / tail.length;
4901
+ const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
4902
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
4770
4903
  const stdDev = Math.sqrt(variance2);
4771
- const refMean = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
4904
+ const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
4772
4905
  const cv = stdDev / refMean;
4773
4906
  const stable = tail.length >= window && cv <= stableCv;
4774
4907
  let tailRun = 0;
@@ -4789,7 +4922,7 @@ function analyzeSeries(values, options = {}) {
4789
4922
  } else {
4790
4923
  state = "noisy";
4791
4924
  }
4792
- return { state, windowMean: mean3, windowCv: cv, tailRun, stable };
4925
+ return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
4793
4926
  }
4794
4927
 
4795
4928
  // src/state-continuity.ts
@@ -5717,12 +5850,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
5717
5850
  variantScores.push({ mutator: id, score, mutated });
5718
5851
  all.push(score);
5719
5852
  }
5720
- const mean3 = all.reduce((a, b) => a + b, 0) / all.length;
5721
- const variance2 = all.reduce((a, v) => a + (v - mean3) ** 2, 0) / all.length;
5853
+ const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
5854
+ const variance2 = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
5722
5855
  const stdDev = Math.sqrt(variance2);
5723
- const ref = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
5856
+ const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
5724
5857
  const robustness = Math.max(0, 1 - stdDev / ref);
5725
- return { originalScore, variantScores, meanScore: mean3, stdDev, robustness };
5858
+ return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
5726
5859
  }
5727
5860
  var lowercaseMutator = (p) => p.toLowerCase();
5728
5861
  var sentenceReorderMutator = (p, seed) => {
@@ -6407,8 +6540,8 @@ async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMet
6407
6540
  function toBin(chunk, lower, upper) {
6408
6541
  const xs = chunk.map((c) => c.x);
6409
6542
  const ys = chunk.map((c) => c.y);
6410
- const evalMean = mean2(xs);
6411
- const outcomeMean = mean2(ys);
6543
+ const evalMean = mean3(xs);
6544
+ const outcomeMean = mean3(ys);
6412
6545
  return {
6413
6546
  lower: lower ?? Math.min(...xs),
6414
6547
  upper: upper ?? Math.max(...xs),
@@ -6418,7 +6551,7 @@ function toBin(chunk, lower, upper) {
6418
6551
  gap: Math.abs(outcomeMean - evalMean)
6419
6552
  };
6420
6553
  }
6421
- function mean2(xs) {
6554
+ function mean3(xs) {
6422
6555
  return xs.reduce((a, b) => a + b, 0) / xs.length;
6423
6556
  }
6424
6557
  function defaultExtract4(metric) {
@@ -6643,8 +6776,8 @@ async function prmBestOfN(store, grader, runIds) {
6643
6776
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
6644
6777
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
6645
6778
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
6646
- const mean3 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
6647
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / graded.length;
6779
+ const mean4 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
6780
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / graded.length;
6648
6781
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
6649
6782
  }
6650
6783
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -6666,8 +6799,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
6666
6799
  const ranked = [...byRun.values()].sort(
6667
6800
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
6668
6801
  );
6669
- const mean3 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
6670
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / ranked.length;
6802
+ const mean4 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
6803
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / ranked.length;
6671
6804
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
6672
6805
  }
6673
6806
 
@@ -7197,8 +7330,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7197
7330
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7198
7331
  const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7199
7332
  if (scores.length < 3) continue;
7200
- const mean3 = scores.reduce((a, b) => a + b, 0) / scores.length;
7201
- const variance2 = scores.reduce((a, b) => a + (b - mean3) ** 2, 0) / scores.length;
7333
+ const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7334
+ const variance2 = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
7202
7335
  if (variance2 > varianceThreshold) {
7203
7336
  targets.push({
7204
7337
  reason: "high-variance",
@@ -7688,6 +7821,7 @@ export {
7688
7821
  CostTracker,
7689
7822
  DEFAULT_AGENT_SLOS,
7690
7823
  DEFAULT_RULES as DEFAULT_FAILURE_RULES,
7824
+ DEFAULT_HARNESS_OBJECTIVES,
7691
7825
  DEFAULT_MUTATORS,
7692
7826
  DEFAULT_REDACTION_RULES,
7693
7827
  DEFAULT_RED_TEAM_CORPUS,
@@ -7851,6 +7985,7 @@ export {
7851
7985
  runE2EWorkflow,
7852
7986
  runExpectations,
7853
7987
  runFailureClass,
7988
+ runHarnessExperiment,
7854
7989
  runJudgeFleet,
7855
7990
  runProposeReview,
7856
7991
  runSelfPlay,
@@ -7861,6 +7996,7 @@ export {
7861
7996
  scoreProject,
7862
7997
  scoreRedTeamOutput,
7863
7998
  securityJudge,
7999
+ selectHarnessVariant,
7864
8000
  selfPreference,
7865
8001
  sentenceReorderMutator,
7866
8002
  signManifest,
@@ -7868,6 +8004,7 @@ export {
7868
8004
  statusAdvanced,
7869
8005
  stuckLoopView,
7870
8006
  summarize,
8007
+ summarizeHarnessResults,
7871
8008
  testJudge,
7872
8009
  textInSnapshot,
7873
8010
  toLangfuseEnvelope,