@tangle-network/agent-eval 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
410
410
  if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
411
411
  if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
412
412
  const n = scores.length;
413
- const mean3 = scores.reduce((a, b) => a + b, 0) / n;
413
+ const mean4 = scores.reduce((a, b) => a + b, 0) / n;
414
414
  const B = 1e3;
415
415
  const bootstrapMeans = [];
416
416
  for (let i = 0; i < B; i++) {
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
425
425
  const lowerIdx = Math.floor(alpha / 2 * B);
426
426
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
427
427
  return {
428
- mean: mean3,
428
+ mean: mean4,
429
429
  lower: bootstrapMeans[lowerIdx],
430
430
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
431
431
  };
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
513
513
  const n = before.length;
514
514
  if (n < 2) return { t: 0, df: 0, p: 1 };
515
515
  const diffs = before.map((b, i) => after[i] - b);
516
- const mean3 = diffs.reduce((a, b) => a + b, 0) / n;
517
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean3) ** 2, 0) / (n - 1);
516
+ const mean4 = diffs.reduce((a, b) => a + b, 0) / n;
517
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean4) ** 2, 0) / (n - 1);
518
518
  const se = Math.sqrt(variance2 / n);
519
- if (se === 0) return { t: mean3 === 0 ? 0 : Infinity, df: n - 1, p: mean3 === 0 ? 1 : 0 };
520
- const t = mean3 / se;
519
+ if (se === 0) return { t: mean4 === 0 ? 0 : Infinity, df: n - 1, p: mean4 === 0 ? 1 : 0 };
520
+ const t = mean4 / se;
521
521
  const df = n - 1;
522
522
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
523
523
  return { t, df, p };
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
541
541
  }
542
542
  let wPlus = 0;
543
543
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
544
- const mean3 = n * (n + 1) / 4;
544
+ const mean4 = n * (n + 1) / 4;
545
545
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
546
- const z = (wPlus - mean3) / Math.sqrt(variance2);
546
+ const z = (wPlus - mean4) / Math.sqrt(variance2);
547
547
  const p = 2 * (1 - normalCdf(Math.abs(z)));
548
548
  return { w: wPlus, p };
549
549
  }
@@ -2135,12 +2135,14 @@ var DEFAULT_RUN_SCORE_WEIGHTS = {
2135
2135
  toolUseQuality: 1,
2136
2136
  patchQuality: 1.25,
2137
2137
  testReality: 1.5,
2138
+ finalGate: 3,
2139
+ reviewerBlockers: -2,
2138
2140
  costUsd: -0.2,
2139
2141
  wallSeconds: -0.1
2140
2142
  };
2141
2143
  function aggregateRunScore(score, weights = {}) {
2142
2144
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
2143
- return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
2145
+ return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
2144
2146
  }
2145
2147
  function clamp01(value) {
2146
2148
  if (!Number.isFinite(value)) return 0;
@@ -2180,6 +2182,9 @@ var RunCritic = class {
2180
2182
  const toolSpans2 = trace.spans.filter((s) => s.kind === "tool");
2181
2183
  const judgeSpans2 = trace.spans.filter((s) => s.kind === "judge");
2182
2184
  const sandboxSpans = trace.spans.filter((s) => s.kind === "sandbox");
2185
+ const finalGateSpans = judgeSpans2.filter(
2186
+ (span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
2187
+ );
2183
2188
  const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
2184
2189
  if (!success) notes.push("run did not complete with pass=true");
2185
2190
  const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
@@ -2194,6 +2199,15 @@ var RunCritic = class {
2194
2199
  const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
2195
2200
  const testReality = sandboxTests.length ? sandboxTests.reduce((sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
2196
2201
  if (!testReality) notes.push("no real test/build evidence recorded");
2202
+ const blockerSpans = judgeSpans2.filter(
2203
+ (span) => isBlockingJudge(span)
2204
+ );
2205
+ const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
2206
+ const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
2207
+ if (finalGateBlockers.length) notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
2208
+ else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
2209
+ const reviewerBlockers = judgeSpans2.length ? blockerSpans.length / judgeSpans2.length : 0;
2210
+ if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);
2197
2211
  const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans2.filter((span) => looksRepoGrounded(span.output ?? "")).length;
2198
2212
  const driftSignals = llmSpans2.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
2199
2213
  const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
@@ -2209,6 +2223,8 @@ var RunCritic = class {
2209
2223
  toolUseQuality,
2210
2224
  patchQuality,
2211
2225
  testReality,
2226
+ finalGate,
2227
+ reviewerBlockers,
2212
2228
  costUsd,
2213
2229
  wallSeconds,
2214
2230
  notes
@@ -2227,6 +2243,12 @@ function normalizeJudgeScore(score) {
2227
2243
  function looksRepoGrounded(text) {
2228
2244
  return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(text);
2229
2245
  }
2246
/**
 * Decide whether a judge span carries a blocking reviewer signal.
 *
 * A span blocks when any of the following holds:
 *   - `attributes.blocking === true`
 *   - `attributes.verdict === "BLOCKING"`
 *   - `attributes.blockingFindings` or `attributes.highFindings` is a positive number
 *   - `score` is a number that is `<= 2`
 *
 * The score test is type-guarded on purpose: a bare `span.score <= 2` coerces
 * `null` to 0 (and `"1"` to 1), which would turn every unscored judge span
 * into a blocker.
 *
 * NOTE(review): the `<= 2` threshold presumably assumes a raw judge scale
 * wider than 0–1 — confirm against how judge scores are recorded.
 *
 * @param {{ score?: unknown, attributes?: Record<string, unknown> }} span
 * @returns {boolean}
 */
function isBlockingJudge(span) {
  const attrs = span.attributes;
  if (attrs?.blocking === true || attrs?.verdict === "BLOCKING") return true;
  if (positiveNumber(attrs?.blockingFindings) || positiveNumber(attrs?.highFindings)) return true;
  return typeof span.score === "number" && span.score <= 2;
}

/**
 * True only for values of type `number` that are strictly greater than zero
 * (so `NaN`, numeric strings, `null` and `undefined` are all rejected).
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function positiveNumber(value) {
  return typeof value === "number" && value > 0;
}
2230
2252
 
2231
2253
  // src/playbook.ts
2232
2254
  function distillPlaybook(entries, options = {}) {
@@ -2430,6 +2452,144 @@ function createAxService(aiFactory, provider, apiKey, model) {
2430
2452
  });
2431
2453
  }
2432
2454
 
2455
+ // src/pareto.ts
2456
/**
 * Pareto dominance test: `a` dominates `b` when it is at least as good on
 * every objective and strictly better on at least one.
 *
 * Returns `false` as soon as any objective yields a non-finite value for
 * either side, and `false` when `objectives` is empty (nothing can be
 * strictly better).
 *
 * @param {*} a candidate that might dominate
 * @param {*} b candidate that might be dominated
 * @param {Iterable<{ direction: string, value: (c: any) => number }>} objectives
 *   any `direction` other than "maximize" is treated as minimize
 * @returns {boolean}
 */
function dominates(a, b, objectives) {
  let hasStrictAdvantage = false;
  for (const objective of objectives) {
    const left = objective.value(a);
    const right = objective.value(b);
    if (!(Number.isFinite(left) && Number.isFinite(right))) {
      return false;
    }
    // Orient the pair so that "preferred > other" always means `a` wins.
    const maximize = objective.direction === "maximize";
    const preferred = maximize ? left : right;
    const other = maximize ? right : left;
    if (preferred < other) {
      return false;
    }
    if (preferred > other) {
      hasStrictAdvantage = true;
    }
  }
  return hasStrictAdvantage;
}
2469
/**
 * Split candidates into the Pareto frontier and the dominated set.
 *
 * Candidates for which any objective yields a non-finite value are dropped
 * before comparison and appear in neither output list.
 *
 * @param {Array<*>} candidates
 * @param {Array<{ direction: string, value: (c: any) => number }>} objectives
 * @returns {{ frontier: Array<*>, dominated: Array<*>,
 *   dominanceMap: Array<{ dominator: *, dominated: Array<*> }> }}
 *   `dominanceMap` has one entry per frontier member listing the dominated
 *   candidates that member dominates.
 * @throws {Error} when `objectives` is empty
 */
function paretoFrontier(candidates, objectives) {
  if (objectives.length === 0) {
    throw new Error("paretoFrontier: at least 1 objective required");
  }
  const isScorable = (candidate) =>
    objectives.every((objective) => Number.isFinite(objective.value(candidate)));
  const valid = candidates.filter((candidate) => isScorable(candidate));

  const frontier = [];
  const dominated = [];
  for (const candidate of valid) {
    // Identity comparison keeps a candidate from being tested against itself.
    const beaten = valid.some(
      (rival) => rival !== candidate && dominates(rival, candidate, objectives)
    );
    (beaten ? dominated : frontier).push(candidate);
  }

  const dominanceMap = frontier.map((dominator) => ({
    dominator,
    dominated: dominated.filter((loser) => dominates(dominator, loser, objectives))
  }));
  return { frontier, dominated, dominanceMap };
}
2489
+
2490
+ // src/harness-optimizer.ts
2491
// Default objectives used when ranking harness variant reports: higher
// aggregate score and pass rate are better, lower mean cost and wall time
// are better. Each row is [name, direction, report field read by `value`].
var DEFAULT_HARNESS_OBJECTIVES = [
  ["aggregate", "maximize", "aggregateMean"],
  ["pass_rate", "maximize", "passRate"],
  ["cost", "minimize", "costUsdMean"],
  ["wall", "minimize", "wallSecondsMean"]
].map(([name, direction, field]) => ({
  name,
  direction,
  value: (r) => r[field]
}));
2497
/**
 * Run every (variant, scenario, trial) job through `config.adapter`, score
 * each returned trace, and select a winning variant.
 *
 * Reads from `config`:
 *   - `variants`, `scenarios`, `trialsPerScenario` — consumed by `buildJobs`
 *   - `adapter.run(request)` — produces the trace for one job
 *   - `score(trace, request)` — optional; defaults to a `RunCritic` built
 *     with `config.weights`
 *   - `weights` — also passed to `aggregateRunScore`
 *   - `parallelism` — worker count handed to `mapLimit` (default 1)
 *   - `onResult(result)` — optional hook awaited after each job
 *   - `objectives` — forwarded to `selectHarnessVariant`
 *
 * @param {object} config
 * @returns {Promise<{ results: Array<object>, selection: object }>}
 */
async function runHarnessExperiment(config) {
  const jobs = buildJobs(config);
  const critic = new RunCritic({ weights: config.weights });
  const score = config.score ?? ((trace) => critic.scoreTrace(trace));
  const parallelism = config.parallelism ?? 1;

  const runJob = async (request) => {
    const trace = await config.adapter.run(request);
    const runScore = await score(trace, request);
    const { variant, scenario, trialIndex } = request;
    const aggregate = aggregateRunScore(runScore, config.weights);
    const result = { variant, scenario, trialIndex, trace, score: runScore, aggregate };
    await config.onResult?.(result);
    return result;
  };

  const results = await mapLimit(jobs, parallelism, (request) => runJob(request));
  const selection = selectHarnessVariant(results, config.objectives);
  return { results, selection };
}
2517
/**
 * Summarize raw run results per variant and choose a winner.
 *
 * The winner is the report with the highest `aggregateMean` among the Pareto
 * frontier; when the frontier is empty, all reports are considered instead.
 *
 * @param {Array<object>} results per-run results as produced by the experiment
 * @param {Array<object>} [objectives=DEFAULT_HARNESS_OBJECTIVES]
 * @returns {{ winner: object, frontier: object, reports: Array<object> }}
 * @throws {Error} when there are no reports or no winner can be picked
 */
function selectHarnessVariant(results, objectives = DEFAULT_HARNESS_OBJECTIVES) {
  const reports = summarizeHarnessResults(results);
  if (reports.length === 0) {
    throw new Error("selectHarnessVariant: no results");
  }
  const frontier = paretoFrontier(reports, objectives);
  const pool = frontier.frontier.length ? frontier.frontier : reports;
  // Sort a copy so neither the frontier nor the reports list is reordered.
  const ranked = pool.slice().sort((a, b) => b.aggregateMean - a.aggregateMean);
  const [winner] = ranked;
  if (!winner) {
    throw new Error("selectHarnessVariant: no winner");
  }
  return { winner, frontier, reports };
}
2526
/**
 * Group run results by `variant.id` and reduce each group to one report.
 *
 * Each report carries the variant, its runs, the mean aggregate score, the
 * mean of `score.success` (reported as `passRate`), mean cost, mean wall
 * seconds, and the field-wise mean score. Reports are returned sorted by
 * `aggregateMean`, highest first.
 *
 * Results are appended to their bucket in place; rebuilding the bucket with
 * a spread for every result would copy it again on each insertion.
 *
 * @param {Array<object>} results
 * @returns {Array<object>} one report per distinct variant id
 * @throws {Error} if a bucket has no variant on its first run
 */
function summarizeHarnessResults(results) {
  const byVariant = /* @__PURE__ */ new Map();
  for (const result of results) {
    const id = result.variant.id;
    const bucket = byVariant.get(id);
    if (bucket) bucket.push(result);
    else byVariant.set(id, [result]);
  }
  return [...byVariant.values()].map((runs) => {
    const variant = runs[0]?.variant;
    if (!variant) throw new Error("summarizeHarnessResults: empty variant bucket");
    return {
      variant,
      runs,
      aggregateMean: mean(runs.map((r) => r.aggregate)),
      passRate: mean(runs.map((r) => r.score.success)),
      costUsdMean: mean(runs.map((r) => r.score.costUsd)),
      wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)),
      scoreMean: meanRunScore(runs.map((r) => r.score))
    };
  }).sort((a, b) => b.aggregateMean - a.aggregateMean);
}
2545
/**
 * Expand an experiment config into the full list of jobs: one entry per
 * (variant, scenario, trialIndex), ordered variant-major, then scenario,
 * then trial.
 *
 * `trialsPerScenario` is floored and clamped to at least 1. A value that is
 * not a number (NaN after conversion) falls back to 1 instead of silently
 * producing zero jobs, and an infinite value is rejected because the job
 * list could never be materialized.
 *
 * @param {{ variants: Array<*>, scenarios: Array<*>, trialsPerScenario?: number }} config
 * @returns {Array<{ variant: *, scenario: *, trialIndex: number }>}
 * @throws {Error} when `variants` or `scenarios` is empty, or when
 *   `trialsPerScenario` is `Infinity`
 */
function buildJobs(config) {
  if (config.variants.length === 0) throw new Error("runHarnessExperiment: at least one variant required");
  if (config.scenarios.length === 0) throw new Error("runHarnessExperiment: at least one scenario required");
  const requested = Math.floor(Number(config.trialsPerScenario ?? 1));
  if (requested === Infinity) {
    throw new Error("runHarnessExperiment: trialsPerScenario must be finite");
  }
  // Math.max(1, NaN) is NaN, which would make the trial loop run zero times.
  const trials = Number.isNaN(requested) ? 1 : Math.max(1, requested);
  const jobs = [];
  for (const variant of config.variants) {
    for (const scenario of config.scenarios) {
      for (let trialIndex = 0; trialIndex < trials; trialIndex++) {
        jobs.push({ variant, scenario, trialIndex });
      }
    }
  }
  return jobs;
}
2559
/**
 * Map `fn` over `items` with at most `limit` calls in flight, preserving
 * input order in the returned array.
 *
 * The worker count is `floor(limit)` clamped to the range [1, items.length].
 * A limit that is not a number (NaN, `undefined`) is treated as 1: without
 * that guard the clamp evaluates to NaN, no worker is started, and the
 * caller receives an array of holes with no work done.
 *
 * Items that are `undefined` are skipped and leave their slot unset.
 * If any call rejects, the returned promise rejects with that error.
 *
 * @template T, R
 * @param {Array<T>} items
 * @param {number} limit maximum number of concurrent `fn` calls
 * @param {(item: T) => Promise<R> | R} fn
 * @returns {Promise<Array<R>>}
 */
async function mapLimit(items, limit, fn) {
  const results = new Array(items.length);
  let next = 0;
  const requested = Math.floor(limit);
  const cap = Number.isNaN(requested) ? 1 : requested;
  const workerCount = Math.max(1, Math.min(cap, items.length));
  await Promise.all(Array.from({ length: workerCount }, async () => {
    // Workers share the cursor; the read-and-increment happens synchronously,
    // so no two workers can claim the same index.
    while (next < items.length) {
      const index = next++;
      const item = items[index];
      if (item === void 0) continue;
      results[index] = await fn(item);
    }
  }));
  return results;
}
2573
/**
 * Arithmetic mean of `values`; an empty list yields 0 rather than NaN.
 *
 * @param {Array<number>} values
 * @returns {number}
 */
function mean(values) {
  const count = values.length;
  if (!count) return 0;
  const total = values.reduce((running, current) => running + current, 0);
  return total / count;
}
2576
/**
 * Field-wise average of a list of run scores.
 *
 * Every numeric field is averaged independently with `mean`; `notes` is the
 * concatenation of each score's notes (a missing `notes` contributes nothing).
 *
 * @param {Array<object>} scores
 * @returns {object} a score-shaped object holding the per-field means
 */
function meanRunScore(scores) {
  // Order matters: it fixes the key order of the returned object.
  const numericFields = [
    "success",
    "goalProgress",
    "repoGroundedness",
    "driftPenalty",
    "toolUseQuality",
    "patchQuality",
    "testReality",
    "finalGate",
    "reviewerBlockers",
    "costUsd",
    "wallSeconds"
  ];
  const averaged = {};
  for (const field of numericFields) {
    averaged[field] = mean(scores.map((s) => s[field]));
  }
  averaged.notes = scores.flatMap((s) => s.notes ?? []);
  return averaged;
}
2592
+
2433
2593
  // src/trace/store.ts
2434
2594
  var InMemoryTraceStore = class {
2435
2595
  runs = /* @__PURE__ */ new Map();
@@ -2875,14 +3035,22 @@ function composeParsers(...parsers) {
2875
3035
  }
2876
3036
  var SubprocessSandboxDriver = class {
2877
3037
  id = "subprocess";
3038
+ defaultCwd;
3039
+ defaultEnv;
3040
+ constructor(options = {}) {
3041
+ this.defaultCwd = options.cwd;
3042
+ this.defaultEnv = options.env;
3043
+ }
2878
3044
  async exec(phase, command, config) {
2879
3045
  const { spawn } = await import("child_process");
2880
3046
  const start = Date.now();
3047
+ const effectiveCwd = config.cwd ?? this.defaultCwd;
3048
+ const effectiveEnv = { ...process.env, ...this.defaultEnv ?? {}, ...config.env ?? {} };
2881
3049
  return await new Promise((resolve) => {
2882
3050
  const child = spawn(command, {
2883
3051
  shell: true,
2884
- cwd: config.cwd,
2885
- env: { ...process.env, ...config.env ?? {} }
3052
+ cwd: effectiveCwd,
3053
+ env: effectiveEnv
2886
3054
  });
2887
3055
  let stdout = "";
2888
3056
  let stderr = "";
@@ -4308,8 +4476,8 @@ function compareToBaseline(samples, options = {}) {
4308
4476
  if (s.baseline.length < 2 || s.candidate.length < 2) {
4309
4477
  throw new Error(`compareToBaseline: need \u22652 samples per side for "${s.metric}"`);
4310
4478
  }
4311
- const bMean = mean(s.baseline);
4312
- const cMean = mean(s.candidate);
4479
+ const bMean = mean2(s.baseline);
4480
+ const cMean = mean2(s.candidate);
4313
4481
  const delta = cMean - bMean;
4314
4482
  const d = cohensD(s.baseline, s.candidate);
4315
4483
  const { t, df, p } = welchsTTest(s.baseline, s.candidate);
@@ -4348,7 +4516,7 @@ function compareToBaseline(samples, options = {}) {
4348
4516
  hasUnstable: metrics.some((m) => m.verdict === "unstable")
4349
4517
  };
4350
4518
  }
4351
- function mean(xs) {
4519
+ function mean2(xs) {
4352
4520
  return xs.reduce((a, b) => a + b, 0) / xs.length;
4353
4521
  }
4354
4522
  function iqr(xs) {
@@ -4364,8 +4532,8 @@ function iqr(xs) {
4364
4532
  }
4365
4533
  function welchsTTest(a, b) {
4366
4534
  if (a.length < 2 || b.length < 2) return { t: 0, df: 0, p: 1 };
4367
- const mA = mean(a);
4368
- const mB = mean(b);
4535
+ const mA = mean2(a);
4536
+ const mB = mean2(b);
4369
4537
  const vA = variance(a, mA);
4370
4538
  const vB = variance(b, mB);
4371
4539
  const seSquared = vA / a.length + vB / b.length;
@@ -4685,6 +4853,30 @@ var CostTracker = class {
4685
4853
  if (!bucket) throw new Error(`CostTracker.markOutcome: unknown scenario "${scenarioId}"`);
4686
4854
  bucket.completed = completed;
4687
4855
  }
4856
+ /**
4857
+ * Convenience: record + markOutcome in one call from a
4858
+ * `{ usage, verdict }`-shaped response (starter-foundry's
4859
+ * `invokeMetaJudge` returns this shape; consumers that wrap any
4860
+ * judge/critic can follow the same convention).
4861
+ *
4862
+ * `usage.model` must be present in `MODEL_PRICING` for cost math to
4863
+ * populate; otherwise totalCostUsd stays at 0 for the entry but
4864
+ * tokens still aggregate.
4865
+ */
4866
+ recordVerdict(verdict, scenarioId, tags) {
4867
+ if (!verdict.usage) return null;
4868
+ const entry = this.record({
4869
+ scenarioId,
4870
+ model: verdict.usage.model,
4871
+ inputTokens: verdict.usage.inputTokens,
4872
+ outputTokens: verdict.usage.outputTokens,
4873
+ cachedTokens: verdict.usage.cachedTokens,
4874
+ reasoningTokens: verdict.usage.reasoningTokens,
4875
+ tags
4876
+ });
4877
+ this.markOutcome(scenarioId, verdict.verdict === "pass");
4878
+ return entry;
4879
+ }
4688
4880
  get(scenarioId) {
4689
4881
  return this.byScenario.get(scenarioId);
4690
4882
  }
@@ -4721,39 +4913,177 @@ function assertNonNegative(n, name) {
4721
4913
  }
4722
4914
  }
4723
4915
 
4724
- // src/pareto.ts
4725
- function dominates(a, b, objectives) {
4726
- let strictlyBetter = false;
4727
- for (const obj of objectives) {
4728
- const av = obj.value(a);
4729
- const bv = obj.value(b);
4730
- if (!Number.isFinite(av) || !Number.isFinite(bv)) return false;
4731
- const aIsBetter = obj.direction === "maximize" ? av > bv : av < bv;
4732
- const aIsWorse = obj.direction === "maximize" ? av < bv : av > bv;
4733
- if (aIsWorse) return false;
4734
- if (aIsBetter) strictlyBetter = true;
4735
- }
4736
- return strictlyBetter;
4916
+ // src/muffled-gate-scanner.ts
4917
+ import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
4918
+ import { join } from "path";
4919
/**
 * Return the part of a source line that should be pattern-matched as code.
 *
 * - A trailing `// ...` comment is removed, but only when the `//` sits
 *   outside a string or template literal. Cutting at the first `//`
 *   unconditionally truncates lines such as
 *   `testCommand: "curl http://host || true"` and hides the `|| true`.
 * - A line whose first non-whitespace character is `*` (a block-comment
 *   continuation line) yields the empty string.
 *
 * The scan works on characters rather than `.`-based regexes, so a trailing
 * `\r` from CRLF input does not prevent comment stripping.
 *
 * Quote tracking is per line and deliberately simple: an unterminated quote
 * keeps the rest of the line, which can only widen what is scanned.
 *
 * @param {string} line a single line of source text
 * @returns {string}
 */
function codeOf(line) {
  let openQuote = "";
  let end = line.length;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (openQuote) {
      if (ch === "\\") {
        i++; // skip the escaped character
      } else if (ch === openQuote) {
        openQuote = "";
      }
    } else if (ch === '"' || ch === "'" || ch === "`") {
      openQuote = ch;
    } else if (ch === "/" && line[i + 1] === "/") {
      end = i;
      break;
    }
  }
  const code = line.slice(0, end);
  return code.trimStart().startsWith("*") ? "" : code;
}
4738
- function paretoFrontier(candidates, objectives) {
4739
- if (objectives.length === 0) {
4740
- throw new Error("paretoFrontier: at least 1 objective required");
4922
/**
 * True when the raw line carries a `muffle-ok:` marker anywhere in its text,
 * which the finders use to skip an intentionally annotated line.
 *
 * @param {string} line
 * @returns {boolean}
 */
function isMuffleOk(line) {
  const marker = "muffle-ok:";
  return line.indexOf(marker) !== -1;
}
4925
+ var findFallbackToPass = (file, text) => {
4926
+ const out = [];
4927
+ const lines = text.split("\n");
4928
+ for (let i = 0; i < lines.length; i++) {
4929
+ const line = lines[i];
4930
+ if (isMuffleOk(line)) continue;
4931
+ const code = codeOf(line);
4932
+ if (!code.trim()) continue;
4933
+ if (/\|\| true/.test(code) && /(testCommand|setupCommand|cmd|command)/.test(code)) {
4934
+ out.push({ file, line: i + 1, lineText: line.trim(), pattern: "fallback-to-pass (|| true in command string)" });
4935
+ }
4741
4936
  }
4742
- const valid = candidates.filter(
4743
- (c) => objectives.every((o) => Number.isFinite(o.value(c)))
4744
- );
4745
- const frontier = [];
4746
- const dominated = [];
4747
- for (const c of valid) {
4748
- const isDominated = valid.some((other) => other !== c && dominates(other, c, objectives));
4749
- if (isDominated) dominated.push(c);
4750
- else frontier.push(c);
4937
+ return out;
4938
+ };
4939
+ var findLiteralTruePass = (file, text) => {
4940
+ const out = [];
4941
+ const lines = text.split("\n");
4942
+ for (let i = 0; i < lines.length; i++) {
4943
+ const line = lines[i];
4944
+ if (isMuffleOk(line)) continue;
4945
+ const code = codeOf(line);
4946
+ if (!code.trim()) continue;
4947
+ if (/testCommand\s*:\s*['"]true['"]/.test(code)) {
4948
+ out.push({ file, line: i + 1, lineText: line.trim(), pattern: 'literal-true-pass (testCommand: "true")' });
4949
+ }
4751
4950
  }
4752
- const dominanceMap = frontier.map((d) => ({
4753
- dominator: d,
4754
- dominated: dominated.filter((x) => dominates(d, x, objectives))
4755
- }));
4756
- return { frontier, dominated, dominanceMap };
4951
+ return out;
4952
+ };
4953
/**
 * Finder: report lines that construct `SubprocessSandboxDriver` with an
 * options object containing `cwd:` on the same line.
 *
 * Lines marked `muffle-ok:` and lines with no code left after comment
 * stripping are ignored. Line numbers in findings are 1-based.
 *
 * NOTE(review): the finding text says `exec` ignores the constructor `cwd`;
 * confirm that still describes the driver version being scanned.
 *
 * @param {string} file path echoed back in each finding
 * @param {string} text full file contents
 * @returns {Array<{ file: string, line: number, lineText: string, pattern: string }>}
 */
var findConstructorCwdDropped = (file, text) =>
  text.split("\n").flatMap((line, index) => {
    if (isMuffleOk(line)) return [];
    const code = codeOf(line);
    if (!code.trim()) return [];
    if (!/new\s+SubprocessSandboxDriver\s*\(\s*\{[^}]*cwd\s*:/.test(code)) return [];
    return [{
      file,
      line: index + 1,
      lineText: line.trim(),
      pattern: "construct-vs-call cwd dropped (driver.exec reads config.cwd, not constructor.cwd)"
    }];
  });
4972
/**
 * Finder: report lines of the form `if (!expected) return true`, where a
 * missing expectation is treated as a match.
 *
 * Lines marked `muffle-ok:` and lines with no code left after comment
 * stripping are ignored. Line numbers in findings are 1-based.
 *
 * @param {string} file path echoed back in each finding
 * @param {string} text full file contents
 * @returns {Array<{ file: string, line: number, lineText: string, pattern: string }>}
 */
var findAutoMatchNoExpectation = (file, text) =>
  text.split("\n").flatMap((line, index) => {
    if (isMuffleOk(line)) return [];
    const code = codeOf(line);
    if (!code.trim()) return [];
    if (!/if\s*\(\s*!expected\s*\)\s*return\s+true/.test(code)) return [];
    return [{
      file,
      line: index + 1,
      lineText: line.trim(),
      pattern: "auto-match-no-expectation (if (!expected) return true)"
    }];
  });
4991
/**
 * Finder: report lines of the form `if (x.skipped) return true`, where a
 * skipped item is counted as a pass.
 *
 * Lines marked `muffle-ok:` and lines with no code left after comment
 * stripping are ignored. Line numbers in findings are 1-based.
 *
 * @param {string} file path echoed back in each finding
 * @param {string} text full file contents
 * @returns {Array<{ file: string, line: number, lineText: string, pattern: string }>}
 */
var findSkipCountsAsPass = (file, text) =>
  text.split("\n").flatMap((line, index) => {
    if (isMuffleOk(line)) return [];
    const code = codeOf(line);
    if (!code.trim()) return [];
    if (!/if\s*\(\s*\w+\.skipped\s*\)\s*return\s+true/.test(code)) return [];
    return [{
      file,
      line: index + 1,
      lineText: line.trim(),
      pattern: "skip-counts-as-pass (if (.skipped) return true)"
    }];
  });
5010
// General-purpose finders: `|| true` fallbacks, literal "true" test commands,
// auto-match on a missing expectation, and skipped-counts-as-pass.
// Presumably intended as the `finders` option of the scanner — confirm
// against callers.
var DEFAULT_FINDERS = [findFallbackToPass, findLiteralTruePass, findAutoMatchNoExpectation, findSkipCountsAsPass];

// Finders meant to run on every auto-derived importer file; presumably
// passed as `autoDerive.universalFinders` — confirm against callers.
var UNIVERSAL_FINDERS = [findConstructorCwdDropped];
5019
/**
 * Walk the given roots under `repoRoot` and collect the relative paths of
 * files whose name matches `extensions` and whose text contains
 * `importsContain`.
 *
 * Skipped while walking:
 *   - directories named `node_modules`, `dist`, `dist-tests`, or starting
 *     with "."
 *   - files ending in `.test.ts`, `.test.mjs` or `.test.js`
 *   - entries that cannot be stat'ed or read (best effort)
 *   - roots that do not exist
 *
 * `extensions.lastIndex` is reset before every match. A regex created with
 * the `g` or `y` flag keeps state between `.test()` calls and would
 * otherwise reject every file that follows a match.
 *
 * @param {string} repoRoot absolute base directory
 * @param {Array<string>} roots directories to walk, relative to `repoRoot`
 * @param {RegExp} extensions tested against each file name
 * @param {string} importsContain substring a file must contain to be reported
 * @returns {Array<string>} matching paths, relative to `repoRoot`
 */
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
  const matches2 = [];
  const skippedDirs = ["node_modules", "dist", "dist-tests"];
  const testSuffixes = [".test.ts", ".test.mjs", ".test.js"];
  const hasWantedExtension = (name) => {
    extensions.lastIndex = 0;
    return extensions.test(name);
  };
  const walk = (rel) => {
    const abs = join(repoRoot, rel);
    if (!existsSync2(abs)) return;
    for (const entry of readdirSync(abs)) {
      const sub = join(rel, entry);
      const subAbs = join(repoRoot, sub);
      let st;
      try {
        st = statSync(subAbs);
      } catch {
        // Best effort: entries that vanish or cannot be stat'ed are skipped.
        continue;
      }
      if (st.isDirectory()) {
        if (skippedDirs.includes(entry) || entry.startsWith(".")) continue;
        walk(sub);
      } else if (st.isFile() && hasWantedExtension(entry)) {
        if (testSuffixes.some((suffix) => entry.endsWith(suffix))) continue;
        let text;
        try {
          text = readFileSync2(subAbs, "utf8");
        } catch {
          // Best effort: unreadable files are skipped.
          continue;
        }
        if (text.includes(importsContain)) matches2.push(sub);
      }
    }
  };
  for (const r of roots) walk(r);
  return matches2;
}
5051
/**
 * Run finders over a set of files and return every finding.
 *
 * Pass 1: each path in `opts.scanFiles` that exists under `opts.repoRoot`
 * is read and given to every finder in `opts.finders`.
 * Pass 2 (only when `opts.autoDerive` is set): files discovered by
 * `autoDeriveImporters` that were not already scanned in pass 1 are given
 * to `opts.autoDerive.universalFinders`.
 *
 * Paths that do not exist are skipped.
 *
 * @param {object} opts `{ repoRoot, scanFiles, finders, autoDerive? }`, where
 *   `autoDerive` is `{ roots, extensions, importsContain, universalFinders }`
 * @returns {Array<object>} findings in the order they were produced
 */
function scanForMuffledGates(opts) {
  const findings = [];
  const scanned = /* @__PURE__ */ new Set();

  // Reads `file` and applies `pickFinders()`; returns false when it is missing.
  const scanFile = (file, pickFinders) => {
    const abs = join(opts.repoRoot, file);
    if (!existsSync2(abs)) return false;
    const text = readFileSync2(abs, "utf8");
    for (const find of pickFinders()) {
      findings.push(...find(file, text));
    }
    return true;
  };

  for (const file of opts.scanFiles) {
    if (scanFile(file, () => opts.finders)) scanned.add(file);
  }

  if (opts.autoDerive) {
    const { roots, extensions, importsContain } = opts.autoDerive;
    const importers = autoDeriveImporters(opts.repoRoot, roots, extensions, importsContain);
    for (const file of importers) {
      if (scanned.has(file)) continue;
      scanFile(file, () => opts.autoDerive.universalFinders);
    }
  }
  return findings;
}
5078
+ function formatFindings(findings) {
5079
+ if (findings.length === 0) return "";
5080
+ return [
5081
+ `Found ${findings.length} muffled-gate pattern(s).`,
5082
+ `Fix each or annotate the line with "// muffle-ok: <reason>".`,
5083
+ "",
5084
+ ...findings.map((f) => ` ${f.file}:${f.line} \u2014 ${f.pattern}
5085
+ ${f.lineText}`)
5086
+ ].join("\n");
4757
5087
  }
4758
5088
 
4759
5089
  // src/series-convergence.ts
@@ -4765,10 +5095,10 @@ function analyzeSeries(values, options = {}) {
4765
5095
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
4766
5096
  }
4767
5097
  const tail = values.slice(-window);
4768
- const mean3 = tail.reduce((a, b) => a + b, 0) / tail.length;
4769
- const variance2 = tail.reduce((acc, v) => acc + (v - mean3) ** 2, 0) / tail.length;
5098
+ const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
5099
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
4770
5100
  const stdDev = Math.sqrt(variance2);
4771
- const refMean = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
5101
+ const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
4772
5102
  const cv = stdDev / refMean;
4773
5103
  const stable = tail.length >= window && cv <= stableCv;
4774
5104
  let tailRun = 0;
@@ -4789,7 +5119,7 @@ function analyzeSeries(values, options = {}) {
4789
5119
  } else {
4790
5120
  state = "noisy";
4791
5121
  }
4792
- return { state, windowMean: mean3, windowCv: cv, tailRun, stable };
5122
+ return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
4793
5123
  }
4794
5124
 
4795
5125
  // src/state-continuity.ts
@@ -5717,12 +6047,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
5717
6047
  variantScores.push({ mutator: id, score, mutated });
5718
6048
  all.push(score);
5719
6049
  }
5720
- const mean3 = all.reduce((a, b) => a + b, 0) / all.length;
5721
- const variance2 = all.reduce((a, v) => a + (v - mean3) ** 2, 0) / all.length;
6050
+ const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
6051
+ const variance2 = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
5722
6052
  const stdDev = Math.sqrt(variance2);
5723
- const ref = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
6053
+ const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
5724
6054
  const robustness = Math.max(0, 1 - stdDev / ref);
5725
- return { originalScore, variantScores, meanScore: mean3, stdDev, robustness };
6055
+ return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
5726
6056
  }
5727
6057
  var lowercaseMutator = (p) => p.toLowerCase();
5728
6058
  var sentenceReorderMutator = (p, seed) => {
@@ -6407,8 +6737,8 @@ async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMet
6407
6737
  function toBin(chunk, lower, upper) {
6408
6738
  const xs = chunk.map((c) => c.x);
6409
6739
  const ys = chunk.map((c) => c.y);
6410
- const evalMean = mean2(xs);
6411
- const outcomeMean = mean2(ys);
6740
+ const evalMean = mean3(xs);
6741
+ const outcomeMean = mean3(ys);
6412
6742
  return {
6413
6743
  lower: lower ?? Math.min(...xs),
6414
6744
  upper: upper ?? Math.max(...xs),
@@ -6418,7 +6748,7 @@ function toBin(chunk, lower, upper) {
6418
6748
  gap: Math.abs(outcomeMean - evalMean)
6419
6749
  };
6420
6750
  }
6421
- function mean2(xs) {
6751
+ function mean3(xs) {
6422
6752
  return xs.reduce((a, b) => a + b, 0) / xs.length;
6423
6753
  }
6424
6754
  function defaultExtract4(metric) {
@@ -6643,8 +6973,8 @@ async function prmBestOfN(store, grader, runIds) {
6643
6973
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
6644
6974
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
6645
6975
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
6646
- const mean3 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
6647
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / graded.length;
6976
+ const mean4 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
6977
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / graded.length;
6648
6978
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
6649
6979
  }
6650
6980
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -6666,8 +6996,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
6666
6996
  const ranked = [...byRun.values()].sort(
6667
6997
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
6668
6998
  );
6669
- const mean3 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
6670
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / ranked.length;
6999
+ const mean4 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7000
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / ranked.length;
6671
7001
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
6672
7002
  }
6673
7003
 
@@ -6725,7 +7055,7 @@ async function commitBisect(options) {
6725
7055
  }
6726
7056
  async function promptBisect(options) {
6727
7057
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
6728
- const join = (paragraphs) => paragraphs.join("\n\n");
7058
+ const join2 = (paragraphs) => paragraphs.join("\n\n");
6729
7059
  const goodParas = split(options.good);
6730
7060
  const badParas = split(options.bad);
6731
7061
  if (goodParas.length !== badParas.length) {
@@ -6743,7 +7073,7 @@ async function promptBisect(options) {
6743
7073
  const result = await bisect({
6744
7074
  good: goodMask,
6745
7075
  bad: badMask,
6746
- runEval: (mask) => options.runEval(join(paragraphsFor(mask))),
7076
+ runEval: (mask) => options.runEval(join2(paragraphsFor(mask))),
6747
7077
  maxIterations: options.maxIterations ?? n + 5,
6748
7078
  halfway: (g, b) => {
6749
7079
  for (let i = 0; i < g.length; i++) {
@@ -6774,12 +7104,12 @@ async function promptBisect(options) {
6774
7104
  }
6775
7105
  }
6776
7106
  const materializedPath = result.path.map((s) => ({
6777
- state: join(paragraphsFor(s.state)),
7107
+ state: join2(paragraphsFor(s.state)),
6778
7108
  score: s.score,
6779
7109
  pass: s.pass
6780
7110
  }));
6781
7111
  return {
6782
- culprit: join(paragraphsFor(culprit)),
7112
+ culprit: join2(paragraphsFor(culprit)),
6783
7113
  path: materializedPath,
6784
7114
  converged: result.converged,
6785
7115
  inputInconsistent: result.inputInconsistent,
@@ -7197,8 +7527,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7197
7527
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7198
7528
  const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7199
7529
  if (scores.length < 3) continue;
7200
- const mean3 = scores.reduce((a, b) => a + b, 0) / scores.length;
7201
- const variance2 = scores.reduce((a, b) => a + (b - mean3) ** 2, 0) / scores.length;
7530
+ const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7531
+ const variance2 = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
7202
7532
  if (variance2 > varianceThreshold) {
7203
7533
  targets.push({
7204
7534
  reason: "high-variance",
@@ -7688,6 +8018,8 @@ export {
7688
8018
  CostTracker,
7689
8019
  DEFAULT_AGENT_SLOS,
7690
8020
  DEFAULT_RULES as DEFAULT_FAILURE_RULES,
8021
+ DEFAULT_FINDERS,
8022
+ DEFAULT_HARNESS_OBJECTIVES,
7691
8023
  DEFAULT_MUTATORS,
7692
8024
  DEFAULT_REDACTION_RULES,
7693
8025
  DEFAULT_RED_TEAM_CORPUS,
@@ -7724,6 +8056,7 @@ export {
7724
8056
  TRACE_SCHEMA_VERSION,
7725
8057
  TokenCounter,
7726
8058
  TraceEmitter,
8059
+ UNIVERSAL_FINDERS,
7727
8060
  adversarialJudge,
7728
8061
  aggregateLlm,
7729
8062
  aggregateRunScore,
@@ -7782,9 +8115,15 @@ export {
7782
8115
  failureClusterView,
7783
8116
  fileContains,
7784
8117
  fileExists,
8118
+ findAutoMatchNoExpectation,
8119
+ findConstructorCwdDropped,
8120
+ findFallbackToPass,
8121
+ findLiteralTruePass,
8122
+ findSkipCountsAsPass,
7785
8123
  firstDivergenceView,
7786
8124
  formatBenchmarkReport,
7787
8125
  formatDriverReport,
8126
+ formatFindings,
7788
8127
  groupBy,
7789
8128
  hashContent,
7790
8129
  hashScenarios,
@@ -7851,16 +8190,19 @@ export {
7851
8190
  runE2EWorkflow,
7852
8191
  runExpectations,
7853
8192
  runFailureClass,
8193
+ runHarnessExperiment,
7854
8194
  runJudgeFleet,
7855
8195
  runProposeReview,
7856
8196
  runSelfPlay,
7857
8197
  runTestGradedScenario,
7858
8198
  runsForScenario,
8199
+ scanForMuffledGates,
7859
8200
  scoreAllProjects,
7860
8201
  scoreContinuity,
7861
8202
  scoreProject,
7862
8203
  scoreRedTeamOutput,
7863
8204
  securityJudge,
8205
+ selectHarnessVariant,
7864
8206
  selfPreference,
7865
8207
  sentenceReorderMutator,
7866
8208
  signManifest,
@@ -7868,6 +8210,7 @@ export {
7868
8210
  statusAdvanced,
7869
8211
  stuckLoopView,
7870
8212
  summarize,
8213
+ summarizeHarnessResults,
7871
8214
  testJudge,
7872
8215
  textInSnapshot,
7873
8216
  toLangfuseEnvelope,