@tangle-network/agent-eval 0.7.1 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -410,22 +410,22 @@ function confidenceInterval(scores, confidence = 0.95) {
410
410
  if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
411
411
  if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
412
412
  const n = scores.length;
413
- const mean4 = scores.reduce((a, b) => a + b, 0) / n;
413
+ const mean5 = scores.reduce((a, b) => a + b, 0) / n;
414
414
  const B = 1e3;
415
415
  const bootstrapMeans = [];
416
416
  for (let i = 0; i < B; i++) {
417
- let sum = 0;
417
+ let sum2 = 0;
418
418
  for (let j = 0; j < n; j++) {
419
- sum += scores[Math.floor(Math.random() * n)];
419
+ sum2 += scores[Math.floor(Math.random() * n)];
420
420
  }
421
- bootstrapMeans.push(sum / n);
421
+ bootstrapMeans.push(sum2 / n);
422
422
  }
423
423
  bootstrapMeans.sort((a, b) => a - b);
424
424
  const alpha = 1 - confidence;
425
425
  const lowerIdx = Math.floor(alpha / 2 * B);
426
426
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
427
427
  return {
428
- mean: mean4,
428
+ mean: mean5,
429
429
  lower: bootstrapMeans[lowerIdx],
430
430
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
431
431
  };
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
513
513
  const n = before.length;
514
514
  if (n < 2) return { t: 0, df: 0, p: 1 };
515
515
  const diffs = before.map((b, i) => after[i] - b);
516
- const mean4 = diffs.reduce((a, b) => a + b, 0) / n;
517
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean4) ** 2, 0) / (n - 1);
516
+ const mean5 = diffs.reduce((a, b) => a + b, 0) / n;
517
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean5) ** 2, 0) / (n - 1);
518
518
  const se = Math.sqrt(variance2 / n);
519
- if (se === 0) return { t: mean4 === 0 ? 0 : Infinity, df: n - 1, p: mean4 === 0 ? 1 : 0 };
520
- const t = mean4 / se;
519
+ if (se === 0) return { t: mean5 === 0 ? 0 : Infinity, df: n - 1, p: mean5 === 0 ? 1 : 0 };
520
+ const t = mean5 / se;
521
521
  const df = n - 1;
522
522
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
523
523
  return { t, df, p };
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
541
541
  }
542
542
  let wPlus = 0;
543
543
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
544
- const mean4 = n * (n + 1) / 4;
544
+ const mean5 = n * (n + 1) / 4;
545
545
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
546
- const z = (wPlus - mean4) / Math.sqrt(variance2);
546
+ const z = (wPlus - mean5) / Math.sqrt(variance2);
547
547
  const p = 2 * (1 - normalCdf(Math.abs(z)));
548
548
  return { w: wPlus, p };
549
549
  }
@@ -1562,12 +1562,12 @@ function analyzeAntiSlop(outputs, config) {
1562
1562
  }
1563
1563
  let dupes = 0;
1564
1564
  for (const n of seen.values()) if (n > 1) dupes += n - 1;
1565
- const ratio = dupes / sentences.length;
1566
- if (ratio > config.repetitionThreshold) {
1565
+ const ratio2 = dupes / sentences.length;
1566
+ if (ratio2 > config.repetitionThreshold) {
1567
1567
  counts.repetition += 1;
1568
1568
  issues.push({
1569
1569
  category: "repetition",
1570
- detail: `${(ratio * 100).toFixed(0)}% duplicated (threshold ${(config.repetitionThreshold * 100).toFixed(0)}%)`
1570
+ detail: `${(ratio2 * 100).toFixed(0)}% duplicated (threshold ${(config.repetitionThreshold * 100).toFixed(0)}%)`
1571
1571
  });
1572
1572
  }
1573
1573
  }
@@ -2187,7 +2187,7 @@ var RunCritic = class {
2187
2187
  );
2188
2188
  const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
2189
2189
  if (!success) notes.push("run did not complete with pass=true");
2190
- const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
2190
+ const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
2191
2191
  const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
2192
2192
  const goalProgress = outcomeScore ?? judgeAverage ?? success;
2193
2193
  const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
@@ -2197,7 +2197,7 @@ var RunCritic = class {
2197
2197
  const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
2198
2198
  if (!patchQuality) notes.push("no artifact or edit evidence recorded");
2199
2199
  const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
2200
- const testReality = sandboxTests.length ? sandboxTests.reduce((sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
2200
+ const testReality = sandboxTests.length ? sandboxTests.reduce((sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
2201
2201
  if (!testReality) notes.push("no real test/build evidence recorded");
2202
2202
  const blockerSpans = judgeSpans2.filter(
2203
2203
  (span) => isBlockingJudge(span)
@@ -2213,7 +2213,7 @@ var RunCritic = class {
2213
2213
  const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
2214
2214
  const driftPenalty = positiveGroundingSignals + driftSignals === 0 ? 0 : driftSignals / (positiveGroundingSignals + driftSignals);
2215
2215
  if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);
2216
- const costUsd = trace.budget.length ? Math.max(...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed), 0) : llmSpans2.reduce((sum, span) => sum + (span.costUsd ?? 0), 0);
2216
+ const costUsd = trace.budget.length ? Math.max(...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed), 0) : llmSpans2.reduce((sum2, span) => sum2 + (span.costUsd ?? 0), 0);
2217
2217
  const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
2218
2218
  return {
2219
2219
  success,
@@ -2423,7 +2423,7 @@ function rankRows(rows, weights) {
2423
2423
  }
2424
2424
  return [...buckets.entries()].map(([variantId, values]) => ({
2425
2425
  variantId,
2426
- mean: values.reduce((sum, value) => sum + value, 0) / values.length,
2426
+ mean: values.reduce((sum2, value) => sum2 + value, 0) / values.length,
2427
2427
  runs: values.length
2428
2428
  })).sort((a, b) => b.mean - a.mean);
2429
2429
  }
@@ -2571,7 +2571,7 @@ async function mapLimit(items, limit, fn) {
2571
2571
  return results;
2572
2572
  }
2573
2573
  function mean(values) {
2574
- return values.length ? values.reduce((sum, value) => sum + value, 0) / values.length : 0;
2574
+ return values.length ? values.reduce((sum2, value) => sum2 + value, 0) / values.length : 0;
2575
2575
  }
2576
2576
  function meanRunScore(scores) {
2577
2577
  return {
@@ -4853,6 +4853,30 @@ var CostTracker = class {
4853
4853
  if (!bucket) throw new Error(`CostTracker.markOutcome: unknown scenario "${scenarioId}"`);
4854
4854
  bucket.completed = completed;
4855
4855
  }
4856
+ /**
4857
+ * Convenience: record + markOutcome in one call from a
4858
+ * `{ usage, verdict }`-shaped response (starter-foundry's
4859
+ * `invokeMetaJudge` returns this shape; consumers that wrap any
4860
+ * judge/critic can follow the same convention).
4861
+ *
4862
+ * `usage.model` must be present in `MODEL_PRICING` for cost math to
4863
+ * populate; otherwise totalCostUsd stays at 0 for the entry but
4864
+ * tokens still aggregate.
4865
+ */
4866
+ recordVerdict(verdict, scenarioId, tags) {
4867
+ if (!verdict.usage) return null;
4868
+ const entry = this.record({
4869
+ scenarioId,
4870
+ model: verdict.usage.model,
4871
+ inputTokens: verdict.usage.inputTokens,
4872
+ outputTokens: verdict.usage.outputTokens,
4873
+ cachedTokens: verdict.usage.cachedTokens,
4874
+ reasoningTokens: verdict.usage.reasoningTokens,
4875
+ tags
4876
+ });
4877
+ this.markOutcome(scenarioId, verdict.verdict === "pass");
4878
+ return entry;
4879
+ }
4856
4880
  get(scenarioId) {
4857
4881
  return this.byScenario.get(scenarioId);
4858
4882
  }
@@ -4889,6 +4913,179 @@ function assertNonNegative(n, name) {
4889
4913
  }
4890
4914
  }
4891
4915
 
4916
+ // src/muffled-gate-scanner.ts
4917
+ import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
4918
+ import { join } from "path";
4919
/**
 * Returns the code portion of a single source line with comments removed.
 *
 * A trailing `// ...` comment is cut off, and a line whose first non-blank
 * character is `*` (block-comment continuation) becomes the empty string.
 * The `//` search skips over quoted text ('...', "...", `...`, honouring
 * backslash escapes) so that a string such as "curl https://host || true" is
 * not truncated at the URL, which would hide the rest of the line from the
 * pattern finders.
 *
 * This is a per-line heuristic: it does not track strings or comments that
 * span several lines.
 *
 * @param {string} line One line of source text.
 * @returns {string} The line without its comment part.
 */
function codeOf(line) {
  let openQuote = null;
  let cut = line.length;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (openQuote !== null) {
      if (ch === "\\") {
        i++; // skip the escaped character
      } else if (ch === openQuote) {
        openQuote = null;
      }
    } else if (ch === '"' || ch === "'" || ch === "`") {
      openQuote = ch;
    } else if (ch === "/" && line[i + 1] === "/") {
      cut = i;
      break;
    }
  }
  return line.slice(0, cut).replace(/^\s*\*.*$/, "");
}
4922
/**
 * Tells whether a line carries the `muffle-ok:` opt-out annotation.
 * The check is a plain substring match anywhere in the line.
 *
 * @param {string} line One line of source text.
 * @returns {boolean} True when the marker is present.
 */
function isMuffleOk(line) {
  const MARKER = "muffle-ok:";
  return line.indexOf(MARKER) !== -1;
}
4925
/**
 * Finder: flags lines whose code contains `|| true` together with one of the
 * words testCommand / setupCommand / cmd / command.
 *
 * Lines annotated with `muffle-ok:` and lines that are empty once comments
 * are stripped are ignored.
 *
 * @param file Path reported in each finding.
 * @param text Full file contents.
 * @returns Array of `{ file, line, lineText, pattern }` (line is 1-based).
 */
var findFallbackToPass = (file, text) => {
  const hits = [];
  text.split("\n").forEach((raw, idx) => {
    if (isMuffleOk(raw)) return;
    const code = codeOf(raw);
    if (code.trim() === "") return;
    const hasFallback = /\|\| true/.test(code);
    const mentionsCommand = /(testCommand|setupCommand|cmd|command)/.test(code);
    if (hasFallback && mentionsCommand) {
      hits.push({
        file,
        line: idx + 1,
        lineText: raw.trim(),
        pattern: "fallback-to-pass (|| true in command string)"
      });
    }
  });
  return hits;
};
4939
/**
 * Finder: flags lines whose code sets `testCommand` to the quoted literal
 * "true" (single or double quotes).
 *
 * Lines annotated with `muffle-ok:` and lines that are empty once comments
 * are stripped are ignored.
 *
 * @param file Path reported in each finding.
 * @param text Full file contents.
 * @returns Array of `{ file, line, lineText, pattern }` (line is 1-based).
 */
var findLiteralTruePass = (file, text) => {
  const hits = [];
  for (const [idx, raw] of text.split("\n").entries()) {
    if (isMuffleOk(raw)) continue;
    const code = codeOf(raw);
    if (code.trim() === "") continue;
    if (!/testCommand\s*:\s*['"]true['"]/.test(code)) continue;
    hits.push({
      file,
      line: idx + 1,
      lineText: raw.trim(),
      pattern: 'literal-true-pass (testCommand: "true")'
    });
  }
  return hits;
};
4953
/**
 * Finder: flags lines that construct `new SubprocessSandboxDriver({ ... cwd: ...`
 * with a `cwd` key inside the same-line object literal.
 *
 * Lines annotated with `muffle-ok:` and lines that are empty once comments
 * are stripped are ignored.
 *
 * @param file Path reported in each finding.
 * @param text Full file contents.
 * @returns Array of `{ file, line, lineText, pattern }` (line is 1-based).
 */
var findConstructorCwdDropped = (file, text) => {
  const CTOR_WITH_CWD = /new\s+SubprocessSandboxDriver\s*\(\s*\{[^}]*cwd\s*:/;
  return text.split("\n").reduce((hits, raw, idx) => {
    if (isMuffleOk(raw)) return hits;
    const code = codeOf(raw);
    if (code.trim() && CTOR_WITH_CWD.test(code)) {
      hits.push({
        file,
        line: idx + 1,
        lineText: raw.trim(),
        pattern: "construct-vs-call cwd dropped (driver.exec reads config.cwd, not constructor.cwd)"
      });
    }
    return hits;
  }, []);
};
4972
/**
 * Finder: flags lines whose code contains `if (!expected) return true`.
 *
 * Lines annotated with `muffle-ok:` and lines that are empty once comments
 * are stripped are ignored.
 *
 * @param file Path reported in each finding.
 * @param text Full file contents.
 * @returns Array of `{ file, line, lineText, pattern }` (line is 1-based).
 */
var findAutoMatchNoExpectation = (file, text) =>
  text.split("\n").flatMap((raw, idx) => {
    if (isMuffleOk(raw)) return [];
    const code = codeOf(raw);
    if (!code.trim()) return [];
    if (!/if\s*\(\s*!expected\s*\)\s*return\s+true/.test(code)) return [];
    return [
      {
        file,
        line: idx + 1,
        lineText: raw.trim(),
        pattern: "auto-match-no-expectation (if (!expected) return true)"
      }
    ];
  });
4991
/**
 * Finder: flags lines whose code contains `if (<identifier>.skipped) return true`.
 *
 * Lines annotated with `muffle-ok:` and lines that are empty once comments
 * are stripped are ignored.
 *
 * @param file Path reported in each finding.
 * @param text Full file contents.
 * @returns Array of `{ file, line, lineText, pattern }` (line is 1-based).
 */
var findSkipCountsAsPass = (file, text) => {
  const hits = [];
  const rows = text.split("\n");
  let lineNo = 0;
  for (const raw of rows) {
    lineNo += 1;
    if (isMuffleOk(raw)) continue;
    const code = codeOf(raw);
    const isBlank = !code.trim();
    if (isBlank) continue;
    const matched = /if\s*\(\s*\w+\.skipped\s*\)\s*return\s+true/.test(code);
    if (matched) {
      hits.push({
        file,
        line: lineNo,
        lineText: raw.trim(),
        pattern: "skip-counts-as-pass (if (.skipped) return true)"
      });
    }
  }
  return hits;
};
5010
// Finder set without the SubprocessSandboxDriver constructor check.
var DEFAULT_FINDERS = [findFallbackToPass, findLiteralTruePass, findAutoMatchNoExpectation, findSkipCountsAsPass];
// Finder set containing only the SubprocessSandboxDriver constructor check.
var UNIVERSAL_FINDERS = [findConstructorCwdDropped];
5019
/**
 * Walks the given roots under `repoRoot` and collects the repo-relative paths
 * of files whose name matches `extensions` and whose text contains
 * `importsContain`.
 *
 * Skipped while walking: directories named node_modules / dist / dist-tests,
 * directories starting with ".", and files ending in .test.ts / .test.mjs /
 * .test.js. Every filesystem error (missing root, root that is a file,
 * unreadable directory, entry that vanishes, unreadable file) is treated as
 * "nothing to report here" so that one bad path cannot abort the whole scan.
 *
 * @param {string} repoRoot        Absolute base directory.
 * @param {string[]} roots         Paths relative to `repoRoot` to walk.
 * @param {RegExp} extensions      Tested against each file NAME (not the path).
 * @param {string} importsContain  Substring a file must contain to be reported.
 * @returns {string[]} Repo-relative paths of matching files, in walk order.
 */
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
  const skippedDirs = new Set(["node_modules", "dist", "dist-tests"]);
  const testSuffixes = [".test.ts", ".test.mjs", ".test.js"];
  const matches = [];
  const hasWantedExtension = (entry) => {
    // A /g or /y regex remembers lastIndex between test() calls and would
    // otherwise reject every other file name; always start from 0.
    extensions.lastIndex = 0;
    return extensions.test(entry);
  };
  const walk = (rel) => {
    let entries;
    try {
      entries = readdirSync(join(repoRoot, rel));
    } catch {
      // Missing, not a directory, or unreadable: same best-effort policy as
      // the stat/read calls below.
      return;
    }
    for (const entry of entries) {
      const sub = join(rel, entry);
      const subAbs = join(repoRoot, sub);
      let st;
      try {
        st = statSync(subAbs);
      } catch {
        continue;
      }
      if (st.isDirectory()) {
        if (skippedDirs.has(entry) || entry.startsWith(".")) continue;
        walk(sub);
      } else if (st.isFile() && hasWantedExtension(entry)) {
        if (testSuffixes.some((suffix) => entry.endsWith(suffix))) continue;
        let text;
        try {
          text = readFileSync2(subAbs, "utf8");
        } catch {
          continue;
        }
        if (text.includes(importsContain)) matches.push(sub);
      }
    }
  };
  for (const root of roots) walk(root);
  return matches;
}
5051
/**
 * Runs pattern finders over a set of files and returns all findings.
 *
 * Phase 1: every existing file in `opts.scanFiles` (relative to
 * `opts.repoRoot`) is read and passed to each finder in `opts.finders`.
 * Phase 2 (only when `opts.autoDerive` is set): files discovered by
 * `autoDeriveImporters` that were not already scanned in phase 1 are passed
 * to each finder in `opts.autoDerive.universalFinders`.
 *
 * @param opts `{ repoRoot, scanFiles, finders, autoDerive? }` where autoDerive
 *             is `{ roots, extensions, importsContain, universalFinders }`.
 * @returns Flat array of findings, phase 1 results first.
 */
function scanForMuffledGates(opts) {
  const findings = [];
  const alreadyScanned = /* @__PURE__ */ new Set();
  // Returns the file's text, or null when the path does not exist.
  const readIfPresent = (file) => {
    const abs = join(opts.repoRoot, file);
    return existsSync2(abs) ? readFileSync2(abs, "utf8") : null;
  };
  for (const file of opts.scanFiles) {
    const text = readIfPresent(file);
    if (text === null) continue;
    for (const find of opts.finders) findings.push(...find(file, text));
    alreadyScanned.add(file);
  }
  if (!opts.autoDerive) return findings;
  const { roots, extensions, importsContain } = opts.autoDerive;
  const importers = autoDeriveImporters(opts.repoRoot, roots, extensions, importsContain);
  for (const file of importers) {
    if (alreadyScanned.has(file)) continue;
    const text = readIfPresent(file);
    if (text === null) continue;
    for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
  }
  return findings;
}
5078
/**
 * Renders findings as a human-readable report.
 *
 * Returns the empty string when there are no findings. Otherwise returns a
 * header line with the count, a hint about the "// muffle-ok: <reason>"
 * annotation, a blank line, and one two-line entry per finding
 * (file:line, an em dash, the pattern, then the offending line text), all
 * joined with "\n".
 *
 * NOTE(review): the entry is a multi-line template literal, so the whitespace
 * at the start of its second line is part of the emitted text; this view has
 * lost the original indentation, so confirm against the real file before
 * reformatting.
 */
+ function formatFindings(findings) {
5079
+ if (findings.length === 0) return "";
5080
+ return [
5081
+ `Found ${findings.length} muffled-gate pattern(s).`,
5082
+ `Fix each or annotate the line with "// muffle-ok: <reason>".`,
5083
+ "",
5084
+ ...findings.map((f) => ` ${f.file}:${f.line} \u2014 ${f.pattern}
5085
+ ${f.lineText}`)
5086
+ ].join("\n");
5087
+ }
5088
+
4892
5089
  // src/series-convergence.ts
4893
5090
  function analyzeSeries(values, options = {}) {
4894
5091
  const window = options.window ?? 5;
@@ -4898,10 +5095,10 @@ function analyzeSeries(values, options = {}) {
4898
5095
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
4899
5096
  }
4900
5097
  const tail = values.slice(-window);
4901
- const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
4902
- const variance2 = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
5098
+ const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
5099
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
4903
5100
  const stdDev = Math.sqrt(variance2);
4904
- const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
5101
+ const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
4905
5102
  const cv = stdDev / refMean;
4906
5103
  const stable = tail.length >= window && cv <= stableCv;
4907
5104
  let tailRun = 0;
@@ -4922,7 +5119,7 @@ function analyzeSeries(values, options = {}) {
4922
5119
  } else {
4923
5120
  state = "noisy";
4924
5121
  }
4925
- return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
5122
+ return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
4926
5123
  }
4927
5124
 
4928
5125
  // src/state-continuity.ts
@@ -5515,8 +5712,8 @@ function calibrateJudge(golden, candidate) {
5515
5712
  const kappa = weightedKappa(humans.map(Math.round), judges.map(Math.round));
5516
5713
  const absDiffs = common.map((c) => Math.abs(c.j - c.h));
5517
5714
  const mae = absDiffs.reduce((a, b) => a + b, 0) / n;
5518
- const worst = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
5519
- return { n, pearson: pearson2, kappa, mae, worstItems: worst };
5715
+ const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
5716
+ return { n, pearson: pearson2, kappa, mae, worstItems: worst2 };
5520
5717
  }
5521
5718
  function positionalBias(scores) {
5522
5719
  const pairs = /* @__PURE__ */ new Map();
@@ -5850,12 +6047,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
5850
6047
  variantScores.push({ mutator: id, score, mutated });
5851
6048
  all.push(score);
5852
6049
  }
5853
- const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
5854
- const variance2 = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
6050
+ const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
6051
+ const variance2 = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
5855
6052
  const stdDev = Math.sqrt(variance2);
5856
- const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
6053
+ const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
5857
6054
  const robustness = Math.max(0, 1 - stdDev / ref);
5858
- return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
6055
+ return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
5859
6056
  }
5860
6057
  var lowercaseMutator = (p) => p.toLowerCase();
5861
6058
  var sentenceReorderMutator = (p, seed) => {
@@ -5917,9 +6114,9 @@ function visualDiff(a, b, options = {}) {
5917
6114
  const dg = Math.abs(a.data[i + 1] - b.data[i + 1]);
5918
6115
  const db = Math.abs(a.data[i + 2] - b.data[i + 2]);
5919
6116
  const da = Math.abs(a.data[i + 3] - b.data[i + 3]);
5920
- const worst = Math.max(dr, dg, db, da);
5921
- if (worst > maxDelta) maxDelta = worst;
5922
- if (worst > tolerance) differing++;
6117
+ const worst2 = Math.max(dr, dg, db, da);
6118
+ if (worst2 > maxDelta) maxDelta = worst2;
6119
+ if (worst2 > tolerance) differing++;
5923
6120
  }
5924
6121
  const diffRatio = totalPixels > 0 ? differing / totalPixels : 0;
5925
6122
  const status = diffRatio < 1e-3 ? "unchanged" : diffRatio > 0.05 ? "severely-changed" : "changed";
@@ -6776,8 +6973,8 @@ async function prmBestOfN(store, grader, runIds) {
6776
6973
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
6777
6974
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
6778
6975
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
6779
- const mean4 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
6780
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / graded.length;
6976
+ const mean5 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
6977
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / graded.length;
6781
6978
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
6782
6979
  }
6783
6980
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -6799,8 +6996,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
6799
6996
  const ranked = [...byRun.values()].sort(
6800
6997
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
6801
6998
  );
6802
- const mean4 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
6803
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean4) ** 2, 0) / ranked.length;
6999
+ const mean5 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7000
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / ranked.length;
6804
7001
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
6805
7002
  }
6806
7003
 
@@ -6858,7 +7055,7 @@ async function commitBisect(options) {
6858
7055
  }
6859
7056
  async function promptBisect(options) {
6860
7057
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
6861
- const join = (paragraphs) => paragraphs.join("\n\n");
7058
+ const join3 = (paragraphs) => paragraphs.join("\n\n");
6862
7059
  const goodParas = split(options.good);
6863
7060
  const badParas = split(options.bad);
6864
7061
  if (goodParas.length !== badParas.length) {
@@ -6876,7 +7073,7 @@ async function promptBisect(options) {
6876
7073
  const result = await bisect({
6877
7074
  good: goodMask,
6878
7075
  bad: badMask,
6879
- runEval: (mask) => options.runEval(join(paragraphsFor(mask))),
7076
+ runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
6880
7077
  maxIterations: options.maxIterations ?? n + 5,
6881
7078
  halfway: (g, b) => {
6882
7079
  for (let i = 0; i < g.length; i++) {
@@ -6907,12 +7104,12 @@ async function promptBisect(options) {
6907
7104
  }
6908
7105
  }
6909
7106
  const materializedPath = result.path.map((s) => ({
6910
- state: join(paragraphsFor(s.state)),
7107
+ state: join3(paragraphsFor(s.state)),
6911
7108
  score: s.score,
6912
7109
  pass: s.pass
6913
7110
  }));
6914
7111
  return {
6915
- culprit: join(paragraphsFor(culprit)),
7112
+ culprit: join3(paragraphsFor(culprit)),
6916
7113
  path: materializedPath,
6917
7114
  converged: result.converged,
6918
7115
  inputInconsistent: result.inputInconsistent,
@@ -7330,8 +7527,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7330
7527
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7331
7528
  const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7332
7529
  if (scores.length < 3) continue;
7333
- const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7334
- const variance2 = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
7530
+ const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
7531
+ const variance2 = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
7335
7532
  if (variance2 > varianceThreshold) {
7336
7533
  targets.push({
7337
7534
  reason: "high-variance",
@@ -7810,6 +8007,1853 @@ async function euAiActReport(ctx, signals) {
7810
8007
  generatedAt: (/* @__PURE__ */ new Date()).toISOString()
7811
8008
  };
7812
8009
  }
8010
+
8011
+ // src/llm-client.ts
8012
/**
 * Error thrown for a non-OK HTTP response from the chat-completions endpoint.
 *
 * Carries the HTTP `status`, the raw response `body` text, and the `model`
 * that was requested, in addition to the usual message.
 */
var LlmCallError = class extends Error {
  status;
  body;
  model;
  constructor(message, status, body, model) {
    super(message);
    Object.assign(this, { status, body, model });
    this.name = "LlmCallError";
  }
};
8024
// Base URL used when the caller supplies none.
var DEFAULT_BASE_URL = "https://router.tangle.tools/v1";
// Per-request timeout in milliseconds (60 s).
var DEFAULT_TIMEOUT_MS = 60_000;
// Default value for the `maxRetries` option.
var DEFAULT_MAX_RETRIES = 3;
// HTTP statuses that are worth retrying.
var RETRYABLE_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
8028
/**
 * Decides whether a thrown value represents a transient failure.
 *
 * - `LlmCallError`: retryable when its status is in `RETRYABLE_STATUS`.
 * - Any other `Error`: retryable when it is an abort/timeout error or its
 *   message mentions a known network failure code.
 * - Anything else: not retryable.
 *
 * @param err The caught value.
 * @returns {boolean}
 */
function isRetryableError(err) {
  if (err instanceof LlmCallError) return RETRYABLE_STATUS.has(err.status);
  if (!(err instanceof Error)) return false;
  if (err.name === "AbortError" || err.name === "TimeoutError") return true;
  return /fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN/i.test(err.message);
}
8035
/**
 * Converts a `Retry-After` response header into a delay in milliseconds.
 *
 * The header is either a number of seconds or an HTTP date. Numeric values
 * are handled entirely here and never handed to `Date.parse`, whose treatment
 * of bare numbers such as "0" or "-5" differs between engines:
 *   - a non-negative finite number -> that many seconds, in ms ("0" -> 0)
 *   - a negative or non-finite number -> null
 *   - a parseable date -> ms until that date, never below 0
 *   - anything else, or a missing/blank header -> null
 *
 * @param headers Object with a `get(name)` method (e.g. fetch `Headers`).
 * @returns {number|null} Delay in ms, or null when the header gives no usable value.
 */
function parseRetryAfter(headers) {
  const raw = headers.get("retry-after");
  if (!raw) return null;
  const value = raw.trim();
  if (value === "") return null;
  const seconds = Number(value);
  if (!Number.isNaN(seconds)) {
    return Number.isFinite(seconds) && seconds >= 0 ? seconds * 1e3 : null;
  }
  const when = Date.parse(value);
  return Number.isFinite(when) ? Math.max(0, when - Date.now()) : null;
}
8044
/**
 * Exponential backoff delay: 500 ms doubled per attempt, capped at 16 s.
 *
 * @param {number} attempt Zero-based attempt index.
 * @returns {number} Delay in milliseconds.
 */
function backoffMs(attempt) {
  const BASE_MS = 500;
  const CAP_MS = 16e3;
  const uncapped = BASE_MS * 2 ** attempt;
  return Math.min(uncapped, CAP_MS);
}
8047
/**
 * Builds the request headers for a JSON chat-completions call.
 *
 * Authentication precedence:
 *   1. `opts.authHeader` ({ name, value }) is sent verbatim when present.
 *   2. Otherwise the first non-empty of `opts.bearer`, `opts.apiKey` is sent
 *      as `Authorization: Bearer <token>`.
 *   3. Otherwise no auth header is added.
 *
 * The token is picked once and used for both the check and the header, so an
 * empty `bearer` falls back to `apiKey` instead of producing an empty
 * "Bearer " value.
 *
 * @param opts Client options.
 * @returns {Record<string, string>} Header map.
 */
function buildHeaders(opts) {
  const headers = {
    "Content-Type": "application/json",
    Accept: "application/json"
  };
  if (opts.authHeader) {
    headers[opts.authHeader.name] = opts.authHeader.value;
    return headers;
  }
  const token = opts.bearer || opts.apiKey;
  if (token) headers.Authorization = `Bearer ${token}`;
  return headers;
}
8059
/**
 * Heuristic: does an error response look like the provider rejecting the
 * requested `response_format`?
 *
 * True only for HTTP 400 whose body (case-insensitively) contains one of a
 * few tell-tale phrases.
 *
 * @param {number} status HTTP status code.
 * @param {string} body   Raw response body.
 * @returns {boolean}
 */
function isSchemaRejection(status, body) {
  if (status !== 400) return false;
  const haystack = body.toLowerCase();
  const markers = ["response_format", "json_schema", "is unavailable", "not supported"];
  return markers.some((marker) => haystack.includes(marker));
}
8064
/**
 * Builds the JSON payload for a chat-completions request.
 *
 * - `temperature` defaults to 0; `max_tokens` is only set when `maxTokens`
 *   is neither null nor undefined.
 * - `response_format` is a strict `json_schema` when `req.jsonSchema` is set
 *   and `forceJsonObject` is falsy; it is `json_object` when JSON output is
 *   wanted without a schema (or the schema is being forced off); otherwise
 *   it is omitted.
 *
 * @param req              Request description.
 * @param forceJsonObject  When truthy, never send a json_schema format.
 * @returns Plain object ready for `JSON.stringify`.
 */
function buildBody(req, forceJsonObject) {
  const payload = {
    model: req.model,
    messages: req.messages,
    temperature: req.temperature ?? 0
  };
  if (req.maxTokens != null) payload.max_tokens = req.maxTokens;
  const useStrictSchema = Boolean(req.jsonSchema) && !forceJsonObject;
  if (useStrictSchema) {
    const { name, schema } = req.jsonSchema;
    payload.response_format = {
      type: "json_schema",
      json_schema: { name, schema, strict: true }
    };
    return payload;
  }
  if (req.jsonMode || req.jsonSchema) {
    payload.response_format = { type: "json_object" };
  }
  return payload;
}
8081
/**
 * Resolves (with undefined) after roughly `ms` milliseconds.
 *
 * @param {number} ms Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
async function sleep(ms) {
  await new Promise((done) => {
    setTimeout(done, ms);
  });
}
8084
/**
 * Removes a surrounding Markdown code fence (``` or ```json) from a string.
 *
 * Only a fence that wraps the whole trimmed input is removed; any other input
 * is returned trimmed but otherwise untouched.
 *
 * @param {string} raw Text as returned by the model.
 * @returns {string} The fenced payload, or the trimmed input.
 */
function stripFencedJson(raw) {
  const text = raw.trim();
  const fence = /^```(?:json)?\s*\n?([\s\S]*?)\n?```\s*$/.exec(text);
  if (fence === null) return text;
  return fence[1].trim();
}
8089
/**
 * Sends one chat-completions request, retrying transient failures.
 *
 * @param req  `{ model, messages, temperature?, maxTokens?, jsonMode?,
 *             jsonSchema?, timeoutMs? }`.
 * @param opts `{ baseUrl?, defaultTimeoutMs?, maxRetries?, fetch?, authHeader?,
 *             bearer?, apiKey? }`. `maxRetries` is the TOTAL number of attempts;
 *             values below 1 or non-finite values still result in at least one
 *             attempt, so the function never fails without having tried.
 * @returns `{ content, usage, costUsd, model, durationMs, raw }`.
 * @throws {LlmCallError} for a non-OK response that is not retried.
 * @throws The underlying error for network/timeout failures that are not
 *         retryable or that persist through the last attempt.
 *
 * Retry policy: statuses in `RETRYABLE_STATUS` wait for `Retry-After` when the
 * server sends one, otherwise for the exponential backoff; retryable thrown
 * errors (see `isRetryableError`) always use the backoff.
 */
async function callLlm(req, opts = {}) {
  const baseUrl = (opts.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
  const url = `${baseUrl}/chat/completions`;
  const timeoutMs = req.timeoutMs ?? opts.defaultTimeoutMs ?? DEFAULT_TIMEOUT_MS;
  const configuredAttempts = opts.maxRetries ?? DEFAULT_MAX_RETRIES;
  const maxAttempts = Number.isFinite(configuredAttempts) ? Math.max(1, configuredAttempts) : DEFAULT_MAX_RETRIES;
  const fetchFn = opts.fetch ?? globalThis.fetch;
  const headers = buildHeaders(opts);
  let lastErr;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const isLastAttempt = attempt >= maxAttempts - 1;
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs);
    const started = Date.now();
    let retryDelayMs;
    try {
      const res = await fetchFn(url, {
        method: "POST",
        headers,
        body: JSON.stringify(buildBody(req, false)),
        signal: controller.signal
      });
      if (res.ok) {
        // The timer stays armed while the body is read, so a response that
        // stalls after its headers is still bounded by timeoutMs.
        const json = await res.json();
        const choice = json.choices?.[0];
        const usageRaw = json.usage ?? {};
        const costFromProxy = json._response_cost ?? json.cost_usd;
        const promptDetails = usageRaw.prompt_tokens_details;
        return {
          content: choice?.message?.content ?? "",
          usage: {
            promptTokens: Number(usageRaw.prompt_tokens ?? 0),
            completionTokens: Number(usageRaw.completion_tokens ?? 0),
            totalTokens: Number(usageRaw.total_tokens ?? 0),
            cachedPromptTokens: promptDetails && typeof promptDetails === "object" ? Number(promptDetails.cached_tokens ?? 0) : void 0
          },
          costUsd: typeof costFromProxy === "number" ? costFromProxy : null,
          model: json.model ?? req.model,
          durationMs: Date.now() - started,
          raw: json
        };
      }
      const body = await res.text();
      const err = new LlmCallError(
        `LLM call ${res.status}: ${body.slice(0, 300)}`,
        res.status,
        body,
        req.model
      );
      if (isLastAttempt || !RETRYABLE_STATUS.has(res.status)) throw err;
      lastErr = err;
      retryDelayMs = parseRetryAfter(res.headers) ?? backoffMs(attempt);
    } catch (err) {
      lastErr = err;
      if (isLastAttempt || !isRetryableError(err)) throw err;
      retryDelayMs = backoffMs(attempt);
    } finally {
      // Single place that disarms the timer, whatever path was taken.
      clearTimeout(timeoutHandle);
    }
    await sleep(retryDelayMs);
  }
  // Safety net; the loop always returns or throws on its last attempt.
  throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
}
8156
/**
 * Calls the model and parses its reply as JSON.
 *
 * JSON mode is switched on automatically unless the caller set `jsonMode`
 * explicitly or supplied a `jsonSchema`. When a schema was supplied and the
 * provider rejects it (see `isSchemaRejection`), the call is repeated once in
 * plain JSON mode without the schema.
 *
 * @param req  Same shape as for `callLlm`.
 * @param opts Same shape as for `callLlm`.
 * @returns `{ value, result }`: the parsed JSON and the raw `callLlm` result.
 * @throws Whatever `callLlm` or `parseJsonSafely` throws.
 */
async function callLlmJson(req, opts = {}) {
  const invoke = async (request) => {
    const result = await callLlm(request, opts);
    const value = parseJsonSafely(result.content, result.model);
    return { value, result };
  };
  try {
    return await invoke({ ...req, jsonMode: req.jsonMode ?? !req.jsonSchema });
  } catch (err) {
    const schemaRejected = err instanceof LlmCallError && isSchemaRejection(err.status, err.body) && req.jsonSchema;
    if (!schemaRejected) throw err;
    return invoke({ ...req, jsonMode: true, jsonSchema: void 0 });
  }
}
8171
/**
 * Parses model output as JSON after removing a surrounding code fence via
 * `stripFencedJson`.
 *
 * On failure it throws a new Error whose message names the model, includes
 * the parser's message, and appends the first 800 characters of the ORIGINAL
 * (un-stripped) content.
 *
 * NOTE(review): the error message is a multi-line template literal, so any
 * whitespace at the start of its continuation lines is part of the message;
 * this view has lost the original indentation, so confirm against the real
 * file before reformatting.
 */
+ function parseJsonSafely(content, model) {
8172
+ const stripped = stripFencedJson(content);
8173
+ try {
8174
+ return JSON.parse(stripped);
8175
+ } catch (err) {
8176
+ throw new Error(
8177
+ `LLM returned non-JSON content (model=${model}): ${err instanceof Error ? err.message : String(err)}
8178
+ --- raw content ---
8179
+ ${content.slice(0, 800)}`
8180
+ );
8181
+ }
8182
+ }
8183
/**
 * Health check: sends a tiny "ping" prompt to `model` and reports the outcome
 * instead of throwing.
 *
 * @param model Model identifier to probe.
 * @param opts  Passed through to `callLlm`; `opts.timeoutMs` (default 30 s)
 *              is used as the request timeout.
 * @returns `{ ok, latencyMs, error }` where `error` is null on success and the
 *          error message on failure.
 */
async function probeLlm(model, opts = {}) {
  const t0 = Date.now();
  const elapsed = () => Date.now() - t0;
  try {
    const ping = {
      model,
      messages: [{ role: "user", content: "ping" }],
      maxTokens: 64,
      timeoutMs: opts.timeoutMs ?? 3e4
    };
    await callLlm(ping, opts);
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    return { ok: false, latencyMs: elapsed(), error };
  }
  return { ok: true, latencyMs: elapsed(), error: null };
}
8204
/**
 * Thin object wrapper around `callLlm` / `callLlmJson` that remembers a set
 * of default options. Per-call options are shallow-merged over the defaults,
 * with per-call keys winning.
 */
var LlmClient = class {
  opts;
  constructor(opts = {}) {
    this.opts = opts;
  }
  /** Forwards to `callLlm` with defaults merged under `per`. */
  call(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlm(req, merged);
  }
  /** Forwards to `callLlmJson` with defaults merged under `per`. */
  callJson(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlmJson(req, merged);
  }
};
8216
+
8217
+ // src/multi-layer-verifier.ts
8218
/**
 * Turns a semantic-check result into "pass" / "fail" / "error".
 *
 * - "error" when the check was not available.
 * - "pass" when `score` reaches the threshold (default 0.7) AND no finding
 *   of severity "critical" is a gap. A critical finding is a gap when it is
 *   explicitly `present === false` or its `score` (0 when absent) is below 7.
 * - "fail" otherwise.
 *
 * @param input `{ available, score, threshold?, findings }`.
 * @returns {"pass"|"fail"|"error"}
 */
function gradeSemanticStatus(input) {
  if (!input.available) return "error";
  const passMark = input.threshold ?? 0.7;
  const isCriticalGap = (finding) => {
    if (finding.severity !== "critical") return false;
    return finding.present === false || (finding.score ?? 0) < 7;
  };
  const hasCriticalGap = input.findings.some(isCriticalGap);
  if (hasCriticalGap) return "fail";
  return input.score >= passMark ? "pass" : "fail";
}
8226
/**
 * Runs a list of verification layers one after another.
 *
 * Construction validates that layer names are unique and that every name in
 * a layer's `dependsOn` belongs to a registered layer.
 *
 * `run(opts)` executes the layers in the order given:
 *   - a layer whose dependencies have not all finished with status "pass" is
 *     recorded as "skipped" without being run;
 *   - otherwise `layer.run({ env, prior, signal })` is awaited, where `prior`
 *     is a copy of the results so far and `signal` merges the overall abort
 *     signal with a per-layer one;
 *   - a rejected layer becomes a "timeout" result when the merged signal is
 *     aborted, else an "error" result, carrying the error message;
 *   - `opts.onLayer` (if given) is called with every recorded result;
 *   - the loop stops once the overall abort signal has fired.
 * The collected results are handed to `aggregate` to build the report.
 */
var MultiLayerVerifier = class {
  layers;
  constructor(layers) {
    this.layers = layers;
    const registered = /* @__PURE__ */ new Set();
    for (const layer of layers) {
      if (registered.has(layer.name)) {
        throw new Error(`MultiLayerVerifier: duplicate layer name "${layer.name}"`);
      }
      registered.add(layer.name);
    }
    for (const layer of layers) {
      const deps = layer.dependsOn ?? [];
      for (const dep of deps) {
        if (registered.has(dep)) continue;
        throw new Error(
          `MultiLayerVerifier: layer "${layer.name}" depends on "${dep}" which is not registered`
        );
      }
    }
  }
  async run(opts) {
    const startedAtMs = Date.now();
    const startedAt = new Date(startedAtMs).toISOString();
    const overall = new AbortController();
    const overallCap = opts.overallCapMs;
    let overallTimer = null;
    if (overallCap != null) {
      overallTimer = setTimeout(() => overall.abort(new Error("overall cap exceeded")), overallCap);
    }
    const results = [];
    const byName = {};
    // Records an outcome under the layer's registered name and notifies the listener.
    const publish = (layer, outcome) => {
      results.push(outcome);
      byName[layer.name] = outcome;
      opts.onLayer?.(outcome);
    };
    try {
      for (const layer of this.layers) {
        const unmet = (layer.dependsOn ?? []).filter((dep) => byName[dep]?.status !== "pass");
        if (unmet.length > 0) {
          publish(layer, {
            layer: layer.name,
            status: "skipped",
            durationMs: 0,
            findings: [],
            reason: `skipped \u2014 upstream not passing: ${unmet.join(", ")}`
          });
          continue;
        }
        const layerAbort = new AbortController();
        const signal = mergeSignals(overall.signal, layerAbort.signal);
        let layerTimer = null;
        if (layer.capMs != null) {
          layerTimer = setTimeout(() => layerAbort.abort(new Error(`layer ${layer.name} cap`)), layer.capMs);
        }
        const layerStart = Date.now();
        let outcome;
        try {
          outcome = await layer.run({ env: opts.env, prior: { ...byName }, signal });
        } catch (err) {
          const timedOut = signal.aborted;
          const durationMs = Date.now() - layerStart;
          const message = err instanceof Error ? err.message : String(err);
          outcome = {
            layer: layer.name,
            status: timedOut ? "timeout" : "error",
            durationMs,
            findings: [{ severity: "major", message, layer: layer.name }],
            reason: message
          };
        } finally {
          if (layerTimer) clearTimeout(layerTimer);
        }
        // Stamp every finding with its originating layer unless it already names one.
        outcome.findings = outcome.findings.map((finding) => ({
          ...finding,
          layer: finding.layer ?? layer.name
        }));
        publish(layer, outcome);
        if (overall.signal.aborted) break;
      }
      return aggregate(this.layers, results, startedAt, startedAtMs);
    } finally {
      if (overallTimer) clearTimeout(overallTimer);
    }
  }
};
8307
/**
 * Folds per-layer results into a single report.
 *
 * Scoring rules, as implemented below:
 * - a result only feeds the blended score when it has a non-null `score`
 *   and its layer weight is positive;
 * - passing results always contribute; failing results contribute only when
 *   the layer sets `failContributesToScore`;
 * - any status other than pass / fail / skipped is counted as an error.
 *
 * @param {Array<object>} layers Layer descriptors (`name`, optional `weight`, `failContributesToScore`).
 * @param {Array<object>} results Layer results in execution order.
 * @param {string} startedAt ISO timestamp echoed into the report.
 * @param {number} startedAtMs Epoch ms used to compute `durationMs`.
 * @returns {object} counts, `allPass`, `blendedScore` and timing fields.
 */
function aggregate(layers, results, startedAt, startedAtMs) {
  const settingsByName = /* @__PURE__ */ new Map();
  for (const layer of layers) {
    settingsByName.set(layer.name, {
      weight: layer.weight ?? 1,
      failContrib: layer.failContributesToScore ?? false
    });
  }
  const fallbackSettings = { weight: 1, failContrib: false };
  let passCount = 0;
  let failCount = 0;
  let skippedCount = 0;
  let errorCount = 0;
  let weightSum = 0;
  let weightedTotal = 0;
  let anyContribution = false;
  for (const entry of results) {
    const { weight, failContrib } = settingsByName.get(entry.layer) ?? fallbackSettings;
    switch (entry.status) {
      case "pass":
        passCount += 1;
        break;
      case "fail":
        failCount += 1;
        break;
      case "skipped":
        skippedCount += 1;
        break;
      default:
        errorCount += 1;
    }
    const scorable = entry.score != null && weight > 0;
    if (!scorable) continue;
    const contributes = entry.status === "pass" || (entry.status === "fail" && failContrib);
    if (contributes) {
      anyContribution = true;
      weightSum += weight;
      weightedTotal += weight * entry.score;
    }
  }
  const finishedAtMs = Date.now();
  return {
    layers: results,
    passCount,
    failCount,
    skippedCount,
    errorCount,
    // At least one layer must have contributed a score, and nothing may
    // have failed or errored.
    allPass: anyContribution && failCount === 0 && errorCount === 0,
    blendedScore: weightSum > 0 ? weightedTotal / weightSum : 0,
    durationMs: finishedAtMs - startedAtMs,
    startedAt,
    finishedAt: new Date(finishedAtMs).toISOString()
  };
}
8360
/**
 * Combines two abort signals into one that aborts as soon as either source
 * aborts, carrying that source's `reason`.
 *
 * If either input is already aborted it is returned as-is (`a` is checked
 * first). Otherwise a fresh signal is returned.
 *
 * Once the merged signal has fired, the listeners on BOTH sources are
 * detached. Previously the listener on the source that did not fire stayed
 * attached for that source's lifetime, pinning the merged controller.
 *
 * @param {AbortSignal} a
 * @param {AbortSignal} b
 * @returns {AbortSignal}
 */
function mergeSignals(a, b) {
  if (a.aborted) return a;
  if (b.aborted) return b;
  const merged = new AbortController();
  const attached = [];
  const detachAll = () => {
    for (const [source, handler] of attached) {
      source.removeEventListener("abort", handler);
    }
    attached.length = 0;
  };
  for (const source of [a, b]) {
    const handler = () => {
      detachAll();
      merged.abort(source.reason);
    };
    attached.push([source, handler]);
    source.addEventListener("abort", handler, { once: true });
  }
  return merged.signal;
}
8369
+
8370
+ // src/command-runner.ts
8371
+ import { spawnSync } from "child_process";
8372
+ import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync as readdirSync2, statSync as statSync2 } from "fs";
8373
+ import { join as join2 } from "path";
8374
/**
 * Command runner backed by the local machine: synchronous child processes
 * and synchronous filesystem calls, exposed through async methods.
 */
var localCommandRunner = {
  name: "local",

  /**
   * Runs `input.cmd` with `input.argv` via `spawnSync`.
   * The child environment is `process.env`, then `CI=1`, then `input.env`.
   *
   * @param {object} input `cmd`, `argv`, optional `cwd`, `capMs`, `env`, `stdin`.
   * @returns {Promise<object>} `status` (null when none), `stdout`, `stderr`,
   *   `durationMs`, `timedOut`, and `runnerError` for spawn failures other than a timeout.
   */
  async run(input) {
    const startedAt = Date.now();
    const childEnv = { ...process.env, CI: "1", ...input.env ?? {} };
    const child = spawnSync(input.cmd, input.argv, {
      cwd: input.cwd,
      encoding: "utf8",
      timeout: input.capMs,
      env: childEnv,
      input: input.stdin
    });
    const durationMs = Date.now() - startedAt;
    const failure = child.error;
    // A timeout is detected through the ETIMEDOUT code on the spawn error.
    const timedOut = Boolean(failure) && "code" in failure && failure.code === "ETIMEDOUT";
    let runnerError = void 0;
    if (failure && !timedOut) {
      runnerError = String(failure.message ?? failure);
    }
    return {
      status: child.status ?? null,
      stdout: (child.stdout ?? "").toString(),
      stderr: (child.stderr ?? "").toString(),
      durationMs,
      timedOut,
      runnerError
    };
  },

  /** True when `which <name>` exits 0 and prints a non-blank path. */
  async hasBin(name) {
    const probe = spawnSync("which", [name], { encoding: "utf8", timeout: 2e3 });
    if (probe.status !== 0) return false;
    return (probe.stdout ?? "").trim().length > 0;
  },

  /** Thin async wrapper over `existsSync`. */
  async fileExists(path) {
    return existsSync3(path);
  },

  /** Reads a file as UTF-8; any read failure yields `null`. */
  async readFile(path) {
    let text = null;
    try {
      text = readFileSync3(path, "utf8");
    } catch {
      text = null;
    }
    return text;
  },

  /**
   * Lists a directory. An unreadable directory yields `[]`; entries that
   * cannot be stat'ed are left out of the listing.
   */
  async readDir(path) {
    let names;
    try {
      names = readdirSync2(path);
    } catch {
      return [];
    }
    const listing = [];
    for (const name of names) {
      try {
        const info = statSync2(join2(path, name));
        const regular = info.isFile();
        listing.push({
          name,
          isDirectory: info.isDirectory(),
          isFile: regular,
          sizeBytes: regular ? info.size : null
        });
      } catch {
        // Best-effort listing: an entry that fails stat is omitted.
      }
    }
    return listing;
  }
};
8433
+
8434
+ // src/multi-toolchain-layer.ts
8435
// Ordering used to pick the "worst" of two statuses; higher is worse.
var STATUS_RANK = {
  pass: 0,
  skipped: 1,
  fail: 2,
  timeout: 3,
  error: 4
};

/**
 * Returns whichever status ranks worse. Ties go to `a`; a status missing
 * from STATUS_RANK is treated as rank 0.
 */
function worst(a, b) {
  const rankA = STATUS_RANK[a] ?? 0;
  const rankB = STATUS_RANK[b] ?? 0;
  if (rankA >= rankB) {
    return a;
  }
  return b;
}
8445
// Ordering of finding severities; higher is more severe.
var SEVERITY_RANK = {
  info: 0,
  minor: 1,
  major: 2,
  critical: 3
};

/**
 * Returns the most severe `severity` among the findings, starting from
 * "info" (also the result for an empty list). A severity missing from
 * SEVERITY_RANK never displaces the current best.
 */
function maxSeverity(findings) {
  let top = "info";
  for (const { severity } of findings) {
    const outranks = SEVERITY_RANK[severity] > SEVERITY_RANK[top];
    if (outranks) {
      top = severity;
    }
  }
  return top;
}
8458
/**
 * Collapses the results produced by several adapters into one layer result.
 *
 * - no adapters: a "skipped" result with reason "no adapters";
 * - one adapter: that adapter's result, relabelled with `name` and with each
 *   finding tagged by adapter;
 * - several: worst status wins, numeric scores are averaged, durations and
 *   numeric diagnostics are combined by max (default) or sum.
 *
 * @param {string} name Layer name stamped on the merged result and findings.
 * @param {Array<{adapter: string, result: object}>} perAdapter
 * @param {object} [options]
 * @param {"max"|"sum"} [options.mergeDuration="max"]
 * @param {"max"|"sum"} [options.mergeDiagnostics="max"]
 * @param {Function} [options.messagePrefixer] Maps an adapter name to a
 *   prefix placed (plus a space) before each finding message.
 * @returns {object} merged layer result.
 */
function mergeLayerResults(name, perAdapter, options = {}) {
  const mergeDuration = options.mergeDuration ?? "max";
  const mergeDiagnostics = options.mergeDiagnostics ?? "max";
  const prefix = options.messagePrefixer;

  // Relabels one finding for the merged layer and records its adapter.
  const tagFinding = (finding, adapter) => ({
    ...finding,
    layer: name,
    message: prefix ? `${prefix(adapter)} ${finding.message}` : finding.message,
    detail: { ...finding.detail ?? {}, adapter }
  });
  const combine = (mode, previous, next) => mode === "sum" ? previous + next : Math.max(previous, next);

  if (perAdapter.length === 0) {
    return {
      layer: name,
      status: "skipped",
      durationMs: 0,
      findings: [],
      reason: "no adapters"
    };
  }

  if (perAdapter.length === 1) {
    const [only] = perAdapter;
    return {
      ...only.result,
      layer: name,
      findings: only.result.findings.map((finding) => tagFinding(finding, only.adapter)),
      reason: only.result.reason ?? `${only.adapter}: ${only.result.status}`
    };
  }

  let status = "pass";
  let scoreTotal = 0;
  let scoreCount = 0;
  let durationMs = 0;
  const findings = [];
  const reasonParts = [];
  const diagnostics = {};
  for (const { adapter, result } of perAdapter) {
    status = worst(status, result.status);
    if (typeof result.score === "number") {
      scoreTotal += result.score;
      scoreCount += 1;
    }
    durationMs = combine(mergeDuration, durationMs, result.durationMs);
    reasonParts.push(`${adapter}: ${result.status}`);
    for (const finding of result.findings) {
      findings.push(tagFinding(finding, adapter));
    }
    // Only finite numeric diagnostics are merged; everything else is dropped.
    for (const [key, value] of Object.entries(result.diagnostics ?? {})) {
      if (typeof value !== "number" || !Number.isFinite(value)) continue;
      const previous = diagnostics[key];
      diagnostics[key] = previous == null ? value : combine(mergeDiagnostics, previous, value);
    }
  }
  const adapters = perAdapter.map(({ adapter, result }) => ({
    adapter,
    status: result.status,
    score: result.score ?? null
  }));
  return {
    layer: name,
    status,
    score: scoreCount > 0 ? scoreTotal / scoreCount : void 0,
    durationMs,
    findings,
    reason: reasonParts.join(" \xB7 "),
    diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
    detail: {
      adapters,
      worstSeverity: maxSeverity(findings)
    }
  };
}
8533
/**
 * Builds a verifier layer that runs `config.run` once per adapter and merges
 * the per-adapter results with `mergeLayerResults`.
 *
 * Adapters run in consecutive chunks of at most `maxParallel` (default 8,
 * minimum 1); each chunk is awaited with `Promise.all` before the next starts.
 * A throw from `config.run` becomes an "error" result for that adapter only.
 *
 * @param {object} config `name`, `adapters`, `adapterName(adapter)`,
 *   `run(adapter, ctx)`, plus optional `dependsOn`, `weight`,
 *   `failContributesToScore`, `capMs`, `maxParallel`.
 * @returns {object} layer descriptor with an async `run(ctx)`.
 */
function multiToolchainLayer(config) {
  const maxParallel = Math.max(1, config.maxParallel ?? 8);
  const { name, dependsOn, weight, failContributesToScore, capMs } = config;
  return {
    name,
    dependsOn,
    weight,
    failContributesToScore,
    capMs,
    async run(ctx) {
      if (config.adapters.length === 0) {
        return {
          layer: config.name,
          status: "skipped",
          durationMs: 0,
          findings: [],
          reason: "no adapters detected"
        };
      }
      // Runs one adapter; the adapter name is resolved outside the try so a
      // failure in `adapterName` is not converted into an error result.
      const runAdapter = async (adapter) => {
        const adapterName = config.adapterName(adapter);
        let result;
        try {
          result = await config.run(adapter, ctx);
        } catch (err) {
          const message = err instanceof Error ? err.message : String(err);
          result = {
            layer: config.name,
            status: "error",
            durationMs: 0,
            findings: [
              {
                severity: "major",
                layer: config.name,
                message,
                detail: { adapter: adapterName }
              }
            ],
            reason: message
          };
        }
        return { adapter: adapterName, result };
      };
      const collected = [];
      let cursor = 0;
      while (cursor < config.adapters.length) {
        const batch = config.adapters.slice(cursor, cursor + maxParallel);
        const settled = await Promise.all(batch.map((adapter) => runAdapter(adapter)));
        collected.push(...settled);
        cursor += maxParallel;
      }
      return mergeLayerResults(config.name, collected);
    }
  };
}
8586
+
8587
+ // src/reviewer.ts
8588
// JSON schema for the reviewer's structured output. All five fields are
// required and no extra properties are allowed.
var REVIEWER_SCHEMA = {
  type: "object",
  additionalProperties: false,
  required: [
    "observations",
    "diagnosis",
    "nextShotInstruction",
    "shouldContinue",
    "confidence"
  ],
  properties: {
    observations: { type: "string", minLength: 20, maxLength: 2000 },
    diagnosis: { type: "string", minLength: 20, maxLength: 1500 },
    nextShotInstruction: { type: "string", minLength: 40, maxLength: 3000 },
    shouldContinue: { type: "boolean" },
    confidence: { type: "number", minimum: 0, maximum: 1 }
  }
};
8600
/**
 * Renders prior reviewer outputs as plain text for the next prompt.
 *
 * Each entry becomes a header line followed by up to three detail lines
 * (observations, diagnosis, instruction), each clipped to 400 characters and
 * omitted when the field is empty. Entries are separated by a blank line.
 *
 * @param {Array<object>} memory Prior reviewer outputs.
 * @returns {string} "(no prior shots)" when `memory` is empty.
 */
function summarizeMemory(memory) {
  if (memory.length === 0) {
    return "(no prior shots)";
  }
  const clip = (text) => text.slice(0, 400);
  const blocks = memory.map((entry) => {
    const confidence = (entry.confidence ?? 0).toFixed(2);
    const lines = [
      `shot ${entry.shot} \u2014 confidence=${confidence} shouldContinue=${entry.shouldContinue ?? "?"}`
    ];
    if (entry.observations) {
      lines.push(` observations: ${clip(entry.observations)}`);
    }
    if (entry.diagnosis) {
      lines.push(` diagnosis: ${clip(entry.diagnosis)}`);
    }
    if (entry.nextShotInstruction) {
      lines.push(` instruction given: ${clip(entry.nextShotInstruction)}`);
    }
    return lines.join("\n");
  });
  return blocks.join("\n\n");
}
8610
/**
 * Builds the system and user messages for the reviewer model.
 *
 * The user message is a sequence of `=== SECTION ===` blocks: shot number,
 * user request, worker trace, optional extra context, verification summary,
 * reviewer memory (via `summarizeMemory`), optional trailing context, and
 * the task/rules text.
 *
 * @param {object} input `shot`, `userRequest`, `traceSummary`, `verification`
 *   (`blendedScore`, `allPass`, `failCount`, optional `failingLayers`),
 *   `memory`, optional `extraContext` and `trailingContext`.
 * @returns {{system: string, user: string}}
 */
function buildReviewerPrompt(input) {
  const system = "You are a senior-engineer-grade reviewer directing an agent through a multi-shot build. Your job is NOT to grade; your job IS to direct the worker's next shot using the trace, verification result, prior memory, and user request. Return STRICT JSON. No prose outside the JSON.";
  const failing = input.verification.failingLayers;
  const failingLayersBlock = failing && failing.length > 0 ? `failing layers: ${failing.join(", ")}` : "no layers failing";
  // Optional sections carry their own surrounding newlines so that an
  // absent section collapses to a single blank line.
  const optionalSection = (title, body) => body ? `\n=== ${title} ===\n${body}\n` : "";
  const lines = [
    "=== SHOT NUMBER ===",
    `shot ${input.shot} of the review loop`,
    "",
    "=== USER REQUEST ===",
    `${input.userRequest}`,
    "",
    `=== WORKER TRACE (shot ${input.shot}) ===`,
    `${input.traceSummary}`,
    optionalSection("EXTRA CONTEXT", input.extraContext),
    `=== VERIFICATION (shot ${input.shot}) ===`,
    `blendedScore: ${input.verification.blendedScore.toFixed(2)}`,
    `allPass: ${input.verification.allPass}`,
    `failCount: ${input.verification.failCount}`,
    failingLayersBlock,
    "",
    "=== REVIEWER MEMORY ===",
    `${summarizeMemory(input.memory)}`,
    optionalSection("TRAILING CONTEXT", input.trailingContext),
    "=== YOUR TASK ===",
    "Return STRICT JSON:",
    "",
    "1. observations (20-2000 chars): first-person worker behavior from the trace (tool call counts, errors, loops).",
    "2. diagnosis (20-1500 chars): root cause of current failures, not a restatement of verification.",
    `3. nextShotInstruction (40-3000 chars): concrete "FIX THESE:" directive for the worker's next shot. Reference memory when instructions repeat.`,
    "4. shouldContinue (boolean): FALSE if verification.allPass=true, if worker is thrashing, if confidence < 0.3, or if the request looks unachievable. TRUE otherwise.",
    "5. confidence (0-1): self-assessment.",
    "",
    "RULES:",
    "- If verification.allPass is true, shouldContinue MUST be false.",
    "- If memory shows the same failing layer for 2 shots, reduce confidence \u2014 strategy isn't working.",
    "- If the trace shows zero tool calls, the worker didn't run \u2014 surface that.",
    "- Do NOT re-grade. Direct."
  ];
  return { system, user: lines.join("\n") };
}
8653
// Reviewer output used when the reviewer model call fails; also the source
// of per-field fallbacks when the model omits a field.
var DEFAULT_SOFT_FAIL = {
  observations: "reviewer soft-failed \u2014 no observations captured",
  diagnosis: "reviewer soft-failed \u2014 inspect verification findings and retry",
  nextShotInstruction:
    "Inspect the verification findings above and address the highest-severity failing layer first. If install failed, start there; otherwise work from the first failing gate and address compilation/build errors before layout/semantic issues.",
  // The loop keeps going by default after a reviewer failure.
  shouldContinue: true,
  confidence: 0.3
};
8660
/**
 * Creates a reviewer function backed by `callLlmJson`.
 *
 * The returned function builds a prompt, asks the model for JSON matching
 * REVIEWER_SCHEMA, and normalises the answer. If the model call (or the
 * normalisation) throws, it returns the soft-fail defaults with
 * `available: false` and the error message instead of rejecting. Errors
 * thrown by the prompt builder itself are not caught.
 *
 * @param {object} options `model`, optional `llm`, `timeoutMs` (default
 *   180000), `promptBuilder` (default `buildReviewerPrompt`),
 *   `softFailDefaults` (overrides for DEFAULT_SOFT_FAIL).
 * @returns {(input: object) => Promise<object>}
 */
function createDefaultReviewer(options) {
  const fallback = {
    ...DEFAULT_SOFT_FAIL,
    ...options.softFailDefaults ?? {}
  };
  const promptBuilder = options.promptBuilder ?? buildReviewerPrompt;
  const timeoutMs = options.timeoutMs ?? 18e4;
  const clampUnit = (n) => Math.max(0, Math.min(1, n));

  return async (input) => {
    const startedAt = Date.now();
    const prompt = promptBuilder(input);
    const messages = [
      { role: "system", content: prompt.system },
      { role: "user", content: prompt.user }
    ];
    try {
      const request = {
        model: options.model,
        messages,
        jsonSchema: { name: "reviewer_output", schema: REVIEWER_SCHEMA },
        temperature: 0,
        timeoutMs
      };
      const { value, result } = await callLlmJson(request, options.llm ?? {});
      // Text and confidence fall back field-by-field when the model left
      // them null/undefined; shouldContinue is plain truthiness.
      return {
        shot: input.shot,
        observations: String(value.observations ?? fallback.observations),
        diagnosis: String(value.diagnosis ?? fallback.diagnosis),
        nextShotInstruction: String(value.nextShotInstruction ?? fallback.nextShotInstruction),
        shouldContinue: Boolean(value.shouldContinue),
        confidence: clampUnit(Number(value.confidence ?? fallback.confidence)),
        costUsd: result.costUsd ?? null,
        durationMs: Date.now() - startedAt,
        available: true
      };
    } catch (err) {
      const { observations, diagnosis, nextShotInstruction, shouldContinue, confidence } = fallback;
      return {
        shot: input.shot,
        observations,
        diagnosis,
        nextShotInstruction,
        shouldContinue,
        confidence,
        costUsd: null,
        durationMs: Date.now() - startedAt,
        available: false,
        error: err instanceof Error ? err.message : String(err)
      };
    }
  };
}
8711
+
8712
+ // src/semantic-concept-judge.ts
8713
// Per-concept weights used when the judge runs in "complexity" weighting mode.
var DEFAULT_COMPLEXITY_WEIGHTS = {
  render: 1,
  integrate: 2,
  compute: 2.5
};

// Version tag stamped on every semantic-concept judge result.
var SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";

// Prompt size caps, in characters.
var DEFAULT_MAX_SOURCE = 45000;
var DEFAULT_MAX_HTML = 30000;
var DEFAULT_MAX_PER_FILE = 20000;

// Model call timeout, in milliseconds.
var DEFAULT_TIMEOUT = 180000;
var DEFAULT_MODEL = "claude-sonnet-4-6";

// JSON schema for the judge's structured output: a summary plus one graded
// entry per concept.
var SEMANTIC_SCHEMA = {
  type: "object",
  additionalProperties: false,
  required: ["summary", "concepts"],
  properties: {
    summary: { type: "string", minLength: 20, maxLength: 600 },
    concepts: {
      type: "array",
      minItems: 1,
      items: {
        type: "object",
        additionalProperties: false,
        required: ["concept", "present", "score", "evidence", "severity"],
        properties: {
          concept: { type: "string", minLength: 1, maxLength: 120 },
          present: { type: "boolean" },
          score: { type: "number", minimum: 0, maximum: 10 },
          evidence: { type: "string", minLength: 5, maxLength: 400 },
          severity: {
            type: "string",
            enum: ["critical", "major", "minor", "info"]
          }
        }
      }
    }
  }
};
8748
/**
 * Clips `body` to `cap` characters. When clipping happens, a marker line is
 * appended stating how many characters of `label` were dropped.
 *
 * @param {string} body
 * @param {number} cap Maximum number of characters kept.
 * @param {string} label Name used in the truncation marker.
 * @returns {string} `body` unchanged when it already fits.
 */
function truncate(body, cap, label) {
  if (body.length <= cap) {
    return body;
  }
  const kept = body.slice(0, cap);
  const dropped = body.length - cap;
  return `${kept}\n\u2026 [truncated ${dropped} chars of ${label}]`;
}
8753
/**
 * Builds the user prompt for the semantic-concept judge.
 *
 * Source files longer than `opts.maxPerFileChars` are left out entirely; the
 * remaining files are concatenated and the whole blob is clipped to
 * `opts.maxSourceChars`. The artifact-metadata and served-HTML sections only
 * appear when `input.artifactLabel` / `input.servedHtml` are set.
 *
 * @param {object} input `userRequest`, `expectedConcepts`, `sourceFiles`,
 *   optional `servedHtml`, `artifactLabel`, `artifactDescription`.
 * @param {object} opts `maxPerFileChars`, `maxSourceChars`, `maxHtmlChars`.
 * @returns {string} the prompt text.
 */
function buildPrompt(input, opts) {
  const fileSections = [];
  for (const file of input.sourceFiles) {
    if (file.content.length <= opts.maxPerFileChars) {
      fileSections.push(`--- FILE: ${file.path} ---\n${file.content}`);
    }
  }
  const sourceBlob = fileSections.join("\n\n");
  const html = input.servedHtml ?? "";

  const metadataBlock = input.artifactLabel
    ? `ARTIFACT METADATA:\nname: ${input.artifactLabel}\ndescription: ${input.artifactDescription ?? ""}\n\n`
    : "";
  // One numbered line per concept, with at most six keyword hints.
  const conceptList = input.expectedConcepts
    .map((concept, index) => {
      const hints = concept.keywords?.length
        ? ` \u2014 hints: [${concept.keywords.slice(0, 6).join(" | ")}]`
        : "";
      return ` ${index + 1}. "${concept.name}"${hints}`;
    })
    .join("\n");
  const htmlBlock = html
    ? `SERVED HTML (what the preview returns when hit):\n${truncate(html, opts.maxHtmlChars, "HTML")}\n\n`
    : "";
  const sourceBlock = truncate(sourceBlob, opts.maxSourceChars, "source");

  return `You are a strict code-review judge evaluating whether an agent's 0-to-1 build actually implements the features the user asked for.

You MUST distinguish:
(a) WORKING code that implements the concept (rendered UI, wired handler, real API call),
(b) KEYWORD-PRESENT stub (comments mentioning the concept, variable names, TODOs),
(c) ABSENT (concept nowhere).

A comment like "// TODO: add mint button" is NOT present \u2014 score 2-3. Only count a concept as present if there is real functional code: a rendered component, a call handler wired to state or a network call, a computed value actually used.

USER REQUEST (what the agent was asked to build):
${input.userRequest}

${metadataBlock}EXPECTED CONCEPTS (each must be graded independently):
${conceptList}

${htmlBlock}SOURCE FILES (the agent's workdir):
${sourceBlock}

For EACH concept, return:
- concept: the concept name as given (match exactly)
- present: boolean \u2014 does a working implementation exist?
- score: 0-10 \u2014 10 = production-ready; 7 = functional but thin; 4 = partial/stubbed; 2 = keyword-only comment; 0 = absent
- evidence: cite "<file>:<line>" or "served-html:<selector>" pointing at the strongest supporting code. If the concept is absent or stubbed, explain what's missing.
- severity:
"info" when present: true AND score >= 7
"minor" when present: true AND 4 <= score < 7
"major" when present: false OR score < 4
"critical" when the concept is not only absent but a core user flow depends on it

Also produce a "summary" (one sentence, 20-600 chars): overall verdict on whether this is a shippable implementation of the user request vs a keyword-dense placeholder.

BE SKEPTICAL. Keyword matching already passed \u2014 your job is to catch what keyword matching misses. If the agent shipped a working build, say so. If it shipped a stub, say so. Don't grade on effort.

Return STRICT JSON. No prose outside the JSON.`;
}
8799
/**
 * Asks an LLM judge to grade, concept by concept, whether the supplied
 * source/HTML really implements what the user asked for.
 *
 * Never rejects: any failure in the model call or in response handling is
 * returned as a result with `available: false` and an `error` message.
 *
 * The overall `score` is the weighted mean of per-concept scores (0-10)
 * divided by 10 and rounded to three decimals. Weights come from
 * `options.weightConcepts`:
 * - "mean" (default): every concept weighs 1;
 * - otherwise an explicit `spec.weight` wins, and in "complexity" mode the
 *   weight is looked up from `complexityWeights` by `spec.complexity`
 *   (default "render").
 * Findings whose concept name matches no expected concept weigh 1.
 *
 * A per-concept `score` that does not convert to a number is recorded as 0.
 * Previously such a value became NaN, survived the clamp, and made the
 * overall score NaN.
 *
 * @param {object} input `expectedConcepts`, `sourceFiles`, `userRequest`,
 *   optional `servedHtml`, `artifactLabel`, `artifactDescription`.
 * @param {object} [options] `model`, `timeoutMs`, `maxSourceChars`,
 *   `maxPerFileChars`, `maxHtmlChars`, `llm`, `weightConcepts`, `complexityWeights`.
 * @returns {Promise<object>} judge result (`kind: "semantic-concept"`).
 */
async function runSemanticConceptJudge(input, options = {}) {
  const start = Date.now();
  const totalCount = input.expectedConcepts.length;
  if (totalCount === 0) {
    // Nothing to grade: report unavailable without calling the model.
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: 0,
      presentCount: 0,
      totalCount: 0,
      findings: [],
      summary: "no expected concepts declared",
      durationMs: 0,
      costUsd: null,
      available: false,
      error: "no expected concepts declared"
    };
  }
  const opts = {
    model: options.model ?? DEFAULT_MODEL,
    timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT,
    maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE,
    maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE,
    maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML,
    llm: options.llm ?? {},
    weightConcepts: options.weightConcepts ?? "mean",
    complexityWeights: { ...DEFAULT_COMPLEXITY_WEIGHTS, ...options.complexityWeights ?? {} }
  };
  const weightForConcept = (spec) => {
    if (opts.weightConcepts === "mean") return 1;
    if (spec.weight != null) return spec.weight;
    if (opts.weightConcepts === "complexity") {
      return opts.complexityWeights[spec.complexity ?? "render"] ?? 1;
    }
    return 1;
  };
  const weightByName = new Map(
    input.expectedConcepts.map((c) => [c.name, weightForConcept(c)])
  );
  // Clamp to 0-10; a value that is not a number at all counts as 0.
  const normalizeScore = (raw) => {
    const n = Number(raw ?? 0);
    return Number.isNaN(n) ? 0 : Math.max(0, Math.min(10, n));
  };
  try {
    const { value, result } = await callLlmJson(
      {
        model: opts.model,
        messages: [
          {
            role: "system",
            content: "You are a strict code-review judge. Return strict JSON only. No prose outside the JSON. A keyword in a comment is NOT a working implementation."
          },
          { role: "user", content: buildPrompt(input, opts) }
        ],
        jsonSchema: { name: "semantic_concept_judge", schema: SEMANTIC_SCHEMA },
        temperature: 0,
        timeoutMs: opts.timeoutMs
      },
      opts.llm
    );
    if (!value?.concepts || !Array.isArray(value.concepts)) {
      throw new Error('judge returned malformed response \u2014 expected array under "concepts"');
    }
    const findings = value.concepts.map((c) => ({
      concept: String(c.concept),
      present: Boolean(c.present),
      score: normalizeScore(c.score),
      evidence: String(c.evidence ?? ""),
      // Unknown severities are downgraded to "info".
      severity: ["critical", "major", "minor", "info"].includes(c.severity) ? c.severity : "info"
    }));
    // "Present" for counting purposes requires both the flag and a score of 7+.
    const presentCount = findings.filter((f) => f.present && f.score >= 7).length;
    let weightSum = 0;
    let weightedScoreSum = 0;
    for (const f of findings) {
      const w = weightByName.get(f.concept) ?? 1;
      weightSum += w;
      weightedScoreSum += w * f.score;
    }
    // Fall back to a plain mean when the weights do not sum to a positive number.
    const scoreAvg = weightSum > 0 ? weightedScoreSum / weightSum : findings.reduce((a, f) => a + f.score, 0) / Math.max(1, findings.length);
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: Number((scoreAvg / 10).toFixed(3)),
      presentCount,
      totalCount,
      findings,
      summary: String(value.summary ?? ""),
      durationMs: Date.now() - start,
      costUsd: result.costUsd ?? null,
      available: true
    };
  } catch (err) {
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: 0,
      presentCount: 0,
      totalCount,
      findings: [],
      summary: "",
      durationMs: Date.now() - start,
      costUsd: null,
      available: false,
      error: err instanceof Error ? err.message : String(err)
    };
  }
}
8902
/**
 * Binds a set of options to `runSemanticConceptJudge`.
 *
 * @param {object} [options={}] Forwarded unchanged on every call.
 * @returns {(input: object) => Promise<object>} judge taking only the input.
 */
function createSemanticConceptJudge(options = {}) {
  const judge = (input) => {
    return runSemanticConceptJudge(input, options);
  };
  return judge;
}
8905
+
8906
+ // src/intent-match-judge.ts
8907
// Version tag stamped on every intent-match judge result.
var INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";
var DEFAULT_MODEL2 = "claude-sonnet-4-6";

// Model call timeout, in milliseconds.
var DEFAULT_TIMEOUT2 = 90000;

// Prompt size caps, in characters.
var DEFAULT_MAX_SOURCE2 = 25000;
var DEFAULT_MAX_PER_FILE2 = 12000;
var DEFAULT_MAX_HTML2 = 20000;

// JSON schema for the judge's structured output: a 0-1 score plus a short
// evidence string.
var INTENT_SCHEMA = {
  type: "object",
  additionalProperties: false,
  required: ["score", "evidence"],
  properties: {
    score: { type: "number", minimum: 0, maximum: 1 },
    evidence: { type: "string", minLength: 10, maxLength: 400 }
  }
};
8922
/**
 * Clips `body` to `cap` characters. When clipping happens, a marker line is
 * appended stating how many characters of `label` were dropped.
 *
 * @param {string} body
 * @param {number} cap Maximum number of characters kept.
 * @param {string} label Name used in the truncation marker.
 * @returns {string} `body` unchanged when it already fits.
 */
function truncate2(body, cap, label) {
  const overflow = body.length - cap;
  if (body.length <= cap) {
    return body;
  }
  const head = body.slice(0, cap);
  return `${head}\n\u2026 [truncated ${overflow} chars of ${label}]`;
}
8927
/**
 * Builds the user prompt for the intent-match judge.
 *
 * Source files longer than `opts.maxPerFileChars` are left out entirely; the
 * remaining files are concatenated and the whole blob is clipped to
 * `opts.maxSourceChars`. The artifact-metadata and served-HTML sections only
 * appear when `input.artifactLabel` / `input.servedHtml` are set.
 *
 * @param {object} input `userRequest`, `sourceFiles`, optional `servedHtml`,
 *   `artifactLabel`, `artifactDescription`.
 * @param {object} opts `maxPerFileChars`, `maxSourceChars`, `maxHtmlChars`.
 * @returns {string} the prompt text.
 */
function buildPrompt2(input, opts) {
  const fileSections = [];
  for (const file of input.sourceFiles) {
    if (file.content.length <= opts.maxPerFileChars) {
      fileSections.push(`--- FILE: ${file.path} ---\n${file.content}`);
    }
  }
  const sourceBlob = fileSections.join("\n\n");
  const html = input.servedHtml ?? "";

  const metadataBlock = input.artifactLabel
    ? `ARTIFACT METADATA:\nname: ${input.artifactLabel}\ndescription: ${input.artifactDescription ?? ""}\n\n`
    : "";
  const htmlBlock = html
    ? `SERVED HTML (what the preview returns):\n${truncate2(html, opts.maxHtmlChars, "HTML")}\n\n`
    : "";
  const sourceBlock = truncate2(sourceBlob, opts.maxSourceChars, "source");

  return `You are evaluating whether an agent built THE RIGHT APP for a user request.

You are NOT scoring feature coverage. You are NOT scoring code quality.
You are answering ONE question: when a person looks at this work, do they
see an honest attempt at the user's request \u2014 or do they see the wrong
project entirely?

USER REQUEST:
${input.userRequest}

${metadataBlock}${htmlBlock}SOURCE FILES (the agent's workdir):
${sourceBlock}

Score 0\u20131:
1.0 \u2014 unmistakably the right app. Even with bugs, gaps, or missing
features, a reviewer would say "yes, this is what was asked for."
0.7 \u2014 recognizable. Domain matches; some required surface areas exist.
A reviewer would say "right direction, lots of work needed."
0.4 \u2014 partially related. Wrong framing or wrong product entirely but
with some shared keywords or a tangential overlap.
0.0 \u2014 wrong project. The agent shipped something unrelated to the
request (e.g. asked for an NFT mint page, shipped a generic
landing page with zero NFT-related code).

Evidence: one sentence citing the strongest concrete signal \u2014 a file
name, a route, a rendered headline, a missing core surface. Don't
restate the request.

Examples:
- "src/App.tsx renders <MintWidget /> with mint-1/mint-5 buttons and
wagmi imports \u2014 clearly the requested NFT mint page." \u2192 0.95
- "src/App.tsx is the default Vite React template; no
mint/wallet/contract code in any file under src/." \u2192 0.05

Return STRICT JSON. No prose outside.`;
}
8974
/**
 * Asks an LLM judge a single question: is the supplied work recognisably an
 * attempt at the user's request? Produces a 0-1 score plus one line of evidence.
 *
 * Never rejects: a failed model call is returned as a result with
 * `available: false` and an `error` message. When there are no source files
 * and no served HTML, the model is not called at all.
 *
 * A `score` that does not convert to a number is recorded as 0. Previously
 * such a value became NaN, survived the clamp, and was returned as the score.
 *
 * @param {object} input `userRequest`, `sourceFiles`, optional `servedHtml`,
 *   `artifactLabel`, `artifactDescription`.
 * @param {object} [options] `model`, `timeoutMs`, `maxSourceChars`,
 *   `maxPerFileChars`, `maxHtmlChars`, `llm`.
 * @returns {Promise<object>} judge result (`kind: "intent-match"`); `score`
 *   is clamped to 0-1 and rounded to three decimals, `evidence` is capped at
 *   400 characters.
 */
async function runIntentMatchJudge(input, options = {}) {
  const start = Date.now();
  const opts = {
    model: options.model ?? DEFAULT_MODEL2,
    timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT2,
    maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE2,
    maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE2,
    maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML2,
    llm: options.llm ?? {}
  };
  if (input.sourceFiles.length === 0 && !input.servedHtml) {
    return {
      kind: "intent-match",
      version: INTENT_MATCH_JUDGE_VERSION,
      score: 0,
      evidence: "no source files and no served HTML \u2014 nothing to evaluate",
      durationMs: 0,
      costUsd: null,
      available: false,
      error: "no input artifact"
    };
  }
  try {
    const { value, result } = await callLlmJson(
      {
        model: opts.model,
        messages: [
          {
            role: "system",
            content: "You are a holistic code reviewer answering one question: did the agent build the right app for the user. Return strict JSON. No prose outside."
          },
          { role: "user", content: buildPrompt2(input, opts) }
        ],
        jsonSchema: { name: "intent_match_judge", schema: INTENT_SCHEMA },
        temperature: 0,
        timeoutMs: opts.timeoutMs
      },
      opts.llm
    );
    // Clamp to 0-1; a value that is not a number at all counts as 0.
    const rawScore = Number(value?.score ?? 0);
    const score = Number.isNaN(rawScore) ? 0 : Math.max(0, Math.min(1, rawScore));
    return {
      kind: "intent-match",
      version: INTENT_MATCH_JUDGE_VERSION,
      score: Number(score.toFixed(3)),
      evidence: String(value?.evidence ?? "").slice(0, 400),
      durationMs: Date.now() - start,
      costUsd: result.costUsd ?? null,
      available: true
    };
  } catch (err) {
    return {
      kind: "intent-match",
      version: INTENT_MATCH_JUDGE_VERSION,
      score: 0,
      evidence: "",
      durationMs: Date.now() - start,
      costUsd: null,
      available: false,
      error: err instanceof Error ? err.message : String(err)
    };
  }
}
9036
/**
 * Binds a set of options to `runIntentMatchJudge`.
 *
 * @param {object} [options={}] Forwarded unchanged on every call.
 * @returns {(input: object) => Promise<object>} judge taking only the input.
 */
function createIntentMatchJudge(options = {}) {
  const judge = (input) => {
    return runIntentMatchJudge(input, options);
  };
  return judge;
}
9039
+
9040
+ // src/flow-layer.ts
9041
/**
 * Builds a verifier layer that drives a scripted flow through a runner
 * (`open`, `step`, `close`).
 *
 * The flow spec comes from `ctx.env.flowSpec`, falling back to
 * `input.flowSpec`; without one the layer reports "skipped". The layer
 * passes only when `open` succeeded and every step in the spec passed. The
 * score is (open ok ? 1 : 0) plus the passed steps, divided by 1 plus the
 * total steps, rounded to three decimals.
 *
 * @param {object} input `runner` (async factory), optional `flowSpec`,
 *   `name` ("flow"), `dependsOn` (["serve"]), `weight` (1), `capMs` (60000).
 * @returns {object} layer descriptor with an async `run(ctx)`.
 */
function flowLayer(input) {
  const describeError = (err) => err instanceof Error ? err.message : String(err);
  return {
    name: input.name ?? "flow",
    dependsOn: input.dependsOn ?? ["serve"],
    weight: input.weight ?? 1,
    capMs: input.capMs ?? 6e4,
    run: async (ctx) => {
      const startedAt = Date.now();
      const layerName = input.name ?? "flow";
      const spec = ctx.env?.flowSpec ?? input.flowSpec;
      if (!spec) {
        return {
          layer: layerName,
          status: "skipped",
          durationMs: 0,
          findings: [],
          reason: "no flowSpec supplied"
        };
      }

      let runner;
      try {
        runner = await input.runner();
      } catch (err) {
        return {
          layer: layerName,
          status: "error",
          durationMs: Date.now() - startedAt,
          findings: [
            {
              severity: "major",
              message: `flow runner failed to start: ${describeError(err)}`
            }
          ],
          reason: "runner-init-failed"
        };
      }

      const findings = [];
      const stepResults = [];
      let openOk = false;
      try {
        const opened = await runner.open(spec.url);
        openOk = opened.ok;
        if (!openOk) {
          const why = opened.evidence ? `: ${opened.evidence}` : "";
          findings.push({
            severity: "major",
            message: `flow.open(${spec.url}) failed${why}`
          });
        }
        // Steps run after a failed open only when the spec opts in.
        if (openOk || spec.continueOnFail) {
          for (const [index, step] of spec.steps.entries()) {
            if (ctx.signal.aborted) break;
            const stepStart = Date.now();
            let result;
            try {
              result = await runner.step(step);
            } catch (err) {
              // A throwing step is recorded as a failed step, not a crash.
              result = {
                ok: false,
                evidence: describeError(err),
                durationMs: Date.now() - stepStart
              };
            }
            stepResults.push({ step, result, index });
            if (result.ok) continue;
            const target = step.target ? `(${step.target})` : "";
            const why = result.evidence ? `: ${result.evidence}` : "";
            findings.push({
              severity: step.severity ?? "major",
              message: `step[${index}] ${step.action}${target} failed${why}`
            });
            if (!spec.continueOnFail) break;
          }
        }
      } finally {
        // Closing is best-effort; a failing close never changes the outcome.
        try {
          await runner.close();
        } catch {
        }
      }

      const totalSteps = spec.steps.length;
      const ranSteps = stepResults.length;
      const passedSteps = stepResults.filter((entry) => entry.result.ok).length;
      const status = openOk && passedSteps === totalSteps ? "pass" : "fail";
      const earned = (openOk ? 1 : 0) + passedSteps;
      const score = Number((earned / (1 + totalSteps)).toFixed(3));
      const stoppedNote = ranSteps < totalSteps ? ` (stopped at step ${ranSteps})` : "";
      return {
        layer: layerName,
        status,
        score,
        durationMs: Date.now() - startedAt,
        findings,
        reason: `${passedSteps}/${totalSteps} steps passed${stoppedNote}`,
        diagnostics: {
          flowOpenOk: openOk ? 1 : 0,
          flowStepsTotal: totalSteps,
          flowStepsPassed: passedSteps,
          flowStepsRan: ranSteps
        }
      };
    }
  };
}
9143
+
9144
+ // src/deploy-gate-layer.ts
9145
/**
 * Build a verification layer that runs a deploy build through a runner and
 * gates on its exit status and, optionally, on a valid build artifact.
 *
 * @param {object} input
 * @param {Function} input.runner Factory (sync or async) returning a runner
 *   whose `run()` resolves to `{ ok, output, durationMs, artifactDir, artifactValid }`.
 * @param {boolean} [input.requireArtifact=true] When true, the layer passes
 *   only if `artifactValid` is strictly `true`.
 * @param {string} [input.family] Label appended to the pass reason.
 * @param {string} [input.name="deploy"] Layer name.
 * @param {string[]} [input.dependsOn=["build"]] Layers that must run first.
 * @param {number} [input.weight=1] Weight of this layer.
 * @param {number} [input.capMs=120000] Time cap in milliseconds.
 * @returns {object} Layer descriptor `{ name, dependsOn, weight, capMs, run }`.
 */
function deployGateLayer(input) {
  const requireArtifact = input.requireArtifact ?? true;
  const layerName = input.name ?? "deploy";
  const describeError = (err) => err instanceof Error ? err.message : String(err);
  // Shared shape for the two "runner blew up" outcomes.
  const errorResult = (start, message, reason) => ({
    layer: layerName,
    status: "error",
    durationMs: Date.now() - start,
    findings: [
      {
        severity: "major",
        message
      }
    ],
    reason
  });
  return {
    name: layerName,
    dependsOn: input.dependsOn ?? ["build"],
    weight: input.weight ?? 1,
    capMs: input.capMs ?? 12e4,
    run: async (ctx) => {
      const start = Date.now();
      let runner;
      try {
        runner = await input.runner();
      } catch (err) {
        return errorResult(start, `deploy runner failed to start: ${describeError(err)}`, "runner-init-failed");
      }
      let result;
      try {
        result = await runner.run();
      } catch (err) {
        return errorResult(start, `deploy command threw: ${describeError(err)}`, "runner-throw");
      }
      if (ctx.signal.aborted) {
        return {
          layer: layerName,
          status: "timeout",
          durationMs: Date.now() - start,
          findings: [],
          reason: "aborted by overall cap"
        };
      }
      // One predicate drives the pass check, the finding and the diagnostic, so
      // a truthy-but-not-true artifactValid can no longer fail without a finding.
      const artifactValid = result.artifactValid === true;
      const artifactOk = !requireArtifact || artifactValid;
      const pass = result.ok && artifactOk;
      const findings = [];
      if (!result.ok) {
        findings.push({
          severity: "critical",
          message: "deploy build exited non-zero",
          // Only the tail of the output is kept as evidence.
          evidence: (result.output ?? "").slice(-1200)
        });
      }
      if (result.ok && !artifactOk) {
        findings.push({
          severity: "major",
          message: `deploy build succeeded but artifact ${result.artifactDir ?? "(unknown)"} is invalid or empty`
        });
      }
      return {
        layer: layerName,
        status: pass ? "pass" : "fail",
        score: pass ? 1 : 0,
        durationMs: Date.now() - start,
        findings,
        reason: pass ? `deploy build OK${input.family ? ` (${input.family})` : ""}${result.artifactDir ? ` \u2192 ${result.artifactDir}` : ""}` : !result.ok ? "build command failed" : "artifact missing or invalid",
        diagnostics: {
          deployBuildOk: result.ok ? 1 : 0,
          deployArtifactOk: artifactValid ? 1 : 0,
          deployBuildMs: result.durationMs ?? null
        }
      };
    }
  };
}
9229
/**
 * Deploy runner that executes a build command and then checks that the
 * expected entry file exists in the artifact directory.
 *
 * @param {object} input
 * @param {Function} input.exec `(cmd, { cwd, timeoutMs })` resolving to
 *   `{ exitCode, stdout, stderr }`.
 * @param {Function} input.exists `(path)` returning or resolving to a truthy
 *   value when the path exists.
 * @param {string} [input.workdir] Passed to `exec` as `cwd`.
 * @param {string} [input.buildCommand="npm run build"]
 * @param {string} [input.artifactDir="dist"]
 * @param {string} [input.artifactEntry="index.html"]
 * @param {number} [input.timeoutMs=90000]
 * @returns {{ run: Function }} Runner whose `run()` resolves to
 *   `{ ok, output, durationMs, artifactDir, artifactValid }`.
 */
function viteDeployRunner(input) {
  const run = async () => {
    const startedAt = Date.now();
    const buildCommand = input.buildCommand ?? "npm run build";
    const artifactDir = input.artifactDir ?? "dist";
    const entryFile = input.artifactEntry ?? "index.html";
    const execResult = await input.exec(buildCommand, {
      cwd: input.workdir,
      timeoutMs: input.timeoutMs ?? 9e4
    });
    const ok = execResult.exitCode === 0;

    // A failing existence probe counts as "artifact not there".
    const entryIsPresent = async () => {
      try {
        return Boolean(await input.exists(`${artifactDir}/${entryFile}`));
      } catch {
        return false;
      }
    };
    const present = await entryIsPresent();

    // Prefer stderr, fall back to stdout, and keep only the tail.
    const output = ((execResult.stderr || execResult.stdout) ?? "").slice(-1500);
    return {
      ok,
      output,
      durationMs: Date.now() - startedAt,
      artifactDir,
      artifactValid: ok && present
    };
  };
  return { run };
}
9257
+
9258
+ // src/keyword-coverage-judge.ts
9259
/**
 * Regex-based check for whether `html` contains an element matching a very
 * small subset of CSS selectors. Matching is case-insensitive.
 *
 * Supported selector shapes:
 *   - `tag`
 *   - `tag[attr=value]` (value optionally quoted in the selector; the HTML
 *     attribute value may be double-quoted, single-quoted or unquoted)
 *   - `tag[attr]`
 *
 * Known limitation: this is not an HTML parser, so text inside another
 * attribute's quoted value can still produce a match.
 *
 * @param {string} html Markup to search.
 * @param {string} selector Selector in one of the supported shapes.
 * @returns {boolean|null} `true`/`false` for supported selectors, `null` when
 *   the selector shape is not supported.
 */
function htmlContainsElement(html, selector) {
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // "<tag" must not be followed by another name character, otherwise the
  // selector `my` would match the custom element `<my-button>`.
  const openTag = (tag) => `<${tag}(?![\\w-])`;

  const tagOnly = /^([a-zA-Z][\w-]*)$/.exec(selector);
  if (tagOnly) {
    return new RegExp(openTag(tagOnly[1]), "i").test(html);
  }

  const tagAttrEq = /^([a-zA-Z][\w-]*)\[([\w-]+)\s*=\s*["']?([^"'\]]+)["']?\]$/.exec(selector);
  if (tagAttrEq) {
    const [, tag, attr, value] = tagAttrEq;
    const v = escapeRegExp(value);
    // Attribute names are always preceded by whitespace; requiring it keeps
    // `type` from matching inside `data-type`. The unquoted form needs a
    // lookahead so `type=text` does not match `type=textarea`.
    const re = new RegExp(
      `${openTag(tag)}[^>]*\\s${attr}\\s*=\\s*(?:"${v}"|'${v}'|${v}(?=[\\s>]))`,
      "i"
    );
    return re.test(html);
  }

  const tagAttrPresence = /^([a-zA-Z][\w-]*)\[([\w-]+)\]$/.exec(selector);
  if (tagAttrPresence) {
    const [, tag, attr] = tagAttrPresence;
    const re = new RegExp(`${openTag(tag)}[^>]*\\s${attr}(?=[\\s=>/])`, "i");
    return re.test(html);
  }

  return null;
}
9282
/**
 * Collect the absolute URLs referenced by quoted `<link href>` and
 * `<script src>` attributes in `html`.
 *
 * @param {string} html Markup to scan.
 * @param {string} baseUrl Base used to resolve relative references.
 * @returns {string[]} De-duplicated absolute URLs: all link matches first,
 *   then script matches. References that cannot be resolved are skipped.
 */
function extractAssetUrls(html, baseUrl) {
  const patterns = [
    /<link\b[^>]*\bhref\s*=\s*["']([^"']+)["'][^>]*>/gi,
    /<script\b[^>]*\bsrc\s*=\s*["']([^"']+)["'][^>]*>/gi
  ];
  const resolved = /* @__PURE__ */ new Set();
  for (const pattern of patterns) {
    for (const [, reference] of String(html).matchAll(pattern)) {
      try {
        resolved.add(new URL(reference, baseUrl).toString());
      } catch {
        // Unresolvable reference: skip it.
      }
    }
  }
  return [...resolved];
}
9298
/**
 * Score how many expected concepts appear in a page, using case-insensitive
 * keyword search over the HTML plus any asset bodies.
 *
 * A concept is found when at least one keyword occurs in the combined text
 * and, if it names a `requiredElement`, `htmlContainsElement` does not report
 * `false` for it (`null`, meaning unsupported selector, does not block).
 *
 * @param {string} html Page markup.
 * @param {Array<{name: string, keywords: string[], requiredElement?: string}>} expectedConcepts
 * @param {string[]} [assets=[]] Additional text bodies to search.
 * @returns {object} `{ score, presentCount, totalCount, findings, durationMs,
 *   totalAssembledBytes }`; `score` is the fraction of concepts found, and 0
 *   when no concepts are given.
 */
function runKeywordCoverageJudge(html, expectedConcepts, assets = []) {
  const startedAt = Date.now();
  const totalCount = expectedConcepts.length;
  if (totalCount === 0) {
    return {
      score: 0,
      presentCount: 0,
      totalCount: 0,
      findings: [],
      durationMs: 0,
      totalAssembledBytes: 0
    };
  }

  const haystack = `${html}\n${assets.join("\n")}`.toLowerCase();
  const findings = [];
  let presentCount = 0;
  for (const concept of expectedConcepts) {
    const matchedKeywords = concept.keywords.filter((kw) => haystack.includes(kw.toLowerCase()));
    let requiredElementPresent = null;
    if (concept.requiredElement) {
      requiredElementPresent = htmlContainsElement(html, concept.requiredElement);
    }
    const gateOpen = requiredElementPresent === null || requiredElementPresent === true;
    const found = matchedKeywords.length > 0 && gateOpen;
    if (found) presentCount += 1;
    findings.push({ concept: concept.name, found, matchedKeywords, requiredElementPresent });
  }

  return {
    score: presentCount / totalCount,
    presentCount,
    totalCount,
    findings,
    durationMs: Date.now() - startedAt,
    // Length of the lower-cased search text (string length, not encoded bytes).
    totalAssembledBytes: haystack.length
  };
}
9331
/**
 * Fetch a preview URL plus the assets it links, then run the keyword-coverage
 * judge over the combined text.
 *
 * A failed or non-OK page fetch yields a soft-fail result (score 0 with an
 * `error` string) instead of throwing. A failed asset fetch contributes an
 * empty body. `durationMs` on the returned result covers the whole call,
 * including network time, on both the success and soft-fail paths.
 *
 * @param {string} previewUrl Page to fetch.
 * @param {Array<object>} expectedConcepts Concepts passed to the judge.
 * @param {object} [options={}]
 * @param {Function} [options.fetch=globalThis.fetch] Fetch implementation.
 * @param {number} [options.htmlTimeoutMs=5000] Timeout for the page fetch.
 * @param {number} [options.assetTimeoutMs=3000] Timeout for each asset fetch.
 * @returns {Promise<object>} Keyword-coverage result.
 */
async function runKeywordCoverageJudgeUrl(previewUrl, expectedConcepts, options = {}) {
  const start = Date.now();
  const fetchFn = options.fetch ?? globalThis.fetch;
  const htmlTimeout = options.htmlTimeoutMs ?? 5e3;
  const assetTimeout = options.assetTimeoutMs ?? 3e3;
  if (expectedConcepts.length === 0) {
    return {
      score: 0,
      presentCount: 0,
      totalCount: 0,
      findings: [],
      durationMs: 0,
      totalAssembledBytes: 0
    };
  }
  let html = "";
  try {
    const resp = await fetchFn(previewUrl, {
      redirect: "follow",
      signal: AbortSignal.timeout(htmlTimeout)
    });
    if (!resp.ok) {
      return softFail(expectedConcepts, start, `preview HTTP ${resp.status}`);
    }
    html = await resp.text();
  } catch (err) {
    return softFail(expectedConcepts, start, err instanceof Error ? err.message : String(err));
  }
  const assetUrls = extractAssetUrls(html, previewUrl);
  const assetBodies = await Promise.all(
    assetUrls.map(async (u) => {
      try {
        const r = await fetchFn(u, {
          redirect: "follow",
          signal: AbortSignal.timeout(assetTimeout)
        });
        if (!r.ok) return "";
        return await r.text();
      } catch {
        // Assets are best-effort: a missing one only shrinks the search text.
        return "";
      }
    })
  );
  const result = runKeywordCoverageJudge(html, expectedConcepts, assetBodies);
  // The inner judge only times the matching step; report the end-to-end time
  // so the success path agrees with the soft-fail path, which measures from `start`.
  return { ...result, durationMs: Date.now() - start };
}
9376
/**
 * Build the zero-score keyword-coverage result used when the page could not
 * be fetched: every expected concept is reported as not found.
 *
 * @param {Array<{name: string}>} expectedConcepts Concepts that were expected.
 * @param {number} start Timestamp (ms) when the judge started.
 * @param {string} error Description of what went wrong.
 * @returns {object} Result with `score: 0`, per-concept findings and `error`.
 */
function softFail(expectedConcepts, start, error) {
  const findings = expectedConcepts.map((entry) => {
    return {
      concept: entry.name,
      found: false,
      matchedKeywords: [],
      requiredElementPresent: null
    };
  });
  const elapsed = Date.now() - start;
  return {
    score: 0,
    presentCount: 0,
    totalCount: expectedConcepts.length,
    findings,
    durationMs: elapsed,
    totalAssembledBytes: 0,
    error
  };
}
9392
+
9393
+ // src/error-count-extractor.ts
9394
// Built-in error-line patterns, tried in this order by extractErrorCount.
// Every regex carries the `g` flag because matches are collected with
// String.prototype.matchAll. A pattern may provide `transform(match)` to turn
// one match into a count instead of counting matches one by one.
var ERROR_COUNT_PATTERNS = [
  {
    // tsc / ts-node: `src/foo.ts(12,3): error TS1234: ...`
    name: "typescript-tsc",
    regex: /[\w./-]+\(\d+,\d+\): error TS\d+:/g
  },
  {
    // pytest: `FAILED tests/test_foo.py::test_bar`
    name: "pytest-failed",
    regex: /^FAILED\s+\S+/gm
  },
  {
    // rustc: `error[E0308]: ...` or `error: ...`
    // The trailing summary lines (`error: aborting due to N previous errors`
    // from rustc, `error: could not compile ...` from cargo) are excluded;
    // counting them would inflate the total.
    name: "rustc",
    regex: /^error(?:\[[A-Z]\d+\])?:(?!\s+(?:aborting due to|could not compile)\b)/gm
  },
  {
    // go build: `./foo.go:12:3: ...` — any file:line:col: is an error line
    name: "golang",
    regex: /^\.\/[\w./-]+\.go:\d+:\d+:/gm
  },
  {
    // eslint default formatter per-line: `  12:34  error  message  rule-id`
    name: "eslint",
    regex: /^\s+\d+:\d+\s+error\s+/gm
  },
  {
    // eslint summary line: `✖ 17 problems (12 errors, 5 warnings)`
    // Use this only when the per-line formatter isn't present; transform
    // reads the errors count directly.
    name: "eslint-summary",
    regex: /✖\s+\d+\s+problems?\s+\((\d+)\s+errors?/gm,
    transform: (m) => Number(m[1] ?? 0)
  }
];
9429
/**
 * Count error lines in tool output using the first pattern that matches.
 *
 * Caller-supplied `opts.extra` patterns are tried before the built-in
 * `ERROR_COUNT_PATTERNS`. The first pattern with at least one match decides
 * the result; later patterns are not consulted.
 *
 * @param {string} text Tool output to scan.
 * @param {object} [opts={}]
 * @param {Array<{name: string, regex: RegExp, transform?: Function}>} [opts.extra]
 *   Additional patterns, tried first. A regex without the `g` flag is accepted.
 * @param {string[]} [opts.only] When set, restricts matching to these pattern names.
 * @returns {{count: number|null, matched: string|null, samples: string[]}}
 *   `count` and `matched` are `null` when the text is empty or nothing
 *   matched; `samples` holds up to five matched substrings.
 */
function extractErrorCount(text, opts = {}) {
  if (!text) return { count: null, matched: null, samples: [] };
  const patterns = [...opts.extra ?? [], ...ERROR_COUNT_PATTERNS].filter(
    (p) => !opts.only || opts.only.includes(p.name)
  );
  for (const p of patterns) {
    // matchAll throws a TypeError on a non-global regex, so upgrade
    // caller-supplied patterns that forgot the `g` flag.
    const regex = p.regex.global ? p.regex : new RegExp(p.regex.source, `${p.regex.flags}g`);
    const matches2 = Array.from(text.matchAll(regex));
    if (matches2.length === 0) continue;
    const count = p.transform ? matches2.reduce((sum2, m) => sum2 + p.transform(m), 0) : matches2.length;
    return {
      count,
      matched: p.name,
      samples: matches2.slice(0, 5).map((m) => m[0])
    };
  }
  return { count: null, matched: null, samples: [] };
}
9446
+
9447
+ // src/reference-replay.ts
9448
+ import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
9449
+ import { dirname as dirname2 } from "path";
9450
// Matcher score a candidate must reach to count as matching a reference when
// the caller supplies no `matchThreshold`.
var DEFAULT_MATCH_THRESHOLD = 0.55;
// Known dataset splits in their canonical order; also the default set of
// splits that are scored when the caller supplies none.
var ALL_SPLITS = [
  "train",
  "dev",
  "test",
  "holdout"
];
9452
/**
 * Run an adapter over a set of replay cases, score each case's candidates
 * against its references, and return (and optionally persist) the run.
 *
 * Cases are processed sequentially. Holdout cases are skipped unless
 * `options.includeHoldout` is set, and cases outside `options.splits` are
 * skipped. An adapter error is rethrown unless `options.continueOnError` is
 * set, in which case the case is recorded with no candidates and an `error`
 * message. An abort always rethrows.
 *
 * @param {Array<object>} cases Cases of shape `{ id, input, references, split?, metadata? }`.
 * @param {object} options
 * @param {Function|{run: Function}} options.adapter Produces candidates for a case.
 * @param {Function} [options.now=Date.now] Clock returning milliseconds.
 * @param {string} [options.runId] Defaults to `<variantId|"reference-replay">-<startedAt>`.
 * @param {string} [options.variantId]
 * @param {string[]} [options.splits] Splits to include (default: all known splits).
 * @param {boolean} [options.includeHoldout]
 * @param {boolean} [options.continueOnError]
 * @param {AbortSignal} [options.abortSignal]
 * @param {Function} [options.matcher] Forwarded to scoring.
 * @param {number} [options.matchThreshold] Forwarded to scoring.
 * @param {{save: Function}} [options.store] Receives the finished run.
 * @param {*} [options.metadata] Copied onto the run when defined.
 * @returns {Promise<object>} The run record.
 */
async function runReferenceReplay(cases, options) {
  const clock = options.now ?? Date.now;
  const startedAt = clock();
  const runId = options.runId ?? `${options.variantId ?? "reference-replay"}-${startedAt}`;
  const allowedSplits = new Set(options.splits ?? ALL_SPLITS);

  // Only emit a `metadata` key when there is a value to carry.
  const optionalMetadata = (metadata) => metadata !== void 0 ? { metadata } : {};
  // Cases are filtered by split in this function, so scoring must not filter
  // again: holdout is always allowed through at that stage.
  const buildScoreOptions = () => ({
    matcher: options.matcher,
    matchThreshold: options.matchThreshold,
    includeHoldout: true
  });

  const caseRuns = [];
  for (const [caseIndex, replayCase] of cases.entries()) {
    const split = replayCase.split ?? "train";
    const holdoutBlocked = split === "holdout" && !options.includeHoldout;
    if (holdoutBlocked || !allowedSplits.has(split)) continue;

    const caseStart = clock();
    const { id, input, references, metadata } = replayCase;
    const executionScenario = { id, split, input, ...optionalMetadata(metadata) };

    let candidates = [];
    let error;
    try {
      throwIfAborted(options.abortSignal);
      candidates = await runAdapter(options.adapter, executionScenario, {
        runId,
        caseIndex,
        abortSignal: options.abortSignal
      });
      throwIfAborted(options.abortSignal);
    } catch (cause) {
      if (options.abortSignal?.aborted || !options.continueOnError) throw cause;
      error = cause instanceof Error ? cause.message : String(cause);
    }

    const scenario = { id, split, references, candidates, ...optionalMetadata(metadata) };
    const [score] = scoreReferenceReplay([scenario], buildScoreOptions()).scenarios;
    caseRuns.push({
      caseId: id,
      split,
      input,
      references,
      candidates,
      score,
      durationMs: Math.max(0, clock() - caseStart),
      ...optionalMetadata(metadata),
      ...error !== void 0 ? { error } : {}
    });
  }

  const completedAt = clock();
  const rescored = caseRuns.map((caseRun) => ({
    id: caseRun.caseId,
    split: caseRun.split,
    references: caseRun.references,
    candidates: caseRun.candidates,
    ...optionalMetadata(caseRun.metadata)
  }));
  const run = {
    id: runId,
    startedAt,
    completedAt,
    durationMs: Math.max(0, completedAt - startedAt),
    cases: caseRuns,
    score: scoreReferenceReplay(rescored, buildScoreOptions()),
    ...options.variantId !== void 0 ? { variantId: options.variantId } : {},
    ...optionalMetadata(options.metadata)
  };
  await options.store?.save(run);
  return run;
}
9534
/**
 * Promotion decision for two finished runs: compares their `score` objects
 * with `decideReferenceReplayPromotion`.
 *
 * @param {{score: object}} baseline Baseline run.
 * @param {{score: object}} candidate Candidate run.
 * @param {object} [policy={}] Promotion policy, forwarded unchanged.
 * @returns {object} The decision from `decideReferenceReplayPromotion`.
 */
function decideReferenceReplayRunPromotion(baseline, candidate, policy = {}) {
  const baselineScore = baseline.score;
  const candidateScore = candidate.score;
  return decideReferenceReplayPromotion(baselineScore, candidateScore, policy);
}
9537
/**
 * Reference-replay run store that keeps runs in a private in-memory array.
 *
 * @param {Iterable<object>} [initial=[]] Runs to seed the store with; the
 *   input is copied, not retained.
 * @returns {{save: Function, list: Function}} `save(run)` appends a run;
 *   `list()` resolves to a fresh copy of all stored runs.
 */
function inMemoryReferenceReplayStore(initial = []) {
  const runs = [...initial];
  const save = async (run) => {
    runs.push(run);
  };
  // Hand out a copy so callers cannot mutate the store's contents.
  const list = async () => runs.slice();
  return { save, list };
}
9548
/**
 * Reference-replay run store backed by a JSON Lines file: one run per line.
 *
 * @param {string} path File to append to and read from.
 * @returns {{save: Function, list: Function}} `save(run)` creates the parent
 *   directory if needed and appends the run as one JSON line; `list()`
 *   resolves to every stored run, or `[]` when the file does not exist.
 */
function jsonlReferenceReplayStore(path) {
  const save = async (run) => {
    mkdirSync2(dirname2(path), { recursive: true });
    const line = `${JSON.stringify(run)}\n`;
    appendFileSync2(path, line);
  };
  const list = async () => existsSync4(path) ? readJsonl(path) : [];
  return { save, list };
}
9560
/**
 * Score a list of scenarios (references vs. candidates) and aggregate the
 * results overall and per split.
 *
 * @param {Array<object>} scenarios Scenarios of shape `{ id, references, candidates, split? }`.
 * @param {object} [options={}]
 * @param {Function} [options.matcher=defaultReferenceReplayMatcher]
 * @param {number} [options.matchThreshold=DEFAULT_MATCH_THRESHOLD]
 * @param {string[]} [options.splits] Splits to include (default: all known splits).
 * @param {boolean} [options.includeHoldout] Holdout scenarios are dropped unless set.
 * @returns {{scenarios: object[], aggregate: object, bySplit: object}}
 */
function scoreReferenceReplay(scenarios, options = {}) {
  const matcher = options.matcher ?? defaultReferenceReplayMatcher;
  const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
  const allowedSplits = new Set(options.splits ?? ALL_SPLITS);

  // A scenario without a split is treated as "train".
  const isIncluded = (scenario) => {
    const split = scenario.split ?? "train";
    if (split === "holdout" && !options.includeHoldout) return false;
    return allowedSplits.has(split);
  };

  const scores = [];
  scenarios.forEach((scenario) => {
    if (isIncluded(scenario)) {
      scores.push(scoreScenario(scenario, matcher, threshold));
    }
  });

  return {
    scenarios: scores,
    aggregate: aggregateScenarioScores(scores),
    bySplit: aggregateBySplit(scores)
  };
}
9575
/**
 * Compare two scored replays split by split.
 *
 * Every split present in either score is reported, in canonical split order.
 * A split missing on one side is compared against an all-zero aggregate.
 *
 * @param {{bySplit: object}} baseline Baseline score.
 * @param {{bySplit: object}} candidate Candidate score.
 * @returns {Array<object>} One `{ split, baselineF1, candidateF1, f1Delta,
 *   baselineRecall, candidateRecall, recallDelta }` row per split.
 */
function compareReferenceReplay(baseline, candidate) {
  const splitNames = /* @__PURE__ */ new Set(Object.keys(baseline.bySplit));
  for (const name of Object.keys(candidate.bySplit)) {
    splitNames.add(name);
  }
  const ordered = Array.from(splitNames).sort(bySplitOrder);
  return ordered.map((split) => {
    const { f1: baselineF1, recall: baselineRecall } = baseline.bySplit[split] ?? emptyAggregate();
    const { f1: candidateF1, recall: candidateRecall } = candidate.bySplit[split] ?? emptyAggregate();
    return {
      split,
      baselineF1,
      candidateF1,
      f1Delta: candidateF1 - baselineF1,
      baselineRecall,
      candidateRecall,
      recallDelta: candidateRecall - baselineRecall
    };
  });
}
9594
/**
 * Decide whether a candidate score should replace a baseline score.
 *
 * Checks run in this order, and the first failing one decides the reason:
 *   1. every required split exists in both scores;
 *   2. at least one comparison row belongs to a required split;
 *   3. no split (required or not) regressed in F1 by more than `maxRegression`;
 *   4. the holdout split exists in both scores (when required);
 *   5. the mean F1 delta over the required splits reaches `minF1Delta`.
 *
 * @param {object} baseline Score with `aggregate.f1` and `bySplit`.
 * @param {object} candidate Score with `aggregate.f1` and `bySplit`.
 * @param {object} [policy={}]
 * @param {string[]} [policy.requiredSplits=["dev","test"]]
 * @param {number} [policy.minF1Delta=0]
 * @param {number} [policy.maxRegression=0]
 * @param {boolean} [policy.requireHoldoutNonRegression=true]
 * @returns {{promote: boolean, reason: string, aggregateDelta: number,
 *   comparisons: object[], regressions: object[]}}
 */
function decideReferenceReplayPromotion(baseline, candidate, policy = {}) {
  const requiredSplits = policy.requiredSplits ?? ["dev", "test"];
  const minF1Delta = policy.minF1Delta ?? 0;
  const maxRegression = policy.maxRegression ?? 0;
  const requireHoldout = policy.requireHoldoutNonRegression ?? true;

  const comparisons = compareReferenceReplay(baseline, candidate);
  const missingRequiredSplits = requiredSplits.filter(
    (split) => !(hasSplit(baseline, split) && hasSplit(candidate, split))
  );
  const compared = comparisons.filter((item) => requiredSplits.includes(item.split));
  const regressions = comparisons.filter((item) => item.f1Delta < -maxRegression);
  const aggregateDelta = candidate.aggregate.f1 - baseline.aggregate.f1;

  // Every outcome carries the same supporting data; only the verdict differs.
  const verdict = (promote, reason) => ({
    promote,
    reason,
    aggregateDelta,
    comparisons,
    regressions
  });

  if (missingRequiredSplits.length > 0) {
    return verdict(false, `Required split missing from baseline or candidate: ${missingRequiredSplits.join(", ")}`);
  }
  if (compared.length === 0) {
    return verdict(false, `No required split scores found: ${requiredSplits.join(", ")}`);
  }
  if (regressions.length > 0) {
    return verdict(false, `Regression in ${regressions.map((r) => r.split).join(", ")}`);
  }
  if (requireHoldout && (!hasSplit(baseline, "holdout") || !hasSplit(candidate, "holdout"))) {
    return verdict(false, "Holdout split is required for promotion");
  }

  const requiredMeanDelta = mean4(compared.map((item) => item.f1Delta));
  if (requiredMeanDelta < minF1Delta) {
    return verdict(false, `Required split F1 delta ${formatPct(requiredMeanDelta)} below ${formatPct(minF1Delta)}`);
  }
  return verdict(true, `Required splits improved by ${formatPct(requiredMeanDelta)} with no regressions`);
}
9658
/**
 * Default similarity between a reference item and a candidate item.
 *
 * Combines token overlap of title + description (weighted 0.85), tag overlap
 * (weighted 0.15) and a severity adjustment (+0.1 when both severities are
 * set and equal after normalisation, -0.05 when both are set and differ,
 * 0 when either is missing). The total is clamped by `clamp012`.
 *
 * @param {object} reference `{ title, description?, severity?, tags? }`.
 * @param {object} candidate `{ title, description?, severity?, tags? }`.
 * @returns {{score: number, reason: string}} Score plus a breakdown string.
 */
function defaultReferenceReplayMatcher(reference, candidate) {
  const describe = (item) => `${item.title} ${item.description ?? ""}`;
  const textScore = tokenJaccard(describe(reference), describe(candidate));

  let severityScore = 0;
  if (reference.severity && candidate.severity) {
    const sameSeverity = normalize(reference.severity) === normalize(candidate.severity);
    severityScore = sameSeverity ? 0.1 : -0.05;
  }

  const tagScore = tagOverlap(reference.tags, candidate.tags) * 0.15;
  const score = clamp012(textScore * 0.85 + tagScore + severityScore);
  const reason = `token=${textScore.toFixed(2)} tags=${tagScore.toFixed(2)} severity=${severityScore.toFixed(2)}`;
  return { score, reason };
}
9667
/**
 * Greedily match each reference of a scenario to its best remaining candidate
 * and compute precision / recall / F1 for the scenario.
 *
 * References are processed in order. Each one takes the highest-scoring
 * candidate still available; when that score reaches `threshold` the
 * candidate is consumed and cannot match a later reference. Candidates left
 * over at the end count as false positives.
 *
 * @param {object} scenario `{ id, references, candidates, split? }`.
 * @param {Function} matcher `(reference, candidate, scenario)` returning
 *   `{ score, reason? }`.
 * @param {number} threshold Minimum clamped score for a match.
 * @returns {object} `{ scenarioId, split, matched, total, falsePositives,
 *   matchedWeight, totalWeight, precision, recall, f1, matches }`.
 * @throws {Error} When the matcher returns a non-finite score.
 */
function scoreScenario(scenario, matcher, threshold) {
  const candidatesLeft = scenario.candidates.map((candidate, index) => ({ candidate, index }));
  const matches2 = [];
  for (const reference of scenario.references) {
    let best = null;
    for (const item of candidatesLeft) {
      const result = matcher(reference, item.candidate, scenario);
      if (!Number.isFinite(result.score)) {
        throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${item.candidate.id}`);
      }
      // Clamp before comparing: `best.score` is stored clamped, so comparing
      // the raw score against it would let a later, lower out-of-range score
      // (e.g. 1.5 after 2) displace the earlier, better candidate.
      const score = clamp012(result.score);
      if (!best || score > best.score) {
        best = { ...item, score, reason: result.reason ?? "" };
      }
    }
    const weight = reference.weight ?? 1;
    if (best && best.score >= threshold) {
      // Consume the candidate so it cannot match another reference.
      const matchIndex = candidatesLeft.findIndex((item) => item.index === best.index);
      if (matchIndex >= 0) candidatesLeft.splice(matchIndex, 1);
      matches2.push({
        scenarioId: scenario.id,
        referenceId: reference.id,
        candidateId: best.candidate.id,
        score: best.score,
        matched: true,
        weight,
        reason: best.reason
      });
    } else {
      // Record the nearest miss, if any, for diagnostics.
      matches2.push({
        scenarioId: scenario.id,
        referenceId: reference.id,
        candidateId: best?.candidate.id ?? null,
        score: best?.score ?? 0,
        matched: false,
        weight,
        reason: best?.reason ?? "no candidates"
      });
    }
  }
  const hits = matches2.filter((match) => match.matched);
  const matched = hits.length;
  const total = scenario.references.length;
  const falsePositives = candidatesLeft.length;
  const matchedWeight = hits.reduce((sum2, match) => sum2 + match.weight, 0);
  const totalWeight = matches2.reduce((sum2, match) => sum2 + match.weight, 0);
  const precision = ratio(matched, matched + falsePositives);
  const recall = ratio(matched, total);
  return {
    scenarioId: scenario.id,
    split: scenario.split ?? "train",
    matched,
    total,
    falsePositives,
    matchedWeight,
    totalWeight,
    precision,
    recall,
    f1: f1(precision, recall),
    matches: matches2
  };
}
9727
/**
 * Aggregate scenario scores separately for each known split.
 *
 * @param {Array<{split: string}>} scores Per-scenario scores.
 * @returns {object} Map from split name to its aggregate, in canonical split
 *   order. Splits with no scenarios are omitted, and scores whose split is
 *   not one of the known splits are not included.
 */
function aggregateBySplit(scores) {
  return Object.fromEntries(
    ALL_SPLITS
      .map((split) => [split, scores.filter((score) => score.split === split)])
      .filter(([, scoped]) => scoped.length > 0)
      .map(([split, scoped]) => [split, aggregateScenarioScores(scoped)])
  );
}
9735
/**
 * Pool per-scenario counts into one aggregate (micro-average): counts are
 * summed first, then precision, recall and F1 are derived from the totals.
 *
 * @param {Array<object>} scores Per-scenario scores with `matched`, `total`,
 *   `falsePositives`, `matchedWeight` and `totalWeight`.
 * @returns {object} `{ matched, total, falsePositives, matchedWeight,
 *   totalWeight, precision, recall, f1, weightedRecall }`.
 */
function aggregateScenarioScores(scores) {
  let matched = 0;
  let total = 0;
  let falsePositives = 0;
  let matchedWeight = 0;
  let totalWeight = 0;
  scores.forEach((score) => {
    matched += score.matched;
    total += score.total;
    falsePositives += score.falsePositives;
    matchedWeight += score.matchedWeight;
    totalWeight += score.totalWeight;
  });

  const precision = ratio(matched, matched + falsePositives);
  const recall = ratio(matched, total);
  return {
    matched,
    total,
    falsePositives,
    matchedWeight,
    totalWeight,
    precision,
    recall,
    f1: f1(precision, recall),
    weightedRecall: ratio(matchedWeight, totalWeight)
  };
}
9755
/**
 * Aggregate with every field set to zero, used as the stand-in for a split
 * that has no scores.
 *
 * @returns {object} A new object on every call.
 */
function emptyAggregate() {
  const fields = [
    "matched",
    "total",
    "falsePositives",
    "matchedWeight",
    "totalWeight",
    "precision",
    "recall",
    "f1",
    "weightedRecall"
  ];
  return Object.fromEntries(fields.map((field) => [field, 0]));
}
9768
/**
 * Whether a score carries an aggregate for the given split.
 *
 * @param {{bySplit: object}} score Scored replay.
 * @param {string} split Split name.
 * @returns {boolean} True when `score.bySplit[split]` is not `undefined`.
 */
function hasSplit(score, split) {
  const entry = score.bySplit[split];
  return typeof entry !== "undefined";
}
9771
/**
 * Harmonic mean of precision and recall.
 *
 * @param {number} precision
 * @param {number} recall
 * @returns {number} F1 score, or 0 when precision and recall sum to zero.
 */
function f1(precision, recall) {
  const denominator = precision + recall;
  if (denominator === 0) {
    return 0;
  }
  return 2 * precision * recall / denominator;
}
9774
/**
 * Safe division.
 *
 * @param {number} numerator
 * @param {number} denominator
 * @returns {number} `numerator / denominator`, or 0 unless the denominator is
 *   greater than zero.
 */
function ratio(numerator, denominator) {
  if (denominator > 0) {
    return numerator / denominator;
  }
  return 0;
}
9777
/**
 * Jaccard similarity of the distinct tokens of two texts.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Shared tokens over total distinct tokens, in [0, 1];
 *   0 when either text yields no tokens.
 */
function tokenJaccard(a, b) {
  const left = new Set(tokens(a));
  const right = new Set(tokens(b));
  if (left.size === 0 || right.size === 0) {
    return 0;
  }
  const shared = [...left].filter((token) => right.has(token)).length;
  const unionSize = left.size + right.size - shared;
  return shared / unionSize;
}
9787
/**
 * Overlap between two tag lists after normalisation.
 *
 * @param {string[]} [a]
 * @param {string[]} [b]
 * @returns {number} Shared distinct tags divided by the size of the larger
 *   distinct set; 0 when either list is missing or empty.
 */
function tagOverlap(a, b) {
  if (!a?.length || !b?.length) {
    return 0;
  }
  const left = new Set(a.map(normalize));
  const right = new Set(b.map(normalize));
  const shared = [...left].reduce((count, tag) => right.has(tag) ? count + 1 : count, 0);
  return shared / Math.max(left.size, right.size);
}
9797
/**
 * Split text into comparison tokens: normalised words of at least three
 * characters that are not stop words.
 *
 * @param {string} text
 * @returns {string[]} Tokens in order of appearance; duplicates are kept.
 */
function tokens(text) {
  const words = normalize(text).split(/\s+/);
  return words.filter((word) => !(word.length < 3 || STOP_WORDS.has(word)));
}
9800
/**
 * Normalise text for fuzzy comparison: lower-case it and collapse every run
 * of characters that are not letters, combining marks or digits into a
 * single space.
 *
 * Letters and digits are recognised via Unicode properties, so accented and
 * non-Latin text keeps its characters instead of being reduced to spaces.
 * The result for pure-ASCII input is the same as with an `a-z0-9` class.
 *
 * @param {string} text
 * @returns {string} Normalised text with no leading or trailing space.
 */
function normalize(text) {
  return text.toLowerCase().replace(/[^\p{L}\p{M}\p{N}]+/gu, " ").trim();
}
9803
/**
 * Clamp a number into the range [0, 1].
 *
 * @param {number} value
 * @returns {number} 0 for non-finite input or values at or below zero, 1 for
 *   values above one, otherwise the value itself.
 */
function clamp012(value) {
  if (!Number.isFinite(value)) {
    return 0;
  }
  // `<=` also maps -0 to +0.
  if (value <= 0) {
    return 0;
  }
  return value > 1 ? 1 : value;
}
9807
/**
 * Add up a list of numbers from left to right.
 *
 * @param {number[]} values
 * @returns {number} The total; 0 for an empty list.
 */
function sum(values) {
  let total = 0;
  values.forEach((value) => {
    total = total + value;
  });
  return total;
}
9810
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values
 * @returns {number} The mean; 0 for an empty list.
 */
function mean4(values) {
  const count = values.length;
  if (!count) {
    return 0;
  }
  return sum(values) / count;
}
9813
/**
 * Format a fraction as a percentage with one decimal place.
 *
 * @param {number} value Fraction, e.g. 0.25.
 * @returns {string} E.g. "25.0%".
 */
function formatPct(value) {
  const percent = value * 100;
  return percent.toFixed(1).concat("%");
}
9816
/**
 * Sort comparator that orders split names by their position in `ALL_SPLITS`.
 * Names not in `ALL_SPLITS` have position -1 and therefore sort first.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Negative, zero or positive, as `Array.prototype.sort` expects.
 */
function bySplitOrder(a, b) {
  const rank = (split) => ALL_SPLITS.indexOf(split);
  return rank(a) - rank(b);
}
9819
/**
 * Invoke an adapter that is either a plain function or an object with a
 * `run` method.
 *
 * @param {Function|{run: Function}} adapter
 * @param {object} scenario Passed as the first argument.
 * @param {object} context Passed as the second argument.
 * @returns {*} Whatever the adapter returns.
 */
function runAdapter(adapter, scenario, context) {
  if (typeof adapter === "function") {
    return adapter(scenario, context);
  }
  return adapter.run(scenario, context);
}
9822
/**
 * Throw when the given abort signal has been aborted; otherwise do nothing.
 *
 * @param {AbortSignal} [signal] May be omitted.
 * @throws {Error} The signal's `reason` when it is an Error; otherwise a new
 *   Error whose message is the stringified reason, or
 *   "reference replay aborted" when the reason is falsy.
 */
function throwIfAborted(signal) {
  const aborted = Boolean(signal?.aborted);
  if (!aborted) {
    return;
  }
  const { reason } = signal;
  if (reason instanceof Error) {
    throw reason;
  }
  const message = reason ? String(reason) : "reference replay aborted";
  throw new Error(message);
}
9827
/**
 * Read a JSON Lines file and parse every non-blank line.
 *
 * @param {string} path File to read (UTF-8).
 * @returns {Array<*>} Parsed values in file order.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the
 *   file and the 1-based line number, and the original parse error is
 *   attached as `cause`.
 */
function readJsonl(path) {
  const raw = readFileSync4(path, "utf8");
  const out = [];
  const lines = raw.split("\n");
  for (let i = 0; i < lines.length; i++) {
    // trim() also drops the "\r" left behind by CRLF line endings.
    const trimmed = lines[i].trim();
    if (!trimmed) continue;
    try {
      out.push(JSON.parse(trimmed));
    } catch (cause) {
      // A bare JSON.parse error gives no hint which record is corrupt.
      const detail = cause instanceof Error ? cause.message : String(cause);
      throw new SyntaxError(`invalid JSON on line ${i + 1} of ${path}: ${detail}`, { cause });
    }
  }
  return out;
}
9837
// Common English function words that are ignored when text is tokenised for
// comparison.
var STOP_WORDS = /* @__PURE__ */ new Set(
  "the and for with that this from into onto are can will should could would when where which".split(" ")
);
7813
9857
  export {
7814
9858
  AgentDriver,
7815
9859
  AxGepaSteeringOptimizer,
@@ -7820,7 +9864,9 @@ export {
7820
9864
  ConvergenceTracker,
7821
9865
  CostTracker,
7822
9866
  DEFAULT_AGENT_SLOS,
9867
+ DEFAULT_COMPLEXITY_WEIGHTS,
7823
9868
  DEFAULT_RULES as DEFAULT_FAILURE_RULES,
9869
+ DEFAULT_FINDERS,
7824
9870
  DEFAULT_HARNESS_OBJECTIVES,
7825
9871
  DEFAULT_MUTATORS,
7826
9872
  DEFAULT_REDACTION_RULES,
@@ -7829,19 +9875,24 @@ export {
7829
9875
  Dataset,
7830
9876
  DockerSandboxDriver,
7831
9877
  DualAgentBench,
9878
+ ERROR_COUNT_PATTERNS,
7832
9879
  ExperimentTracker,
7833
9880
  FAILURE_CLASSES,
7834
9881
  FileSystemOutcomeStore,
7835
9882
  FileSystemTraceStore,
7836
9883
  HoldoutAuditor,
7837
9884
  HoldoutLockedError,
9885
+ INTENT_MATCH_JUDGE_VERSION,
7838
9886
  InMemoryExperimentStore,
7839
9887
  InMemoryOutcomeStore,
7840
9888
  InMemoryTraceStore,
7841
9889
  InMemoryWorkspaceInspector,
7842
9890
  JudgeRunner,
9891
+ LlmCallError,
9892
+ LlmClient,
7843
9893
  MODEL_PRICING,
7844
9894
  MetricsCollector,
9895
+ MultiLayerVerifier,
7845
9896
  OTEL_AGENT_EVAL_SCOPE,
7846
9897
  OptimizationLoop,
7847
9898
  PairwiseSteeringOptimizer,
@@ -7852,12 +9903,14 @@ export {
7852
9903
  PromptRegistry,
7853
9904
  REDACTION_VERSION,
7854
9905
  RunCritic,
9906
+ SEMANTIC_CONCEPT_JUDGE_VERSION,
7855
9907
  SandboxHarness,
7856
9908
  ScenarioRegistry,
7857
9909
  SubprocessSandboxDriver,
7858
9910
  TRACE_SCHEMA_VERSION,
7859
9911
  TokenCounter,
7860
9912
  TraceEmitter,
9913
+ UNIVERSAL_FINDERS,
7861
9914
  adversarialJudge,
7862
9915
  aggregateLlm,
7863
9916
  aggregateRunScore,
@@ -7869,10 +9922,13 @@ export {
7869
9922
  bisect,
7870
9923
  bonferroni,
7871
9924
  budgetBreachView,
9925
+ buildReviewerPrompt,
7872
9926
  buildTrajectory,
7873
9927
  byteLengthRange,
7874
9928
  calibrateJudge,
7875
9929
  calibrationCurve,
9930
+ callLlm,
9931
+ callLlmJson,
7876
9932
  canaryLeakView,
7877
9933
  causalAttribution,
7878
9934
  checkCanaries,
@@ -7885,6 +9941,7 @@ export {
7885
9941
  coherenceJudge,
7886
9942
  collectionPreserved,
7887
9943
  commitBisect,
9944
+ compareReferenceReplay,
7888
9945
  compareToBaseline,
7889
9946
  compilerJudge,
7890
9947
  composeParsers,
@@ -7896,10 +9953,17 @@ export {
7896
9953
  correlationStudy,
7897
9954
  createAntiSlopJudge,
7898
9955
  createCustomJudge,
9956
+ createDefaultReviewer,
7899
9957
  createDomainExpertJudge,
9958
+ createIntentMatchJudge,
7900
9959
  createLlmReviewer,
9960
+ createSemanticConceptJudge,
7901
9961
  crossTraceDiff,
9962
+ decideReferenceReplayPromotion,
9963
+ decideReferenceReplayRunPromotion,
7902
9964
  defaultJudges,
9965
+ defaultReferenceReplayMatcher,
9966
+ deployGateLayer,
7903
9967
  distillPlaybook,
7904
9968
  dominates,
7905
9969
  estimateCost,
@@ -7913,15 +9977,27 @@ export {
7913
9977
  exportRewardModel,
7914
9978
  exportRunAsOtlp,
7915
9979
  exportTrainingData,
9980
+ extractAssetUrls,
9981
+ extractErrorCount,
7916
9982
  failureClusterView,
7917
9983
  fileContains,
7918
9984
  fileExists,
9985
+ findAutoMatchNoExpectation,
9986
+ findConstructorCwdDropped,
9987
+ findFallbackToPass,
9988
+ findLiteralTruePass,
9989
+ findSkipCountsAsPass,
7919
9990
  firstDivergenceView,
9991
+ flowLayer,
7920
9992
  formatBenchmarkReport,
7921
9993
  formatDriverReport,
9994
+ formatFindings,
9995
+ gradeSemanticStatus,
7922
9996
  groupBy,
7923
9997
  hashContent,
7924
9998
  hashScenarios,
9999
+ htmlContainsElement,
10000
+ inMemoryReferenceReplayStore,
7925
10001
  inMemoryReviewStore,
7926
10002
  interRaterReliability,
7927
10003
  iqr,
@@ -7934,6 +10010,7 @@ export {
7934
10010
  jestTestParser,
7935
10011
  jsonHasKeys,
7936
10012
  jsonShape,
10013
+ jsonlReferenceReplayStore,
7937
10014
  jsonlReviewStore,
7938
10015
  judgeAgreementView,
7939
10016
  judgeSpans,
@@ -7942,9 +10019,12 @@ export {
7942
10019
  llmSpanFromProvider,
7943
10020
  llmSpans,
7944
10021
  loadScorerFromGrader,
10022
+ localCommandRunner,
7945
10023
  lowercaseMutator,
7946
10024
  mannWhitneyU,
10025
+ mergeLayerResults,
7947
10026
  mergeSteeringBundle,
10027
+ multiToolchainLayer,
7948
10028
  nistAiRmfReport,
7949
10029
  nonRefusalRubric,
7950
10030
  normalizeScores,
@@ -7960,6 +10040,7 @@ export {
7960
10040
  printDriverSummary,
7961
10041
  prmBestOfN,
7962
10042
  prmEnsembleBestOfN,
10043
+ probeLlm,
7963
10044
  promptBisect,
7964
10045
  proposeSynthesisTargets,
7965
10046
  pytestTestParser,
@@ -7986,15 +10067,22 @@ export {
7986
10067
  runExpectations,
7987
10068
  runFailureClass,
7988
10069
  runHarnessExperiment,
10070
+ runIntentMatchJudge,
7989
10071
  runJudgeFleet,
10072
+ runKeywordCoverageJudge,
10073
+ runKeywordCoverageJudgeUrl,
7990
10074
  runProposeReview,
10075
+ runReferenceReplay,
7991
10076
  runSelfPlay,
10077
+ runSemanticConceptJudge,
7992
10078
  runTestGradedScenario,
7993
10079
  runsForScenario,
10080
+ scanForMuffledGates,
7994
10081
  scoreAllProjects,
7995
10082
  scoreContinuity,
7996
10083
  scoreProject,
7997
10084
  scoreRedTeamOutput,
10085
+ scoreReferenceReplay,
7998
10086
  securityJudge,
7999
10087
  selectHarnessVariant,
8000
10088
  selfPreference,
@@ -8002,6 +10090,7 @@ export {
8002
10090
  signManifest,
8003
10091
  soc2Report,
8004
10092
  statusAdvanced,
10093
+ stripFencedJson,
8005
10094
  stuckLoopView,
8006
10095
  summarize,
8007
10096
  summarizeHarnessResults,
@@ -8021,6 +10110,7 @@ export {
8021
10110
  verbosityBias,
8022
10111
  verifyManifest,
8023
10112
  visualDiff,
10113
+ viteDeployRunner,
8024
10114
  vitestTestParser,
8025
10115
  weightedMean,
8026
10116
  welchsTTest,