@tangle-network/agent-eval 0.7.2 → 0.11.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +1284 -1
- package/dist/index.js +2050 -42
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -410,22 +410,22 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
410
410
|
if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
411
411
|
if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
|
|
412
412
|
const n = scores.length;
|
|
413
|
-
const
|
|
413
|
+
const mean5 = scores.reduce((a, b) => a + b, 0) / n;
|
|
414
414
|
const B = 1e3;
|
|
415
415
|
const bootstrapMeans = [];
|
|
416
416
|
for (let i = 0; i < B; i++) {
|
|
417
|
-
let
|
|
417
|
+
let sum2 = 0;
|
|
418
418
|
for (let j = 0; j < n; j++) {
|
|
419
|
-
|
|
419
|
+
sum2 += scores[Math.floor(Math.random() * n)];
|
|
420
420
|
}
|
|
421
|
-
bootstrapMeans.push(
|
|
421
|
+
bootstrapMeans.push(sum2 / n);
|
|
422
422
|
}
|
|
423
423
|
bootstrapMeans.sort((a, b) => a - b);
|
|
424
424
|
const alpha = 1 - confidence;
|
|
425
425
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
426
426
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
427
427
|
return {
|
|
428
|
-
mean:
|
|
428
|
+
mean: mean5,
|
|
429
429
|
lower: bootstrapMeans[lowerIdx],
|
|
430
430
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
431
431
|
};
|
|
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
|
|
|
513
513
|
const n = before.length;
|
|
514
514
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
515
515
|
const diffs = before.map((b, i) => after[i] - b);
|
|
516
|
-
const
|
|
517
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
516
|
+
const mean5 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
517
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean5) ** 2, 0) / (n - 1);
|
|
518
518
|
const se = Math.sqrt(variance2 / n);
|
|
519
|
-
if (se === 0) return { t:
|
|
520
|
-
const t =
|
|
519
|
+
if (se === 0) return { t: mean5 === 0 ? 0 : Infinity, df: n - 1, p: mean5 === 0 ? 1 : 0 };
|
|
520
|
+
const t = mean5 / se;
|
|
521
521
|
const df = n - 1;
|
|
522
522
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
523
523
|
return { t, df, p };
|
|
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
|
|
|
541
541
|
}
|
|
542
542
|
let wPlus = 0;
|
|
543
543
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
544
|
-
const
|
|
544
|
+
const mean5 = n * (n + 1) / 4;
|
|
545
545
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
546
|
-
const z = (wPlus -
|
|
546
|
+
const z = (wPlus - mean5) / Math.sqrt(variance2);
|
|
547
547
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
548
548
|
return { w: wPlus, p };
|
|
549
549
|
}
|
|
@@ -1562,12 +1562,12 @@ function analyzeAntiSlop(outputs, config) {
|
|
|
1562
1562
|
}
|
|
1563
1563
|
let dupes = 0;
|
|
1564
1564
|
for (const n of seen.values()) if (n > 1) dupes += n - 1;
|
|
1565
|
-
const
|
|
1566
|
-
if (
|
|
1565
|
+
const ratio2 = dupes / sentences.length;
|
|
1566
|
+
if (ratio2 > config.repetitionThreshold) {
|
|
1567
1567
|
counts.repetition += 1;
|
|
1568
1568
|
issues.push({
|
|
1569
1569
|
category: "repetition",
|
|
1570
|
-
detail: `${(
|
|
1570
|
+
detail: `${(ratio2 * 100).toFixed(0)}% duplicated (threshold ${(config.repetitionThreshold * 100).toFixed(0)}%)`
|
|
1571
1571
|
});
|
|
1572
1572
|
}
|
|
1573
1573
|
}
|
|
@@ -2187,7 +2187,7 @@ var RunCritic = class {
|
|
|
2187
2187
|
);
|
|
2188
2188
|
const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
|
|
2189
2189
|
if (!success) notes.push("run did not complete with pass=true");
|
|
2190
|
-
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((
|
|
2190
|
+
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
|
|
2191
2191
|
const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
|
|
2192
2192
|
const goalProgress = outcomeScore ?? judgeAverage ?? success;
|
|
2193
2193
|
const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
|
|
@@ -2197,7 +2197,7 @@ var RunCritic = class {
|
|
|
2197
2197
|
const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
|
|
2198
2198
|
if (!patchQuality) notes.push("no artifact or edit evidence recorded");
|
|
2199
2199
|
const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
|
|
2200
|
-
const testReality = sandboxTests.length ? sandboxTests.reduce((
|
|
2200
|
+
const testReality = sandboxTests.length ? sandboxTests.reduce((sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
|
|
2201
2201
|
if (!testReality) notes.push("no real test/build evidence recorded");
|
|
2202
2202
|
const blockerSpans = judgeSpans2.filter(
|
|
2203
2203
|
(span) => isBlockingJudge(span)
|
|
@@ -2213,7 +2213,7 @@ var RunCritic = class {
|
|
|
2213
2213
|
const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
|
|
2214
2214
|
const driftPenalty = positiveGroundingSignals + driftSignals === 0 ? 0 : driftSignals / (positiveGroundingSignals + driftSignals);
|
|
2215
2215
|
if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);
|
|
2216
|
-
const costUsd = trace.budget.length ? Math.max(...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed), 0) : llmSpans2.reduce((
|
|
2216
|
+
const costUsd = trace.budget.length ? Math.max(...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed), 0) : llmSpans2.reduce((sum2, span) => sum2 + (span.costUsd ?? 0), 0);
|
|
2217
2217
|
const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
|
|
2218
2218
|
return {
|
|
2219
2219
|
success,
|
|
@@ -2423,7 +2423,7 @@ function rankRows(rows, weights) {
|
|
|
2423
2423
|
}
|
|
2424
2424
|
return [...buckets.entries()].map(([variantId, values]) => ({
|
|
2425
2425
|
variantId,
|
|
2426
|
-
mean: values.reduce((
|
|
2426
|
+
mean: values.reduce((sum2, value) => sum2 + value, 0) / values.length,
|
|
2427
2427
|
runs: values.length
|
|
2428
2428
|
})).sort((a, b) => b.mean - a.mean);
|
|
2429
2429
|
}
|
|
@@ -2571,7 +2571,7 @@ async function mapLimit(items, limit, fn) {
|
|
|
2571
2571
|
return results;
|
|
2572
2572
|
}
|
|
2573
2573
|
function mean(values) {
|
|
2574
|
-
return values.length ? values.reduce((
|
|
2574
|
+
return values.length ? values.reduce((sum2, value) => sum2 + value, 0) / values.length : 0;
|
|
2575
2575
|
}
|
|
2576
2576
|
function meanRunScore(scores) {
|
|
2577
2577
|
return {
|
|
@@ -5095,10 +5095,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
5095
5095
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
5096
5096
|
}
|
|
5097
5097
|
const tail = values.slice(-window);
|
|
5098
|
-
const
|
|
5099
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
5098
|
+
const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
5099
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
|
|
5100
5100
|
const stdDev = Math.sqrt(variance2);
|
|
5101
|
-
const refMean = Math.abs(
|
|
5101
|
+
const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
|
|
5102
5102
|
const cv = stdDev / refMean;
|
|
5103
5103
|
const stable = tail.length >= window && cv <= stableCv;
|
|
5104
5104
|
let tailRun = 0;
|
|
@@ -5119,7 +5119,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
5119
5119
|
} else {
|
|
5120
5120
|
state = "noisy";
|
|
5121
5121
|
}
|
|
5122
|
-
return { state, windowMean:
|
|
5122
|
+
return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
|
|
5123
5123
|
}
|
|
5124
5124
|
|
|
5125
5125
|
// src/state-continuity.ts
|
|
@@ -5712,8 +5712,8 @@ function calibrateJudge(golden, candidate) {
|
|
|
5712
5712
|
const kappa = weightedKappa(humans.map(Math.round), judges.map(Math.round));
|
|
5713
5713
|
const absDiffs = common.map((c) => Math.abs(c.j - c.h));
|
|
5714
5714
|
const mae = absDiffs.reduce((a, b) => a + b, 0) / n;
|
|
5715
|
-
const
|
|
5716
|
-
return { n, pearson: pearson2, kappa, mae, worstItems:
|
|
5715
|
+
const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
|
|
5716
|
+
return { n, pearson: pearson2, kappa, mae, worstItems: worst2 };
|
|
5717
5717
|
}
|
|
5718
5718
|
function positionalBias(scores) {
|
|
5719
5719
|
const pairs = /* @__PURE__ */ new Map();
|
|
@@ -6047,12 +6047,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
6047
6047
|
variantScores.push({ mutator: id, score, mutated });
|
|
6048
6048
|
all.push(score);
|
|
6049
6049
|
}
|
|
6050
|
-
const
|
|
6051
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
6050
|
+
const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
6051
|
+
const variance2 = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
|
|
6052
6052
|
const stdDev = Math.sqrt(variance2);
|
|
6053
|
-
const ref = Math.abs(
|
|
6053
|
+
const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
|
|
6054
6054
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
6055
|
-
return { originalScore, variantScores, meanScore:
|
|
6055
|
+
return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
|
|
6056
6056
|
}
|
|
6057
6057
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
6058
6058
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -6114,9 +6114,9 @@ function visualDiff(a, b, options = {}) {
|
|
|
6114
6114
|
const dg = Math.abs(a.data[i + 1] - b.data[i + 1]);
|
|
6115
6115
|
const db = Math.abs(a.data[i + 2] - b.data[i + 2]);
|
|
6116
6116
|
const da = Math.abs(a.data[i + 3] - b.data[i + 3]);
|
|
6117
|
-
const
|
|
6118
|
-
if (
|
|
6119
|
-
if (
|
|
6117
|
+
const worst2 = Math.max(dr, dg, db, da);
|
|
6118
|
+
if (worst2 > maxDelta) maxDelta = worst2;
|
|
6119
|
+
if (worst2 > tolerance) differing++;
|
|
6120
6120
|
}
|
|
6121
6121
|
const diffRatio = totalPixels > 0 ? differing / totalPixels : 0;
|
|
6122
6122
|
const status = diffRatio < 1e-3 ? "unchanged" : diffRatio > 0.05 ? "severely-changed" : "changed";
|
|
@@ -6973,8 +6973,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
6973
6973
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
6974
6974
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
6975
6975
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
6976
|
-
const
|
|
6977
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
6976
|
+
const mean5 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
6977
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / graded.length;
|
|
6978
6978
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
6979
6979
|
}
|
|
6980
6980
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -6996,8 +6996,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
6996
6996
|
const ranked = [...byRun.values()].sort(
|
|
6997
6997
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
6998
6998
|
);
|
|
6999
|
-
const
|
|
7000
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
6999
|
+
const mean5 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
7000
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / ranked.length;
|
|
7001
7001
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
7002
7002
|
}
|
|
7003
7003
|
|
|
@@ -7055,7 +7055,7 @@ async function commitBisect(options) {
|
|
|
7055
7055
|
}
|
|
7056
7056
|
async function promptBisect(options) {
|
|
7057
7057
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
7058
|
-
const
|
|
7058
|
+
const join3 = (paragraphs) => paragraphs.join("\n\n");
|
|
7059
7059
|
const goodParas = split(options.good);
|
|
7060
7060
|
const badParas = split(options.bad);
|
|
7061
7061
|
if (goodParas.length !== badParas.length) {
|
|
@@ -7073,7 +7073,7 @@ async function promptBisect(options) {
|
|
|
7073
7073
|
const result = await bisect({
|
|
7074
7074
|
good: goodMask,
|
|
7075
7075
|
bad: badMask,
|
|
7076
|
-
runEval: (mask) => options.runEval(
|
|
7076
|
+
runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
|
|
7077
7077
|
maxIterations: options.maxIterations ?? n + 5,
|
|
7078
7078
|
halfway: (g, b) => {
|
|
7079
7079
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -7104,12 +7104,12 @@ async function promptBisect(options) {
|
|
|
7104
7104
|
}
|
|
7105
7105
|
}
|
|
7106
7106
|
const materializedPath = result.path.map((s) => ({
|
|
7107
|
-
state:
|
|
7107
|
+
state: join3(paragraphsFor(s.state)),
|
|
7108
7108
|
score: s.score,
|
|
7109
7109
|
pass: s.pass
|
|
7110
7110
|
}));
|
|
7111
7111
|
return {
|
|
7112
|
-
culprit:
|
|
7112
|
+
culprit: join3(paragraphsFor(culprit)),
|
|
7113
7113
|
path: materializedPath,
|
|
7114
7114
|
converged: result.converged,
|
|
7115
7115
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -7527,8 +7527,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7527
7527
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7528
7528
|
const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7529
7529
|
if (scores.length < 3) continue;
|
|
7530
|
-
const
|
|
7531
|
-
const variance2 = scores.reduce((a, b) => a + (b -
|
|
7530
|
+
const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
7531
|
+
const variance2 = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
|
|
7532
7532
|
if (variance2 > varianceThreshold) {
|
|
7533
7533
|
targets.push({
|
|
7534
7534
|
reason: "high-variance",
|
|
@@ -8007,6 +8007,1975 @@ async function euAiActReport(ctx, signals) {
|
|
|
8007
8007
|
generatedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
8008
8008
|
};
|
|
8009
8009
|
}
|
|
8010
|
+
|
|
8011
|
+
// src/llm-client.ts
|
|
8012
|
+
/**
 * Error type used for non-OK HTTP responses from the LLM endpoint.
 * Besides the message it exposes the HTTP `status`, the raw response `body`
 * and the `model` that was requested.
 */
var LlmCallError = class extends Error {
  status;
  body;
  model;
  constructor(message, status, body, model) {
    super(message);
    Object.assign(this, { status, body, model });
    this.name = "LlmCallError";
  }
};
|
|
8024
|
+
// Module-level defaults for the chat-completions client.
var DEFAULT_BASE_URL = "https://router.tangle.tools/v1";
// Per-request timeout in milliseconds (60 seconds).
var DEFAULT_TIMEOUT_MS = 60000;
var DEFAULT_MAX_RETRIES = 3;
// HTTP statuses for which a request may be attempted again.
var RETRYABLE_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
|
|
8028
|
+
/**
 * Decides whether a failed attempt is worth repeating.
 *
 * - `LlmCallError`: retryable when its HTTP status is in `RETRYABLE_STATUS`.
 * - Any other `Error`: retryable when it is an abort/timeout error or its
 *   message matches one of the known transient network failure patterns.
 * - Non-Error values: never retryable.
 *
 * @param {unknown} err - Value caught from a failed call.
 * @returns {boolean} True when the call may be retried.
 */
function isRetryableError(err) {
  if (err instanceof LlmCallError) {
    return RETRYABLE_STATUS.has(err.status);
  }
  if (!(err instanceof Error)) {
    return false;
  }
  const abortedOrTimedOut = err.name === "AbortError" || err.name === "TimeoutError";
  return abortedOrTimedOut || /fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN/i.test(err.message);
}
|
|
8035
|
+
/**
 * Converts a `Retry-After` response header into a delay in milliseconds.
 *
 * The header may carry either a number of seconds or an HTTP date.
 * Numeric values are handled explicitly: `0` means "retry immediately" and
 * yields 0, while a negative number is treated as invalid. Previously both
 * fell through to `Date.parse`, whose handling of bare numerals is
 * engine-dependent.
 *
 * @param {{ get(name: string): string | null }} headers - Response headers.
 * @returns {number | null} Delay in milliseconds (never negative), or null
 *   when the header is absent, blank, or unparseable.
 */
function parseRetryAfter(headers) {
  const raw = headers.get("retry-after")?.trim();
  if (!raw) return null;
  const seconds = Number(raw);
  if (Number.isFinite(seconds)) {
    return seconds >= 0 ? seconds * 1e3 : null;
  }
  // Not a number: interpret as a date and clamp dates in the past to zero.
  const retryAt = Date.parse(raw);
  return Number.isFinite(retryAt) ? Math.max(0, retryAt - Date.now()) : null;
}
|
|
8044
|
+
/**
 * Exponential backoff delay: 500 ms doubled per attempt, capped at 16 s.
 *
 * @param {number} attempt - Zero-based attempt index.
 * @returns {number} Delay in milliseconds.
 */
function backoffMs(attempt) {
  const uncapped = 500 * 2 ** attempt;
  return Math.min(uncapped, 16e3);
}
|
|
8047
|
+
/**
 * Builds the HTTP headers for a chat-completions request.
 *
 * Authentication precedence:
 *   1. `opts.authHeader` ({ name, value }) is used verbatim when present.
 *   2. Otherwise the first non-empty of `opts.bearer` / `opts.apiKey` becomes
 *      an `Authorization: Bearer …` header.
 *
 * The same token selection now drives both the "should we send a header"
 * check and the header value. Previously an empty `bearer` combined with a
 * valid `apiKey` produced `Authorization: "Bearer "` with no token.
 *
 * @param {object} opts - Client options.
 * @returns {Record<string, string>} Header map.
 */
function buildHeaders(opts) {
  const headers = {
    "Content-Type": "application/json",
    Accept: "application/json"
  };
  if (opts.authHeader) {
    headers[opts.authHeader.name] = opts.authHeader.value;
    return headers;
  }
  const token = opts.bearer || opts.apiKey;
  if (token) {
    headers.Authorization = `Bearer ${token}`;
  }
  return headers;
}
|
|
8059
|
+
/**
 * Detects a 400 response whose body mentions structured-output markers.
 * Only status 400 qualifies; the body is matched case-insensitively against
 * a fixed list of substrings.
 *
 * @param {number} status - HTTP status code.
 * @param {string} body - Raw response body.
 * @returns {boolean} True when the response looks like a schema rejection.
 */
function isSchemaRejection(status, body) {
  if (status !== 400) return false;
  const haystack = body.toLowerCase();
  const markers = ["response_format", "json_schema", "is unavailable", "not supported"];
  return markers.some((marker) => haystack.includes(marker));
}
|
|
8064
|
+
/**
 * Assembles the JSON payload for a chat-completions request.
 *
 * - `temperature` defaults to 0 when not provided.
 * - `max_tokens` is only included when `req.maxTokens` is not null/undefined.
 * - `response_format` is a strict `json_schema` when a schema is given and
 *   `forceJsonObject` is falsy; otherwise it is `json_object` when either
 *   JSON mode or a schema was requested; otherwise it is omitted.
 *
 * @param {object} req - Request description.
 * @param {boolean} forceJsonObject - Downgrade a schema to plain JSON mode.
 * @returns {object} Request body.
 */
function buildBody(req, forceJsonObject) {
  const { model, messages, temperature, maxTokens, jsonSchema, jsonMode } = req;
  const body = { model, messages, temperature: temperature ?? 0 };
  if (maxTokens != null) {
    body.max_tokens = maxTokens;
  }
  const useStrictSchema = Boolean(jsonSchema) && !forceJsonObject;
  if (useStrictSchema) {
    const { name, schema } = jsonSchema;
    body.response_format = {
      type: "json_schema",
      json_schema: { name, schema, strict: true }
    };
  } else if (jsonMode || jsonSchema) {
    body.response_format = { type: "json_object" };
  }
  return body;
}
|
|
8081
|
+
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
async function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
8084
|
+
/**
 * Removes a surrounding Markdown code fence (``` or ```json) from a string.
 * Input that is not entirely wrapped in a fence is returned trimmed.
 *
 * @param {string} raw - Text to unwrap.
 * @returns {string} The fence contents, or the trimmed input.
 */
function stripFencedJson(raw) {
  const text = raw.trim();
  const fenced = /^```(?:json)?\s*\n?([\s\S]*?)\n?```\s*$/.exec(text);
  if (fenced === null) {
    return text;
  }
  return fenced[1].trim();
}
|
|
8089
|
+
/**
 * Sends one chat-completions request, retrying transient failures.
 *
 * Behaviour:
 * - POSTs to `<baseUrl>/chat/completions` (trailing slashes on the base URL
 *   are stripped) with headers from `buildHeaders` and a body from
 *   `buildBody`.
 * - Each attempt is bounded by a timeout (`req.timeoutMs`, then
 *   `opts.defaultTimeoutMs`, then the module default). The timer now stays
 *   armed until the response body has been read, so a stalled body is aborted
 *   too; previously it was cleared as soon as the headers arrived.
 * - `opts.maxRetries` is the total number of attempts. Values below 1 (or
 *   NaN) are treated as a single attempt; previously they skipped the request
 *   entirely and threw `Error("undefined")`.
 * - Non-OK responses become `LlmCallError`. Statuses in `RETRYABLE_STATUS`
 *   are retried after the `Retry-After` delay when parseable, otherwise after
 *   exponential backoff. Thrown errors are retried when `isRetryableError`
 *   accepts them.
 *
 * @param {object} req - Request (model, messages, optional temperature,
 *   maxTokens, jsonMode, jsonSchema, timeoutMs).
 * @param {object} [opts] - Client options (baseUrl, defaultTimeoutMs,
 *   maxRetries, fetch, plus the auth options read by `buildHeaders`).
 * @returns {Promise<object>} Content, token usage, cost, model, duration and
 *   the raw parsed response.
 * @throws {LlmCallError} On a non-OK response that is not (or no longer) retried.
 * @throws {Error} On network, abort or parse failures that are not retried.
 */
async function callLlm(req, opts = {}) {
  const baseUrl = (opts.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
  const url = `${baseUrl}/chat/completions`;
  const timeoutMs = req.timeoutMs ?? opts.defaultTimeoutMs ?? DEFAULT_TIMEOUT_MS;
  const configuredAttempts = opts.maxRetries ?? DEFAULT_MAX_RETRIES;
  // Always make at least one attempt, even for 0, negative or NaN settings.
  const maxAttempts = configuredAttempts >= 1 ? configuredAttempts : 1;
  const fetchFn = opts.fetch ?? globalThis.fetch;
  const headers = buildHeaders(opts);
  let lastErr;
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const isLastAttempt = attempt >= maxAttempts - 1;
    const controller = new AbortController();
    const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs);
    const started = Date.now();
    let retryDelayMs;
    try {
      const res = await fetchFn(url, {
        method: "POST",
        headers,
        body: JSON.stringify(buildBody(req, false)),
        signal: controller.signal
      });
      if (!res.ok) {
        const body = await res.text();
        const err = new LlmCallError(
          `LLM call ${res.status}: ${body.slice(0, 300)}`,
          res.status,
          body,
          req.model
        );
        if (isLastAttempt || !RETRYABLE_STATUS.has(res.status)) throw err;
        lastErr = err;
        retryDelayMs = parseRetryAfter(res.headers) ?? backoffMs(attempt);
      } else {
        const json = await res.json();
        const choice = json.choices?.[0];
        const usageRaw = json.usage ?? {};
        const costFromProxy = json._response_cost ?? json.cost_usd;
        const promptDetails = usageRaw.prompt_tokens_details;
        return {
          content: choice?.message?.content ?? "",
          usage: {
            promptTokens: Number(usageRaw.prompt_tokens ?? 0),
            completionTokens: Number(usageRaw.completion_tokens ?? 0),
            totalTokens: Number(usageRaw.total_tokens ?? 0),
            cachedPromptTokens: promptDetails && typeof promptDetails === "object" ? Number(promptDetails.cached_tokens ?? 0) : void 0
          },
          costUsd: typeof costFromProxy === "number" ? costFromProxy : null,
          model: json.model ?? req.model,
          durationMs: Date.now() - started,
          raw: json
        };
      }
    } catch (err) {
      lastErr = err;
      if (isLastAttempt || !isRetryableError(err)) throw err;
      retryDelayMs = backoffMs(attempt);
    } finally {
      // Runs after the body has been consumed (or the attempt failed), so the
      // timeout covers the whole exchange but never the retry delay below.
      clearTimeout(timeoutHandle);
    }
    await sleep(retryDelayMs);
  }
  // Defensive fallback: the final attempt always returns or throws above.
  throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
}
|
|
8156
|
+
/**
 * Calls the LLM and parses its reply as JSON.
 *
 * JSON mode is enabled automatically unless the caller set `jsonMode` or
 * supplied a `jsonSchema`. If the endpoint rejects the schema (see
 * `isSchemaRejection`), the call is repeated once in plain JSON mode without
 * the schema.
 *
 * @param {object} req - Request passed on to `callLlm`.
 * @param {object} [opts] - Client options passed on to `callLlm`.
 * @returns {Promise<{ value: unknown, result: object }>} Parsed value plus
 *   the underlying call result.
 */
async function callLlmJson(req, opts = {}) {
  const callAndParse = async (request) => {
    const result = await callLlm(request, opts);
    const value = parseJsonSafely(result.content, result.model);
    return { value, result };
  };
  try {
    return await callAndParse({ ...req, jsonMode: req.jsonMode ?? !req.jsonSchema });
  } catch (err) {
    const schemaRejected = err instanceof LlmCallError && isSchemaRejection(err.status, err.body) && req.jsonSchema;
    if (!schemaRejected) {
      throw err;
    }
    return callAndParse({ ...req, jsonMode: true, jsonSchema: void 0 });
  }
}
|
|
8171
|
+
/**
 * Parses model output as JSON after removing any surrounding code fence.
 *
 * @param {string} content - Raw model output.
 * @param {string} model - Model name, used only in the error message.
 * @returns {unknown} The parsed JSON value.
 * @throws {Error} When the content is not valid JSON; the message includes
 *   the parser error and up to the first 800 characters of the raw content.
 */
function parseJsonSafely(content, model) {
  const candidate = stripFencedJson(content);
  try {
    return JSON.parse(candidate);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    const preview = content.slice(0, 800);
    throw new Error(
      `LLM returned non-JSON content (model=${model}): ${reason}
--- raw content ---
${preview}`
    );
  }
}
|
|
8183
|
+
/**
 * Sends a tiny "ping" request to check that a model is reachable.
 * Never throws: failures are reported in the returned object.
 *
 * @param {string} model - Model to probe.
 * @param {object} [opts] - Client options; `opts.timeoutMs` overrides the
 *   30-second probe timeout.
 * @returns {Promise<{ ok: boolean, latencyMs: number, error: string | null }>}
 */
async function probeLlm(model, opts = {}) {
  const startedAt = Date.now();
  const elapsed = () => Date.now() - startedAt;
  try {
    const pingRequest = {
      model,
      messages: [{ role: "user", content: "ping" }],
      maxTokens: 64,
      timeoutMs: opts.timeoutMs ?? 3e4
    };
    await callLlm(pingRequest, opts);
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    return { ok: false, latencyMs: elapsed(), error };
  }
  return { ok: true, latencyMs: elapsed(), error: null };
}
|
|
8204
|
+
/**
 * Thin object wrapper around `callLlm` / `callLlmJson` that remembers a set
 * of default options. Per-call options are shallow-merged over the defaults,
 * with the per-call values taking precedence.
 */
var LlmClient = class {
  opts;
  constructor(opts = {}) {
    this.opts = opts;
  }
  /** Plain call; `per` overrides the stored options for this call only. */
  call(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlm(req, merged);
  }
  /** JSON-parsing call; `per` overrides the stored options for this call only. */
  callJson(req, per) {
    const merged = { ...this.opts, ...per };
    return callLlmJson(req, merged);
  }
};
|
|
8216
|
+
|
|
8217
|
+
// src/multi-layer-verifier.ts
|
|
8218
|
+
/**
 * Grades a semantic check as "error", "pass" or "fail".
 *
 * - "error" when `input.available` is falsy.
 * - "pass" when `input.score` reaches the threshold (default 0.7) and no
 *   critical finding is a gap. A critical finding is a gap when
 *   `present === false` or its score (missing counts as 0) is below 7.
 * - "fail" otherwise.
 *
 * @param {object} input - { available, score, findings, threshold? }.
 * @returns {"error" | "pass" | "fail"}
 */
function gradeSemanticStatus(input) {
  if (!input.available) return "error";
  const minScore = input.threshold ?? 0.7;
  const isCriticalGap = (f) => f.severity === "critical" && (f.present === false || (f.score ?? 0) < 7);
  const hasCriticalGap = input.findings.some(isCriticalGap);
  if (hasCriticalGap) return "fail";
  return input.score >= minScore ? "pass" : "fail";
}
|
|
8226
|
+
/**
 * Runs a list of verification layers in order and aggregates their results.
 *
 * Construction validates that layer names are unique and that every name in
 * a layer's `dependsOn` list belongs to a registered layer.
 */
var MultiLayerVerifier = class {
  layers;
  constructor(layers) {
    this.layers = layers;
    const names = /* @__PURE__ */ new Set();
    for (const { name } of layers) {
      if (names.has(name)) throw new Error(`MultiLayerVerifier: duplicate layer name "${name}"`);
      names.add(name);
    }
    for (const layer of layers) {
      for (const dep of layer.dependsOn ?? []) {
        if (names.has(dep)) continue;
        throw new Error(
          `MultiLayerVerifier: layer "${layer.name}" depends on "${dep}" which is not registered`
        );
      }
    }
  }
  /**
   * Executes the layers sequentially.
   *
   * - A layer whose dependencies have not all finished with status "pass" is
   *   recorded as "skipped" without being run.
   * - Each layer receives `{ env, prior, signal }`; the signal aborts when the
   *   layer's own `capMs` or the overall `opts.overallCapMs` elapses.
   * - A layer that throws is recorded as "timeout" when its signal was
   *   aborted, otherwise as "error".
   * - `opts.onLayer` (if given) is called with every recorded result.
   * - The loop stops early once the overall cap has fired.
   *
   * @param {object} opts - { env, overallCapMs?, onLayer? }.
   * @returns {Promise<object>} The report produced by `aggregate`.
   */
  async run(opts) {
    const startedAtMs = Date.now();
    const startedAt = new Date(startedAtMs).toISOString();
    const overall = new AbortController();
    const overallCap = opts.overallCapMs;
    let overallTimer = null;
    if (overallCap != null) {
      overallTimer = setTimeout(() => overall.abort(new Error("overall cap exceeded")), overallCap);
    }
    const results = [];
    const byName = {};
    // Stores an outcome under the layer's registered name and notifies the listener.
    const record = (layerName, outcome) => {
      results.push(outcome);
      byName[layerName] = outcome;
      opts.onLayer?.(outcome);
    };
    try {
      for (const layer of this.layers) {
        const blockedBy = (layer.dependsOn ?? []).filter((dep) => byName[dep]?.status !== "pass");
        if (blockedBy.length > 0) {
          record(layer.name, {
            layer: layer.name,
            status: "skipped",
            durationMs: 0,
            findings: [],
            reason: `skipped \u2014 upstream not passing: ${blockedBy.join(", ")}`
          });
          continue;
        }
        const layerController = new AbortController();
        const signal = mergeSignals(overall.signal, layerController.signal);
        let layerTimer = null;
        if (layer.capMs != null) {
          layerTimer = setTimeout(() => layerController.abort(new Error(`layer ${layer.name} cap`)), layer.capMs);
        }
        const layerStart = Date.now();
        let outcome;
        try {
          outcome = await layer.run({ env: opts.env, prior: { ...byName }, signal });
        } catch (err) {
          const timedOut = signal.aborted;
          const message = err instanceof Error ? err.message : String(err);
          outcome = {
            layer: layer.name,
            status: timedOut ? "timeout" : "error",
            durationMs: Date.now() - layerStart,
            findings: [{ severity: "major", message, layer: layer.name }],
            reason: message
          };
        } finally {
          if (layerTimer) clearTimeout(layerTimer);
        }
        // Findings without an explicit layer are attributed to this layer.
        outcome.findings = outcome.findings.map((f) => ({ ...f, layer: f.layer ?? layer.name }));
        record(layer.name, outcome);
        if (overall.signal.aborted) break;
      }
      return aggregate(this.layers, results, startedAt, startedAtMs);
    } finally {
      if (overallTimer) clearTimeout(overallTimer);
    }
  }
};
|
|
8307
|
+
/**
 * Folds per-layer results into a single report.
 *
 * - Counts: "pass", "fail" and "skipped" are counted individually; every
 *   other status is counted as an error.
 * - Blended score: weighted mean of `score` over passing layers, plus failing
 *   layers that opted in via `failContributesToScore`. Only results with a
 *   score and a positive weight take part (weight defaults to 1). It is 0
 *   when nothing contributed.
 * - `allPass` requires at least one contributing layer and no failures or
 *   errors.
 *
 * @param {Array<object>} layers - Layer definitions (name, weight?, failContributesToScore?).
 * @param {Array<object>} results - Layer results in execution order.
 * @param {string} startedAt - ISO timestamp of the run start.
 * @param {number} startedAtMs - Epoch milliseconds of the run start.
 * @returns {object} Aggregated report.
 */
function aggregate(layers, results, startedAt, startedAtMs) {
  const settings = /* @__PURE__ */ new Map();
  for (const l of layers) {
    settings.set(l.name, {
      weight: l.weight ?? 1,
      failContrib: l.failContributesToScore ?? false
    });
  }
  const fallback = { weight: 1, failContrib: false };
  let passCount = 0;
  let failCount = 0;
  let skippedCount = 0;
  let errorCount = 0;
  let weightSum = 0;
  let weightedTotal = 0;
  let anyContribution = false;
  let anyFailure = false;
  for (const r of results) {
    const { weight, failContrib } = settings.get(r.layer) ?? fallback;
    switch (r.status) {
      case "pass":
        passCount++;
        break;
      case "fail":
        failCount++;
        anyFailure = true;
        break;
      case "skipped":
        skippedCount++;
        break;
      default:
        errorCount++;
    }
    const scorable = r.score != null && weight > 0;
    const contributes = scorable && (r.status === "pass" || (r.status === "fail" && failContrib));
    if (contributes) {
      anyContribution = true;
      weightSum += weight;
      weightedTotal += weight * r.score;
    }
  }
  const finishedAtMs = Date.now();
  return {
    layers: results,
    passCount,
    failCount,
    skippedCount,
    errorCount,
    allPass: anyContribution && !anyFailure && failCount === 0 && errorCount === 0,
    blendedScore: weightSum > 0 ? weightedTotal / weightSum : 0,
    durationMs: finishedAtMs - startedAtMs,
    startedAt,
    finishedAt: new Date(finishedAtMs).toISOString()
  };
}
|
|
8360
|
+
/**
 * Combines two abort signals into one that aborts as soon as either does.
 * If one input is already aborted it is returned as-is (`a` is checked
 * first); otherwise a new signal is returned that forwards the abort reason
 * of whichever input fires.
 *
 * @param {AbortSignal} a
 * @param {AbortSignal} b
 * @returns {AbortSignal}
 */
function mergeSignals(a, b) {
  const sources = [a, b];
  for (const source of sources) {
    if (source.aborted) return source;
  }
  const merged = new AbortController();
  for (const source of sources) {
    source.addEventListener("abort", () => merged.abort(source.reason), { once: true });
  }
  return merged.signal;
}
|
|
8369
|
+
|
|
8370
|
+
// src/command-runner.ts
|
|
8371
|
+
import { spawnSync } from "child_process";
|
|
8372
|
+
import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync as readdirSync2, statSync as statSync2 } from "fs";
|
|
8373
|
+
import { join as join2 } from "path";
|
|
8374
|
+
/**
 * Command runner backed by the local machine: synchronous child processes
 * and the local filesystem, exposed through async methods.
 */
var localCommandRunner = {
  name: "local",
  /**
   * Runs `input.cmd` with `input.argv` and captures its output.
   * The child inherits the current environment with `CI=1`, overridden by
   * `input.env`. `input.capMs` is passed as the spawn timeout.
   *
   * @returns {Promise<object>} { status, stdout, stderr, durationMs,
   *   timedOut, runnerError }.
   */
  async run(input) {
    const startedAt = Date.now();
    const child = spawnSync(input.cmd, input.argv, {
      cwd: input.cwd,
      encoding: "utf8",
      timeout: input.capMs,
      env: { ...process.env, CI: "1", ...input.env ?? {} },
      input: input.stdin
    });
    const durationMs = Date.now() - startedAt;
    const failure = child.error;
    const timedOut = !!(failure && "code" in failure && failure.code === "ETIMEDOUT");
    let runnerError = void 0;
    if (failure && !timedOut) {
      runnerError = String(failure.message ?? failure);
    }
    return {
      status: child.status ?? null,
      stdout: (child.stdout ?? "").toString(),
      stderr: (child.stderr ?? "").toString(),
      durationMs,
      timedOut,
      runnerError
    };
  },
  /** True when `which <name>` exits 0 and prints a non-empty path. */
  async hasBin(name) {
    const lookup = spawnSync("which", [name], { encoding: "utf8", timeout: 2e3 });
    if (lookup.status !== 0) return false;
    return (lookup.stdout ?? "").trim().length > 0;
  },
  /** True when the path exists. */
  async fileExists(path) {
    return existsSync3(path);
  },
  /** File contents as UTF-8 text, or null when the file cannot be read. */
  async readFile(path) {
    let text = null;
    try {
      text = readFileSync3(path, "utf8");
    } catch {
      // Unreadable or missing file: report null.
    }
    return text;
  },
  /**
   * Lists a directory with basic stat info per entry. Returns [] when the
   * directory cannot be read; entries that cannot be stat'ed are omitted.
   */
  async readDir(path) {
    let names;
    try {
      names = readdirSync2(path);
    } catch {
      return [];
    }
    return names.flatMap((name) => {
      try {
        const st = statSync2(join2(path, name));
        return [{
          name,
          isDirectory: st.isDirectory(),
          isFile: st.isFile(),
          sizeBytes: st.isFile() ? st.size : null
        }];
      } catch {
        // Entry vanished or is not stat-able: leave it out of the listing.
        return [];
      }
    });
  }
};
|
|
8433
|
+
|
|
8434
|
+
// src/multi-toolchain-layer.ts
|
|
8435
|
+
// Layer statuses ordered from least severe (0) to most severe (4).
var STATUS_RANK = {
  pass: 0,
  skipped: 1,
  fail: 2,
  timeout: 3,
  error: 4
};

/**
 * Returns whichever of the two statuses ranks as more severe.
 * Statuses missing from STATUS_RANK rank as 0; on a tie `a` is returned.
 */
function worst(a, b) {
  const rankA = STATUS_RANK[a] ?? 0;
  const rankB = STATUS_RANK[b] ?? 0;
  if (rankA >= rankB) {
    return a;
  }
  return b;
}
|
|
8445
|
+
// Finding severities ordered from least severe (0) to most severe (3).
var SEVERITY_RANK = {
  info: 0,
  minor: 1,
  major: 2,
  critical: 3
};

/**
 * Returns the highest severity found among `findings`.
 * Starts from "info", so an empty list (or only unranked severities) yields "info".
 */
function maxSeverity(findings) {
  return findings.reduce(
    (top, finding) => SEVERITY_RANK[finding.severity] > SEVERITY_RANK[top] ? finding.severity : top,
    "info"
  );
}
|
|
8458
|
+
/**
 * Folds per-adapter layer results into one result for layer `name`.
 *
 * - no adapters: a "skipped" result with reason "no adapters";
 * - one adapter: that adapter's result, relabelled to `name`;
 * - several: worst status, mean of the numeric scores, duration and numeric
 *   diagnostics combined by max (default) or sum, all findings concatenated.
 *
 * @param {string} name - layer name stamped on the result and every finding.
 * @param {Array<{adapter: string, result: object}>} perAdapter
 * @param {object} [options] - mergeDuration / mergeDiagnostics ("max" | "sum"),
 *   messagePrefixer(adapter) whose return value is prepended to finding messages.
 * @returns {object} merged layer result.
 */
function mergeLayerResults(name, perAdapter, options = {}) {
  const durationMode = options.mergeDuration ?? "max";
  const diagnosticsMode = options.mergeDiagnostics ?? "max";
  const prefix = options.messagePrefixer;

  // Stamps a finding with the merged layer name and the adapter it came from.
  const relabel = (finding, adapter) => ({
    ...finding,
    layer: name,
    message: prefix ? `${prefix(adapter)} ${finding.message}` : finding.message,
    detail: { ...(finding.detail ?? {}), adapter }
  });

  if (perAdapter.length === 0) {
    return {
      layer: name,
      status: "skipped",
      durationMs: 0,
      findings: [],
      reason: "no adapters"
    };
  }

  if (perAdapter.length === 1) {
    const [{ adapter, result }] = perAdapter;
    return {
      ...result,
      layer: name,
      findings: result.findings.map((finding) => relabel(finding, adapter)),
      reason: result.reason ?? `${adapter}: ${result.status}`
    };
  }

  let status = "pass";
  let scoreTotal = 0;
  let scoredAdapters = 0;
  let durationMs = 0;
  const findings = [];
  const reasonParts = [];
  const diagnostics = {};

  for (const { adapter, result } of perAdapter) {
    status = worst(status, result.status);

    // Only adapters that reported a numeric score take part in the mean.
    if (typeof result.score === "number") {
      scoreTotal += result.score;
      scoredAdapters += 1;
    }

    if (durationMode === "sum") {
      durationMs += result.durationMs;
    } else {
      durationMs = Math.max(durationMs, result.durationMs);
    }

    reasonParts.push(`${adapter}: ${result.status}`);

    for (const finding of result.findings) {
      findings.push(relabel(finding, adapter));
    }

    for (const [key, value] of Object.entries(result.diagnostics ?? {})) {
      // Number.isFinite is false for non-numbers, NaN and +/-Infinity.
      if (!Number.isFinite(value)) continue;
      const seen = diagnostics[key];
      if (seen == null) {
        diagnostics[key] = value;
      } else {
        diagnostics[key] = diagnosticsMode === "sum" ? seen + value : Math.max(seen, value);
      }
    }
  }

  return {
    layer: name,
    status,
    score: scoredAdapters > 0 ? scoreTotal / scoredAdapters : void 0,
    durationMs,
    findings,
    reason: reasonParts.join(" \xB7 "),
    diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
    detail: {
      adapters: perAdapter.map(({ adapter, result }) => ({
        adapter,
        status: result.status,
        score: result.score ?? null
      })),
      worstSeverity: maxSeverity(findings)
    }
  };
}
|
|
8533
|
+
/**
 * Builds a layer that runs `config.run(adapter, ctx)` for every adapter in
 * `config.adapters` and merges the outcomes with `mergeLayerResults`.
 *
 * Adapters run in consecutive chunks of at most `config.maxParallel`
 * (default 8, minimum 1); the adapters inside a chunk run concurrently.
 * An adapter whose run throws or rejects is recorded as an "error" result
 * with a single "major" finding carrying the error message, and with the time
 * actually spent before the failure as its duration.
 *
 * @param {object} config - name, adapters, adapterName(adapter), run(adapter, ctx),
 *   plus optional dependsOn, weight, failContributesToScore, capMs, maxParallel.
 * @returns {object} layer definition with an async `run(ctx)`.
 */
function multiToolchainLayer(config) {
  // A NaN limit would make the chunk loop below advance by NaN and run no
  // adapters at all, so fall back to the default in that case.
  const requestedParallel = Number(config.maxParallel ?? 8);
  const maxParallel = Number.isNaN(requestedParallel) ? 8 : Math.max(1, requestedParallel);
  return {
    name: config.name,
    dependsOn: config.dependsOn,
    weight: config.weight,
    failContributesToScore: config.failContributesToScore,
    capMs: config.capMs,
    async run(ctx) {
      if (config.adapters.length === 0) {
        return {
          layer: config.name,
          status: "skipped",
          durationMs: 0,
          findings: [],
          reason: "no adapters detected"
        };
      }
      const runOne = async (adapter) => {
        const adapterName = config.adapterName(adapter);
        const startedAt = Date.now();
        try {
          const r = await config.run(adapter, ctx);
          return { adapter: adapterName, result: r };
        } catch (err) {
          const message = err instanceof Error ? err.message : String(err);
          return {
            adapter: adapterName,
            result: {
              layer: config.name,
              status: "error",
              // Report real elapsed time so merged durations are not under-counted.
              durationMs: Date.now() - startedAt,
              findings: [
                {
                  severity: "major",
                  layer: config.name,
                  message,
                  detail: { adapter: adapterName }
                }
              ],
              reason: message
            }
          };
        }
      };
      const results = [];
      for (let i = 0; i < config.adapters.length; i += maxParallel) {
        const chunk = config.adapters.slice(i, i + maxParallel);
        const chunkResults = await Promise.all(chunk.map(runOne));
        results.push(...chunkResults);
      }
      return mergeLayerResults(config.name, results);
    }
  };
}
|
|
8586
|
+
|
|
8587
|
+
// src/reviewer.ts
|
|
8588
|
+
// JSON schema handed to the LLM call for the reviewer's structured output.
// Every property is required and no extra properties are allowed.
var REVIEWER_SCHEMA = {
  type: "object",
  additionalProperties: false,
  required: [
    "observations",
    "diagnosis",
    "nextShotInstruction",
    "shouldContinue",
    "confidence"
  ],
  properties: {
    observations: { type: "string", minLength: 20, maxLength: 2000 },
    diagnosis: { type: "string", minLength: 20, maxLength: 1500 },
    nextShotInstruction: { type: "string", minLength: 40, maxLength: 3000 },
    shouldContinue: { type: "boolean" },
    confidence: { type: "number", minimum: 0, maximum: 1 }
  }
};
|
|
8600
|
+
/**
 * Renders prior reviewer outputs as plain text for inclusion in a prompt.
 *
 * Each entry becomes a header line followed by one line per non-empty text
 * field (observations, diagnosis, nextShotInstruction), each cut to 400
 * characters. Entries are separated by a blank line.
 *
 * @param {Array<object>} memory - prior reviewer outputs.
 * @returns {string} the summary, or "(no prior shots)" for an empty list.
 */
function summarizeMemory(memory) {
  if (memory.length === 0) return "(no prior shots)";
  const clip = (text) => text.slice(0, 400);
  const entries = [];
  for (const entry of memory) {
    const lines = [
      `shot ${entry.shot} \u2014 confidence=${(entry.confidence ?? 0).toFixed(2)} shouldContinue=${entry.shouldContinue ?? "?"}`
    ];
    if (entry.observations) {
      lines.push(` observations: ${clip(entry.observations)}`);
    }
    if (entry.diagnosis) {
      lines.push(` diagnosis: ${clip(entry.diagnosis)}`);
    }
    if (entry.nextShotInstruction) {
      lines.push(` instruction given: ${clip(entry.nextShotInstruction)}`);
    }
    entries.push(lines.join("\n"));
  }
  return entries.join("\n\n");
}
|
|
8610
|
+
/**
 * Builds the system and user prompt strings for the reviewer LLM call.
 *
 * Reads from `input`: shot, userRequest, traceSummary, verification
 * (blendedScore, allPass, failCount, optional failingLayers), memory, and the
 * optional extraContext / trailingContext sections, which are emitted only
 * when truthy.
 *
 * `input.verification.blendedScore` must be a number: `toFixed` is called on
 * it unguarded.
 *
 * @param {object} input - reviewer input as described above.
 * @returns {{system: string, user: string}} the two prompt strings.
 */
function buildReviewerPrompt(input) {
  const system = "You are a senior-engineer-grade reviewer directing an agent through a multi-shot build. Your job is NOT to grade; your job IS to direct the worker's next shot using the trace, verification result, prior memory, and user request. Return STRICT JSON. No prose outside the JSON.";
  // One-line summary of failing layers, or a fixed note when none are listed.
  const failingLayersBlock = input.verification.failingLayers && input.verification.failingLayers.length > 0 ? `failing layers: ${input.verification.failingLayers.join(", ")}` : "no layers failing";
  // The template below is whitespace-sensitive: every line break is part of the prompt.
  const user = `=== SHOT NUMBER ===
shot ${input.shot} of the review loop

=== USER REQUEST ===
${input.userRequest}

=== WORKER TRACE (shot ${input.shot}) ===
${input.traceSummary}
${input.extraContext ? `
=== EXTRA CONTEXT ===
${input.extraContext}
` : ""}
=== VERIFICATION (shot ${input.shot}) ===
blendedScore: ${input.verification.blendedScore.toFixed(2)}
allPass: ${input.verification.allPass}
failCount: ${input.verification.failCount}
${failingLayersBlock}

=== REVIEWER MEMORY ===
${summarizeMemory(input.memory)}
${input.trailingContext ? `
=== TRAILING CONTEXT ===
${input.trailingContext}
` : ""}
=== YOUR TASK ===
Return STRICT JSON:

1. observations (20-2000 chars): first-person worker behavior from the trace (tool call counts, errors, loops).
2. diagnosis (20-1500 chars): root cause of current failures, not a restatement of verification.
3. nextShotInstruction (40-3000 chars): concrete "FIX THESE:" directive for the worker's next shot. Reference memory when instructions repeat.
4. shouldContinue (boolean): FALSE if verification.allPass=true, if worker is thrashing, if confidence < 0.3, or if the request looks unachievable. TRUE otherwise.
5. confidence (0-1): self-assessment.

RULES:
- If verification.allPass is true, shouldContinue MUST be false.
- If memory shows the same failing layer for 2 shots, reduce confidence \u2014 strategy isn't working.
- If the trace shows zero tool calls, the worker didn't run \u2014 surface that.
- Do NOT re-grade. Direct.`;
  return { system, user };
}
|
|
8653
|
+
// Field values used in place of reviewer output: `createDefaultReviewer`
// spreads these under any caller-supplied `softFailDefaults` and returns them
// when the LLM call throws (or for individual fields the response omits).
var DEFAULT_SOFT_FAIL = {
  observations: "reviewer soft-failed \u2014 no observations captured",
  diagnosis: "reviewer soft-failed \u2014 inspect verification findings and retry",
  nextShotInstruction: "Inspect the verification findings above and address the highest-severity failing layer first. If install failed, start there; otherwise work from the first failing gate and address compilation/build errors before layout/semantic issues.",
  // Keep the loop going by default when the reviewer is unavailable.
  shouldContinue: true,
  confidence: 0.3
};
|
|
8660
|
+
/**
 * Creates a reviewer function backed by a JSON-schema-constrained LLM call.
 *
 * The returned async function never rejects for prompt-building or LLM
 * failures: on any error it resolves a soft-fail result (`available: false`,
 * `error` set) built from DEFAULT_SOFT_FAIL merged with
 * `options.softFailDefaults`.
 *
 * @param {object} options - model, plus optional softFailDefaults,
 *   promptBuilder (defaults to buildReviewerPrompt), timeoutMs (default
 *   180000) and llm (second argument to callLlmJson).
 * @returns {(input: object) => Promise<object>} reviewer returning shot,
 *   observations, diagnosis, nextShotInstruction, shouldContinue, confidence
 *   (clamped to [0, 1]), costUsd, durationMs, available and, on failure, error.
 */
function createDefaultReviewer(options) {
  const softFail2 = {
    ...DEFAULT_SOFT_FAIL,
    ...(options.softFailDefaults ?? {})
  };
  const promptBuilder = options.promptBuilder ?? buildReviewerPrompt;
  const timeoutMs = options.timeoutMs ?? 18e4;
  return async (input) => {
    const start = Date.now();
    try {
      // Built inside the try so a throwing prompt builder soft-fails like a
      // failed LLM call instead of rejecting the reviewer.
      const { system, user } = promptBuilder(input);
      const { value, result } = await callLlmJson(
        {
          model: options.model,
          messages: [
            { role: "system", content: system },
            { role: "user", content: user }
          ],
          jsonSchema: { name: "reviewer_output", schema: REVIEWER_SCHEMA },
          temperature: 0,
          timeoutMs
        },
        options.llm ?? {}
      );
      // A non-numeric confidence would otherwise surface as NaN after clamping.
      const parsedConfidence = Number(value.confidence ?? softFail2.confidence);
      const confidence = Number.isNaN(parsedConfidence) ? softFail2.confidence : parsedConfidence;
      return {
        shot: input.shot,
        observations: String(value.observations ?? softFail2.observations),
        diagnosis: String(value.diagnosis ?? softFail2.diagnosis),
        nextShotInstruction: String(value.nextShotInstruction ?? softFail2.nextShotInstruction),
        shouldContinue: Boolean(value.shouldContinue),
        confidence: Math.max(0, Math.min(1, confidence)),
        costUsd: result.costUsd ?? null,
        durationMs: Date.now() - start,
        available: true
      };
    } catch (err) {
      return {
        shot: input.shot,
        observations: softFail2.observations,
        diagnosis: softFail2.diagnosis,
        nextShotInstruction: softFail2.nextShotInstruction,
        shouldContinue: softFail2.shouldContinue,
        confidence: softFail2.confidence,
        costUsd: null,
        durationMs: Date.now() - start,
        available: false,
        error: err instanceof Error ? err.message : String(err)
      };
    }
  };
}
|
|
8711
|
+
|
|
8712
|
+
// src/semantic-concept-judge.ts
|
|
8713
|
+
// Per-complexity weights used when concepts are weighted by "complexity".
var DEFAULT_COMPLEXITY_WEIGHTS = { render: 1, integrate: 2, compute: 2.5 };

// Version tag stamped on every semantic-concept judge result.
var SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";

// Default character budgets for the prompt, and the default call settings.
var DEFAULT_MAX_SOURCE = 45000;
var DEFAULT_MAX_HTML = 30000;
var DEFAULT_MAX_PER_FILE = 20000;
var DEFAULT_TIMEOUT = 180000; // milliseconds
var DEFAULT_MODEL = "claude-sonnet-4-6";
|
|
8724
|
+
// JSON schema for the semantic-concept judge response: a one-line summary plus
// at least one graded concept. No extra properties are allowed at either level.
var SEMANTIC_SCHEMA = {
  type: "object",
  additionalProperties: false,
  required: ["summary", "concepts"],
  properties: {
    summary: { type: "string", minLength: 20, maxLength: 600 },
    concepts: {
      type: "array",
      minItems: 1,
      items: {
        type: "object",
        additionalProperties: false,
        required: ["concept", "present", "score", "evidence", "severity"],
        properties: {
          concept: { type: "string", minLength: 1, maxLength: 120 },
          present: { type: "boolean" },
          score: { type: "number", minimum: 0, maximum: 10 },
          evidence: { type: "string", minLength: 5, maxLength: 400 },
          severity: {
            type: "string",
            enum: ["critical", "major", "minor", "info"]
          }
        }
      }
    }
  }
};
|
|
8748
|
+
/**
 * Cuts `body` to at most `cap` characters. When anything is cut, a marker
 * line stating how many characters of `label` were dropped is appended.
 *
 * @param {string} body - text to cap.
 * @param {number} cap - maximum number of characters kept.
 * @param {string} label - name of the content, used in the marker.
 * @returns {string} `body` unchanged when it fits, otherwise the prefix plus marker.
 */
function truncate(body, cap, label) {
  const dropped = body.length - cap;
  if (dropped <= 0) return body;
  return `${body.slice(0, cap)}\n\u2026 [truncated ${dropped} chars of ${label}]`;
}
|
|
8753
|
+
/**
 * Builds the user prompt for the semantic-concept judge.
 *
 * Reads from `input`: sourceFiles ({path, content}), servedHtml, userRequest,
 * expectedConcepts ({name, keywords?}) and the optional artifactLabel /
 * artifactDescription. Reads from `opts`: maxPerFileChars, maxHtmlChars,
 * maxSourceChars.
 *
 * @param {object} input - judge input as described above.
 * @param {object} opts - resolved character budgets.
 * @returns {string} the prompt text.
 */
function buildPrompt(input, opts) {
  // Files longer than maxPerFileChars are left out entirely (not truncated);
  // the concatenated remainder is capped to maxSourceChars further down.
  // NOTE(review): confirm dropping, rather than truncating, oversized files is intended.
  const sourceBlob = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars).map((f) => `--- FILE: ${f.path} ---
${f.content}`).join("\n\n");
  const html = input.servedHtml ?? "";
  // The template below is whitespace-sensitive: every line break is part of the
  // prompt. The ARTIFACT METADATA and SERVED HTML sections appear only when
  // artifactLabel / servedHtml are truthy; at most 6 keywords per concept are
  // listed as hints.
  return `You are a strict code-review judge evaluating whether an agent's 0-to-1 build actually implements the features the user asked for.

You MUST distinguish:
(a) WORKING code that implements the concept (rendered UI, wired handler, real API call),
(b) KEYWORD-PRESENT stub (comments mentioning the concept, variable names, TODOs),
(c) ABSENT (concept nowhere).

A comment like "// TODO: add mint button" is NOT present \u2014 score 2-3. Only count a concept as present if there is real functional code: a rendered component, a call handler wired to state or a network call, a computed value actually used.

USER REQUEST (what the agent was asked to build):
${input.userRequest}

${input.artifactLabel ? `ARTIFACT METADATA:
name: ${input.artifactLabel}
description: ${input.artifactDescription ?? ""}

` : ""}EXPECTED CONCEPTS (each must be graded independently):
${input.expectedConcepts.map((c, i) => ` ${i + 1}. "${c.name}"${c.keywords?.length ? ` \u2014 hints: [${c.keywords.slice(0, 6).join(" | ")}]` : ""}`).join("\n")}

${html ? `SERVED HTML (what the preview returns when hit):
${truncate(html, opts.maxHtmlChars, "HTML")}

` : ""}SOURCE FILES (the agent's workdir):
${truncate(sourceBlob, opts.maxSourceChars, "source")}

For EACH concept, return:
- concept: the concept name as given (match exactly)
- present: boolean \u2014 does a working implementation exist?
- score: 0-10 \u2014 10 = production-ready; 7 = functional but thin; 4 = partial/stubbed; 2 = keyword-only comment; 0 = absent
- evidence: cite "<file>:<line>" or "served-html:<selector>" pointing at the strongest supporting code. If the concept is absent or stubbed, explain what's missing.
- severity:
"info" when present: true AND score >= 7
"minor" when present: true AND 4 <= score < 7
"major" when present: false OR score < 4
"critical" when the concept is not only absent but a core user flow depends on it

Also produce a "summary" (one sentence, 20-600 chars): overall verdict on whether this is a shippable implementation of the user request vs a keyword-dense placeholder.

BE SKEPTICAL. Keyword matching already passed \u2014 your job is to catch what keyword matching misses. If the agent shipped a working build, say so. If it shipped a stub, say so. Don't grade on effort.

Return STRICT JSON. No prose outside the JSON.`;
}
|
|
8799
|
+
/**
 * Asks an LLM judge to grade, per expected concept, whether the agent's build
 * really implements it, and aggregates the grades into a 0-1 score.
 *
 * Never rejects for LLM failures: on any error (including a malformed
 * response) it resolves a result with `available: false`, `score: 0` and
 * `error` set. With no expected concepts it resolves immediately without
 * calling the LLM.
 *
 * @param {object} input - userRequest, expectedConcepts ({name, weight?,
 *   complexity?, keywords?}), sourceFiles, and optional servedHtml /
 *   artifactLabel / artifactDescription.
 * @param {object} [options] - model, timeoutMs, maxSourceChars,
 *   maxPerFileChars, maxHtmlChars, llm, weightConcepts ("mean" by default, or
 *   "complexity"; any other value uses explicit per-concept weights only) and
 *   complexityWeights overrides.
 * @returns {Promise<object>} kind, version, score (0-1, 3 decimals),
 *   presentCount, totalCount, findings, summary, durationMs, costUsd,
 *   available and, on failure, error.
 */
async function runSemanticConceptJudge(input, options = {}) {
  const start = Date.now();
  const totalCount = input.expectedConcepts.length;
  if (totalCount === 0) {
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: 0,
      presentCount: 0,
      totalCount: 0,
      findings: [],
      summary: "no expected concepts declared",
      durationMs: 0,
      costUsd: null,
      available: false,
      error: "no expected concepts declared"
    };
  }
  const opts = {
    model: options.model ?? DEFAULT_MODEL,
    timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT,
    maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE,
    maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE,
    maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML,
    llm: options.llm ?? {},
    weightConcepts: options.weightConcepts ?? "mean",
    complexityWeights: { ...DEFAULT_COMPLEXITY_WEIGHTS, ...(options.complexityWeights ?? {}) }
  };
  // "mean" ignores all weights; otherwise an explicit per-concept weight wins
  // over the complexity table.
  const weightForConcept = (spec) => {
    if (opts.weightConcepts === "mean") return 1;
    if (spec.weight != null) return spec.weight;
    if (opts.weightConcepts === "complexity") {
      return opts.complexityWeights[spec.complexity ?? "render"] ?? 1;
    }
    return 1;
  };
  const weightByName = new Map(
    input.expectedConcepts.map((c) => [c.name, weightForConcept(c)])
  );
  const knownSeverities = ["critical", "major", "minor", "info"];
  try {
    const { value, result } = await callLlmJson(
      {
        model: opts.model,
        messages: [
          {
            role: "system",
            content: "You are a strict code-review judge. Return strict JSON only. No prose outside the JSON. A keyword in a comment is NOT a working implementation."
          },
          { role: "user", content: buildPrompt(input, opts) }
        ],
        jsonSchema: { name: "semantic_concept_judge", schema: SEMANTIC_SCHEMA },
        temperature: 0,
        timeoutMs: opts.timeoutMs
      },
      opts.llm
    );
    if (!value?.concepts || !Array.isArray(value.concepts)) {
      throw new Error('judge returned malformed response \u2014 expected array under "concepts"');
    }
    const findings = value.concepts.map((c) => {
      // A non-numeric score would turn into NaN and poison the aggregate, so
      // treat it as 0 (absent) instead.
      const parsedScore = Number(c.score ?? 0);
      return {
        concept: String(c.concept),
        present: Boolean(c.present),
        score: Number.isNaN(parsedScore) ? 0 : Math.max(0, Math.min(10, parsedScore)),
        evidence: String(c.evidence ?? ""),
        severity: knownSeverities.includes(c.severity) ? c.severity : "info"
      };
    });
    // "Present" for counting purposes requires a functional grade (>= 7).
    const presentCount = findings.filter((f) => f.present && f.score >= 7).length;
    let weightSum = 0;
    let weightedScoreSum = 0;
    for (const f of findings) {
      // Concepts the judge named that were not declared get weight 1.
      const w = weightByName.get(f.concept) ?? 1;
      weightSum += w;
      weightedScoreSum += w * f.score;
    }
    // Fall back to the plain mean when the weights sum to zero.
    const scoreAvg = weightSum > 0 ? weightedScoreSum / weightSum : findings.reduce((a, f) => a + f.score, 0) / Math.max(1, findings.length);
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: Number((scoreAvg / 10).toFixed(3)),
      presentCount,
      totalCount,
      findings,
      summary: String(value.summary ?? ""),
      durationMs: Date.now() - start,
      costUsd: result.costUsd ?? null,
      available: true
    };
  } catch (err) {
    return {
      kind: "semantic-concept",
      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
      score: 0,
      presentCount: 0,
      totalCount,
      findings: [],
      summary: "",
      durationMs: Date.now() - start,
      costUsd: null,
      available: false,
      error: err instanceof Error ? err.message : String(err)
    };
  }
}
|
|
8902
|
+
/**
 * Binds `options` once and returns a judge that only needs the input.
 *
 * @param {object} [options] - forwarded unchanged to runSemanticConceptJudge.
 * @returns {(input: object) => Promise<object>} pre-configured judge.
 */
function createSemanticConceptJudge(options = {}) {
  return function judge(input) {
    return runSemanticConceptJudge(input, options);
  };
}
|
|
8905
|
+
|
|
8906
|
+
// src/intent-match-judge.ts
|
|
8907
|
+
// Version tag stamped on every intent-match judge result.
var INTENT_MATCH_JUDGE_VERSION = "intent-match-judge-v1-2026-04-24";

// Default call settings and character budgets for the intent-match prompt.
var DEFAULT_MODEL2 = "claude-sonnet-4-6";
var DEFAULT_TIMEOUT2 = 90000; // milliseconds
var DEFAULT_MAX_SOURCE2 = 25000;
var DEFAULT_MAX_PER_FILE2 = 12000;
var DEFAULT_MAX_HTML2 = 20000;
|
|
8913
|
+
// JSON schema for the intent-match judge response: a 0-1 score plus one
// evidence string. No extra properties are allowed.
var INTENT_SCHEMA = {
  type: "object",
  additionalProperties: false,
  required: ["score", "evidence"],
  properties: {
    score: {
      type: "number",
      minimum: 0,
      maximum: 1
    },
    evidence: {
      type: "string",
      minLength: 10,
      maxLength: 400
    }
  }
};
|
|
8922
|
+
/**
 * Cuts `body` to at most `cap` characters. When anything is cut, a marker
 * line stating how many characters of `label` were dropped is appended.
 *
 * @param {string} body - text to cap.
 * @param {number} cap - maximum number of characters kept.
 * @param {string} label - name of the content, used in the marker.
 * @returns {string} `body` unchanged when it fits, otherwise the prefix plus marker.
 */
function truncate2(body, cap, label) {
  const fits = body.length <= cap;
  if (fits) {
    return body;
  }
  const kept = body.slice(0, cap);
  const marker = `\u2026 [truncated ${body.length - cap} chars of ${label}]`;
  return [kept, marker].join("\n");
}
|
|
8927
|
+
/**
 * Builds the user prompt for the intent-match judge.
 *
 * Reads from `input`: sourceFiles ({path, content}), servedHtml, userRequest
 * and the optional artifactLabel / artifactDescription. Reads from `opts`:
 * maxPerFileChars, maxHtmlChars, maxSourceChars.
 *
 * @param {object} input - judge input as described above.
 * @param {object} opts - resolved character budgets.
 * @returns {string} the prompt text.
 */
function buildPrompt2(input, opts) {
  // Files longer than maxPerFileChars are left out entirely (not truncated);
  // the concatenated remainder is capped to maxSourceChars further down.
  // NOTE(review): confirm dropping, rather than truncating, oversized files is intended.
  const sourceBlob = input.sourceFiles.filter((f) => f.content.length <= opts.maxPerFileChars).map((f) => `--- FILE: ${f.path} ---
${f.content}`).join("\n\n");
  const html = input.servedHtml ?? "";
  // The template below is whitespace-sensitive: every line break is part of the
  // prompt. The ARTIFACT METADATA and SERVED HTML sections appear only when
  // artifactLabel / servedHtml are truthy.
  return `You are evaluating whether an agent built THE RIGHT APP for a user request.

You are NOT scoring feature coverage. You are NOT scoring code quality.
You are answering ONE question: when a person looks at this work, do they
see an honest attempt at the user's request \u2014 or do they see the wrong
project entirely?

USER REQUEST:
${input.userRequest}

${input.artifactLabel ? `ARTIFACT METADATA:
name: ${input.artifactLabel}
description: ${input.artifactDescription ?? ""}

` : ""}${html ? `SERVED HTML (what the preview returns):
${truncate2(html, opts.maxHtmlChars, "HTML")}

` : ""}SOURCE FILES (the agent's workdir):
${truncate2(sourceBlob, opts.maxSourceChars, "source")}

Score 0\u20131:
1.0 \u2014 unmistakably the right app. Even with bugs, gaps, or missing
features, a reviewer would say "yes, this is what was asked for."
0.7 \u2014 recognizable. Domain matches; some required surface areas exist.
A reviewer would say "right direction, lots of work needed."
0.4 \u2014 partially related. Wrong framing or wrong product entirely but
with some shared keywords or a tangential overlap.
0.0 \u2014 wrong project. The agent shipped something unrelated to the
request (e.g. asked for an NFT mint page, shipped a generic
landing page with zero NFT-related code).

Evidence: one sentence citing the strongest concrete signal \u2014 a file
name, a route, a rendered headline, a missing core surface. Don't
restate the request.

Examples:
- "src/App.tsx renders <MintWidget /> with mint-1/mint-5 buttons and
wagmi imports \u2014 clearly the requested NFT mint page." \u2192 0.95
- "src/App.tsx is the default Vite React template; no
mint/wallet/contract code in any file under src/." \u2192 0.05

Return STRICT JSON. No prose outside.`;
}
|
|
8974
|
+
/**
 * Asks an LLM judge a single question: did the agent build the right app for
 * the user's request? Resolves a 0-1 score with one evidence sentence.
 *
 * Never rejects for LLM failures: on any error it resolves a result with
 * `available: false`, `score: 0` and `error` set. With no source files and no
 * served HTML it resolves immediately without calling the LLM.
 *
 * @param {object} input - userRequest, sourceFiles, and optional servedHtml /
 *   artifactLabel / artifactDescription.
 * @param {object} [options] - model, timeoutMs, maxSourceChars,
 *   maxPerFileChars, maxHtmlChars, llm.
 * @returns {Promise<object>} kind, version, score (0-1, 3 decimals), evidence
 *   (at most 400 chars), durationMs, costUsd, available and, on failure, error.
 */
async function runIntentMatchJudge(input, options = {}) {
  const start = Date.now();
  const opts = {
    model: options.model ?? DEFAULT_MODEL2,
    timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT2,
    maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE2,
    maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE2,
    maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML2,
    llm: options.llm ?? {}
  };
  if (input.sourceFiles.length === 0 && !input.servedHtml) {
    return {
      kind: "intent-match",
      version: INTENT_MATCH_JUDGE_VERSION,
      score: 0,
      evidence: "no source files and no served HTML \u2014 nothing to evaluate",
      durationMs: 0,
      costUsd: null,
      available: false,
      error: "no input artifact"
    };
  }
  try {
    const { value, result } = await callLlmJson(
      {
        model: opts.model,
        messages: [
          {
            role: "system",
            content: "You are a holistic code reviewer answering one question: did the agent build the right app for the user. Return strict JSON. No prose outside."
          },
          { role: "user", content: buildPrompt2(input, opts) }
        ],
        jsonSchema: { name: "intent_match_judge", schema: INTENT_SCHEMA },
        temperature: 0,
        timeoutMs: opts.timeoutMs
      },
      opts.llm
    );
    // A non-numeric score would otherwise surface as NaN; treat it as 0.
    const parsedScore = Number(value?.score ?? 0);
    const score = Number.isNaN(parsedScore) ? 0 : Math.max(0, Math.min(1, parsedScore));
    return {
      kind: "intent-match",
      version: INTENT_MATCH_JUDGE_VERSION,
      score: Number(score.toFixed(3)),
      evidence: String(value?.evidence ?? "").slice(0, 400),
      durationMs: Date.now() - start,
      costUsd: result.costUsd ?? null,
      available: true
    };
  } catch (err) {
    return {
      kind: "intent-match",
      version: INTENT_MATCH_JUDGE_VERSION,
      score: 0,
      evidence: "",
      durationMs: Date.now() - start,
      costUsd: null,
      available: false,
      error: err instanceof Error ? err.message : String(err)
    };
  }
}
|
|
9036
|
+
/**
 * Binds `options` once and returns a judge that only needs the input.
 *
 * @param {object} [options] - forwarded unchanged to runIntentMatchJudge.
 * @returns {(input: object) => Promise<object>} pre-configured judge.
 */
function createIntentMatchJudge(options = {}) {
  return function judge(input) {
    return runIntentMatchJudge(input, options);
  };
}
|
|
9039
|
+
|
|
9040
|
+
// src/flow-layer.ts
|
|
9041
|
+
/**
 * Builds a layer that drives a scripted flow: open a URL, then run the
 * spec's steps in order through a runner.
 *
 * The flow spec comes from `ctx.env.flowSpec` when present, otherwise from
 * `input.flowSpec`; with neither, the layer reports "skipped". If
 * `input.runner()` throws or rejects, the layer reports "error". A step whose
 * `runner.step()` throws is recorded as a failed step. The runner is always
 * closed afterwards, and errors from `close()` are ignored on purpose so
 * cleanup cannot mask the flow result.
 *
 * Score is (open ok ? 1 : 0) + passed steps, divided by 1 + total steps.
 *
 * @param {object} input - runner (async factory returning {open, step, close}),
 *   plus optional name ("flow"), dependsOn (["serve"]), weight (1),
 *   capMs (60000) and flowSpec ({url, steps, continueOnFail?}).
 * @returns {object} layer definition with an async `run(ctx)`.
 */
function flowLayer(input) {
  const layerName = input.name ?? "flow";
  return {
    name: layerName,
    dependsOn: input.dependsOn ?? ["serve"],
    weight: input.weight ?? 1,
    capMs: input.capMs ?? 6e4,
    run: async (ctx) => {
      const start = Date.now();
      const spec = ctx.env?.flowSpec ?? input.flowSpec;
      if (!spec) {
        return {
          layer: layerName,
          status: "skipped",
          durationMs: 0,
          findings: [],
          reason: "no flowSpec supplied"
        };
      }
      let runner;
      try {
        runner = await input.runner();
      } catch (err) {
        return {
          layer: layerName,
          status: "error",
          durationMs: Date.now() - start,
          findings: [
            {
              severity: "major",
              message: `flow runner failed to start: ${err instanceof Error ? err.message : String(err)}`
            }
          ],
          reason: "runner-init-failed"
        };
      }
      const findings = [];
      const stepResults = [];
      let openOk = false;
      try {
        const opened = await runner.open(spec.url);
        openOk = opened.ok;
        if (!opened.ok) {
          findings.push({
            severity: "major",
            message: `flow.open(${spec.url}) failed${opened.evidence ? `: ${opened.evidence}` : ""}`
          });
        }
        // After a failed open, steps only run when the spec opts in.
        if (openOk || spec.continueOnFail) {
          for (let i = 0; i < spec.steps.length; i++) {
            const step = spec.steps[i];
            // The context may not carry an abort signal; treat that as "not aborted".
            if (ctx.signal?.aborted) break;
            const stepStart = Date.now();
            let result;
            try {
              result = await runner.step(step);
            } catch (err) {
              result = {
                ok: false,
                evidence: err instanceof Error ? err.message : String(err),
                durationMs: Date.now() - stepStart
              };
            }
            stepResults.push({ step, result, index: i });
            if (!result.ok) {
              findings.push({
                severity: step.severity ?? "major",
                message: `step[${i}] ${step.action}${step.target ? `(${step.target})` : ""} failed${result.evidence ? `: ${result.evidence}` : ""}`
              });
              if (!spec.continueOnFail) break;
            }
          }
        }
      } finally {
        try {
          await runner.close();
        } catch {
          // Best-effort cleanup: a failing close must not replace the flow outcome.
        }
      }
      const totalSteps = spec.steps.length;
      const ranSteps = stepResults.length;
      const passedSteps = stepResults.filter((s) => s.result.ok).length;
      const status = openOk && passedSteps === totalSteps ? "pass" : "fail";
      // The open counts as one unit, so the denominator is always at least 1.
      const score = Number((((openOk ? 1 : 0) + passedSteps) / (1 + totalSteps)).toFixed(3));
      return {
        layer: layerName,
        status,
        score,
        durationMs: Date.now() - start,
        findings,
        reason: status === "pass" ? `${totalSteps}/${totalSteps} steps passed` : `${passedSteps}/${totalSteps} steps passed${ranSteps < totalSteps ? ` (stopped at step ${ranSteps})` : ""}`,
        diagnostics: {
          flowOpenOk: openOk ? 1 : 0,
          flowStepsTotal: totalSteps,
          flowStepsPassed: passedSteps,
          flowStepsRan: ranSteps
        }
      };
    }
  };
}
|
|
9143
|
+
|
|
9144
|
+
// src/deploy-gate-layer.ts
|
|
9145
|
+
/**
 * Creates the "deploy" gate layer: obtains a deploy runner, executes it once
 * and turns the outcome into a layer result.
 *
 * @param {object} input
 * @param {Function} input.runner - Factory (sync or async) returning an object with `run()`.
 * @param {string} [input.name] - Layer name, defaults to "deploy".
 * @param {string[]} [input.dependsOn] - Defaults to ["build"].
 * @param {number} [input.weight] - Defaults to 1.
 * @param {number} [input.capMs] - Defaults to 120000.
 * @param {boolean} [input.requireArtifact] - Defaults to true; when true the run
 *   only passes if the runner reports `artifactValid === true`.
 * @param {string} [input.family] - Appended to the pass reason when set.
 * @returns {object} Layer descriptor `{ name, dependsOn, weight, capMs, run }`.
 */
function deployGateLayer(input) {
  const layerName = input.name ?? "deploy";
  const requireArtifact = input.requireArtifact ?? true;
  const describeError = (err) => (err instanceof Error ? err.message : String(err));
  // Shared shape for both "could not start" and "threw while running".
  const errorResult = (start, message, reason) => ({
    layer: layerName,
    status: "error",
    durationMs: Date.now() - start,
    findings: [{ severity: "major", message }],
    reason
  });

  async function run(ctx) {
    const start = Date.now();

    let runner;
    try {
      runner = await input.runner();
    } catch (err) {
      return errorResult(
        start,
        `deploy runner failed to start: ${describeError(err)}`,
        "runner-init-failed"
      );
    }

    let result;
    try {
      result = await runner.run();
    } catch (err) {
      return errorResult(start, `deploy command threw: ${describeError(err)}`, "runner-throw");
    }

    // The abort flag is only inspected once the runner has settled.
    if (ctx.signal.aborted) {
      return {
        layer: layerName,
        status: "timeout",
        durationMs: Date.now() - start,
        findings: [],
        reason: "aborted by overall cap"
      };
    }

    const artifactOk = !requireArtifact || result.artifactValid === true;
    const pass = result.ok && artifactOk;

    const findings = [];
    if (!result.ok) {
      findings.push({
        severity: "critical",
        message: "deploy build exited non-zero",
        // Only the tail of the output is kept as evidence.
        evidence: (result.output ?? "").slice(-1200)
      });
    } else if (requireArtifact && !result.artifactValid) {
      findings.push({
        severity: "major",
        message: `deploy build succeeded but artifact ${result.artifactDir ?? "(unknown)"} is invalid or empty`
      });
    }

    let reason;
    if (pass) {
      const familyNote = input.family ? ` (${input.family})` : "";
      const artifactNote = result.artifactDir ? ` \u2192 ${result.artifactDir}` : "";
      reason = `deploy build OK${familyNote}${artifactNote}`;
    } else if (!result.ok) {
      reason = "build command failed";
    } else {
      reason = "artifact missing or invalid";
    }

    return {
      layer: layerName,
      status: pass ? "pass" : "fail",
      score: pass ? 1 : 0,
      durationMs: Date.now() - start,
      findings,
      reason,
      diagnostics: {
        deployBuildOk: result.ok ? 1 : 0,
        deployArtifactOk: result.artifactValid === true ? 1 : 0,
        deployBuildMs: result.durationMs ?? null
      }
    };
  }

  return {
    name: layerName,
    dependsOn: input.dependsOn ?? ["build"],
    weight: input.weight ?? 1,
    capMs: input.capMs ?? 12e4,
    run
  };
}
|
|
9229
|
+
/**
 * Deploy runner that executes a build command through `input.exec` and then
 * asks `input.exists` whether the expected entry file is present.
 *
 * @param {object} input
 * @param {Function} input.exec - Called as `exec(command, { cwd, timeoutMs })`; its result
 *   is read for `exitCode`, `stderr` and `stdout`.
 * @param {Function} input.exists - Called with `"<artifactDir>/<artifactEntry>"`.
 * @param {string} [input.workdir] - Passed to `exec` as `cwd`.
 * @param {string} [input.buildCommand] - Defaults to "npm run build".
 * @param {string} [input.artifactDir] - Defaults to "dist".
 * @param {string} [input.artifactEntry] - Defaults to "index.html".
 * @param {number} [input.timeoutMs] - Defaults to 90000.
 * @returns {{ run: Function }} Runner whose `run()` resolves to
 *   `{ ok, output, durationMs, artifactDir, artifactValid }`.
 */
function viteDeployRunner(input) {
  async function run() {
    const startedAt = Date.now();
    const command = input.buildCommand ?? "npm run build";
    const artifactDir = input.artifactDir ?? "dist";
    const entryPath = `${artifactDir}/${input.artifactEntry ?? "index.html"}`;
    const timeoutMs = input.timeoutMs ?? 9e4;

    const execResult = await input.exec(command, { cwd: input.workdir, timeoutMs });
    const ok = execResult.exitCode === 0;

    // The existence probe runs even after a failed build; a throwing probe
    // counts as "artifact not valid".
    let artifactValid;
    try {
      const entryPresent = Boolean(await input.exists(entryPath));
      artifactValid = ok && entryPresent;
    } catch {
      artifactValid = false;
    }

    // stderr wins when non-empty, otherwise stdout; only the tail is kept.
    const capturedOutput = (execResult.stderr || execResult.stdout) ?? "";

    return {
      ok,
      output: capturedOutput.slice(-1500),
      durationMs: Date.now() - startedAt,
      artifactDir,
      artifactValid
    };
  }

  return { run };
}
|
|
9257
|
+
|
|
9258
|
+
// src/keyword-coverage-judge.ts
|
|
9259
|
+
/**
 * Regex-based check for whether `html` contains an element matching a small
 * subset of CSS selectors. No DOM parsing is performed.
 *
 * Supported selector shapes:
 *   - `tag`
 *   - `tag[attr=value]` (value optionally quoted in the selector; the HTML
 *     attribute itself must be quoted)
 *   - `tag[attr]`
 *
 * Names are matched as whole, hyphen-aware tokens: `nav` does not match
 * `<nav-bar>`, and `input[type=email]` does not match
 * `<input data-type="email">`. A plain `\b` boundary would accept both,
 * because `-` counts as a word boundary.
 *
 * @param {string} html - Markup to search (case-insensitive).
 * @param {string} selector - Selector in one of the supported shapes.
 * @returns {boolean|null} `true`/`false` for supported selectors, `null` when
 *   the selector shape is not supported.
 */
function htmlContainsElement(html, selector) {
  const escapeRegex = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Lookarounds asserting that a name is not glued to more name characters.
  const NOT_PRECEDED = "(?<![\\w-])";
  const NOT_FOLLOWED = "(?![\\w-])";
  const openTag = (tag) => `<${tag}${NOT_FOLLOWED}`;

  const tagOnly = /^([a-zA-Z][\w-]*)$/.exec(selector);
  if (tagOnly) {
    return new RegExp(openTag(tagOnly[1]), "i").test(html);
  }

  const tagAttrEq = /^([a-zA-Z][\w-]*)\[([\w-]+)\s*=\s*["']?([^"'\]]+)["']?\]$/.exec(selector);
  if (tagAttrEq) {
    const [, tag, attr, value] = tagAttrEq;
    const pattern = `${openTag(tag)}[^>]*${NOT_PRECEDED}${attr}\\s*=\\s*["']${escapeRegex(value)}["']`;
    return new RegExp(pattern, "i").test(html);
  }

  const tagAttrPresence = /^([a-zA-Z][\w-]*)\[([\w-]+)\]$/.exec(selector);
  if (tagAttrPresence) {
    const [, tag, attr] = tagAttrPresence;
    const pattern = `${openTag(tag)}[^>]*${NOT_PRECEDED}${attr}${NOT_FOLLOWED}`;
    return new RegExp(pattern, "i").test(html);
  }

  return null;
}
|
|
9282
|
+
/**
 * Collects the absolute URLs referenced by `<link href>` and `<script src>`
 * tags in `html`.
 *
 * @param {string} html - Markup to scan.
 * @param {string} baseUrl - Base used to resolve relative references.
 * @returns {string[]} De-duplicated absolute URLs, link targets first, then
 *   script sources, each group in document order. References that `URL`
 *   cannot parse are skipped.
 */
function extractAssetUrls(html, baseUrl) {
  const referencePatterns = [
    /<link\b[^>]*\bhref\s*=\s*["']([^"']+)["'][^>]*>/gi,
    /<script\b[^>]*\bsrc\s*=\s*["']([^"']+)["'][^>]*>/gi
  ];
  const seen = /* @__PURE__ */ new Set();
  for (const pattern of referencePatterns) {
    for (const [, reference] of html.matchAll(pattern)) {
      let absolute;
      try {
        absolute = new URL(reference, baseUrl).toString();
      } catch {
        // Unparseable reference: leave it out rather than failing the scan.
        continue;
      }
      seen.add(absolute);
    }
  }
  return [...seen];
}
|
|
9298
|
+
/**
 * Scores how many expected concepts appear in a page.
 *
 * A concept counts as found when at least one of its keywords occurs
 * (case-insensitively) in the HTML or any asset body and, if it declares a
 * `requiredElement`, that selector is either matched in the HTML or reported
 * as unsupported (`null`) by `htmlContainsElement`.
 *
 * @param {string} html - Page markup.
 * @param {Array<{name: string, keywords: string[], requiredElement?: string}>} expectedConcepts
 * @param {string[]} [assets] - Extra text bodies searched alongside the HTML.
 * @returns {object} `{ score, presentCount, totalCount, findings, durationMs,
 *   totalAssembledBytes }`; `totalAssembledBytes` is the character length of
 *   the combined search text. An empty concept list yields an all-zero result.
 */
function runKeywordCoverageJudge(html, expectedConcepts, assets = []) {
  const startedAt = Date.now();
  const totalCount = expectedConcepts.length;
  if (totalCount === 0) {
    return {
      score: 0,
      presentCount: 0,
      totalCount: 0,
      findings: [],
      durationMs: 0,
      totalAssembledBytes: 0
    };
  }

  const haystack = `${html}\n${assets.join("\n")}`.toLowerCase();

  const findings = [];
  let presentCount = 0;
  for (const concept of expectedConcepts) {
    const matchedKeywords = concept.keywords.filter((keyword) =>
      haystack.includes(keyword.toLowerCase())
    );
    let requiredElementPresent = null;
    if (concept.requiredElement) {
      // The element gate looks at the HTML only, not at asset bodies.
      requiredElementPresent = htmlContainsElement(html, concept.requiredElement);
    }
    const gateOpen = requiredElementPresent === true || requiredElementPresent === null;
    const found = matchedKeywords.length > 0 && gateOpen;
    if (found) presentCount += 1;
    findings.push({ concept: concept.name, found, matchedKeywords, requiredElementPresent });
  }

  return {
    score: presentCount / totalCount,
    presentCount,
    totalCount,
    findings,
    durationMs: Date.now() - startedAt,
    totalAssembledBytes: haystack.length
  };
}
|
|
9331
|
+
/**
 * Fetches a preview page plus the assets it links to, then runs the keyword
 * coverage judge over the downloaded text.
 *
 * @param {string} previewUrl - Page to fetch.
 * @param {Array<object>} expectedConcepts - Concepts handed to the judge.
 * @param {object} [options]
 * @param {Function} [options.fetch] - Fetch implementation; defaults to `globalThis.fetch`.
 * @param {number} [options.htmlTimeoutMs] - Page fetch timeout, default 5000.
 * @param {number} [options.assetTimeoutMs] - Per-asset fetch timeout, default 3000.
 * @returns {Promise<object>} The judge result. If the page fetch throws or
 *   answers with a non-OK status, a `softFail` result carrying the error text
 *   is returned instead. Assets that fail to load contribute an empty string.
 *   With no expected concepts an all-zero result is returned without fetching.
 */
async function runKeywordCoverageJudgeUrl(previewUrl, expectedConcepts, options = {}) {
  const startedAt = Date.now();
  const doFetch = options.fetch ?? globalThis.fetch;
  const htmlTimeout = options.htmlTimeoutMs ?? 5e3;
  const assetTimeout = options.assetTimeoutMs ?? 3e3;

  if (expectedConcepts.length === 0) {
    return {
      score: 0,
      presentCount: 0,
      totalCount: 0,
      findings: [],
      durationMs: 0,
      totalAssembledBytes: 0
    };
  }

  let html = "";
  try {
    const response = await doFetch(previewUrl, {
      redirect: "follow",
      signal: AbortSignal.timeout(htmlTimeout)
    });
    if (!response.ok) {
      return softFail(expectedConcepts, startedAt, `preview HTTP ${response.status}`);
    }
    html = await response.text();
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    return softFail(expectedConcepts, startedAt, detail);
  }

  // Best-effort asset download: any failure degrades to an empty body.
  const loadAsset = async (assetUrl) => {
    try {
      const response = await doFetch(assetUrl, {
        redirect: "follow",
        signal: AbortSignal.timeout(assetTimeout)
      });
      return response.ok ? await response.text() : "";
    } catch {
      return "";
    }
  };

  const assetUrls = extractAssetUrls(html, previewUrl);
  const assetBodies = await Promise.all(assetUrls.map((assetUrl) => loadAsset(assetUrl)));
  return runKeywordCoverageJudge(html, expectedConcepts, assetBodies);
}
|
|
9376
|
+
/**
 * Builds the zero-score result used when the preview page could not be loaded.
 *
 * @param {Array<{name: string}>} expectedConcepts - Every concept is reported as not found.
 * @param {number} start - Epoch milliseconds at which the attempt began.
 * @param {string} error - Failure description, copied onto the result.
 * @returns {object} Judge-shaped result with an additional `error` field.
 */
function softFail(expectedConcepts, start, error) {
  const findings = [];
  for (const concept of expectedConcepts) {
    findings.push({
      concept: concept.name,
      found: false,
      matchedKeywords: [],
      requiredElementPresent: null
    });
  }
  return {
    score: 0,
    presentCount: 0,
    totalCount: expectedConcepts.length,
    findings,
    durationMs: Date.now() - start,
    totalAssembledBytes: 0,
    error
  };
}
|
|
9392
|
+
|
|
9393
|
+
// src/error-count-extractor.ts
|
|
9394
|
+
/**
 * Built-in error patterns, tried in this order. Each entry has a `name`, a
 * `regex`, and optionally a `transform` that converts one match into the
 * number of errors it represents (without it, every match counts as one).
 */
var ERROR_COUNT_PATTERNS = [
  {
    // tsc / ts-node: `src/foo.ts(12,3): error TS1234: ...`
    name: "typescript-tsc",
    regex: /[\w./-]+\(\d+,\d+\): error TS\d+:/g
  },
  {
    // pytest: `FAILED tests/test_foo.py::test_bar`
    name: "pytest-failed",
    regex: /^FAILED\s+\S+/gm
  },
  {
    // rustc: `error[E0308]: ...` or `error: ...`
    name: "rustc",
    regex: /^error(?:\[[A-Z]\d+\])?:/gm
  },
  {
    // go build: `./foo.go:12:3: ...` — any file:line:col: is an error line
    name: "golang",
    regex: /^\.\/[\w./-]+\.go:\d+:\d+:/gm
  },
  {
    // eslint default formatter per-line: ` 12:34 error message rule-id`
    name: "eslint",
    regex: /^\s+\d+:\d+\s+error\s+/gm
  },
  {
    // eslint summary line: `✖ 17 problems (12 errors, 5 warnings)`
    // Listed after the per-line pattern so it only applies when the per-line
    // formatter output is absent; the transform reads the errors count directly.
    name: "eslint-summary",
    regex: /✖\s+\d+\s+problems?\s+\((\d+)\s+errors?/gm,
    transform: (m) => Number(m[1] ?? 0)
  }
];

/**
 * Counts errors in tool output using the first pattern that matches.
 *
 * @param {string} text - Raw tool output.
 * @param {object} [opts]
 * @param {Array<{name: string, regex: RegExp, transform?: Function}>} [opts.extra]
 *   Additional patterns, tried before the built-in ones. A regex without the
 *   `g` flag is accepted and treated as global.
 * @param {string[]} [opts.only] - When given, only patterns with these names are tried.
 * @returns {{count: number|null, matched: string|null, samples: string[]}}
 *   `count` and `matched` are `null` when the text is empty or nothing
 *   matched; `samples` holds up to five matched substrings.
 */
function extractErrorCount(text, opts = {}) {
  if (!text) return { count: null, matched: null, samples: [] };
  const patterns = [...(opts.extra ?? []), ...ERROR_COUNT_PATTERNS].filter(
    (p) => !opts.only || opts.only.includes(p.name)
  );
  for (const p of patterns) {
    // String.prototype.matchAll throws a TypeError for non-global regexes, so
    // promote caller-supplied patterns that omit the `g` flag.
    const regex = p.regex.global ? p.regex : new RegExp(p.regex, `${p.regex.flags}g`);
    const matches2 = Array.from(text.matchAll(regex));
    if (matches2.length === 0) continue;
    const count = p.transform
      ? matches2.reduce((sum2, m) => sum2 + p.transform(m), 0)
      : matches2.length;
    return {
      count,
      matched: p.name,
      samples: matches2.slice(0, 5).map((m) => m[0])
    };
  }
  return { count: null, matched: null, samples: [] };
}
|
|
9446
|
+
|
|
9447
|
+
// src/reference-replay.ts
|
|
9448
|
+
import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
|
|
9449
|
+
import { dirname as dirname2 } from "path";
|
|
9450
|
+
// Score a reference/candidate pair must reach to count as a match when the
// caller supplies no `matchThreshold`.
var DEFAULT_MATCH_THRESHOLD = 0.55;

// Known dataset splits. The order doubles as the sort order for per-split
// reports (see bySplitOrder).
var ALL_SPLITS = [
  "train",
  "dev",
  "test",
  "holdout"
];
|
|
9452
|
+
/**
 * Replays each eligible case through the adapter, scores the returned
 * candidates against the case's references, and assembles a run record.
 *
 * Cases are processed sequentially. A case is skipped when its split
 * (default "train") is "holdout" without `options.includeHoldout`, or is not
 * in `options.splits` (default: every known split).
 *
 * @param {Array<object>} cases - Cases with `id`, `input`, `references` and
 *   optional `split` / `metadata`.
 * @param {object} options
 * @param {Function|{run: Function}} options.adapter - Produces candidates for a case.
 * @param {Function} [options.now] - Clock, defaults to `Date.now`.
 * @param {string} [options.runId] - Defaults to `<variantId|"reference-replay">-<startedAt>`.
 * @param {string} [options.variantId]
 * @param {string[]} [options.splits]
 * @param {boolean} [options.includeHoldout]
 * @param {AbortSignal} [options.abortSignal] - Checked before and after each adapter call.
 * @param {boolean} [options.continueOnError] - When truthy, an adapter failure is
 *   recorded on the case (`error`, with no candidates) instead of being rethrown.
 * @param {Function} [options.matcher]
 * @param {number} [options.matchThreshold]
 * @param {string} [options.matchStrategy]
 * @param {object} [options.metadata]
 * @param {{save: Function}} [options.store] - Receives the finished run.
 * @returns {Promise<object>} The run record.
 * @throws Rethrows the adapter/abort error when the signal is aborted or
 *   `continueOnError` is not set.
 */
async function runReferenceReplay(cases, options) {
  const clock = options.now ?? Date.now;
  const startedAt = clock();
  const runId = options.runId ?? `${options.variantId ?? "reference-replay"}-${startedAt}`;
  const allowedSplits = new Set(options.splits ?? ALL_SPLITS);

  // Split filtering already happened here, so scoring must not drop holdout again.
  const scoringOptions = () => ({
    matcher: options.matcher,
    matchThreshold: options.matchThreshold,
    matchStrategy: options.matchStrategy,
    includeHoldout: true
  });
  // Yields `{ [key]: value }` only when the value is defined.
  const optional = (key, value) => (value !== void 0 ? { [key]: value } : {});

  const caseRuns = [];
  for (const [caseIndex, replayCase] of cases.entries()) {
    const split = replayCase.split ?? "train";
    const holdoutExcluded = split === "holdout" && !options.includeHoldout;
    if (holdoutExcluded || !allowedSplits.has(split)) continue;

    const caseStart = clock();
    const metadataField = optional("metadata", replayCase.metadata);

    let candidates = [];
    let error;
    try {
      throwIfAborted(options.abortSignal);
      candidates = await runAdapter(
        options.adapter,
        { id: replayCase.id, split, input: replayCase.input, ...metadataField },
        { runId, caseIndex, abortSignal: options.abortSignal }
      );
      throwIfAborted(options.abortSignal);
    } catch (cause) {
      // Aborts always propagate; other failures only when not told to continue.
      if (options.abortSignal?.aborted || !options.continueOnError) throw cause;
      error = cause instanceof Error ? cause.message : String(cause);
    }

    const [scenarioScore] = scoreReferenceReplay(
      [
        {
          id: replayCase.id,
          split,
          references: replayCase.references,
          candidates,
          ...metadataField
        }
      ],
      scoringOptions()
    ).scenarios;

    caseRuns.push({
      caseId: replayCase.id,
      split,
      input: replayCase.input,
      references: replayCase.references,
      candidates,
      score: scenarioScore,
      durationMs: Math.max(0, clock() - caseStart),
      ...metadataField,
      ...optional("error", error)
    });
  }

  const completedAt = clock();
  const scenarios = caseRuns.map((caseRun) => ({
    id: caseRun.caseId,
    split: caseRun.split,
    references: caseRun.references,
    candidates: caseRun.candidates,
    ...optional("metadata", caseRun.metadata)
  }));

  const run = {
    id: runId,
    startedAt,
    completedAt,
    durationMs: Math.max(0, completedAt - startedAt),
    cases: caseRuns,
    score: scoreReferenceReplay(scenarios, scoringOptions()),
    ...optional("variantId", options.variantId),
    ...optional("metadata", options.metadata)
  };
  await options.store?.save(run);
  return run;
}
|
|
9536
|
+
/**
 * Run-level convenience wrapper: applies the promotion policy to the `score`
 * of two run records.
 *
 * @param {{score: object}} baseline - Baseline run.
 * @param {{score: object}} candidate - Candidate run.
 * @param {object} [policy] - Forwarded unchanged to `decideReferenceReplayPromotion`.
 * @returns {object} The promotion decision.
 */
function decideReferenceReplayRunPromotion(baseline, candidate, policy = {}) {
  const { score: baselineScore } = baseline;
  const { score: candidateScore } = candidate;
  return decideReferenceReplayPromotion(baselineScore, candidateScore, policy);
}
|
|
9539
|
+
/**
 * Run store that keeps everything in a private array.
 *
 * @param {Iterable<object>} [initial] - Runs to seed the store with; copied, not retained.
 * @returns {{save: Function, list: Function}} `save(run)` appends a run,
 *   `list()` resolves to a fresh copy of the stored runs.
 */
function inMemoryReferenceReplayStore(initial = []) {
  const stored = [...initial];
  const save = async (run) => {
    stored.push(run);
  };
  const list = async () => stored.slice();
  return { save, list };
}
|
|
9550
|
+
/**
 * Run store backed by a JSON-lines file (one serialized run per line).
 *
 * @param {string} path - File location; parent directories are created on save.
 * @returns {{save: Function, list: Function}} `save(run)` appends one line,
 *   `list()` resolves to every stored run, or `[]` when the file does not exist.
 */
function jsonlReferenceReplayStore(path) {
  const save = async (run) => {
    mkdirSync2(dirname2(path), { recursive: true });
    appendFileSync2(path, JSON.stringify(run) + "\n");
  };
  const list = async () => {
    return existsSync4(path) ? readJsonl(path) : [];
  };
  return { save, list };
}
|
|
9562
|
+
/**
 * Scores scenarios (references vs. candidates) and aggregates the results
 * overall and per split.
 *
 * @param {Array<object>} scenarios - Each with `references`, `candidates` and optional `split`.
 * @param {object} [options]
 * @param {Function} [options.matcher] - Defaults to `defaultReferenceReplayMatcher`.
 * @param {number} [options.matchThreshold] - Defaults to `DEFAULT_MATCH_THRESHOLD`.
 * @param {string} [options.matchStrategy] - Defaults to "reference-order".
 * @param {string[]} [options.splits] - Splits to include; defaults to all known splits.
 * @param {boolean} [options.includeHoldout] - Holdout scenarios are dropped unless truthy.
 * @returns {{scenarios: object[], aggregate: object, bySplit: object}}
 */
function scoreReferenceReplay(scenarios, options = {}) {
  const matcher = options.matcher ?? defaultReferenceReplayMatcher;
  const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
  const matchStrategy = options.matchStrategy ?? "reference-order";
  const allowedSplits = new Set(options.splits ?? ALL_SPLITS);

  const isEligible = (scenario) => {
    const split = scenario.split ?? "train";
    // Holdout needs an explicit opt-in on top of being an allowed split.
    if (split === "holdout" && !options.includeHoldout) return false;
    return allowedSplits.has(split);
  };

  const scores = [];
  for (const scenario of scenarios.filter(isEligible)) {
    scores.push(scoreScenario(scenario, matcher, threshold, matchStrategy));
  }

  return {
    scenarios: scores,
    aggregate: aggregateScenarioScores(scores),
    bySplit: aggregateBySplit(scores)
  };
}
|
|
9578
|
+
/**
 * Produces a per-split comparison of F1 and recall between two score reports.
 *
 * Every split present in either report gets a row; a split missing on one
 * side is compared against an all-zero aggregate.
 *
 * @param {{bySplit: object}} baseline
 * @param {{bySplit: object}} candidate
 * @returns {Array<object>} Rows ordered by `bySplitOrder`, each with baseline
 *   and candidate values plus `f1Delta` / `recallDelta` (candidate minus baseline).
 */
function compareReferenceReplay(baseline, candidate) {
  const splitNames = /* @__PURE__ */ new Set([
    ...Object.keys(baseline.bySplit),
    ...Object.keys(candidate.bySplit)
  ]);
  const ordered = Array.from(splitNames).sort(bySplitOrder);

  const rows = [];
  for (const split of ordered) {
    const before = baseline.bySplit[split] ?? emptyAggregate();
    const after = candidate.bySplit[split] ?? emptyAggregate();
    rows.push({
      split,
      baselineF1: before.f1,
      candidateF1: after.f1,
      f1Delta: after.f1 - before.f1,
      baselineRecall: before.recall,
      candidateRecall: after.recall,
      recallDelta: after.recall - before.recall
    });
  }
  return rows;
}
|
|
9597
|
+
/**
 * Decides whether a candidate score report may replace the baseline.
 *
 * Checks run in this order; the first failing one determines the reason:
 *   1. every required split exists in both reports,
 *   2. at least one required split was compared,
 *   3. no split (required or not) lost more F1 than `maxRegression`,
 *   4. both reports contain a holdout split (when required),
 *   5. the mean F1 delta over required splits reaches `minF1Delta`.
 *
 * @param {object} baseline - Score report with `aggregate` and `bySplit`.
 * @param {object} candidate - Score report with `aggregate` and `bySplit`.
 * @param {object} [policy]
 * @param {string[]} [policy.requiredSplits] - Defaults to ["dev", "test"].
 * @param {number} [policy.minF1Delta] - Defaults to 0.
 * @param {number} [policy.maxRegression] - Defaults to 0.
 * @param {boolean} [policy.requireHoldoutNonRegression] - Defaults to true.
 * @returns {{promote: boolean, reason: string, aggregateDelta: number,
 *   comparisons: object[], regressions: object[]}}
 */
function decideReferenceReplayPromotion(baseline, candidate, policy = {}) {
  const requiredSplits = policy.requiredSplits ?? ["dev", "test"];
  const minF1Delta = policy.minF1Delta ?? 0;
  const maxRegression = policy.maxRegression ?? 0;
  const requireHoldout = policy.requireHoldoutNonRegression ?? true;

  const comparisons = compareReferenceReplay(baseline, candidate);
  const regressions = comparisons.filter((item) => item.f1Delta < -maxRegression);
  const aggregateDelta = candidate.aggregate.f1 - baseline.aggregate.f1;

  // Every outcome carries the same evidence; only the verdict and reason vary.
  const decision = (promote, reason) => ({
    promote,
    reason,
    aggregateDelta,
    comparisons,
    regressions
  });
  const inBoth = (split) => hasSplit(baseline, split) && hasSplit(candidate, split);

  const missingRequiredSplits = requiredSplits.filter((split) => !inBoth(split));
  if (missingRequiredSplits.length > 0) {
    return decision(
      false,
      `Required split missing from baseline or candidate: ${missingRequiredSplits.join(", ")}`
    );
  }

  const compared = comparisons.filter((item) => requiredSplits.includes(item.split));
  if (compared.length === 0) {
    return decision(false, `No required split scores found: ${requiredSplits.join(", ")}`);
  }

  if (regressions.length > 0) {
    return decision(false, `Regression in ${regressions.map((r) => r.split).join(", ")}`);
  }

  if (requireHoldout && !inBoth("holdout")) {
    return decision(false, "Holdout split is required for promotion");
  }

  const requiredMeanDelta = mean4(compared.map((item) => item.f1Delta));
  if (requiredMeanDelta < minF1Delta) {
    return decision(
      false,
      `Required split F1 delta ${formatPct(requiredMeanDelta)} below ${formatPct(minF1Delta)}`
    );
  }

  return decision(
    true,
    `Required splits improved by ${formatPct(requiredMeanDelta)} with no regressions`
  );
}
|
|
9661
|
+
/**
 * Default similarity between a reference and a candidate.
 *
 * Combines token Jaccard similarity of "title description" (weight 0.85),
 * tag overlap (weight 0.15) and a severity adjustment: +0.1 when both
 * severities are present and equal after normalization, -0.05 when both are
 * present and differ, 0 when either is missing. The sum is clamped to [0, 1].
 *
 * @param {object} reference - Has `title`, optional `description`, `severity`, `tags`.
 * @param {object} candidate - Same shape as `reference`.
 * @returns {{score: number, reason: string}} Score plus a breakdown of its parts.
 */
function defaultReferenceReplayMatcher(reference, candidate) {
  const textOf = (item) => `${item.title} ${item.description ?? ""}`;
  const textScore = tokenJaccard(textOf(reference), textOf(candidate));

  let severityScore = 0;
  if (reference.severity && candidate.severity) {
    const sameSeverity = normalize(reference.severity) === normalize(candidate.severity);
    severityScore = sameSeverity ? 0.1 : -0.05;
  }

  const tagScore = tagOverlap(reference.tags, candidate.tags) * 0.15;
  const score = clamp012(textScore * 0.85 + tagScore + severityScore);
  const reason = `token=${textScore.toFixed(2)} tags=${tagScore.toFixed(2)} severity=${severityScore.toFixed(2)}`;
  return { score, reason };
}
|
|
9670
|
+
/**
 * Scores one scenario with the requested matching strategy.
 *
 * @param {object} scenario
 * @param {Function} matcher
 * @param {number} threshold
 * @param {string} matchStrategy - "global-greedy" selects the global greedy
 *   matcher; any other value uses reference-order matching.
 * @returns {object} Scenario score.
 */
function scoreScenario(scenario, matcher, threshold, matchStrategy) {
  if (matchStrategy === "global-greedy") {
    return scoreScenarioGlobalGreedy(scenario, matcher, threshold);
  }
  return scoreScenarioReferenceOrder(scenario, matcher, threshold);
}
|
|
9673
|
+
/**
 * Matches references to candidates one reference at a time, in reference
 * order. Each reference takes the best-scoring candidate still available
 * (the earliest one on ties); the candidate is consumed only when its score
 * reaches `threshold`.
 *
 * @param {object} scenario - Has `id`, `references`, `candidates`.
 * @param {Function} matcher - Pair scorer, invoked through `scorePair`.
 * @param {number} threshold - Minimum score for a match.
 * @returns {object} Scenario score; candidates never consumed count as false positives.
 */
function scoreScenarioReferenceOrder(scenario, matcher, threshold) {
  const pool = scenario.candidates.map((candidate, index) => ({ candidate, index }));
  const outcomes = [];

  for (const reference of scenario.references) {
    let top = null;
    for (const entry of pool) {
      const scored = scorePair(scenario, matcher, reference, entry.candidate);
      if (top === null || scored.score > top.score) {
        top = { ...entry, ...scored };
      }
    }

    const weight = reference.weight ?? 1;
    const base = { scenarioId: scenario.id, referenceId: reference.id };

    if (top === null || top.score < threshold) {
      // Unmatched: still report the closest candidate, if there was one.
      outcomes.push({
        ...base,
        candidateId: top?.candidate.id ?? null,
        score: top?.score ?? 0,
        matched: false,
        weight,
        reason: top?.reason ?? "no candidates"
      });
      continue;
    }

    const position = pool.findIndex((entry) => entry.index === top.index);
    if (position >= 0) pool.splice(position, 1);
    outcomes.push({
      ...base,
      candidateId: top.candidate.id,
      score: top.score,
      matched: true,
      weight,
      reason: top.reason
    });
  }

  return buildScenarioScore(scenario, outcomes, pool.length);
}
|
|
9711
|
+
/**
 * Matches references to candidates greedily over all pairs: every
 * reference/candidate pair is scored, pairs are visited from highest score
 * down (ties broken by reference index, then candidate index), and a pair is
 * accepted when neither side has been used yet. Selection stops at the first
 * pair scoring below `threshold`.
 *
 * @param {object} scenario - Has `id`, `references`, `candidates`.
 * @param {Function} matcher - Pair scorer, invoked through `scorePair`.
 * @param {number} threshold - Minimum score for a match.
 * @returns {object} Scenario score; unselected candidates count as false positives.
 */
function scoreScenarioGlobalGreedy(scenario, matcher, threshold) {
  const ranked = scenario.references.flatMap((reference, referenceIndex) =>
    scenario.candidates.map((candidate, candidateIndex) => ({
      referenceIndex,
      candidateIndex,
      reference,
      candidate,
      ...scorePair(scenario, matcher, reference, candidate)
    }))
  );
  ranked.sort((a, b) => {
    if (b.score !== a.score) return b.score - a.score;
    if (a.referenceIndex !== b.referenceIndex) return a.referenceIndex - b.referenceIndex;
    return a.candidateIndex - b.candidateIndex;
  });

  const chosenByReference = /* @__PURE__ */ new Map();
  const usedCandidates = /* @__PURE__ */ new Set();
  for (const pair of ranked) {
    if (pair.score < threshold) break;
    const referenceTaken = chosenByReference.has(pair.referenceIndex);
    const candidateTaken = usedCandidates.has(pair.candidateIndex);
    if (referenceTaken || candidateTaken) continue;
    chosenByReference.set(pair.referenceIndex, pair);
    usedCandidates.add(pair.candidateIndex);
  }

  const outcomes = scenario.references.map((reference, referenceIndex) => {
    const weight = reference.weight ?? 1;
    const base = { scenarioId: scenario.id, referenceId: reference.id };
    const chosen = chosenByReference.get(referenceIndex);
    if (!chosen) {
      // `ranked` is sorted by score, so the first hit is this reference's best pair.
      const closest = ranked.find((pair) => pair.referenceIndex === referenceIndex);
      return {
        ...base,
        candidateId: closest?.candidate.id ?? null,
        score: closest?.score ?? 0,
        matched: false,
        weight,
        reason: closest?.reason ?? "no candidates"
      };
    }
    return {
      ...base,
      candidateId: chosen.candidate.id,
      score: chosen.score,
      matched: true,
      weight,
      reason: chosen.reason
    };
  });

  const unusedCandidates = scenario.candidates.length - usedCandidates.size;
  return buildScenarioScore(scenario, outcomes, unusedCandidates);
}
|
|
9762
|
+
/**
 * Runs the matcher on one reference/candidate pair and sanitizes its output.
 *
 * @param {object} scenario - Passed to the matcher as third argument; its `id`
 *   is used in the error message.
 * @param {Function} matcher - Returns `{ score, reason? }`.
 * @param {object} reference
 * @param {object} candidate
 * @returns {{score: number, reason: string}} Score clamped to [0, 1]; a missing
 *   reason becomes "".
 * @throws {Error} When the matcher returns a non-finite score.
 */
function scorePair(scenario, matcher, reference, candidate) {
  const { score, reason } = matcher(reference, candidate, scenario);
  const usable = Number.isFinite(score);
  if (!usable) {
    throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${candidate.id}`);
  }
  return { score: clamp012(score), reason: reason ?? "" };
}
|
|
9769
|
+
/**
 * Assembles the score record for one scenario from its per-reference outcomes.
 *
 * @param {object} scenario - Supplies `id`, `split` (default "train") and `references`.
 * @param {Array<{matched: boolean, weight: number}>} matches2 - One outcome per reference.
 * @param {number} falsePositives - Candidates that matched no reference.
 * @returns {object} Counts, weights, precision, recall, F1 and the outcomes themselves.
 */
function buildScenarioScore(scenario, matches2, falsePositives) {
  let matched = 0;
  let matchedWeight = 0;
  let totalWeight = 0;
  for (const outcome of matches2) {
    totalWeight += outcome.weight;
    if (outcome.matched) {
      matched += 1;
      matchedWeight += outcome.weight;
    }
  }

  const total = scenario.references.length;
  const precision = ratio(matched, matched + falsePositives);
  const recall = ratio(matched, total);

  return {
    scenarioId: scenario.id,
    split: scenario.split ?? "train",
    matched,
    total,
    falsePositives,
    matchedWeight,
    totalWeight,
    precision,
    recall,
    f1: f1(precision, recall),
    matches: matches2
  };
}
|
|
9790
|
+
/**
 * Groups scenario scores by split and aggregates each non-empty group.
 *
 * @param {Array<{split: string}>} scores
 * @returns {object} Map from split name to aggregate. Only known splits that
 *   have at least one score appear, in `ALL_SPLITS` order.
 */
function aggregateBySplit(scores) {
  const grouped = {};
  ALL_SPLITS.forEach((split) => {
    const members = scores.filter((entry) => entry.split === split);
    if (members.length === 0) return;
    grouped[split] = aggregateScenarioScores(members);
  });
  return grouped;
}
|
|
9798
|
+
/**
 * Pools scenario scores into one aggregate. Counts and weights are summed
 * first; precision, recall, F1 and weighted recall are then derived from the
 * pooled totals rather than averaged per scenario.
 *
 * @param {Array<object>} scores - Scenario scores.
 * @returns {object} Aggregate with counts, weights, precision, recall, f1, weightedRecall.
 */
function aggregateScenarioScores(scores) {
  const totalOf = (field) => sum(scores.map((entry) => entry[field]));

  const matched = totalOf("matched");
  const total = totalOf("total");
  const falsePositives = totalOf("falsePositives");
  const matchedWeight = totalOf("matchedWeight");
  const totalWeight = totalOf("totalWeight");

  const precision = ratio(matched, matched + falsePositives);
  const recall = ratio(matched, total);
  const weightedRecall = ratio(matchedWeight, totalWeight);

  return {
    matched,
    total,
    falsePositives,
    matchedWeight,
    totalWeight,
    precision,
    recall,
    f1: f1(precision, recall),
    weightedRecall
  };
}
|
|
9818
|
+
/**
 * Returns a fresh aggregate in which every metric is zero. Used as the
 * stand-in for a split that one side of a comparison lacks.
 *
 * @returns {object} New all-zero aggregate object.
 */
function emptyAggregate() {
  const fields = [
    "matched",
    "total",
    "falsePositives",
    "matchedWeight",
    "totalWeight",
    "precision",
    "recall",
    "f1",
    "weightedRecall"
  ];
  return Object.fromEntries(fields.map((field) => [field, 0]));
}
|
|
9831
|
+
/**
 * Tells whether a score report has an aggregate for the given split.
 *
 * @param {{bySplit: object}} score
 * @param {string} split
 * @returns {boolean} True when `score.bySplit[split]` is not `undefined`.
 */
function hasSplit(score, split) {
  const aggregate = score.bySplit[split];
  return typeof aggregate !== "undefined";
}
|
|
9834
|
+
/**
 * Harmonic mean of precision and recall.
 *
 * @param {number} precision
 * @param {number} recall
 * @returns {number} F1 score, or 0 when precision and recall sum to zero.
 */
function f1(precision, recall) {
  const denominator = precision + recall;
  if (denominator === 0) return 0;
  return 2 * precision * recall / denominator;
}
|
|
9837
|
+
/**
 * Division that yields 0 instead of dividing by a non-positive denominator.
 *
 * @param {number} numerator
 * @param {number} denominator
 * @returns {number} `numerator / denominator` when the denominator is > 0, else 0.
 */
function ratio(numerator, denominator) {
  if (denominator > 0) {
    return numerator / denominator;
  }
  return 0;
}
|
|
9840
|
+
/**
 * Jaccard similarity between the token sets of two texts.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} |A ∩ B| / |A ∪ B| over the token sets, or 0 when either
 *   text yields no tokens.
 */
function tokenJaccard(a, b) {
  const first = new Set(tokens(a));
  const second = new Set(tokens(b));
  if (first.size === 0 || second.size === 0) return 0;
  const shared = [...first].filter((token) => second.has(token)).length;
  // Inclusion-exclusion gives the union size.
  const union = first.size + second.size - shared;
  return shared / union;
}
|
|
9850
|
+
/**
 * Overlap between two tag lists after normalization: the number of shared
 * tags divided by the size of the larger normalized tag set.
 *
 * @param {string[]|null|undefined} a
 * @param {string[]|null|undefined} b
 * @returns {number} Overlap ratio, or 0 when either list is missing or empty.
 */
function tagOverlap(a, b) {
  if (!a?.length || !b?.length) return 0;
  const first = new Set(a.map(normalize));
  const second = new Set(b.map(normalize));
  const shared = [...first].filter((tag) => second.has(tag)).length;
  return shared / Math.max(first.size, second.size);
}
|
|
9860
|
+
/**
 * Splits text into comparison tokens: the text is normalized, split on
 * whitespace, and words shorter than three characters or listed in
 * `STOP_WORDS` are dropped.
 *
 * @param {string} text
 * @returns {string[]} Tokens in their original order (duplicates kept).
 */
function tokens(text) {
  const words = normalize(text).split(/\s+/);
  const isMeaningful = (word) => word.length >= 3 && !STOP_WORDS.has(word);
  return words.filter(isMeaningful);
}
|
|
9863
|
+
/**
 * Canonicalizes text for comparison: lowercases it, collapses every run of
 * characters outside `a-z0-9` into a single space, and trims the ends.
 *
 * @param {string} text
 * @returns {string}
 */
function normalize(text) {
  const lowered = text.toLowerCase();
  const spaced = lowered.replace(/[^a-z0-9]+/g, " ");
  return spaced.trim();
}
|
|
9866
|
+
/**
 * Clamps a number into the closed interval [0, 1].
 *
 * @param {number} value
 * @returns {number} The clamped value; non-finite input (NaN, ±Infinity,
 *   non-numbers) yields 0.
 */
function clamp012(value) {
  return Number.isFinite(value) ? Math.min(1, Math.max(0, value)) : 0;
}
|
|
9870
|
+
/**
 * Adds up a list of numbers.
 *
 * @param {number[]} values
 * @returns {number} The total, 0 for an empty list.
 */
function sum(values) {
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total;
}
|
|
9873
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean4(values) {
  if (!values.length) return 0;
  return sum(values) / values.length;
}
|
|
9876
|
+
/**
 * Renders a fraction as a percentage string with one decimal place.
 *
 * @param {number} value - Fraction to scale by 100 (e.g. 0.5).
 * @returns {string} e.g. `"50.0%"`.
 */
function formatPct(value) {
  const percent = (value * 100).toFixed(1);
  return `${percent}%`;
}
|
|
9879
|
+
/**
 * Sort comparator that orders split names by their position in `ALL_SPLITS`.
 *
 * A name missing from `ALL_SPLITS` has index -1 and therefore sorts ahead of
 * every listed split.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Negative, zero, or positive per the usual comparator contract.
 */
function bySplitOrder(a, b) {
  const rankA = ALL_SPLITS.indexOf(a);
  const rankB = ALL_SPLITS.indexOf(b);
  return rankA - rankB;
}
|
|
9882
|
+
/**
 * Invokes an adapter that is either a plain function or an object exposing
 * a `run` method, forwarding `scenario` and `context` unchanged.
 *
 * @param {Function | { run: Function }} adapter
 * @param {unknown} scenario
 * @param {unknown} context
 * @returns {unknown} Whatever the adapter returns (not awaited here).
 */
function runAdapter(adapter, scenario, context) {
  if (typeof adapter === "function") {
    return adapter(scenario, context);
  }
  return adapter.run(scenario, context);
}
|
|
9885
|
+
/**
 * Throws when the given abort signal has already fired; otherwise does nothing.
 *
 * @param {{ aborted?: boolean, reason?: unknown } | null | undefined} signal
 * @throws {Error} The signal's own `reason` when it is an `Error`; otherwise a
 *   new `Error` whose message is the stringified reason, or
 *   "reference replay aborted" when the reason is falsy.
 */
function throwIfAborted(signal) {
  if (signal?.aborted) {
    const { reason } = signal;
    if (reason instanceof Error) throw reason;
    const message = reason ? String(reason) : "reference replay aborted";
    throw new Error(message);
  }
}
|
|
9890
|
+
/**
 * Reads a JSON Lines file and returns one parsed value per non-blank line.
 *
 * The file is read synchronously as UTF-8. Each line is trimmed before
 * parsing, so CRLF endings and surrounding whitespace are tolerated, and
 * blank lines are skipped.
 *
 * @param {string} path - Location of the JSONL file.
 * @returns {unknown[]} Parsed records in file order.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the
 *   file and the 1-based line number; the underlying parse error is attached
 *   as `cause`.
 */
function readJsonl(path) {
  const raw = readFileSync4(path, "utf8");
  const out = [];
  const lines = raw.split("\n");
  for (let index = 0; index < lines.length; index++) {
    const trimmed = lines[index].trim();
    if (!trimmed) continue;
    try {
      out.push(JSON.parse(trimmed));
    } catch (err) {
      // A bare JSON.parse error gives no hint which record is broken; keep the
      // SyntaxError type callers may already catch, but say where it happened.
      const detail = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`${path}:${index + 1}: invalid JSONL record: ${detail}`, { cause: err });
    }
  }
  return out;
}
|
|
9900
|
+
// Words that `tokens` discards before token-overlap scoring.
var STOP_WORDS = /* @__PURE__ */ new Set([
  // articles, conjunctions, demonstratives
  "the", "and", "that", "this",
  // prepositions
  "for", "with", "from", "into", "onto",
  // auxiliaries and modals
  "are", "can", "will", "should", "could", "would",
  // wh-words
  "when", "where", "which"
]);
|
|
9920
|
+
|
|
9921
|
+
// src/reference-replay-steering.ts
|
|
9922
|
+
/**
 * Flattens reference-replay runs into one steering row per (run, case) pair.
 *
 * @param {Iterable<object>} runs - Runs, each with `id`, optional `variantId`,
 *   optional `metadata`, and an iterable `cases`.
 * @param {object} [options]
 * @param {(run: object) => object | null | undefined} [options.bundleForRun] -
 *   Supplies the row bundle; when absent or nullish the bundle is
 *   `{ id: variantId, metadata: run.metadata }`.
 * @param {(caseRun: object, run: object) => object | null | undefined} [options.scoreForCase] -
 *   Supplies the row score; when absent or nullish the score is derived with
 *   `referenceReplayScenarioToRunScore(caseRun.score, caseRun.durationMs)`.
 * @returns {object[]} Rows shaped `{ variantId, scenarioId, bundle, score, metadata }`.
 */
function referenceReplayRunsToSteeringRows(runs, options = {}) {
  const steeringRows = [];
  for (const run of runs) {
    // Rows fall back to the run id when no explicit variant id is set.
    const variantId = run.variantId ?? run.id;
    const customBundle = options.bundleForRun?.(run);
    const bundle = customBundle ?? { id: variantId, metadata: run.metadata };
    for (const caseRun of run.cases) {
      const customScore = options.scoreForCase?.(caseRun, run);
      const score = customScore ?? referenceReplayScenarioToRunScore(caseRun.score, caseRun.durationMs);
      const outcome = caseRun.score;
      const caseMetadata = caseRun.metadata;
      const metadata = {
        runId: run.id,
        split: caseRun.split,
        task: caseMetadata?.task ?? caseMetadata?.repo ?? caseRun.caseId,
        referenceCount: caseRun.references.length,
        candidateCount: caseRun.candidates.length,
        matched: outcome.matched,
        total: outcome.total,
        falsePositives: outcome.falsePositives,
        precision: outcome.precision,
        recall: outcome.recall,
        f1: outcome.f1,
        error: caseRun.error,
        // Spread last: keys present in the case's own metadata override the derived ones above.
        ...(caseMetadata ?? {})
      };
      steeringRows.push({
        variantId,
        scenarioId: caseRun.caseId,
        bundle,
        score,
        metadata
      });
    }
  }
  return steeringRows;
}
|
|
9956
|
+
/**
 * Maps a reference-replay scenario score onto the run-score shape used for
 * steering rows.
 *
 * @param {{ matched: number, total: number, precision: number, recall: number, f1: number }} scenarioScore
 * @param {number} [durationMs=0] - Wall-clock duration in milliseconds.
 * @returns {object} Run score where `success`/`finalGate` carry f1,
 *   `goalProgress` carries recall, `repoGroundedness`/`toolUseQuality` carry
 *   precision, `driftPenalty` is `1 - precision`, `patchQuality` and `costUsd`
 *   are fixed at 0, `testReality` is 1 only when `total > 0`,
 *   `reviewerBlockers` is 1 only when `total > 0` and nothing matched, and
 *   `wallSeconds` is the duration in seconds floored at 0.
 */
function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
  const { f1: f1Score, recall, precision, matched, total } = scenarioScore;
  const hasReferences = total > 0;
  // A case with references but zero matches counts as a blocker.
  const missedEverything = hasReferences && matched === 0;
  const summaryNote = `reference-replay matched ${matched}/${total}`;
  const metricsNote = `precision=${precision.toFixed(3)} recall=${recall.toFixed(3)} f1=${f1Score.toFixed(3)}`;
  return {
    success: f1Score,
    goalProgress: recall,
    repoGroundedness: precision,
    driftPenalty: 1 - precision,
    toolUseQuality: precision,
    patchQuality: 0,
    testReality: hasReferences ? 1 : 0,
    finalGate: f1Score,
    reviewerBlockers: missedEverything ? 1 : 0,
    costUsd: 0,
    wallSeconds: Math.max(0, durationMs / 1e3),
    notes: [summaryNote, metricsNote]
  };
}
|
|
8010
9979
|
export {
|
|
8011
9980
|
AgentDriver,
|
|
8012
9981
|
AxGepaSteeringOptimizer,
|
|
@@ -8017,6 +9986,7 @@ export {
|
|
|
8017
9986
|
ConvergenceTracker,
|
|
8018
9987
|
CostTracker,
|
|
8019
9988
|
DEFAULT_AGENT_SLOS,
|
|
9989
|
+
DEFAULT_COMPLEXITY_WEIGHTS,
|
|
8020
9990
|
DEFAULT_RULES as DEFAULT_FAILURE_RULES,
|
|
8021
9991
|
DEFAULT_FINDERS,
|
|
8022
9992
|
DEFAULT_HARNESS_OBJECTIVES,
|
|
@@ -8027,19 +9997,24 @@ export {
|
|
|
8027
9997
|
Dataset,
|
|
8028
9998
|
DockerSandboxDriver,
|
|
8029
9999
|
DualAgentBench,
|
|
10000
|
+
ERROR_COUNT_PATTERNS,
|
|
8030
10001
|
ExperimentTracker,
|
|
8031
10002
|
FAILURE_CLASSES,
|
|
8032
10003
|
FileSystemOutcomeStore,
|
|
8033
10004
|
FileSystemTraceStore,
|
|
8034
10005
|
HoldoutAuditor,
|
|
8035
10006
|
HoldoutLockedError,
|
|
10007
|
+
INTENT_MATCH_JUDGE_VERSION,
|
|
8036
10008
|
InMemoryExperimentStore,
|
|
8037
10009
|
InMemoryOutcomeStore,
|
|
8038
10010
|
InMemoryTraceStore,
|
|
8039
10011
|
InMemoryWorkspaceInspector,
|
|
8040
10012
|
JudgeRunner,
|
|
10013
|
+
LlmCallError,
|
|
10014
|
+
LlmClient,
|
|
8041
10015
|
MODEL_PRICING,
|
|
8042
10016
|
MetricsCollector,
|
|
10017
|
+
MultiLayerVerifier,
|
|
8043
10018
|
OTEL_AGENT_EVAL_SCOPE,
|
|
8044
10019
|
OptimizationLoop,
|
|
8045
10020
|
PairwiseSteeringOptimizer,
|
|
@@ -8050,6 +10025,7 @@ export {
|
|
|
8050
10025
|
PromptRegistry,
|
|
8051
10026
|
REDACTION_VERSION,
|
|
8052
10027
|
RunCritic,
|
|
10028
|
+
SEMANTIC_CONCEPT_JUDGE_VERSION,
|
|
8053
10029
|
SandboxHarness,
|
|
8054
10030
|
ScenarioRegistry,
|
|
8055
10031
|
SubprocessSandboxDriver,
|
|
@@ -8068,10 +10044,13 @@ export {
|
|
|
8068
10044
|
bisect,
|
|
8069
10045
|
bonferroni,
|
|
8070
10046
|
budgetBreachView,
|
|
10047
|
+
buildReviewerPrompt,
|
|
8071
10048
|
buildTrajectory,
|
|
8072
10049
|
byteLengthRange,
|
|
8073
10050
|
calibrateJudge,
|
|
8074
10051
|
calibrationCurve,
|
|
10052
|
+
callLlm,
|
|
10053
|
+
callLlmJson,
|
|
8075
10054
|
canaryLeakView,
|
|
8076
10055
|
causalAttribution,
|
|
8077
10056
|
checkCanaries,
|
|
@@ -8084,6 +10063,7 @@ export {
|
|
|
8084
10063
|
coherenceJudge,
|
|
8085
10064
|
collectionPreserved,
|
|
8086
10065
|
commitBisect,
|
|
10066
|
+
compareReferenceReplay,
|
|
8087
10067
|
compareToBaseline,
|
|
8088
10068
|
compilerJudge,
|
|
8089
10069
|
composeParsers,
|
|
@@ -8095,10 +10075,17 @@ export {
|
|
|
8095
10075
|
correlationStudy,
|
|
8096
10076
|
createAntiSlopJudge,
|
|
8097
10077
|
createCustomJudge,
|
|
10078
|
+
createDefaultReviewer,
|
|
8098
10079
|
createDomainExpertJudge,
|
|
10080
|
+
createIntentMatchJudge,
|
|
8099
10081
|
createLlmReviewer,
|
|
10082
|
+
createSemanticConceptJudge,
|
|
8100
10083
|
crossTraceDiff,
|
|
10084
|
+
decideReferenceReplayPromotion,
|
|
10085
|
+
decideReferenceReplayRunPromotion,
|
|
8101
10086
|
defaultJudges,
|
|
10087
|
+
defaultReferenceReplayMatcher,
|
|
10088
|
+
deployGateLayer,
|
|
8102
10089
|
distillPlaybook,
|
|
8103
10090
|
dominates,
|
|
8104
10091
|
estimateCost,
|
|
@@ -8112,6 +10099,8 @@ export {
|
|
|
8112
10099
|
exportRewardModel,
|
|
8113
10100
|
exportRunAsOtlp,
|
|
8114
10101
|
exportTrainingData,
|
|
10102
|
+
extractAssetUrls,
|
|
10103
|
+
extractErrorCount,
|
|
8115
10104
|
failureClusterView,
|
|
8116
10105
|
fileContains,
|
|
8117
10106
|
fileExists,
|
|
@@ -8121,12 +10110,16 @@ export {
|
|
|
8121
10110
|
findLiteralTruePass,
|
|
8122
10111
|
findSkipCountsAsPass,
|
|
8123
10112
|
firstDivergenceView,
|
|
10113
|
+
flowLayer,
|
|
8124
10114
|
formatBenchmarkReport,
|
|
8125
10115
|
formatDriverReport,
|
|
8126
10116
|
formatFindings,
|
|
10117
|
+
gradeSemanticStatus,
|
|
8127
10118
|
groupBy,
|
|
8128
10119
|
hashContent,
|
|
8129
10120
|
hashScenarios,
|
|
10121
|
+
htmlContainsElement,
|
|
10122
|
+
inMemoryReferenceReplayStore,
|
|
8130
10123
|
inMemoryReviewStore,
|
|
8131
10124
|
interRaterReliability,
|
|
8132
10125
|
iqr,
|
|
@@ -8139,6 +10132,7 @@ export {
|
|
|
8139
10132
|
jestTestParser,
|
|
8140
10133
|
jsonHasKeys,
|
|
8141
10134
|
jsonShape,
|
|
10135
|
+
jsonlReferenceReplayStore,
|
|
8142
10136
|
jsonlReviewStore,
|
|
8143
10137
|
judgeAgreementView,
|
|
8144
10138
|
judgeSpans,
|
|
@@ -8147,9 +10141,12 @@ export {
|
|
|
8147
10141
|
llmSpanFromProvider,
|
|
8148
10142
|
llmSpans,
|
|
8149
10143
|
loadScorerFromGrader,
|
|
10144
|
+
localCommandRunner,
|
|
8150
10145
|
lowercaseMutator,
|
|
8151
10146
|
mannWhitneyU,
|
|
10147
|
+
mergeLayerResults,
|
|
8152
10148
|
mergeSteeringBundle,
|
|
10149
|
+
multiToolchainLayer,
|
|
8153
10150
|
nistAiRmfReport,
|
|
8154
10151
|
nonRefusalRubric,
|
|
8155
10152
|
normalizeScores,
|
|
@@ -8165,6 +10162,7 @@ export {
|
|
|
8165
10162
|
printDriverSummary,
|
|
8166
10163
|
prmBestOfN,
|
|
8167
10164
|
prmEnsembleBestOfN,
|
|
10165
|
+
probeLlm,
|
|
8168
10166
|
promptBisect,
|
|
8169
10167
|
proposeSynthesisTargets,
|
|
8170
10168
|
pytestTestParser,
|
|
@@ -8172,6 +10170,8 @@ export {
|
|
|
8172
10170
|
redTeamReport,
|
|
8173
10171
|
redactString,
|
|
8174
10172
|
redactValue,
|
|
10173
|
+
referenceReplayRunsToSteeringRows,
|
|
10174
|
+
referenceReplayScenarioToRunScore,
|
|
8175
10175
|
regexMatch,
|
|
8176
10176
|
regexMatches,
|
|
8177
10177
|
regressionView,
|
|
@@ -8191,9 +10191,14 @@ export {
|
|
|
8191
10191
|
runExpectations,
|
|
8192
10192
|
runFailureClass,
|
|
8193
10193
|
runHarnessExperiment,
|
|
10194
|
+
runIntentMatchJudge,
|
|
8194
10195
|
runJudgeFleet,
|
|
10196
|
+
runKeywordCoverageJudge,
|
|
10197
|
+
runKeywordCoverageJudgeUrl,
|
|
8195
10198
|
runProposeReview,
|
|
10199
|
+
runReferenceReplay,
|
|
8196
10200
|
runSelfPlay,
|
|
10201
|
+
runSemanticConceptJudge,
|
|
8197
10202
|
runTestGradedScenario,
|
|
8198
10203
|
runsForScenario,
|
|
8199
10204
|
scanForMuffledGates,
|
|
@@ -8201,6 +10206,7 @@ export {
|
|
|
8201
10206
|
scoreContinuity,
|
|
8202
10207
|
scoreProject,
|
|
8203
10208
|
scoreRedTeamOutput,
|
|
10209
|
+
scoreReferenceReplay,
|
|
8204
10210
|
securityJudge,
|
|
8205
10211
|
selectHarnessVariant,
|
|
8206
10212
|
selfPreference,
|
|
@@ -8208,6 +10214,7 @@ export {
|
|
|
8208
10214
|
signManifest,
|
|
8209
10215
|
soc2Report,
|
|
8210
10216
|
statusAdvanced,
|
|
10217
|
+
stripFencedJson,
|
|
8211
10218
|
stuckLoopView,
|
|
8212
10219
|
summarize,
|
|
8213
10220
|
summarizeHarnessResults,
|
|
@@ -8227,6 +10234,7 @@ export {
|
|
|
8227
10234
|
verbosityBias,
|
|
8228
10235
|
verifyManifest,
|
|
8229
10236
|
visualDiff,
|
|
10237
|
+
viteDeployRunner,
|
|
8230
10238
|
vitestTestParser,
|
|
8231
10239
|
weightedMean,
|
|
8232
10240
|
welchsTTest,
|