npm - @tangle-network/agent-eval - Versions diffs - 0.33.1 → 0.34.1 - Mend

@tangle-network/agent-eval 0.33.1 → 0.34.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/CHANGELOG.md +33 -0
package/dist/benchmarks/index.d.ts +2 -2
package/dist/{chunk-FT3IAMQR.js → chunk-3HYQXPC2.js} +2 -2
package/dist/{chunk-WRGHMGWT.js → chunk-7PR3WPWE.js} +2 -2
package/dist/{chunk-SQYRO3BT.js → chunk-RL6TERL2.js} +2 -2
package/dist/{chunk-DCZXFOQN.js → chunk-TSPOEDM3.js} +56 -1
package/dist/chunk-TSPOEDM3.js.map +1 -0
package/dist/{control-C3k02SCP.d.ts → control-DVrmvM_k.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/control.js +2 -2
package/dist/{index-ClMxVqe_.d.ts → index-0pu_fBwZ.d.ts} +1 -1
package/dist/index.d.ts +271 -11
package/dist/index.js +487 -92
package/dist/index.js.map +1 -1
package/dist/meta-eval/index.d.ts +2 -2
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +3 -3
package/dist/optimization.js +3 -3
package/dist/{release-report-ChfmCmLi.d.ts → release-report-D2ykiLSe.d.ts} +2 -2
package/dist/reporting.d.ts +4 -4
package/dist/{researcher-CfnL3HEb.d.ts → researcher-DeZ_EArp.d.ts} +2 -2
package/dist/rl.d.ts +5 -5
package/dist/rl.js +2 -2
package/dist/{rubric-predictive-validity-BvaNwfBE.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
package/dist/{run-record-YinVdFwu.d.ts → run-record-BGY6bHRh.d.ts} +37 -1
package/dist/{summary-report-BPJVzIeW.d.ts → summary-report-DuZXOk7K.d.ts} +1 -1
package/package.json +12 -22
package/dist/chunk-DCZXFOQN.js.map +0 -1
/package/dist/{chunk-FT3IAMQR.js.map → chunk-3HYQXPC2.js.map} +0 -0
/package/dist/{chunk-WRGHMGWT.js.map → chunk-7PR3WPWE.js.map} +0 -0
/package/dist/{chunk-SQYRO3BT.js.map → chunk-RL6TERL2.js.map} +0 -0

package/dist/index.js CHANGED Viewed

@@ -54,7 +54,7 @@ import {
   runProposeReview,
   runProposeReviewAsControlLoop,
   scoreFromEvals
-} from "./chunk-WRGHMGWT.js";
+} from "./chunk-7PR3WPWE.js";
 import {
   allCriticalPassed,
   objectiveEval,
@@ -96,7 +96,7 @@ import {
   summarizePreferenceMemory,
   trialTraceFromMultiShotTrial,
   withAssignedFeedbackSplit
-} from "./chunk-FT3IAMQR.js";
+} from "./chunk-3HYQXPC2.js";
 import {
   assertReleaseConfidence,
   bootstrapCi,
@@ -107,7 +107,7 @@ import {
 } from "./chunk-LGAPK7NA.js";
 import {
   runEvalCampaign
-} from "./chunk-SQYRO3BT.js";
+} from "./chunk-RL6TERL2.js";
 import {
   LlmCallError,
   LlmClient,
@@ -121,21 +121,24 @@ import {
   stripFencedJson
 } from "./chunk-VXNVVBZO.js";
 import {
+  AGENT_PROFILE_KINDS,
   AgentProfileCellValidationError,
   RunRecordValidationError,
   agentProfileCellHashMaterial,
   agentProfileCellKey,
   assertRunAgentProfileCell,
   buildAgentProfileCell,
+  buildSandboxAgentProfileCell,
   groupRunsByAgentProfileCell,
   isRunRecord,
   parseRunRecordSafe,
   requireAgentProfileCell,
   roundTripRunRecord,
+  toAgentProfileJson,
   validateAgentProfileCell,
   validateRunRecord,
   verifyAgentProfileCell
-} from "./chunk-DCZXFOQN.js";
+} from "./chunk-TSPOEDM3.js";
 import {
   evaluateInterimReleaseConfidence,
   pairedEvalueSequence
@@ -333,7 +336,7 @@ var RunCritic = class {
     );
     const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
     if (!success) notes.push("run did not complete with pass=true");
-    const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
+    const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum3, span) => sum3 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
     const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
       trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
     ) : void 0;
@@ -348,7 +351,7 @@ var RunCritic = class {
       (span) => typeof span.testsTotal === "number" && span.testsTotal > 0
     );
     const testReality = sandboxTests.length ? sandboxTests.reduce(
-      (sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
+      (sum3, span) => sum3 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
       0
     ) / sandboxTests.length : toolSpans2.some(
       (span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
@@ -370,7 +373,7 @@ var RunCritic = class {
     const costUsd = trace.budget.length ? Math.max(
       ...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
       0
-    ) : llmSpans2.reduce((sum2, span) => sum2 + (span.costUsd ?? 0), 0);
+    ) : llmSpans2.reduce((sum3, span) => sum3 + (span.costUsd ?? 0), 0);
     const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
     return {
       success,
@@ -1960,12 +1963,12 @@ function allocateBudget(policy, args) {
   return policy.totalUsd / Math.max(1, args.runningCount);
 }
 function sumFindingCost(findings) {
-  let sum2 = 0;
+  let sum3 = 0;
   for (const f of findings) {
     const c = f.metadata?.cost_usd;
-    if (typeof c === "number" && Number.isFinite(c)) sum2 += c;
+    if (typeof c === "number" && Number.isFinite(c)) sum3 += c;
   }
-  return sum2;
+  return sum3;
 }
 function selectPriorFindings(source, analystId) {
   if (!source) return void 0;
@@ -2184,10 +2187,10 @@ function ghCliClient(opts = {}) {
       await exec("git", ["branch", "-D", input.branchName], { cwd });
       await run("git", ["checkout", "-b", input.branchName]);
       const { mkdir, writeFile } = await import("fs/promises");
-      const { dirname: dirname5, join: join4, resolve } = await import("path");
+      const { dirname: dirname6, join: join4, resolve } = await import("path");
       for (const change of input.fileChanges) {
         const abs = resolve(cwd, change.path);
-        await mkdir(dirname5(abs), { recursive: true });
+        await mkdir(dirname6(abs), { recursive: true });
         await writeFile(abs, change.contents, "utf8");
         await run("git", ["add", join4(change.path)]);
       }
@@ -3722,6 +3725,178 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
   };
 }
+// src/pr-review-benchmark.ts
+/**
+ * Default weight of each PR-review score dimension. aggregatePrReviewScore
+ * merges caller overrides on top of these and divides by the weight sum,
+ * so only the ratios between the weights matter.
+ */
+var DEFAULT_PR_REVIEW_SCORE_WEIGHTS = {
+  recall: 4,
+  precision: 2,
+  actionability: 1.5,
+  severityCalibration: 1,
+  lowNoise: 1
+};
+/**
+ * Select the comments of an audit case that were written by one source.
+ *
+ * @param auditCase - case whose `comments` array is searched
+ * @param source - source identifier compared with `comment.source` (strict equality)
+ * @returns a new array holding only the comments from `source`
+ */
+function commentsForSource(auditCase, source) {
+  const isFromSource = (candidate) => candidate.source === source;
+  return auditCase.comments.filter(isFromSource);
+}
+/**
+ * Score every comment one source left on an audit case.
+ *
+ * @param auditCase - case providing comments and reference findings
+ * @param source - source whose comments are scored
+ * @param weights - optional per-dimension weight overrides
+ * @returns the score object produced by scorePrReviewComments
+ */
+function scorePrReviewSource(auditCase, source, weights = {}) {
+  const sourceComments = commentsForSource(auditCase, source);
+  return scorePrReviewComments(auditCase, sourceComments, source, weights);
+}
+/**
+ * Score a set of review comments against an audit case's reference
+ * findings. Produces five dimension scores (recall, precision,
+ * actionability, severityCalibration, lowNoise), their weighted
+ * aggregate, the matched findings and human-readable notes.
+ *
+ * @param auditCase - case providing `id` and `referenceFindings`
+ * @param comments - comments to score
+ * @param source - label copied into the result
+ * @param weights - optional per-dimension weight overrides
+ */
+function scorePrReviewComments(auditCase, comments, source, weights = {}) {
+  const matchedFindings = matchReferenceFindings(auditCase.referenceFindings, comments);
+  const matchedCommentIds = new Set(matchedFindings.map((match) => match.commentId));
+  const positiveComments = comments.filter((comment) => isPositiveOutcome(comment.outcome));
+  const negativeComments = comments.filter((comment) => isNegativeOutcome(comment.outcome));
+  const actionableComments = comments.filter(isActionableComment);
+  const severityComments = comments.filter((comment) => comment.severity);
+  const severityAligned = severityComments.filter(
+    (comment) => isSeverityAligned(comment, auditCase.referenceFindings, matchedFindings)
+  );
+  // Recall: share of reference findings matched. With no references, silence scores 1 and any comment scores 0.
+  const recall = auditCase.referenceFindings.length ? matchedFindings.length / auditCase.referenceFindings.length : comments.length === 0 ? 1 : 0;
+  const precisionDenominator = positiveComments.length + negativeComments.length;
+  // Precision: labelled outcomes win when present; otherwise fall back to the share of comments that matched a reference;
+  // with no comments at all it is 1 only when there was nothing to find.
+  const precision2 = precisionDenominator > 0 ? positiveComments.length / precisionDenominator : comments.length > 0 ? matchedCommentIds.size / comments.length : auditCase.referenceFindings.length === 0 ? 1 : 0;
+  // Actionability and low-noise default to 1 when there are no comments.
+  const actionability = comments.length ? actionableComments.length / comments.length : 1;
+  // Severity calibration: when no comment carries a severity, score 0.5 if something matched and 1 otherwise.
+  const severityCalibration = severityComments.length ? severityAligned.length / severityComments.length : matchedFindings.length ? 0.5 : 1;
+  const lowNoise = comments.length ? 1 - negativeComments.length / comments.length : 1;
+  const aggregate2 = aggregatePrReviewScore(
+    { recall, precision: precision2, actionability, severityCalibration, lowNoise },
+    weights
+  );
+  return {
+    caseId: auditCase.id,
+    source,
+    commentCount: comments.length,
+    referenceCount: auditCase.referenceFindings.length,
+    matchedFindings,
+    recall,
+    precision: precision2,
+    actionability,
+    severityCalibration,
+    lowNoise,
+    aggregate: aggregate2,
+    notes: buildScoreNotes({
+      comments,
+      referenceCount: auditCase.referenceFindings.length,
+      matchedFindings,
+      negativeComments,
+      actionableComments
+    })
+  };
+}
+/**
+ * Roll per-case scores up into one summary row per source, ordered by
+ * descending mean aggregate.
+ *
+ * @param scores - score objects as returned by scorePrReviewComments
+ * @returns one row per distinct `score.source` with case/comment counts
+ *   and the mean of every score dimension
+ */
+function summarizePrReviewBenchmark(scores) {
+  const bySource = /* @__PURE__ */ new Map();
+  for (const score of scores) {
+    // Append in place; rebuilding the bucket array per score is quadratic.
+    const bucket = bySource.get(score.source);
+    if (bucket) bucket.push(score);
+    else bySource.set(score.source, [score]);
+  }
+  return [...bySource.entries()].map(([source, sourceScores]) => ({
+    source,
+    caseCount: sourceScores.length,
+    commentCount: sum(sourceScores.map((score) => score.commentCount)),
+    aggregateMean: mean(sourceScores.map((score) => score.aggregate)),
+    recallMean: mean(sourceScores.map((score) => score.recall)),
+    precisionMean: mean(sourceScores.map((score) => score.precision)),
+    actionabilityMean: mean(sourceScores.map((score) => score.actionability)),
+    severityCalibrationMean: mean(sourceScores.map((score) => score.severityCalibration)),
+    lowNoiseMean: mean(sourceScores.map((score) => score.lowNoise))
+  })).sort((a, b) => b.aggregateMean - a.aggregateMean);
+}
+/**
+ * Weighted mean of the five PR-review dimensions.
+ *
+ * Each weight is taken from `weights` when provided, otherwise from
+ * DEFAULT_PR_REVIEW_SCORE_WEIGHTS. The same sanitized weight (negative or
+ * non-finite values count as 0) is used in both numerator and
+ * denominator, and only the known dimension keys take part, so the
+ * result stays a weighted mean even for odd overrides.
+ *
+ * @param dimensions - object with recall, precision, actionability,
+ *   severityCalibration and lowNoise; each is passed through clamp01
+ * @param weights - optional per-dimension overrides
+ * @returns the weighted mean, or 0 when every weight is 0
+ */
+function aggregatePrReviewScore(dimensions, weights = {}) {
+  let weightSum = 0;
+  let weightedTotal = 0;
+  for (const key of Object.keys(DEFAULT_PR_REVIEW_SCORE_WEIGHTS)) {
+    const requested = weights?.[key] ?? DEFAULT_PR_REVIEW_SCORE_WEIGHTS[key];
+    const weight = Number.isFinite(requested) ? Math.max(0, requested) : 0;
+    weightSum += weight;
+    weightedTotal += weight * clamp01(dimensions[key]);
+  }
+  if (weightSum <= 0) return 0;
+  return weightedTotal / weightSum;
+}
+/**
+ * Greedily pair each reference finding with its best-scoring unused
+ * comment. References are handled in input order; a comment is used at
+ * most once; a pairing needs a match score of at least 0.55. On equal
+ * scores the earlier comment wins.
+ *
+ * @returns array of `{ referenceId, commentId, score }`
+ */
+function matchReferenceFindings(references, comments) {
+  const pairs = [];
+  const claimed = /* @__PURE__ */ new Set();
+  for (const reference of references) {
+    let winner;
+    for (const comment of comments) {
+      if (claimed.has(comment.id)) continue;
+      const score = matchScore(reference, comment);
+      if (!(score >= 0.55)) continue;
+      if (winner === void 0 || score > winner.score) winner = { comment, score };
+    }
+    if (winner === void 0) continue;
+    claimed.add(winner.comment.id);
+    pairs.push({ referenceId: reference.id, commentId: winner.comment.id, score: winner.score });
+  }
+  return pairs;
+}
+/**
+ * Heuristic similarity between a reference finding and a comment,
+ * clamped by clamp01. Contributions: +1 when the reference lists the
+ * comment id, +0.35 for the same normalized path, +0.15 when both lines
+ * are set and at most 3 apart, and up to +0.5 for the share of reference
+ * terms (keywords plus title tokens, length >= 3) found in the body.
+ */
+function matchScore(reference, comment) {
+  let total = 0;
+  if (reference.sourceCommentIds?.includes(comment.id)) total += 1;
+  const samePath = reference.path && comment.path && normalizePath(reference.path) === normalizePath(comment.path);
+  if (samePath) total += 0.35;
+  const nearbyLine = reference.line && comment.line && Math.abs(reference.line - comment.line) <= 3;
+  if (nearbyLine) total += 0.15;
+  const wanted = /* @__PURE__ */ new Set();
+  for (const rawTerm of [...reference.keywords ?? [], ...tokenize(reference.title)]) {
+    const term = normalizeTerm(rawTerm);
+    if (term.length >= 3) wanted.add(term);
+  }
+  if (wanted.size > 0) {
+    const bodyTerms = new Set(tokenize(comment.body).map(normalizeTerm));
+    let overlap = 0;
+    for (const term of wanted) {
+      if (bodyTerms.has(term)) overlap += 1;
+    }
+    total += 0.5 * (overlap / wanted.size);
+  }
+  return clamp01(total);
+}
+/**
+ * A comment counts as actionable when it is anchored to code (it has a
+ * `path`, or its body names a file/line/function/etc.) AND its body
+ * contains one of the action verbs listed below.
+ */
+function isActionableComment(comment) {
+  const text = comment.body.trim();
+  const isAnchored = Boolean(comment.path) || /\b(file|line|function|method|class|module|test|migration)\b/i.test(text);
+  if (!isAnchored) return false;
+  const actionVerb = /\b(fix|change|add|remove|guard|check|reject|validate|test|assert|return|throw|fail|block)\b/i;
+  return actionVerb.test(text);
+}
+/**
+ * Decide whether a comment's severity is plausible. Unmatched comments
+ * are aligned only when rated "nit" or "low"; matched comments must be
+ * within one severity rank of the reference they were matched to.
+ * Comments without a severity are never aligned.
+ */
+function isSeverityAligned(comment, references, matches) {
+  const claimed = comment.severity;
+  if (!claimed) return false;
+  const pairing = matches.find((candidate) => candidate.commentId === comment.id);
+  if (!pairing) return ["nit", "low"].includes(claimed);
+  const target = references.find((candidate) => candidate.id === pairing.referenceId);
+  if (!target) return false;
+  const rankGap = severityRank(claimed) - severityRank(target.severity);
+  return Math.abs(rankGap) <= 1;
+}
+/**
+ * Build the human-readable notes attached to a score: missed references,
+ * count of negatively-labelled comments, and a warning when no comment
+ * was actionable.
+ */
+function buildScoreNotes(input) {
+  const notes = [];
+  const missedEverything = input.referenceCount > 0 && input.matchedFindings.length === 0;
+  if (missedEverything) notes.push("no reference findings matched");
+  const negativeCount = input.negativeComments.length;
+  if (negativeCount > 0) {
+    notes.push(`${negativeCount} comment(s) labelled rejected/duplicate/noise`);
+  }
+  const nothingActionable = input.comments.length > 0 && input.actionableComments.length === 0;
+  if (nothingActionable) {
+    notes.push("comments were not actionable enough for a PR reviewer benchmark");
+  }
+  return notes;
+}
+/** True when the outcome label is "accepted" or "fixed". */
+function isPositiveOutcome(outcome) {
+  return ["accepted", "fixed"].includes(outcome);
+}
+/** True when the outcome label is "rejected", "duplicate" or "noise". */
+function isNegativeOutcome(outcome) {
+  return ["rejected", "duplicate", "noise"].includes(outcome);
+}
+/**
+ * Map a severity label to an ordinal from 1 ("nit") to 5 ("critical").
+ * Unknown labels yield undefined, exactly as falling out of a switch would.
+ */
+function severityRank(severity) {
+  const ranks = /* @__PURE__ */ new Map([
+    ["critical", 5],
+    ["high", 4],
+    ["medium", 3],
+    ["low", 2],
+    ["nit", 1]
+  ]);
+  return ranks.get(severity);
+}
+/**
+ * Split text into runs of letters, digits and the characters _ . $ / -.
+ * Returns an empty array when nothing matches.
+ */
+function tokenize(input) {
+  const tokens = input.match(/[a-zA-Z0-9_.$/-]+/g);
+  if (tokens == null) return [];
+  return tokens;
+}
+/**
+ * Lower-case a term and strip leading/trailing characters that are not
+ * a-z, 0-9 or underscore.
+ */
+function normalizeTerm(input) {
+  const lowered = input.toLowerCase();
+  return lowered.replace(/^[^a-z0-9_]+|[^a-z0-9_]+$/g, "");
+}
+/** Drop a leading "./" (with any number of extra slashes) from a path. */
+function normalizePath(input) {
+  const leadingDotSlash = /^\.\/+/;
+  return input.replace(leadingDotSlash, "");
+}
+/** Arithmetic mean of `values`; an empty list yields 0. */
+function mean(values) {
+  if (!values.length) return 0;
+  return sum(values) / values.length;
+}
+/** Add up `values`, starting from 0. */
+function sum(values) {
+  let running = 0;
+  values.forEach((value) => {
+    running = running + value;
+  });
+  return running;
+}
 // src/production-loop.ts
 async function runProductionLoop(opts) {
   validate2(opts);
@@ -5217,14 +5392,14 @@ async function runHarnessExperiment(config) {
   const score = config.score ?? ((trace) => critic.scoreTrace(trace));
   const results = await mapLimit(jobs, config.parallelism ?? 1, async (request) => {
     const trace = await config.adapter.run(request);
-    const runScore = await score(trace, request);
+    const runScore2 = await score(trace, request);
     const result = {
       variant: request.variant,
       scenario: request.scenario,
       trialIndex: request.trialIndex,
       trace,
-      score: runScore,
-      aggregate: aggregateRunScore(runScore, config.weights)
+      score: runScore2,
+      aggregate: aggregateRunScore(runScore2, config.weights)
     };
     await config.onResult?.(result);
     return result;
@@ -5251,10 +5426,10 @@ function summarizeHarnessResults(results) {
     return {
       variant,
       runs,
-      aggregateMean: mean(runs.map((r) => r.aggregate)),
-      passRate: mean(runs.map((r) => r.score.success)),
-      costUsdMean: mean(runs.map((r) => r.score.costUsd)),
-      wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)),
+      aggregateMean: mean2(runs.map((r) => r.aggregate)),
+      passRate: mean2(runs.map((r) => r.score.success)),
+      costUsdMean: mean2(runs.map((r) => r.score.costUsd)),
+      wallSecondsMean: mean2(runs.map((r) => r.score.wallSeconds)),
       scoreMean: meanRunScore(runs.map((r) => r.score))
     };
   }).sort((a, b) => b.aggregateMean - a.aggregateMean);
@@ -5291,22 +5466,22 @@ async function mapLimit(items, limit, fn) {
   );
   return results;
 }
-function mean(values) {
-  return values.length ? values.reduce((sum2, value) => sum2 + value, 0) / values.length : 0;
+function mean2(values) {
+  return values.length ? values.reduce((sum3, value) => sum3 + value, 0) / values.length : 0;
 }
 function meanRunScore(scores) {
   return {
-    success: mean(scores.map((s) => s.success)),
-    goalProgress: mean(scores.map((s) => s.goalProgress)),
-    repoGroundedness: mean(scores.map((s) => s.repoGroundedness)),
-    driftPenalty: mean(scores.map((s) => s.driftPenalty)),
-    toolUseQuality: mean(scores.map((s) => s.toolUseQuality)),
-    patchQuality: mean(scores.map((s) => s.patchQuality)),
-    testReality: mean(scores.map((s) => s.testReality)),
-    finalGate: mean(scores.map((s) => s.finalGate)),
-    reviewerBlockers: mean(scores.map((s) => s.reviewerBlockers)),
-    costUsd: mean(scores.map((s) => s.costUsd)),
-    wallSeconds: mean(scores.map((s) => s.wallSeconds)),
+    success: mean2(scores.map((s) => s.success)),
+    goalProgress: mean2(scores.map((s) => s.goalProgress)),
+    repoGroundedness: mean2(scores.map((s) => s.repoGroundedness)),
+    driftPenalty: mean2(scores.map((s) => s.driftPenalty)),
+    toolUseQuality: mean2(scores.map((s) => s.toolUseQuality)),
+    patchQuality: mean2(scores.map((s) => s.patchQuality)),
+    testReality: mean2(scores.map((s) => s.testReality)),
+    finalGate: mean2(scores.map((s) => s.finalGate)),
+    reviewerBlockers: mean2(scores.map((s) => s.reviewerBlockers)),
+    costUsd: mean2(scores.map((s) => s.costUsd)),
+    wallSeconds: mean2(scores.map((s) => s.wallSeconds)),
     notes: scores.flatMap((s) => s.notes ?? [])
   };
 }
@@ -5645,7 +5820,7 @@ function rankRows(rows, weights) {
   }
   return [...buckets.entries()].map(([variantId, values]) => ({
     variantId,
-    mean: values.reduce((sum2, value) => sum2 + value, 0) / values.length,
+    mean: values.reduce((sum3, value) => sum3 + value, 0) / values.length,
     runs: values.length
   })).sort((a, b) => b.mean - a.mean);
 }
@@ -5815,6 +5990,22 @@ var BudgetGuard = class {
   }
 };
+// src/agent-profile.ts
+import { createHash as createHash2 } from "crypto";
+/**
+ * SHA-256 hex digest over the behaviour-defining fields of a profile:
+ * trimmed model, sorted skills, promptVersion (null when absent), sorted
+ * tools and metadata (empty object when absent). The fields are passed
+ * through canonicalize and JSON.stringify before hashing.
+ *
+ * @throws ValidationError when `profile.model` is not a non-blank string
+ */
+function agentProfileHash(profile) {
+  const hasModel = typeof profile.model === "string" && profile.model.trim().length > 0;
+  if (!hasModel) {
+    throw new ValidationError(`AgentProfile "${profile.id}" has no model \u2014 cannot hash`);
+  }
+  const sortedCopy = (items) => [...items ?? []].sort();
+  const behaviour = {
+    model: profile.model.trim(),
+    skills: sortedCopy(profile.skills),
+    promptVersion: profile.promptVersion ?? null,
+    tools: sortedCopy(profile.tools),
+    metadata: profile.metadata ?? {}
+  };
+  const canonicalJson = JSON.stringify(canonicalize(behaviour));
+  const hasher = createHash2("sha256");
+  hasher.update(canonicalJson);
+  return hasher.digest("hex");
+}
 // src/cost-tracker.ts
 var CostTracker = class {
   byScenario = /* @__PURE__ */ new Map();
@@ -6221,6 +6412,194 @@ function isObject(v) {
   return typeof v === "object" && v !== null && !Array.isArray(v);
 }
+// src/scorecard.ts
+import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync3 } from "fs";
+import { dirname as dirname2 } from "path";
+/**
+ * Median of a numeric list without mutating it; 0 for an empty list,
+ * mean of the two middle values for an even count.
+ */
+function median(xs) {
+  const count = xs.length;
+  if (count === 0) return 0;
+  const ordered = Array.from(xs).sort((left, right) => left - right);
+  const upperMiddle = Math.floor(count / 2);
+  if (count % 2 !== 0) return ordered[upperMiddle];
+  return (ordered[upperMiddle - 1] + ordered[upperMiddle]) / 2;
+}
+/** Score of a run: the holdout score when set, otherwise the search score. */
+function runScore(run) {
+  const { outcome } = run;
+  const holdout = outcome.holdoutScore;
+  return holdout ?? outcome.searchScore;
+}
+/**
+ * Average each judge dimension (`outcome.judgeScores.perDimMean`) across
+ * runs, ignoring non-finite values. Returns undefined when no run
+ * contributed a finite value.
+ */
+function aggregatePerDimension(runs) {
+  const totals = /* @__PURE__ */ new Map();
+  const counts = /* @__PURE__ */ new Map();
+  for (const { outcome } of runs) {
+    const perDim = outcome.judgeScores?.perDimMean;
+    if (!perDim) continue;
+    for (const dim of Object.keys(perDim)) {
+      const value = perDim[dim];
+      if (!Number.isFinite(value)) continue;
+      totals.set(dim, (totals.get(dim) ?? 0) + value);
+      counts.set(dim, (counts.get(dim) ?? 0) + 1);
+    }
+  }
+  if (totals.size === 0) return void 0;
+  const averages = {};
+  for (const [dim, total] of totals) {
+    averages[dim] = total / counts.get(dim);
+  }
+  return averages;
+}
+/**
+ * Turn a batch of runs into scorecard lines, one per scenario.
+ *
+ * Runs without a `scenarioId` are ignored. Within a scenario only runs
+ * with a finite numeric score (see runScore) are kept; a scenario with
+ * no such run produces no line. Each line carries the profile hash, the
+ * model, the full profile and an entry holding the raw scores, their
+ * median as `composite`, the run ids and, when available, per-dimension
+ * means.
+ *
+ * @param runs - run records to summarize
+ * @param opts - `profile` (hashed via agentProfileHash), `commitSha`,
+ *   and optional `timestamp` (defaults to the current time in ISO form)
+ * @returns the scorecard lines; nothing is written to disk
+ */
+function recordRuns(runs, opts) {
+  const profileHash = agentProfileHash(opts.profile);
+  const timestamp = opts.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
+  const byScenario = /* @__PURE__ */ new Map();
+  for (const run of runs) {
+    const scenarioId = run.scenarioId;
+    if (!scenarioId) continue;
+    const bucket = byScenario.get(scenarioId);
+    if (bucket) bucket.push(run);
+    else byScenario.set(scenarioId, [run]);
+  }
+  const lines = [];
+  for (const [scenarioId, scenarioRuns] of byScenario) {
+    // Keep finite numbers only: NaN/Infinity would poison the median and
+    // JSON.stringify writes them out as null, corrupting the stored scores.
+    const scored = scenarioRuns.map((run) => ({ run, score: runScore(run) })).filter((s) => Number.isFinite(s.score));
+    if (scored.length === 0) continue;
+    const scores = scored.map((s) => s.score);
+    const entry = {
+      commitSha: opts.commitSha,
+      timestamp,
+      scores,
+      composite: median(scores),
+      runIds: scored.map((s) => s.run.runId)
+    };
+    const perDimension = aggregatePerDimension(scenarioRuns);
+    if (perDimension) entry.perDimension = perDimension;
+    lines.push({
+      scenarioId,
+      profileHash,
+      model: opts.profile.model,
+      profile: opts.profile,
+      entry
+    });
+  }
+  return lines;
+}
+/**
+ * Append scorecard lines to a JSONL log, one JSON document per line with
+ * a trailing newline. Creates the parent directory when needed; does
+ * nothing for an empty list.
+ */
+function appendScorecard(logPath, lines) {
+  if (lines.length === 0) return;
+  const parentDir = dirname2(logPath);
+  mkdirSync2(parentDir, { recursive: true });
+  const serialized = lines.map((line) => JSON.stringify(line)).join("\n");
+  appendFileSync2(logPath, `${serialized}\n`);
+}
+/**
+ * Build scorecard lines for `runs` and append them to the log at
+ * `logPath` in one step.
+ *
+ * @returns the lines that were built (and appended, if any)
+ */
+function recordRunsToScorecard(logPath, runs, opts) {
+  const recorded = recordRuns(runs, opts);
+  appendScorecard(logPath, recorded);
+  return recorded;
+}
+/**
+ * Read a JSONL scorecard log into per-(scenario, profile) cells.
+ *
+ * The reader is best-effort: blank lines, lines that are not valid JSON
+ * and lines missing scenarioId/profileHash/entry are skipped. Entries of
+ * a cell are ordered by timestamp; an entry without a string timestamp
+ * sorts first instead of aborting the whole load.
+ *
+ * @param logPath - path of the JSONL log
+ * @returns `{ cells, profiles }`; both empty when the file does not
+ *   exist. `profiles` maps profile hash to the last profile seen for it.
+ */
+function loadScorecard(logPath) {
+  if (!existsSync4(logPath)) return { cells: [], profiles: {} };
+  const cells = /* @__PURE__ */ new Map();
+  const profiles = {};
+  for (const raw of readFileSync3(logPath, "utf8").split("\n")) {
+    const line = raw.trim();
+    if (!line) continue;
+    let parsed;
+    try {
+      parsed = JSON.parse(line);
+    } catch {
+      // Deliberate: one corrupt line must not hide the rest of the log.
+      continue;
+    }
+    if (!parsed?.scenarioId || !parsed.profileHash || !parsed.entry) continue;
+    const key = `${parsed.scenarioId}::${parsed.profileHash}`;
+    let cell = cells.get(key);
+    if (!cell) {
+      cell = {
+        scenarioId: parsed.scenarioId,
+        profileHash: parsed.profileHash,
+        model: parsed.model,
+        timeline: []
+      };
+      cells.set(key, cell);
+    }
+    cell.timeline.push(parsed.entry);
+    if (parsed.profile) profiles[parsed.profileHash] = parsed.profile;
+  }
+  // A parseable line may still lack a usable timestamp; calling
+  // localeCompare on undefined would throw and lose every cell.
+  const timestampOf = (entry) => typeof entry?.timestamp === "string" ? entry.timestamp : "";
+  for (const cell of cells.values()) {
+    cell.timeline.sort((a, b) => timestampOf(a).localeCompare(timestampOf(b)));
+  }
+  return { cells: [...cells.values()], profiles };
+}
+/**
+ * Compare the latest entry of every scorecard cell with a baseline entry
+ * and classify the cell as "improved", "regressed", "flat" or "new".
+ *
+ * @param scorecard - object with `cells`, each holding a `timeline` of entries
+ * @param opts - optional thresholds: `minEffect` (default 0.5), `maxP`
+ *   (default 0.05), `minDelta` (default 0.05), and `baselineCommit`
+ * @returns `{ cells, summary }` where `summary` counts cells per verdict
+ */
+function diffScorecard(scorecard, opts = {}) {
+  const minEffect = opts.minEffect ?? 0.5;
+  const maxP = opts.maxP ?? 0.05;
+  const minDelta = opts.minDelta ?? 0.05;
+  const cells = [];
+  for (const cell of scorecard.cells) {
+    const timeline = cell.timeline;
+    if (timeline.length === 0) continue;
+    // The last timeline entry is treated as the current measurement.
+    const current = timeline[timeline.length - 1];
+    // Baseline: the last entry for opts.baselineCommit other than `current`, or else simply the previous entry.
+    const baseline = opts.baselineCommit ? [...timeline].reverse().find((e) => e.commitSha === opts.baselineCommit && e !== current) : timeline[timeline.length - 2];
+    const base = {
+      scenarioId: cell.scenarioId,
+      profileHash: cell.profileHash,
+      model: cell.model,
+      current: current.composite,
+      currentCommit: current.commitSha
+    };
+    if (!baseline) {
+      // Nothing to compare against: report the cell as new with null statistics.
+      cells.push({
+        ...base,
+        verdict: "new",
+        baseline: null,
+        delta: null,
+        cohensD: null,
+        pValue: null,
+        baselineCommit: null
+      });
+      continue;
+    }
+    const delta = current.composite - baseline.composite;
+    // Statistics are only computed when both sides have at least two scores.
+    const canStat = baseline.scores.length >= 2 && current.scores.length >= 2;
+    let d = null;
+    let p = null;
+    let verdict;
+    if (canStat) {
+      d = cohensD(baseline.scores, current.scores);
+      const t = welchsTTest(baseline.scores, current.scores);
+      p = Number.isFinite(t.p) ? t.p : null;
+      // A change counts only if both the effect size and the p-value clear their thresholds.
+      const significant = Math.abs(d) >= minEffect && p !== null && p <= maxP;
+      // Direction comes from the composite delta; NOTE(review): a significant result with delta === 0 is labelled "regressed".
+      verdict = significant ? delta > 0 ? "improved" : "regressed" : "flat";
+    } else {
+      // Too few samples for a test: fall back to a plain threshold on the composite delta.
+      verdict = Math.abs(delta) >= minDelta ? delta > 0 ? "improved" : "regressed" : "flat";
+    }
+    cells.push({
+      ...base,
+      verdict,
+      baseline: baseline.composite,
+      delta,
+      cohensD: d,
+      pValue: p,
+      baselineCommit: baseline.commitSha
+    });
+  }
+  const summary = { improved: 0, regressed: 0, flat: 0, new: 0 };
+  for (const cell of cells) summary[cell.verdict] += 1;
+  return { cells, summary };
+}
+/**
+ * Render a scorecard diff as plain text: a one-line summary followed by
+ * one line per regressed or improved cell. Regressions are listed first,
+ * and within each group larger absolute deltas come first.
+ */
+function formatScorecardDiff(diff) {
+  const threeDecimals = (n) => n.toFixed(3);
+  const counts = diff.summary;
+  const header = `Scorecard: ${counts.regressed} regressed \xB7 ${counts.improved} improved \xB7 ${counts.flat} flat \xB7 ${counts.new} new`;
+  const changed = diff.cells.filter((c) => c.verdict === "regressed" || c.verdict === "improved");
+  changed.sort((a, b) => {
+    if (a.verdict === b.verdict) return Math.abs(b.delta ?? 0) - Math.abs(a.delta ?? 0);
+    return a.verdict === "regressed" ? -1 : 1;
+  });
+  const describe = (cell) => {
+    const mark = cell.verdict === "regressed" ? "REGRESSED" : "improved";
+    let deltaStr = "\u2014";
+    if (cell.delta !== null) {
+      deltaStr = cell.delta >= 0 ? `+${threeDecimals(cell.delta)}` : threeDecimals(cell.delta);
+    }
+    let stat = "";
+    if (cell.cohensD !== null) {
+      const pPart = cell.pValue !== null ? `, p=${cell.pValue.toFixed(3)}` : "";
+      stat = ` (d=${cell.cohensD.toFixed(2)}${pPart})`;
+    }
+    return `  ${mark}  ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)}  ${threeDecimals(cell.baseline ?? 0)} \u2192 ${threeDecimals(cell.current)}  ${deltaStr}${stat}`;
+  };
+  const rows = [header];
+  for (const cell of changed) rows.push(describe(cell));
+  return rows.join("\n");
+}
 // src/series-convergence.ts
 function analyzeSeries(values, options = {}) {
   const window = options.window ?? 5;
@@ -6230,10 +6609,10 @@ function analyzeSeries(values, options = {}) {
     return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
   }
   const tail = values.slice(-window);
-  const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
-  const variance = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
+  const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
+  const variance = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
   const stdDev = Math.sqrt(variance);
-  const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
+  const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
   const cv = stdDev / refMean;
   const stable = tail.length >= window && cv <= stableCv;
   let tailRun = 0;
@@ -6254,7 +6633,7 @@ function analyzeSeries(values, options = {}) {
   } else {
     state = "noisy";
   }
-  return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
+  return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
 }
 // src/slo.ts
@@ -7052,12 +7431,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
     variantScores.push({ mutator: id, score, mutated });
     all.push(score);
   }
-  const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
-  const variance = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
+  const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
+  const variance = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
   const stdDev = Math.sqrt(variance);
-  const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
+  const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
   const robustness = Math.max(0, 1 - stdDev / ref);
-  return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
+  return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
 }
 var lowercaseMutator = (p) => p.toLowerCase();
 var sentenceReorderMutator = (p, seed) => {
@@ -7123,8 +7502,8 @@ async function paraphraseRobustnessScenarios(args) {
         });
         scores.push(out.score);
       }
-      const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
-      deltas[m.name] = mean4 - originalScore;
+      const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
+      deltas[m.name] = mean5 - originalScore;
       paraphrasedAll.push(...scores);
     }
     const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
@@ -7737,8 +8116,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
     const sRuns = runs.filter((r) => r.scenarioId === s.id);
     const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
     if (scores.length < 3) continue;
-    const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
-    const variance = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
+    const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
+    const variance = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
     if (variance > varianceThreshold) {
       targets.push({
         reason: "high-variance",
@@ -7969,7 +8348,7 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
 // src/command-runner.ts
 import { spawnSync } from "child_process";
-import { existsSync as existsSync4, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
+import { existsSync as existsSync5, readdirSync as readdirSync2, readFileSync as readFileSync4, statSync as statSync2 } from "fs";
 import { join as join2 } from "path";
 var localCommandRunner = {
   name: "local",
@@ -7998,11 +8377,11 @@ var localCommandRunner = {
     return r.status === 0 && (r.stdout ?? "").trim().length > 0;
   },
   async fileExists(path) {
-    return existsSync4(path);
+    return existsSync5(path);
   },
   async readFile(path) {
     try {
-      return readFileSync3(path, "utf8");
+      return readFileSync4(path, "utf8");
     } catch {
       return null;
     }
@@ -8240,7 +8619,7 @@ function extractErrorCount(text, opts = {}) {
   for (const p of patterns) {
     const matches = Array.from(text.matchAll(p.regex));
     if (matches.length === 0) continue;
-    const count = p.transform ? matches.reduce((sum2, m) => sum2 + p.transform(m), 0) : matches.length;
+    const count = p.transform ? matches.reduce((sum3, m) => sum3 + p.transform(m), 0) : matches.length;
     return {
       count,
       matched: p.name,
@@ -8934,8 +9313,8 @@ function multiToolchainLayer(config) {
 }
 // src/reference-replay.ts
-import { appendFileSync as appendFileSync2, existsSync as existsSync5, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
-import { dirname as dirname2 } from "path";
+import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5 } from "fs";
+import { dirname as dirname3 } from "path";
 var DEFAULT_MATCH_THRESHOLD = 0.55;
 var ALL_SPLITS = ["train", "dev", "test", "holdout"];
 async function runReferenceReplay(cases, options) {
@@ -9053,14 +9432,14 @@ function jsonlReferenceReplayStore(path) {
   return {
     async save(run) {
       await lock.runExclusive(() => {
-        mkdirSync2(dirname2(path), { recursive: true });
-        appendFileSync2(path, `${JSON.stringify(run)}
+        mkdirSync3(dirname3(path), { recursive: true });
+        appendFileSync3(path, `${JSON.stringify(run)}
 `);
       });
     },
     async list() {
       return lock.runExclusive(() => {
-        if (!existsSync5(path)) return [];
+        if (!existsSync6(path)) return [];
         return readJsonl(path);
       });
     }
@@ -9149,7 +9528,7 @@ function decideReferenceReplayPromotion(baseline, candidate, policy = {}) {
       regressions
     };
   }
-  const requiredMeanDelta = mean2(compared.map((item) => item.f1Delta));
+  const requiredMeanDelta = mean3(compared.map((item) => item.f1Delta));
   if (requiredMeanDelta < minF1Delta) {
     return {
       promote: false,
@@ -9284,8 +9663,8 @@ function scorePair(scenario, matcher, reference, candidate) {
 function buildScenarioScore(scenario, matches, falsePositives) {
   const matched = matches.filter((match) => match.matched).length;
   const total = scenario.references.length;
-  const matchedWeight = matches.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
-  const totalWeight = matches.reduce((sum2, match) => sum2 + match.weight, 0);
+  const matchedWeight = matches.filter((match) => match.matched).reduce((sum3, match) => sum3 + match.weight, 0);
+  const totalWeight = matches.reduce((sum3, match) => sum3 + match.weight, 0);
   const precision2 = ratio(matched, matched + falsePositives);
   const recall = ratio(matched, total);
   return {
@@ -9311,11 +9690,11 @@ function aggregateBySplit(scores) {
   return out;
 }
 function aggregateScenarioScores(scores) {
-  const matched = sum(scores.map((score) => score.matched));
-  const total = sum(scores.map((score) => score.total));
-  const falsePositives = sum(scores.map((score) => score.falsePositives));
-  const matchedWeight = sum(scores.map((score) => score.matchedWeight));
-  const totalWeight = sum(scores.map((score) => score.totalWeight));
+  const matched = sum2(scores.map((score) => score.matched));
+  const total = sum2(scores.map((score) => score.total));
+  const falsePositives = sum2(scores.map((score) => score.falsePositives));
+  const matchedWeight = sum2(scores.map((score) => score.matchedWeight));
+  const totalWeight = sum2(scores.map((score) => score.totalWeight));
   const precision2 = ratio(matched, matched + falsePositives);
   const recall = ratio(matched, total);
   return {
@@ -9382,11 +9761,11 @@ function clamp012(value) {
   if (!Number.isFinite(value)) return 0;
   return Math.max(0, Math.min(1, value));
 }
-function sum(values) {
+function sum2(values) {
   return values.reduce((acc, value) => acc + value, 0);
 }
-function mean2(values) {
-  return values.length ? sum(values) / values.length : 0;
+function mean3(values) {
+  return values.length ? sum2(values) / values.length : 0;
 }
 function formatPct(value) {
   return `${(value * 100).toFixed(1)}%`;
@@ -9403,7 +9782,7 @@ function throwIfAborted(signal) {
   throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
 }
 function readJsonl(path) {
-  const raw = readFileSync4(path, "utf8");
+  const raw = readFileSync5(path, "utf8");
   const out = [];
   for (const line of raw.split("\n")) {
     const trimmed = line.trim();
@@ -9650,8 +10029,8 @@ function detectCalibrationDrift(runs, opts) {
           alpha,
           recentN: recent.length,
           historyN: historical.length,
-          recentMean: mean3(recent),
-          historyMean: mean3(historical)
+          recentMean: mean4(recent),
+          historyMean: mean4(historical)
         }
       }
     ];
@@ -9771,7 +10150,7 @@ function chiSquareCritical(df, alpha) {
   }
   return TABLE[10][idx];
 }
-function mean3(xs) {
+function mean4(xs) {
   if (xs.length === 0) return 0;
   return xs.reduce((s, x) => s + x, 0) / xs.length;
 }
@@ -9971,8 +10350,8 @@ async function discoverPersonas(dir, opts = {}) {
 }
 // src/evolution-telemetry.ts
-import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5, writeFileSync } from "fs";
-import { dirname as dirname3 } from "path";
+import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6, writeFileSync } from "fs";
+import { dirname as dirname4 } from "path";
 var MutationTelemetry = class {
   appender;
   constructor(path) {
@@ -10001,17 +10380,17 @@ var LineageRecorder = class {
     this.path = path;
     this.snapshotPath = `${path}.snapshot`;
     this.kindOf = kindOf ?? defaultKindOf;
-    mkdirSync3(dirname3(path), { recursive: true });
-    if (existsSync6(this.snapshotPath)) {
+    mkdirSync4(dirname4(path), { recursive: true });
+    if (existsSync7(this.snapshotPath)) {
       try {
-        const parsed = JSON.parse(readFileSync5(this.snapshotPath, "utf-8"));
+        const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
         for (const n of parsed) this.nodes.set(n.id, n);
       } catch {
       }
     }
-    if (existsSync6(path)) {
+    if (existsSync7(path)) {
       try {
-        for (const line of readFileSync5(path, "utf-8").split("\n")) {
+        for (const line of readFileSync6(path, "utf-8").split("\n")) {
           if (!line.trim()) continue;
           try {
             const entry = JSON.parse(line);
@@ -10023,9 +10402,9 @@ var LineageRecorder = class {
       } catch {
       }
     }
-    if (existsSync6(path) && this.nodes.size === 0) {
+    if (existsSync7(path) && this.nodes.size === 0) {
       try {
-        const raw = readFileSync5(path, "utf-8").trim();
+        const raw = readFileSync6(path, "utf-8").trim();
         if (raw.startsWith("[")) {
           const parsed = JSON.parse(raw);
           for (const n of parsed) this.nodes.set(n.id, n);
@@ -10039,15 +10418,15 @@ var LineageRecorder = class {
       const prev = this.nodes.get(node.id);
       this.nodes.set(node.id, { ...prev, ...node });
       try {
-        if (existsSync6(this.path)) {
-          const head = readFileSync5(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
+        if (existsSync7(this.path)) {
+          const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
           if (head === "[") {
             writeFileSync(this.path, "");
           }
         }
       } catch {
       }
-      appendFileSync3(this.path, `${JSON.stringify(this.nodes.get(node.id))}
+      appendFileSync4(this.path, `${JSON.stringify(this.nodes.get(node.id))}
 `);
     });
   }
@@ -10106,9 +10485,9 @@ var CostLedger = class {
   mutex = new Mutex();
   constructor(path) {
     this.path = path;
-    if (existsSync6(path)) {
+    if (existsSync7(path)) {
       try {
-        const loaded = JSON.parse(readFileSync5(path, "utf-8"));
+        const loaded = JSON.parse(readFileSync6(path, "utf-8"));
         for (const k of Object.keys(this.totals)) {
           if (k === "byGeneration") {
             if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
@@ -10125,7 +10504,7 @@ var CostLedger = class {
       } catch {
       }
     } else {
-      mkdirSync3(dirname3(path), { recursive: true });
+      mkdirSync4(dirname4(path), { recursive: true });
     }
   }
   genBucket(generation) {
@@ -10277,16 +10656,16 @@ function precision(goldens, candidates, options = {}) {
 }
 // src/jsonl-trial-cache.ts
-import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6 } from "fs";
-import { dirname as dirname4 } from "path";
+import { appendFileSync as appendFileSync5, existsSync as existsSync8, mkdirSync as mkdirSync5, readFileSync as readFileSync7 } from "fs";
+import { dirname as dirname5 } from "path";
 var JsonlTrialCache = class {
   map = /* @__PURE__ */ new Map();
   path;
   appender;
   constructor(path) {
     this.path = path;
-    if (existsSync7(path)) {
-      for (const line of readFileSync6(path, "utf-8").split("\n")) {
+    if (existsSync8(path)) {
+      for (const line of readFileSync7(path, "utf-8").split("\n")) {
         if (!line.trim()) continue;
         try {
           const entry = JSON.parse(line);
@@ -10295,7 +10674,7 @@ var JsonlTrialCache = class {
         }
       }
     } else {
-      mkdirSync4(dirname4(path), { recursive: true });
+      mkdirSync5(dirname5(path), { recursive: true });
     }
     this.appender = new LockedJsonlAppender(path);
   }
@@ -10318,7 +10697,7 @@ var JsonlTrialCache = class {
   setSync(key, value) {
     this.map.set(key, value);
     const line = { key, result: value, writtenAt: Date.now() };
-    appendFileSync4(this.path, `${JSON.stringify(line)}
+    appendFileSync5(this.path, `${JSON.stringify(line)}
 `);
   }
 };
@@ -10401,9 +10780,9 @@ function passOrthogonality(input) {
       sims.push(cosineSimilarity(vectors[i], vectors[j]));
     }
   }
-  const mean4 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
+  const mean5 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
   return {
-    orthogonality: Math.max(0, Math.min(1, 1 - mean4)),
+    orthogonality: Math.max(0, Math.min(1, 1 - mean5)),
     passCount: passes.length,
     similarities: sims
   };
@@ -10653,6 +11032,7 @@ function aggregateTrialsByMode(trials, opts) {
   };
 }
 export {
+  AGENT_PROFILE_KINDS,
   ANALYST_SEVERITIES,
   AgentDriver,
   AgentEvalError,
@@ -10678,6 +11058,7 @@ export {
   DEFAULT_HARNESS_OBJECTIVES,
   DEFAULT_MUTATION_PRIMITIVES,
   DEFAULT_MUTATORS,
+  DEFAULT_PR_REVIEW_SCORE_WEIGHTS,
   DEFAULT_REDACTION_RULES,
   DEFAULT_RED_TEAM_CORPUS,
   DEFAULT_RUN_SCORE_WEIGHTS,
@@ -10766,13 +11147,16 @@ export {
   adversarialJudge,
   agentProfileCellHashMaterial,
   agentProfileCellKey,
+  agentProfileHash,
   aggregateLlm,
+  aggregatePrReviewScore,
   aggregateRunScore,
   aggregateTrialsByMode,
   allCriticalPassed,
   analyzeAntiSlop,
   analyzeSeries,
   analyzeTraces,
+  appendScorecard,
   argHash,
   assertLlmRoute,
   assertRealBackend,
@@ -10793,6 +11177,7 @@ export {
   buildDriverSystemPrompt,
   buildReflectionPrompt,
   buildReviewerPrompt,
+  buildSandboxAgentProfileCell,
   buildTraceAnalystTools,
   buildTraceInsightContext,
   buildTraceInsightPrompt,
@@ -10816,6 +11201,7 @@ export {
   cohensD,
   coherenceJudge,
   collectionPreserved,
+  commentsForSource,
   commitBisect,
   compareReferenceReplay,
   compareToBaseline,
@@ -10866,6 +11252,7 @@ export {
   deployGateLayer,
   describeTraceInsightScope,
   diffFindings,
+  diffScorecard,
   discoverPersonas,
   distillPlaybook,
   domainEvidencePattern,
@@ -10901,6 +11288,7 @@ export {
   formatBenchmarkReport,
   formatDriverReport,
   formatFindings,
+  formatScorecardDiff,
   gainHistogram,
   ghCliClient,
   precision as goldenPrecision,
@@ -10943,6 +11331,7 @@ export {
   linterJudge,
   llmSpanFromProvider,
   llmSpans,
+  loadScorecard,
   loadScorerFromGrader,
   localCommandRunner,
   lowercaseMutator,
@@ -10984,6 +11373,8 @@ export {
   proposeSynthesisTargets,
   providerFromBaseUrl,
   pytestTestParser,
+  recordRuns,
+  recordRunsToScorecard,
   redTeamDataset,
   redTeamReport,
   redactString,
@@ -11042,6 +11433,8 @@ export {
   scoreContinuity,
   scoreFromEvals,
   scoreKnowledgeReadiness,
+  scorePrReviewComments,
+  scorePrReviewSource,
   scoreRedTeamOutput,
   scoreReferenceReplay,
   scoreTraceInsightReadiness,
@@ -11060,11 +11453,13 @@ export {
   summarize,
   summarizeBackendIntegrity,
   summarizeHarnessResults,
+  summarizePrReviewBenchmark,
   summarizePreferenceMemory,
   summaryTable,
   testJudge,
   textInSnapshot,
   throwIfRunIncomplete,
+  toAgentProfileJson,
   toLangfuseEnvelope,
   toPrometheusText,
   tokenizeDomainWords,