@tangle-network/agent-eval 0.33.1 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -333,7 +333,7 @@ var RunCritic = class {
333
333
  );
334
334
  const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
335
335
  if (!success) notes.push("run did not complete with pass=true");
336
- const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
336
+ const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum3, span) => sum3 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
337
337
  const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
338
338
  trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
339
339
  ) : void 0;
@@ -348,7 +348,7 @@ var RunCritic = class {
348
348
  (span) => typeof span.testsTotal === "number" && span.testsTotal > 0
349
349
  );
350
350
  const testReality = sandboxTests.length ? sandboxTests.reduce(
351
- (sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
351
+ (sum3, span) => sum3 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
352
352
  0
353
353
  ) / sandboxTests.length : toolSpans2.some(
354
354
  (span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
@@ -370,7 +370,7 @@ var RunCritic = class {
370
370
  const costUsd = trace.budget.length ? Math.max(
371
371
  ...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
372
372
  0
373
- ) : llmSpans2.reduce((sum2, span) => sum2 + (span.costUsd ?? 0), 0);
373
+ ) : llmSpans2.reduce((sum3, span) => sum3 + (span.costUsd ?? 0), 0);
374
374
  const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
375
375
  return {
376
376
  success,
@@ -1960,12 +1960,12 @@ function allocateBudget(policy, args) {
1960
1960
  return policy.totalUsd / Math.max(1, args.runningCount);
1961
1961
  }
1962
1962
  function sumFindingCost(findings) {
1963
- let sum2 = 0;
1963
+ let sum3 = 0;
1964
1964
  for (const f of findings) {
1965
1965
  const c = f.metadata?.cost_usd;
1966
- if (typeof c === "number" && Number.isFinite(c)) sum2 += c;
1966
+ if (typeof c === "number" && Number.isFinite(c)) sum3 += c;
1967
1967
  }
1968
- return sum2;
1968
+ return sum3;
1969
1969
  }
1970
1970
  function selectPriorFindings(source, analystId) {
1971
1971
  if (!source) return void 0;
@@ -2184,10 +2184,10 @@ function ghCliClient(opts = {}) {
2184
2184
  await exec("git", ["branch", "-D", input.branchName], { cwd });
2185
2185
  await run("git", ["checkout", "-b", input.branchName]);
2186
2186
  const { mkdir, writeFile } = await import("fs/promises");
2187
- const { dirname: dirname5, join: join4, resolve } = await import("path");
2187
+ const { dirname: dirname6, join: join4, resolve } = await import("path");
2188
2188
  for (const change of input.fileChanges) {
2189
2189
  const abs = resolve(cwd, change.path);
2190
- await mkdir(dirname5(abs), { recursive: true });
2190
+ await mkdir(dirname6(abs), { recursive: true });
2191
2191
  await writeFile(abs, change.contents, "utf8");
2192
2192
  await run("git", ["add", join4(change.path)]);
2193
2193
  }
@@ -3722,6 +3722,178 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
3722
3722
  };
3723
3723
  }
3724
3724
 
3725
+ // src/pr-review-benchmark.ts
3726
/**
 * Default relative weights for the five PR-review score dimensions.
 *
 * Caller-supplied weights override these per key in aggregatePrReviewScore,
 * which normalizes by the weight total — so only the ratios between the
 * values matter, not their absolute size.
 */
var DEFAULT_PR_REVIEW_SCORE_WEIGHTS = {
  // Fraction of reference findings the reviewer surfaced.
  recall: 4,
  // Fraction of comments that were accepted/fixed (or matched a reference).
  precision: 2,
  // Fraction of comments that point somewhere and ask for a concrete change.
  actionability: 1.5,
  // Fraction of severity-labelled comments whose severity was plausible.
  severityCalibration: 1,
  // One minus the fraction of comments labelled rejected/duplicate/noise.
  lowNoise: 1
};
3733
/**
 * Select the comments of an audit case that were produced by one source.
 *
 * @param {{ comments: Array<{ source: unknown }> }} auditCase - Case whose `comments` are filtered.
 * @param {unknown} source - Source identifier compared with `===` against `comment.source`.
 * @returns {Array} A new array holding only the comments from `source`, in original order.
 */
function commentsForSource(auditCase, source) {
  const isFromSource = (comment) => comment.source === source;
  return auditCase.comments.filter(isFromSource);
}
3736
/**
 * Score one review source on one audit case.
 *
 * Narrows the case's comments to those authored by `source` and hands them
 * to scorePrReviewComments.
 *
 * @param {object} auditCase - Audit case carrying `comments` and `referenceFindings`.
 * @param {unknown} source - Source whose comments are scored.
 * @param {object} [weights={}] - Optional per-dimension weight overrides.
 * @returns {object} The score record produced by scorePrReviewComments.
 */
function scorePrReviewSource(auditCase, source, weights = {}) {
  const sourceComments = commentsForSource(auditCase, source);
  return scorePrReviewComments(auditCase, sourceComments, source, weights);
}
3739
/**
 * Score a set of review comments against an audit case's reference findings.
 *
 * Dimensions (each in [0, 1] before weighting):
 *  - recall: matched references / references; with no references it is 1
 *    when there are no comments and 0 otherwise.
 *  - precision: accepted-or-fixed / (positive + negative outcomes); without
 *    any labelled outcome it falls back to the share of comments that matched
 *    a reference; with no comments at all it is 1 only when there are no
 *    references either.
 *  - actionability: share of comments passing isActionableComment (1 when empty).
 *  - severityCalibration: share of severity-labelled comments that are
 *    severity-aligned; with no labelled comments it is 0.5 if anything
 *    matched, else 1.
 *  - lowNoise: 1 minus the share of negatively-labelled comments (1 when empty).
 *
 * @param {object} auditCase - Case with `id` and `referenceFindings`.
 * @param {Array<object>} comments - Comments to score.
 * @param {unknown} source - Source label copied into the result.
 * @param {object} [weights={}] - Optional per-dimension weight overrides.
 * @returns {object} Score record with counts, the five dimensions, the
 *   weighted aggregate, the reference matches and human-readable notes.
 */
function scorePrReviewComments(auditCase, comments, source, weights = {}) {
  const references = auditCase.referenceFindings;
  const referenceCount = references.length;
  const commentCount = comments.length;

  const matchedFindings = matchReferenceFindings(references, comments);
  const matchedCommentIds = new Set(matchedFindings.map((match) => match.commentId));

  const positiveComments = comments.filter((comment) => isPositiveOutcome(comment.outcome));
  const negativeComments = comments.filter((comment) => isNegativeOutcome(comment.outcome));
  const actionableComments = comments.filter(isActionableComment);
  const severityComments = comments.filter((comment) => comment.severity);
  const severityAligned = severityComments.filter(
    (comment) => isSeverityAligned(comment, references, matchedFindings)
  );

  let recall;
  if (referenceCount) {
    recall = matchedFindings.length / referenceCount;
  } else {
    recall = commentCount === 0 ? 1 : 0;
  }

  // Prefer human-labelled outcomes; only fall back to reference matches when
  // no comment carries a positive or negative outcome.
  const labelledCount = positiveComments.length + negativeComments.length;
  let precision2;
  if (labelledCount > 0) {
    precision2 = positiveComments.length / labelledCount;
  } else if (commentCount > 0) {
    precision2 = matchedCommentIds.size / commentCount;
  } else {
    precision2 = referenceCount === 0 ? 1 : 0;
  }

  const actionability = commentCount ? actionableComments.length / commentCount : 1;

  let severityCalibration;
  if (severityComments.length) {
    severityCalibration = severityAligned.length / severityComments.length;
  } else {
    severityCalibration = matchedFindings.length ? 0.5 : 1;
  }

  const lowNoise = commentCount ? 1 - negativeComments.length / commentCount : 1;

  const aggregate2 = aggregatePrReviewScore(
    { recall, precision: precision2, actionability, severityCalibration, lowNoise },
    weights
  );
  const notes = buildScoreNotes({
    comments,
    referenceCount,
    matchedFindings,
    negativeComments,
    actionableComments
  });

  return {
    caseId: auditCase.id,
    source,
    commentCount,
    referenceCount,
    matchedFindings,
    recall,
    precision: precision2,
    actionability,
    severityCalibration,
    lowNoise,
    aggregate: aggregate2,
    notes
  };
}
3780
/**
 * Roll per-case PR-review scores up into one summary row per source.
 *
 * Scores are bucketed by `score.source` (first-seen order), each bucket is
 * reduced to a case count, a total comment count and the mean of every
 * dimension, and the rows are returned sorted by descending `aggregateMean`.
 *
 * @param {Array<object>} scores - Score records as returned by scorePrReviewComments.
 * @returns {Array<object>} One summary row per distinct source, best aggregate first.
 */
function summarizePrReviewBenchmark(scores) {
  const bySource = new Map();
  for (const score of scores) {
    // Append in place: re-spreading the bucket for every score would copy the
    // whole bucket each time and make grouping quadratic in the bucket size.
    const bucket = bySource.get(score.source);
    if (bucket) bucket.push(score);
    else bySource.set(score.source, [score]);
  }
  return [...bySource.entries()].map(([source, sourceScores]) => ({
    source,
    caseCount: sourceScores.length,
    commentCount: sum(sourceScores.map((score) => score.commentCount)),
    aggregateMean: mean(sourceScores.map((score) => score.aggregate)),
    recallMean: mean(sourceScores.map((score) => score.recall)),
    precisionMean: mean(sourceScores.map((score) => score.precision)),
    actionabilityMean: mean(sourceScores.map((score) => score.actionability)),
    severityCalibrationMean: mean(sourceScores.map((score) => score.severityCalibration)),
    lowNoiseMean: mean(sourceScores.map((score) => score.lowNoise))
  })).sort((a, b) => b.aggregateMean - a.aggregateMean);
}
3797
/**
 * Collapse the five PR-review dimensions into one weighted score.
 *
 * Each dimension is clamped via clamp01 and weighted by the caller's override
 * or, when that is null/undefined, by DEFAULT_PR_REVIEW_SCORE_WEIGHTS. The
 * same sanitized weight is used in the numerator and the denominator:
 * negative or non-finite weights count as 0, and keys that are not one of
 * the five dimensions are ignored. (Previously negative weights were clamped
 * only in the denominator and unknown keys inflated it, so such inputs gave
 * inconsistent or NaN results.)
 *
 * @param {{recall:number, precision:number, actionability:number, severityCalibration:number, lowNoise:number}} dimensions
 *   Dimension values to combine.
 * @param {object} [weights={}] - Optional per-dimension weight overrides.
 * @returns {number} Weighted mean of the clamped dimensions, or 0 when every weight is 0.
 */
function aggregatePrReviewScore(dimensions, weights = {}) {
  const overrides = weights ?? {};
  // Fixed order keeps floating-point summation identical to the defaults' key order.
  const dimensionNames = ["recall", "precision", "actionability", "severityCalibration", "lowNoise"];
  let weightSum = 0;
  let weightedTotal = 0;
  for (const name of dimensionNames) {
    const raw = overrides[name] ?? DEFAULT_PR_REVIEW_SCORE_WEIGHTS[name];
    const weight = typeof raw === "number" && Number.isFinite(raw) ? Math.max(0, raw) : 0;
    weightSum += weight;
    weightedTotal += weight * clamp01(dimensions[name]);
  }
  if (weightSum <= 0) return 0;
  return weightedTotal / weightSum;
}
3803
/**
 * Greedily pair reference findings with review comments.
 *
 * References are visited in order. Each one claims the not-yet-claimed
 * comment with the highest matchScore, provided that score is at least 0.55;
 * on a tie the earlier comment wins. A comment is paired at most once, and a
 * reference with no qualifying comment is simply left out of the result.
 *
 * @param {Array<object>} references - Reference findings (each with an `id`).
 * @param {Array<object>} comments - Candidate comments (each with an `id`).
 * @returns {Array<{referenceId: *, commentId: *, score: number}>} One entry per matched reference.
 */
function matchReferenceFindings(references, comments) {
  const claimedIds = new Set();
  const pairs = [];
  for (const reference of references) {
    const leader = comments.reduce((best, comment) => {
      if (claimedIds.has(comment.id)) return best;
      const score = matchScore(reference, comment);
      if (!(score >= 0.55)) return best;
      // Strict comparison keeps the earliest comment on equal scores.
      if (best !== null && best.score >= score) return best;
      return { comment, score };
    }, null);
    if (leader === null) continue;
    claimedIds.add(leader.comment.id);
    pairs.push({
      referenceId: reference.id,
      commentId: leader.comment.id,
      score: leader.score
    });
  }
  return pairs;
}
3815
/**
 * Score how well a comment corresponds to a reference finding, in [0, 1].
 *
 * Contributions, summed and then clamped by clamp01:
 *  - +1    when the reference lists the comment's id in `sourceCommentIds`;
 *  - +0.35 when both have a path and the normalized paths are equal;
 *  - +0.15 when both have a (truthy) line and they are at most 3 apart;
 *  - up to +0.5 in proportion to how many of the reference's distinct terms
 *    (keywords plus title tokens, normalized, length >= 3) occur in the
 *    comment body.
 *
 * @param {object} reference - Reference finding (`title`, optional `keywords`, `path`, `line`, `sourceCommentIds`).
 * @param {object} comment - Review comment (`id`, `body`, optional `path`, `line`).
 * @returns {number} Match score as returned by clamp01.
 */
function matchScore(reference, comment) {
  let total = 0;

  if (reference.sourceCommentIds?.includes(comment.id)) {
    total += 1;
  }

  const bothHavePath = reference.path && comment.path;
  if (bothHavePath && normalizePath(reference.path) === normalizePath(comment.path)) {
    total += 0.35;
  }

  const bothHaveLine = reference.line && comment.line;
  if (bothHaveLine && Math.abs(reference.line - comment.line) <= 3) {
    total += 0.15;
  }

  // Distinct, normalized reference vocabulary; very short terms are ignored.
  const vocabulary = new Set();
  for (const raw of [...(reference.keywords ?? []), ...tokenize(reference.title)]) {
    const term = normalizeTerm(raw);
    if (term.length >= 3) vocabulary.add(term);
  }

  if (vocabulary.size > 0) {
    const bodyTerms = new Set(tokenize(comment.body).map((token) => normalizeTerm(token)));
    let overlap = 0;
    for (const term of vocabulary) {
      if (bodyTerms.has(term)) overlap += 1;
    }
    total += 0.5 * (overlap / vocabulary.size);
  }

  return clamp01(total);
}
3831
/**
 * Heuristic: does a comment point at something and ask for a concrete change?
 *
 * A comment is actionable when it is anchored (it has a truthy `path`, or its
 * trimmed body mentions a locator word such as "file" or "function") and its
 * body contains an action verb such as "fix", "add" or "validate". Both word
 * lists are matched case-insensitively on word boundaries.
 *
 * @param {{ body: string, path?: string }} comment - Comment to classify.
 * @returns {boolean} True when the comment is anchored and contains an action verb.
 */
function isActionableComment(comment) {
  const text = comment.body.trim();
  const locatorWords = /\b(file|line|function|method|class|module|test|migration)\b/i;
  const actionWords = /\b(fix|change|add|remove|guard|check|reject|validate|test|assert|return|throw|fail|block)\b/i;
  const isAnchored = Boolean(comment.path) || locatorWords.test(text);
  return isAnchored && actionWords.test(text);
}
3840
/**
 * Decide whether a comment's severity label is plausible.
 *
 *  - No severity on the comment: not aligned.
 *  - Comment matched no reference: aligned only if it is "nit" or "low".
 *  - Comment matched a reference: aligned when the two severity ranks differ
 *    by at most one step. A match whose reference cannot be found is not aligned.
 *
 * @param {{ id: *, severity?: string }} comment - Comment under test.
 * @param {Array<{ id: *, severity: string }>} references - Reference findings.
 * @param {Array<{ referenceId: *, commentId: * }>} matches - Output of matchReferenceFindings.
 * @returns {boolean} Whether the severity label counts as calibrated.
 */
function isSeverityAligned(comment, references, matches) {
  const claimed = comment.severity;
  if (!claimed) return false;

  const pairing = matches.find((candidate) => candidate.commentId === comment.id);
  if (!pairing) {
    // Unmatched comments are only tolerated when they claim little.
    return claimed === "nit" || claimed === "low";
  }

  const reference = references.find((candidate) => candidate.id === pairing.referenceId);
  if (!reference) return false;

  const rankGap = severityRank(claimed) - severityRank(reference.severity);
  return Math.abs(rankGap) <= 1;
}
3848
/**
 * Produce human-readable remarks that explain a PR-review score.
 *
 * @param {{
 *   comments: Array,
 *   referenceCount: number,
 *   matchedFindings: Array,
 *   negativeComments: Array,
 *   actionableComments: Array
 * }} input - Intermediate results of scorePrReviewComments.
 * @returns {string[]} Zero to three notes, in a fixed order: missed references,
 *   negatively-labelled comments, lack of actionable comments.
 */
function buildScoreNotes(input) {
  const missedEverything = input.referenceCount > 0 && input.matchedFindings.length === 0;
  const negativeCount = input.negativeComments.length;
  const nothingActionable = input.comments.length > 0 && input.actionableComments.length === 0;

  return [
    ...(missedEverything ? ["no reference findings matched"] : []),
    ...(negativeCount > 0 ? [`${negativeCount} comment(s) labelled rejected/duplicate/noise`] : []),
    ...(nothingActionable ? ["comments were not actionable enough for a PR reviewer benchmark"] : [])
  ];
}
3861
/**
 * Whether a comment outcome counts in the reviewer's favour.
 *
 * @param {unknown} outcome - Outcome label of a comment.
 * @returns {boolean} True for "accepted" and "fixed" only.
 */
function isPositiveOutcome(outcome) {
  const positiveOutcomes = ["accepted", "fixed"];
  return positiveOutcomes.includes(outcome);
}
3864
/**
 * Whether a comment outcome counts against the reviewer.
 *
 * @param {unknown} outcome - Outcome label of a comment.
 * @returns {boolean} True for "rejected", "duplicate" and "noise" only.
 */
function isNegativeOutcome(outcome) {
  const negativeOutcomes = ["rejected", "duplicate", "noise"];
  return negativeOutcomes.includes(outcome);
}
3867
/**
 * Map a severity label onto an ordinal rank (higher is more severe).
 *
 * @param {unknown} severity - One of "critical", "high", "medium", "low", "nit".
 * @returns {number|undefined} 5 down to 1 for the known labels; undefined for anything else.
 */
function severityRank(severity) {
  const rankBySeverity = new Map([
    ["critical", 5],
    ["high", 4],
    ["medium", 3],
    ["low", 2],
    ["nit", 1]
  ]);
  return rankBySeverity.get(severity);
}
3881
/**
 * Split text into identifier-like tokens.
 *
 * A token is a maximal run of ASCII letters, digits, `_`, `.`, `$`, `/` or `-`,
 * so dotted names and paths stay in one piece.
 *
 * @param {string} input - Text to split.
 * @returns {string[]} The tokens in order of appearance; empty when none are found.
 */
function tokenize(input) {
  const tokenPattern = /[a-zA-Z0-9_.$/-]+/g;
  const found = input.match(tokenPattern);
  if (found == null) return [];
  return found;
}
3884
/**
 * Normalize a token for comparison: lowercase it and strip any leading or
 * trailing characters that are not `a-z`, `0-9` or `_`. Characters in the
 * middle of the token are left alone.
 *
 * @param {string} input - Raw token.
 * @returns {string} The normalized term (possibly empty).
 */
function normalizeTerm(input) {
  const lowered = input.toLowerCase();
  const edgePunctuation = /^[^a-z0-9_]+|[^a-z0-9_]+$/g;
  return lowered.replace(edgePunctuation, "");
}
3887
/**
 * Drop a single leading "./" (with any extra slashes) from a path so that
 * "./src/a.js" and "src/a.js" compare equal. Nothing else is rewritten.
 *
 * @param {string} input - Path to normalize.
 * @returns {string} The path without its leading "./" prefix.
 */
function normalizePath(input) {
  const leadingDotSlash = /^\.\/+/;
  return input.replace(leadingDotSlash, "");
}
3890
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean(values) {
  if (!values.length) return 0;
  const total = values.reduce((runningTotal, value) => runningTotal + value, 0);
  return total / values.length;
}
3893
/**
 * Add up a list of numbers, left to right, starting from 0.
 *
 * @param {number[]} values - Numbers to add.
 * @returns {number} The total; 0 for an empty list.
 */
function sum(values) {
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total;
}
3896
+
3725
3897
  // src/production-loop.ts
3726
3898
  async function runProductionLoop(opts) {
3727
3899
  validate2(opts);
@@ -5217,14 +5389,14 @@ async function runHarnessExperiment(config) {
5217
5389
  const score = config.score ?? ((trace) => critic.scoreTrace(trace));
5218
5390
  const results = await mapLimit(jobs, config.parallelism ?? 1, async (request) => {
5219
5391
  const trace = await config.adapter.run(request);
5220
- const runScore = await score(trace, request);
5392
+ const runScore2 = await score(trace, request);
5221
5393
  const result = {
5222
5394
  variant: request.variant,
5223
5395
  scenario: request.scenario,
5224
5396
  trialIndex: request.trialIndex,
5225
5397
  trace,
5226
- score: runScore,
5227
- aggregate: aggregateRunScore(runScore, config.weights)
5398
+ score: runScore2,
5399
+ aggregate: aggregateRunScore(runScore2, config.weights)
5228
5400
  };
5229
5401
  await config.onResult?.(result);
5230
5402
  return result;
@@ -5251,10 +5423,10 @@ function summarizeHarnessResults(results) {
5251
5423
  return {
5252
5424
  variant,
5253
5425
  runs,
5254
- aggregateMean: mean(runs.map((r) => r.aggregate)),
5255
- passRate: mean(runs.map((r) => r.score.success)),
5256
- costUsdMean: mean(runs.map((r) => r.score.costUsd)),
5257
- wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)),
5426
+ aggregateMean: mean2(runs.map((r) => r.aggregate)),
5427
+ passRate: mean2(runs.map((r) => r.score.success)),
5428
+ costUsdMean: mean2(runs.map((r) => r.score.costUsd)),
5429
+ wallSecondsMean: mean2(runs.map((r) => r.score.wallSeconds)),
5258
5430
  scoreMean: meanRunScore(runs.map((r) => r.score))
5259
5431
  };
5260
5432
  }).sort((a, b) => b.aggregateMean - a.aggregateMean);
@@ -5291,22 +5463,22 @@ async function mapLimit(items, limit, fn) {
5291
5463
  );
5292
5464
  return results;
5293
5465
  }
5294
- function mean(values) {
5295
- return values.length ? values.reduce((sum2, value) => sum2 + value, 0) / values.length : 0;
5466
+ function mean2(values) {
5467
+ return values.length ? values.reduce((sum3, value) => sum3 + value, 0) / values.length : 0;
5296
5468
  }
5297
5469
  function meanRunScore(scores) {
5298
5470
  return {
5299
- success: mean(scores.map((s) => s.success)),
5300
- goalProgress: mean(scores.map((s) => s.goalProgress)),
5301
- repoGroundedness: mean(scores.map((s) => s.repoGroundedness)),
5302
- driftPenalty: mean(scores.map((s) => s.driftPenalty)),
5303
- toolUseQuality: mean(scores.map((s) => s.toolUseQuality)),
5304
- patchQuality: mean(scores.map((s) => s.patchQuality)),
5305
- testReality: mean(scores.map((s) => s.testReality)),
5306
- finalGate: mean(scores.map((s) => s.finalGate)),
5307
- reviewerBlockers: mean(scores.map((s) => s.reviewerBlockers)),
5308
- costUsd: mean(scores.map((s) => s.costUsd)),
5309
- wallSeconds: mean(scores.map((s) => s.wallSeconds)),
5471
+ success: mean2(scores.map((s) => s.success)),
5472
+ goalProgress: mean2(scores.map((s) => s.goalProgress)),
5473
+ repoGroundedness: mean2(scores.map((s) => s.repoGroundedness)),
5474
+ driftPenalty: mean2(scores.map((s) => s.driftPenalty)),
5475
+ toolUseQuality: mean2(scores.map((s) => s.toolUseQuality)),
5476
+ patchQuality: mean2(scores.map((s) => s.patchQuality)),
5477
+ testReality: mean2(scores.map((s) => s.testReality)),
5478
+ finalGate: mean2(scores.map((s) => s.finalGate)),
5479
+ reviewerBlockers: mean2(scores.map((s) => s.reviewerBlockers)),
5480
+ costUsd: mean2(scores.map((s) => s.costUsd)),
5481
+ wallSeconds: mean2(scores.map((s) => s.wallSeconds)),
5310
5482
  notes: scores.flatMap((s) => s.notes ?? [])
5311
5483
  };
5312
5484
  }
@@ -5645,7 +5817,7 @@ function rankRows(rows, weights) {
5645
5817
  }
5646
5818
  return [...buckets.entries()].map(([variantId, values]) => ({
5647
5819
  variantId,
5648
- mean: values.reduce((sum2, value) => sum2 + value, 0) / values.length,
5820
+ mean: values.reduce((sum3, value) => sum3 + value, 0) / values.length,
5649
5821
  runs: values.length
5650
5822
  })).sort((a, b) => b.mean - a.mean);
5651
5823
  }
@@ -5815,6 +5987,22 @@ var BudgetGuard = class {
5815
5987
  }
5816
5988
  };
5817
5989
 
5990
+ // src/agent-profile.ts
5991
+ import { createHash as createHash2 } from "crypto";
5992
/**
 * Compute a hex SHA-256 digest over the behaviour-defining fields of an agent profile.
 *
 * The digest covers the trimmed model name, sorted copies of `skills` and
 * `tools` (missing lists count as empty), `promptVersion` (null when absent)
 * and `metadata` (an empty object when absent). The object is passed through
 * canonicalize before being serialized with JSON.stringify.
 *
 * @param {object} profile - Agent profile; `model` must be a non-blank string.
 * @returns {string} Hex-encoded SHA-256 digest.
 * @throws {ValidationError} When `profile.model` is not a string or is blank.
 */
function agentProfileHash(profile) {
  const hasModel = typeof profile.model === "string" && profile.model.trim().length > 0;
  if (!hasModel) {
    throw new ValidationError(`AgentProfile "${profile.id}" has no model \u2014 cannot hash`);
  }

  // Sort copies so the caller's arrays are never reordered.
  const sortedCopy = (items) => [...(items ?? [])].sort();

  const behaviour = {
    model: profile.model.trim(),
    skills: sortedCopy(profile.skills),
    promptVersion: profile.promptVersion ?? null,
    tools: sortedCopy(profile.tools),
    metadata: profile.metadata ?? {}
  };

  const serialized = JSON.stringify(canonicalize(behaviour));
  const hasher = createHash2("sha256");
  hasher.update(serialized);
  return hasher.digest("hex");
}
6005
+
5818
6006
  // src/cost-tracker.ts
5819
6007
  var CostTracker = class {
5820
6008
  byScenario = /* @__PURE__ */ new Map();
@@ -6221,6 +6409,194 @@ function isObject(v) {
6221
6409
  return typeof v === "object" && v !== null && !Array.isArray(v);
6222
6410
  }
6223
6411
 
6412
+ // src/scorecard.ts
6413
+ import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync3 } from "fs";
6414
+ import { dirname as dirname2 } from "path";
6415
/**
 * Median of a list of numbers; the input array is not modified.
 *
 * @param {number[]} xs - Values to take the median of.
 * @returns {number} The middle value, the mean of the two middle values for
 *   an even-length list, or 0 for an empty list.
 */
function median(xs) {
  if (xs.length === 0) return 0;
  const ascending = [...xs];
  ascending.sort((left, right) => left - right);
  const upperMiddle = Math.floor(ascending.length / 2);
  const hasSingleMiddle = ascending.length % 2 !== 0;
  if (hasSingleMiddle) return ascending[upperMiddle];
  return (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
}
6421
/**
 * Pick the score that represents a run: the holdout score when it is present
 * (not null/undefined), otherwise the search score.
 *
 * @param {{ outcome: { holdoutScore?: number, searchScore?: number } }} run - Run record.
 * @returns {number|undefined} The chosen score, or whatever `searchScore` holds when there is no holdout score.
 */
function runScore(run) {
  const outcome = run.outcome;
  if (outcome.holdoutScore != null) return outcome.holdoutScore;
  return outcome.searchScore;
}
6424
/**
 * Average the per-dimension judge scores across a set of runs.
 *
 * Reads `run.outcome.judgeScores.perDimMean` from every run that has it and,
 * for each dimension, averages the finite values seen. Runs without judge
 * scores and non-finite values are ignored.
 *
 * @param {Array<object>} runs - Run records.
 * @returns {Object<string, number>|undefined} Mean per dimension, keyed in
 *   first-seen order, or undefined when no finite value was found at all.
 */
function aggregatePerDimension(runs) {
  const tallies = new Map();
  for (const { outcome } of runs) {
    const perDimMean = outcome.judgeScores?.perDimMean;
    if (!perDimMean) continue;
    for (const [dim, value] of Object.entries(perDimMean)) {
      if (!Number.isFinite(value)) continue;
      let tally = tallies.get(dim);
      if (tally === undefined) {
        tally = { total: 0, count: 0 };
        tallies.set(dim, tally);
      }
      tally.total += value;
      tally.count += 1;
    }
  }
  if (tallies.size === 0) return void 0;

  const averages = {};
  tallies.forEach((tally, dim) => {
    averages[dim] = tally.total / tally.count;
  });
  return averages;
}
6442
/**
 * Turn a batch of runs into scorecard lines, one per scenario.
 *
 * Runs are grouped by `scenarioId` (runs without one are dropped). For every
 * scenario with at least one run whose runScore is not undefined, a line is
 * produced holding the profile hash, model, full profile and an entry with
 * the commit, timestamp, the individual scores, their median as `composite`
 * and the contributing run ids. When judge dimensions are available for the
 * scenario's runs they are attached as `entry.perDimension`.
 *
 * @param {Array<object>} runs - Run records.
 * @param {{ profile: object, commitSha: *, timestamp?: string }} opts - Recording
 *   context; `timestamp` defaults to the current time in ISO-8601 form.
 * @returns {Array<object>} Scorecard lines in first-seen scenario order.
 * @throws Whatever agentProfileHash throws for an invalid profile.
 */
function recordRuns(runs, opts) {
  const profileHash = agentProfileHash(opts.profile);
  const timestamp = opts.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();

  const runsByScenario = new Map();
  for (const run of runs) {
    const { scenarioId } = run;
    if (!scenarioId) continue;
    if (!runsByScenario.has(scenarioId)) runsByScenario.set(scenarioId, []);
    runsByScenario.get(scenarioId).push(run);
  }

  const lines = [];
  runsByScenario.forEach((scenarioRuns, scenarioId) => {
    const scored = scenarioRuns
      .map((run) => ({ run, score: runScore(run) }))
      .filter((pair) => pair.score !== void 0);
    // A scenario with no scored run contributes nothing to the scorecard.
    if (scored.length === 0) return;

    const scores = scored.map((pair) => pair.score);
    const entry = {
      commitSha: opts.commitSha,
      timestamp,
      scores,
      composite: median(scores),
      runIds: scored.map((pair) => pair.run.runId)
    };
    const perDimension = aggregatePerDimension(scenarioRuns);
    if (perDimension) entry.perDimension = perDimension;

    lines.push({
      scenarioId,
      profileHash,
      model: opts.profile.model,
      profile: opts.profile,
      entry
    });
  });
  return lines;
}
6477
/**
 * Append scorecard lines to a JSONL log, creating parent directories as needed.
 *
 * Each line is serialized with JSON.stringify and terminated by a newline.
 * Nothing is written (and no directory is created) for an empty list.
 *
 * @param {string} logPath - Path of the JSONL file to append to.
 * @param {Array<object>} lines - Scorecard lines to write.
 * @returns {void}
 */
function appendScorecard(logPath, lines) {
  if (lines.length === 0) return;
  const serialized = lines.map((line) => JSON.stringify(line));
  const payload = `${serialized.join("\n")}\n`;
  mkdirSync2(dirname2(logPath), { recursive: true });
  appendFileSync2(logPath, payload);
}
6483
/**
 * Convenience wrapper: build scorecard lines for `runs` with recordRuns and
 * append them to the log at `logPath` with appendScorecard.
 *
 * @param {string} logPath - Path of the JSONL scorecard log.
 * @param {Array<object>} runs - Run records.
 * @param {object} opts - Options forwarded to recordRuns.
 * @returns {Array<object>} The lines that were generated (and appended).
 */
function recordRunsToScorecard(logPath, runs, opts) {
  const recorded = recordRuns(runs, opts);
  appendScorecard(logPath, recorded);
  return recorded;
}
6488
/**
 * Load a JSONL scorecard log and fold it into per-(scenario, profile) cells.
 *
 * Every non-blank line is parsed as JSON. Lines that cannot be used are
 * skipped rather than failing the whole load: unparseable JSON, a missing
 * `scenarioId` / `profileHash` / `entry`, and — so the timeline sort below
 * cannot break on a malformed record — an entry without a string `timestamp`.
 * Entries are grouped by `scenarioId::profileHash`, each cell's timeline is
 * sorted by timestamp, and the most recently read profile per hash is kept.
 *
 * @param {string} logPath - Path of the JSONL scorecard log.
 * @returns {{ cells: Array<object>, profiles: Object<string, object> }}
 *   The cells (first-seen order) and the profiles keyed by profile hash;
 *   both empty when the file does not exist.
 */
function loadScorecard(logPath) {
  if (!existsSync4(logPath)) return { cells: [], profiles: {} };
  const cells = /* @__PURE__ */ new Map();
  const profiles = {};
  for (const raw of readFileSync3(logPath, "utf8").split("\n")) {
    const line = raw.trim();
    if (!line) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Best-effort loader: a corrupt line must not hide the rest of the log.
      continue;
    }
    if (!parsed?.scenarioId || !parsed.profileHash || !parsed.entry) continue;
    // The timeline is ordered with timestamp.localeCompare; an entry without a
    // string timestamp would throw there or sort arbitrarily, so drop it here.
    if (typeof parsed.entry.timestamp !== "string") continue;
    const key = `${parsed.scenarioId}::${parsed.profileHash}`;
    let cell = cells.get(key);
    if (!cell) {
      cell = {
        scenarioId: parsed.scenarioId,
        profileHash: parsed.profileHash,
        model: parsed.model,
        timeline: []
      };
      cells.set(key, cell);
    }
    cell.timeline.push(parsed.entry);
    if (parsed.profile) profiles[parsed.profileHash] = parsed.profile;
  }
  for (const cell of cells.values()) {
    cell.timeline.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
  }
  return { cells: [...cells.values()], profiles };
}
6521
/**
 * Compare the latest scorecard entry of every cell with a baseline entry.
 *
 * Baseline selection: with `opts.baselineCommit`, the most recent entry of
 * that commit other than the current one; otherwise the entry just before the
 * current one. A cell without a baseline is reported as "new".
 *
 * Verdict:
 *  - When both entries have at least two scores and both Cohen's d and the
 *    Welch p-value are finite, the change is significant if
 *    |d| >= minEffect and p <= maxP; otherwise it is "flat".
 *  - When the statistics cannot be computed or come back non-finite (for
 *    example zero-variance samples), the verdict falls back to the plain
 *    threshold |delta| >= minDelta instead of silently reporting "flat".
 *  - The direction comes from the sign of the composite delta; a delta of
 *    exactly 0 is "flat", never "regressed".
 *
 * A non-finite Cohen's d is reported as null, mirroring the p-value.
 *
 * @param {{ cells: Array<object> }} scorecard - Scorecard as returned by loadScorecard.
 * @param {{ minEffect?: number, maxP?: number, minDelta?: number, baselineCommit?: * }} [opts={}]
 *   Thresholds (defaults 0.5, 0.05, 0.05) and optional baseline commit.
 * @returns {{ cells: Array<object>, summary: { improved: number, regressed: number, flat: number, new: number } }}
 *   One diff cell per non-empty scorecard cell plus verdict counts.
 */
function diffScorecard(scorecard, opts = {}) {
  const minEffect = opts.minEffect ?? 0.5;
  const maxP = opts.maxP ?? 0.05;
  const minDelta = opts.minDelta ?? 0.05;
  const directionOf = (delta) => {
    if (delta > 0) return "improved";
    if (delta < 0) return "regressed";
    return "flat";
  };
  const cells = [];
  for (const cell of scorecard.cells) {
    const timeline = cell.timeline;
    if (timeline.length === 0) continue;
    const current = timeline[timeline.length - 1];
    const baseline = opts.baselineCommit
      ? [...timeline].reverse().find((e) => e.commitSha === opts.baselineCommit && e !== current)
      : timeline[timeline.length - 2];
    const base = {
      scenarioId: cell.scenarioId,
      profileHash: cell.profileHash,
      model: cell.model,
      current: current.composite,
      currentCommit: current.commitSha
    };
    if (!baseline) {
      cells.push({
        ...base,
        verdict: "new",
        baseline: null,
        delta: null,
        cohensD: null,
        pValue: null,
        baselineCommit: null
      });
      continue;
    }
    const delta = current.composite - baseline.composite;
    let d = null;
    let p = null;
    if (baseline.scores.length >= 2 && current.scores.length >= 2) {
      const effect = cohensD(baseline.scores, current.scores);
      const t = welchsTTest(baseline.scores, current.scores);
      d = Number.isFinite(effect) ? effect : null;
      p = Number.isFinite(t.p) ? t.p : null;
    }
    let verdict;
    if (d !== null && p !== null) {
      const significant = Math.abs(d) >= minEffect && p <= maxP;
      verdict = significant ? directionOf(delta) : "flat";
    } else {
      // No usable statistics: judge by the size of the composite change alone.
      verdict = Math.abs(delta) >= minDelta ? directionOf(delta) : "flat";
    }
    cells.push({
      ...base,
      verdict,
      baseline: baseline.composite,
      delta,
      cohensD: d,
      pValue: p,
      baselineCommit: baseline.commitSha
    });
  }
  const summary = { improved: 0, regressed: 0, flat: 0, new: 0 };
  for (const cell of cells) summary[cell.verdict] += 1;
  return { cells, summary };
}
6578
/**
 * Render a scorecard diff as plain text.
 *
 * The first line is a summary of the verdict counts. It is followed by one
 * line per regressed or improved cell — regressions first, then by
 * descending absolute delta — showing scenario, model, the first 8
 * characters of the profile hash, baseline and current composite (3
 * decimals), the signed delta and, when available, Cohen's d and the p-value.
 * Flat and new cells are only counted, not listed.
 *
 * @param {{ cells: Array<object>, summary: object }} diff - Result of diffScorecard.
 * @returns {string} The report, lines joined with "\n".
 */
function formatScorecardDiff(diff) {
  const { summary } = diff;
  const fmt = (n) => n.toFixed(3);

  const header = `Scorecard: ${summary.regressed} regressed \xB7 ${summary.improved} improved \xB7 ${summary.flat} flat \xB7 ${summary.new} new`;

  const byUrgency = (a, b) => {
    if (a.verdict === b.verdict) {
      return Math.abs(b.delta ?? 0) - Math.abs(a.delta ?? 0);
    }
    return a.verdict === "regressed" ? -1 : 1;
  };
  const noteworthy = diff.cells.filter((c) => c.verdict === "regressed" || c.verdict === "improved");
  noteworthy.sort(byUrgency);

  const describe = (cell) => {
    const mark = cell.verdict === "regressed" ? "REGRESSED" : "improved";

    let deltaStr = "\u2014";
    if (cell.delta !== null) {
      deltaStr = cell.delta >= 0 ? `+${fmt(cell.delta)}` : fmt(cell.delta);
    }

    let stat = "";
    if (cell.cohensD !== null) {
      const pPart = cell.pValue !== null ? `, p=${cell.pValue.toFixed(3)}` : "";
      stat = ` (d=${cell.cohensD.toFixed(2)}${pPart})`;
    }

    return ` ${mark} ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)} ${fmt(cell.baseline ?? 0)} \u2192 ${fmt(cell.current)} ${deltaStr}${stat}`;
  };

  return [header, ...noteworthy.map(describe)].join("\n");
}
6599
+
6224
6600
  // src/series-convergence.ts
6225
6601
  function analyzeSeries(values, options = {}) {
6226
6602
  const window = options.window ?? 5;
@@ -6230,10 +6606,10 @@ function analyzeSeries(values, options = {}) {
6230
6606
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
6231
6607
  }
6232
6608
  const tail = values.slice(-window);
6233
- const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
6234
- const variance = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
6609
+ const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
6610
+ const variance = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
6235
6611
  const stdDev = Math.sqrt(variance);
6236
- const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
6612
+ const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
6237
6613
  const cv = stdDev / refMean;
6238
6614
  const stable = tail.length >= window && cv <= stableCv;
6239
6615
  let tailRun = 0;
@@ -6254,7 +6630,7 @@ function analyzeSeries(values, options = {}) {
6254
6630
  } else {
6255
6631
  state = "noisy";
6256
6632
  }
6257
- return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
6633
+ return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
6258
6634
  }
6259
6635
 
6260
6636
  // src/slo.ts
@@ -7052,12 +7428,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
7052
7428
  variantScores.push({ mutator: id, score, mutated });
7053
7429
  all.push(score);
7054
7430
  }
7055
- const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
7056
- const variance = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
7431
+ const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
7432
+ const variance = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
7057
7433
  const stdDev = Math.sqrt(variance);
7058
- const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
7434
+ const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
7059
7435
  const robustness = Math.max(0, 1 - stdDev / ref);
7060
- return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
7436
+ return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
7061
7437
  }
7062
7438
  var lowercaseMutator = (p) => p.toLowerCase();
7063
7439
  var sentenceReorderMutator = (p, seed) => {
@@ -7123,8 +7499,8 @@ async function paraphraseRobustnessScenarios(args) {
7123
7499
  });
7124
7500
  scores.push(out.score);
7125
7501
  }
7126
- const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7127
- deltas[m.name] = mean4 - originalScore;
7502
+ const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
7503
+ deltas[m.name] = mean5 - originalScore;
7128
7504
  paraphrasedAll.push(...scores);
7129
7505
  }
7130
7506
  const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
@@ -7737,8 +8113,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7737
8113
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7738
8114
  const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7739
8115
  if (scores.length < 3) continue;
7740
- const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7741
- const variance = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
8116
+ const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
8117
+ const variance = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
7742
8118
  if (variance > varianceThreshold) {
7743
8119
  targets.push({
7744
8120
  reason: "high-variance",
@@ -7969,7 +8345,7 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
7969
8345
 
7970
8346
  // src/command-runner.ts
7971
8347
  import { spawnSync } from "child_process";
7972
- import { existsSync as existsSync4, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
8348
+ import { existsSync as existsSync5, readdirSync as readdirSync2, readFileSync as readFileSync4, statSync as statSync2 } from "fs";
7973
8349
  import { join as join2 } from "path";
7974
8350
  var localCommandRunner = {
7975
8351
  name: "local",
@@ -7998,11 +8374,11 @@ var localCommandRunner = {
7998
8374
  return r.status === 0 && (r.stdout ?? "").trim().length > 0;
7999
8375
  },
8000
8376
  async fileExists(path) {
8001
- return existsSync4(path);
8377
+ return existsSync5(path);
8002
8378
  },
8003
8379
  async readFile(path) {
8004
8380
  try {
8005
- return readFileSync3(path, "utf8");
8381
+ return readFileSync4(path, "utf8");
8006
8382
  } catch {
8007
8383
  return null;
8008
8384
  }
@@ -8240,7 +8616,7 @@ function extractErrorCount(text, opts = {}) {
8240
8616
  for (const p of patterns) {
8241
8617
  const matches = Array.from(text.matchAll(p.regex));
8242
8618
  if (matches.length === 0) continue;
8243
- const count = p.transform ? matches.reduce((sum2, m) => sum2 + p.transform(m), 0) : matches.length;
8619
+ const count = p.transform ? matches.reduce((sum3, m) => sum3 + p.transform(m), 0) : matches.length;
8244
8620
  return {
8245
8621
  count,
8246
8622
  matched: p.name,
@@ -8934,8 +9310,8 @@ function multiToolchainLayer(config) {
8934
9310
  }
8935
9311
 
8936
9312
  // src/reference-replay.ts
8937
- import { appendFileSync as appendFileSync2, existsSync as existsSync5, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
8938
- import { dirname as dirname2 } from "path";
9313
+ import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5 } from "fs";
9314
+ import { dirname as dirname3 } from "path";
8939
9315
  var DEFAULT_MATCH_THRESHOLD = 0.55;
8940
9316
  var ALL_SPLITS = ["train", "dev", "test", "holdout"];
8941
9317
  async function runReferenceReplay(cases, options) {
@@ -9053,14 +9429,14 @@ function jsonlReferenceReplayStore(path) {
9053
9429
  return {
9054
9430
  async save(run) {
9055
9431
  await lock.runExclusive(() => {
9056
- mkdirSync2(dirname2(path), { recursive: true });
9057
- appendFileSync2(path, `${JSON.stringify(run)}
9432
+ mkdirSync3(dirname3(path), { recursive: true });
9433
+ appendFileSync3(path, `${JSON.stringify(run)}
9058
9434
  `);
9059
9435
  });
9060
9436
  },
9061
9437
  async list() {
9062
9438
  return lock.runExclusive(() => {
9063
- if (!existsSync5(path)) return [];
9439
+ if (!existsSync6(path)) return [];
9064
9440
  return readJsonl(path);
9065
9441
  });
9066
9442
  }
@@ -9149,7 +9525,7 @@ function decideReferenceReplayPromotion(baseline, candidate, policy = {}) {
9149
9525
  regressions
9150
9526
  };
9151
9527
  }
9152
- const requiredMeanDelta = mean2(compared.map((item) => item.f1Delta));
9528
+ const requiredMeanDelta = mean3(compared.map((item) => item.f1Delta));
9153
9529
  if (requiredMeanDelta < minF1Delta) {
9154
9530
  return {
9155
9531
  promote: false,
@@ -9284,8 +9660,8 @@ function scorePair(scenario, matcher, reference, candidate) {
9284
9660
  function buildScenarioScore(scenario, matches, falsePositives) {
9285
9661
  const matched = matches.filter((match) => match.matched).length;
9286
9662
  const total = scenario.references.length;
9287
- const matchedWeight = matches.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
9288
- const totalWeight = matches.reduce((sum2, match) => sum2 + match.weight, 0);
9663
+ const matchedWeight = matches.filter((match) => match.matched).reduce((sum3, match) => sum3 + match.weight, 0);
9664
+ const totalWeight = matches.reduce((sum3, match) => sum3 + match.weight, 0);
9289
9665
  const precision2 = ratio(matched, matched + falsePositives);
9290
9666
  const recall = ratio(matched, total);
9291
9667
  return {
@@ -9311,11 +9687,11 @@ function aggregateBySplit(scores) {
9311
9687
  return out;
9312
9688
  }
9313
9689
  function aggregateScenarioScores(scores) {
9314
- const matched = sum(scores.map((score) => score.matched));
9315
- const total = sum(scores.map((score) => score.total));
9316
- const falsePositives = sum(scores.map((score) => score.falsePositives));
9317
- const matchedWeight = sum(scores.map((score) => score.matchedWeight));
9318
- const totalWeight = sum(scores.map((score) => score.totalWeight));
9690
+ const matched = sum2(scores.map((score) => score.matched));
9691
+ const total = sum2(scores.map((score) => score.total));
9692
+ const falsePositives = sum2(scores.map((score) => score.falsePositives));
9693
+ const matchedWeight = sum2(scores.map((score) => score.matchedWeight));
9694
+ const totalWeight = sum2(scores.map((score) => score.totalWeight));
9319
9695
  const precision2 = ratio(matched, matched + falsePositives);
9320
9696
  const recall = ratio(matched, total);
9321
9697
  return {
@@ -9382,11 +9758,11 @@ function clamp012(value) {
9382
9758
  if (!Number.isFinite(value)) return 0;
9383
9759
  return Math.max(0, Math.min(1, value));
9384
9760
  }
9385
- function sum(values) {
9761
+ function sum2(values) {
9386
9762
  return values.reduce((acc, value) => acc + value, 0);
9387
9763
  }
9388
- function mean2(values) {
9389
- return values.length ? sum(values) / values.length : 0;
9764
+ function mean3(values) {
9765
+ return values.length ? sum2(values) / values.length : 0;
9390
9766
  }
9391
9767
  function formatPct(value) {
9392
9768
  return `${(value * 100).toFixed(1)}%`;
@@ -9403,7 +9779,7 @@ function throwIfAborted(signal) {
9403
9779
  throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
9404
9780
  }
9405
9781
  function readJsonl(path) {
9406
- const raw = readFileSync4(path, "utf8");
9782
+ const raw = readFileSync5(path, "utf8");
9407
9783
  const out = [];
9408
9784
  for (const line of raw.split("\n")) {
9409
9785
  const trimmed = line.trim();
@@ -9650,8 +10026,8 @@ function detectCalibrationDrift(runs, opts) {
9650
10026
  alpha,
9651
10027
  recentN: recent.length,
9652
10028
  historyN: historical.length,
9653
- recentMean: mean3(recent),
9654
- historyMean: mean3(historical)
10029
+ recentMean: mean4(recent),
10030
+ historyMean: mean4(historical)
9655
10031
  }
9656
10032
  }
9657
10033
  ];
@@ -9771,7 +10147,7 @@ function chiSquareCritical(df, alpha) {
9771
10147
  }
9772
10148
  return TABLE[10][idx];
9773
10149
  }
9774
- function mean3(xs) {
10150
+ function mean4(xs) {
9775
10151
  if (xs.length === 0) return 0;
9776
10152
  return xs.reduce((s, x) => s + x, 0) / xs.length;
9777
10153
  }
@@ -9971,8 +10347,8 @@ async function discoverPersonas(dir, opts = {}) {
9971
10347
  }
9972
10348
 
9973
10349
  // src/evolution-telemetry.ts
9974
- import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5, writeFileSync } from "fs";
9975
- import { dirname as dirname3 } from "path";
10350
+ import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6, writeFileSync } from "fs";
10351
+ import { dirname as dirname4 } from "path";
9976
10352
  var MutationTelemetry = class {
9977
10353
  appender;
9978
10354
  constructor(path) {
@@ -10001,17 +10377,17 @@ var LineageRecorder = class {
10001
10377
  this.path = path;
10002
10378
  this.snapshotPath = `${path}.snapshot`;
10003
10379
  this.kindOf = kindOf ?? defaultKindOf;
10004
- mkdirSync3(dirname3(path), { recursive: true });
10005
- if (existsSync6(this.snapshotPath)) {
10380
+ mkdirSync4(dirname4(path), { recursive: true });
10381
+ if (existsSync7(this.snapshotPath)) {
10006
10382
  try {
10007
- const parsed = JSON.parse(readFileSync5(this.snapshotPath, "utf-8"));
10383
+ const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
10008
10384
  for (const n of parsed) this.nodes.set(n.id, n);
10009
10385
  } catch {
10010
10386
  }
10011
10387
  }
10012
- if (existsSync6(path)) {
10388
+ if (existsSync7(path)) {
10013
10389
  try {
10014
- for (const line of readFileSync5(path, "utf-8").split("\n")) {
10390
+ for (const line of readFileSync6(path, "utf-8").split("\n")) {
10015
10391
  if (!line.trim()) continue;
10016
10392
  try {
10017
10393
  const entry = JSON.parse(line);
@@ -10023,9 +10399,9 @@ var LineageRecorder = class {
10023
10399
  } catch {
10024
10400
  }
10025
10401
  }
10026
- if (existsSync6(path) && this.nodes.size === 0) {
10402
+ if (existsSync7(path) && this.nodes.size === 0) {
10027
10403
  try {
10028
- const raw = readFileSync5(path, "utf-8").trim();
10404
+ const raw = readFileSync6(path, "utf-8").trim();
10029
10405
  if (raw.startsWith("[")) {
10030
10406
  const parsed = JSON.parse(raw);
10031
10407
  for (const n of parsed) this.nodes.set(n.id, n);
@@ -10039,15 +10415,15 @@ var LineageRecorder = class {
10039
10415
  const prev = this.nodes.get(node.id);
10040
10416
  this.nodes.set(node.id, { ...prev, ...node });
10041
10417
  try {
10042
- if (existsSync6(this.path)) {
10043
- const head = readFileSync5(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
10418
+ if (existsSync7(this.path)) {
10419
+ const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
10044
10420
  if (head === "[") {
10045
10421
  writeFileSync(this.path, "");
10046
10422
  }
10047
10423
  }
10048
10424
  } catch {
10049
10425
  }
10050
- appendFileSync3(this.path, `${JSON.stringify(this.nodes.get(node.id))}
10426
+ appendFileSync4(this.path, `${JSON.stringify(this.nodes.get(node.id))}
10051
10427
  `);
10052
10428
  });
10053
10429
  }
@@ -10106,9 +10482,9 @@ var CostLedger = class {
10106
10482
  mutex = new Mutex();
10107
10483
  constructor(path) {
10108
10484
  this.path = path;
10109
- if (existsSync6(path)) {
10485
+ if (existsSync7(path)) {
10110
10486
  try {
10111
- const loaded = JSON.parse(readFileSync5(path, "utf-8"));
10487
+ const loaded = JSON.parse(readFileSync6(path, "utf-8"));
10112
10488
  for (const k of Object.keys(this.totals)) {
10113
10489
  if (k === "byGeneration") {
10114
10490
  if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
@@ -10125,7 +10501,7 @@ var CostLedger = class {
10125
10501
  } catch {
10126
10502
  }
10127
10503
  } else {
10128
- mkdirSync3(dirname3(path), { recursive: true });
10504
+ mkdirSync4(dirname4(path), { recursive: true });
10129
10505
  }
10130
10506
  }
10131
10507
  genBucket(generation) {
@@ -10277,16 +10653,16 @@ function precision(goldens, candidates, options = {}) {
10277
10653
  }
10278
10654
 
10279
10655
  // src/jsonl-trial-cache.ts
10280
- import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6 } from "fs";
10281
- import { dirname as dirname4 } from "path";
10656
+ import { appendFileSync as appendFileSync5, existsSync as existsSync8, mkdirSync as mkdirSync5, readFileSync as readFileSync7 } from "fs";
10657
+ import { dirname as dirname5 } from "path";
10282
10658
  var JsonlTrialCache = class {
10283
10659
  map = /* @__PURE__ */ new Map();
10284
10660
  path;
10285
10661
  appender;
10286
10662
  constructor(path) {
10287
10663
  this.path = path;
10288
- if (existsSync7(path)) {
10289
- for (const line of readFileSync6(path, "utf-8").split("\n")) {
10664
+ if (existsSync8(path)) {
10665
+ for (const line of readFileSync7(path, "utf-8").split("\n")) {
10290
10666
  if (!line.trim()) continue;
10291
10667
  try {
10292
10668
  const entry = JSON.parse(line);
@@ -10295,7 +10671,7 @@ var JsonlTrialCache = class {
10295
10671
  }
10296
10672
  }
10297
10673
  } else {
10298
- mkdirSync4(dirname4(path), { recursive: true });
10674
+ mkdirSync5(dirname5(path), { recursive: true });
10299
10675
  }
10300
10676
  this.appender = new LockedJsonlAppender(path);
10301
10677
  }
@@ -10318,7 +10694,7 @@ var JsonlTrialCache = class {
10318
10694
  setSync(key, value) {
10319
10695
  this.map.set(key, value);
10320
10696
  const line = { key, result: value, writtenAt: Date.now() };
10321
- appendFileSync4(this.path, `${JSON.stringify(line)}
10697
+ appendFileSync5(this.path, `${JSON.stringify(line)}
10322
10698
  `);
10323
10699
  }
10324
10700
  };
@@ -10401,9 +10777,9 @@ function passOrthogonality(input) {
10401
10777
  sims.push(cosineSimilarity(vectors[i], vectors[j]));
10402
10778
  }
10403
10779
  }
10404
- const mean4 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
10780
+ const mean5 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
10405
10781
  return {
10406
- orthogonality: Math.max(0, Math.min(1, 1 - mean4)),
10782
+ orthogonality: Math.max(0, Math.min(1, 1 - mean5)),
10407
10783
  passCount: passes.length,
10408
10784
  similarities: sims
10409
10785
  };
@@ -10678,6 +11054,7 @@ export {
10678
11054
  DEFAULT_HARNESS_OBJECTIVES,
10679
11055
  DEFAULT_MUTATION_PRIMITIVES,
10680
11056
  DEFAULT_MUTATORS,
11057
+ DEFAULT_PR_REVIEW_SCORE_WEIGHTS,
10681
11058
  DEFAULT_REDACTION_RULES,
10682
11059
  DEFAULT_RED_TEAM_CORPUS,
10683
11060
  DEFAULT_RUN_SCORE_WEIGHTS,
@@ -10766,13 +11143,16 @@ export {
10766
11143
  adversarialJudge,
10767
11144
  agentProfileCellHashMaterial,
10768
11145
  agentProfileCellKey,
11146
+ agentProfileHash,
10769
11147
  aggregateLlm,
11148
+ aggregatePrReviewScore,
10770
11149
  aggregateRunScore,
10771
11150
  aggregateTrialsByMode,
10772
11151
  allCriticalPassed,
10773
11152
  analyzeAntiSlop,
10774
11153
  analyzeSeries,
10775
11154
  analyzeTraces,
11155
+ appendScorecard,
10776
11156
  argHash,
10777
11157
  assertLlmRoute,
10778
11158
  assertRealBackend,
@@ -10816,6 +11196,7 @@ export {
10816
11196
  cohensD,
10817
11197
  coherenceJudge,
10818
11198
  collectionPreserved,
11199
+ commentsForSource,
10819
11200
  commitBisect,
10820
11201
  compareReferenceReplay,
10821
11202
  compareToBaseline,
@@ -10866,6 +11247,7 @@ export {
10866
11247
  deployGateLayer,
10867
11248
  describeTraceInsightScope,
10868
11249
  diffFindings,
11250
+ diffScorecard,
10869
11251
  discoverPersonas,
10870
11252
  distillPlaybook,
10871
11253
  domainEvidencePattern,
@@ -10901,6 +11283,7 @@ export {
10901
11283
  formatBenchmarkReport,
10902
11284
  formatDriverReport,
10903
11285
  formatFindings,
11286
+ formatScorecardDiff,
10904
11287
  gainHistogram,
10905
11288
  ghCliClient,
10906
11289
  precision as goldenPrecision,
@@ -10943,6 +11326,7 @@ export {
10943
11326
  linterJudge,
10944
11327
  llmSpanFromProvider,
10945
11328
  llmSpans,
11329
+ loadScorecard,
10946
11330
  loadScorerFromGrader,
10947
11331
  localCommandRunner,
10948
11332
  lowercaseMutator,
@@ -10984,6 +11368,8 @@ export {
10984
11368
  proposeSynthesisTargets,
10985
11369
  providerFromBaseUrl,
10986
11370
  pytestTestParser,
11371
+ recordRuns,
11372
+ recordRunsToScorecard,
10987
11373
  redTeamDataset,
10988
11374
  redTeamReport,
10989
11375
  redactString,
@@ -11042,6 +11428,8 @@ export {
11042
11428
  scoreContinuity,
11043
11429
  scoreFromEvals,
11044
11430
  scoreKnowledgeReadiness,
11431
+ scorePrReviewComments,
11432
+ scorePrReviewSource,
11045
11433
  scoreRedTeamOutput,
11046
11434
  scoreReferenceReplay,
11047
11435
  scoreTraceInsightReadiness,
@@ -11060,6 +11448,7 @@ export {
11060
11448
  summarize,
11061
11449
  summarizeBackendIntegrity,
11062
11450
  summarizeHarnessResults,
11451
+ summarizePrReviewBenchmark,
11063
11452
  summarizePreferenceMemory,
11064
11453
  summaryTable,
11065
11454
  testJudge,