@tangle-network/agent-eval 0.33.0 → 0.34.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/CHANGELOG.md +33 -0
  2. package/dist/benchmarks/index.d.ts +2 -2
  3. package/dist/chunk-DCZXFOQN.js +489 -0
  4. package/dist/chunk-DCZXFOQN.js.map +1 -0
  5. package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
  6. package/dist/chunk-FT3IAMQR.js.map +1 -0
  7. package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
  8. package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
  9. package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
  10. package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
  11. package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
  12. package/dist/chunk-SQYRO3BT.js.map +1 -0
  13. package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
  14. package/dist/chunk-TQL7BAOY.js.map +1 -0
  15. package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
  16. package/dist/chunk-VXNVVBZO.js.map +1 -0
  17. package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
  18. package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
  19. package/dist/cli.js +2 -2
  20. package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
  21. package/dist/control.d.ts +2 -2
  22. package/dist/control.js +3 -2
  23. package/dist/governance/index.d.ts +2 -1
  24. package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
  25. package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
  26. package/dist/index.d.ts +278 -486
  27. package/dist/index.js +522 -134
  28. package/dist/index.js.map +1 -1
  29. package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
  30. package/dist/meta-eval/index.d.ts +2 -2
  31. package/dist/openapi.json +1 -1
  32. package/dist/optimization.d.ts +3 -3
  33. package/dist/optimization.js +6 -6
  34. package/dist/pipelines/index.js +2 -2
  35. package/dist/release-report-ChfmCmLi.d.ts +713 -0
  36. package/dist/reporting.d.ts +6 -4
  37. package/dist/reporting.js +10 -9
  38. package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
  39. package/dist/rl.d.ts +5 -5
  40. package/dist/rl.js +6 -6
  41. package/dist/rl.js.map +1 -1
  42. package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
  43. package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
  44. package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
  45. package/dist/wire/index.js +2 -2
  46. package/docs/product-eval-adoption.md +18 -0
  47. package/package.json +12 -22
  48. package/dist/chunk-B73G44OH.js.map +0 -1
  49. package/dist/chunk-CXJOVDJR.js.map +0 -1
  50. package/dist/chunk-DTEJNZYK.js.map +0 -1
  51. package/dist/chunk-M6RZ5LJN.js.map +0 -1
  52. package/dist/chunk-ZN2CMQIW.js +0 -208
  53. package/dist/chunk-ZN2CMQIW.js.map +0 -1
  54. package/dist/release-report-DLWbBPtH.d.ts +0 -292
  55. /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
  56. /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
  57. /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
  58. /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
package/dist/index.js CHANGED
@@ -11,7 +11,7 @@ import {
11
11
  failureClusterView,
12
12
  iqr,
13
13
  welchsTTest
14
- } from "./chunk-GVQT44CS.js";
14
+ } from "./chunk-KE7TDJUO.js";
15
15
  import {
16
16
  exportTrainingData,
17
17
  toNdjson
@@ -54,7 +54,7 @@ import {
54
54
  runProposeReview,
55
55
  runProposeReviewAsControlLoop,
56
56
  scoreFromEvals
57
- } from "./chunk-S4Y5VXMS.js";
57
+ } from "./chunk-WRGHMGWT.js";
58
58
  import {
59
59
  allCriticalPassed,
60
60
  objectiveEval,
@@ -96,14 +96,7 @@ import {
96
96
  summarizePreferenceMemory,
97
97
  trialTraceFromMultiShotTrial,
98
98
  withAssignedFeedbackSplit
99
- } from "./chunk-B73G44OH.js";
100
- import {
101
- RunRecordValidationError,
102
- isRunRecord,
103
- parseRunRecordSafe,
104
- roundTripRunRecord,
105
- validateRunRecord
106
- } from "./chunk-ZN2CMQIW.js";
99
+ } from "./chunk-FT3IAMQR.js";
107
100
  import {
108
101
  assertReleaseConfidence,
109
102
  bootstrapCi,
@@ -111,38 +104,52 @@ import {
111
104
  judgeReplayGate,
112
105
  releaseTraceEvidenceFromMultiShotTrials,
113
106
  renderReleaseReport
114
- } from "./chunk-WGXZAQLR.js";
107
+ } from "./chunk-LGAPK7NA.js";
115
108
  import {
116
109
  runEvalCampaign
117
- } from "./chunk-DTEJNZYK.js";
110
+ } from "./chunk-SQYRO3BT.js";
118
111
  import {
119
112
  LlmCallError,
120
113
  LlmClient,
121
114
  LlmRouteAssertionError,
122
115
  assertLlmRoute,
116
+ backoffMs,
123
117
  callLlm,
124
118
  callLlmJson,
119
+ isTransientLlmError,
125
120
  probeLlm,
126
121
  stripFencedJson
127
- } from "./chunk-M6RZ5LJN.js";
122
+ } from "./chunk-VXNVVBZO.js";
123
+ import {
124
+ AgentProfileCellValidationError,
125
+ RunRecordValidationError,
126
+ agentProfileCellHashMaterial,
127
+ agentProfileCellKey,
128
+ assertRunAgentProfileCell,
129
+ buildAgentProfileCell,
130
+ groupRunsByAgentProfileCell,
131
+ isRunRecord,
132
+ parseRunRecordSafe,
133
+ requireAgentProfileCell,
134
+ roundTripRunRecord,
135
+ validateAgentProfileCell,
136
+ validateRunRecord,
137
+ verifyAgentProfileCell
138
+ } from "./chunk-DCZXFOQN.js";
128
139
  import {
129
140
  evaluateInterimReleaseConfidence,
130
141
  pairedEvalueSequence
131
142
  } from "./chunk-MAZ26DC7.js";
132
143
  import {
133
144
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
134
- benjaminiHochberg,
135
- bhAdjust,
136
- bonferroni,
137
145
  gainHistogram,
138
- pairedBootstrap,
139
- pairedWilcoxon,
140
146
  paretoChart,
141
- requiredSampleSize,
142
147
  researchReport,
143
148
  summaryTable
144
- } from "./chunk-CXJOVDJR.js";
149
+ } from "./chunk-TQL7BAOY.js";
145
150
  import {
151
+ benjaminiHochberg,
152
+ bonferroni,
146
153
  calibrateJudge,
147
154
  calibrateJudgeContinuous,
148
155
  cohensD,
@@ -153,14 +160,17 @@ import {
153
160
  interRaterReliability,
154
161
  mannWhitneyU,
155
162
  normalizeScores,
163
+ pairedBootstrap,
164
+ pairedMde,
156
165
  pairedTTest,
157
166
  partialCredit,
158
167
  positionalBias,
168
+ requiredSampleSize,
159
169
  selfPreference,
160
170
  verbosityBias,
161
171
  weightedMean,
162
172
  wilcoxonSignedRank
163
- } from "./chunk-4L3WJXQJ.js";
173
+ } from "./chunk-KHZRNY3F.js";
164
174
  import {
165
175
  DEFAULT_REDACTION_RULES,
166
176
  DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -323,7 +333,7 @@ var RunCritic = class {
323
333
  );
324
334
  const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
325
335
  if (!success) notes.push("run did not complete with pass=true");
326
- const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
336
+ const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum3, span) => sum3 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
327
337
  const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
328
338
  trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
329
339
  ) : void 0;
@@ -338,7 +348,7 @@ var RunCritic = class {
338
348
  (span) => typeof span.testsTotal === "number" && span.testsTotal > 0
339
349
  );
340
350
  const testReality = sandboxTests.length ? sandboxTests.reduce(
341
- (sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
351
+ (sum3, span) => sum3 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
342
352
  0
343
353
  ) / sandboxTests.length : toolSpans2.some(
344
354
  (span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
@@ -360,7 +370,7 @@ var RunCritic = class {
360
370
  const costUsd = trace.budget.length ? Math.max(
361
371
  ...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
362
372
  0
363
- ) : llmSpans2.reduce((sum2, span) => sum2 + (span.costUsd ?? 0), 0);
373
+ ) : llmSpans2.reduce((sum3, span) => sum3 + (span.costUsd ?? 0), 0);
364
374
  const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
365
375
  return {
366
376
  success,
@@ -1950,12 +1960,12 @@ function allocateBudget(policy, args) {
1950
1960
  return policy.totalUsd / Math.max(1, args.runningCount);
1951
1961
  }
1952
1962
  function sumFindingCost(findings) {
1953
- let sum2 = 0;
1963
+ let sum3 = 0;
1954
1964
  for (const f of findings) {
1955
1965
  const c = f.metadata?.cost_usd;
1956
- if (typeof c === "number" && Number.isFinite(c)) sum2 += c;
1966
+ if (typeof c === "number" && Number.isFinite(c)) sum3 += c;
1957
1967
  }
1958
- return sum2;
1968
+ return sum3;
1959
1969
  }
1960
1970
  function selectPriorFindings(source, analystId) {
1961
1971
  if (!source) return void 0;
@@ -2174,10 +2184,10 @@ function ghCliClient(opts = {}) {
2174
2184
  await exec("git", ["branch", "-D", input.branchName], { cwd });
2175
2185
  await run("git", ["checkout", "-b", input.branchName]);
2176
2186
  const { mkdir, writeFile } = await import("fs/promises");
2177
- const { dirname: dirname5, join: join4, resolve } = await import("path");
2187
+ const { dirname: dirname6, join: join4, resolve } = await import("path");
2178
2188
  for (const change of input.fileChanges) {
2179
2189
  const abs = resolve(cwd, change.path);
2180
- await mkdir(dirname5(abs), { recursive: true });
2190
+ await mkdir(dirname6(abs), { recursive: true });
2181
2191
  await writeFile(abs, change.contents, "utf8");
2182
2192
  await run("git", ["add", join4(change.path)]);
2183
2193
  }
@@ -3712,6 +3722,178 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
3712
3722
  };
3713
3723
  }
3714
3724
 
3725
+ // src/pr-review-benchmark.ts
3726
+ var DEFAULT_PR_REVIEW_SCORE_WEIGHTS = {
3727
+ recall: 4,
3728
+ precision: 2,
3729
+ actionability: 1.5,
3730
+ severityCalibration: 1,
3731
+ lowNoise: 1
3732
+ };
3733
+ function commentsForSource(auditCase, source) {
3734
+ return auditCase.comments.filter((comment) => comment.source === source);
3735
+ }
3736
+ function scorePrReviewSource(auditCase, source, weights = {}) {
3737
+ return scorePrReviewComments(auditCase, commentsForSource(auditCase, source), source, weights);
3738
+ }
3739
+ function scorePrReviewComments(auditCase, comments, source, weights = {}) {
3740
+ const matchedFindings = matchReferenceFindings(auditCase.referenceFindings, comments);
3741
+ const matchedCommentIds = new Set(matchedFindings.map((match) => match.commentId));
3742
+ const positiveComments = comments.filter((comment) => isPositiveOutcome(comment.outcome));
3743
+ const negativeComments = comments.filter((comment) => isNegativeOutcome(comment.outcome));
3744
+ const actionableComments = comments.filter(isActionableComment);
3745
+ const severityComments = comments.filter((comment) => comment.severity);
3746
+ const severityAligned = severityComments.filter(
3747
+ (comment) => isSeverityAligned(comment, auditCase.referenceFindings, matchedFindings)
3748
+ );
3749
+ const recall = auditCase.referenceFindings.length ? matchedFindings.length / auditCase.referenceFindings.length : comments.length === 0 ? 1 : 0;
3750
+ const precisionDenominator = positiveComments.length + negativeComments.length;
3751
+ const precision2 = precisionDenominator > 0 ? positiveComments.length / precisionDenominator : comments.length > 0 ? matchedCommentIds.size / comments.length : auditCase.referenceFindings.length === 0 ? 1 : 0;
3752
+ const actionability = comments.length ? actionableComments.length / comments.length : 1;
3753
+ const severityCalibration = severityComments.length ? severityAligned.length / severityComments.length : matchedFindings.length ? 0.5 : 1;
3754
+ const lowNoise = comments.length ? 1 - negativeComments.length / comments.length : 1;
3755
+ const aggregate2 = aggregatePrReviewScore(
3756
+ { recall, precision: precision2, actionability, severityCalibration, lowNoise },
3757
+ weights
3758
+ );
3759
+ return {
3760
+ caseId: auditCase.id,
3761
+ source,
3762
+ commentCount: comments.length,
3763
+ referenceCount: auditCase.referenceFindings.length,
3764
+ matchedFindings,
3765
+ recall,
3766
+ precision: precision2,
3767
+ actionability,
3768
+ severityCalibration,
3769
+ lowNoise,
3770
+ aggregate: aggregate2,
3771
+ notes: buildScoreNotes({
3772
+ comments,
3773
+ referenceCount: auditCase.referenceFindings.length,
3774
+ matchedFindings,
3775
+ negativeComments,
3776
+ actionableComments
3777
+ })
3778
+ };
3779
+ }
3780
+ function summarizePrReviewBenchmark(scores) {
3781
+ const bySource = /* @__PURE__ */ new Map();
3782
+ for (const score of scores) {
3783
+ bySource.set(score.source, [...bySource.get(score.source) ?? [], score]);
3784
+ }
3785
+ return [...bySource.entries()].map(([source, sourceScores]) => ({
3786
+ source,
3787
+ caseCount: sourceScores.length,
3788
+ commentCount: sum(sourceScores.map((score) => score.commentCount)),
3789
+ aggregateMean: mean(sourceScores.map((score) => score.aggregate)),
3790
+ recallMean: mean(sourceScores.map((score) => score.recall)),
3791
+ precisionMean: mean(sourceScores.map((score) => score.precision)),
3792
+ actionabilityMean: mean(sourceScores.map((score) => score.actionability)),
3793
+ severityCalibrationMean: mean(sourceScores.map((score) => score.severityCalibration)),
3794
+ lowNoiseMean: mean(sourceScores.map((score) => score.lowNoise))
3795
+ })).sort((a, b) => b.aggregateMean - a.aggregateMean);
3796
+ }
3797
+ function aggregatePrReviewScore(dimensions, weights = {}) {
3798
+ const merged = { ...DEFAULT_PR_REVIEW_SCORE_WEIGHTS, ...weights };
3799
+ const weightSum = Object.values(merged).reduce((total, value) => total + Math.max(0, value), 0);
3800
+ if (weightSum <= 0) return 0;
3801
+ return (merged.recall * clamp01(dimensions.recall) + merged.precision * clamp01(dimensions.precision) + merged.actionability * clamp01(dimensions.actionability) + merged.severityCalibration * clamp01(dimensions.severityCalibration) + merged.lowNoise * clamp01(dimensions.lowNoise)) / weightSum;
3802
+ }
3803
+ function matchReferenceFindings(references, comments) {
3804
+ const matches = [];
3805
+ const usedCommentIds = /* @__PURE__ */ new Set();
3806
+ for (const reference of references) {
3807
+ const candidates = comments.filter((comment) => !usedCommentIds.has(comment.id)).map((comment) => ({ comment, score: matchScore(reference, comment) })).filter(({ score }) => score >= 0.55).sort((a, b) => b.score - a.score);
3808
+ const best = candidates[0];
3809
+ if (!best) continue;
3810
+ usedCommentIds.add(best.comment.id);
3811
+ matches.push({ referenceId: reference.id, commentId: best.comment.id, score: best.score });
3812
+ }
3813
+ return matches;
3814
+ }
3815
+ function matchScore(reference, comment) {
3816
+ let score = 0;
3817
+ if (reference.sourceCommentIds?.includes(comment.id)) score += 1;
3818
+ if (reference.path && comment.path && normalizePath(reference.path) === normalizePath(comment.path)) {
3819
+ score += 0.35;
3820
+ }
3821
+ if (reference.line && comment.line && Math.abs(reference.line - comment.line) <= 3) score += 0.15;
3822
+ const terms = [...reference.keywords ?? [], ...tokenize(reference.title)];
3823
+ const uniqueTerms = [...new Set(terms.map(normalizeTerm).filter((term) => term.length >= 3))];
3824
+ if (uniqueTerms.length > 0) {
3825
+ const bodyTerms = new Set(tokenize(comment.body).map(normalizeTerm));
3826
+ const overlap = uniqueTerms.filter((term) => bodyTerms.has(term)).length;
3827
+ score += 0.5 * (overlap / uniqueTerms.length);
3828
+ }
3829
+ return clamp01(score);
3830
+ }
3831
+ function isActionableComment(comment) {
3832
+ const body = comment.body.trim();
3833
+ if (!comment.path && !/\b(file|line|function|method|class|module|test|migration)\b/i.test(body)) {
3834
+ return false;
3835
+ }
3836
+ return /\b(fix|change|add|remove|guard|check|reject|validate|test|assert|return|throw|fail|block)\b/i.test(
3837
+ body
3838
+ );
3839
+ }
3840
+ function isSeverityAligned(comment, references, matches) {
3841
+ if (!comment.severity) return false;
3842
+ const match = matches.find((candidate) => candidate.commentId === comment.id);
3843
+ if (!match) return comment.severity === "nit" || comment.severity === "low";
3844
+ const reference = references.find((candidate) => candidate.id === match.referenceId);
3845
+ if (!reference) return false;
3846
+ return Math.abs(severityRank(comment.severity) - severityRank(reference.severity)) <= 1;
3847
+ }
3848
+ function buildScoreNotes(input) {
3849
+ const notes = [];
3850
+ if (input.referenceCount > 0 && input.matchedFindings.length === 0) {
3851
+ notes.push("no reference findings matched");
3852
+ }
3853
+ if (input.negativeComments.length > 0) {
3854
+ notes.push(`${input.negativeComments.length} comment(s) labelled rejected/duplicate/noise`);
3855
+ }
3856
+ if (input.comments.length > 0 && input.actionableComments.length === 0) {
3857
+ notes.push("comments were not actionable enough for a PR reviewer benchmark");
3858
+ }
3859
+ return notes;
3860
+ }
3861
+ function isPositiveOutcome(outcome) {
3862
+ return outcome === "accepted" || outcome === "fixed";
3863
+ }
3864
+ function isNegativeOutcome(outcome) {
3865
+ return outcome === "rejected" || outcome === "duplicate" || outcome === "noise";
3866
+ }
3867
+ function severityRank(severity) {
3868
+ switch (severity) {
3869
+ case "critical":
3870
+ return 5;
3871
+ case "high":
3872
+ return 4;
3873
+ case "medium":
3874
+ return 3;
3875
+ case "low":
3876
+ return 2;
3877
+ case "nit":
3878
+ return 1;
3879
+ }
3880
+ }
3881
+ function tokenize(input) {
3882
+ return input.match(/[a-zA-Z0-9_.$/-]+/g) ?? [];
3883
+ }
3884
+ function normalizeTerm(input) {
3885
+ return input.toLowerCase().replace(/^[^a-z0-9_]+|[^a-z0-9_]+$/g, "");
3886
+ }
3887
+ function normalizePath(input) {
3888
+ return input.replace(/^\.\/+/, "");
3889
+ }
3890
+ function mean(values) {
3891
+ return values.length ? sum(values) / values.length : 0;
3892
+ }
3893
+ function sum(values) {
3894
+ return values.reduce((total, value) => total + value, 0);
3895
+ }
3896
+
3715
3897
  // src/production-loop.ts
3716
3898
  async function runProductionLoop(opts) {
3717
3899
  validate2(opts);
@@ -5207,14 +5389,14 @@ async function runHarnessExperiment(config) {
5207
5389
  const score = config.score ?? ((trace) => critic.scoreTrace(trace));
5208
5390
  const results = await mapLimit(jobs, config.parallelism ?? 1, async (request) => {
5209
5391
  const trace = await config.adapter.run(request);
5210
- const runScore = await score(trace, request);
5392
+ const runScore2 = await score(trace, request);
5211
5393
  const result = {
5212
5394
  variant: request.variant,
5213
5395
  scenario: request.scenario,
5214
5396
  trialIndex: request.trialIndex,
5215
5397
  trace,
5216
- score: runScore,
5217
- aggregate: aggregateRunScore(runScore, config.weights)
5398
+ score: runScore2,
5399
+ aggregate: aggregateRunScore(runScore2, config.weights)
5218
5400
  };
5219
5401
  await config.onResult?.(result);
5220
5402
  return result;
@@ -5241,10 +5423,10 @@ function summarizeHarnessResults(results) {
5241
5423
  return {
5242
5424
  variant,
5243
5425
  runs,
5244
- aggregateMean: mean(runs.map((r) => r.aggregate)),
5245
- passRate: mean(runs.map((r) => r.score.success)),
5246
- costUsdMean: mean(runs.map((r) => r.score.costUsd)),
5247
- wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)),
5426
+ aggregateMean: mean2(runs.map((r) => r.aggregate)),
5427
+ passRate: mean2(runs.map((r) => r.score.success)),
5428
+ costUsdMean: mean2(runs.map((r) => r.score.costUsd)),
5429
+ wallSecondsMean: mean2(runs.map((r) => r.score.wallSeconds)),
5248
5430
  scoreMean: meanRunScore(runs.map((r) => r.score))
5249
5431
  };
5250
5432
  }).sort((a, b) => b.aggregateMean - a.aggregateMean);
@@ -5281,22 +5463,22 @@ async function mapLimit(items, limit, fn) {
5281
5463
  );
5282
5464
  return results;
5283
5465
  }
5284
- function mean(values) {
5285
- return values.length ? values.reduce((sum2, value) => sum2 + value, 0) / values.length : 0;
5466
+ function mean2(values) {
5467
+ return values.length ? values.reduce((sum3, value) => sum3 + value, 0) / values.length : 0;
5286
5468
  }
5287
5469
  function meanRunScore(scores) {
5288
5470
  return {
5289
- success: mean(scores.map((s) => s.success)),
5290
- goalProgress: mean(scores.map((s) => s.goalProgress)),
5291
- repoGroundedness: mean(scores.map((s) => s.repoGroundedness)),
5292
- driftPenalty: mean(scores.map((s) => s.driftPenalty)),
5293
- toolUseQuality: mean(scores.map((s) => s.toolUseQuality)),
5294
- patchQuality: mean(scores.map((s) => s.patchQuality)),
5295
- testReality: mean(scores.map((s) => s.testReality)),
5296
- finalGate: mean(scores.map((s) => s.finalGate)),
5297
- reviewerBlockers: mean(scores.map((s) => s.reviewerBlockers)),
5298
- costUsd: mean(scores.map((s) => s.costUsd)),
5299
- wallSeconds: mean(scores.map((s) => s.wallSeconds)),
5471
+ success: mean2(scores.map((s) => s.success)),
5472
+ goalProgress: mean2(scores.map((s) => s.goalProgress)),
5473
+ repoGroundedness: mean2(scores.map((s) => s.repoGroundedness)),
5474
+ driftPenalty: mean2(scores.map((s) => s.driftPenalty)),
5475
+ toolUseQuality: mean2(scores.map((s) => s.toolUseQuality)),
5476
+ patchQuality: mean2(scores.map((s) => s.patchQuality)),
5477
+ testReality: mean2(scores.map((s) => s.testReality)),
5478
+ finalGate: mean2(scores.map((s) => s.finalGate)),
5479
+ reviewerBlockers: mean2(scores.map((s) => s.reviewerBlockers)),
5480
+ costUsd: mean2(scores.map((s) => s.costUsd)),
5481
+ wallSeconds: mean2(scores.map((s) => s.wallSeconds)),
5300
5482
  notes: scores.flatMap((s) => s.notes ?? [])
5301
5483
  };
5302
5484
  }
@@ -5635,7 +5817,7 @@ function rankRows(rows, weights) {
5635
5817
  }
5636
5818
  return [...buckets.entries()].map(([variantId, values]) => ({
5637
5819
  variantId,
5638
- mean: values.reduce((sum2, value) => sum2 + value, 0) / values.length,
5820
+ mean: values.reduce((sum3, value) => sum3 + value, 0) / values.length,
5639
5821
  runs: values.length
5640
5822
  })).sort((a, b) => b.mean - a.mean);
5641
5823
  }
@@ -5805,6 +5987,22 @@ var BudgetGuard = class {
5805
5987
  }
5806
5988
  };
5807
5989
 
5990
+ // src/agent-profile.ts
5991
+ import { createHash as createHash2 } from "crypto";
5992
+ function agentProfileHash(profile) {
5993
+ if (typeof profile.model !== "string" || profile.model.trim().length === 0) {
5994
+ throw new ValidationError(`AgentProfile "${profile.id}" has no model \u2014 cannot hash`);
5995
+ }
5996
+ const behaviour = {
5997
+ model: profile.model.trim(),
5998
+ skills: [...profile.skills ?? []].sort(),
5999
+ promptVersion: profile.promptVersion ?? null,
6000
+ tools: [...profile.tools ?? []].sort(),
6001
+ metadata: profile.metadata ?? {}
6002
+ };
6003
+ return createHash2("sha256").update(JSON.stringify(canonicalize(behaviour))).digest("hex");
6004
+ }
6005
+
5808
6006
  // src/cost-tracker.ts
5809
6007
  var CostTracker = class {
5810
6008
  byScenario = /* @__PURE__ */ new Map();
@@ -6211,6 +6409,194 @@ function isObject(v) {
6211
6409
  return typeof v === "object" && v !== null && !Array.isArray(v);
6212
6410
  }
6213
6411
 
6412
+ // src/scorecard.ts
6413
+ import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync3 } from "fs";
6414
+ import { dirname as dirname2 } from "path";
6415
+ function median(xs) {
6416
+ if (xs.length === 0) return 0;
6417
+ const sorted = [...xs].sort((a, b) => a - b);
6418
+ const mid = Math.floor(sorted.length / 2);
6419
+ return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
6420
+ }
6421
+ function runScore(run) {
6422
+ return run.outcome.holdoutScore ?? run.outcome.searchScore;
6423
+ }
6424
+ function aggregatePerDimension(runs) {
6425
+ const sums = /* @__PURE__ */ new Map();
6426
+ for (const run of runs) {
6427
+ const dims = run.outcome.judgeScores?.perDimMean;
6428
+ if (!dims) continue;
6429
+ for (const [dim, value] of Object.entries(dims)) {
6430
+ if (!Number.isFinite(value)) continue;
6431
+ const acc = sums.get(dim) ?? { total: 0, count: 0 };
6432
+ acc.total += value;
6433
+ acc.count += 1;
6434
+ sums.set(dim, acc);
6435
+ }
6436
+ }
6437
+ if (sums.size === 0) return void 0;
6438
+ const out = {};
6439
+ for (const [dim, acc] of sums) out[dim] = acc.total / acc.count;
6440
+ return out;
6441
+ }
6442
+ function recordRuns(runs, opts) {
6443
+ const profileHash = agentProfileHash(opts.profile);
6444
+ const timestamp = opts.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
6445
+ const byScenario = /* @__PURE__ */ new Map();
6446
+ for (const run of runs) {
6447
+ const scenarioId = run.scenarioId;
6448
+ if (!scenarioId) continue;
6449
+ const bucket = byScenario.get(scenarioId);
6450
+ if (bucket) bucket.push(run);
6451
+ else byScenario.set(scenarioId, [run]);
6452
+ }
6453
+ const lines = [];
6454
+ for (const [scenarioId, scenarioRuns] of byScenario) {
6455
+ const scored = scenarioRuns.map((run) => ({ run, score: runScore(run) })).filter((s) => s.score !== void 0);
6456
+ if (scored.length === 0) continue;
6457
+ const scores = scored.map((s) => s.score);
6458
+ const entry = {
6459
+ commitSha: opts.commitSha,
6460
+ timestamp,
6461
+ scores,
6462
+ composite: median(scores),
6463
+ runIds: scored.map((s) => s.run.runId)
6464
+ };
6465
+ const perDimension = aggregatePerDimension(scenarioRuns);
6466
+ if (perDimension) entry.perDimension = perDimension;
6467
+ lines.push({
6468
+ scenarioId,
6469
+ profileHash,
6470
+ model: opts.profile.model,
6471
+ profile: opts.profile,
6472
+ entry
6473
+ });
6474
+ }
6475
+ return lines;
6476
+ }
6477
+ function appendScorecard(logPath, lines) {
6478
+ if (lines.length === 0) return;
6479
+ mkdirSync2(dirname2(logPath), { recursive: true });
6480
+ appendFileSync2(logPath, `${lines.map((line) => JSON.stringify(line)).join("\n")}
6481
+ `);
6482
+ }
6483
+ function recordRunsToScorecard(logPath, runs, opts) {
6484
+ const lines = recordRuns(runs, opts);
6485
+ appendScorecard(logPath, lines);
6486
+ return lines;
6487
+ }
6488
+ function loadScorecard(logPath) {
6489
+ if (!existsSync4(logPath)) return { cells: [], profiles: {} };
6490
+ const cells = /* @__PURE__ */ new Map();
6491
+ const profiles = {};
6492
+ for (const raw of readFileSync3(logPath, "utf8").split("\n")) {
6493
+ const line = raw.trim();
6494
+ if (!line) continue;
6495
+ let parsed;
6496
+ try {
6497
+ parsed = JSON.parse(line);
6498
+ } catch {
6499
+ continue;
6500
+ }
6501
+ if (!parsed?.scenarioId || !parsed.profileHash || !parsed.entry) continue;
6502
+ const key = `${parsed.scenarioId}::${parsed.profileHash}`;
6503
+ let cell = cells.get(key);
6504
+ if (!cell) {
6505
+ cell = {
6506
+ scenarioId: parsed.scenarioId,
6507
+ profileHash: parsed.profileHash,
6508
+ model: parsed.model,
6509
+ timeline: []
6510
+ };
6511
+ cells.set(key, cell);
6512
+ }
6513
+ cell.timeline.push(parsed.entry);
6514
+ if (parsed.profile) profiles[parsed.profileHash] = parsed.profile;
6515
+ }
6516
+ for (const cell of cells.values()) {
6517
+ cell.timeline.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
6518
+ }
6519
+ return { cells: [...cells.values()], profiles };
6520
+ }
6521
+ function diffScorecard(scorecard, opts = {}) {
6522
+ const minEffect = opts.minEffect ?? 0.5;
6523
+ const maxP = opts.maxP ?? 0.05;
6524
+ const minDelta = opts.minDelta ?? 0.05;
6525
+ const cells = [];
6526
+ for (const cell of scorecard.cells) {
6527
+ const timeline = cell.timeline;
6528
+ if (timeline.length === 0) continue;
6529
+ const current = timeline[timeline.length - 1];
6530
+ const baseline = opts.baselineCommit ? [...timeline].reverse().find((e) => e.commitSha === opts.baselineCommit && e !== current) : timeline[timeline.length - 2];
6531
+ const base = {
6532
+ scenarioId: cell.scenarioId,
6533
+ profileHash: cell.profileHash,
6534
+ model: cell.model,
6535
+ current: current.composite,
6536
+ currentCommit: current.commitSha
6537
+ };
6538
+ if (!baseline) {
6539
+ cells.push({
6540
+ ...base,
6541
+ verdict: "new",
6542
+ baseline: null,
6543
+ delta: null,
6544
+ cohensD: null,
6545
+ pValue: null,
6546
+ baselineCommit: null
6547
+ });
6548
+ continue;
6549
+ }
6550
+ const delta = current.composite - baseline.composite;
6551
+ const canStat = baseline.scores.length >= 2 && current.scores.length >= 2;
6552
+ let d = null;
6553
+ let p = null;
6554
+ let verdict;
6555
+ if (canStat) {
6556
+ d = cohensD(baseline.scores, current.scores);
6557
+ const t = welchsTTest(baseline.scores, current.scores);
6558
+ p = Number.isFinite(t.p) ? t.p : null;
6559
+ const significant = Math.abs(d) >= minEffect && p !== null && p <= maxP;
6560
+ verdict = significant ? delta > 0 ? "improved" : "regressed" : "flat";
6561
+ } else {
6562
+ verdict = Math.abs(delta) >= minDelta ? delta > 0 ? "improved" : "regressed" : "flat";
6563
+ }
6564
+ cells.push({
6565
+ ...base,
6566
+ verdict,
6567
+ baseline: baseline.composite,
6568
+ delta,
6569
+ cohensD: d,
6570
+ pValue: p,
6571
+ baselineCommit: baseline.commitSha
6572
+ });
6573
+ }
6574
+ const summary = { improved: 0, regressed: 0, flat: 0, new: 0 };
6575
+ for (const cell of cells) summary[cell.verdict] += 1;
6576
+ return { cells, summary };
6577
+ }
6578
+ function formatScorecardDiff(diff) {
6579
+ const lines = [];
6580
+ const { summary } = diff;
6581
+ lines.push(
6582
+ `Scorecard: ${summary.regressed} regressed \xB7 ${summary.improved} improved \xB7 ${summary.flat} flat \xB7 ${summary.new} new`
6583
+ );
6584
+ const fmt = (n) => n.toFixed(3);
6585
+ const noteworthy = diff.cells.filter((c) => c.verdict === "regressed" || c.verdict === "improved").sort((a, b) => {
6586
+ if (a.verdict !== b.verdict) return a.verdict === "regressed" ? -1 : 1;
6587
+ return Math.abs(b.delta ?? 0) - Math.abs(a.delta ?? 0);
6588
+ });
6589
+ for (const cell of noteworthy) {
6590
+ const mark = cell.verdict === "regressed" ? "REGRESSED" : "improved";
6591
+ const deltaStr = cell.delta !== null ? cell.delta >= 0 ? `+${fmt(cell.delta)}` : fmt(cell.delta) : "\u2014";
6592
+ const stat = cell.cohensD !== null ? ` (d=${cell.cohensD.toFixed(2)}${cell.pValue !== null ? `, p=${cell.pValue.toFixed(3)}` : ""})` : "";
6593
+ lines.push(
6594
+ ` ${mark} ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)} ${fmt(cell.baseline ?? 0)} \u2192 ${fmt(cell.current)} ${deltaStr}${stat}`
6595
+ );
6596
+ }
6597
+ return lines.join("\n");
6598
+ }
6599
+
6214
6600
  // src/series-convergence.ts
6215
6601
  function analyzeSeries(values, options = {}) {
6216
6602
  const window = options.window ?? 5;
@@ -6220,10 +6606,10 @@ function analyzeSeries(values, options = {}) {
6220
6606
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
6221
6607
  }
6222
6608
  const tail = values.slice(-window);
6223
- const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
6224
- const variance = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
6609
+ const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
6610
+ const variance = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
6225
6611
  const stdDev = Math.sqrt(variance);
6226
- const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
6612
+ const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
6227
6613
  const cv = stdDev / refMean;
6228
6614
  const stable = tail.length >= window && cv <= stableCv;
6229
6615
  let tailRun = 0;
@@ -6244,7 +6630,7 @@ function analyzeSeries(values, options = {}) {
6244
6630
  } else {
6245
6631
  state = "noisy";
6246
6632
  }
6247
- return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
6633
+ return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
6248
6634
  }
6249
6635
 
6250
6636
  // src/slo.ts
@@ -7042,12 +7428,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
7042
7428
  variantScores.push({ mutator: id, score, mutated });
7043
7429
  all.push(score);
7044
7430
  }
7045
- const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
7046
- const variance = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
7431
+ const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
7432
+ const variance = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
7047
7433
  const stdDev = Math.sqrt(variance);
7048
- const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
7434
+ const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
7049
7435
  const robustness = Math.max(0, 1 - stdDev / ref);
7050
- return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
7436
+ return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
7051
7437
  }
7052
7438
  var lowercaseMutator = (p) => p.toLowerCase();
7053
7439
  var sentenceReorderMutator = (p, seed) => {
@@ -7113,8 +7499,8 @@ async function paraphraseRobustnessScenarios(args) {
7113
7499
  });
7114
7500
  scores.push(out.score);
7115
7501
  }
7116
- const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7117
- deltas[m.name] = mean4 - originalScore;
7502
+ const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
7503
+ deltas[m.name] = mean5 - originalScore;
7118
7504
  paraphrasedAll.push(...scores);
7119
7505
  }
7120
7506
  const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
@@ -7727,8 +8113,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7727
8113
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7728
8114
  const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7729
8115
  if (scores.length < 3) continue;
7730
- const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7731
- const variance = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
8116
+ const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
8117
+ const variance = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
7732
8118
  if (variance > varianceThreshold) {
7733
8119
  targets.push({
7734
8120
  reason: "high-variance",
@@ -7959,7 +8345,7 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
7959
8345
 
7960
8346
  // src/command-runner.ts
7961
8347
  import { spawnSync } from "child_process";
7962
- import { existsSync as existsSync4, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
8348
+ import { existsSync as existsSync5, readdirSync as readdirSync2, readFileSync as readFileSync4, statSync as statSync2 } from "fs";
7963
8349
  import { join as join2 } from "path";
7964
8350
  var localCommandRunner = {
7965
8351
  name: "local",
@@ -7988,11 +8374,11 @@ var localCommandRunner = {
7988
8374
  return r.status === 0 && (r.stdout ?? "").trim().length > 0;
7989
8375
  },
7990
8376
  async fileExists(path) {
7991
- return existsSync4(path);
8377
+ return existsSync5(path);
7992
8378
  },
7993
8379
  async readFile(path) {
7994
8380
  try {
7995
- return readFileSync3(path, "utf8");
8381
+ return readFileSync4(path, "utf8");
7996
8382
  } catch {
7997
8383
  return null;
7998
8384
  }
@@ -8230,7 +8616,7 @@ function extractErrorCount(text, opts = {}) {
8230
8616
  for (const p of patterns) {
8231
8617
  const matches = Array.from(text.matchAll(p.regex));
8232
8618
  if (matches.length === 0) continue;
8233
- const count = p.transform ? matches.reduce((sum2, m) => sum2 + p.transform(m), 0) : matches.length;
8619
+ const count = p.transform ? matches.reduce((sum3, m) => sum3 + p.transform(m), 0) : matches.length;
8234
8620
  return {
8235
8621
  count,
8236
8622
  matched: p.name,
@@ -8924,8 +9310,8 @@ function multiToolchainLayer(config) {
8924
9310
  }
8925
9311
 
8926
9312
  // src/reference-replay.ts
8927
- import { appendFileSync as appendFileSync2, existsSync as existsSync5, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
8928
- import { dirname as dirname2 } from "path";
9313
+ import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5 } from "fs";
9314
+ import { dirname as dirname3 } from "path";
8929
9315
  var DEFAULT_MATCH_THRESHOLD = 0.55;
8930
9316
  var ALL_SPLITS = ["train", "dev", "test", "holdout"];
8931
9317
  async function runReferenceReplay(cases, options) {
@@ -9043,14 +9429,14 @@ function jsonlReferenceReplayStore(path) {
9043
9429
  return {
9044
9430
  async save(run) {
9045
9431
  await lock.runExclusive(() => {
9046
- mkdirSync2(dirname2(path), { recursive: true });
9047
- appendFileSync2(path, `${JSON.stringify(run)}
9432
+ mkdirSync3(dirname3(path), { recursive: true });
9433
+ appendFileSync3(path, `${JSON.stringify(run)}
9048
9434
  `);
9049
9435
  });
9050
9436
  },
9051
9437
  async list() {
9052
9438
  return lock.runExclusive(() => {
9053
- if (!existsSync5(path)) return [];
9439
+ if (!existsSync6(path)) return [];
9054
9440
  return readJsonl(path);
9055
9441
  });
9056
9442
  }
@@ -9139,7 +9525,7 @@ function decideReferenceReplayPromotion(baseline, candidate, policy = {}) {
9139
9525
  regressions
9140
9526
  };
9141
9527
  }
9142
- const requiredMeanDelta = mean2(compared.map((item) => item.f1Delta));
9528
+ const requiredMeanDelta = mean3(compared.map((item) => item.f1Delta));
9143
9529
  if (requiredMeanDelta < minF1Delta) {
9144
9530
  return {
9145
9531
  promote: false,
@@ -9274,8 +9660,8 @@ function scorePair(scenario, matcher, reference, candidate) {
9274
9660
  function buildScenarioScore(scenario, matches, falsePositives) {
9275
9661
  const matched = matches.filter((match) => match.matched).length;
9276
9662
  const total = scenario.references.length;
9277
- const matchedWeight = matches.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
9278
- const totalWeight = matches.reduce((sum2, match) => sum2 + match.weight, 0);
9663
+ const matchedWeight = matches.filter((match) => match.matched).reduce((sum3, match) => sum3 + match.weight, 0);
9664
+ const totalWeight = matches.reduce((sum3, match) => sum3 + match.weight, 0);
9279
9665
  const precision2 = ratio(matched, matched + falsePositives);
9280
9666
  const recall = ratio(matched, total);
9281
9667
  return {
@@ -9301,11 +9687,11 @@ function aggregateBySplit(scores) {
9301
9687
  return out;
9302
9688
  }
9303
9689
  function aggregateScenarioScores(scores) {
9304
- const matched = sum(scores.map((score) => score.matched));
9305
- const total = sum(scores.map((score) => score.total));
9306
- const falsePositives = sum(scores.map((score) => score.falsePositives));
9307
- const matchedWeight = sum(scores.map((score) => score.matchedWeight));
9308
- const totalWeight = sum(scores.map((score) => score.totalWeight));
9690
+ const matched = sum2(scores.map((score) => score.matched));
9691
+ const total = sum2(scores.map((score) => score.total));
9692
+ const falsePositives = sum2(scores.map((score) => score.falsePositives));
9693
+ const matchedWeight = sum2(scores.map((score) => score.matchedWeight));
9694
+ const totalWeight = sum2(scores.map((score) => score.totalWeight));
9309
9695
  const precision2 = ratio(matched, matched + falsePositives);
9310
9696
  const recall = ratio(matched, total);
9311
9697
  return {
@@ -9372,11 +9758,11 @@ function clamp012(value) {
9372
9758
  if (!Number.isFinite(value)) return 0;
9373
9759
  return Math.max(0, Math.min(1, value));
9374
9760
  }
9375
- function sum(values) {
9761
+ function sum2(values) {
9376
9762
  return values.reduce((acc, value) => acc + value, 0);
9377
9763
  }
9378
- function mean2(values) {
9379
- return values.length ? sum(values) / values.length : 0;
9764
+ function mean3(values) {
9765
+ return values.length ? sum2(values) / values.length : 0;
9380
9766
  }
9381
9767
  function formatPct(value) {
9382
9768
  return `${(value * 100).toFixed(1)}%`;
@@ -9393,7 +9779,7 @@ function throwIfAborted(signal) {
9393
9779
  throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
9394
9780
  }
9395
9781
  function readJsonl(path) {
9396
- const raw = readFileSync4(path, "utf8");
9782
+ const raw = readFileSync5(path, "utf8");
9397
9783
  const out = [];
9398
9784
  for (const line of raw.split("\n")) {
9399
9785
  const trimmed = line.trim();
@@ -9640,8 +10026,8 @@ function detectCalibrationDrift(runs, opts) {
9640
10026
  alpha,
9641
10027
  recentN: recent.length,
9642
10028
  historyN: historical.length,
9643
- recentMean: mean3(recent),
9644
- historyMean: mean3(historical)
10029
+ recentMean: mean4(recent),
10030
+ historyMean: mean4(historical)
9645
10031
  }
9646
10032
  }
9647
10033
  ];
@@ -9761,7 +10147,7 @@ function chiSquareCritical(df, alpha) {
9761
10147
  }
9762
10148
  return TABLE[10][idx];
9763
10149
  }
9764
- function mean3(xs) {
10150
+ function mean4(xs) {
9765
10151
  if (xs.length === 0) return 0;
9766
10152
  return xs.reduce((s, x) => s + x, 0) / xs.length;
9767
10153
  }
@@ -9961,8 +10347,8 @@ async function discoverPersonas(dir, opts = {}) {
9961
10347
  }
9962
10348
 
9963
10349
  // src/evolution-telemetry.ts
9964
- import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5, writeFileSync } from "fs";
9965
- import { dirname as dirname3 } from "path";
10350
+ import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6, writeFileSync } from "fs";
10351
+ import { dirname as dirname4 } from "path";
9966
10352
  var MutationTelemetry = class {
9967
10353
  appender;
9968
10354
  constructor(path) {
@@ -9991,17 +10377,17 @@ var LineageRecorder = class {
9991
10377
  this.path = path;
9992
10378
  this.snapshotPath = `${path}.snapshot`;
9993
10379
  this.kindOf = kindOf ?? defaultKindOf;
9994
- mkdirSync3(dirname3(path), { recursive: true });
9995
- if (existsSync6(this.snapshotPath)) {
10380
+ mkdirSync4(dirname4(path), { recursive: true });
10381
+ if (existsSync7(this.snapshotPath)) {
9996
10382
  try {
9997
- const parsed = JSON.parse(readFileSync5(this.snapshotPath, "utf-8"));
10383
+ const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
9998
10384
  for (const n of parsed) this.nodes.set(n.id, n);
9999
10385
  } catch {
10000
10386
  }
10001
10387
  }
10002
- if (existsSync6(path)) {
10388
+ if (existsSync7(path)) {
10003
10389
  try {
10004
- for (const line of readFileSync5(path, "utf-8").split("\n")) {
10390
+ for (const line of readFileSync6(path, "utf-8").split("\n")) {
10005
10391
  if (!line.trim()) continue;
10006
10392
  try {
10007
10393
  const entry = JSON.parse(line);
@@ -10013,9 +10399,9 @@ var LineageRecorder = class {
10013
10399
  } catch {
10014
10400
  }
10015
10401
  }
10016
- if (existsSync6(path) && this.nodes.size === 0) {
10402
+ if (existsSync7(path) && this.nodes.size === 0) {
10017
10403
  try {
10018
- const raw = readFileSync5(path, "utf-8").trim();
10404
+ const raw = readFileSync6(path, "utf-8").trim();
10019
10405
  if (raw.startsWith("[")) {
10020
10406
  const parsed = JSON.parse(raw);
10021
10407
  for (const n of parsed) this.nodes.set(n.id, n);
@@ -10029,15 +10415,15 @@ var LineageRecorder = class {
10029
10415
  const prev = this.nodes.get(node.id);
10030
10416
  this.nodes.set(node.id, { ...prev, ...node });
10031
10417
  try {
10032
- if (existsSync6(this.path)) {
10033
- const head = readFileSync5(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
10418
+ if (existsSync7(this.path)) {
10419
+ const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
10034
10420
  if (head === "[") {
10035
10421
  writeFileSync(this.path, "");
10036
10422
  }
10037
10423
  }
10038
10424
  } catch {
10039
10425
  }
10040
- appendFileSync3(this.path, `${JSON.stringify(this.nodes.get(node.id))}
10426
+ appendFileSync4(this.path, `${JSON.stringify(this.nodes.get(node.id))}
10041
10427
  `);
10042
10428
  });
10043
10429
  }
@@ -10096,9 +10482,9 @@ var CostLedger = class {
10096
10482
  mutex = new Mutex();
10097
10483
  constructor(path) {
10098
10484
  this.path = path;
10099
- if (existsSync6(path)) {
10485
+ if (existsSync7(path)) {
10100
10486
  try {
10101
- const loaded = JSON.parse(readFileSync5(path, "utf-8"));
10487
+ const loaded = JSON.parse(readFileSync6(path, "utf-8"));
10102
10488
  for (const k of Object.keys(this.totals)) {
10103
10489
  if (k === "byGeneration") {
10104
10490
  if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
@@ -10115,7 +10501,7 @@ var CostLedger = class {
10115
10501
  } catch {
10116
10502
  }
10117
10503
  } else {
10118
- mkdirSync3(dirname3(path), { recursive: true });
10504
+ mkdirSync4(dirname4(path), { recursive: true });
10119
10505
  }
10120
10506
  }
10121
10507
  genBucket(generation) {
@@ -10267,16 +10653,16 @@ function precision(goldens, candidates, options = {}) {
10267
10653
  }
10268
10654
 
10269
10655
  // src/jsonl-trial-cache.ts
10270
- import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6 } from "fs";
10271
- import { dirname as dirname4 } from "path";
10656
+ import { appendFileSync as appendFileSync5, existsSync as existsSync8, mkdirSync as mkdirSync5, readFileSync as readFileSync7 } from "fs";
10657
+ import { dirname as dirname5 } from "path";
10272
10658
  var JsonlTrialCache = class {
10273
10659
  map = /* @__PURE__ */ new Map();
10274
10660
  path;
10275
10661
  appender;
10276
10662
  constructor(path) {
10277
10663
  this.path = path;
10278
- if (existsSync7(path)) {
10279
- for (const line of readFileSync6(path, "utf-8").split("\n")) {
10664
+ if (existsSync8(path)) {
10665
+ for (const line of readFileSync7(path, "utf-8").split("\n")) {
10280
10666
  if (!line.trim()) continue;
10281
10667
  try {
10282
10668
  const entry = JSON.parse(line);
@@ -10285,7 +10671,7 @@ var JsonlTrialCache = class {
10285
10671
  }
10286
10672
  }
10287
10673
  } else {
10288
- mkdirSync4(dirname4(path), { recursive: true });
10674
+ mkdirSync5(dirname5(path), { recursive: true });
10289
10675
  }
10290
10676
  this.appender = new LockedJsonlAppender(path);
10291
10677
  }
@@ -10308,7 +10694,7 @@ var JsonlTrialCache = class {
10308
10694
  setSync(key, value) {
10309
10695
  this.map.set(key, value);
10310
10696
  const line = { key, result: value, writtenAt: Date.now() };
10311
- appendFileSync4(this.path, `${JSON.stringify(line)}
10697
+ appendFileSync5(this.path, `${JSON.stringify(line)}
10312
10698
  `);
10313
10699
  }
10314
10700
  };
@@ -10316,35 +10702,14 @@ var JsonlTrialCache = class {
10316
10702
  // src/judge-retry.ts
10317
10703
  var DEFAULT_MAX_ATTEMPTS = 3;
10318
10704
  var DEFAULT_TIMEOUT_MS = 9e4;
10319
- var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
10320
- var ABORT_PATTERNS = [
10321
- /AbortError/i,
10322
- /TimeoutError/i,
10323
- /fetch failed/i,
10324
- /ECONNRESET/i,
10325
- /ETIMEDOUT/i,
10326
- /EAI_AGAIN/i,
10327
- /this operation was aborted/i,
10328
- /stream.*ended.*unexpectedly/i,
10329
- /socket hang up/i
10330
- ];
10331
- var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
10332
- function defaultIsRetryable(err) {
10333
- if (err instanceof Error) {
10334
- if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
10335
- const status = err.status;
10336
- if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
10337
- }
10338
- return false;
10339
- }
10340
10705
  function sleep(ms) {
10341
10706
  return new Promise((resolve) => setTimeout(resolve, ms));
10342
10707
  }
10343
10708
  async function withJudgeRetry(judgeFn, policy = {}) {
10344
10709
  const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
10345
10710
  const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
10346
- const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
10347
- const isRetryable = policy.isRetryable ?? defaultIsRetryable;
10711
+ const backoff = policy.backoffMs ?? backoffMs;
10712
+ const isRetryable = policy.isRetryable ?? isTransientLlmError;
10348
10713
  const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
10349
10714
  let totalAttempts = 0;
10350
10715
  const attemptErrors = [];
@@ -10412,9 +10777,9 @@ function passOrthogonality(input) {
10412
10777
  sims.push(cosineSimilarity(vectors[i], vectors[j]));
10413
10778
  }
10414
10779
  }
10415
- const mean4 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
10780
+ const mean5 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
10416
10781
  return {
10417
- orthogonality: Math.max(0, Math.min(1, 1 - mean4)),
10782
+ orthogonality: Math.max(0, Math.min(1, 1 - mean5)),
10418
10783
  passCount: passes.length,
10419
10784
  similarities: sims
10420
10785
  };
@@ -10667,6 +11032,7 @@ export {
10667
11032
  ANALYST_SEVERITIES,
10668
11033
  AgentDriver,
10669
11034
  AgentEvalError,
11035
+ AgentProfileCellValidationError,
10670
11036
  AnalystRegistry,
10671
11037
  AxGepaSteeringOptimizer,
10672
11038
  BENCHMARK_SPLIT_SEED,
@@ -10688,6 +11054,7 @@ export {
10688
11054
  DEFAULT_HARNESS_OBJECTIVES,
10689
11055
  DEFAULT_MUTATION_PRIMITIVES,
10690
11056
  DEFAULT_MUTATORS,
11057
+ DEFAULT_PR_REVIEW_SCORE_WEIGHTS,
10691
11058
  DEFAULT_REDACTION_RULES,
10692
11059
  DEFAULT_RED_TEAM_CORPUS,
10693
11060
  DEFAULT_RUN_SCORE_WEIGHTS,
@@ -10774,28 +11141,35 @@ export {
10774
11141
  VerificationError,
10775
11142
  acquisitionPlansForKnowledgeGaps,
10776
11143
  adversarialJudge,
11144
+ agentProfileCellHashMaterial,
11145
+ agentProfileCellKey,
11146
+ agentProfileHash,
10777
11147
  aggregateLlm,
11148
+ aggregatePrReviewScore,
10778
11149
  aggregateRunScore,
10779
11150
  aggregateTrialsByMode,
10780
11151
  allCriticalPassed,
10781
11152
  analyzeAntiSlop,
10782
11153
  analyzeSeries,
10783
11154
  analyzeTraces,
11155
+ appendScorecard,
10784
11156
  argHash,
10785
11157
  assertLlmRoute,
10786
11158
  assertRealBackend,
10787
11159
  assertReleaseConfidence,
11160
+ assertRunAgentProfileCell,
10788
11161
  assertRunCaptured,
10789
11162
  assignFeedbackSplit,
10790
11163
  attributeCounterfactuals,
11164
+ backoffMs,
10791
11165
  deterministicSplit as benchmarkDeterministicSplit,
10792
11166
  benchmarks_exports as benchmarks,
10793
11167
  benjaminiHochberg,
10794
- bhAdjust,
10795
11168
  bisect,
10796
11169
  blockingKnowledgeEval,
10797
11170
  bonferroni,
10798
11171
  bootstrapCi,
11172
+ buildAgentProfileCell,
10799
11173
  buildDriverSystemPrompt,
10800
11174
  buildReflectionPrompt,
10801
11175
  buildReviewerPrompt,
@@ -10822,6 +11196,7 @@ export {
10822
11196
  cohensD,
10823
11197
  coherenceJudge,
10824
11198
  collectionPreserved,
11199
+ commentsForSource,
10825
11200
  commitBisect,
10826
11201
  compareReferenceReplay,
10827
11202
  compareToBaseline,
@@ -10872,6 +11247,7 @@ export {
10872
11247
  deployGateLayer,
10873
11248
  describeTraceInsightScope,
10874
11249
  diffFindings,
11250
+ diffScorecard,
10875
11251
  discoverPersonas,
10876
11252
  distillPlaybook,
10877
11253
  domainEvidencePattern,
@@ -10907,11 +11283,13 @@ export {
10907
11283
  formatBenchmarkReport,
10908
11284
  formatDriverReport,
10909
11285
  formatFindings,
11286
+ formatScorecardDiff,
10910
11287
  gainHistogram,
10911
11288
  ghCliClient,
10912
11289
  precision as goldenPrecision,
10913
11290
  gradeSemanticStatus,
10914
11291
  groupBy,
11292
+ groupRunsByAgentProfileCell,
10915
11293
  hashContent,
10916
11294
  hashJson,
10917
11295
  hashScenarios,
@@ -10933,6 +11311,7 @@ export {
10933
11311
  isRunRecord,
10934
11312
  isSandboxSpan,
10935
11313
  isToolSpan,
11314
+ isTransientLlmError,
10936
11315
  iterateRawCalls,
10937
11316
  jestTestParser,
10938
11317
  jsonHasKeys,
@@ -10947,6 +11326,7 @@ export {
10947
11326
  linterJudge,
10948
11327
  llmSpanFromProvider,
10949
11328
  llmSpans,
11329
+ loadScorecard,
10950
11330
  loadScorerFromGrader,
10951
11331
  localCommandRunner,
10952
11332
  lowercaseMutator,
@@ -10962,8 +11342,8 @@ export {
10962
11342
  objectiveEval,
10963
11343
  pairedBootstrap,
10964
11344
  pairedEvalueSequence,
11345
+ pairedMde,
10965
11346
  pairedTTest,
10966
- pairedWilcoxon,
10967
11347
  paraphraseRobustness,
10968
11348
  paraphraseRobustnessScenarios,
10969
11349
  paretoChart,
@@ -10988,6 +11368,8 @@ export {
10988
11368
  proposeSynthesisTargets,
10989
11369
  providerFromBaseUrl,
10990
11370
  pytestTestParser,
11371
+ recordRuns,
11372
+ recordRunsToScorecard,
10991
11373
  redTeamDataset,
10992
11374
  redTeamReport,
10993
11375
  redactString,
@@ -11009,6 +11391,7 @@ export {
11009
11391
  replayFeedbackTrajectory,
11010
11392
  replayScorerOverCorpus,
11011
11393
  replayTraceThroughJudge,
11394
+ requireAgentProfileCell,
11012
11395
  requiredSampleSize,
11013
11396
  researchReport,
11014
11397
  resetLockedAppendersForTesting,
@@ -11045,6 +11428,8 @@ export {
11045
11428
  scoreContinuity,
11046
11429
  scoreFromEvals,
11047
11430
  scoreKnowledgeReadiness,
11431
+ scorePrReviewComments,
11432
+ scorePrReviewSource,
11048
11433
  scoreRedTeamOutput,
11049
11434
  scoreReferenceReplay,
11050
11435
  scoreTraceInsightReadiness,
@@ -11063,6 +11448,7 @@ export {
11063
11448
  summarize,
11064
11449
  summarizeBackendIntegrity,
11065
11450
  summarizeHarnessResults,
11451
+ summarizePrReviewBenchmark,
11066
11452
  summarizePreferenceMemory,
11067
11453
  summaryTable,
11068
11454
  testJudge,
@@ -11079,8 +11465,10 @@ export {
11079
11465
  typoMutator,
11080
11466
  urlContains,
11081
11467
  userQuestionsForKnowledgeGaps,
11468
+ validateAgentProfileCell,
11082
11469
  validateRunRecord,
11083
11470
  verbosityBias,
11471
+ verifyAgentProfileCell,
11084
11472
  verifyCompletion,
11085
11473
  verifyManifest,
11086
11474
  visualDiff,