@tangle-network/agent-eval 0.33.1 → 0.34.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/CHANGELOG.md +33 -0
  2. package/dist/benchmarks/index.d.ts +2 -2
  3. package/dist/{chunk-FT3IAMQR.js → chunk-3HYQXPC2.js} +2 -2
  4. package/dist/{chunk-WRGHMGWT.js → chunk-7PR3WPWE.js} +2 -2
  5. package/dist/{chunk-SQYRO3BT.js → chunk-RL6TERL2.js} +2 -2
  6. package/dist/{chunk-DCZXFOQN.js → chunk-TSPOEDM3.js} +56 -1
  7. package/dist/chunk-TSPOEDM3.js.map +1 -0
  8. package/dist/{control-C3k02SCP.d.ts → control-DVrmvM_k.d.ts} +1 -1
  9. package/dist/control.d.ts +2 -2
  10. package/dist/control.js +2 -2
  11. package/dist/{index-ClMxVqe_.d.ts → index-0pu_fBwZ.d.ts} +1 -1
  12. package/dist/index.d.ts +271 -11
  13. package/dist/index.js +487 -92
  14. package/dist/index.js.map +1 -1
  15. package/dist/meta-eval/index.d.ts +2 -2
  16. package/dist/openapi.json +1 -1
  17. package/dist/optimization.d.ts +3 -3
  18. package/dist/optimization.js +3 -3
  19. package/dist/{release-report-ChfmCmLi.d.ts → release-report-D2ykiLSe.d.ts} +2 -2
  20. package/dist/reporting.d.ts +4 -4
  21. package/dist/{researcher-CfnL3HEb.d.ts → researcher-DeZ_EArp.d.ts} +2 -2
  22. package/dist/rl.d.ts +5 -5
  23. package/dist/rl.js +2 -2
  24. package/dist/{rubric-predictive-validity-BvaNwfBE.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
  25. package/dist/{run-record-YinVdFwu.d.ts → run-record-BGY6bHRh.d.ts} +37 -1
  26. package/dist/{summary-report-BPJVzIeW.d.ts → summary-report-DuZXOk7K.d.ts} +1 -1
  27. package/package.json +12 -22
  28. package/dist/chunk-DCZXFOQN.js.map +0 -1
  29. /package/dist/{chunk-FT3IAMQR.js.map → chunk-3HYQXPC2.js.map} +0 -0
  30. /package/dist/{chunk-WRGHMGWT.js.map → chunk-7PR3WPWE.js.map} +0 -0
  31. /package/dist/{chunk-SQYRO3BT.js.map → chunk-RL6TERL2.js.map} +0 -0
package/dist/index.js CHANGED
@@ -54,7 +54,7 @@ import {
54
54
  runProposeReview,
55
55
  runProposeReviewAsControlLoop,
56
56
  scoreFromEvals
57
- } from "./chunk-WRGHMGWT.js";
57
+ } from "./chunk-7PR3WPWE.js";
58
58
  import {
59
59
  allCriticalPassed,
60
60
  objectiveEval,
@@ -96,7 +96,7 @@ import {
96
96
  summarizePreferenceMemory,
97
97
  trialTraceFromMultiShotTrial,
98
98
  withAssignedFeedbackSplit
99
- } from "./chunk-FT3IAMQR.js";
99
+ } from "./chunk-3HYQXPC2.js";
100
100
  import {
101
101
  assertReleaseConfidence,
102
102
  bootstrapCi,
@@ -107,7 +107,7 @@ import {
107
107
  } from "./chunk-LGAPK7NA.js";
108
108
  import {
109
109
  runEvalCampaign
110
- } from "./chunk-SQYRO3BT.js";
110
+ } from "./chunk-RL6TERL2.js";
111
111
  import {
112
112
  LlmCallError,
113
113
  LlmClient,
@@ -121,21 +121,24 @@ import {
121
121
  stripFencedJson
122
122
  } from "./chunk-VXNVVBZO.js";
123
123
  import {
124
+ AGENT_PROFILE_KINDS,
124
125
  AgentProfileCellValidationError,
125
126
  RunRecordValidationError,
126
127
  agentProfileCellHashMaterial,
127
128
  agentProfileCellKey,
128
129
  assertRunAgentProfileCell,
129
130
  buildAgentProfileCell,
131
+ buildSandboxAgentProfileCell,
130
132
  groupRunsByAgentProfileCell,
131
133
  isRunRecord,
132
134
  parseRunRecordSafe,
133
135
  requireAgentProfileCell,
134
136
  roundTripRunRecord,
137
+ toAgentProfileJson,
135
138
  validateAgentProfileCell,
136
139
  validateRunRecord,
137
140
  verifyAgentProfileCell
138
- } from "./chunk-DCZXFOQN.js";
141
+ } from "./chunk-TSPOEDM3.js";
139
142
  import {
140
143
  evaluateInterimReleaseConfidence,
141
144
  pairedEvalueSequence
@@ -333,7 +336,7 @@ var RunCritic = class {
333
336
  );
334
337
  const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
335
338
  if (!success) notes.push("run did not complete with pass=true");
336
- const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
339
+ const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum3, span) => sum3 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
337
340
  const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
338
341
  trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
339
342
  ) : void 0;
@@ -348,7 +351,7 @@ var RunCritic = class {
348
351
  (span) => typeof span.testsTotal === "number" && span.testsTotal > 0
349
352
  );
350
353
  const testReality = sandboxTests.length ? sandboxTests.reduce(
351
- (sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
354
+ (sum3, span) => sum3 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
352
355
  0
353
356
  ) / sandboxTests.length : toolSpans2.some(
354
357
  (span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
@@ -370,7 +373,7 @@ var RunCritic = class {
370
373
  const costUsd = trace.budget.length ? Math.max(
371
374
  ...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
372
375
  0
373
- ) : llmSpans2.reduce((sum2, span) => sum2 + (span.costUsd ?? 0), 0);
376
+ ) : llmSpans2.reduce((sum3, span) => sum3 + (span.costUsd ?? 0), 0);
374
377
  const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
375
378
  return {
376
379
  success,
@@ -1960,12 +1963,12 @@ function allocateBudget(policy, args) {
1960
1963
  return policy.totalUsd / Math.max(1, args.runningCount);
1961
1964
  }
1962
1965
  function sumFindingCost(findings) {
1963
- let sum2 = 0;
1966
+ let sum3 = 0;
1964
1967
  for (const f of findings) {
1965
1968
  const c = f.metadata?.cost_usd;
1966
- if (typeof c === "number" && Number.isFinite(c)) sum2 += c;
1969
+ if (typeof c === "number" && Number.isFinite(c)) sum3 += c;
1967
1970
  }
1968
- return sum2;
1971
+ return sum3;
1969
1972
  }
1970
1973
  function selectPriorFindings(source, analystId) {
1971
1974
  if (!source) return void 0;
@@ -2184,10 +2187,10 @@ function ghCliClient(opts = {}) {
2184
2187
  await exec("git", ["branch", "-D", input.branchName], { cwd });
2185
2188
  await run("git", ["checkout", "-b", input.branchName]);
2186
2189
  const { mkdir, writeFile } = await import("fs/promises");
2187
- const { dirname: dirname5, join: join4, resolve } = await import("path");
2190
+ const { dirname: dirname6, join: join4, resolve } = await import("path");
2188
2191
  for (const change of input.fileChanges) {
2189
2192
  const abs = resolve(cwd, change.path);
2190
- await mkdir(dirname5(abs), { recursive: true });
2193
+ await mkdir(dirname6(abs), { recursive: true });
2191
2194
  await writeFile(abs, change.contents, "utf8");
2192
2195
  await run("git", ["add", join4(change.path)]);
2193
2196
  }
@@ -3722,6 +3725,178 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
3722
3725
  };
3723
3726
  }
3724
3727
 
3728
+ // src/pr-review-benchmark.ts
3729
/**
 * Default relative weight of each PR-review scoring dimension. These are the
 * values `aggregatePrReviewScore` starts from before caller overrides are
 * merged in.
 *
 * The object is frozen so that one caller cannot silently change the defaults
 * seen by every other caller; pass per-call overrides through the `weights`
 * argument of the scoring functions instead.
 */
var DEFAULT_PR_REVIEW_SCORE_WEIGHTS = Object.freeze({
  recall: 4,
  precision: 2,
  actionability: 1.5,
  severityCalibration: 1,
  lowNoise: 1
});
3736
/**
 * Selects the comments of an audit case that were produced by one source.
 *
 * @param auditCase Object with a `comments` array.
 * @param source Value compared (strictly) against each comment's `source`.
 * @returns A new array with the matching comments in their original order.
 */
function commentsForSource(auditCase, source) {
  const { comments } = auditCase;
  return comments.reduce((selected, entry) => {
    if (entry.source === source) selected.push(entry);
    return selected;
  }, []);
}
3739
/**
 * Scores one review source on one audit case: picks that source's comments
 * out of the case and hands them to `scorePrReviewComments`.
 *
 * @param auditCase Audit case holding all comments and reference findings.
 * @param source Source whose comments should be scored.
 * @param weights Optional per-dimension weight overrides, forwarded unchanged.
 * @returns Whatever `scorePrReviewComments` returns for those comments.
 */
function scorePrReviewSource(auditCase, source, weights = {}) {
  const sourceComments = commentsForSource(auditCase, source);
  return scorePrReviewComments(auditCase, sourceComments, source, weights);
}
3742
/**
 * Scores a set of review comments against the reference findings of an audit
 * case and returns the per-dimension values plus their weighted aggregate.
 *
 * Dimensions:
 * - recall: matched reference findings / reference findings; with no
 *   references it is 1 when there are no comments and 0 otherwise.
 * - precision: positive / (positive + negative) labelled comments; with no
 *   labels it falls back to matched comments / comments; with no comments it
 *   is 1 only when there are no references either.
 * - actionability: share of comments accepted by `isActionableComment`
 *   (1 when there are no comments).
 * - severityCalibration: share of severity-carrying comments accepted by
 *   `isSeverityAligned`; with none it is 0.5 if anything matched, else 1.
 * - lowNoise: 1 minus the share of negatively labelled comments.
 *
 * @param auditCase Object with `id` and `referenceFindings`.
 * @param comments Comments to score.
 * @param source Label copied into the result.
 * @param weights Optional weight overrides for `aggregatePrReviewScore`.
 * @returns Score record for this case and source.
 */
function scorePrReviewComments(auditCase, comments, source, weights = {}) {
  const references = auditCase.referenceFindings;
  const referenceCount = references.length;
  const commentCount = comments.length;
  const hasComments = commentCount > 0;

  const matchedFindings = matchReferenceFindings(references, comments);
  const matchedCommentIds = new Set(matchedFindings.map((pair) => pair.commentId));
  const positiveComments = comments.filter((c) => isPositiveOutcome(c.outcome));
  const negativeComments = comments.filter((c) => isNegativeOutcome(c.outcome));
  const actionableComments = comments.filter(isActionableComment);
  const severityComments = comments.filter((c) => c.severity);
  const severityAligned = severityComments.filter(
    (c) => isSeverityAligned(c, references, matchedFindings)
  );

  let recall;
  if (referenceCount > 0) {
    recall = matchedFindings.length / referenceCount;
  } else {
    recall = hasComments ? 0 : 1;
  }

  const labelledCount = positiveComments.length + negativeComments.length;
  let precisionValue;
  if (labelledCount > 0) {
    precisionValue = positiveComments.length / labelledCount;
  } else if (hasComments) {
    precisionValue = matchedCommentIds.size / commentCount;
  } else {
    precisionValue = referenceCount === 0 ? 1 : 0;
  }

  const actionability = hasComments ? actionableComments.length / commentCount : 1;

  let severityCalibration;
  if (severityComments.length > 0) {
    severityCalibration = severityAligned.length / severityComments.length;
  } else {
    severityCalibration = matchedFindings.length > 0 ? 0.5 : 1;
  }

  const lowNoise = hasComments ? 1 - negativeComments.length / commentCount : 1;

  const aggregateValue = aggregatePrReviewScore(
    { recall, precision: precisionValue, actionability, severityCalibration, lowNoise },
    weights
  );
  const notes = buildScoreNotes({
    comments,
    referenceCount,
    matchedFindings,
    negativeComments,
    actionableComments
  });

  return {
    caseId: auditCase.id,
    source,
    commentCount,
    referenceCount,
    matchedFindings,
    recall,
    precision: precisionValue,
    actionability,
    severityCalibration,
    lowNoise,
    aggregate: aggregateValue,
    notes
  };
}
3783
/**
 * Rolls per-case scores up into one summary row per source.
 *
 * @param scores Score records as produced by `scorePrReviewComments`.
 * @returns One row per distinct `source` with the case count, total comment
 *   count and the mean of every dimension, sorted by `aggregateMean`
 *   descending.
 */
function summarizePrReviewBenchmark(scores) {
  const grouped = /* @__PURE__ */ new Map();
  for (const entry of scores) {
    const bucket = grouped.get(entry.source);
    if (bucket) bucket.push(entry);
    else grouped.set(entry.source, [entry]);
  }

  const summaries = [];
  for (const [source, rows] of grouped) {
    summaries.push({
      source,
      caseCount: rows.length,
      commentCount: sum(rows.map((row) => row.commentCount)),
      aggregateMean: mean(rows.map((row) => row.aggregate)),
      recallMean: mean(rows.map((row) => row.recall)),
      precisionMean: mean(rows.map((row) => row.precision)),
      actionabilityMean: mean(rows.map((row) => row.actionability)),
      severityCalibrationMean: mean(rows.map((row) => row.severityCalibration)),
      lowNoiseMean: mean(rows.map((row) => row.lowNoise))
    });
  }
  summaries.sort((left, right) => right.aggregateMean - left.aggregateMean);
  return summaries;
}
3800
/**
 * Combines the five PR-review dimensions into one weighted average.
 *
 * Each dimension is passed through `clamp01` and multiplied by its weight;
 * the total is divided by the sum of the weights. The same sanitised weight
 * is used in both the numerator and the denominator, so the result stays a
 * proper weighted mean:
 * - only the dimensions listed in `DEFAULT_PR_REVIEW_SCORE_WEIGHTS` count;
 *   unknown keys in `weights` are ignored instead of inflating the divisor;
 * - a negative weight is treated as 0;
 * - an `undefined` or non-finite override falls back to the default weight.
 *
 * @param dimensions Object with `recall`, `precision`, `actionability`,
 *   `severityCalibration` and `lowNoise`.
 * @param weights Optional per-dimension overrides.
 * @returns The weighted mean, or 0 when every effective weight is 0.
 */
function aggregatePrReviewScore(dimensions, weights = {}) {
  const overrides = weights ?? {};
  let weightSum = 0;
  let weightedTotal = 0;
  for (const [key, fallback] of Object.entries(DEFAULT_PR_REVIEW_SCORE_WEIGHTS)) {
    const override = overrides[key];
    const requested = override === undefined ? fallback : Number(override);
    // Non-finite overrides (NaN, Infinity) cannot take part in a mean.
    const effective = Number.isFinite(requested) ? requested : fallback;
    const weight = Math.max(0, effective);
    weightSum += weight;
    weightedTotal += weight * clamp01(dimensions[key]);
  }
  if (weightSum <= 0) return 0;
  return weightedTotal / weightSum;
}
3806
/**
 * Greedily pairs each reference finding with its best-scoring comment.
 *
 * References are processed in order. For each one, every comment not already
 * claimed by an earlier reference is scored with `matchScore`; the highest
 * score of at least 0.55 wins (the earliest comment wins ties) and that
 * comment id becomes unavailable to later references.
 *
 * @param references Reference findings (each with an `id`).
 * @param comments Candidate comments (each with an `id`).
 * @returns Array of `{ referenceId, commentId, score }`, one per matched
 *   reference.
 */
function matchReferenceFindings(references, comments) {
  const MIN_MATCH_SCORE = 0.55;
  const claimedIds = /* @__PURE__ */ new Set();
  const pairs = [];
  for (const reference of references) {
    let winner = null;
    let winnerScore = -Infinity;
    for (const comment of comments) {
      if (claimedIds.has(comment.id)) continue;
      const candidateScore = matchScore(reference, comment);
      if (!(candidateScore >= MIN_MATCH_SCORE)) continue;
      // Strict ">" keeps the earliest comment on equal scores.
      if (candidateScore > winnerScore) {
        winner = comment;
        winnerScore = candidateScore;
      }
    }
    if (winner === null) continue;
    claimedIds.add(winner.id);
    pairs.push({ referenceId: reference.id, commentId: winner.id, score: winnerScore });
  }
  return pairs;
}
3818
/**
 * Heuristic similarity between a reference finding and a comment, passed
 * through `clamp01` before it is returned.
 *
 * Contributions:
 * - +1    when the reference lists the comment id in `sourceCommentIds`;
 * - +0.35 when both have a path and the normalised paths are equal;
 * - +0.15 when both have a line and the lines are at most 3 apart;
 * - +0.5 × the share of the reference's terms (keywords plus title tokens,
 *   normalised, length ≥ 3, de-duplicated) that occur in the comment body.
 *
 * @param reference Reference finding (`title`, optional `keywords`, `path`,
 *   `line`, `sourceCommentIds`).
 * @param comment Comment (`id`, `body`, optional `path`, `line`).
 * @returns The clamped score.
 */
function matchScore(reference, comment) {
  let total = 0;

  if (reference.sourceCommentIds?.includes(comment.id)) total += 1;

  const samePath = reference.path && comment.path && normalizePath(reference.path) === normalizePath(comment.path);
  if (samePath) total += 0.35;

  const nearbyLine = reference.line && comment.line && Math.abs(reference.line - comment.line) <= 3;
  if (nearbyLine) total += 0.15;

  const wantedTerms = /* @__PURE__ */ new Set();
  for (const raw of [...reference.keywords ?? [], ...tokenize(reference.title)]) {
    const term = normalizeTerm(raw);
    if (term.length >= 3) wantedTerms.add(term);
  }

  if (wantedTerms.size > 0) {
    const bodyTerms = new Set(tokenize(comment.body).map((token) => normalizeTerm(token)));
    let overlap = 0;
    for (const term of wantedTerms) {
      if (bodyTerms.has(term)) overlap += 1;
    }
    total += 0.5 * (overlap / wantedTerms.size);
  }

  return clamp01(total);
}
3834
/**
 * Decides whether a review comment is concrete enough to act on.
 *
 * A comment qualifies when it is anchored (it has a `path`, or its body
 * mentions a location word such as "file" or "function") AND its body
 * contains one of the listed action words such as "fix" or "validate".
 *
 * @param comment Object with a string `body` and an optional `path`.
 * @returns `true` when both conditions hold, otherwise `false`.
 */
function isActionableComment(comment) {
  const text = comment.body.trim();
  const anchored = Boolean(comment.path) || /\b(file|line|function|method|class|module|test|migration)\b/i.test(text);
  if (!anchored) return false;
  const asksForAction = /\b(fix|change|add|remove|guard|check|reject|validate|test|assert|return|throw|fail|block)\b/i.test(text);
  return asksForAction;
}
3843
/**
 * Checks whether a comment's severity is plausible given what it matched.
 *
 * - No severity on the comment: not aligned.
 * - Comment matched no reference: aligned only for "nit" or "low".
 * - Comment matched a reference that is missing from `references`: not
 *   aligned.
 * - Otherwise aligned when the two `severityRank` values differ by at most 1.
 *
 * @param comment Comment with `id` and optional `severity`.
 * @param references Reference findings (`id`, `severity`).
 * @param matches Pairs of `{ referenceId, commentId }`.
 * @returns Whether the severity counts as aligned.
 */
function isSeverityAligned(comment, references, matches) {
  const { severity } = comment;
  if (!severity) return false;

  const pairing = matches.find((candidate) => candidate.commentId === comment.id);
  if (!pairing) {
    return severity === "nit" || severity === "low";
  }

  const target = references.find((candidate) => candidate.id === pairing.referenceId);
  if (!target) return false;

  const rankGap = severityRank(severity) - severityRank(target.severity);
  return Math.abs(rankGap) <= 1;
}
3851
/**
 * Produces the human-readable notes attached to a PR-review score.
 *
 * @param input Object with `comments`, `referenceCount`, `matchedFindings`,
 *   `negativeComments` and `actionableComments`.
 * @returns Zero to three notes, always in this order: nothing matched,
 *   negatively labelled comments present, no actionable comment.
 */
function buildScoreNotes(input) {
  const { comments, referenceCount, matchedFindings, negativeComments, actionableComments } = input;
  const negativeCount = negativeComments.length;
  const rules = [
    [referenceCount > 0 && matchedFindings.length === 0, "no reference findings matched"],
    [negativeCount > 0, `${negativeCount} comment(s) labelled rejected/duplicate/noise`],
    [
      comments.length > 0 && actionableComments.length === 0,
      "comments were not actionable enough for a PR reviewer benchmark"
    ]
  ];
  return rules.filter(([applies]) => applies).map(([, note]) => note);
}
3864
/**
 * Tells whether a comment outcome label counts in the reviewer's favour.
 *
 * @param outcome Outcome label of a comment.
 * @returns `true` for "accepted" or "fixed", otherwise `false`.
 */
function isPositiveOutcome(outcome) {
  switch (outcome) {
    case "accepted":
    case "fixed":
      return true;
    default:
      return false;
  }
}
3867
/**
 * Tells whether a comment outcome label counts against the reviewer.
 *
 * @param outcome Outcome label of a comment.
 * @returns `true` for "rejected", "duplicate" or "noise", otherwise `false`.
 */
function isNegativeOutcome(outcome) {
  switch (outcome) {
    case "rejected":
    case "duplicate":
    case "noise":
      return true;
    default:
      return false;
  }
}
3870
/**
 * Maps a severity label to its numeric rank.
 *
 * @param severity One of "critical", "high", "medium", "low", "nit".
 * @returns 5 for "critical" down to 1 for "nit"; `undefined` for any other
 *   value.
 */
function severityRank(severity) {
  // A Map (rather than a plain object) keeps inherited keys such as
  // "constructor" from ever resolving to a value.
  const ranks = /* @__PURE__ */ new Map([
    ["critical", 5],
    ["high", 4],
    ["medium", 3],
    ["low", 2],
    ["nit", 1]
  ]);
  return ranks.get(severity);
}
3884
/**
 * Splits text into tokens made of letters, digits and the characters
 * `_ . $ / -`; everything else acts as a separator.
 *
 * @param input String to split.
 * @returns The tokens in order of appearance, or an empty array when there
 *   are none.
 */
function tokenize(input) {
  const found = input.match(/[a-zA-Z0-9_.$/-]+/g);
  if (found === null) return [];
  return found;
}
3887
/**
 * Lower-cases a token and strips every leading and trailing character that
 * is not a lower-case letter, a digit or an underscore.
 *
 * @param input Token to normalise.
 * @returns The normalised token (possibly empty).
 */
function normalizeTerm(input) {
  const lowered = input.toLowerCase();
  const withoutPrefix = lowered.replace(/^[^a-z0-9_]+/, "");
  return withoutPrefix.replace(/[^a-z0-9_]+$/, "");
}
3890
/**
 * Removes one leading "./" (a dot followed by one or more slashes) from a
 * path. Paths that do not start that way are returned unchanged.
 *
 * @param input Path string.
 * @returns The path without the leading "./" prefix.
 */
function normalizePath(input) {
  if (!input.startsWith("./")) return input;
  let cut = 2;
  // The prefix may carry several slashes ("./" followed by more "/").
  while (input[cut] === "/") cut += 1;
  return input.slice(cut);
}
3893
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param values Numbers to average.
 * @returns The mean, or 0 for an empty list.
 */
function mean(values) {
  const count = values.length;
  if (!count) return 0;
  const total = values.reduce((running, value) => running + value, 0);
  return total / count;
}
3896
/**
 * Adds up a list of numbers, starting from 0.
 *
 * @param values Numbers to add.
 * @returns The total (0 for an empty list).
 */
function sum(values) {
  let total = 0;
  values.forEach((value) => {
    total = total + value;
  });
  return total;
}
3899
+
3725
3900
  // src/production-loop.ts
3726
3901
  async function runProductionLoop(opts) {
3727
3902
  validate2(opts);
@@ -5217,14 +5392,14 @@ async function runHarnessExperiment(config) {
5217
5392
  const score = config.score ?? ((trace) => critic.scoreTrace(trace));
5218
5393
  const results = await mapLimit(jobs, config.parallelism ?? 1, async (request) => {
5219
5394
  const trace = await config.adapter.run(request);
5220
- const runScore = await score(trace, request);
5395
+ const runScore2 = await score(trace, request);
5221
5396
  const result = {
5222
5397
  variant: request.variant,
5223
5398
  scenario: request.scenario,
5224
5399
  trialIndex: request.trialIndex,
5225
5400
  trace,
5226
- score: runScore,
5227
- aggregate: aggregateRunScore(runScore, config.weights)
5401
+ score: runScore2,
5402
+ aggregate: aggregateRunScore(runScore2, config.weights)
5228
5403
  };
5229
5404
  await config.onResult?.(result);
5230
5405
  return result;
@@ -5251,10 +5426,10 @@ function summarizeHarnessResults(results) {
5251
5426
  return {
5252
5427
  variant,
5253
5428
  runs,
5254
- aggregateMean: mean(runs.map((r) => r.aggregate)),
5255
- passRate: mean(runs.map((r) => r.score.success)),
5256
- costUsdMean: mean(runs.map((r) => r.score.costUsd)),
5257
- wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)),
5429
+ aggregateMean: mean2(runs.map((r) => r.aggregate)),
5430
+ passRate: mean2(runs.map((r) => r.score.success)),
5431
+ costUsdMean: mean2(runs.map((r) => r.score.costUsd)),
5432
+ wallSecondsMean: mean2(runs.map((r) => r.score.wallSeconds)),
5258
5433
  scoreMean: meanRunScore(runs.map((r) => r.score))
5259
5434
  };
5260
5435
  }).sort((a, b) => b.aggregateMean - a.aggregateMean);
@@ -5291,22 +5466,22 @@ async function mapLimit(items, limit, fn) {
5291
5466
  );
5292
5467
  return results;
5293
5468
  }
5294
- function mean(values) {
5295
- return values.length ? values.reduce((sum2, value) => sum2 + value, 0) / values.length : 0;
5469
+ function mean2(values) {
5470
+ return values.length ? values.reduce((sum3, value) => sum3 + value, 0) / values.length : 0;
5296
5471
  }
5297
5472
  function meanRunScore(scores) {
5298
5473
  return {
5299
- success: mean(scores.map((s) => s.success)),
5300
- goalProgress: mean(scores.map((s) => s.goalProgress)),
5301
- repoGroundedness: mean(scores.map((s) => s.repoGroundedness)),
5302
- driftPenalty: mean(scores.map((s) => s.driftPenalty)),
5303
- toolUseQuality: mean(scores.map((s) => s.toolUseQuality)),
5304
- patchQuality: mean(scores.map((s) => s.patchQuality)),
5305
- testReality: mean(scores.map((s) => s.testReality)),
5306
- finalGate: mean(scores.map((s) => s.finalGate)),
5307
- reviewerBlockers: mean(scores.map((s) => s.reviewerBlockers)),
5308
- costUsd: mean(scores.map((s) => s.costUsd)),
5309
- wallSeconds: mean(scores.map((s) => s.wallSeconds)),
5474
+ success: mean2(scores.map((s) => s.success)),
5475
+ goalProgress: mean2(scores.map((s) => s.goalProgress)),
5476
+ repoGroundedness: mean2(scores.map((s) => s.repoGroundedness)),
5477
+ driftPenalty: mean2(scores.map((s) => s.driftPenalty)),
5478
+ toolUseQuality: mean2(scores.map((s) => s.toolUseQuality)),
5479
+ patchQuality: mean2(scores.map((s) => s.patchQuality)),
5480
+ testReality: mean2(scores.map((s) => s.testReality)),
5481
+ finalGate: mean2(scores.map((s) => s.finalGate)),
5482
+ reviewerBlockers: mean2(scores.map((s) => s.reviewerBlockers)),
5483
+ costUsd: mean2(scores.map((s) => s.costUsd)),
5484
+ wallSeconds: mean2(scores.map((s) => s.wallSeconds)),
5310
5485
  notes: scores.flatMap((s) => s.notes ?? [])
5311
5486
  };
5312
5487
  }
@@ -5645,7 +5820,7 @@ function rankRows(rows, weights) {
5645
5820
  }
5646
5821
  return [...buckets.entries()].map(([variantId, values]) => ({
5647
5822
  variantId,
5648
- mean: values.reduce((sum2, value) => sum2 + value, 0) / values.length,
5823
+ mean: values.reduce((sum3, value) => sum3 + value, 0) / values.length,
5649
5824
  runs: values.length
5650
5825
  })).sort((a, b) => b.mean - a.mean);
5651
5826
  }
@@ -5815,6 +5990,22 @@ var BudgetGuard = class {
5815
5990
  }
5816
5991
  };
5817
5992
 
5993
+ // src/agent-profile.ts
5994
+ import { createHash as createHash2 } from "crypto";
5995
/**
 * Computes a SHA-256 hex digest over the behaviour-defining fields of an
 * agent profile.
 *
 * The hashed material is the trimmed `model`, sorted copies of `skills` and
 * `tools` (missing lists count as empty), `promptVersion` (or `null`) and
 * `metadata` (or `{}`). It is passed through `canonicalize` and
 * `JSON.stringify` before hashing.
 *
 * @param profile Agent profile; `model` must be a non-blank string.
 * @returns The digest as a hex string.
 * @throws ValidationError when `model` is not a string or is blank.
 */
function agentProfileHash(profile) {
  const { model } = profile;
  const hasModel = typeof model === "string" && model.trim().length > 0;
  if (!hasModel) {
    throw new ValidationError(`AgentProfile "${profile.id}" has no model \u2014 cannot hash`);
  }

  // Sorting copies keeps the hash independent of list order without
  // touching the caller's arrays.
  const sortedCopy = (list) => [...list ?? []].sort();
  const behaviour = {
    model: model.trim(),
    skills: sortedCopy(profile.skills),
    promptVersion: profile.promptVersion ?? null,
    tools: sortedCopy(profile.tools),
    metadata: profile.metadata ?? {}
  };

  const serialized = JSON.stringify(canonicalize(behaviour));
  return createHash2("sha256").update(serialized).digest("hex");
}
6008
+
5818
6009
  // src/cost-tracker.ts
5819
6010
  var CostTracker = class {
5820
6011
  byScenario = /* @__PURE__ */ new Map();
@@ -6221,6 +6412,194 @@ function isObject(v) {
6221
6412
  return typeof v === "object" && v !== null && !Array.isArray(v);
6222
6413
  }
6223
6414
 
6415
+ // src/scorecard.ts
6416
+ import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync3 } from "fs";
6417
+ import { dirname as dirname2 } from "path";
6418
/**
 * Median of a list of numbers; the input array is not modified.
 *
 * @param xs Numbers.
 * @returns 0 for an empty list, the middle value for an odd count, and the
 *   average of the two middle values for an even count.
 */
function median(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  const ordered = xs.slice().sort((left, right) => left - right);
  const upperIndex = Math.trunc(count / 2);
  const upper = ordered[upperIndex];
  if (count % 2 !== 0) return upper;
  return (ordered[upperIndex - 1] + upper) / 2;
}
6424
/**
 * Picks the score that represents a run: the holdout score when it is
 * neither `null` nor `undefined`, otherwise the search score.
 *
 * @param run Run with an `outcome` object.
 * @returns `outcome.holdoutScore ?? outcome.searchScore`.
 */
function runScore(run) {
  const { holdoutScore, searchScore } = run.outcome;
  return holdoutScore ?? searchScore;
}
6427
/**
 * Averages the per-dimension judge scores across runs.
 *
 * For every run that has `outcome.judgeScores.perDimMean`, each finite value
 * is added to that dimension's running total. Non-finite values and runs
 * without the field are ignored.
 *
 * @param runs Runs with an `outcome` object.
 * @returns An object mapping each dimension to its mean, or `undefined` when
 *   no finite value was seen.
 */
function aggregatePerDimension(runs) {
  const totals = /* @__PURE__ */ new Map();
  const counts = /* @__PURE__ */ new Map();
  for (const { outcome } of runs) {
    const perDim = outcome.judgeScores?.perDimMean;
    if (!perDim) continue;
    for (const name of Object.keys(perDim)) {
      const value = perDim[name];
      if (!Number.isFinite(value)) continue;
      totals.set(name, (totals.get(name) ?? 0) + value);
      counts.set(name, (counts.get(name) ?? 0) + 1);
    }
  }
  if (totals.size === 0) return void 0;
  const averages = {};
  for (const [name, total] of totals) {
    averages[name] = total / counts.get(name);
  }
  return averages;
}
6445
/**
 * Turns a batch of runs into scorecard lines, one per scenario.
 *
 * Runs are grouped by `scenarioId` (runs without one are skipped). For each
 * scenario the usable scores are collected via `runScore`, their median
 * becomes the `composite`, and per-dimension judge means are attached when
 * `aggregatePerDimension` returns any.
 *
 * Only finite numeric scores are usable. `null` (both score fields explicitly
 * null) and `NaN` are dropped along with `undefined`: they would corrupt the
 * numeric sort inside `median` and serialise to `null` in the JSONL log.
 * Scenarios left without any usable score produce no line.
 *
 * @param runs Runs with `runId`, `scenarioId` and `outcome`.
 * @param opts Object with `profile`, `commitSha` and an optional `timestamp`
 *   (defaults to the current time as an ISO string).
 * @returns Array of `{ scenarioId, profileHash, model, profile, entry }`.
 * @throws Whatever `agentProfileHash` throws for an invalid profile.
 */
function recordRuns(runs, opts) {
  const profileHash = agentProfileHash(opts.profile);
  const timestamp = opts.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();

  const byScenario = /* @__PURE__ */ new Map();
  for (const run of runs) {
    const scenarioId = run.scenarioId;
    if (!scenarioId) continue;
    const bucket = byScenario.get(scenarioId);
    if (bucket) bucket.push(run);
    else byScenario.set(scenarioId, [run]);
  }

  const lines = [];
  for (const [scenarioId, scenarioRuns] of byScenario) {
    const scored = [];
    for (const run of scenarioRuns) {
      const score = runScore(run);
      if (typeof score === "number" && Number.isFinite(score)) scored.push({ run, score });
    }
    if (scored.length === 0) continue;

    const scores = scored.map((s) => s.score);
    const entry = {
      commitSha: opts.commitSha,
      timestamp,
      scores,
      composite: median(scores),
      runIds: scored.map((s) => s.run.runId)
    };
    // Judge dimensions are averaged over every run of the scenario, not only
    // the ones that carried a usable score.
    const perDimension = aggregatePerDimension(scenarioRuns);
    if (perDimension) entry.perDimension = perDimension;

    lines.push({
      scenarioId,
      profileHash,
      model: opts.profile.model,
      profile: opts.profile,
      entry
    });
  }
  return lines;
}
6480
/**
 * Appends scorecard lines to a JSONL log file, creating the parent directory
 * first if needed. Does nothing (and creates nothing) for an empty list.
 *
 * @param logPath Path of the log file.
 * @param lines Values to serialise, one JSON document per line; the written
 *   chunk ends with a newline.
 */
function appendScorecard(logPath, lines) {
  if (lines.length === 0) return;
  mkdirSync2(dirname2(logPath), { recursive: true });
  const payload = lines.map((line) => JSON.stringify(line)).join("\n") + "\n";
  appendFileSync2(logPath, payload);
}
6486
/**
 * Convenience wrapper: builds scorecard lines with `recordRuns` and appends
 * them to the log with `appendScorecard`.
 *
 * @param logPath Path of the JSONL log file.
 * @param runs Runs to record.
 * @param opts Options forwarded to `recordRuns`.
 * @returns The lines that were built (and handed to `appendScorecard`).
 */
function recordRunsToScorecard(logPath, runs, opts) {
  const recorded = recordRuns(runs, opts);
  appendScorecard(logPath, recorded);
  return recorded;
}
6491
/**
 * Reads a JSONL scorecard log and groups its entries into cells keyed by
 * scenario and profile hash.
 *
 * Loading is best-effort: blank lines, lines that are not valid JSON and
 * lines missing `scenarioId`, `profileHash` or `entry` are skipped. Each
 * cell's timeline is sorted by entry timestamp. An entry whose `timestamp`
 * is missing or not a string is compared by its string form (missing sorts
 * first), so one malformed record cannot make the whole load throw.
 *
 * @param logPath Path of the JSONL log file.
 * @returns `{ cells, profiles }`: `cells` is an array of
 *   `{ scenarioId, profileHash, model, timeline }`; `profiles` maps each
 *   profile hash to the last profile recorded for it. Both are empty when
 *   the file does not exist.
 */
function loadScorecard(logPath) {
  if (!existsSync4(logPath)) return { cells: [], profiles: {} };

  const cells = /* @__PURE__ */ new Map();
  const profiles = {};
  for (const raw of readFileSync3(logPath, "utf8").split("\n")) {
    const line = raw.trim();
    if (!line) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Deliberate: a truncated or corrupt line must not block the rest of
      // the log from loading.
      continue;
    }
    if (!parsed?.scenarioId || !parsed.profileHash || !parsed.entry) continue;

    const key = `${parsed.scenarioId}::${parsed.profileHash}`;
    let cell = cells.get(key);
    if (!cell) {
      cell = {
        scenarioId: parsed.scenarioId,
        profileHash: parsed.profileHash,
        model: parsed.model,
        timeline: []
      };
      cells.set(key, cell);
    }
    cell.timeline.push(parsed.entry);
    if (parsed.profile) profiles[parsed.profileHash] = parsed.profile;
  }

  // Coerce to a string so that entries without a usable timestamp sort
  // instead of throwing inside the comparator.
  const sortKey = (entry) => String(entry?.timestamp ?? "");
  for (const cell of cells.values()) {
    cell.timeline.sort((a, b) => sortKey(a).localeCompare(sortKey(b)));
  }
  return { cells: [...cells.values()], profiles };
}
6524
/**
 * Compares the latest entry of every scorecard cell with a baseline entry
 * and classifies the change.
 *
 * Baseline selection: with `opts.baselineCommit`, the most recent entry with
 * that commit that is not the current entry; otherwise the entry just before
 * the current one. Cells without a baseline get the verdict "new".
 *
 * Verdict: when both entries have at least two scores, `cohensD` and
 * `welchsTTest` are consulted; the change is significant when
 * `|d| >= minEffect` and `p <= maxP`, and a non-significant change is
 * "flat". When there are too few scores, or the statistics are degenerate
 * (non-finite p-value or NaN effect size, e.g. identical repeated scores),
 * the verdict falls back to the plain `|delta| >= minDelta` rule instead of
 * silently reporting "flat".
 *
 * @param scorecard Object with `cells`, each having `scenarioId`,
 *   `profileHash`, `model` and a `timeline` of entries (`commitSha`,
 *   `composite`, `scores`).
 * @param opts Optional `baselineCommit`, `minEffect` (default 0.5), `maxP`
 *   (default 0.05) and `minDelta` (default 0.05).
 * @returns `{ cells, summary }` where `summary` counts each verdict.
 */
function diffScorecard(scorecard, opts = {}) {
  const minEffect = opts.minEffect ?? 0.5;
  const maxP = opts.maxP ?? 0.05;
  const minDelta = opts.minDelta ?? 0.05;
  const directionOf = (delta) => (delta > 0 ? "improved" : "regressed");

  const cells = [];
  for (const cell of scorecard.cells) {
    const timeline = cell.timeline;
    if (timeline.length === 0) continue;
    const current = timeline[timeline.length - 1];
    const baseline = opts.baselineCommit
      ? [...timeline].reverse().find((e) => e.commitSha === opts.baselineCommit && e !== current)
      : timeline[timeline.length - 2];
    const base = {
      scenarioId: cell.scenarioId,
      profileHash: cell.profileHash,
      model: cell.model,
      current: current.composite,
      currentCommit: current.commitSha
    };
    if (!baseline) {
      cells.push({
        ...base,
        verdict: "new",
        baseline: null,
        delta: null,
        cohensD: null,
        pValue: null,
        baselineCommit: null
      });
      continue;
    }

    const delta = current.composite - baseline.composite;
    let d = null;
    let p = null;
    let statsUsable = false;
    if (baseline.scores.length >= 2 && current.scores.length >= 2) {
      d = cohensD(baseline.scores, current.scores);
      const t = welchsTTest(baseline.scores, current.scores);
      p = Number.isFinite(t.p) ? t.p : null;
      // Without a p-value (or with a NaN effect size) the significance test
      // cannot say anything, so it must not be read as "no change".
      statsUsable = p !== null && !Number.isNaN(d);
    }

    let verdict;
    if (statsUsable) {
      const significant = Math.abs(d) >= minEffect && p <= maxP;
      verdict = significant ? directionOf(delta) : "flat";
    } else {
      verdict = Math.abs(delta) >= minDelta ? directionOf(delta) : "flat";
    }

    cells.push({
      ...base,
      verdict,
      baseline: baseline.composite,
      delta,
      cohensD: d,
      pValue: p,
      baselineCommit: baseline.commitSha
    });
  }

  const summary = { improved: 0, regressed: 0, flat: 0, new: 0 };
  for (const cell of cells) summary[cell.verdict] += 1;
  return { cells, summary };
}
6581
/**
 * Renders a scorecard diff as plain text.
 *
 * The first line summarises the verdict counts. It is followed by one line
 * per regressed or improved cell: regressions first, then improvements, each
 * group ordered by absolute delta (largest first). Each line shows the
 * scenario, model, the first 8 characters of the profile hash, baseline →
 * current, the signed delta and, when available, the effect size and
 * p-value.
 *
 * @param diff Object with `summary` and `cells` (the shape returned by
 *   `diffScorecard`).
 * @returns The lines joined with "\n".
 */
function formatScorecardDiff(diff) {
  const { summary } = diff;
  const threeDecimals = (n) => n.toFixed(3);
  const header = `Scorecard: ${summary.regressed} regressed \xB7 ${summary.improved} improved \xB7 ${summary.flat} flat \xB7 ${summary.new} new`;

  const verdictOrder = (cell) => (cell.verdict === "regressed" ? 0 : 1);
  const changed = diff.cells.filter((c) => c.verdict === "regressed" || c.verdict === "improved");
  changed.sort(
    (a, b) => verdictOrder(a) - verdictOrder(b) || Math.abs(b.delta ?? 0) - Math.abs(a.delta ?? 0)
  );

  const rows = changed.map((cell) => {
    const mark = cell.verdict === "regressed" ? "REGRESSED" : "improved";

    let deltaStr = "\u2014";
    if (cell.delta !== null) {
      deltaStr = cell.delta >= 0 ? `+${threeDecimals(cell.delta)}` : threeDecimals(cell.delta);
    }

    let stat = "";
    if (cell.cohensD !== null) {
      const pPart = cell.pValue !== null ? `, p=${cell.pValue.toFixed(3)}` : "";
      stat = ` (d=${cell.cohensD.toFixed(2)}${pPart})`;
    }

    return ` ${mark} ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)} ${threeDecimals(cell.baseline ?? 0)} \u2192 ${threeDecimals(cell.current)} ${deltaStr}${stat}`;
  });

  return [header, ...rows].join("\n");
}
6602
+
6224
6603
  // src/series-convergence.ts
6225
6604
  function analyzeSeries(values, options = {}) {
6226
6605
  const window = options.window ?? 5;
@@ -6230,10 +6609,10 @@ function analyzeSeries(values, options = {}) {
6230
6609
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
6231
6610
  }
6232
6611
  const tail = values.slice(-window);
6233
- const mean4 = tail.reduce((a, b) => a + b, 0) / tail.length;
6234
- const variance = tail.reduce((acc, v) => acc + (v - mean4) ** 2, 0) / tail.length;
6612
+ const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
6613
+ const variance = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
6235
6614
  const stdDev = Math.sqrt(variance);
6236
- const refMean = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
6615
+ const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
6237
6616
  const cv = stdDev / refMean;
6238
6617
  const stable = tail.length >= window && cv <= stableCv;
6239
6618
  let tailRun = 0;
@@ -6254,7 +6633,7 @@ function analyzeSeries(values, options = {}) {
6254
6633
  } else {
6255
6634
  state = "noisy";
6256
6635
  }
6257
- return { state, windowMean: mean4, windowCv: cv, tailRun, stable };
6636
+ return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
6258
6637
  }
6259
6638
 
6260
6639
  // src/slo.ts
@@ -7052,12 +7431,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
7052
7431
  variantScores.push({ mutator: id, score, mutated });
7053
7432
  all.push(score);
7054
7433
  }
7055
- const mean4 = all.reduce((a, b) => a + b, 0) / all.length;
7056
- const variance = all.reduce((a, v) => a + (v - mean4) ** 2, 0) / all.length;
7434
+ const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
7435
+ const variance = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
7057
7436
  const stdDev = Math.sqrt(variance);
7058
- const ref = Math.abs(mean4) > 1e-9 ? Math.abs(mean4) : 1;
7437
+ const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
7059
7438
  const robustness = Math.max(0, 1 - stdDev / ref);
7060
- return { originalScore, variantScores, meanScore: mean4, stdDev, robustness };
7439
+ return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
7061
7440
  }
7062
7441
  var lowercaseMutator = (p) => p.toLowerCase();
7063
7442
  var sentenceReorderMutator = (p, seed) => {
@@ -7123,8 +7502,8 @@ async function paraphraseRobustnessScenarios(args) {
7123
7502
  });
7124
7503
  scores.push(out.score);
7125
7504
  }
7126
- const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7127
- deltas[m.name] = mean4 - originalScore;
7505
+ const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
7506
+ deltas[m.name] = mean5 - originalScore;
7128
7507
  paraphrasedAll.push(...scores);
7129
7508
  }
7130
7509
  const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
@@ -7737,8 +8116,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7737
8116
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7738
8117
  const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7739
8118
  if (scores.length < 3) continue;
7740
- const mean4 = scores.reduce((a, b) => a + b, 0) / scores.length;
7741
- const variance = scores.reduce((a, b) => a + (b - mean4) ** 2, 0) / scores.length;
8119
+ const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
8120
+ const variance = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
7742
8121
  if (variance > varianceThreshold) {
7743
8122
  targets.push({
7744
8123
  reason: "high-variance",
@@ -7969,7 +8348,7 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
7969
8348
 
7970
8349
  // src/command-runner.ts
7971
8350
  import { spawnSync } from "child_process";
7972
- import { existsSync as existsSync4, readdirSync as readdirSync2, readFileSync as readFileSync3, statSync as statSync2 } from "fs";
8351
+ import { existsSync as existsSync5, readdirSync as readdirSync2, readFileSync as readFileSync4, statSync as statSync2 } from "fs";
7973
8352
  import { join as join2 } from "path";
7974
8353
  var localCommandRunner = {
7975
8354
  name: "local",
@@ -7998,11 +8377,11 @@ var localCommandRunner = {
7998
8377
  return r.status === 0 && (r.stdout ?? "").trim().length > 0;
7999
8378
  },
8000
8379
  async fileExists(path) {
8001
- return existsSync4(path);
8380
+ return existsSync5(path);
8002
8381
  },
8003
8382
  async readFile(path) {
8004
8383
  try {
8005
- return readFileSync3(path, "utf8");
8384
+ return readFileSync4(path, "utf8");
8006
8385
  } catch {
8007
8386
  return null;
8008
8387
  }
@@ -8240,7 +8619,7 @@ function extractErrorCount(text, opts = {}) {
8240
8619
  for (const p of patterns) {
8241
8620
  const matches = Array.from(text.matchAll(p.regex));
8242
8621
  if (matches.length === 0) continue;
8243
- const count = p.transform ? matches.reduce((sum2, m) => sum2 + p.transform(m), 0) : matches.length;
8622
+ const count = p.transform ? matches.reduce((sum3, m) => sum3 + p.transform(m), 0) : matches.length;
8244
8623
  return {
8245
8624
  count,
8246
8625
  matched: p.name,
@@ -8934,8 +9313,8 @@ function multiToolchainLayer(config) {
8934
9313
  }
8935
9314
 
8936
9315
  // src/reference-replay.ts
8937
- import { appendFileSync as appendFileSync2, existsSync as existsSync5, mkdirSync as mkdirSync2, readFileSync as readFileSync4 } from "fs";
8938
- import { dirname as dirname2 } from "path";
9316
+ import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5 } from "fs";
9317
+ import { dirname as dirname3 } from "path";
8939
9318
  var DEFAULT_MATCH_THRESHOLD = 0.55;
8940
9319
  var ALL_SPLITS = ["train", "dev", "test", "holdout"];
8941
9320
  async function runReferenceReplay(cases, options) {
@@ -9053,14 +9432,14 @@ function jsonlReferenceReplayStore(path) {
9053
9432
  return {
9054
9433
  async save(run) {
9055
9434
  await lock.runExclusive(() => {
9056
- mkdirSync2(dirname2(path), { recursive: true });
9057
- appendFileSync2(path, `${JSON.stringify(run)}
9435
+ mkdirSync3(dirname3(path), { recursive: true });
9436
+ appendFileSync3(path, `${JSON.stringify(run)}
9058
9437
  `);
9059
9438
  });
9060
9439
  },
9061
9440
  async list() {
9062
9441
  return lock.runExclusive(() => {
9063
- if (!existsSync5(path)) return [];
9442
+ if (!existsSync6(path)) return [];
9064
9443
  return readJsonl(path);
9065
9444
  });
9066
9445
  }
@@ -9149,7 +9528,7 @@ function decideReferenceReplayPromotion(baseline, candidate, policy = {}) {
9149
9528
  regressions
9150
9529
  };
9151
9530
  }
9152
- const requiredMeanDelta = mean2(compared.map((item) => item.f1Delta));
9531
+ const requiredMeanDelta = mean3(compared.map((item) => item.f1Delta));
9153
9532
  if (requiredMeanDelta < minF1Delta) {
9154
9533
  return {
9155
9534
  promote: false,
@@ -9284,8 +9663,8 @@ function scorePair(scenario, matcher, reference, candidate) {
9284
9663
  function buildScenarioScore(scenario, matches, falsePositives) {
9285
9664
  const matched = matches.filter((match) => match.matched).length;
9286
9665
  const total = scenario.references.length;
9287
- const matchedWeight = matches.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
9288
- const totalWeight = matches.reduce((sum2, match) => sum2 + match.weight, 0);
9666
+ const matchedWeight = matches.filter((match) => match.matched).reduce((sum3, match) => sum3 + match.weight, 0);
9667
+ const totalWeight = matches.reduce((sum3, match) => sum3 + match.weight, 0);
9289
9668
  const precision2 = ratio(matched, matched + falsePositives);
9290
9669
  const recall = ratio(matched, total);
9291
9670
  return {
@@ -9311,11 +9690,11 @@ function aggregateBySplit(scores) {
9311
9690
  return out;
9312
9691
  }
9313
9692
  function aggregateScenarioScores(scores) {
9314
- const matched = sum(scores.map((score) => score.matched));
9315
- const total = sum(scores.map((score) => score.total));
9316
- const falsePositives = sum(scores.map((score) => score.falsePositives));
9317
- const matchedWeight = sum(scores.map((score) => score.matchedWeight));
9318
- const totalWeight = sum(scores.map((score) => score.totalWeight));
9693
+ const matched = sum2(scores.map((score) => score.matched));
9694
+ const total = sum2(scores.map((score) => score.total));
9695
+ const falsePositives = sum2(scores.map((score) => score.falsePositives));
9696
+ const matchedWeight = sum2(scores.map((score) => score.matchedWeight));
9697
+ const totalWeight = sum2(scores.map((score) => score.totalWeight));
9319
9698
  const precision2 = ratio(matched, matched + falsePositives);
9320
9699
  const recall = ratio(matched, total);
9321
9700
  return {
@@ -9382,11 +9761,11 @@ function clamp012(value) {
9382
9761
  if (!Number.isFinite(value)) return 0;
9383
9762
  return Math.max(0, Math.min(1, value));
9384
9763
  }
9385
- function sum(values) {
9764
+ function sum2(values) {
9386
9765
  return values.reduce((acc, value) => acc + value, 0);
9387
9766
  }
9388
- function mean2(values) {
9389
- return values.length ? sum(values) / values.length : 0;
9767
+ function mean3(values) {
9768
+ return values.length ? sum2(values) / values.length : 0;
9390
9769
  }
9391
9770
  function formatPct(value) {
9392
9771
  return `${(value * 100).toFixed(1)}%`;
@@ -9403,7 +9782,7 @@ function throwIfAborted(signal) {
9403
9782
  throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
9404
9783
  }
9405
9784
  function readJsonl(path) {
9406
- const raw = readFileSync4(path, "utf8");
9785
+ const raw = readFileSync5(path, "utf8");
9407
9786
  const out = [];
9408
9787
  for (const line of raw.split("\n")) {
9409
9788
  const trimmed = line.trim();
@@ -9650,8 +10029,8 @@ function detectCalibrationDrift(runs, opts) {
9650
10029
  alpha,
9651
10030
  recentN: recent.length,
9652
10031
  historyN: historical.length,
9653
- recentMean: mean3(recent),
9654
- historyMean: mean3(historical)
10032
+ recentMean: mean4(recent),
10033
+ historyMean: mean4(historical)
9655
10034
  }
9656
10035
  }
9657
10036
  ];
@@ -9771,7 +10150,7 @@ function chiSquareCritical(df, alpha) {
9771
10150
  }
9772
10151
  return TABLE[10][idx];
9773
10152
  }
9774
- function mean3(xs) {
10153
+ function mean4(xs) {
9775
10154
  if (xs.length === 0) return 0;
9776
10155
  return xs.reduce((s, x) => s + x, 0) / xs.length;
9777
10156
  }
@@ -9971,8 +10350,8 @@ async function discoverPersonas(dir, opts = {}) {
9971
10350
  }
9972
10351
 
9973
10352
  // src/evolution-telemetry.ts
9974
- import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5, writeFileSync } from "fs";
9975
- import { dirname as dirname3 } from "path";
10353
+ import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6, writeFileSync } from "fs";
10354
+ import { dirname as dirname4 } from "path";
9976
10355
  var MutationTelemetry = class {
9977
10356
  appender;
9978
10357
  constructor(path) {
@@ -10001,17 +10380,17 @@ var LineageRecorder = class {
10001
10380
  this.path = path;
10002
10381
  this.snapshotPath = `${path}.snapshot`;
10003
10382
  this.kindOf = kindOf ?? defaultKindOf;
10004
- mkdirSync3(dirname3(path), { recursive: true });
10005
- if (existsSync6(this.snapshotPath)) {
10383
+ mkdirSync4(dirname4(path), { recursive: true });
10384
+ if (existsSync7(this.snapshotPath)) {
10006
10385
  try {
10007
- const parsed = JSON.parse(readFileSync5(this.snapshotPath, "utf-8"));
10386
+ const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
10008
10387
  for (const n of parsed) this.nodes.set(n.id, n);
10009
10388
  } catch {
10010
10389
  }
10011
10390
  }
10012
- if (existsSync6(path)) {
10391
+ if (existsSync7(path)) {
10013
10392
  try {
10014
- for (const line of readFileSync5(path, "utf-8").split("\n")) {
10393
+ for (const line of readFileSync6(path, "utf-8").split("\n")) {
10015
10394
  if (!line.trim()) continue;
10016
10395
  try {
10017
10396
  const entry = JSON.parse(line);
@@ -10023,9 +10402,9 @@ var LineageRecorder = class {
10023
10402
  } catch {
10024
10403
  }
10025
10404
  }
10026
- if (existsSync6(path) && this.nodes.size === 0) {
10405
+ if (existsSync7(path) && this.nodes.size === 0) {
10027
10406
  try {
10028
- const raw = readFileSync5(path, "utf-8").trim();
10407
+ const raw = readFileSync6(path, "utf-8").trim();
10029
10408
  if (raw.startsWith("[")) {
10030
10409
  const parsed = JSON.parse(raw);
10031
10410
  for (const n of parsed) this.nodes.set(n.id, n);
@@ -10039,15 +10418,15 @@ var LineageRecorder = class {
10039
10418
  const prev = this.nodes.get(node.id);
10040
10419
  this.nodes.set(node.id, { ...prev, ...node });
10041
10420
  try {
10042
- if (existsSync6(this.path)) {
10043
- const head = readFileSync5(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
10421
+ if (existsSync7(this.path)) {
10422
+ const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
10044
10423
  if (head === "[") {
10045
10424
  writeFileSync(this.path, "");
10046
10425
  }
10047
10426
  }
10048
10427
  } catch {
10049
10428
  }
10050
- appendFileSync3(this.path, `${JSON.stringify(this.nodes.get(node.id))}
10429
+ appendFileSync4(this.path, `${JSON.stringify(this.nodes.get(node.id))}
10051
10430
  `);
10052
10431
  });
10053
10432
  }
@@ -10106,9 +10485,9 @@ var CostLedger = class {
10106
10485
  mutex = new Mutex();
10107
10486
  constructor(path) {
10108
10487
  this.path = path;
10109
- if (existsSync6(path)) {
10488
+ if (existsSync7(path)) {
10110
10489
  try {
10111
- const loaded = JSON.parse(readFileSync5(path, "utf-8"));
10490
+ const loaded = JSON.parse(readFileSync6(path, "utf-8"));
10112
10491
  for (const k of Object.keys(this.totals)) {
10113
10492
  if (k === "byGeneration") {
10114
10493
  if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
@@ -10125,7 +10504,7 @@ var CostLedger = class {
10125
10504
  } catch {
10126
10505
  }
10127
10506
  } else {
10128
- mkdirSync3(dirname3(path), { recursive: true });
10507
+ mkdirSync4(dirname4(path), { recursive: true });
10129
10508
  }
10130
10509
  }
10131
10510
  genBucket(generation) {
@@ -10277,16 +10656,16 @@ function precision(goldens, candidates, options = {}) {
10277
10656
  }
10278
10657
 
10279
10658
  // src/jsonl-trial-cache.ts
10280
- import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6 } from "fs";
10281
- import { dirname as dirname4 } from "path";
10659
+ import { appendFileSync as appendFileSync5, existsSync as existsSync8, mkdirSync as mkdirSync5, readFileSync as readFileSync7 } from "fs";
10660
+ import { dirname as dirname5 } from "path";
10282
10661
  var JsonlTrialCache = class {
10283
10662
  map = /* @__PURE__ */ new Map();
10284
10663
  path;
10285
10664
  appender;
10286
10665
  constructor(path) {
10287
10666
  this.path = path;
10288
- if (existsSync7(path)) {
10289
- for (const line of readFileSync6(path, "utf-8").split("\n")) {
10667
+ if (existsSync8(path)) {
10668
+ for (const line of readFileSync7(path, "utf-8").split("\n")) {
10290
10669
  if (!line.trim()) continue;
10291
10670
  try {
10292
10671
  const entry = JSON.parse(line);
@@ -10295,7 +10674,7 @@ var JsonlTrialCache = class {
10295
10674
  }
10296
10675
  }
10297
10676
  } else {
10298
- mkdirSync4(dirname4(path), { recursive: true });
10677
+ mkdirSync5(dirname5(path), { recursive: true });
10299
10678
  }
10300
10679
  this.appender = new LockedJsonlAppender(path);
10301
10680
  }
@@ -10318,7 +10697,7 @@ var JsonlTrialCache = class {
10318
10697
  setSync(key, value) {
10319
10698
  this.map.set(key, value);
10320
10699
  const line = { key, result: value, writtenAt: Date.now() };
10321
- appendFileSync4(this.path, `${JSON.stringify(line)}
10700
+ appendFileSync5(this.path, `${JSON.stringify(line)}
10322
10701
  `);
10323
10702
  }
10324
10703
  };
@@ -10401,9 +10780,9 @@ function passOrthogonality(input) {
10401
10780
  sims.push(cosineSimilarity(vectors[i], vectors[j]));
10402
10781
  }
10403
10782
  }
10404
- const mean4 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
10783
+ const mean5 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
10405
10784
  return {
10406
- orthogonality: Math.max(0, Math.min(1, 1 - mean4)),
10785
+ orthogonality: Math.max(0, Math.min(1, 1 - mean5)),
10407
10786
  passCount: passes.length,
10408
10787
  similarities: sims
10409
10788
  };
@@ -10653,6 +11032,7 @@ function aggregateTrialsByMode(trials, opts) {
10653
11032
  };
10654
11033
  }
10655
11034
  export {
11035
+ AGENT_PROFILE_KINDS,
10656
11036
  ANALYST_SEVERITIES,
10657
11037
  AgentDriver,
10658
11038
  AgentEvalError,
@@ -10678,6 +11058,7 @@ export {
10678
11058
  DEFAULT_HARNESS_OBJECTIVES,
10679
11059
  DEFAULT_MUTATION_PRIMITIVES,
10680
11060
  DEFAULT_MUTATORS,
11061
+ DEFAULT_PR_REVIEW_SCORE_WEIGHTS,
10681
11062
  DEFAULT_REDACTION_RULES,
10682
11063
  DEFAULT_RED_TEAM_CORPUS,
10683
11064
  DEFAULT_RUN_SCORE_WEIGHTS,
@@ -10766,13 +11147,16 @@ export {
10766
11147
  adversarialJudge,
10767
11148
  agentProfileCellHashMaterial,
10768
11149
  agentProfileCellKey,
11150
+ agentProfileHash,
10769
11151
  aggregateLlm,
11152
+ aggregatePrReviewScore,
10770
11153
  aggregateRunScore,
10771
11154
  aggregateTrialsByMode,
10772
11155
  allCriticalPassed,
10773
11156
  analyzeAntiSlop,
10774
11157
  analyzeSeries,
10775
11158
  analyzeTraces,
11159
+ appendScorecard,
10776
11160
  argHash,
10777
11161
  assertLlmRoute,
10778
11162
  assertRealBackend,
@@ -10793,6 +11177,7 @@ export {
10793
11177
  buildDriverSystemPrompt,
10794
11178
  buildReflectionPrompt,
10795
11179
  buildReviewerPrompt,
11180
+ buildSandboxAgentProfileCell,
10796
11181
  buildTraceAnalystTools,
10797
11182
  buildTraceInsightContext,
10798
11183
  buildTraceInsightPrompt,
@@ -10816,6 +11201,7 @@ export {
10816
11201
  cohensD,
10817
11202
  coherenceJudge,
10818
11203
  collectionPreserved,
11204
+ commentsForSource,
10819
11205
  commitBisect,
10820
11206
  compareReferenceReplay,
10821
11207
  compareToBaseline,
@@ -10866,6 +11252,7 @@ export {
10866
11252
  deployGateLayer,
10867
11253
  describeTraceInsightScope,
10868
11254
  diffFindings,
11255
+ diffScorecard,
10869
11256
  discoverPersonas,
10870
11257
  distillPlaybook,
10871
11258
  domainEvidencePattern,
@@ -10901,6 +11288,7 @@ export {
10901
11288
  formatBenchmarkReport,
10902
11289
  formatDriverReport,
10903
11290
  formatFindings,
11291
+ formatScorecardDiff,
10904
11292
  gainHistogram,
10905
11293
  ghCliClient,
10906
11294
  precision as goldenPrecision,
@@ -10943,6 +11331,7 @@ export {
10943
11331
  linterJudge,
10944
11332
  llmSpanFromProvider,
10945
11333
  llmSpans,
11334
+ loadScorecard,
10946
11335
  loadScorerFromGrader,
10947
11336
  localCommandRunner,
10948
11337
  lowercaseMutator,
@@ -10984,6 +11373,8 @@ export {
10984
11373
  proposeSynthesisTargets,
10985
11374
  providerFromBaseUrl,
10986
11375
  pytestTestParser,
11376
+ recordRuns,
11377
+ recordRunsToScorecard,
10987
11378
  redTeamDataset,
10988
11379
  redTeamReport,
10989
11380
  redactString,
@@ -11042,6 +11433,8 @@ export {
11042
11433
  scoreContinuity,
11043
11434
  scoreFromEvals,
11044
11435
  scoreKnowledgeReadiness,
11436
+ scorePrReviewComments,
11437
+ scorePrReviewSource,
11045
11438
  scoreRedTeamOutput,
11046
11439
  scoreReferenceReplay,
11047
11440
  scoreTraceInsightReadiness,
@@ -11060,11 +11453,13 @@ export {
11060
11453
  summarize,
11061
11454
  summarizeBackendIntegrity,
11062
11455
  summarizeHarnessResults,
11456
+ summarizePrReviewBenchmark,
11063
11457
  summarizePreferenceMemory,
11064
11458
  summaryTable,
11065
11459
  testJudge,
11066
11460
  textInSnapshot,
11067
11461
  throwIfRunIncomplete,
11462
+ toAgentProfileJson,
11068
11463
  toLangfuseEnvelope,
11069
11464
  toPrometheusText,
11070
11465
  tokenizeDomainWords,