@tangle-network/agent-eval 0.33.1 → 0.34.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +33 -0
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/{chunk-FT3IAMQR.js → chunk-3HYQXPC2.js} +2 -2
- package/dist/{chunk-WRGHMGWT.js → chunk-7PR3WPWE.js} +2 -2
- package/dist/{chunk-SQYRO3BT.js → chunk-RL6TERL2.js} +2 -2
- package/dist/{chunk-DCZXFOQN.js → chunk-TSPOEDM3.js} +56 -1
- package/dist/chunk-TSPOEDM3.js.map +1 -0
- package/dist/{control-C3k02SCP.d.ts → control-DVrmvM_k.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +2 -2
- package/dist/{index-ClMxVqe_.d.ts → index-0pu_fBwZ.d.ts} +1 -1
- package/dist/index.d.ts +271 -11
- package/dist/index.js +487 -92
- package/dist/index.js.map +1 -1
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +3 -3
- package/dist/optimization.js +3 -3
- package/dist/{release-report-ChfmCmLi.d.ts → release-report-D2ykiLSe.d.ts} +2 -2
- package/dist/reporting.d.ts +4 -4
- package/dist/{researcher-CfnL3HEb.d.ts → researcher-DeZ_EArp.d.ts} +2 -2
- package/dist/rl.d.ts +5 -5
- package/dist/rl.js +2 -2
- package/dist/{rubric-predictive-validity-BvaNwfBE.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
- package/dist/{run-record-YinVdFwu.d.ts → run-record-BGY6bHRh.d.ts} +37 -1
- package/dist/{summary-report-BPJVzIeW.d.ts → summary-report-DuZXOk7K.d.ts} +1 -1
- package/package.json +12 -22
- package/dist/chunk-DCZXFOQN.js.map +0 -1
- /package/dist/{chunk-FT3IAMQR.js.map → chunk-3HYQXPC2.js.map} +0 -0
- /package/dist/{chunk-WRGHMGWT.js.map → chunk-7PR3WPWE.js.map} +0 -0
- /package/dist/{chunk-SQYRO3BT.js.map → chunk-RL6TERL2.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -54,7 +54,7 @@ import {
|
|
|
54
54
|
runProposeReview,
|
|
55
55
|
runProposeReviewAsControlLoop,
|
|
56
56
|
scoreFromEvals
|
|
57
|
-
} from "./chunk-
|
|
57
|
+
} from "./chunk-7PR3WPWE.js";
|
|
58
58
|
import {
|
|
59
59
|
allCriticalPassed,
|
|
60
60
|
objectiveEval,
|
|
@@ -96,7 +96,7 @@ import {
|
|
|
96
96
|
summarizePreferenceMemory,
|
|
97
97
|
trialTraceFromMultiShotTrial,
|
|
98
98
|
withAssignedFeedbackSplit
|
|
99
|
-
} from "./chunk-
|
|
99
|
+
} from "./chunk-3HYQXPC2.js";
|
|
100
100
|
import {
|
|
101
101
|
assertReleaseConfidence,
|
|
102
102
|
bootstrapCi,
|
|
@@ -107,7 +107,7 @@ import {
|
|
|
107
107
|
} from "./chunk-LGAPK7NA.js";
|
|
108
108
|
import {
|
|
109
109
|
runEvalCampaign
|
|
110
|
-
} from "./chunk-
|
|
110
|
+
} from "./chunk-RL6TERL2.js";
|
|
111
111
|
import {
|
|
112
112
|
LlmCallError,
|
|
113
113
|
LlmClient,
|
|
@@ -121,21 +121,24 @@ import {
|
|
|
121
121
|
stripFencedJson
|
|
122
122
|
} from "./chunk-VXNVVBZO.js";
|
|
123
123
|
import {
|
|
124
|
+
AGENT_PROFILE_KINDS,
|
|
124
125
|
AgentProfileCellValidationError,
|
|
125
126
|
RunRecordValidationError,
|
|
126
127
|
agentProfileCellHashMaterial,
|
|
127
128
|
agentProfileCellKey,
|
|
128
129
|
assertRunAgentProfileCell,
|
|
129
130
|
buildAgentProfileCell,
|
|
131
|
+
buildSandboxAgentProfileCell,
|
|
130
132
|
groupRunsByAgentProfileCell,
|
|
131
133
|
isRunRecord,
|
|
132
134
|
parseRunRecordSafe,
|
|
133
135
|
requireAgentProfileCell,
|
|
134
136
|
roundTripRunRecord,
|
|
137
|
+
toAgentProfileJson,
|
|
135
138
|
validateAgentProfileCell,
|
|
136
139
|
validateRunRecord,
|
|
137
140
|
verifyAgentProfileCell
|
|
138
|
-
} from "./chunk-
|
|
141
|
+
} from "./chunk-TSPOEDM3.js";
|
|
139
142
|
import {
|
|
140
143
|
evaluateInterimReleaseConfidence,
|
|
141
144
|
pairedEvalueSequence
|
|
@@ -333,7 +336,7 @@ var RunCritic = class {
|
|
|
333
336
|
);
|
|
334
337
|
const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
|
|
335
338
|
if (!success) notes.push("run did not complete with pass=true");
|
|
336
|
-
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((
|
|
339
|
+
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum3, span) => sum3 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
|
|
337
340
|
const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
|
|
338
341
|
trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
|
|
339
342
|
) : void 0;
|
|
@@ -348,7 +351,7 @@ var RunCritic = class {
|
|
|
348
351
|
(span) => typeof span.testsTotal === "number" && span.testsTotal > 0
|
|
349
352
|
);
|
|
350
353
|
const testReality = sandboxTests.length ? sandboxTests.reduce(
|
|
351
|
-
(
|
|
354
|
+
(sum3, span) => sum3 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
|
|
352
355
|
0
|
|
353
356
|
) / sandboxTests.length : toolSpans2.some(
|
|
354
357
|
(span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
|
|
@@ -370,7 +373,7 @@ var RunCritic = class {
|
|
|
370
373
|
const costUsd = trace.budget.length ? Math.max(
|
|
371
374
|
...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
|
|
372
375
|
0
|
|
373
|
-
) : llmSpans2.reduce((
|
|
376
|
+
) : llmSpans2.reduce((sum3, span) => sum3 + (span.costUsd ?? 0), 0);
|
|
374
377
|
const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
|
|
375
378
|
return {
|
|
376
379
|
success,
|
|
@@ -1960,12 +1963,12 @@ function allocateBudget(policy, args) {
|
|
|
1960
1963
|
return policy.totalUsd / Math.max(1, args.runningCount);
|
|
1961
1964
|
}
|
|
1962
1965
|
/**
 * Adds up the `metadata.cost_usd` value of every finding.
 * Findings whose cost is missing, non-numeric, or non-finite contribute nothing.
 *
 * @param {Iterable<{metadata?: {cost_usd?: unknown}}>} findings
 * @returns {number} Total cost; 0 when no finding carries a usable cost.
 */
function sumFindingCost(findings) {
  let total = 0;
  for (const finding of findings) {
    const cost = finding.metadata?.cost_usd;
    if (typeof cost !== "number" || !Number.isFinite(cost)) continue;
    total += cost;
  }
  return total;
}
|
|
1970
1973
|
function selectPriorFindings(source, analystId) {
|
|
1971
1974
|
if (!source) return void 0;
|
|
@@ -2184,10 +2187,10 @@ function ghCliClient(opts = {}) {
|
|
|
2184
2187
|
await exec("git", ["branch", "-D", input.branchName], { cwd });
|
|
2185
2188
|
await run("git", ["checkout", "-b", input.branchName]);
|
|
2186
2189
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
2187
|
-
const { dirname:
|
|
2190
|
+
const { dirname: dirname6, join: join4, resolve } = await import("path");
|
|
2188
2191
|
for (const change of input.fileChanges) {
|
|
2189
2192
|
const abs = resolve(cwd, change.path);
|
|
2190
|
-
await mkdir(
|
|
2193
|
+
await mkdir(dirname6(abs), { recursive: true });
|
|
2191
2194
|
await writeFile(abs, change.contents, "utf8");
|
|
2192
2195
|
await run("git", ["add", join4(change.path)]);
|
|
2193
2196
|
}
|
|
@@ -3722,6 +3725,178 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
|
|
|
3722
3725
|
};
|
|
3723
3726
|
}
|
|
3724
3727
|
|
|
3728
|
+
// src/pr-review-benchmark.ts
|
|
3729
|
+
// Default relative weight of each PR-review score dimension.
// Recall carries the largest weight, followed by precision.
var DEFAULT_PR_REVIEW_SCORE_WEIGHTS = {
  recall: 4,
  precision: 2,
  actionability: 1.5,
  severityCalibration: 1,
  lowNoise: 1
};
|
|
3736
|
+
/**
 * Collects the comments of an audit case that were produced by one source.
 *
 * @param {{comments: Array<{source: unknown}>}} auditCase
 * @param {unknown} source Compared with strict equality against `comment.source`.
 * @returns {Array} New array holding the matching comments in their original order.
 */
function commentsForSource(auditCase, source) {
  const selected = [];
  for (const comment of auditCase.comments) {
    if (comment.source === source) selected.push(comment);
  }
  return selected;
}
|
|
3739
|
+
/**
 * Scores the comments that a single source left on an audit case.
 *
 * @param {object} auditCase Audit case holding `comments` and `referenceFindings`.
 * @param {unknown} source Source whose comments are scored.
 * @param {object} [weights] Optional per-dimension weight overrides.
 * @returns {object} The score object produced by `scorePrReviewComments`.
 */
function scorePrReviewSource(auditCase, source, weights = {}) {
  const sourceComments = commentsForSource(auditCase, source);
  return scorePrReviewComments(auditCase, sourceComments, source, weights);
}
|
|
3742
|
+
/**
 * Scores a set of review comments against the reference findings of an audit case.
 *
 * Produces five dimensions (recall, precision, actionability, severity
 * calibration, low noise), their weighted aggregate, and explanatory notes.
 *
 * @param {object} auditCase Provides `id` and `referenceFindings`.
 * @param {Array<object>} comments Comments to score.
 * @param {unknown} source Label copied onto the result.
 * @param {object} [weights] Optional per-dimension weight overrides.
 * @returns {object} Score record for this case/source pair.
 */
function scorePrReviewComments(auditCase, comments, source, weights = {}) {
  const references = auditCase.referenceFindings;
  const referenceCount = references.length;
  const commentCount = comments.length;

  const matchedFindings = matchReferenceFindings(references, comments);
  const matchedCommentIds = new Set(matchedFindings.map((match) => match.commentId));

  const positiveComments = comments.filter((comment) => isPositiveOutcome(comment.outcome));
  const negativeComments = comments.filter((comment) => isNegativeOutcome(comment.outcome));
  const actionableComments = comments.filter(isActionableComment);
  const severityComments = comments.filter((comment) => comment.severity);
  const severityAligned = severityComments.filter(
    (comment) => isSeverityAligned(comment, references, matchedFindings)
  );

  // Recall: share of reference findings that were matched. With no references,
  // silence is perfect and any comment scores zero.
  let recall;
  if (referenceCount) {
    recall = matchedFindings.length / referenceCount;
  } else {
    recall = commentCount === 0 ? 1 : 0;
  }

  // Precision: prefer labelled outcomes; fall back to the matched share of
  // comments; with no comments at all, only a reference-free case is perfect.
  const labelledCount = positiveComments.length + negativeComments.length;
  let precision2;
  if (labelledCount > 0) {
    precision2 = positiveComments.length / labelledCount;
  } else if (commentCount > 0) {
    precision2 = matchedCommentIds.size / commentCount;
  } else {
    precision2 = referenceCount === 0 ? 1 : 0;
  }

  const actionability = commentCount ? actionableComments.length / commentCount : 1;

  // Without any severity labels: neutral 0.5 if something matched, else 1.
  let severityCalibration;
  if (severityComments.length) {
    severityCalibration = severityAligned.length / severityComments.length;
  } else {
    severityCalibration = matchedFindings.length ? 0.5 : 1;
  }

  const lowNoise = commentCount ? 1 - negativeComments.length / commentCount : 1;

  const aggregate2 = aggregatePrReviewScore(
    { recall, precision: precision2, actionability, severityCalibration, lowNoise },
    weights
  );
  const notes = buildScoreNotes({
    comments,
    referenceCount,
    matchedFindings,
    negativeComments,
    actionableComments
  });

  return {
    caseId: auditCase.id,
    source,
    commentCount,
    referenceCount,
    matchedFindings,
    recall,
    precision: precision2,
    actionability,
    severityCalibration,
    lowNoise,
    aggregate: aggregate2,
    notes
  };
}
|
|
3783
|
+
/**
 * Rolls per-case scores up into one summary row per source.
 *
 * @param {Array<object>} scores Score records as returned by `scorePrReviewComments`.
 * @returns {Array<object>} One row per source, sorted by descending `aggregateMean`.
 */
function summarizePrReviewBenchmark(scores) {
  // Group by source; the Map keeps sources in first-seen order.
  const grouped = new Map();
  for (const score of scores) {
    const bucket = grouped.get(score.source);
    if (bucket) bucket.push(score);
    else grouped.set(score.source, [score]);
  }

  const rows = [];
  for (const [source, sourceScores] of grouped) {
    const meanOf = (pick) => mean(sourceScores.map(pick));
    rows.push({
      source,
      caseCount: sourceScores.length,
      commentCount: sum(sourceScores.map((score) => score.commentCount)),
      aggregateMean: meanOf((score) => score.aggregate),
      recallMean: meanOf((score) => score.recall),
      precisionMean: meanOf((score) => score.precision),
      actionabilityMean: meanOf((score) => score.actionability),
      severityCalibrationMean: meanOf((score) => score.severityCalibration),
      lowNoiseMean: meanOf((score) => score.lowNoise)
    });
  }
  rows.sort((a, b) => b.aggregateMean - a.aggregateMean);
  return rows;
}
|
|
3800
|
+
/**
 * Combines the five PR-review dimensions into one weighted average.
 *
 * Each weight is taken from `weights` when it is a finite number, otherwise
 * from `DEFAULT_PR_REVIEW_SCORE_WEIGHTS`. Negative weights count as 0 in both
 * the numerator and the denominator, so the two can no longer disagree. Keys
 * in `weights` that are not known dimensions are ignored rather than diluting
 * the result.
 *
 * @param {object} dimensions Values for recall, precision, actionability,
 *   severityCalibration and lowNoise; each is passed through `clamp01`.
 * @param {object} [weights] Optional per-dimension weight overrides.
 * @returns {number} Weighted mean of the clamped dimensions, or 0 when the
 *   effective weights sum to 0.
 */
function aggregatePrReviewScore(dimensions, weights = {}) {
  let weightSum = 0;
  let weightedTotal = 0;
  // Iterate the default table so only known dimensions take part, in a fixed order.
  for (const [dimension, fallback] of Object.entries(DEFAULT_PR_REVIEW_SCORE_WEIGHTS)) {
    const override = weights[dimension];
    const raw = typeof override === "number" && Number.isFinite(override) ? override : fallback;
    const weight = Math.max(0, raw);
    weightSum += weight;
    weightedTotal += weight * clamp01(dimensions[dimension]);
  }
  if (weightSum <= 0) return 0;
  return weightedTotal / weightSum;
}
|
|
3806
|
+
/**
 * Greedily pairs each reference finding with its best-scoring unused comment.
 *
 * References are processed in order. A comment can be claimed by at most one
 * reference, and only candidates scoring at least 0.55 qualify. On ties the
 * earliest comment wins.
 *
 * @param {Array<object>} references Reference findings.
 * @param {Array<object>} comments Candidate comments.
 * @returns {Array<{referenceId: unknown, commentId: unknown, score: number}>}
 */
function matchReferenceFindings(references, comments) {
  const claimedCommentIds = new Set();
  const matches = [];
  for (const reference of references) {
    let found = false;
    let bestComment;
    let bestScore = 0;
    for (const comment of comments) {
      if (claimedCommentIds.has(comment.id)) continue;
      const score = matchScore(reference, comment);
      if (!(score >= 0.55)) continue;
      // Strict comparison keeps the first of several equally good candidates.
      if (!found || score > bestScore) {
        found = true;
        bestComment = comment;
        bestScore = score;
      }
    }
    if (!found) continue;
    claimedCommentIds.add(bestComment.id);
    matches.push({ referenceId: reference.id, commentId: bestComment.id, score: bestScore });
  }
  return matches;
}
|
|
3818
|
+
/**
 * Heuristic similarity between a reference finding and a review comment.
 *
 * Adds 1 when the reference explicitly lists the comment id, 0.35 for an
 * identical normalised path, 0.15 when the lines are within 3 of each other,
 * and up to 0.5 for the fraction of reference terms (keywords plus title
 * tokens, at least 3 characters after normalisation) found in the comment body.
 *
 * @param {object} reference Reference finding.
 * @param {object} comment Review comment.
 * @returns {number} The accumulated score passed through `clamp01`.
 */
function matchScore(reference, comment) {
  let total = 0;

  if (reference.sourceCommentIds?.includes(comment.id)) total += 1;

  const bothHavePath = reference.path && comment.path;
  if (bothHavePath && normalizePath(reference.path) === normalizePath(comment.path)) {
    total += 0.35;
  }

  const bothHaveLine = reference.line && comment.line;
  if (bothHaveLine && Math.abs(reference.line - comment.line) <= 3) total += 0.15;

  // Distinct, normalised reference terms; very short terms are dropped.
  const wantedTerms = new Set();
  for (const rawTerm of [...reference.keywords ?? [], ...tokenize(reference.title)]) {
    const term = normalizeTerm(rawTerm);
    if (term.length >= 3) wantedTerms.add(term);
  }

  if (wantedTerms.size > 0) {
    const bodyTerms = new Set(tokenize(comment.body).map((token) => normalizeTerm(token)));
    let hits = 0;
    for (const term of wantedTerms) {
      if (bodyTerms.has(term)) hits += 1;
    }
    total += 0.5 * (hits / wantedTerms.size);
  }

  return clamp01(total);
}
|
|
3834
|
+
/**
 * Decides whether a comment is concrete enough to act on.
 *
 * A comment qualifies when it is anchored (it has a `path`, or its body
 * mentions a code location word) and its body contains an action verb.
 *
 * @param {{body: string, path?: string}} comment
 * @returns {boolean}
 */
function isActionableComment(comment) {
  const text = comment.body.trim();
  const anchored = Boolean(comment.path) || /\b(file|line|function|method|class|module|test|migration)\b/i.test(text);
  if (!anchored) return false;
  const hasActionVerb = /\b(fix|change|add|remove|guard|check|reject|validate|test|assert|return|throw|fail|block)\b/i.test(text);
  return hasActionVerb;
}
|
|
3843
|
+
/**
 * Checks whether a comment's severity label is plausible.
 *
 * An unmatched comment is aligned only when it is labelled "nit" or "low".
 * A matched comment is aligned when its rank is within one step of the
 * matched reference's rank.
 *
 * @param {{id: unknown, severity?: string}} comment
 * @param {Array<{id: unknown, severity: string}>} references
 * @param {Array<{commentId: unknown, referenceId: unknown}>} matches
 * @returns {boolean}
 */
function isSeverityAligned(comment, references, matches) {
  const label = comment.severity;
  if (!label) return false;

  const pairing = matches.find((candidate) => candidate.commentId === comment.id);
  if (!pairing) {
    return label === "nit" || label === "low";
  }

  const matchedReference = references.find((candidate) => candidate.id === pairing.referenceId);
  if (!matchedReference) return false;

  const rankGap = severityRank(label) - severityRank(matchedReference.severity);
  return Math.abs(rankGap) <= 1;
}
|
|
3851
|
+
/**
 * Builds human-readable notes that explain weak spots in a score.
 *
 * @param {{comments: Array, referenceCount: number, matchedFindings: Array,
 *   negativeComments: Array, actionableComments: Array}} input
 * @returns {string[]} Zero to three notes, in a fixed order.
 */
function buildScoreNotes(input) {
  const missedEveryReference = input.referenceCount > 0 && input.matchedFindings.length === 0;
  const negativeCount = input.negativeComments.length;
  const nothingActionable = input.comments.length > 0 && input.actionableComments.length === 0;

  return [
    ...(missedEveryReference ? ["no reference findings matched"] : []),
    ...(negativeCount > 0 ? [`${negativeCount} comment(s) labelled rejected/duplicate/noise`] : []),
    ...(nothingActionable ? ["comments were not actionable enough for a PR reviewer benchmark"] : [])
  ];
}
|
|
3864
|
+
/**
 * Tells whether a comment outcome counts in the reviewer's favour.
 *
 * @param {unknown} outcome
 * @returns {boolean} True for "accepted" or "fixed".
 */
function isPositiveOutcome(outcome) {
  return ["accepted", "fixed"].includes(outcome);
}
|
|
3867
|
+
/**
 * Tells whether a comment outcome counts against the reviewer.
 *
 * @param {unknown} outcome
 * @returns {boolean} True for "rejected", "duplicate" or "noise".
 */
function isNegativeOutcome(outcome) {
  return ["rejected", "duplicate", "noise"].includes(outcome);
}
|
|
3870
|
+
/**
 * Maps a severity label to a numeric rank (nit = 1 up to critical = 5).
 *
 * @param {unknown} severity
 * @returns {number|undefined} The rank, or undefined for an unrecognised label.
 */
function severityRank(severity) {
  const ranks = new Map([
    ["critical", 5],
    ["high", 4],
    ["medium", 3],
    ["low", 2],
    ["nit", 1]
  ]);
  return ranks.get(severity);
}
|
|
3884
|
+
/**
 * Splits text into tokens made of letters, digits and the characters `_ . $ / -`.
 *
 * @param {string} input
 * @returns {string[]} Tokens in order of appearance; empty when none are found.
 */
function tokenize(input) {
  const tokens = [];
  for (const found of input.matchAll(/[a-zA-Z0-9_.$/-]+/g)) {
    tokens.push(found[0]);
  }
  return tokens;
}
|
|
3887
|
+
/**
 * Lower-cases a term and strips leading and trailing characters that are not
 * lowercase letters, digits or underscores.
 *
 * @param {string} input
 * @returns {string}
 */
function normalizeTerm(input) {
  const lowered = input.toLowerCase();
  const trimmed = lowered.replace(/^[^a-z0-9_]+|[^a-z0-9_]+$/g, "");
  return trimmed;
}
|
|
3890
|
+
/**
 * Removes a leading "./" (with any extra slashes) from a path.
 *
 * @param {string} input
 * @returns {string}
 */
function normalizePath(input) {
  const withoutDotSlash = input.replace(/^\.\/+/, "");
  return withoutDotSlash;
}
|
|
3893
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean(values) {
  if (!values.length) return 0;
  return sum(values) / values.length;
}
|
|
3896
|
+
/**
 * Adds up a list of numbers.
 *
 * @param {number[]} values
 * @returns {number} The total, or 0 for an empty list.
 */
function sum(values) {
  let total = 0;
  values.forEach((value) => {
    total = total + value;
  });
  return total;
}
|
|
3899
|
+
|
|
3725
3900
|
// src/production-loop.ts
|
|
3726
3901
|
async function runProductionLoop(opts) {
|
|
3727
3902
|
validate2(opts);
|
|
@@ -5217,14 +5392,14 @@ async function runHarnessExperiment(config) {
|
|
|
5217
5392
|
const score = config.score ?? ((trace) => critic.scoreTrace(trace));
|
|
5218
5393
|
const results = await mapLimit(jobs, config.parallelism ?? 1, async (request) => {
|
|
5219
5394
|
const trace = await config.adapter.run(request);
|
|
5220
|
-
const
|
|
5395
|
+
const runScore2 = await score(trace, request);
|
|
5221
5396
|
const result = {
|
|
5222
5397
|
variant: request.variant,
|
|
5223
5398
|
scenario: request.scenario,
|
|
5224
5399
|
trialIndex: request.trialIndex,
|
|
5225
5400
|
trace,
|
|
5226
|
-
score:
|
|
5227
|
-
aggregate: aggregateRunScore(
|
|
5401
|
+
score: runScore2,
|
|
5402
|
+
aggregate: aggregateRunScore(runScore2, config.weights)
|
|
5228
5403
|
};
|
|
5229
5404
|
await config.onResult?.(result);
|
|
5230
5405
|
return result;
|
|
@@ -5251,10 +5426,10 @@ function summarizeHarnessResults(results) {
|
|
|
5251
5426
|
return {
|
|
5252
5427
|
variant,
|
|
5253
5428
|
runs,
|
|
5254
|
-
aggregateMean:
|
|
5255
|
-
passRate:
|
|
5256
|
-
costUsdMean:
|
|
5257
|
-
wallSecondsMean:
|
|
5429
|
+
aggregateMean: mean2(runs.map((r) => r.aggregate)),
|
|
5430
|
+
passRate: mean2(runs.map((r) => r.score.success)),
|
|
5431
|
+
costUsdMean: mean2(runs.map((r) => r.score.costUsd)),
|
|
5432
|
+
wallSecondsMean: mean2(runs.map((r) => r.score.wallSeconds)),
|
|
5258
5433
|
scoreMean: meanRunScore(runs.map((r) => r.score))
|
|
5259
5434
|
};
|
|
5260
5435
|
}).sort((a, b) => b.aggregateMean - a.aggregateMean);
|
|
@@ -5291,22 +5466,22 @@ async function mapLimit(items, limit, fn) {
|
|
|
5291
5466
|
);
|
|
5292
5467
|
return results;
|
|
5293
5468
|
}
|
|
5294
|
-
function
|
|
5295
|
-
return values.length ? values.reduce((
|
|
5469
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean2(values) {
  if (!values.length) return 0;
  let total = 0;
  values.forEach((value) => {
    total = total + value;
  });
  return total / values.length;
}
|
|
5297
5472
|
/**
 * Averages a list of run scores field by field.
 *
 * Every numeric field is averaged with `mean2`. `notes` is the concatenation
 * of all per-run notes (missing notes contribute nothing).
 *
 * @param {Array<object>} scores Run score objects.
 * @returns {object} A run-score-shaped object holding the means.
 */
function meanRunScore(scores) {
  const numericFields = [
    "success",
    "goalProgress",
    "repoGroundedness",
    "driftPenalty",
    "toolUseQuality",
    "patchQuality",
    "testReality",
    "finalGate",
    "reviewerBlockers",
    "costUsd",
    "wallSeconds"
  ];
  const averaged = {};
  for (const field of numericFields) {
    averaged[field] = mean2(scores.map((s) => s[field]));
  }
  averaged.notes = scores.flatMap((s) => s.notes ?? []);
  return averaged;
}
|
|
@@ -5645,7 +5820,7 @@ function rankRows(rows, weights) {
|
|
|
5645
5820
|
}
|
|
5646
5821
|
return [...buckets.entries()].map(([variantId, values]) => ({
|
|
5647
5822
|
variantId,
|
|
5648
|
-
mean: values.reduce((
|
|
5823
|
+
mean: values.reduce((sum3, value) => sum3 + value, 0) / values.length,
|
|
5649
5824
|
runs: values.length
|
|
5650
5825
|
})).sort((a, b) => b.mean - a.mean);
|
|
5651
5826
|
}
|
|
@@ -5815,6 +5990,22 @@ var BudgetGuard = class {
|
|
|
5815
5990
|
}
|
|
5816
5991
|
};
|
|
5817
5992
|
|
|
5993
|
+
// src/agent-profile.ts
|
|
5994
|
+
import { createHash as createHash2 } from "crypto";
|
|
5995
|
+
/**
 * Computes a SHA-256 hex digest over the behaviour-relevant profile fields:
 * trimmed model, sorted skills, prompt version, sorted tools and metadata.
 *
 * @param {object} profile Agent profile; `model` must be a non-blank string.
 * @returns {string} Hex-encoded SHA-256 digest.
 * @throws {ValidationError} When the profile has no usable model.
 */
function agentProfileHash(profile) {
  const hasModel = typeof profile.model === "string" && profile.model.trim().length > 0;
  if (!hasModel) {
    throw new ValidationError(`AgentProfile "${profile.id}" has no model \u2014 cannot hash`);
  }
  // Sort copies so that neither list order nor the caller's arrays are affected.
  const sortedCopy = (list) => [...list ?? []].sort();
  const behaviour = {
    model: profile.model.trim(),
    skills: sortedCopy(profile.skills),
    promptVersion: profile.promptVersion ?? null,
    tools: sortedCopy(profile.tools),
    metadata: profile.metadata ?? {}
  };
  const serialized = JSON.stringify(canonicalize(behaviour));
  const hasher = createHash2("sha256");
  hasher.update(serialized);
  return hasher.digest("hex");
}
|
|
6008
|
+
|
|
5818
6009
|
// src/cost-tracker.ts
|
|
5819
6010
|
var CostTracker = class {
|
|
5820
6011
|
byScenario = /* @__PURE__ */ new Map();
|
|
@@ -6221,6 +6412,194 @@ function isObject(v) {
|
|
|
6221
6412
|
return typeof v === "object" && v !== null && !Array.isArray(v);
|
|
6222
6413
|
}
|
|
6223
6414
|
|
|
6415
|
+
// src/scorecard.ts
|
|
6416
|
+
import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync3 } from "fs";
|
|
6417
|
+
import { dirname as dirname2 } from "path";
|
|
6418
|
+
/**
 * Median of a list of numbers; the input array is not modified.
 *
 * @param {number[]} xs
 * @returns {number} The middle value (mean of the two middle values for an
 *   even count), or 0 for an empty list.
 */
function median(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  const ordered = Array.from(xs).sort((lo, hi) => lo - hi);
  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) return ordered[upper];
  return (ordered[upper - 1] + ordered[upper]) / 2;
}
|
|
6424
|
+
/**
 * Picks the score used for a run: the holdout score when present,
 * otherwise the search score.
 *
 * @param {{outcome: {holdoutScore?: number|null, searchScore?: number}}} run
 * @returns {number|undefined}
 */
function runScore(run) {
  const outcome = run.outcome;
  const holdout = outcome.holdoutScore;
  if (holdout !== null && holdout !== undefined) return holdout;
  return outcome.searchScore;
}
|
|
6427
|
+
/**
 * Averages `outcome.judgeScores.perDimMean` across runs, one mean per dimension.
 *
 * Runs without per-dimension scores are skipped, as are non-finite values.
 *
 * @param {Iterable<object>} runs
 * @returns {Record<string, number>|undefined} Mean per dimension, or undefined
 *   when no run contributed a finite value.
 */
function aggregatePerDimension(runs) {
  const tallies = new Map();
  for (const { outcome } of runs) {
    const perDim = outcome.judgeScores?.perDimMean;
    if (!perDim) continue;
    for (const dim of Object.keys(perDim)) {
      const value = perDim[dim];
      if (!Number.isFinite(value)) continue;
      let tally = tallies.get(dim);
      if (!tally) {
        tally = { total: 0, count: 0 };
        tallies.set(dim, tally);
      }
      tally.total += value;
      tally.count += 1;
    }
  }
  if (tallies.size === 0) return void 0;
  const means = {};
  tallies.forEach((tally, dim) => {
    means[dim] = tally.total / tally.count;
  });
  return means;
}
|
|
6445
|
+
/**
 * Turns a batch of runs into scorecard lines, one per scenario.
 *
 * Runs without a `scenarioId` are ignored. Within a scenario only runs with a
 * defined score are kept; a scenario with no scored run yields no line. The
 * composite is the median of the kept scores.
 *
 * @param {Iterable<object>} runs Run records.
 * @param {{profile: object, commitSha: unknown, timestamp?: string}} opts
 *   `timestamp` defaults to the current time as an ISO string.
 * @returns {Array<object>} Lines of shape `{scenarioId, profileHash, model, profile, entry}`.
 */
function recordRuns(runs, opts) {
  const profileHash = agentProfileHash(opts.profile);
  const timestamp = opts.timestamp ?? new Date().toISOString();

  // Bucket runs per scenario; the Map keeps scenarios in first-seen order.
  const grouped = new Map();
  for (const run of runs) {
    const { scenarioId } = run;
    if (!scenarioId) continue;
    if (!grouped.has(scenarioId)) grouped.set(scenarioId, []);
    grouped.get(scenarioId).push(run);
  }

  const lines = [];
  grouped.forEach((scenarioRuns, scenarioId) => {
    const scores = [];
    const runIds = [];
    for (const run of scenarioRuns) {
      const score = runScore(run);
      if (score === void 0) continue;
      scores.push(score);
      runIds.push(run.runId);
    }
    if (scores.length === 0) return;

    const entry = {
      commitSha: opts.commitSha,
      timestamp,
      scores,
      composite: median(scores),
      runIds
    };
    // Per-dimension means use every run of the scenario, scored or not.
    const perDimension = aggregatePerDimension(scenarioRuns);
    if (perDimension) entry.perDimension = perDimension;

    lines.push({
      scenarioId,
      profileHash,
      model: opts.profile.model,
      profile: opts.profile,
      entry
    });
  });
  return lines;
}
|
|
6480
|
+
/**
 * Appends scorecard lines to a log file, one JSON document per line.
 *
 * The parent directory is created if needed. Nothing is written, and no
 * directory is created, when `lines` is empty.
 *
 * @param {string} logPath Path of the log file.
 * @param {Array<object>} lines Lines to serialise and append.
 * @returns {void}
 */
function appendScorecard(logPath, lines) {
  if (lines.length === 0) return;
  mkdirSync2(dirname2(logPath), { recursive: true });
  const payload = lines.map((line) => JSON.stringify(line)).join("\n");
  appendFileSync2(logPath, `${payload}\n`);
}
|
|
6486
|
+
/**
 * Builds scorecard lines for a batch of runs and appends them to the log file.
 *
 * @param {string} logPath Path of the log file.
 * @param {Iterable<object>} runs Run records.
 * @param {object} opts Options forwarded to `recordRuns`.
 * @returns {Array<object>} The lines that were built and passed to `appendScorecard`.
 */
function recordRunsToScorecard(logPath, runs, opts) {
  const recorded = recordRuns(runs, opts);
  appendScorecard(logPath, recorded);
  return recorded;
}
|
|
6491
|
+
/**
 * Reads a scorecard log (one JSON document per line) and groups the entries
 * into cells keyed by scenario and profile hash.
 *
 * Loading is best-effort: blank lines, lines that are not valid JSON, and
 * lines missing `scenarioId`, `profileHash` or `entry` are skipped. Each
 * cell's timeline is sorted by timestamp. Entries without a string timestamp
 * sort first instead of making the sort throw.
 *
 * @param {string} logPath Path of the log file.
 * @returns {{cells: Array<object>, profiles: Record<string, object>}} Empty
 *   result when the file does not exist.
 */
function loadScorecard(logPath) {
  if (!existsSync4(logPath)) return { cells: [], profiles: {} };
  const cells = /* @__PURE__ */ new Map();
  const profiles = {};
  for (const raw of readFileSync3(logPath, "utf8").split("\n")) {
    const line = raw.trim();
    if (!line) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Deliberate: one corrupt line must not make the rest of the log unreadable.
      continue;
    }
    if (!parsed?.scenarioId || !parsed.profileHash || !parsed.entry) continue;
    const key = `${parsed.scenarioId}::${parsed.profileHash}`;
    let cell = cells.get(key);
    if (!cell) {
      cell = {
        scenarioId: parsed.scenarioId,
        profileHash: parsed.profileHash,
        model: parsed.model,
        timeline: []
      };
      cells.set(key, cell);
    }
    cell.timeline.push(parsed.entry);
    // The last line seen for a hash wins.
    if (parsed.profile) profiles[parsed.profileHash] = parsed.profile;
  }
  // An entry logged without a string timestamp must not crash the comparator.
  const sortKey = (entry) => typeof entry.timestamp === "string" ? entry.timestamp : "";
  for (const cell of cells.values()) {
    cell.timeline.sort((a, b) => sortKey(a).localeCompare(sortKey(b)));
  }
  return { cells: [...cells.values()], profiles };
}
|
|
6524
|
+
/**
 * Compares the latest entry of every scorecard cell with a baseline entry.
 *
 * The baseline is the previous timeline entry, or, when `opts.baselineCommit`
 * is set, the most recent other entry with that commit. A cell without a
 * baseline gets the verdict "new". When both entries have at least two scores
 * the verdict needs |Cohen's d| >= minEffect and a Welch p-value <= maxP;
 * otherwise it needs a composite delta of at least minDelta.
 *
 * @param {{cells: Array<object>}} scorecard
 * @param {{minEffect?: number, maxP?: number, minDelta?: number, baselineCommit?: string}} [opts]
 *   Defaults: minEffect 0.5, maxP 0.05, minDelta 0.05.
 * @returns {{cells: Array<object>, summary: {improved: number, regressed: number, flat: number, new: number}}}
 */
function diffScorecard(scorecard, opts = {}) {
  const minEffect = opts.minEffect ?? 0.5;
  const maxP = opts.maxP ?? 0.05;
  const minDelta = opts.minDelta ?? 0.05;

  // Walk backwards so the most recent qualifying entry wins.
  const findBaseline = (timeline, current) => {
    if (!opts.baselineCommit) return timeline[timeline.length - 2];
    for (let i = timeline.length - 1; i >= 0; i -= 1) {
      const entry = timeline[i];
      if (entry.commitSha === opts.baselineCommit && entry !== current) return entry;
    }
    return void 0;
  };
  const directionOf = (delta) => delta > 0 ? "improved" : "regressed";

  const cells = [];
  const summary = { improved: 0, regressed: 0, flat: 0, new: 0 };
  for (const cell of scorecard.cells) {
    const { timeline } = cell;
    if (timeline.length === 0) continue;
    const current = timeline[timeline.length - 1];
    const baseline = findBaseline(timeline, current);
    const shared = {
      scenarioId: cell.scenarioId,
      profileHash: cell.profileHash,
      model: cell.model,
      current: current.composite,
      currentCommit: current.commitSha
    };

    if (!baseline) {
      cells.push({
        ...shared,
        verdict: "new",
        baseline: null,
        delta: null,
        cohensD: null,
        pValue: null,
        baselineCommit: null
      });
      summary.new += 1;
      continue;
    }

    const delta = current.composite - baseline.composite;
    let effect = null;
    let pValue = null;
    let changed;
    if (baseline.scores.length >= 2 && current.scores.length >= 2) {
      effect = cohensD(baseline.scores, current.scores);
      const t = welchsTTest(baseline.scores, current.scores);
      pValue = Number.isFinite(t.p) ? t.p : null;
      changed = Math.abs(effect) >= minEffect && pValue !== null && pValue <= maxP;
    } else {
      // Too few samples for statistics: fall back to a raw delta threshold.
      changed = Math.abs(delta) >= minDelta;
    }
    const verdict = changed ? directionOf(delta) : "flat";

    cells.push({
      ...shared,
      verdict,
      baseline: baseline.composite,
      delta,
      cohensD: effect,
      pValue,
      baselineCommit: baseline.commitSha
    });
    summary[verdict] += 1;
  }
  return { cells, summary };
}
|
|
6581
|
+
/**
 * Renders a scorecard diff as plain text: a summary line followed by one line
 * per regressed or improved cell. Regressions come first; within a verdict,
 * cells are ordered by descending absolute delta.
 *
 * @param {{cells: Array<object>, summary: object}} diff Result of `diffScorecard`.
 * @returns {string} Newline-joined report.
 */
function formatScorecardDiff(diff) {
  const { summary } = diff;
  const fmt = (n) => n.toFixed(3);
  const header = `Scorecard: ${summary.regressed} regressed \xB7 ${summary.improved} improved \xB7 ${summary.flat} flat \xB7 ${summary.new} new`;

  const byVerdictThenMagnitude = (a, b) => {
    if (a.verdict !== b.verdict) return a.verdict === "regressed" ? -1 : 1;
    return Math.abs(b.delta ?? 0) - Math.abs(a.delta ?? 0);
  };
  const describe = (cell) => {
    const mark = cell.verdict === "regressed" ? "REGRESSED" : "improved";
    let deltaStr = "\u2014";
    if (cell.delta !== null) {
      deltaStr = cell.delta >= 0 ? `+${fmt(cell.delta)}` : fmt(cell.delta);
    }
    let stat = "";
    if (cell.cohensD !== null) {
      const pPart = cell.pValue !== null ? `, p=${cell.pValue.toFixed(3)}` : "";
      stat = ` (d=${cell.cohensD.toFixed(2)}${pPart})`;
    }
    return ` ${mark} ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)} ${fmt(cell.baseline ?? 0)} \u2192 ${fmt(cell.current)} ${deltaStr}${stat}`;
  };

  const rows = diff.cells.filter((c) => c.verdict === "regressed" || c.verdict === "improved").sort(byVerdictThenMagnitude).map(describe);
  return [header, ...rows].join("\n");
}
|
|
6602
|
+
|
|
6224
6603
|
// src/series-convergence.ts
|
|
6225
6604
|
function analyzeSeries(values, options = {}) {
|
|
6226
6605
|
const window = options.window ?? 5;
|
|
@@ -6230,10 +6609,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
6230
6609
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
6231
6610
|
}
|
|
6232
6611
|
const tail = values.slice(-window);
|
|
6233
|
-
const
|
|
6234
|
-
const variance = tail.reduce((acc, v) => acc + (v -
|
|
6612
|
+
const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
6613
|
+
const variance = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
|
|
6235
6614
|
const stdDev = Math.sqrt(variance);
|
|
6236
|
-
const refMean = Math.abs(
|
|
6615
|
+
const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
|
|
6237
6616
|
const cv = stdDev / refMean;
|
|
6238
6617
|
const stable = tail.length >= window && cv <= stableCv;
|
|
6239
6618
|
let tailRun = 0;
|
|
@@ -6254,7 +6633,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
6254
6633
|
} else {
|
|
6255
6634
|
state = "noisy";
|
|
6256
6635
|
}
|
|
6257
|
-
return { state, windowMean:
|
|
6636
|
+
return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
|
|
6258
6637
|
}
|
|
6259
6638
|
|
|
6260
6639
|
// src/slo.ts
|
|
@@ -7052,12 +7431,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
7052
7431
|
variantScores.push({ mutator: id, score, mutated });
|
|
7053
7432
|
all.push(score);
|
|
7054
7433
|
}
|
|
7055
|
-
const
|
|
7056
|
-
const variance = all.reduce((a, v) => a + (v -
|
|
7434
|
+
const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
7435
|
+
const variance = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
|
|
7057
7436
|
const stdDev = Math.sqrt(variance);
|
|
7058
|
-
const ref = Math.abs(
|
|
7437
|
+
const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
|
|
7059
7438
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
7060
|
-
return { originalScore, variantScores, meanScore:
|
|
7439
|
+
return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
|
|
7061
7440
|
}
|
|
7062
7441
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
7063
7442
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -7123,8 +7502,8 @@ async function paraphraseRobustnessScenarios(args) {
|
|
|
7123
7502
|
});
|
|
7124
7503
|
scores.push(out.score);
|
|
7125
7504
|
}
|
|
7126
|
-
const
|
|
7127
|
-
deltas[m.name] =
|
|
7505
|
+
const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
7506
|
+
deltas[m.name] = mean5 - originalScore;
|
|
7128
7507
|
paraphrasedAll.push(...scores);
|
|
7129
7508
|
}
|
|
7130
7509
|
const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
|
|
@@ -7737,8 +8116,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7737
8116
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7738
8117
|
const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7739
8118
|
if (scores.length < 3) continue;
|
|
7740
|
-
const
|
|
7741
|
-
const variance = scores.reduce((a, b) => a + (b -
|
|
8119
|
+
const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
8120
|
+
const variance = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
|
|
7742
8121
|
if (variance > varianceThreshold) {
|
|
7743
8122
|
targets.push({
|
|
7744
8123
|
reason: "high-variance",
|
|
@@ -7969,7 +8348,7 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
|
7969
8348
|
|
|
7970
8349
|
// src/command-runner.ts
|
|
7971
8350
|
import { spawnSync } from "child_process";
|
|
7972
|
-
import { existsSync as
|
|
8351
|
+
import { existsSync as existsSync5, readdirSync as readdirSync2, readFileSync as readFileSync4, statSync as statSync2 } from "fs";
|
|
7973
8352
|
import { join as join2 } from "path";
|
|
7974
8353
|
var localCommandRunner = {
|
|
7975
8354
|
name: "local",
|
|
@@ -7998,11 +8377,11 @@ var localCommandRunner = {
|
|
|
7998
8377
|
return r.status === 0 && (r.stdout ?? "").trim().length > 0;
|
|
7999
8378
|
},
|
|
8000
8379
|
async fileExists(path) {
|
|
8001
|
-
return
|
|
8380
|
+
return existsSync5(path);
|
|
8002
8381
|
},
|
|
8003
8382
|
async readFile(path) {
|
|
8004
8383
|
try {
|
|
8005
|
-
return
|
|
8384
|
+
return readFileSync4(path, "utf8");
|
|
8006
8385
|
} catch {
|
|
8007
8386
|
return null;
|
|
8008
8387
|
}
|
|
@@ -8240,7 +8619,7 @@ function extractErrorCount(text, opts = {}) {
|
|
|
8240
8619
|
for (const p of patterns) {
|
|
8241
8620
|
const matches = Array.from(text.matchAll(p.regex));
|
|
8242
8621
|
if (matches.length === 0) continue;
|
|
8243
|
-
const count = p.transform ? matches.reduce((
|
|
8622
|
+
const count = p.transform ? matches.reduce((sum3, m) => sum3 + p.transform(m), 0) : matches.length;
|
|
8244
8623
|
return {
|
|
8245
8624
|
count,
|
|
8246
8625
|
matched: p.name,
|
|
@@ -8934,8 +9313,8 @@ function multiToolchainLayer(config) {
|
|
|
8934
9313
|
}
|
|
8935
9314
|
|
|
8936
9315
|
// src/reference-replay.ts
|
|
8937
|
-
import { appendFileSync as
|
|
8938
|
-
import { dirname as
|
|
9316
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5 } from "fs";
|
|
9317
|
+
import { dirname as dirname3 } from "path";
|
|
8939
9318
|
var DEFAULT_MATCH_THRESHOLD = 0.55;
|
|
8940
9319
|
var ALL_SPLITS = ["train", "dev", "test", "holdout"];
|
|
8941
9320
|
async function runReferenceReplay(cases, options) {
|
|
@@ -9053,14 +9432,14 @@ function jsonlReferenceReplayStore(path) {
|
|
|
9053
9432
|
return {
|
|
9054
9433
|
async save(run) {
|
|
9055
9434
|
await lock.runExclusive(() => {
|
|
9056
|
-
|
|
9057
|
-
|
|
9435
|
+
mkdirSync3(dirname3(path), { recursive: true });
|
|
9436
|
+
appendFileSync3(path, `${JSON.stringify(run)}
|
|
9058
9437
|
`);
|
|
9059
9438
|
});
|
|
9060
9439
|
},
|
|
9061
9440
|
async list() {
|
|
9062
9441
|
return lock.runExclusive(() => {
|
|
9063
|
-
if (!
|
|
9442
|
+
if (!existsSync6(path)) return [];
|
|
9064
9443
|
return readJsonl(path);
|
|
9065
9444
|
});
|
|
9066
9445
|
}
|
|
@@ -9149,7 +9528,7 @@ function decideReferenceReplayPromotion(baseline, candidate, policy = {}) {
|
|
|
9149
9528
|
regressions
|
|
9150
9529
|
};
|
|
9151
9530
|
}
|
|
9152
|
-
const requiredMeanDelta =
|
|
9531
|
+
const requiredMeanDelta = mean3(compared.map((item) => item.f1Delta));
|
|
9153
9532
|
if (requiredMeanDelta < minF1Delta) {
|
|
9154
9533
|
return {
|
|
9155
9534
|
promote: false,
|
|
@@ -9284,8 +9663,8 @@ function scorePair(scenario, matcher, reference, candidate) {
|
|
|
9284
9663
|
function buildScenarioScore(scenario, matches, falsePositives) {
|
|
9285
9664
|
const matched = matches.filter((match) => match.matched).length;
|
|
9286
9665
|
const total = scenario.references.length;
|
|
9287
|
-
const matchedWeight = matches.filter((match) => match.matched).reduce((
|
|
9288
|
-
const totalWeight = matches.reduce((
|
|
9666
|
+
const matchedWeight = matches.filter((match) => match.matched).reduce((sum3, match) => sum3 + match.weight, 0);
|
|
9667
|
+
const totalWeight = matches.reduce((sum3, match) => sum3 + match.weight, 0);
|
|
9289
9668
|
const precision2 = ratio(matched, matched + falsePositives);
|
|
9290
9669
|
const recall = ratio(matched, total);
|
|
9291
9670
|
return {
|
|
@@ -9311,11 +9690,11 @@ function aggregateBySplit(scores) {
|
|
|
9311
9690
|
return out;
|
|
9312
9691
|
}
|
|
9313
9692
|
function aggregateScenarioScores(scores) {
|
|
9314
|
-
const matched =
|
|
9315
|
-
const total =
|
|
9316
|
-
const falsePositives =
|
|
9317
|
-
const matchedWeight =
|
|
9318
|
-
const totalWeight =
|
|
9693
|
+
const matched = sum2(scores.map((score) => score.matched));
|
|
9694
|
+
const total = sum2(scores.map((score) => score.total));
|
|
9695
|
+
const falsePositives = sum2(scores.map((score) => score.falsePositives));
|
|
9696
|
+
const matchedWeight = sum2(scores.map((score) => score.matchedWeight));
|
|
9697
|
+
const totalWeight = sum2(scores.map((score) => score.totalWeight));
|
|
9319
9698
|
const precision2 = ratio(matched, matched + falsePositives);
|
|
9320
9699
|
const recall = ratio(matched, total);
|
|
9321
9700
|
return {
|
|
@@ -9382,11 +9761,11 @@ function clamp012(value) {
|
|
|
9382
9761
|
if (!Number.isFinite(value)) return 0;
|
|
9383
9762
|
return Math.max(0, Math.min(1, value));
|
|
9384
9763
|
}
|
|
9385
|
-
function
|
|
9764
|
+
function sum2(values) {
|
|
9386
9765
|
return values.reduce((acc, value) => acc + value, 0);
|
|
9387
9766
|
}
|
|
9388
|
-
function
|
|
9389
|
-
return values.length ?
|
|
9767
|
+
function mean3(values) {
|
|
9768
|
+
return values.length ? sum2(values) / values.length : 0;
|
|
9390
9769
|
}
|
|
9391
9770
|
function formatPct(value) {
|
|
9392
9771
|
return `${(value * 100).toFixed(1)}%`;
|
|
@@ -9403,7 +9782,7 @@ function throwIfAborted(signal) {
|
|
|
9403
9782
|
throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
|
|
9404
9783
|
}
|
|
9405
9784
|
function readJsonl(path) {
|
|
9406
|
-
const raw =
|
|
9785
|
+
const raw = readFileSync5(path, "utf8");
|
|
9407
9786
|
const out = [];
|
|
9408
9787
|
for (const line of raw.split("\n")) {
|
|
9409
9788
|
const trimmed = line.trim();
|
|
@@ -9650,8 +10029,8 @@ function detectCalibrationDrift(runs, opts) {
|
|
|
9650
10029
|
alpha,
|
|
9651
10030
|
recentN: recent.length,
|
|
9652
10031
|
historyN: historical.length,
|
|
9653
|
-
recentMean:
|
|
9654
|
-
historyMean:
|
|
10032
|
+
recentMean: mean4(recent),
|
|
10033
|
+
historyMean: mean4(historical)
|
|
9655
10034
|
}
|
|
9656
10035
|
}
|
|
9657
10036
|
];
|
|
@@ -9771,7 +10150,7 @@ function chiSquareCritical(df, alpha) {
|
|
|
9771
10150
|
}
|
|
9772
10151
|
return TABLE[10][idx];
|
|
9773
10152
|
}
|
|
9774
|
-
function
|
|
10153
|
+
function mean4(xs) {
|
|
9775
10154
|
if (xs.length === 0) return 0;
|
|
9776
10155
|
return xs.reduce((s, x) => s + x, 0) / xs.length;
|
|
9777
10156
|
}
|
|
@@ -9971,8 +10350,8 @@ async function discoverPersonas(dir, opts = {}) {
|
|
|
9971
10350
|
}
|
|
9972
10351
|
|
|
9973
10352
|
// src/evolution-telemetry.ts
|
|
9974
|
-
import { appendFileSync as
|
|
9975
|
-
import { dirname as
|
|
10353
|
+
import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6, writeFileSync } from "fs";
|
|
10354
|
+
import { dirname as dirname4 } from "path";
|
|
9976
10355
|
var MutationTelemetry = class {
|
|
9977
10356
|
appender;
|
|
9978
10357
|
constructor(path) {
|
|
@@ -10001,17 +10380,17 @@ var LineageRecorder = class {
|
|
|
10001
10380
|
this.path = path;
|
|
10002
10381
|
this.snapshotPath = `${path}.snapshot`;
|
|
10003
10382
|
this.kindOf = kindOf ?? defaultKindOf;
|
|
10004
|
-
|
|
10005
|
-
if (
|
|
10383
|
+
mkdirSync4(dirname4(path), { recursive: true });
|
|
10384
|
+
if (existsSync7(this.snapshotPath)) {
|
|
10006
10385
|
try {
|
|
10007
|
-
const parsed = JSON.parse(
|
|
10386
|
+
const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
|
|
10008
10387
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
10009
10388
|
} catch {
|
|
10010
10389
|
}
|
|
10011
10390
|
}
|
|
10012
|
-
if (
|
|
10391
|
+
if (existsSync7(path)) {
|
|
10013
10392
|
try {
|
|
10014
|
-
for (const line of
|
|
10393
|
+
for (const line of readFileSync6(path, "utf-8").split("\n")) {
|
|
10015
10394
|
if (!line.trim()) continue;
|
|
10016
10395
|
try {
|
|
10017
10396
|
const entry = JSON.parse(line);
|
|
@@ -10023,9 +10402,9 @@ var LineageRecorder = class {
|
|
|
10023
10402
|
} catch {
|
|
10024
10403
|
}
|
|
10025
10404
|
}
|
|
10026
|
-
if (
|
|
10405
|
+
if (existsSync7(path) && this.nodes.size === 0) {
|
|
10027
10406
|
try {
|
|
10028
|
-
const raw =
|
|
10407
|
+
const raw = readFileSync6(path, "utf-8").trim();
|
|
10029
10408
|
if (raw.startsWith("[")) {
|
|
10030
10409
|
const parsed = JSON.parse(raw);
|
|
10031
10410
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
@@ -10039,15 +10418,15 @@ var LineageRecorder = class {
|
|
|
10039
10418
|
const prev = this.nodes.get(node.id);
|
|
10040
10419
|
this.nodes.set(node.id, { ...prev, ...node });
|
|
10041
10420
|
try {
|
|
10042
|
-
if (
|
|
10043
|
-
const head =
|
|
10421
|
+
if (existsSync7(this.path)) {
|
|
10422
|
+
const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
|
|
10044
10423
|
if (head === "[") {
|
|
10045
10424
|
writeFileSync(this.path, "");
|
|
10046
10425
|
}
|
|
10047
10426
|
}
|
|
10048
10427
|
} catch {
|
|
10049
10428
|
}
|
|
10050
|
-
|
|
10429
|
+
appendFileSync4(this.path, `${JSON.stringify(this.nodes.get(node.id))}
|
|
10051
10430
|
`);
|
|
10052
10431
|
});
|
|
10053
10432
|
}
|
|
@@ -10106,9 +10485,9 @@ var CostLedger = class {
|
|
|
10106
10485
|
mutex = new Mutex();
|
|
10107
10486
|
constructor(path) {
|
|
10108
10487
|
this.path = path;
|
|
10109
|
-
if (
|
|
10488
|
+
if (existsSync7(path)) {
|
|
10110
10489
|
try {
|
|
10111
|
-
const loaded = JSON.parse(
|
|
10490
|
+
const loaded = JSON.parse(readFileSync6(path, "utf-8"));
|
|
10112
10491
|
for (const k of Object.keys(this.totals)) {
|
|
10113
10492
|
if (k === "byGeneration") {
|
|
10114
10493
|
if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
|
|
@@ -10125,7 +10504,7 @@ var CostLedger = class {
|
|
|
10125
10504
|
} catch {
|
|
10126
10505
|
}
|
|
10127
10506
|
} else {
|
|
10128
|
-
|
|
10507
|
+
mkdirSync4(dirname4(path), { recursive: true });
|
|
10129
10508
|
}
|
|
10130
10509
|
}
|
|
10131
10510
|
genBucket(generation) {
|
|
@@ -10277,16 +10656,16 @@ function precision(goldens, candidates, options = {}) {
|
|
|
10277
10656
|
}
|
|
10278
10657
|
|
|
10279
10658
|
// src/jsonl-trial-cache.ts
|
|
10280
|
-
import { appendFileSync as
|
|
10281
|
-
import { dirname as
|
|
10659
|
+
import { appendFileSync as appendFileSync5, existsSync as existsSync8, mkdirSync as mkdirSync5, readFileSync as readFileSync7 } from "fs";
|
|
10660
|
+
import { dirname as dirname5 } from "path";
|
|
10282
10661
|
var JsonlTrialCache = class {
|
|
10283
10662
|
map = /* @__PURE__ */ new Map();
|
|
10284
10663
|
path;
|
|
10285
10664
|
appender;
|
|
10286
10665
|
constructor(path) {
|
|
10287
10666
|
this.path = path;
|
|
10288
|
-
if (
|
|
10289
|
-
for (const line of
|
|
10667
|
+
if (existsSync8(path)) {
|
|
10668
|
+
for (const line of readFileSync7(path, "utf-8").split("\n")) {
|
|
10290
10669
|
if (!line.trim()) continue;
|
|
10291
10670
|
try {
|
|
10292
10671
|
const entry = JSON.parse(line);
|
|
@@ -10295,7 +10674,7 @@ var JsonlTrialCache = class {
|
|
|
10295
10674
|
}
|
|
10296
10675
|
}
|
|
10297
10676
|
} else {
|
|
10298
|
-
|
|
10677
|
+
mkdirSync5(dirname5(path), { recursive: true });
|
|
10299
10678
|
}
|
|
10300
10679
|
this.appender = new LockedJsonlAppender(path);
|
|
10301
10680
|
}
|
|
@@ -10318,7 +10697,7 @@ var JsonlTrialCache = class {
|
|
|
10318
10697
|
setSync(key, value) {
|
|
10319
10698
|
this.map.set(key, value);
|
|
10320
10699
|
const line = { key, result: value, writtenAt: Date.now() };
|
|
10321
|
-
|
|
10700
|
+
appendFileSync5(this.path, `${JSON.stringify(line)}
|
|
10322
10701
|
`);
|
|
10323
10702
|
}
|
|
10324
10703
|
};
|
|
@@ -10401,9 +10780,9 @@ function passOrthogonality(input) {
|
|
|
10401
10780
|
sims.push(cosineSimilarity(vectors[i], vectors[j]));
|
|
10402
10781
|
}
|
|
10403
10782
|
}
|
|
10404
|
-
const
|
|
10783
|
+
const mean5 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
|
|
10405
10784
|
return {
|
|
10406
|
-
orthogonality: Math.max(0, Math.min(1, 1 -
|
|
10785
|
+
orthogonality: Math.max(0, Math.min(1, 1 - mean5)),
|
|
10407
10786
|
passCount: passes.length,
|
|
10408
10787
|
similarities: sims
|
|
10409
10788
|
};
|
|
@@ -10653,6 +11032,7 @@ function aggregateTrialsByMode(trials, opts) {
|
|
|
10653
11032
|
};
|
|
10654
11033
|
}
|
|
10655
11034
|
export {
|
|
11035
|
+
AGENT_PROFILE_KINDS,
|
|
10656
11036
|
ANALYST_SEVERITIES,
|
|
10657
11037
|
AgentDriver,
|
|
10658
11038
|
AgentEvalError,
|
|
@@ -10678,6 +11058,7 @@ export {
|
|
|
10678
11058
|
DEFAULT_HARNESS_OBJECTIVES,
|
|
10679
11059
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
10680
11060
|
DEFAULT_MUTATORS,
|
|
11061
|
+
DEFAULT_PR_REVIEW_SCORE_WEIGHTS,
|
|
10681
11062
|
DEFAULT_REDACTION_RULES,
|
|
10682
11063
|
DEFAULT_RED_TEAM_CORPUS,
|
|
10683
11064
|
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
@@ -10766,13 +11147,16 @@ export {
|
|
|
10766
11147
|
adversarialJudge,
|
|
10767
11148
|
agentProfileCellHashMaterial,
|
|
10768
11149
|
agentProfileCellKey,
|
|
11150
|
+
agentProfileHash,
|
|
10769
11151
|
aggregateLlm,
|
|
11152
|
+
aggregatePrReviewScore,
|
|
10770
11153
|
aggregateRunScore,
|
|
10771
11154
|
aggregateTrialsByMode,
|
|
10772
11155
|
allCriticalPassed,
|
|
10773
11156
|
analyzeAntiSlop,
|
|
10774
11157
|
analyzeSeries,
|
|
10775
11158
|
analyzeTraces,
|
|
11159
|
+
appendScorecard,
|
|
10776
11160
|
argHash,
|
|
10777
11161
|
assertLlmRoute,
|
|
10778
11162
|
assertRealBackend,
|
|
@@ -10793,6 +11177,7 @@ export {
|
|
|
10793
11177
|
buildDriverSystemPrompt,
|
|
10794
11178
|
buildReflectionPrompt,
|
|
10795
11179
|
buildReviewerPrompt,
|
|
11180
|
+
buildSandboxAgentProfileCell,
|
|
10796
11181
|
buildTraceAnalystTools,
|
|
10797
11182
|
buildTraceInsightContext,
|
|
10798
11183
|
buildTraceInsightPrompt,
|
|
@@ -10816,6 +11201,7 @@ export {
|
|
|
10816
11201
|
cohensD,
|
|
10817
11202
|
coherenceJudge,
|
|
10818
11203
|
collectionPreserved,
|
|
11204
|
+
commentsForSource,
|
|
10819
11205
|
commitBisect,
|
|
10820
11206
|
compareReferenceReplay,
|
|
10821
11207
|
compareToBaseline,
|
|
@@ -10866,6 +11252,7 @@ export {
|
|
|
10866
11252
|
deployGateLayer,
|
|
10867
11253
|
describeTraceInsightScope,
|
|
10868
11254
|
diffFindings,
|
|
11255
|
+
diffScorecard,
|
|
10869
11256
|
discoverPersonas,
|
|
10870
11257
|
distillPlaybook,
|
|
10871
11258
|
domainEvidencePattern,
|
|
@@ -10901,6 +11288,7 @@ export {
|
|
|
10901
11288
|
formatBenchmarkReport,
|
|
10902
11289
|
formatDriverReport,
|
|
10903
11290
|
formatFindings,
|
|
11291
|
+
formatScorecardDiff,
|
|
10904
11292
|
gainHistogram,
|
|
10905
11293
|
ghCliClient,
|
|
10906
11294
|
precision as goldenPrecision,
|
|
@@ -10943,6 +11331,7 @@ export {
|
|
|
10943
11331
|
linterJudge,
|
|
10944
11332
|
llmSpanFromProvider,
|
|
10945
11333
|
llmSpans,
|
|
11334
|
+
loadScorecard,
|
|
10946
11335
|
loadScorerFromGrader,
|
|
10947
11336
|
localCommandRunner,
|
|
10948
11337
|
lowercaseMutator,
|
|
@@ -10984,6 +11373,8 @@ export {
|
|
|
10984
11373
|
proposeSynthesisTargets,
|
|
10985
11374
|
providerFromBaseUrl,
|
|
10986
11375
|
pytestTestParser,
|
|
11376
|
+
recordRuns,
|
|
11377
|
+
recordRunsToScorecard,
|
|
10987
11378
|
redTeamDataset,
|
|
10988
11379
|
redTeamReport,
|
|
10989
11380
|
redactString,
|
|
@@ -11042,6 +11433,8 @@ export {
|
|
|
11042
11433
|
scoreContinuity,
|
|
11043
11434
|
scoreFromEvals,
|
|
11044
11435
|
scoreKnowledgeReadiness,
|
|
11436
|
+
scorePrReviewComments,
|
|
11437
|
+
scorePrReviewSource,
|
|
11045
11438
|
scoreRedTeamOutput,
|
|
11046
11439
|
scoreReferenceReplay,
|
|
11047
11440
|
scoreTraceInsightReadiness,
|
|
@@ -11060,11 +11453,13 @@ export {
|
|
|
11060
11453
|
summarize,
|
|
11061
11454
|
summarizeBackendIntegrity,
|
|
11062
11455
|
summarizeHarnessResults,
|
|
11456
|
+
summarizePrReviewBenchmark,
|
|
11063
11457
|
summarizePreferenceMemory,
|
|
11064
11458
|
summaryTable,
|
|
11065
11459
|
testJudge,
|
|
11066
11460
|
textInSnapshot,
|
|
11067
11461
|
throwIfRunIncomplete,
|
|
11462
|
+
toAgentProfileJson,
|
|
11068
11463
|
toLangfuseEnvelope,
|
|
11069
11464
|
toPrometheusText,
|
|
11070
11465
|
tokenizeDomainWords,
|