@tangle-network/agent-eval 0.33.0 → 0.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +33 -0
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/chunk-DCZXFOQN.js +489 -0
- package/dist/chunk-DCZXFOQN.js.map +1 -0
- package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
- package/dist/chunk-FT3IAMQR.js.map +1 -0
- package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
- package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
- package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
- package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
- package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
- package/dist/chunk-SQYRO3BT.js.map +1 -0
- package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
- package/dist/chunk-TQL7BAOY.js.map +1 -0
- package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
- package/dist/chunk-VXNVVBZO.js.map +1 -0
- package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
- package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
- package/dist/cli.js +2 -2
- package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/control.js +3 -2
- package/dist/governance/index.d.ts +2 -1
- package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
- package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
- package/dist/index.d.ts +278 -486
- package/dist/index.js +522 -134
- package/dist/index.js.map +1 -1
- package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
- package/dist/meta-eval/index.d.ts +2 -2
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +3 -3
- package/dist/optimization.js +6 -6
- package/dist/pipelines/index.js +2 -2
- package/dist/release-report-ChfmCmLi.d.ts +713 -0
- package/dist/reporting.d.ts +6 -4
- package/dist/reporting.js +10 -9
- package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
- package/dist/rl.d.ts +5 -5
- package/dist/rl.js +6 -6
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
- package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
- package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
- package/dist/wire/index.js +2 -2
- package/docs/product-eval-adoption.md +18 -0
- package/package.json +12 -22
- package/dist/chunk-B73G44OH.js.map +0 -1
- package/dist/chunk-CXJOVDJR.js.map +0 -1
- package/dist/chunk-DTEJNZYK.js.map +0 -1
- package/dist/chunk-M6RZ5LJN.js.map +0 -1
- package/dist/chunk-ZN2CMQIW.js +0 -208
- package/dist/chunk-ZN2CMQIW.js.map +0 -1
- package/dist/release-report-DLWbBPtH.d.ts +0 -292
- /package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
- /package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
- /package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
- /package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
failureClusterView,
|
|
12
12
|
iqr,
|
|
13
13
|
welchsTTest
|
|
14
|
-
} from "./chunk-
|
|
14
|
+
} from "./chunk-KE7TDJUO.js";
|
|
15
15
|
import {
|
|
16
16
|
exportTrainingData,
|
|
17
17
|
toNdjson
|
|
@@ -54,7 +54,7 @@ import {
|
|
|
54
54
|
runProposeReview,
|
|
55
55
|
runProposeReviewAsControlLoop,
|
|
56
56
|
scoreFromEvals
|
|
57
|
-
} from "./chunk-
|
|
57
|
+
} from "./chunk-WRGHMGWT.js";
|
|
58
58
|
import {
|
|
59
59
|
allCriticalPassed,
|
|
60
60
|
objectiveEval,
|
|
@@ -96,14 +96,7 @@ import {
|
|
|
96
96
|
summarizePreferenceMemory,
|
|
97
97
|
trialTraceFromMultiShotTrial,
|
|
98
98
|
withAssignedFeedbackSplit
|
|
99
|
-
} from "./chunk-
|
|
100
|
-
import {
|
|
101
|
-
RunRecordValidationError,
|
|
102
|
-
isRunRecord,
|
|
103
|
-
parseRunRecordSafe,
|
|
104
|
-
roundTripRunRecord,
|
|
105
|
-
validateRunRecord
|
|
106
|
-
} from "./chunk-ZN2CMQIW.js";
|
|
99
|
+
} from "./chunk-FT3IAMQR.js";
|
|
107
100
|
import {
|
|
108
101
|
assertReleaseConfidence,
|
|
109
102
|
bootstrapCi,
|
|
@@ -111,38 +104,52 @@ import {
|
|
|
111
104
|
judgeReplayGate,
|
|
112
105
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
113
106
|
renderReleaseReport
|
|
114
|
-
} from "./chunk-
|
|
107
|
+
} from "./chunk-LGAPK7NA.js";
|
|
115
108
|
import {
|
|
116
109
|
runEvalCampaign
|
|
117
|
-
} from "./chunk-
|
|
110
|
+
} from "./chunk-SQYRO3BT.js";
|
|
118
111
|
import {
|
|
119
112
|
LlmCallError,
|
|
120
113
|
LlmClient,
|
|
121
114
|
LlmRouteAssertionError,
|
|
122
115
|
assertLlmRoute,
|
|
116
|
+
backoffMs,
|
|
123
117
|
callLlm,
|
|
124
118
|
callLlmJson,
|
|
119
|
+
isTransientLlmError,
|
|
125
120
|
probeLlm,
|
|
126
121
|
stripFencedJson
|
|
127
|
-
} from "./chunk-
|
|
122
|
+
} from "./chunk-VXNVVBZO.js";
|
|
123
|
+
import {
|
|
124
|
+
AgentProfileCellValidationError,
|
|
125
|
+
RunRecordValidationError,
|
|
126
|
+
agentProfileCellHashMaterial,
|
|
127
|
+
agentProfileCellKey,
|
|
128
|
+
assertRunAgentProfileCell,
|
|
129
|
+
buildAgentProfileCell,
|
|
130
|
+
groupRunsByAgentProfileCell,
|
|
131
|
+
isRunRecord,
|
|
132
|
+
parseRunRecordSafe,
|
|
133
|
+
requireAgentProfileCell,
|
|
134
|
+
roundTripRunRecord,
|
|
135
|
+
validateAgentProfileCell,
|
|
136
|
+
validateRunRecord,
|
|
137
|
+
verifyAgentProfileCell
|
|
138
|
+
} from "./chunk-DCZXFOQN.js";
|
|
128
139
|
import {
|
|
129
140
|
evaluateInterimReleaseConfidence,
|
|
130
141
|
pairedEvalueSequence
|
|
131
142
|
} from "./chunk-MAZ26DC7.js";
|
|
132
143
|
import {
|
|
133
144
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
134
|
-
benjaminiHochberg,
|
|
135
|
-
bhAdjust,
|
|
136
|
-
bonferroni,
|
|
137
145
|
gainHistogram,
|
|
138
|
-
pairedBootstrap,
|
|
139
|
-
pairedWilcoxon,
|
|
140
146
|
paretoChart,
|
|
141
|
-
requiredSampleSize,
|
|
142
147
|
researchReport,
|
|
143
148
|
summaryTable
|
|
144
|
-
} from "./chunk-
|
|
149
|
+
} from "./chunk-TQL7BAOY.js";
|
|
145
150
|
import {
|
|
151
|
+
benjaminiHochberg,
|
|
152
|
+
bonferroni,
|
|
146
153
|
calibrateJudge,
|
|
147
154
|
calibrateJudgeContinuous,
|
|
148
155
|
cohensD,
|
|
@@ -153,14 +160,17 @@ import {
|
|
|
153
160
|
interRaterReliability,
|
|
154
161
|
mannWhitneyU,
|
|
155
162
|
normalizeScores,
|
|
163
|
+
pairedBootstrap,
|
|
164
|
+
pairedMde,
|
|
156
165
|
pairedTTest,
|
|
157
166
|
partialCredit,
|
|
158
167
|
positionalBias,
|
|
168
|
+
requiredSampleSize,
|
|
159
169
|
selfPreference,
|
|
160
170
|
verbosityBias,
|
|
161
171
|
weightedMean,
|
|
162
172
|
wilcoxonSignedRank
|
|
163
|
-
} from "./chunk-
|
|
173
|
+
} from "./chunk-KHZRNY3F.js";
|
|
164
174
|
import {
|
|
165
175
|
DEFAULT_REDACTION_RULES,
|
|
166
176
|
DEFAULT_TRACE_ANALYST_BUDGETS,
|
|
@@ -323,7 +333,7 @@ var RunCritic = class {
|
|
|
323
333
|
);
|
|
324
334
|
const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
|
|
325
335
|
if (!success) notes.push("run did not complete with pass=true");
|
|
326
|
-
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((
|
|
336
|
+
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum3, span) => sum3 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
|
|
327
337
|
const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
|
|
328
338
|
trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
|
|
329
339
|
) : void 0;
|
|
@@ -338,7 +348,7 @@ var RunCritic = class {
|
|
|
338
348
|
(span) => typeof span.testsTotal === "number" && span.testsTotal > 0
|
|
339
349
|
);
|
|
340
350
|
const testReality = sandboxTests.length ? sandboxTests.reduce(
|
|
341
|
-
(
|
|
351
|
+
(sum3, span) => sum3 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
|
|
342
352
|
0
|
|
343
353
|
) / sandboxTests.length : toolSpans2.some(
|
|
344
354
|
(span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
|
|
@@ -360,7 +370,7 @@ var RunCritic = class {
|
|
|
360
370
|
const costUsd = trace.budget.length ? Math.max(
|
|
361
371
|
...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
|
|
362
372
|
0
|
|
363
|
-
) : llmSpans2.reduce((
|
|
373
|
+
) : llmSpans2.reduce((sum3, span) => sum3 + (span.costUsd ?? 0), 0);
|
|
364
374
|
const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
|
|
365
375
|
return {
|
|
366
376
|
success,
|
|
@@ -1950,12 +1960,12 @@ function allocateBudget(policy, args) {
|
|
|
1950
1960
|
return policy.totalUsd / Math.max(1, args.runningCount);
|
|
1951
1961
|
}
|
|
1952
1962
|
function sumFindingCost(findings) {
|
|
1953
|
-
let
|
|
1963
|
+
let sum3 = 0;
|
|
1954
1964
|
for (const f of findings) {
|
|
1955
1965
|
const c = f.metadata?.cost_usd;
|
|
1956
|
-
if (typeof c === "number" && Number.isFinite(c))
|
|
1966
|
+
if (typeof c === "number" && Number.isFinite(c)) sum3 += c;
|
|
1957
1967
|
}
|
|
1958
|
-
return
|
|
1968
|
+
return sum3;
|
|
1959
1969
|
}
|
|
1960
1970
|
function selectPriorFindings(source, analystId) {
|
|
1961
1971
|
if (!source) return void 0;
|
|
@@ -2174,10 +2184,10 @@ function ghCliClient(opts = {}) {
|
|
|
2174
2184
|
await exec("git", ["branch", "-D", input.branchName], { cwd });
|
|
2175
2185
|
await run("git", ["checkout", "-b", input.branchName]);
|
|
2176
2186
|
const { mkdir, writeFile } = await import("fs/promises");
|
|
2177
|
-
const { dirname:
|
|
2187
|
+
const { dirname: dirname6, join: join4, resolve } = await import("path");
|
|
2178
2188
|
for (const change of input.fileChanges) {
|
|
2179
2189
|
const abs = resolve(cwd, change.path);
|
|
2180
|
-
await mkdir(
|
|
2190
|
+
await mkdir(dirname6(abs), { recursive: true });
|
|
2181
2191
|
await writeFile(abs, change.contents, "utf8");
|
|
2182
2192
|
await run("git", ["add", join4(change.path)]);
|
|
2183
2193
|
}
|
|
@@ -3712,6 +3722,178 @@ function liveProofToReleaseTrace(config, trajectory, durationMs) {
|
|
|
3712
3722
|
};
|
|
3713
3723
|
}
|
|
3714
3724
|
|
|
3725
|
+
// src/pr-review-benchmark.ts
|
|
3726
|
+
/**
 * Default relative weights of the five PR-review score dimensions.
 * aggregatePrReviewScore spreads caller-supplied weights over this object,
 * so any subset of the keys can be overridden per call.
 */
var DEFAULT_PR_REVIEW_SCORE_WEIGHTS = {
  // Share of reference findings that were matched by a comment.
  recall: 4,
  // Share of comments with a positive outcome (or matched, when unlabelled).
  precision: 2,
  // Share of comments that pass isActionableComment.
  actionability: 1.5,
  // Share of severity-tagged comments whose severity is aligned.
  severityCalibration: 1,
  // One minus the share of negatively labelled comments.
  lowNoise: 1
};
|
|
3733
|
+
/**
 * Selects the comments of an audit case whose `source` field strictly
 * equals the requested source.
 *
 * @param {{comments: Array<{source: *}>}} auditCase - case holding the comments.
 * @param {*} source - source identifier to keep.
 * @returns {Array} a new array; `auditCase.comments` is not modified.
 */
function commentsForSource(auditCase, source) {
  const { comments } = auditCase;
  const isFromSource = (entry) => entry.source === source;
  return comments.filter(isFromSource);
}
|
|
3736
|
+
/**
 * Scores the comments of a single source within an audit case.
 * Filters the case's comments with commentsForSource and delegates the
 * actual scoring to scorePrReviewComments.
 *
 * @param {object} auditCase - audit case with `comments` and reference findings.
 * @param {*} source - source whose comments are scored.
 * @param {object} [weights={}] - dimension weight overrides, passed through.
 * @returns {object} the score object produced by scorePrReviewComments.
 */
function scorePrReviewSource(auditCase, source, weights = {}) {
  const sourceComments = commentsForSource(auditCase, source);
  return scorePrReviewComments(auditCase, sourceComments, source, weights);
}
|
|
3739
|
+
/**
 * Scores a set of review comments against the reference findings of an
 * audit case and returns the per-dimension values plus their aggregate.
 *
 * Dimensions (each falls back to a fixed value when its denominator is empty):
 *  - recall: matched findings / reference findings; with no references it is
 *    1 when there are no comments and 0 otherwise.
 *  - precision: positive / (positive + negative) outcomes; when no comment is
 *    labelled, matched comments / comments; with no comments at all it is 1
 *    only if there are no references either.
 *  - actionability: actionable comments / comments (1 with no comments).
 *  - severityCalibration: aligned / severity-tagged comments; without tagged
 *    comments it is 0.5 when something matched and 1 otherwise.
 *  - lowNoise: 1 - negative comments / comments (1 with no comments).
 *
 * @param {object} auditCase - provides `id` and `referenceFindings`.
 * @param {Array<object>} comments - comments to score.
 * @param {*} source - copied verbatim into the result.
 * @param {object} [weights={}] - overrides forwarded to aggregatePrReviewScore.
 * @returns {object} score record including `matchedFindings` and `notes`.
 */
function scorePrReviewComments(auditCase, comments, source, weights = {}) {
  const references = auditCase.referenceFindings;
  const referenceCount = references.length;
  const commentCount = comments.length;

  const matchedFindings = matchReferenceFindings(references, comments);
  const matchedCommentIds = new Set();
  for (const { commentId } of matchedFindings) matchedCommentIds.add(commentId);

  const positiveComments = comments.filter(({ outcome }) => isPositiveOutcome(outcome));
  const negativeComments = comments.filter(({ outcome }) => isNegativeOutcome(outcome));
  const actionableComments = comments.filter(isActionableComment);
  const severityComments = comments.filter(({ severity }) => severity);
  const severityAligned = severityComments.filter(
    (entry) => isSeverityAligned(entry, references, matchedFindings)
  );

  let recall;
  if (referenceCount) {
    recall = matchedFindings.length / referenceCount;
  } else {
    recall = commentCount === 0 ? 1 : 0;
  }

  // Precision prefers explicit outcome labels and only falls back to match
  // counts when no comment carries a positive or negative label.
  const labelledCount = positiveComments.length + negativeComments.length;
  let precision2;
  if (labelledCount > 0) {
    precision2 = positiveComments.length / labelledCount;
  } else if (commentCount > 0) {
    precision2 = matchedCommentIds.size / commentCount;
  } else {
    precision2 = referenceCount === 0 ? 1 : 0;
  }

  const actionability = commentCount ? actionableComments.length / commentCount : 1;

  let severityCalibration;
  if (severityComments.length) {
    severityCalibration = severityAligned.length / severityComments.length;
  } else {
    severityCalibration = matchedFindings.length ? 0.5 : 1;
  }

  const lowNoise = commentCount ? 1 - negativeComments.length / commentCount : 1;

  const dimensions = { recall, precision: precision2, actionability, severityCalibration, lowNoise };
  const aggregate2 = aggregatePrReviewScore(dimensions, weights);
  const notes = buildScoreNotes({
    comments,
    referenceCount: auditCase.referenceFindings.length,
    matchedFindings,
    negativeComments,
    actionableComments
  });

  return {
    caseId: auditCase.id,
    source,
    commentCount,
    referenceCount: auditCase.referenceFindings.length,
    matchedFindings,
    recall,
    precision: precision2,
    actionability,
    severityCalibration,
    lowNoise,
    aggregate: aggregate2,
    notes
  };
}
|
|
3780
|
+
/**
 * Groups per-case scores by `source` and reports, for each source, the case
 * count, total comment count and the mean of every score dimension.
 *
 * @param {Array<object>} scores - score records as returned by scorePrReviewComments.
 * @returns {Array<object>} one summary per source, ordered by `aggregateMean` descending.
 */
function summarizePrReviewBenchmark(scores) {
  const grouped = /* @__PURE__ */ new Map();
  for (const entry of scores) {
    const bucket = grouped.get(entry.source);
    if (bucket) bucket.push(entry);
    else grouped.set(entry.source, [entry]);
  }

  const summaries = [];
  for (const [source, sourceScores] of grouped) {
    const meanOf = (field) => mean(sourceScores.map((entry) => entry[field]));
    summaries.push({
      source,
      caseCount: sourceScores.length,
      commentCount: sum(sourceScores.map((entry) => entry.commentCount)),
      aggregateMean: meanOf("aggregate"),
      recallMean: meanOf("recall"),
      precisionMean: meanOf("precision"),
      actionabilityMean: meanOf("actionability"),
      severityCalibrationMean: meanOf("severityCalibration"),
      lowNoiseMean: meanOf("lowNoise")
    });
  }

  // Best-scoring source first.
  summaries.sort((left, right) => right.aggregateMean - left.aggregateMean);
  return summaries;
}
|
|
3797
|
+
/**
 * Combines the five PR-review dimensions into one weighted average.
 *
 * Only the dimensions listed in DEFAULT_PR_REVIEW_SCORE_WEIGHTS take part.
 * For each one the effective weight is the caller's override when it is a
 * finite number, otherwise the default; negative weights count as 0. The same
 * effective weight is used in both the numerator and the denominator, so the
 * result stays a true weighted mean of the clamped dimension values.
 *
 * @param {object} dimensions - `recall`, `precision`, `actionability`,
 *   `severityCalibration` and `lowNoise`; each is passed through clamp01.
 * @param {object} [weights={}] - optional per-dimension weight overrides;
 *   keys that are not score dimensions are ignored.
 * @returns {number} the weighted mean, or 0 when every effective weight is 0.
 */
function aggregatePrReviewScore(dimensions, weights = {}) {
  let weightSum = 0;
  let weightedTotal = 0;
  for (const [dimension, fallback] of Object.entries(DEFAULT_PR_REVIEW_SCORE_WEIGHTS)) {
    const override = weights?.[dimension];
    // An explicit `undefined`/NaN override must not poison the result with NaN.
    const requested = typeof override === "number" && Number.isFinite(override) ? override : fallback;
    // Clamp once and reuse, so a negative weight cannot subtract from the
    // numerator while being counted as zero in the denominator.
    const weight = Math.max(0, requested);
    weightSum += weight;
    weightedTotal += weight * clamp01(dimensions[dimension]);
  }
  return weightSum > 0 ? weightedTotal / weightSum : 0;
}
|
|
3803
|
+
/**
 * Greedily pairs reference findings with comments.
 *
 * References are visited in order. Each one takes the not-yet-used comment
 * with the highest matchScore, provided that score is at least 0.55; ties go
 * to the comment that appears first. A comment is matched at most once.
 *
 * @param {Array<object>} references - reference findings (need an `id`).
 * @param {Array<object>} comments - candidate comments (need an `id`).
 * @returns {Array<{referenceId: *, commentId: *, score: number}>} the matches.
 */
function matchReferenceFindings(references, comments) {
  const claimedIds = /* @__PURE__ */ new Set();
  const pairs = [];
  for (const reference of references) {
    const available = comments.filter((entry) => !claimedIds.has(entry.id));
    const scored = available.map((entry) => ({ comment: entry, score: matchScore(reference, entry) }));
    const eligible = scored.filter((entry) => entry.score >= 0.55);
    // Stable descending sort keeps the earliest comment among equal scores.
    eligible.sort((left, right) => right.score - left.score);
    const [winner] = eligible;
    if (winner === void 0) continue;
    claimedIds.add(winner.comment.id);
    pairs.push({ referenceId: reference.id, commentId: winner.comment.id, score: winner.score });
  }
  return pairs;
}
|
|
3815
|
+
/**
 * Heuristic similarity between a reference finding and a comment.
 *
 * Adds up, in this order:
 *  - 1 when the reference lists the comment id in `sourceCommentIds`;
 *  - 0.35 when both have a path and the normalized paths are equal;
 *  - 0.15 when both have a line and the lines differ by at most 3;
 *  - 0.5 times the share of the reference's terms found in the comment body.
 * Terms are the reference keywords plus the tokens of its title, normalized,
 * de-duplicated and limited to those of length 3 or more.
 *
 * @param {object} reference - reference finding.
 * @param {object} comment - review comment.
 * @returns {number} the summed score passed through clamp01.
 */
function matchScore(reference, comment) {
  let total = 0;
  if (reference.sourceCommentIds?.includes(comment.id)) total += 1;

  const bothHavePath = reference.path && comment.path;
  if (bothHavePath && normalizePath(reference.path) === normalizePath(comment.path)) {
    total += 0.35;
  }

  const bothHaveLine = reference.line && comment.line;
  if (bothHaveLine && Math.abs(reference.line - comment.line) <= 3) total += 0.15;

  const rawTerms = [...reference.keywords ?? [], ...tokenize(reference.title)];
  const wantedTerms = /* @__PURE__ */ new Set();
  for (const normalized of rawTerms.map(normalizeTerm)) {
    if (normalized.length >= 3) wantedTerms.add(normalized);
  }

  if (wantedTerms.size > 0) {
    const bodyTerms = new Set(tokenize(comment.body).map(normalizeTerm));
    let overlap = 0;
    for (const term of wantedTerms) {
      if (bodyTerms.has(term)) overlap += 1;
    }
    total += 0.5 * (overlap / wantedTerms.size);
  }
  return clamp01(total);
}
|
|
3831
|
+
/**
 * Decides whether a comment counts as actionable.
 *
 * A comment qualifies when it points at a location (it has a `path`, or its
 * trimmed body mentions a word such as "file", "line" or "function") and its
 * body also contains an action word such as "fix", "add" or "validate".
 *
 * @param {{body: string, path?: string}} comment - review comment.
 * @returns {boolean} true when both conditions hold.
 */
function isActionableComment(comment) {
  const text = comment.body.trim();
  const locationWords = /\b(file|line|function|method|class|module|test|migration)\b/i;
  const actionWords = /\b(fix|change|add|remove|guard|check|reject|validate|test|assert|return|throw|fail|block)\b/i;
  const hasLocation = Boolean(comment.path) || locationWords.test(text);
  return hasLocation && actionWords.test(text);
}
|
|
3840
|
+
/**
 * Checks whether a comment's severity is consistent with the reference
 * finding it was matched to.
 *
 *  - No severity on the comment: not aligned.
 *  - Comment not matched to any reference: aligned only for "nit" or "low".
 *  - Matched: aligned when the severity ranks differ by at most 1.
 *
 * @param {object} comment - comment with `id` and optional `severity`.
 * @param {Array<object>} references - reference findings.
 * @param {Array<object>} matches - output of matchReferenceFindings.
 * @returns {boolean} alignment verdict.
 */
function isSeverityAligned(comment, references, matches) {
  const { severity } = comment;
  if (!severity) return false;

  const ownMatch = matches.find((entry) => entry.commentId === comment.id);
  if (!ownMatch) {
    return severity === "nit" || severity === "low";
  }

  const matchedReference = references.find((entry) => entry.id === ownMatch.referenceId);
  if (!matchedReference) return false;

  const rankGap = severityRank(severity) - severityRank(matchedReference.severity);
  return Math.abs(rankGap) <= 1;
}
|
|
3848
|
+
/**
 * Produces human-readable notes explaining weak spots of a score.
 *
 * @param {object} input - `comments`, `referenceCount`, `matchedFindings`,
 *   `negativeComments` and `actionableComments` of the scored case.
 * @returns {string[]} zero to three notes, in a fixed order.
 */
function buildScoreNotes(input) {
  const negativeCount = input.negativeComments.length;
  const checks = [
    [
      input.referenceCount > 0 && input.matchedFindings.length === 0,
      "no reference findings matched"
    ],
    [
      negativeCount > 0,
      `${negativeCount} comment(s) labelled rejected/duplicate/noise`
    ],
    [
      input.comments.length > 0 && input.actionableComments.length === 0,
      "comments were not actionable enough for a PR reviewer benchmark"
    ]
  ];
  return checks.filter(([applies]) => applies).map(([, note]) => note);
}
|
|
3861
|
+
/**
 * Tells whether a comment outcome counts as positive.
 *
 * @param {*} outcome - outcome label of a comment.
 * @returns {boolean} true for "accepted" and "fixed", false otherwise.
 */
function isPositiveOutcome(outcome) {
  const positiveLabels = ["accepted", "fixed"];
  return positiveLabels.includes(outcome);
}
|
|
3864
|
+
/**
 * Tells whether a comment outcome counts as negative.
 *
 * @param {*} outcome - outcome label of a comment.
 * @returns {boolean} true for "rejected", "duplicate" and "noise".
 */
function isNegativeOutcome(outcome) {
  const negativeLabels = ["rejected", "duplicate", "noise"];
  return negativeLabels.includes(outcome);
}
|
|
3867
|
+
/**
 * Maps a severity label to a numeric rank, from 5 ("critical") down to
 * 1 ("nit").
 *
 * @param {*} severity - severity label.
 * @returns {number|undefined} the rank, or undefined for any other value.
 */
function severityRank(severity) {
  const ranks = /* @__PURE__ */ new Map([
    ["critical", 5],
    ["high", 4],
    ["medium", 3],
    ["low", 2],
    ["nit", 1]
  ]);
  return ranks.get(severity);
}
|
|
3881
|
+
/**
 * Splits text into tokens made of letters, digits and the characters
 * `_`, `.`, `$`, `/` and `-`.
 *
 * @param {string} input - text to tokenize.
 * @returns {string[]} the tokens in order of appearance; empty when none.
 */
function tokenize(input) {
  const tokenPattern = /[a-zA-Z0-9_.$/-]+/g;
  const found = input.match(tokenPattern);
  return found ?? [];
}
|
|
3884
|
+
/**
 * Lower-cases a term and strips every leading and trailing character that is
 * not a lower-case letter, a digit or an underscore.
 *
 * @param {string} input - raw term.
 * @returns {string} the normalized term (possibly empty).
 */
function normalizeTerm(input) {
  const lowered = input.toLowerCase();
  const withoutLeading = lowered.replace(/^[^a-z0-9_]+/, "");
  return withoutLeading.replace(/[^a-z0-9_]+$/, "");
}
|
|
3887
|
+
/**
 * Removes a leading "./" (a dot followed by one or more slashes) from a path.
 * Paths that do not start that way are returned unchanged.
 *
 * @param {string} input - path to normalize.
 * @returns {string} the path without the leading "./".
 */
function normalizePath(input) {
  if (!input.startsWith("./")) return input;
  let cut = 1;
  while (input[cut] === "/") cut += 1;
  return input.slice(cut);
}
|
|
3890
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - numbers to average.
 * @returns {number} the mean, or 0 for an empty list.
 */
function mean(values) {
  const count = values.length;
  if (!count) return 0;
  const total = values.reduce((running, value) => running + value, 0);
  return total / count;
}
|
|
3893
|
+
/**
 * Adds up a list of numbers, starting from 0.
 *
 * @param {number[]} values - numbers to add.
 * @returns {number} the total; 0 for an empty list.
 */
function sum(values) {
  let running = 0;
  values.forEach((value) => {
    running += value;
  });
  return running;
}
|
|
3896
|
+
|
|
3715
3897
|
// src/production-loop.ts
|
|
3716
3898
|
async function runProductionLoop(opts) {
|
|
3717
3899
|
validate2(opts);
|
|
@@ -5207,14 +5389,14 @@ async function runHarnessExperiment(config) {
|
|
|
5207
5389
|
const score = config.score ?? ((trace) => critic.scoreTrace(trace));
|
|
5208
5390
|
const results = await mapLimit(jobs, config.parallelism ?? 1, async (request) => {
|
|
5209
5391
|
const trace = await config.adapter.run(request);
|
|
5210
|
-
const
|
|
5392
|
+
const runScore2 = await score(trace, request);
|
|
5211
5393
|
const result = {
|
|
5212
5394
|
variant: request.variant,
|
|
5213
5395
|
scenario: request.scenario,
|
|
5214
5396
|
trialIndex: request.trialIndex,
|
|
5215
5397
|
trace,
|
|
5216
|
-
score:
|
|
5217
|
-
aggregate: aggregateRunScore(
|
|
5398
|
+
score: runScore2,
|
|
5399
|
+
aggregate: aggregateRunScore(runScore2, config.weights)
|
|
5218
5400
|
};
|
|
5219
5401
|
await config.onResult?.(result);
|
|
5220
5402
|
return result;
|
|
@@ -5241,10 +5423,10 @@ function summarizeHarnessResults(results) {
|
|
|
5241
5423
|
return {
|
|
5242
5424
|
variant,
|
|
5243
5425
|
runs,
|
|
5244
|
-
aggregateMean:
|
|
5245
|
-
passRate:
|
|
5246
|
-
costUsdMean:
|
|
5247
|
-
wallSecondsMean:
|
|
5426
|
+
aggregateMean: mean2(runs.map((r) => r.aggregate)),
|
|
5427
|
+
passRate: mean2(runs.map((r) => r.score.success)),
|
|
5428
|
+
costUsdMean: mean2(runs.map((r) => r.score.costUsd)),
|
|
5429
|
+
wallSecondsMean: mean2(runs.map((r) => r.score.wallSeconds)),
|
|
5248
5430
|
scoreMean: meanRunScore(runs.map((r) => r.score))
|
|
5249
5431
|
};
|
|
5250
5432
|
}).sort((a, b) => b.aggregateMean - a.aggregateMean);
|
|
@@ -5281,22 +5463,22 @@ async function mapLimit(items, limit, fn) {
|
|
|
5281
5463
|
);
|
|
5282
5464
|
return results;
|
|
5283
5465
|
}
|
|
5284
|
-
function
|
|
5285
|
-
return values.length ? values.reduce((
|
|
5466
|
+
function mean2(values) {
|
|
5467
|
+
return values.length ? values.reduce((sum3, value) => sum3 + value, 0) / values.length : 0;
|
|
5286
5468
|
}
|
|
5287
5469
|
function meanRunScore(scores) {
|
|
5288
5470
|
return {
|
|
5289
|
-
success:
|
|
5290
|
-
goalProgress:
|
|
5291
|
-
repoGroundedness:
|
|
5292
|
-
driftPenalty:
|
|
5293
|
-
toolUseQuality:
|
|
5294
|
-
patchQuality:
|
|
5295
|
-
testReality:
|
|
5296
|
-
finalGate:
|
|
5297
|
-
reviewerBlockers:
|
|
5298
|
-
costUsd:
|
|
5299
|
-
wallSeconds:
|
|
5471
|
+
success: mean2(scores.map((s) => s.success)),
|
|
5472
|
+
goalProgress: mean2(scores.map((s) => s.goalProgress)),
|
|
5473
|
+
repoGroundedness: mean2(scores.map((s) => s.repoGroundedness)),
|
|
5474
|
+
driftPenalty: mean2(scores.map((s) => s.driftPenalty)),
|
|
5475
|
+
toolUseQuality: mean2(scores.map((s) => s.toolUseQuality)),
|
|
5476
|
+
patchQuality: mean2(scores.map((s) => s.patchQuality)),
|
|
5477
|
+
testReality: mean2(scores.map((s) => s.testReality)),
|
|
5478
|
+
finalGate: mean2(scores.map((s) => s.finalGate)),
|
|
5479
|
+
reviewerBlockers: mean2(scores.map((s) => s.reviewerBlockers)),
|
|
5480
|
+
costUsd: mean2(scores.map((s) => s.costUsd)),
|
|
5481
|
+
wallSeconds: mean2(scores.map((s) => s.wallSeconds)),
|
|
5300
5482
|
notes: scores.flatMap((s) => s.notes ?? [])
|
|
5301
5483
|
};
|
|
5302
5484
|
}
|
|
@@ -5635,7 +5817,7 @@ function rankRows(rows, weights) {
|
|
|
5635
5817
|
}
|
|
5636
5818
|
return [...buckets.entries()].map(([variantId, values]) => ({
|
|
5637
5819
|
variantId,
|
|
5638
|
-
mean: values.reduce((
|
|
5820
|
+
mean: values.reduce((sum3, value) => sum3 + value, 0) / values.length,
|
|
5639
5821
|
runs: values.length
|
|
5640
5822
|
})).sort((a, b) => b.mean - a.mean);
|
|
5641
5823
|
}
|
|
@@ -5805,6 +5987,22 @@ var BudgetGuard = class {
|
|
|
5805
5987
|
}
|
|
5806
5988
|
};
|
|
5807
5989
|
|
|
5990
|
+
// src/agent-profile.ts
|
|
5991
|
+
import { createHash as createHash2 } from "crypto";
|
|
5992
|
+
/**
 * Computes a SHA-256 hex digest over the fields of an agent profile that are
 * included in the hash: trimmed model, sorted skills, prompt version, sorted
 * tools and metadata. The object is passed through `canonicalize` and
 * JSON-serialized before hashing.
 *
 * @param {object} profile - agent profile; `model` must be a non-blank string.
 * @returns {string} hex-encoded SHA-256 digest.
 * @throws {ValidationError} when the profile has no usable model.
 */
function agentProfileHash(profile) {
  const { model } = profile;
  const hasModel = typeof model === "string" && model.trim().length !== 0;
  if (!hasModel) {
    throw new ValidationError(`AgentProfile "${profile.id}" has no model \u2014 cannot hash`);
  }
  // Sort copies so the caller's arrays are untouched and order does not
  // influence the digest.
  const sortedCopy = (list) => [...list ?? []].sort();
  const behaviour = {
    model: model.trim(),
    skills: sortedCopy(profile.skills),
    promptVersion: profile.promptVersion ?? null,
    tools: sortedCopy(profile.tools),
    metadata: profile.metadata ?? {}
  };
  const serialized = JSON.stringify(canonicalize(behaviour));
  const hasher = createHash2("sha256");
  hasher.update(serialized);
  return hasher.digest("hex");
}
|
|
6005
|
+
|
|
5808
6006
|
// src/cost-tracker.ts
|
|
5809
6007
|
var CostTracker = class {
|
|
5810
6008
|
byScenario = /* @__PURE__ */ new Map();
|
|
@@ -6211,6 +6409,194 @@ function isObject(v) {
|
|
|
6211
6409
|
return typeof v === "object" && v !== null && !Array.isArray(v);
|
|
6212
6410
|
}
|
|
6213
6411
|
|
|
6412
|
+
// src/scorecard.ts
|
|
6413
|
+
import { appendFileSync as appendFileSync2, existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync3 } from "fs";
|
|
6414
|
+
import { dirname as dirname2 } from "path";
|
|
6415
|
+
/**
 * Median of a list of numbers. The input is not modified.
 *
 * @param {number[]} xs - numbers.
 * @returns {number} the middle value (mean of the two middle values for an
 *   even count), or 0 for an empty list.
 */
function median(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  const ascending = Array.from(xs).sort((left, right) => left - right);
  const upperMiddle = Math.floor(count / 2);
  if (count % 2 !== 0) return ascending[upperMiddle];
  return (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
}
|
|
6421
|
+
/**
 * Picks the score of a run: the holdout score when it is neither null nor
 * undefined, otherwise the search score.
 *
 * @param {{outcome: {holdoutScore?: number, searchScore?: number}}} run - run record.
 * @returns {number|undefined} the chosen score.
 */
function runScore(run) {
  const outcome = run.outcome;
  const holdout = outcome.holdoutScore;
  if (holdout !== null && holdout !== void 0) return holdout;
  return outcome.searchScore;
}
|
|
6424
|
+
/**
 * Averages the per-dimension judge means (`outcome.judgeScores.perDimMean`)
 * across runs. Runs without that field and non-finite values are skipped.
 *
 * @param {Array<object>} runs - run records.
 * @returns {Object<string, number>|undefined} mean per dimension, or
 *   undefined when no finite value was found.
 */
function aggregatePerDimension(runs) {
  const totals = /* @__PURE__ */ new Map();
  const counts = /* @__PURE__ */ new Map();
  for (const run of runs) {
    const perDim = run.outcome.judgeScores?.perDimMean;
    if (!perDim) continue;
    for (const dim of Object.keys(perDim)) {
      const value = perDim[dim];
      if (!Number.isFinite(value)) continue;
      totals.set(dim, (totals.get(dim) ?? 0) + value);
      counts.set(dim, (counts.get(dim) ?? 0) + 1);
    }
  }
  if (totals.size === 0) return void 0;
  const averages = {};
  totals.forEach((total, dim) => {
    averages[dim] = total / counts.get(dim);
  });
  return averages;
}
|
|
6442
|
+
/**
 * Turns a batch of runs into scorecard lines, one per scenario.
 *
 * Runs are grouped by `scenarioId` (runs without one are ignored). For each
 * scenario the scored runs contribute their score to `entry.scores`, the
 * median of those scores becomes `entry.composite`, and per-dimension judge
 * means are attached when available. Scenarios with no usable score produce
 * no line.
 *
 * A score is usable only when it is a finite number: `null`, `NaN` and
 * non-numeric values are dropped so they cannot distort the median or be
 * written to the log.
 *
 * @param {Array<object>} runs - run records with `runId`, `scenarioId`, `outcome`.
 * @param {object} opts - `profile` (hashed with agentProfileHash), `commitSha`
 *   and an optional `timestamp` (defaults to the current time as ISO string).
 * @returns {Array<object>} scorecard lines `{scenarioId, profileHash, model, profile, entry}`.
 */
function recordRuns(runs, opts) {
  const profileHash = agentProfileHash(opts.profile);
  const timestamp = opts.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();

  const byScenario = /* @__PURE__ */ new Map();
  for (const run of runs) {
    const scenarioId = run.scenarioId;
    if (!scenarioId) continue;
    const bucket = byScenario.get(scenarioId);
    if (bucket) bucket.push(run);
    else byScenario.set(scenarioId, [run]);
  }

  const lines = [];
  for (const [scenarioId, scenarioRuns] of byScenario) {
    // Keep only runs with a finite numeric score; `!== undefined` alone would
    // let null/NaN through.
    const scored = scenarioRuns
      .map((run) => ({ run, score: runScore(run) }))
      .filter((item) => typeof item.score === "number" && Number.isFinite(item.score));
    if (scored.length === 0) continue;

    const scores = scored.map((item) => item.score);
    const entry = {
      commitSha: opts.commitSha,
      timestamp,
      scores,
      composite: median(scores),
      runIds: scored.map((item) => item.run.runId)
    };
    const perDimension = aggregatePerDimension(scenarioRuns);
    if (perDimension) entry.perDimension = perDimension;

    lines.push({
      scenarioId,
      profileHash,
      model: opts.profile.model,
      profile: opts.profile,
      entry
    });
  }
  return lines;
}
|
|
6477
|
+
/**
 * Appends scorecard lines to a log file as newline-delimited JSON, creating
 * the parent directory first. Does nothing when there are no lines.
 *
 * @param {string} logPath - path of the log file.
 * @param {Array<object>} lines - scorecard lines to serialize.
 * @returns {void}
 */
function appendScorecard(logPath, lines) {
  if (lines.length === 0) return;
  const parentDir = dirname2(logPath);
  mkdirSync2(parentDir, { recursive: true });
  const serialized = lines.map((line) => JSON.stringify(line));
  // Every record, including the last one, is terminated by a newline.
  const payload = serialized.join("\n") + "\n";
  appendFileSync2(logPath, payload);
}
|
|
6483
|
+
/**
 * Convenience wrapper: builds scorecard lines with recordRuns, appends them
 * to the log via appendScorecard and hands the lines back to the caller.
 *
 * @param {string} logPath - path of the scorecard log file.
 * @param {Array<object>} runs - run records to record.
 * @param {object} opts - options forwarded to recordRuns.
 * @returns {Array<object>} the lines that were appended.
 */
function recordRunsToScorecard(logPath, runs, opts) {
  const recorded = recordRuns(runs, opts);
  appendScorecard(logPath, recorded);
  return recorded;
}
|
|
6488
|
+
/**
 * Reads a newline-delimited JSON scorecard log and rebuilds it in memory.
 *
 * Lines are grouped into cells keyed by scenario id and profile hash; each
 * cell's `timeline` holds its entries ordered by timestamp. The last profile
 * seen for a hash is kept in `profiles`.
 *
 * The loader is deliberately best-effort: blank lines, lines that are not
 * valid JSON and records missing `scenarioId`, `profileHash` or `entry` are
 * skipped. Entries without a string `timestamp` are kept and sort first
 * rather than aborting the whole load.
 *
 * @param {string} logPath - path of the scorecard log file.
 * @returns {{cells: Array<object>, profiles: Object<string, object>}} the
 *   scorecard; empty when the file does not exist.
 */
function loadScorecard(logPath) {
  if (!existsSync4(logPath)) return { cells: [], profiles: {} };
  const cells = /* @__PURE__ */ new Map();
  // Collected in a Map so a hash such as "__proto__" is stored as data
  // instead of replacing the prototype of the result object.
  const profiles = /* @__PURE__ */ new Map();
  for (const raw of readFileSync3(logPath, "utf8").split("\n")) {
    const line = raw.trim();
    if (!line) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Best-effort: a corrupt line must not make the whole log unreadable.
      continue;
    }
    if (!parsed?.scenarioId || !parsed.profileHash || !parsed.entry) continue;
    const key = `${parsed.scenarioId}::${parsed.profileHash}`;
    let cell = cells.get(key);
    if (!cell) {
      cell = {
        scenarioId: parsed.scenarioId,
        profileHash: parsed.profileHash,
        model: parsed.model,
        timeline: []
      };
      cells.set(key, cell);
    }
    cell.timeline.push(parsed.entry);
    if (parsed.profile) profiles.set(String(parsed.profileHash), parsed.profile);
  }
  // A missing or non-string timestamp is compared as "" so one malformed
  // entry cannot throw inside the comparator.
  const sortKey = (entry) => typeof entry.timestamp === "string" ? entry.timestamp : "";
  for (const cell of cells.values()) {
    cell.timeline.sort((a, b) => sortKey(a).localeCompare(sortKey(b)));
  }
  return { cells: [...cells.values()], profiles: Object.fromEntries(profiles) };
}
|
|
6521
|
+
/**
 * Compares the latest entry of every scorecard cell with a baseline entry and
 * classifies the change.
 *
 * Baseline: with `opts.baselineCommit`, the most recent entry of that commit
 * other than the current one; otherwise the entry just before the current
 * one. Cells without a baseline get the verdict "new".
 *
 * Verdict: when both entries carry at least two scores, a change counts only
 * if |Cohen's d| >= `minEffect` and the Welch t-test p-value is finite and
 * <= `maxP`. Otherwise the composite delta is compared with `minDelta`.
 * A counted change is "improved" for a positive delta and "regressed"
 * otherwise; everything else is "flat".
 *
 * @param {{cells: Array<object>}} scorecard - scorecard as built by loadScorecard.
 * @param {object} [opts={}] - `baselineCommit`, `minEffect` (0.5), `maxP`
 *   (0.05), `minDelta` (0.05).
 * @returns {{cells: Array<object>, summary: object}} per-cell diffs and verdict counts.
 */
function diffScorecard(scorecard, opts = {}) {
  const minEffect = opts.minEffect ?? 0.5;
  const maxP = opts.maxP ?? 0.05;
  const minDelta = opts.minDelta ?? 0.05;
  const directionOf = (delta) => delta > 0 ? "improved" : "regressed";

  // Walks the timeline from newest to oldest when a baseline commit is pinned.
  const findBaseline = (timeline, current) => {
    if (!opts.baselineCommit) return timeline[timeline.length - 2];
    for (let index = timeline.length - 1; index >= 0; index -= 1) {
      const candidate = timeline[index];
      if (candidate.commitSha === opts.baselineCommit && candidate !== current) return candidate;
    }
    return void 0;
  };

  const cells = [];
  for (const cell of scorecard.cells) {
    const { timeline } = cell;
    if (timeline.length === 0) continue;
    const current = timeline[timeline.length - 1];
    const baseline = findBaseline(timeline, current);
    const row = {
      scenarioId: cell.scenarioId,
      profileHash: cell.profileHash,
      model: cell.model,
      current: current.composite,
      currentCommit: current.commitSha
    };

    if (!baseline) {
      row.verdict = "new";
      row.baseline = null;
      row.delta = null;
      row.cohensD = null;
      row.pValue = null;
      row.baselineCommit = null;
      cells.push(row);
      continue;
    }

    const delta = current.composite - baseline.composite;
    let effect = null;
    let pValue = null;
    let changed;
    if (baseline.scores.length >= 2 && current.scores.length >= 2) {
      effect = cohensD(baseline.scores, current.scores);
      const tTest = welchsTTest(baseline.scores, current.scores);
      pValue = Number.isFinite(tTest.p) ? tTest.p : null;
      changed = Math.abs(effect) >= minEffect && pValue !== null && pValue <= maxP;
    } else {
      // Too few samples for statistics: fall back to the raw delta.
      changed = Math.abs(delta) >= minDelta;
    }

    row.verdict = changed ? directionOf(delta) : "flat";
    row.baseline = baseline.composite;
    row.delta = delta;
    row.cohensD = effect;
    row.pValue = pValue;
    row.baselineCommit = baseline.commitSha;
    cells.push(row);
  }

  const summary = { improved: 0, regressed: 0, flat: 0, new: 0 };
  cells.forEach((entry) => {
    summary[entry.verdict] += 1;
  });
  return { cells, summary };
}
|
|
6578
|
+
/**
 * Renders a scorecard diff as plain text: a summary line followed by one line
 * per regressed or improved cell. Regressions come first; within a verdict,
 * cells are ordered by the absolute delta, largest first.
 *
 * @param {{cells: Array<object>, summary: object}} diff - output of diffScorecard.
 * @returns {string} the report, lines joined with "\n".
 */
function formatScorecardDiff(diff) {
  const { summary } = diff;
  const fixed3 = (value) => value.toFixed(3);
  const verdictOrder = { regressed: 0, improved: 1 };

  const highlighted = diff.cells.filter((cell) => cell.verdict === "regressed" || cell.verdict === "improved");
  highlighted.sort(
    (left, right) => verdictOrder[left.verdict] - verdictOrder[right.verdict] || Math.abs(right.delta ?? 0) - Math.abs(left.delta ?? 0)
  );

  const report = [
    `Scorecard: ${summary.regressed} regressed \xB7 ${summary.improved} improved \xB7 ${summary.flat} flat \xB7 ${summary.new} new`
  ];
  highlighted.forEach((cell) => {
    const mark = cell.verdict === "regressed" ? "REGRESSED" : "improved";

    let deltaStr = "\u2014";
    if (cell.delta !== null) {
      deltaStr = cell.delta >= 0 ? `+${fixed3(cell.delta)}` : fixed3(cell.delta);
    }

    // Statistics are shown only when an effect size was computed.
    let stat = "";
    if (cell.cohensD !== null) {
      const pPart = cell.pValue !== null ? `, p=${cell.pValue.toFixed(3)}` : "";
      stat = ` (d=${cell.cohensD.toFixed(2)}${pPart})`;
    }

    report.push(
      ` ${mark} ${cell.scenarioId} \xB7 ${cell.model} \xB7 ${cell.profileHash.slice(0, 8)} ${fixed3(cell.baseline ?? 0)} \u2192 ${fixed3(cell.current)} ${deltaStr}${stat}`
    );
  });
  return report.join("\n");
}
|
|
6599
|
+
|
|
6214
6600
|
// src/series-convergence.ts
|
|
6215
6601
|
function analyzeSeries(values, options = {}) {
|
|
6216
6602
|
const window = options.window ?? 5;
|
|
@@ -6220,10 +6606,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
6220
6606
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
6221
6607
|
}
|
|
6222
6608
|
const tail = values.slice(-window);
|
|
6223
|
-
const
|
|
6224
|
-
const variance = tail.reduce((acc, v) => acc + (v -
|
|
6609
|
+
const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
6610
|
+
const variance = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
|
|
6225
6611
|
const stdDev = Math.sqrt(variance);
|
|
6226
|
-
const refMean = Math.abs(
|
|
6612
|
+
const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
|
|
6227
6613
|
const cv = stdDev / refMean;
|
|
6228
6614
|
const stable = tail.length >= window && cv <= stableCv;
|
|
6229
6615
|
let tailRun = 0;
|
|
@@ -6244,7 +6630,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
6244
6630
|
} else {
|
|
6245
6631
|
state = "noisy";
|
|
6246
6632
|
}
|
|
6247
|
-
return { state, windowMean:
|
|
6633
|
+
return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
|
|
6248
6634
|
}
|
|
6249
6635
|
|
|
6250
6636
|
// src/slo.ts
|
|
@@ -7042,12 +7428,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
7042
7428
|
variantScores.push({ mutator: id, score, mutated });
|
|
7043
7429
|
all.push(score);
|
|
7044
7430
|
}
|
|
7045
|
-
const
|
|
7046
|
-
const variance = all.reduce((a, v) => a + (v -
|
|
7431
|
+
const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
7432
|
+
const variance = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
|
|
7047
7433
|
const stdDev = Math.sqrt(variance);
|
|
7048
|
-
const ref = Math.abs(
|
|
7434
|
+
const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
|
|
7049
7435
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
7050
|
-
return { originalScore, variantScores, meanScore:
|
|
7436
|
+
return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
|
|
7051
7437
|
}
|
|
7052
7438
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
7053
7439
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -7113,8 +7499,8 @@ async function paraphraseRobustnessScenarios(args) {
|
|
|
7113
7499
|
});
|
|
7114
7500
|
scores.push(out.score);
|
|
7115
7501
|
}
|
|
7116
|
-
const
|
|
7117
|
-
deltas[m.name] =
|
|
7502
|
+
const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
7503
|
+
deltas[m.name] = mean5 - originalScore;
|
|
7118
7504
|
paraphrasedAll.push(...scores);
|
|
7119
7505
|
}
|
|
7120
7506
|
const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
|
|
@@ -7727,8 +8113,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7727
8113
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7728
8114
|
const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7729
8115
|
if (scores.length < 3) continue;
|
|
7730
|
-
const
|
|
7731
|
-
const variance = scores.reduce((a, b) => a + (b -
|
|
8116
|
+
const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
8117
|
+
const variance = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
|
|
7732
8118
|
if (variance > varianceThreshold) {
|
|
7733
8119
|
targets.push({
|
|
7734
8120
|
reason: "high-variance",
|
|
@@ -7959,7 +8345,7 @@ async function runSelfPlay(proposer, scorer, targets, options = {}) {
|
|
|
7959
8345
|
|
|
7960
8346
|
// src/command-runner.ts
|
|
7961
8347
|
import { spawnSync } from "child_process";
|
|
7962
|
-
import { existsSync as
|
|
8348
|
+
import { existsSync as existsSync5, readdirSync as readdirSync2, readFileSync as readFileSync4, statSync as statSync2 } from "fs";
|
|
7963
8349
|
import { join as join2 } from "path";
|
|
7964
8350
|
var localCommandRunner = {
|
|
7965
8351
|
name: "local",
|
|
@@ -7988,11 +8374,11 @@ var localCommandRunner = {
|
|
|
7988
8374
|
return r.status === 0 && (r.stdout ?? "").trim().length > 0;
|
|
7989
8375
|
},
|
|
7990
8376
|
async fileExists(path) {
|
|
7991
|
-
return
|
|
8377
|
+
return existsSync5(path);
|
|
7992
8378
|
},
|
|
7993
8379
|
async readFile(path) {
|
|
7994
8380
|
try {
|
|
7995
|
-
return
|
|
8381
|
+
return readFileSync4(path, "utf8");
|
|
7996
8382
|
} catch {
|
|
7997
8383
|
return null;
|
|
7998
8384
|
}
|
|
@@ -8230,7 +8616,7 @@ function extractErrorCount(text, opts = {}) {
|
|
|
8230
8616
|
for (const p of patterns) {
|
|
8231
8617
|
const matches = Array.from(text.matchAll(p.regex));
|
|
8232
8618
|
if (matches.length === 0) continue;
|
|
8233
|
-
const count = p.transform ? matches.reduce((
|
|
8619
|
+
const count = p.transform ? matches.reduce((sum3, m) => sum3 + p.transform(m), 0) : matches.length;
|
|
8234
8620
|
return {
|
|
8235
8621
|
count,
|
|
8236
8622
|
matched: p.name,
|
|
@@ -8924,8 +9310,8 @@ function multiToolchainLayer(config) {
|
|
|
8924
9310
|
}
|
|
8925
9311
|
|
|
8926
9312
|
// src/reference-replay.ts
|
|
8927
|
-
import { appendFileSync as
|
|
8928
|
-
import { dirname as
|
|
9313
|
+
import { appendFileSync as appendFileSync3, existsSync as existsSync6, mkdirSync as mkdirSync3, readFileSync as readFileSync5 } from "fs";
|
|
9314
|
+
import { dirname as dirname3 } from "path";
|
|
8929
9315
|
var DEFAULT_MATCH_THRESHOLD = 0.55;
|
|
8930
9316
|
var ALL_SPLITS = ["train", "dev", "test", "holdout"];
|
|
8931
9317
|
async function runReferenceReplay(cases, options) {
|
|
@@ -9043,14 +9429,14 @@ function jsonlReferenceReplayStore(path) {
|
|
|
9043
9429
|
return {
|
|
9044
9430
|
async save(run) {
|
|
9045
9431
|
await lock.runExclusive(() => {
|
|
9046
|
-
|
|
9047
|
-
|
|
9432
|
+
mkdirSync3(dirname3(path), { recursive: true });
|
|
9433
|
+
appendFileSync3(path, `${JSON.stringify(run)}
|
|
9048
9434
|
`);
|
|
9049
9435
|
});
|
|
9050
9436
|
},
|
|
9051
9437
|
async list() {
|
|
9052
9438
|
return lock.runExclusive(() => {
|
|
9053
|
-
if (!
|
|
9439
|
+
if (!existsSync6(path)) return [];
|
|
9054
9440
|
return readJsonl(path);
|
|
9055
9441
|
});
|
|
9056
9442
|
}
|
|
@@ -9139,7 +9525,7 @@ function decideReferenceReplayPromotion(baseline, candidate, policy = {}) {
|
|
|
9139
9525
|
regressions
|
|
9140
9526
|
};
|
|
9141
9527
|
}
|
|
9142
|
-
const requiredMeanDelta =
|
|
9528
|
+
const requiredMeanDelta = mean3(compared.map((item) => item.f1Delta));
|
|
9143
9529
|
if (requiredMeanDelta < minF1Delta) {
|
|
9144
9530
|
return {
|
|
9145
9531
|
promote: false,
|
|
@@ -9274,8 +9660,8 @@ function scorePair(scenario, matcher, reference, candidate) {
|
|
|
9274
9660
|
function buildScenarioScore(scenario, matches, falsePositives) {
|
|
9275
9661
|
const matched = matches.filter((match) => match.matched).length;
|
|
9276
9662
|
const total = scenario.references.length;
|
|
9277
|
-
const matchedWeight = matches.filter((match) => match.matched).reduce((
|
|
9278
|
-
const totalWeight = matches.reduce((
|
|
9663
|
+
const matchedWeight = matches.filter((match) => match.matched).reduce((sum3, match) => sum3 + match.weight, 0);
|
|
9664
|
+
const totalWeight = matches.reduce((sum3, match) => sum3 + match.weight, 0);
|
|
9279
9665
|
const precision2 = ratio(matched, matched + falsePositives);
|
|
9280
9666
|
const recall = ratio(matched, total);
|
|
9281
9667
|
return {
|
|
@@ -9301,11 +9687,11 @@ function aggregateBySplit(scores) {
|
|
|
9301
9687
|
return out;
|
|
9302
9688
|
}
|
|
9303
9689
|
function aggregateScenarioScores(scores) {
|
|
9304
|
-
const matched =
|
|
9305
|
-
const total =
|
|
9306
|
-
const falsePositives =
|
|
9307
|
-
const matchedWeight =
|
|
9308
|
-
const totalWeight =
|
|
9690
|
+
const matched = sum2(scores.map((score) => score.matched));
|
|
9691
|
+
const total = sum2(scores.map((score) => score.total));
|
|
9692
|
+
const falsePositives = sum2(scores.map((score) => score.falsePositives));
|
|
9693
|
+
const matchedWeight = sum2(scores.map((score) => score.matchedWeight));
|
|
9694
|
+
const totalWeight = sum2(scores.map((score) => score.totalWeight));
|
|
9309
9695
|
const precision2 = ratio(matched, matched + falsePositives);
|
|
9310
9696
|
const recall = ratio(matched, total);
|
|
9311
9697
|
return {
|
|
@@ -9372,11 +9758,11 @@ function clamp012(value) {
|
|
|
9372
9758
|
if (!Number.isFinite(value)) return 0;
|
|
9373
9759
|
return Math.max(0, Math.min(1, value));
|
|
9374
9760
|
}
|
|
9375
|
-
function
|
|
9761
|
+
/**
 * Add up every entry of `values`, starting from 0.
 *
 * @param {number[]} values - Numbers to total.
 * @returns {number} The total; 0 for an empty array.
 */
function sum2(values) {
  const addTo = (running, item) => running + item;
  return values.reduce(addTo, 0);
}
|
|
9378
|
-
function
|
|
9379
|
-
return values.length ?
|
|
9764
|
+
/**
 * Arithmetic mean of `values`, computed as `sum2(values)` divided by the length.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The mean, or 0 when `values` has no entries.
 */
function mean3(values) {
  // Guard first so an empty input never divides by zero.
  if (!values.length) return 0;
  return sum2(values) / values.length;
}
|
|
9381
9767
|
function formatPct(value) {
|
|
9382
9768
|
return `${(value * 100).toFixed(1)}%`;
|
|
@@ -9393,7 +9779,7 @@ function throwIfAborted(signal) {
|
|
|
9393
9779
|
throw new Error(signal.reason ? String(signal.reason) : "reference replay aborted");
|
|
9394
9780
|
}
|
|
9395
9781
|
function readJsonl(path) {
|
|
9396
|
-
const raw =
|
|
9782
|
+
const raw = readFileSync5(path, "utf8");
|
|
9397
9783
|
const out = [];
|
|
9398
9784
|
for (const line of raw.split("\n")) {
|
|
9399
9785
|
const trimmed = line.trim();
|
|
@@ -9640,8 +10026,8 @@ function detectCalibrationDrift(runs, opts) {
|
|
|
9640
10026
|
alpha,
|
|
9641
10027
|
recentN: recent.length,
|
|
9642
10028
|
historyN: historical.length,
|
|
9643
|
-
recentMean:
|
|
9644
|
-
historyMean:
|
|
10029
|
+
recentMean: mean4(recent),
|
|
10030
|
+
historyMean: mean4(historical)
|
|
9645
10031
|
}
|
|
9646
10032
|
}
|
|
9647
10033
|
];
|
|
@@ -9761,7 +10147,7 @@ function chiSquareCritical(df, alpha) {
|
|
|
9761
10147
|
}
|
|
9762
10148
|
return TABLE[10][idx];
|
|
9763
10149
|
}
|
|
9764
|
-
function
|
|
10150
|
+
/**
 * Arithmetic mean of `xs`.
 *
 * @param {number[]} xs - Numbers to average.
 * @returns {number} The mean, or 0 when `xs` is empty.
 */
function mean4(xs) {
  const count = xs.length;
  if (count !== 0) {
    const total = xs.reduce((running, item) => running + item, 0);
    return total / count;
  }
  // An empty input yields 0 rather than NaN from 0 / 0.
  return 0;
}
|
|
@@ -9961,8 +10347,8 @@ async function discoverPersonas(dir, opts = {}) {
|
|
|
9961
10347
|
}
|
|
9962
10348
|
|
|
9963
10349
|
// src/evolution-telemetry.ts
|
|
9964
|
-
import { appendFileSync as
|
|
9965
|
-
import { dirname as
|
|
10350
|
+
import { appendFileSync as appendFileSync4, existsSync as existsSync7, mkdirSync as mkdirSync4, readFileSync as readFileSync6, writeFileSync } from "fs";
|
|
10351
|
+
import { dirname as dirname4 } from "path";
|
|
9966
10352
|
var MutationTelemetry = class {
|
|
9967
10353
|
appender;
|
|
9968
10354
|
constructor(path) {
|
|
@@ -9991,17 +10377,17 @@ var LineageRecorder = class {
|
|
|
9991
10377
|
this.path = path;
|
|
9992
10378
|
this.snapshotPath = `${path}.snapshot`;
|
|
9993
10379
|
this.kindOf = kindOf ?? defaultKindOf;
|
|
9994
|
-
|
|
9995
|
-
if (
|
|
10380
|
+
mkdirSync4(dirname4(path), { recursive: true });
|
|
10381
|
+
if (existsSync7(this.snapshotPath)) {
|
|
9996
10382
|
try {
|
|
9997
|
-
const parsed = JSON.parse(
|
|
10383
|
+
const parsed = JSON.parse(readFileSync6(this.snapshotPath, "utf-8"));
|
|
9998
10384
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
9999
10385
|
} catch {
|
|
10000
10386
|
}
|
|
10001
10387
|
}
|
|
10002
|
-
if (
|
|
10388
|
+
if (existsSync7(path)) {
|
|
10003
10389
|
try {
|
|
10004
|
-
for (const line of
|
|
10390
|
+
for (const line of readFileSync6(path, "utf-8").split("\n")) {
|
|
10005
10391
|
if (!line.trim()) continue;
|
|
10006
10392
|
try {
|
|
10007
10393
|
const entry = JSON.parse(line);
|
|
@@ -10013,9 +10399,9 @@ var LineageRecorder = class {
|
|
|
10013
10399
|
} catch {
|
|
10014
10400
|
}
|
|
10015
10401
|
}
|
|
10016
|
-
if (
|
|
10402
|
+
if (existsSync7(path) && this.nodes.size === 0) {
|
|
10017
10403
|
try {
|
|
10018
|
-
const raw =
|
|
10404
|
+
const raw = readFileSync6(path, "utf-8").trim();
|
|
10019
10405
|
if (raw.startsWith("[")) {
|
|
10020
10406
|
const parsed = JSON.parse(raw);
|
|
10021
10407
|
for (const n of parsed) this.nodes.set(n.id, n);
|
|
@@ -10029,15 +10415,15 @@ var LineageRecorder = class {
|
|
|
10029
10415
|
const prev = this.nodes.get(node.id);
|
|
10030
10416
|
this.nodes.set(node.id, { ...prev, ...node });
|
|
10031
10417
|
try {
|
|
10032
|
-
if (
|
|
10033
|
-
const head =
|
|
10418
|
+
if (existsSync7(this.path)) {
|
|
10419
|
+
const head = readFileSync6(this.path, { encoding: "utf-8", flag: "r" }).slice(0, 1);
|
|
10034
10420
|
if (head === "[") {
|
|
10035
10421
|
writeFileSync(this.path, "");
|
|
10036
10422
|
}
|
|
10037
10423
|
}
|
|
10038
10424
|
} catch {
|
|
10039
10425
|
}
|
|
10040
|
-
|
|
10426
|
+
appendFileSync4(this.path, `${JSON.stringify(this.nodes.get(node.id))}
|
|
10041
10427
|
`);
|
|
10042
10428
|
});
|
|
10043
10429
|
}
|
|
@@ -10096,9 +10482,9 @@ var CostLedger = class {
|
|
|
10096
10482
|
mutex = new Mutex();
|
|
10097
10483
|
constructor(path) {
|
|
10098
10484
|
this.path = path;
|
|
10099
|
-
if (
|
|
10485
|
+
if (existsSync7(path)) {
|
|
10100
10486
|
try {
|
|
10101
|
-
const loaded = JSON.parse(
|
|
10487
|
+
const loaded = JSON.parse(readFileSync6(path, "utf-8"));
|
|
10102
10488
|
for (const k of Object.keys(this.totals)) {
|
|
10103
10489
|
if (k === "byGeneration") {
|
|
10104
10490
|
if (loaded.byGeneration && typeof loaded.byGeneration === "object") {
|
|
@@ -10115,7 +10501,7 @@ var CostLedger = class {
|
|
|
10115
10501
|
} catch {
|
|
10116
10502
|
}
|
|
10117
10503
|
} else {
|
|
10118
|
-
|
|
10504
|
+
mkdirSync4(dirname4(path), { recursive: true });
|
|
10119
10505
|
}
|
|
10120
10506
|
}
|
|
10121
10507
|
genBucket(generation) {
|
|
@@ -10267,16 +10653,16 @@ function precision(goldens, candidates, options = {}) {
|
|
|
10267
10653
|
}
|
|
10268
10654
|
|
|
10269
10655
|
// src/jsonl-trial-cache.ts
|
|
10270
|
-
import { appendFileSync as
|
|
10271
|
-
import { dirname as
|
|
10656
|
+
import { appendFileSync as appendFileSync5, existsSync as existsSync8, mkdirSync as mkdirSync5, readFileSync as readFileSync7 } from "fs";
|
|
10657
|
+
import { dirname as dirname5 } from "path";
|
|
10272
10658
|
var JsonlTrialCache = class {
|
|
10273
10659
|
map = /* @__PURE__ */ new Map();
|
|
10274
10660
|
path;
|
|
10275
10661
|
appender;
|
|
10276
10662
|
constructor(path) {
|
|
10277
10663
|
this.path = path;
|
|
10278
|
-
if (
|
|
10279
|
-
for (const line of
|
|
10664
|
+
if (existsSync8(path)) {
|
|
10665
|
+
for (const line of readFileSync7(path, "utf-8").split("\n")) {
|
|
10280
10666
|
if (!line.trim()) continue;
|
|
10281
10667
|
try {
|
|
10282
10668
|
const entry = JSON.parse(line);
|
|
@@ -10285,7 +10671,7 @@ var JsonlTrialCache = class {
|
|
|
10285
10671
|
}
|
|
10286
10672
|
}
|
|
10287
10673
|
} else {
|
|
10288
|
-
|
|
10674
|
+
mkdirSync5(dirname5(path), { recursive: true });
|
|
10289
10675
|
}
|
|
10290
10676
|
this.appender = new LockedJsonlAppender(path);
|
|
10291
10677
|
}
|
|
@@ -10308,7 +10694,7 @@ var JsonlTrialCache = class {
|
|
|
10308
10694
|
setSync(key, value) {
|
|
10309
10695
|
this.map.set(key, value);
|
|
10310
10696
|
const line = { key, result: value, writtenAt: Date.now() };
|
|
10311
|
-
|
|
10697
|
+
appendFileSync5(this.path, `${JSON.stringify(line)}
|
|
10312
10698
|
`);
|
|
10313
10699
|
}
|
|
10314
10700
|
};
|
|
@@ -10316,35 +10702,14 @@ var JsonlTrialCache = class {
|
|
|
10316
10702
|
// src/judge-retry.ts
|
|
10317
10703
|
var DEFAULT_MAX_ATTEMPTS = 3;
|
|
10318
10704
|
var DEFAULT_TIMEOUT_MS = 9e4;
|
|
10319
|
-
var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
|
|
10320
|
-
var ABORT_PATTERNS = [
|
|
10321
|
-
/AbortError/i,
|
|
10322
|
-
/TimeoutError/i,
|
|
10323
|
-
/fetch failed/i,
|
|
10324
|
-
/ECONNRESET/i,
|
|
10325
|
-
/ETIMEDOUT/i,
|
|
10326
|
-
/EAI_AGAIN/i,
|
|
10327
|
-
/this operation was aborted/i,
|
|
10328
|
-
/stream.*ended.*unexpectedly/i,
|
|
10329
|
-
/socket hang up/i
|
|
10330
|
-
];
|
|
10331
|
-
var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
|
|
10332
|
-
function defaultIsRetryable(err) {
|
|
10333
|
-
if (err instanceof Error) {
|
|
10334
|
-
if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
|
|
10335
|
-
const status = err.status;
|
|
10336
|
-
if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
|
|
10337
|
-
}
|
|
10338
|
-
return false;
|
|
10339
|
-
}
|
|
10340
10705
|
/**
 * Build a promise that settles once a `setTimeout` of `ms` milliseconds fires.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Resolves (with no value) when the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
10343
10708
|
async function withJudgeRetry(judgeFn, policy = {}) {
|
|
10344
10709
|
const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
|
|
10345
10710
|
const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
|
|
10346
|
-
const backoff = policy.backoffMs ??
|
|
10347
|
-
const isRetryable = policy.isRetryable ??
|
|
10711
|
+
const backoff = policy.backoffMs ?? backoffMs;
|
|
10712
|
+
const isRetryable = policy.isRetryable ?? isTransientLlmError;
|
|
10348
10713
|
const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
|
|
10349
10714
|
let totalAttempts = 0;
|
|
10350
10715
|
const attemptErrors = [];
|
|
@@ -10412,9 +10777,9 @@ function passOrthogonality(input) {
|
|
|
10412
10777
|
sims.push(cosineSimilarity(vectors[i], vectors[j]));
|
|
10413
10778
|
}
|
|
10414
10779
|
}
|
|
10415
|
-
const
|
|
10780
|
+
const mean5 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
|
|
10416
10781
|
return {
|
|
10417
|
-
orthogonality: Math.max(0, Math.min(1, 1 -
|
|
10782
|
+
orthogonality: Math.max(0, Math.min(1, 1 - mean5)),
|
|
10418
10783
|
passCount: passes.length,
|
|
10419
10784
|
similarities: sims
|
|
10420
10785
|
};
|
|
@@ -10667,6 +11032,7 @@ export {
|
|
|
10667
11032
|
ANALYST_SEVERITIES,
|
|
10668
11033
|
AgentDriver,
|
|
10669
11034
|
AgentEvalError,
|
|
11035
|
+
AgentProfileCellValidationError,
|
|
10670
11036
|
AnalystRegistry,
|
|
10671
11037
|
AxGepaSteeringOptimizer,
|
|
10672
11038
|
BENCHMARK_SPLIT_SEED,
|
|
@@ -10688,6 +11054,7 @@ export {
|
|
|
10688
11054
|
DEFAULT_HARNESS_OBJECTIVES,
|
|
10689
11055
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
10690
11056
|
DEFAULT_MUTATORS,
|
|
11057
|
+
DEFAULT_PR_REVIEW_SCORE_WEIGHTS,
|
|
10691
11058
|
DEFAULT_REDACTION_RULES,
|
|
10692
11059
|
DEFAULT_RED_TEAM_CORPUS,
|
|
10693
11060
|
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
@@ -10774,28 +11141,35 @@ export {
|
|
|
10774
11141
|
VerificationError,
|
|
10775
11142
|
acquisitionPlansForKnowledgeGaps,
|
|
10776
11143
|
adversarialJudge,
|
|
11144
|
+
agentProfileCellHashMaterial,
|
|
11145
|
+
agentProfileCellKey,
|
|
11146
|
+
agentProfileHash,
|
|
10777
11147
|
aggregateLlm,
|
|
11148
|
+
aggregatePrReviewScore,
|
|
10778
11149
|
aggregateRunScore,
|
|
10779
11150
|
aggregateTrialsByMode,
|
|
10780
11151
|
allCriticalPassed,
|
|
10781
11152
|
analyzeAntiSlop,
|
|
10782
11153
|
analyzeSeries,
|
|
10783
11154
|
analyzeTraces,
|
|
11155
|
+
appendScorecard,
|
|
10784
11156
|
argHash,
|
|
10785
11157
|
assertLlmRoute,
|
|
10786
11158
|
assertRealBackend,
|
|
10787
11159
|
assertReleaseConfidence,
|
|
11160
|
+
assertRunAgentProfileCell,
|
|
10788
11161
|
assertRunCaptured,
|
|
10789
11162
|
assignFeedbackSplit,
|
|
10790
11163
|
attributeCounterfactuals,
|
|
11164
|
+
backoffMs,
|
|
10791
11165
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
10792
11166
|
benchmarks_exports as benchmarks,
|
|
10793
11167
|
benjaminiHochberg,
|
|
10794
|
-
bhAdjust,
|
|
10795
11168
|
bisect,
|
|
10796
11169
|
blockingKnowledgeEval,
|
|
10797
11170
|
bonferroni,
|
|
10798
11171
|
bootstrapCi,
|
|
11172
|
+
buildAgentProfileCell,
|
|
10799
11173
|
buildDriverSystemPrompt,
|
|
10800
11174
|
buildReflectionPrompt,
|
|
10801
11175
|
buildReviewerPrompt,
|
|
@@ -10822,6 +11196,7 @@ export {
|
|
|
10822
11196
|
cohensD,
|
|
10823
11197
|
coherenceJudge,
|
|
10824
11198
|
collectionPreserved,
|
|
11199
|
+
commentsForSource,
|
|
10825
11200
|
commitBisect,
|
|
10826
11201
|
compareReferenceReplay,
|
|
10827
11202
|
compareToBaseline,
|
|
@@ -10872,6 +11247,7 @@ export {
|
|
|
10872
11247
|
deployGateLayer,
|
|
10873
11248
|
describeTraceInsightScope,
|
|
10874
11249
|
diffFindings,
|
|
11250
|
+
diffScorecard,
|
|
10875
11251
|
discoverPersonas,
|
|
10876
11252
|
distillPlaybook,
|
|
10877
11253
|
domainEvidencePattern,
|
|
@@ -10907,11 +11283,13 @@ export {
|
|
|
10907
11283
|
formatBenchmarkReport,
|
|
10908
11284
|
formatDriverReport,
|
|
10909
11285
|
formatFindings,
|
|
11286
|
+
formatScorecardDiff,
|
|
10910
11287
|
gainHistogram,
|
|
10911
11288
|
ghCliClient,
|
|
10912
11289
|
precision as goldenPrecision,
|
|
10913
11290
|
gradeSemanticStatus,
|
|
10914
11291
|
groupBy,
|
|
11292
|
+
groupRunsByAgentProfileCell,
|
|
10915
11293
|
hashContent,
|
|
10916
11294
|
hashJson,
|
|
10917
11295
|
hashScenarios,
|
|
@@ -10933,6 +11311,7 @@ export {
|
|
|
10933
11311
|
isRunRecord,
|
|
10934
11312
|
isSandboxSpan,
|
|
10935
11313
|
isToolSpan,
|
|
11314
|
+
isTransientLlmError,
|
|
10936
11315
|
iterateRawCalls,
|
|
10937
11316
|
jestTestParser,
|
|
10938
11317
|
jsonHasKeys,
|
|
@@ -10947,6 +11326,7 @@ export {
|
|
|
10947
11326
|
linterJudge,
|
|
10948
11327
|
llmSpanFromProvider,
|
|
10949
11328
|
llmSpans,
|
|
11329
|
+
loadScorecard,
|
|
10950
11330
|
loadScorerFromGrader,
|
|
10951
11331
|
localCommandRunner,
|
|
10952
11332
|
lowercaseMutator,
|
|
@@ -10962,8 +11342,8 @@ export {
|
|
|
10962
11342
|
objectiveEval,
|
|
10963
11343
|
pairedBootstrap,
|
|
10964
11344
|
pairedEvalueSequence,
|
|
11345
|
+
pairedMde,
|
|
10965
11346
|
pairedTTest,
|
|
10966
|
-
pairedWilcoxon,
|
|
10967
11347
|
paraphraseRobustness,
|
|
10968
11348
|
paraphraseRobustnessScenarios,
|
|
10969
11349
|
paretoChart,
|
|
@@ -10988,6 +11368,8 @@ export {
|
|
|
10988
11368
|
proposeSynthesisTargets,
|
|
10989
11369
|
providerFromBaseUrl,
|
|
10990
11370
|
pytestTestParser,
|
|
11371
|
+
recordRuns,
|
|
11372
|
+
recordRunsToScorecard,
|
|
10991
11373
|
redTeamDataset,
|
|
10992
11374
|
redTeamReport,
|
|
10993
11375
|
redactString,
|
|
@@ -11009,6 +11391,7 @@ export {
|
|
|
11009
11391
|
replayFeedbackTrajectory,
|
|
11010
11392
|
replayScorerOverCorpus,
|
|
11011
11393
|
replayTraceThroughJudge,
|
|
11394
|
+
requireAgentProfileCell,
|
|
11012
11395
|
requiredSampleSize,
|
|
11013
11396
|
researchReport,
|
|
11014
11397
|
resetLockedAppendersForTesting,
|
|
@@ -11045,6 +11428,8 @@ export {
|
|
|
11045
11428
|
scoreContinuity,
|
|
11046
11429
|
scoreFromEvals,
|
|
11047
11430
|
scoreKnowledgeReadiness,
|
|
11431
|
+
scorePrReviewComments,
|
|
11432
|
+
scorePrReviewSource,
|
|
11048
11433
|
scoreRedTeamOutput,
|
|
11049
11434
|
scoreReferenceReplay,
|
|
11050
11435
|
scoreTraceInsightReadiness,
|
|
@@ -11063,6 +11448,7 @@ export {
|
|
|
11063
11448
|
summarize,
|
|
11064
11449
|
summarizeBackendIntegrity,
|
|
11065
11450
|
summarizeHarnessResults,
|
|
11451
|
+
summarizePrReviewBenchmark,
|
|
11066
11452
|
summarizePreferenceMemory,
|
|
11067
11453
|
summaryTable,
|
|
11068
11454
|
testJudge,
|
|
@@ -11079,8 +11465,10 @@ export {
|
|
|
11079
11465
|
typoMutator,
|
|
11080
11466
|
urlContains,
|
|
11081
11467
|
userQuestionsForKnowledgeGaps,
|
|
11468
|
+
validateAgentProfileCell,
|
|
11082
11469
|
validateRunRecord,
|
|
11083
11470
|
verbosityBias,
|
|
11471
|
+
verifyAgentProfileCell,
|
|
11084
11472
|
verifyCompletion,
|
|
11085
11473
|
verifyManifest,
|
|
11086
11474
|
visualDiff,
|